# valueIterationAgents.py
# -----------------------
# Licensing Information: Please do not distribute or publish solutions to this
# project. You are free to use and extend these projects for educational
# purposes. The Pacman AI projects were developed at UC Berkeley, primarily by
# John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu).
# For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html
import mdp, util
import math
import random
from featureExtractors import *
from learningAgents import ValueEstimationAgent
INF = float('inf')  # sentinel used as an initial "minus infinity" in max computations
class ValueIterationAgent(ValueEstimationAgent):
"""
* Please read learningAgents.py before reading this.*
A ValueIterationAgent takes a Markov decision process
(see mdp.py) on initialization and runs value iteration
for a given number of iterations using the supplied
discount factor.
"""
def __init__(self, mdp, discount = 0.9, iterations = 100):
"""
Your value iteration agent should take an mdp on
construction, run the indicated number of iterations
and then act according to the resulting policy.
Some useful mdp methods you will use:
mdp.getStates()
mdp.getPossibleActions(state)
mdp.getTransitionStatesAndProbs(state, action)
mdp.getReward(state, action, nextState)
"""
self.mdp = mdp
self.discount = discount
self.iterations = iterations
self.values = util.Counter() # A Counter is a dict with default 0
"*** YOUR CODE HERE ***"
        for _ in range(iterations):
            newValues = util.Counter()
            for state in mdp.getStates():
                if mdp.isTerminal(state):
                    newValues[state] = 0
                else:
                    maxValue = -INF
                    for action in mdp.getPossibleActions(state):
                        maxValue = max(maxValue, self.getQValue(state, action))
                    newValues[state] = maxValue
            self.values = newValues
def getValue(self, state):
"""
Return the value of the state (computed in __init__).
"""
return self.values[state]
def getQValue(self, state, action):
"""
The q-value of the state action pair
(after the indicated number of value iteration
passes). Note that value iteration does not
necessarily create this quantity and you may have
to derive it on the fly.
"""
"*** YOUR CODE HERE ***"
q = 0
for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action):
q += prob * (self.mdp.getReward(state, action, nextState) + self.discount * self.getValue(nextState))
return q
def getPolicy(self, state):
"""
The policy is the best action in the given state
according to the values computed by value iteration.
You may break ties any way you see fit. Note that if
there are no legal actions, which is the case at the
terminal state, you should return None.
"""
"*** YOUR CODE HERE ***"
bestActions = []
maxValue = -INF
for action in self.mdp.getPossibleActions(state):
q = self.getQValue(state, action)
if q >= maxValue:
maxValue = q
bestActions.append (action)
return random.choice (bestActions)
def getAction(self, state):
"Returns the policy at the state (no exploration)."
return self.getPolicy(state)
    def pickAction(self, state):
        "Returns an action sampled from a distribution (assumed uniform here)."
        # chooseFromDistribution requires a distribution: (prob, value) pairs
        actions = self.mdp.getPossibleActions(state)
        if not actions: return None
        return util.chooseFromDistribution([(1.0 / len(actions), a) for a in actions])
def getQValues(self):
"""
Returns the Q counter - calculated asynchronously.
Make sure this function is called for collecting data.
"""
qValues = util.Counter()
for state in self.mdp.getStates():
for action in self.mdp.getPossibleActions(state):
qValues[state, action] = self.getQValue(state, action)
return qValues
class ApproximateValueIterAgent(ValueIterationAgent):
"""
Similar to ApproximateQAgent.
Using extractors, but only generalize on values of states,
not state, action pairs.
"""
    def __init__(self, extractor='IdentityExtractor', **args):
        if 'extractor' in args:
            extractor = args['extractor']
        self.featExtractor = util.lookup(extractor, globals())()
# You might want to initialize weights here.
"*** YOUR CODE HERE ***"
self.mdp = args['mdp']
self.discount = args['gamma']
self.iterations = args['iterations']
self.alpha = args['alpha']
self.weights = util.Counter()
self.times = 0
        if False:  # extractor == 'BairdsExtractor':
            # doing evil thing here: adversarial initial weights for
            # Baird's counterexample (currently disabled)
            for i in range(7):
                self.weights[i] = 1
        # do update, full backup (sweep every state)
        for time in range(self.iterations):
            for state in self.mdp.getStates():
                if not self.mdp.isTerminal(state):
                    # find the best action under the current weights
                    maxValue = None
                    bestAction = None
                    for action in self.mdp.getPossibleActions(state):
                        thisValue = self.getQValue(state, action)
                        if bestAction is None or thisValue > maxValue:
                            maxValue = thisValue
                            bestAction = action
                    # back up along the best action, weighting each
                    # successor's update by its transition probability
                    for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, bestAction):
                        self.update(state, bestAction, nextState, self.mdp.getReward(state, bestAction, nextState), prob)
            self.outputWeights(time)
            self.outputValues(time)
            self.outputMSE(time)
"""
def getQValue(self, state, action):
# Don't need to override getQvalue.
# Make sure the Q values are calculated from values of states
# which actually with function approximation applied
"""
def getValue(self, state):
"""
Should return V(state) = w * featureVector
where * is the dotProduct operator
"""
"*** YOUR CODE HERE ***"
v = 0.0
# this feature should be designed for not caring about action.
# passing None here. FIXME
for feature, value in self.featExtractor.getFeatures(state, None).items():
# weight * feature
v += self.weights[feature] * value
return v
#util.raiseNotDefined()
def update(self, state, action, nextState, reward, prob = 1):
"""
Should update your weights based on transition
"""
"*** YOUR CODE HERE ***"
correction = (reward + self.discount * self.getValue(nextState)) - self.getValue(state)
for feature, value in self.featExtractor.getFeatures(state, None).items():
self.weights[feature] += 1.0 / time * correction * value * prob
#util.raiseNotDefined()
    def outputWeights(self, time):
        with open("weights", "a") as f:
            f.write(str(time) + ' ' + ' '.join(str(w) for w in self.weights.values()) + '\n')
    def outputValues(self, time):
        values = [self.getValue(state) for state in self.mdp.getStates()]
        with open("values", "a") as f:
            f.write(str(time) + ' ' + ' '.join(str(v) for v in values) + '\n')
    def outputMSE(self, time):
        """
        For Baird's problem only! Writes the root of the summed squared
        deviation of the state values from their mean.
        """
        #FIXME
        values = [self.getValue(state) for state in self.mdp.getStates()]
        mean = 1.0 * sum(values) / len(values)
        error = math.sqrt(sum((x - mean) ** 2 for x in values))
        with open("errors", "a") as f:
            f.write(str(time) + ' ' + str(error) + '\n')
def final(self, state):
"Called at the end of each game."
# did we finish training?
if self.episodesSoFar == self.numTraining:
# you might want to print your weights here for debugging
"*** YOUR CODE HERE ***"
pass
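
if __name__ == '__main__':
    # A minimal sketch of exercising the tabular agent, assuming the
    # project's gridworld module provides getBookGrid() as in the CS188
    # codebase (the usual entry point is `python gridworld.py -a value`).
    import gridworld
    grid = gridworld.getBookGrid()  # a small MarkovDecisionProcess
    agent = ValueIterationAgent(grid, discount=0.9, iterations=100)
    start = grid.getStartState()
    print(agent.getValue(start))    # V(start) after 100 sweeps
    print(agent.getPolicy(start))   # greedy action at the start state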