#
# empiricalMDP.py
#
# Purpose: Estimate an MDP from empirical observations of transitions and rewards
#
# Author: Willie Boag
#

from collections import defaultdict

import state as State


class EmpiricalMDP:

    def __init__(self, all_qstate_results, rewardValues, skills):
        # Parameters
        self.alpha = 0.5

        # Constant rewards for each terrain type
        self.rewardValues = rewardValues

        # Empirical estimate of the transition model.
        # Initially, assume every observed q-state result is equally likely.
        counts = defaultdict(lambda: defaultdict(dict))
        for state, action, nextState in all_qstate_results:
            counts[state][action][nextState] = 1
        self.frequencies = counts

        # Inferred skills
        self.skills = skills

        # Convergence streaks
        self.streaks = {k: 0 for k in self.skills}
        self.completed = []
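
    # For reference, self.frequencies built above is a nested dict mapping
    # state -> action -> nextState -> count (every count starts at 1; the
    # example states below are illustrative, not values from this module):
    #   { state : { 'north'  : { nextState : 1, ... },
    #               'finish' : { nextState : 1 } },
    #     ... }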

    def getPossibleActions(self, state):
        return list(self.frequencies[state].keys())

    def getSuccessors(self, state):
        retVal = []
        for action in self.frequencies[state]:
            retVal += self.frequencies[state][action].keys()
        return retVal

    def getStates(self):
        return list(self.frequencies.keys())

    def getReward(self, state, action, nextState):
        if action == 'finish':
            return 1000
        x, y = state.getPosition()
        # Manhattan distance from the corner at (0, 9)
        manDist = abs(y - 9) + abs(x - 0)
        terrain = state.getTerrainType()
        skillScore = self.skills[terrain] * self.rewardValues[terrain]
        return skillScore + manDist
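
    # Illustrative only (all values here are assumptions): for a non-'finish'
    # action at (x, y) = (3, 4) on terrain 'grass', with skills['grass'] = 0.8
    # and rewardValues['grass'] = 50, getReward returns
    #   0.8 * 50 + (|4 - 9| + |3 - 0|) = 40 + 8 = 48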

    def isTerminal(self, state):
        # Terminal states admit only the 'finish' action
        return list(self.frequencies[state].keys()) == ['finish']

    def getTransitionStatesAndProbs(self, state, action):
        if action not in self.getPossibleActions(state):
            raise ValueError('Illegal action!')

        x, y = state.getPosition()
        t = state.terrain

        # The 'finish' action is deterministic
        if action == 'finish':
            return [(state, 1)]

        # Store mapping from next state to likelihood
        possibles = defaultdict(float)

        # Chance of an unintended slide east, growing as the agent nears x == 9
        chanceToSlideLeft = 0.1 - 0.01 * abs(x - 9)
        if x != 9:
            possibles[State.state((x + 1, y), t)] += chanceToSlideLeft
        else:
            possibles[state] += chanceToSlideLeft

        # Chance of an unintended slide south, growing as the agent nears y == 0
        chanceToSlideDown = 0.1 - 0.01 * abs(y - 0)
        if y != 0:
            possibles[State.state((x, y - 1), t)] += chanceToSlideDown
        else:
            possibles[state] += chanceToSlideDown

        # Chance of falling grows as the skill estimate moves away from 1
        terrainElement = state.getTerrainType()
        chanceToFall = abs(self.skills[terrainElement] - 1) / 2.0

        # Falling pushes the agent east and south, clipped at the grid edges
        if x != 9 and y != 0:
            possibles[State.state((x + 1, y - 1), t)] += chanceToFall
        elif x != 9:
            possibles[State.state((x + 1, y), t)] += chanceToFall
        elif y != 0:
            possibles[State.state((x, y - 1), t)] += chanceToFall
        else:
            # x == 9 and y == 0: nowhere left to fall, so stay put
            possibles[state] += chanceToFall

        # The remaining probability mass goes to the intended move
        if action == 'north':
            newState = State.state((x, y - 1), t)
        elif action == 'east':
            newState = State.state((x + 1, y), t)
        elif action == 'west':
            newState = State.state((x - 1, y), t)
        elif action == 'south':
            newState = State.state((x, y + 1), t)
        possibles[newState] += 1 - (chanceToFall + chanceToSlideLeft + chanceToSlideDown)

        # Probabilities must sum to 1
        assert abs(sum(possibles.values()) - 1) < .001

        return list(possibles.items())
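
    # Illustrative only (all values here are assumptions): at (x, y) = (5, 3)
    # with skills[terrain] = 0.8, the chances above work out to
    #   chanceToSlideLeft = 0.1 - 0.01 * |5 - 9|  = 0.06
    #   chanceToSlideDown = 0.1 - 0.01 * |3 - 0|  = 0.07
    #   chanceToFall      = |0.8 - 1| / 2         = 0.10
    # leaving 1 - (0.06 + 0.07 + 0.10) = 0.77 for the intended move.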

    def converged(self):
        # Converged once the skills for all four terrain types have stabilized
        return len(self.completed) == 4

    def update(self, state, action, nextState, reward, terrain):
        # If the skill for this terrain has already converged, do nothing
        if terrain in self.completed:
            return

        # Back out an empirical skill sample from the observed reward
        x, y = state.getPosition()
        skillScore = reward - (abs(y - 9) + abs(x - 0))
        skillSample = skillScore / self.rewardValues[terrain]

        difference = skillSample - self.skills[terrain]
        if abs(difference) < .01:
            # Sample agrees with the estimate: extend the convergence streak
            self.streaks[terrain] += 1
            if self.streaks[terrain] >= 25:
                self.completed.append(terrain)
        else:
            # Estimate still moving: reset the streak and blend in the sample
            self.streaks[terrain] = 0
            self.skills[terrain] = (1 - self.alpha) * self.skills[terrain] + \
                                   self.alpha * skillSample
        # print(self.skills)
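

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module: it exercises the
    # class on a tiny two-state world. The terrain name 'mountain' appears in
    # the code above; every numeric value here is an assumption, and the
    # sibling state module is assumed to expose state((x, y), terrain),
    # getPosition(), and getTerrainType() as used throughout this file.
    s0 = State.state((9, 1), 'mountain')
    s1 = State.state((9, 0), 'mountain')
    observed = [(s0, 'north', s1), (s1, 'finish', s1)]
    mdp = EmpiricalMDP(observed,
                       rewardValues={'mountain': 100},
                       skills={'mountain': 0.5})
    print(mdp.getPossibleActions(s0))
    print(mdp.getTransitionStatesAndProbs(s0, 'north'))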