# Agents.py
from collections import Counter
import numpy as np
import utilities
"""
Model-Free Agent class
"""
class RLAgent(object):
def __init__(self, Maze, alpha, gamma, epsilon, action_cost, learning):
self.Maze = Maze
self.action_cost = action_cost
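        # per-position visit counts, and Q-values keyed by (state, action);
        # Counter gives a default of 0 for unseen keys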
self.posCounter, self.qValues = Counter(), Counter()
self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon
self.learning_mode = learning
self.position, self.orientation = None, None
def change_maze(self, maze):
"""
Changes agent's environment and sets agent position to the new environment's start state
"""
self.Maze = maze
self.position = maze.start[0]
self.orientation = maze.start[1]
def reset_agent(self):
"""
Reset Agent's state to start state
"""
self.position = self.Maze.start[0]
self.orientation = self.Maze.start[1]
self.posCounter[self.position] += 1
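        # age every state's exploration value; visiting a state resets it
        # to 0 (see update_agent_state)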
for state in self.Maze.exploreVal.keys():
self.Maze.exploreVal[state] += 1
def update_agent_state(self, next_state, action):
"""
Update agent's orientation and state, given an action
"""
self.position = next_state
        # moving forwards: face the direction of travel;
        # moving backwards: keep facing away from the direction of travel
        if utilities.is_forwards(self.orientation, action):
            self.orientation = action
        else:
            self.orientation = utilities.oppositeAction(action)
self.Maze.exploreVal[self.position] = 0
self.posCounter[self.position] += 1
def finished_maze(self):
return self.Maze.is_terminal(self.position)
def reset_Qvalues(self):
self.qValues = Counter()
    def get_action_cost(self, action):
        # costs are stored as positive magnitudes, so negate; the max is taken
        # over the components of the (possibly composite) action
        return -max(self.action_cost[a] for a in action)
    def get_value(self, state):
        """
        Returns max_action Q(state, action), where the max is over legal actions.
        """
        legalActions = self.Maze.get_legal_dirs(state)
        if not legalActions:
            return 0.0
        return max(self.qValues[(state, act)] for act in legalActions)
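    # abstract hooks: the learning subclasses (QLAgent, SarsaAgent) implement
    # take_action/update_Qvalues; the exploration subclasses implement get_action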
def take_action(self, action):
pass
def get_action(self):
pass
def update_Qvalues(self):
pass
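# Inferred interfaces of the imported collaborators, as used in this module
# (documented here for readability; they are not defined in this file):
#   Maze: start, exploreVal, take_action(pos, action), get_legal_dirs(state),
#         is_terminal(state), get_value(state), get_discount_value(state),
#         get_exploration_bonus(state)
#   utilities: is_forwards(orientation, action), oppositeAction(action),
#              rand_bool(p), rand_choice(seq)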
"""
Q-Learning Agent
"""
class QLAgent(RLAgent):
def take_action(self, action):
new_state, taken_action = self.Maze.take_action(self.position, action)
self.update_Qvalues(taken_action, new_state)
self.update_agent_state(new_state, taken_action)
    def update_Qvalues(self, action, nextPos):
        """
        Update Q-values based on learning_mode
        """
        currVal = self.qValues[(self.position, action)]
        nextVal = self.get_value(nextPos)
        if self.learning_mode == 1:    # std
            reward = self.Maze.get_value(nextPos) + self.get_action_cost(action)
        elif self.learning_mode == 2:  # RD: uses the maze's discounted value
            reward = self.Maze.get_discount_value(nextPos) + self.get_action_cost(action)
        elif self.learning_mode == 3:  # ER: adds an exploration bonus
            reward = self.Maze.get_value(nextPos) + self.get_action_cost(action) + \
                self.Maze.get_exploration_bonus(nextPos)
        elif self.learning_mode == 4:  # RDER: discounted value + exploration bonus
            reward = self.Maze.get_discount_value(nextPos) + self.get_action_cost(action) + \
                self.Maze.get_exploration_bonus(nextPos)
        else:
            raise ValueError("unknown learning_mode: %r" % self.learning_mode)
        # Q-learning update: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
        self.qValues[(self.position, action)] = \
            currVal + self.alpha * (reward + self.gamma * nextVal - currVal)
"""
SARSA Agent
"""
class SarsaAgent(RLAgent):
def __init__(self, Maze, alpha, gamma, epsilon, action_cost, learning):
super(SarsaAgent, self).__init__(Maze, alpha, gamma, epsilon, action_cost, learning)
self.prev_state, self.prev_action = None, None
def reset_agent(self):
super(SarsaAgent, self).reset_agent()
self.prev_state, self.prev_action = None, None
def update_agent_state(self, next_state, action):
self.prev_state = self.position
self.prev_action = action
super(SarsaAgent, self).update_agent_state(next_state, action)
    def take_action(self, action):
        new_state, taken_action = self.Maze.take_action(self.position, action)
        # SARSA needs the next state-action pair, so the update for the previous
        # transition is delayed until the second move of the episode; taken_action
        # departs from self.position, so that is the pair we bootstrap from
        if self.prev_state is not None and self.prev_action is not None:
            self.update_Qvalues(self.prev_state, self.prev_action, self.position, taken_action)
        # on finishing the maze, update the final transition with "exit" as the
        # (non-existent) next action, so no bootstrapping occurs
        if self.Maze.is_terminal(new_state):
            self.update_Qvalues(self.position, taken_action, new_state, "exit")
        self.update_agent_state(new_state, taken_action)
    def update_Qvalues(self, s1, a1, s2, a2):
        """
        Update Q-values based on learning_mode
        """
        currVal = self.qValues[(s1, a1)]
        # "exit" marks the terminal step: there is no next action to bootstrap from
        nextVal = 0 if a2 == "exit" else self.qValues[(s2, a2)]
        if self.learning_mode == 1:    # std
            reward = self.Maze.get_value(s2) + self.get_action_cost(a1)
        elif self.learning_mode == 2:  # RD: uses the maze's discounted value
            reward = self.Maze.get_discount_value(s2) + self.get_action_cost(a1)
        elif self.learning_mode == 3:  # ER: adds an exploration bonus
            reward = self.Maze.get_value(s2) + self.get_action_cost(a1) + \
                self.Maze.get_exploration_bonus(s2)
        elif self.learning_mode == 4:  # RDER: discounted value + exploration bonus
            reward = self.Maze.get_discount_value(s2) + self.get_action_cost(a1) + \
                self.Maze.get_exploration_bonus(s2)
        else:
            raise ValueError("unknown learning_mode: %r" % self.learning_mode)
        # SARSA update: Q(s,a) += alpha * (r + gamma * Q(s',a') - Q(s,a))
        self.qValues[(s1, a1)] = currVal + self.alpha * (reward + self.gamma * nextVal - currVal)
"""
Epsilon Greedy Agent
"""
class EpsilonGreedyAgent(RLAgent):
    def get_probability(self, action):
        """
        Probability that the epsilon-greedy policy selects `action`
        """
        legal_actions = self.Maze.get_legal_dirs(self.position)
        lst = [(self.qValues[(self.position, a)], a) for a in legal_actions]
        best_action = max(lst)[1]
        if action == best_action:
            return 1 - self.epsilon + self.epsilon / len(legal_actions)
        return self.epsilon / len(legal_actions)
def get_action(self):
"""
Compute epsilon greedy move
"""
legal_actions = self.Maze.get_legal_dirs(self.position)
if utilities.rand_bool(self.epsilon):
return utilities.rand_choice(legal_actions)
        # pair each legal move with its Q-value; ties are broken randomly below
lst = [(self.qValues[(self.position, action)], action) for action in legal_actions]
best = max(lst)[0]
tiedMoves = [move for val, move in lst if val == best]
return utilities.rand_choice(tiedMoves)
"""
Epsilon Soft Agent (Boltzmann Distribution)
"""
class EpsilonSoftAgent(RLAgent):
    def softmax(self, lst):
        """Compute softmax values for the scores in lst"""
        # shift by the max for numerical stability;
        # e.g. softmax([1.0, 2.0]) is approximately [0.269, 0.731]
        exps = np.exp(np.array(lst) - np.max(lst))
        return exps / np.sum(exps)
    def get_probability(self, action):
        legal_actions = self.Maze.get_legal_dirs(self.position)
        boltz_values = self.softmax([self.qValues[(self.position, a)] for a in legal_actions])
        # mixture of the Boltzmann distribution and a uniform epsilon floor,
        # matching get_action: uniform with probability epsilon, else Boltzmann
        return (1 - self.epsilon) * boltz_values[legal_actions.index(action)] \
            + self.epsilon / len(legal_actions)
def get_action(self):
legal_actions = self.Maze.get_legal_dirs(self.position)
if utilities.rand_bool(self.epsilon):
return utilities.rand_choice(legal_actions)
boltz_values = self.softmax([self.qValues[(self.position, a)] for a in legal_actions])
return np.random.choice(legal_actions, p=boltz_values)
"""
Epsilon Greedy Q-Learning
"""
class GreedyQLAgent(QLAgent, EpsilonGreedyAgent):
pass
"""
Epsilon Soft Q-learning
"""
class SoftQLAgent(QLAgent, EpsilonSoftAgent):
pass
"""
Epsilon Greedy SARSA
"""
class GreedySarsaAgent(SarsaAgent, EpsilonGreedyAgent):
pass
"""
Epsilon Soft SARSA
"""
class SoftSarsaAgent(SarsaAgent, EpsilonSoftAgent):
pass
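
if __name__ == "__main__":
    # Minimal usage sketch (hypothetical): this module does not define Maze or
    # utilities, so the constructor arguments below are assumptions, not part
    # of the original code. A compatible Maze must expose the interface noted
    # after RLAgent above.
    #
    # from Maze import Maze
    # maze = Maze(...)  # construct your environment here
    # agent = GreedyQLAgent(maze, alpha=0.5, gamma=0.9, epsilon=0.1,
    #                       action_cost=..., learning=1)
    # for episode in range(100):
    #     agent.reset_agent()
    #     while not agent.finished_maze():
    #         agent.take_action(agent.get_action())
    pass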