import itertools
import random
import numpy as np
from collections import deque
import pdb
import tensorflow as tf
from toolz import accumulate
from utils import wrap_graph_c as wrap_graph
from utils import encode_one_hot


class Policy(object):
    def __init__(self):
        pass


class RandomPolicy(Policy):
    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, state):
        return np.random.choice(self.action_space, size=1)[0]


class PongAgent(object):
    def __init__(self):
        self.VALID_ACTIONS = [2, 3]
        self.MAX_LEN = 100  # Max capacity of experience buffer
        # Experience buffers: state_buffer stores raw observations,
        # internal_state_buffer stores preprocessed states that are fed to the policy
        self._state_buffer = deque(maxlen=self.MAX_LEN)
        self._internal_state_buffer = deque(maxlen=self.MAX_LEN)
        self._action_logits_buffer = deque(maxlen=self.MAX_LEN)
        self._action_buffer = deque(maxlen=self.MAX_LEN)
        self._reward_buffer = deque(maxlen=self.MAX_LEN)
        self._discounted_reward_buffer = deque(maxlen=self.MAX_LEN)

    def _preprocess(self, I):
        I = I[35:195]       # crop
        I = I[::2, ::2, 0]  # downsample by factor of 2
        I[I == 144] = 0     # erase background (background type 1)
        I[I == 109] = 0     # erase background (background type 2)
        I[I != 0] = 1       # everything else (paddles, ball) just set to 1
        return I.astype(float).ravel()

    def act(self, state):
        raise NotImplementedError

    @property
    def num_episodes(self):
        assert len(self._state_buffer) == len(self._action_buffer) == len(self._reward_buffer)
        return len(self._state_buffer)

    def discount(self, rs, discount_rate):
        discounted = accumulate(lambda prev, curr: discount_rate * prev + curr, reversed(rs))
        return np.fromiter(discounted, 'float')[::-1]

    def partition_rewards(self, rewards):
        '''Partition an episode of rewards into a list of sequences, where each sequence
        ends at a step with nonzero reward (i.e. when a point is scored).
        Returns:
            list of arrays: each array is a reward sequence within the episode
        '''
        rewards = np.array(rewards)
        bounds = np.zeros(np.count_nonzero(rewards) + 1, dtype=int)
        bounds[1:] = np.argwhere(rewards).ravel() + 1
        return [rewards[bounds[i]:bounds[i + 1]] for i in range(len(bounds) - 1)]

    def discount_rewards(self, rewards, discount_rate):
        rewards = self.partition_rewards(rewards)
        discounted_seqs = [self.discount(rs, discount_rate) for rs in rewards]
        return np.concatenate(discounted_seqs)
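
    # Worked example (illustrative values, not from a real rollout): an episode with
    # rewards [0, 0, 1, 0, -1] is split by partition_rewards into [0, 0, 1] and [0, -1];
    # each sequence is then discounted back from its scoring step, so with gamma = 0.99
    # discount_rewards returns approximately
    #     [0.9801, 0.99, 1.0, -0.99, -1.0]
    # i.e. every step is credited with the (discounted) outcome of the point it led to.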

    def normalize(self, data):
        data -= np.mean(data)
        data /= np.std(data)
        return data


class RandomAgent(PongAgent):
    def act(self, state):
        a = np.random.choice(self.VALID_ACTIONS, size=1)[0]
        aprob = [.5] * 2  # uniform probabilities over the two valid actions
        return aprob, a


class PGAgent(PongAgent):
    def __init__(self, g, sess, state_dim, action_net_ctor, action_net_params, optimizer_params,
                 gamma=.99, epsilon=1, optimizer=tf.train.RMSPropOptimizer):
        super(PGAgent, self).__init__()
        self.state_dim = state_dim
        self.gamma = gamma  # rate at which to discount rewards
        # Exploration parameter: epsilon=0 always takes a uniformly random action,
        # epsilon=1 always samples from the policy distribution (see act())
        self.epsilon = epsilon
        assert sess.graph == g
        self.g = g
        self.sess = sess
        self.optimizer = optimizer(**optimizer_params)
        # Create input ops, action network, and loss
        self._create_variables()
        self._build_action_network(action_net_ctor, action_net_params)
        self._calculate_loss()
        # Housekeeping: initialize variables
        self._initialize()

    @wrap_graph
    def _initialize(self):
        init_op = tf.initialize_all_variables()
        self.sess.run(init_op)

    @wrap_graph
    def _create_variables(self):
        with tf.name_scope("inputs"):
            self.states = tf.placeholder(tf.float32, (None, self.state_dim), name="states")
            self.actions = tf.placeholder(tf.int32, shape=[None], name="actions")
            self.rewards = tf.placeholder(tf.float32, shape=[None], name="rewards")
            self.discounted_r = tf.placeholder(tf.float32, shape=[None], name="discounted_rewards")
            self.labels = tf.placeholder(tf.float32, shape=[None, 2], name="labels")

    @wrap_graph
    def _build_action_network(self, action_net_ctor, ctor_params):
        with tf.variable_scope("action_network"):
            self.action_net = action_net_ctor(self.states, **ctor_params)
            self.action_logits, self.action_probs = self.action_net

    @property
    @wrap_graph
    def x_entropy_loss(self):
        if not hasattr(self, "_x_entropy_loss"):
            with tf.name_scope("loss_functions"):
                # self.labels is a one-hot [None, 2] tensor, so use the dense softmax
                # cross entropy rather than the sparse variant (which expects int class ids)
                self._x_entropy_loss = tf.nn.softmax_cross_entropy_with_logits(
                    self.action_logits, self.labels, name="x_entropy_loss")
        return self._x_entropy_loss

    def advantage(self):
        '''Advantage function
        Naive advantage: the discounted rewards themselves
        TODO - implement variance reduction:
            - normalize rewards
            - baseline
            - value function
            - actor-critic
        '''
        return self.discounted_r
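
    # A first variance-reduction step from the TODO above would be to feed normalized
    # (zero-mean, unit-variance) discounted rewards through self.discounted_r at training
    # time, reusing the normalize() helper on PongAgent. A sketch, not wired in here:
    #     advantages = agent.normalize(np.copy(discounted_rewards))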

    @wrap_graph
    def _calculate_loss(self):
        '''Policy Gradient loss
        TODO:
            change calculation of logprobs to log(<label inputs, action logits>)
            regularization
            gradient clipping
            advantage estimation
            actor-critic
        '''
        with tf.name_scope("pg_gradient"):
            self.actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="action_network")
            # Logits
            with tf.variable_scope("action_network", reuse=True), tf.name_scope("loss"):
                # Calculate log probabilities from action scores
                probs = self.probs = tf.nn.softmax(self.action_logits)
                self.logprob = tf.log(tf.reduce_max(tf.mul(self.labels, probs), reduction_indices=1), name="logprob")
                # Modulate logprobs by advantage
                self.logprob_advantage = self.logprob * self.discounted_r
                # Sum across time
                self.loss = -tf.reduce_sum(self.logprob_advantage)
                # Check
                # self.loss_x_ent = tf.reduce_sum(self.x_entropy_loss * self.discounted_r)
            with tf.name_scope("gradient_calc"):
                self.gradients = self.optimizer.compute_gradients(self.loss, self.actor_vars)
            # with tf.name_scope("summaries"):
            #     tf.scalar_summary("actor_loss", self.pg_loss)
            #     # Gradient summaries
            #     for grad, var in self.gradients:
            #         tf.histogram_summary(var.name, var)
            #         if grad is not None:
            #             tf.histogram_summary(var.name + '/gradients', grad)
            #     self.summarize = tf.merge_all_summaries()
            with tf.name_scope("train"):
                self.train_op = self.optimizer.apply_gradients(self.gradients)
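
    # The loss built above is the (negative) REINFORCE objective: for a batch of steps it is
    #     loss = - sum_t log pi(a_t | s_t) * R_t
    # where R_t is the discounted return fed in through self.discounted_r, so gradient
    # descent on this loss is gradient ascent on expected discounted reward.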

    def act(self, state):
        """Policy implementation: given a (preprocessed) state, return action logits,
        action probabilities, and the chosen action.
        Args:
            state: should be in preprocessed form
        Returns:
            3-tuple of (action logits, action probabilities, action)
        """
        # Preprocess raw observation
        # state = self._preprocess(state)
        if len(state.shape) == 1:
            state = state.reshape((1, self.state_dim))
        # Run policy to get action logits and probabilities in a single forward pass
        action_logits, action_probs = self.sess.run(
            [self.action_logits, self.action_probs], feed_dict={self.states: state})
        action_probs = action_probs[0]
        if np.random.uniform() > self.epsilon:
            action = np.random.choice(self.VALID_ACTIONS)
        else:
            action = np.random.choice(self.VALID_ACTIONS, p=action_probs)
        # np.argmax(action_logits) + 2  # map to discrete action 2 (up) or 3 (down)
        return action_logits, action_probs, action

    def run_trajectory(self, env):
        states, prep_states = [], []
        action_logits, actions = [], []
        rewards = []
        # Start the simulation by getting the initial state
        state = env.reset()
        done = False
        while not done:
            # Agent action
            prep_state = self._preprocess(state)
            alog, aprobs, a = self.act(prep_state)
            # Advance the simulation wrt the agent's action and record the transition
            next_state, reward, done, info = env.step(a)
            states.append(state)
            prep_states.append(prep_state)
            action_logits.append(alog)
            actions.append(a)
            rewards.append(reward)
            # Update state
            state = next_state
        # Write the completed episode to the experience buffers
        self._state_buffer.append(np.array(states))
        self._internal_state_buffer.append(np.array(prep_states))
        self._action_logits_buffer.append(np.array(action_logits))
        self._action_buffer.append(np.array(actions))
        self._reward_buffer.append(np.array(rewards))
        self._discounted_reward_buffer.append(self.discount_rewards(rewards, self.gamma))
        return states, actions, rewards

    def rollout(self, N, env):
        '''Run N rollouts (trajectories).
        Returns a list of N (states, actions, rewards) tuples.
        '''
        trajectories = []
        for i in range(N):
            states, actions, rewards = self.run_trajectory(env)
            trajectories.append((states, actions, rewards))
        return trajectories

    @property
    def _episodes(self):
        return zip(self._state_buffer, self._internal_state_buffer, self._action_logits_buffer,
                   self._action_buffer, self._reward_buffer, self._discounted_reward_buffer)

    def _in_order_sampler(self, idx_len, sample_size):
        idx_iter = itertools.cycle(np.random.permutation(idx_len))
        while True:
            yield list(itertools.islice(idx_iter, sample_size))

    def _random_sampler(self, idx_len, sample_size):
        while True:
            yield random.sample(range(idx_len), sample_size)
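
    # The two samplers differ in coverage: _in_order_sampler walks one fixed random
    # permutation in a cycle, so every experience is visited once per pass (epoch-style),
    # while _random_sampler draws a fresh index sample on every yield, so individual
    # experiences may repeat across batches or never be drawn at all.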

    def sample_experiences(self, sample_size, use_all=False):
        '''Generator that draws sample_size experiences at a time from the replay buffer
        '''
        # Concatenate episodes into single arrays of states, actions, and discounted rewards
        fields = zip(*list(self.batch_iter()))
        prep_states, actions, discounted_rewards = [np.concatenate(eps) for eps in fields]
        idx_len = len(prep_states)
        if use_all:
            sampler = self._in_order_sampler(idx_len, sample_size)
        else:
            sampler = self._random_sampler(idx_len, sample_size)
        while True:
            sample_idx = next(sampler)
            yield prep_states[sample_idx], actions[sample_idx], discounted_rewards[sample_idx]
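
    # Example usage (sketch), after some rollouts have filled the buffers:
    #     batches = agent.sample_experiences(sample_size=32)
    #     states, actions, discounted_rewards = next(batches)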

    def shuffle_episodes(self):
        '''Return all episodes from the experience buffer in random order.
        Returns a list of (states, prep_states, action_logits, actions, rewards,
        discounted_rewards) tuples.
        '''
        episodes = list(self._episodes)
        indices = np.random.permutation(len(episodes))
        shuffled = [episodes[i] for i in indices]
        return shuffled

    def train(self, batch):
        '''Accumulate the gradient over a batch of episodes and apply it to the weights.
        batch is a tuple of states, labels (one-hot actions), and discounted rewards;
        its length should be the length of the concatenated trajectories of the episodes.
        (Not yet implemented.)
        '''
        pass

    def batch_iter(self):
        shuffled_eps = self.shuffle_episodes()
        for eps in shuffled_eps:
            _, prep_state, _, actions, _, discounted_rewards = eps
            yield prep_state, actions, discounted_rewards
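

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original training code): it assumes gym is
    # installed, the TF-0.x-era API used above is available, and utils.wrap_graph_c acts
    # as a graph-context decorator. simple_policy_net is a hypothetical two-layer network
    # matching the (logits, probs) contract that _build_action_network expects.
    import gym

    def simple_policy_net(states, state_dim, hidden_units=200, num_actions=2):
        # Hypothetical fully connected policy: states -> hidden -> (action logits, action probs)
        W1 = tf.get_variable("W1", [state_dim, hidden_units],
                             initializer=tf.random_normal_initializer(stddev=0.01))
        b1 = tf.get_variable("b1", [hidden_units], initializer=tf.constant_initializer(0.0))
        h = tf.nn.relu(tf.matmul(states, W1) + b1)
        W2 = tf.get_variable("W2", [hidden_units, num_actions],
                             initializer=tf.random_normal_initializer(stddev=0.01))
        b2 = tf.get_variable("b2", [num_actions], initializer=tf.constant_initializer(0.0))
        logits = tf.matmul(h, W2) + b2
        return logits, tf.nn.softmax(logits)

    state_dim = 80 * 80  # size of the flattened frame produced by PongAgent._preprocess
    g = tf.Graph()
    sess = tf.Session(graph=g)
    agent = PGAgent(g, sess, state_dim,
                    action_net_ctor=simple_policy_net,
                    action_net_params={"state_dim": state_dim},
                    optimizer_params={"learning_rate": 1e-3})
    env = gym.make("Pong-v0")
    # Collect a couple of trajectories into the experience buffers
    trajectories = agent.rollout(2, env)
    print("Collected %d episodes" % agent.num_episodes)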