puff.py
#!/usr/bin/env python3
"""
PPO: Proximal Policy Optimization
Written by Patrick Coady (pat-coady.github.io)
PPO uses a loss function and gradient descent to approximate
Trust Region Policy Optimization (TRPO). See these papers for
details:
TRPO / PPO:
https://arxiv.org/pdf/1502.05477.pdf (Schulman et al., 2016)
Distributed PPO:
https://arxiv.org/abs/1707.02286 (Heess et al., 2017)
Generalized Advantage Estimation:
https://arxiv.org/pdf/1506.02438.pdf
And, also, this GitHub repo which was helpful to me during
implementation:
https://github.com/joschu/modular_rl
This implementation learns policies for continuous environments
in the OpenAI Gym (https://gym.openai.com/). Testing was focused on
the MuJoCo control tasks.
"""
import argparse
import os
import signal
from datetime import datetime

import gym
import gym_remote.client as grc  # only needed by the commented-out RemoteEnv path below
import numpy as np
import scipy.signal
from gym import wrappers  # only needed by the commented-out Monitor path in main()
from baselines.common.atari_wrappers import WarpFrame, FrameStack
from retro_contest.local import make

from policy import Policy
from value_function import NNValueFunction
from utils import Logger, Scaler
from sonic_util import AllowBacktracking, make_env, SonicDiscretizer


class GracefulKiller:
    """ Gracefully exit program on CTRL-C """
    def __init__(self):
        self.kill_now = False
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        self.kill_now = True


def init_gym():
    """
    Initialize the Sonic environment, return dimension of observation
    and action spaces.

    Returns: 3-tuple
        gym environment (object)
        number of observation dimensions (int, placeholder -- overridden
            in main(), see comments there)
        number of action dimensions (int)
    """
    # alternative environment constructors, kept for reference:
    # env = gym.make(env_name)
    # env = grc.RemoteEnv('tmp/sock')
    env = make(game='SonicAndKnuckles3-Genesis', state='AngelIslandZone.Act1')
    env = SonicDiscretizer(env)  # reduce Genesis button combos to a small discrete action set
    env = WarpFrame(env)  # grayscale and downsample frames
    # obs_dim = env.observation_space.shape[0]  # (continuous-env version)
    obs_dim = 0  # placeholder; main() hard-codes the flattened frame size
    # act_dim = env.action_space.shape[0]  # (continuous-env version)
    act_dim = env.action_space.n  # discrete action count after SonicDiscretizer
    return env, obs_dim, act_dim


def run_episode(env, policy, scaler, animate=False):
    """ Run single episode with option to animate

    Args:
        env: ai gym environment
        policy: policy object with sample() method
        scaler: scaler object, used to scale/offset each observation dimension
            to a similar range
        animate: boolean, True uses env.render() method to animate episode

    Returns: 4-tuple of NumPy arrays
        observes: shape = (episode len, obs_dim)
        actions: shape = (episode len, act_dim)
        rewards: shape = (episode len,)
        unscaled_obs: useful for training scaler, shape = (episode len, obs_dim)
    """
    obs = env.reset()
    observes, actions, rewards, unscaled_obs = [], [], [], []
    done = False
    step = 0.0
    scale, offset = scaler.get()
    scale[-1] = 1.0  # don't scale time step feature
    offset[-1] = 0.0  # don't offset time step feature
    while not done:
        if animate:  # render only when requested, per the docstring
            env.render()
        obs = obs.astype(np.float32).reshape((1, -1))
        obs = np.append(obs, [[step]], axis=1)  # add time step feature
        unscaled_obs.append(obs)
        obs = (obs - offset) * scale  # center and scale observations
        observes.append(obs)
        action = policy.sample(obs).reshape((1, -1)).astype(np.float32)
        print("action :", action)  # debug output
        means = policy.test_sample(obs)
        print("means :", means)  # debug output
        actions.append(action)
        obs, reward, done, _ = env.step(np.squeeze(action, axis=0))
        if not isinstance(reward, float):
            reward = float(reward)  # np.asscalar() is deprecated in modern NumPy
        rewards.append(reward)
        step += 1e-3  # increment time step feature (1e-3 keeps it O(1) over ~1000-step episodes)
    return (np.concatenate(observes), np.concatenate(actions),
            np.array(rewards, dtype=np.float64), np.concatenate(unscaled_obs))


def run_policy(env, policy, scaler, episodes):
    """ Run policy and collect data for a fixed number of episodes

    Args:
        env: ai gym environment
        policy: policy object with sample() method
        scaler: scaler object, used to scale/offset each observation dimension
            to a similar range
        episodes: total episodes to run

    Returns: list of trajectory dictionaries, list length = number of episodes
        'observes' : NumPy array of states from episode
        'actions' : NumPy array of actions from episode
        'rewards' : NumPy array of (un-discounted) rewards from episode
        'unscaled_obs' : NumPy array of unscaled states from episode
    """
    total_steps = 0
    trajectories = []
    for e in range(episodes):
        observes, actions, rewards, unscaled_obs = run_episode(env, policy, scaler)
        total_steps += observes.shape[0]
        trajectory = {'observes': observes,
                      'actions': actions,
                      'rewards': rewards,
                      'unscaled_obs': unscaled_obs}
        trajectories.append(trajectory)
    unscaled = np.concatenate([t['unscaled_obs'] for t in trajectories])
    scaler.update(unscaled)  # update running statistics for scaling observations
    return trajectories


def discount(x, gamma):
    """ Calculate discounted forward sum of a sequence at each point """
    return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]


def add_disc_sum_rew(trajectories, gamma):
    """ Adds discounted sum of rewards to all time steps of all trajectories

    Args:
        trajectories: as returned by run_policy()
        gamma: discount

    Returns:
        None (mutates trajectories dictionary to add 'disc_sum_rew')
    """
    for trajectory in trajectories:
        if gamma < 0.999:  # don't scale for gamma ~= 1
            rewards = trajectory['rewards'] * (1 - gamma)
        else:
            rewards = trajectory['rewards']
        disc_sum_rew = discount(rewards, gamma)
        trajectory['disc_sum_rew'] = disc_sum_rew


def add_value(trajectories, val_func):
    """ Adds estimated value to all time steps of all trajectories

    Args:
        trajectories: as returned by run_policy()
        val_func: object with predict() method, takes observations
            and returns predicted state value

    Returns:
        None (mutates trajectories dictionary to add 'values')
    """
    for trajectory in trajectories:
        observes = trajectory['observes']
        values = val_func.predict(observes)
        trajectory['values'] = values


def add_gae(trajectories, gamma, lam):
    """ Add generalized advantage estimator.
    https://arxiv.org/pdf/1506.02438.pdf

    Args:
        trajectories: as returned by run_policy(), must include 'values'
            key from add_value().
        gamma: reward discount
        lam: lambda (see paper).
            lam=0 : use TD residuals
            lam=1 : A = Sum Discounted Rewards - V_hat(s)

    Returns:
        None (mutates trajectories dictionary to add 'advantages')
    """
    for trajectory in trajectories:
        if gamma < 0.999:  # don't scale for gamma ~= 1
            rewards = trajectory['rewards'] * (1 - gamma)
        else:
            rewards = trajectory['rewards']
        values = trajectory['values']
        # temporal differences
        tds = rewards - values + np.append(values[1:] * gamma, 0)
        advantages = discount(tds, gamma * lam)
        trajectory['advantages'] = advantages


def build_train_set(trajectories):
    """
    Args:
        trajectories: trajectories after processing by add_disc_sum_rew(),
            add_value(), and add_gae()

    Returns: 4-tuple of NumPy arrays
        observes: shape = (N, obs_dim)
        actions: shape = (N, act_dim)
        advantages: shape = (N,)
        disc_sum_rew: shape = (N,)
    """
    observes = np.concatenate([t['observes'] for t in trajectories])
    actions = np.concatenate([t['actions'] for t in trajectories])
    disc_sum_rew = np.concatenate([t['disc_sum_rew'] for t in trajectories])
    advantages = np.concatenate([t['advantages'] for t in trajectories])
    # normalize advantages to zero mean, unit variance (1e-6 avoids division by zero)
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)
    return observes, actions, advantages, disc_sum_rew


def main(num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar):
    """ Main training loop

    Args:
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym()
    env_name = "retro"
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    # obs_dim = 215041
    # hard-coded: 84 * 84 = 7056 pixels from WarpFrame, plus the time step feature
    obs_dim = 7057
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    # aigym_path = os.path.join('/tmp', env_name, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        # log_batch_stats(observes, actions, advantages, disc_sum_rew, episode)
        # (disabled: log_batch_stats is not defined in this file)
        policy.update(observes, actions, advantages)  # update policy
        val_func.fit(observes, disc_sum_rew)  # update value function
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    policy.close_sess()
    val_func.close_sess()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description=('Train policy on OpenAI Gym environment '
'using Proximal Policy Optimizer'))
#parser.add_argument('env_name', type=str, help='OpenAI Gym environment name')
parser.add_argument('-n', '--num_episodes', type=int, help='Number of episodes to run',
default=2000000)
parser.add_argument('-g', '--gamma', type=float, help='Discount factor', default=0.995)
parser.add_argument('-l', '--lam', type=float, help='Lambda for Generalized Advantage Estimation',
default=0.98)
parser.add_argument('-k', '--kl_targ', type=float, help='D_KL target value',
default=0.003)
parser.add_argument('-b', '--batch_size', type=int,
help='Number of episodes per training batch',
default=20)
parser.add_argument('-m', '--hid1_mult', type=int,
help='Size of first hidden layer for value and policy NNs'
'(integer multiplier of observation dimension)',
default=10)
parser.add_argument('-v', '--policy_logvar', type=float,
help='Initial policy log-variance (natural log of variance)',
default=-1.0)
args = parser.parse_args()
main(**vars(args))