"""
Policy Gradients
1. Sample paths.
2. Process paths (compute advantage, baseline, rewards, etc)
3. Run the paths through the policy (function approximator)
4. Compute gradients/update policy model weights
5. Profit?!?!
How we optimize the policy
--------------------------
L(theta) = sum from t=0 to T-1 of log policy(action_t | state_t, theta) * A_t
R_t = sum from u=t to T of gamma^(u-t) * reward_u
B_t = E[ sum from u=t to T of gamma^(u-t) * reward_u | state_t ]
A_t = R_t - B_t
R_t = return (the discounted sum of rewards from step t onward)
A_t = advantage
B_t = baseline
theta = parameters of our policy, most likely neural network weights.
The baseline can be thought of as the value function (V). When we evaluate the baseline
of a state, we're predicting how good our future returns will be given our current state.
So, intuitively, if A_t > 0 the path we sampled is better than the expectation over
paths from the current state. Likewise, if A_t < 0, it's worse. Concretely, if A_t > 0 we want
more paths like that; if A_t < 0 we want fewer paths like that. Theta is updated during training
to reflect this.
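A tiny worked example (using gamma = 1 for readability; the numbers are made up):
    rewards observed from step t onward = [1, 0, 2]  ->  R_t = 3
    baseline prediction B_t = 2                      ->  A_t = 3 - 2 = 1
Since A_t > 0, the update makes the sampled actions more likely in that state.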
Types of parameterized policies
-------------------------------
Map s (state) to an output vector u
1. If the action is from a discrete set, the network maps s to a vector of probabilities (softmax)
2. If the action is continuous, then we map s to the mean/variance of a Gaussian distribution
(diagonal covariance that does not depend on s)
3. If the action is binary valued, we use a single output: the probability of outputting 1
(although we could also just treat it as case 1 with two actions). A short sketch of
cases 1 and 2 follows.
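An illustrative TF1-style sketch (case 1 is what CategoricalPolicy below implements;
case 2 is not implemented in this file):
    probs   = fully_connected(h, n_actions, activation_fn=tf.nn.softmax)  # case 1: discrete
    mean    = fully_connected(h, action_dim, activation_fn=None)          # case 2: continuous mean
    log_std = tf.Variable(tf.zeros(action_dim))                           # state-independent (co)variance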
TODO: implement baseline
TODO: implement generalized advantage estimation
"""
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
from six.moves import range
from gym.spaces import Box, Discrete
from scipy.signal import lfilter
import gym
import tensorflow as tf
import numpy as np
import argparse


def flatten_space(space):
    if isinstance(space, Box):
        return np.prod(space.shape)
    elif isinstance(space, Discrete):
        return space.n
    else:
        raise ValueError("Env must be either Box or Discrete.")


def discount_cumsum(x, gamma):
    # discounted reverse cumulative sum,
    # e.g. discount_cumsum([1., 1., 1.], 0.99) -> [2.9701, 1.99, 1.]
    return lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
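

# Hedged sketch of generalized advantage estimation (the second TODO in the module
# docstring). It is not wired into PolicyOptimizer below; `values` would come from a
# learned baseline, which this file does not implement, and the function name and
# signature are assumptions rather than part of the original code.
def gae_advantages(rewards, values, gamma, lam):
    # TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), treating V after the
    # final step as 0; expects len(values) == len(rewards).
    deltas = rewards + gamma * np.append(values[1:], 0) - values
    # A_t = sum over u >= t of (gamma * lam)^(u - t) * delta_u
    return discount_cumsum(deltas, gamma * lam)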


class CategoricalPolicy(object):
    def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
        # Placeholder Inputs
        self._observations = tf.placeholder(tf.float32, shape=[None, in_dim], name="observations")
        self._actions = tf.placeholder(tf.int32, name="actions")
        self._advantages = tf.placeholder(tf.float32, name="advantages")

        self._opt = optimizer
        self._sess = session

        h1 = tf.contrib.layers.fully_connected(self._observations, hidden_dim, activation_fn=tf.tanh)
        probs = tf.contrib.layers.fully_connected(h1, out_dim, activation_fn=tf.nn.softmax)

        # I believe this is faster if on the CPU
        with tf.device("/cpu:0"):
            # NOTE: Doesn't currently work because the gradient of gather_nd is not yet implemented
            # inds = tf.transpose(tf.pack([tf.range(tf.shape(probs)[0]), self._actions]))
            # log_lik = tf.log(tf.gather_nd(probs, inds))
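            # Row-major flattening: element (i, a_i) of `probs` sits at index
            # i * n_actions + a_i in the flattened tensor, so a plain gather() can
            # pick out the probability of each action that was actually taken.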
            idxs_flattened = tf.range(0, tf.shape(probs)[0]) * tf.shape(probs)[1] + self._actions
            probs_vec = tf.gather(tf.reshape(probs, [-1]), idxs_flattened)
            log_lik = tf.log(probs_vec + 1e-8)

        act_op = probs[0, :]
        surr_loss = -tf.reduce_mean(log_lik * self._advantages, name="loss_op")

        grads_and_vars = self._opt.compute_gradients(surr_loss)
        train_op = self._opt.apply_gradients(grads_and_vars, name="train_op")

        self._act_op = act_op
        self._loss_op = surr_loss
        self._train_op = train_op

    def act(self, observation):
        # expect observation to have shape (1, in_dim)
        a = self._sess.run(self._act_op, feed_dict={self._observations: observation})
        cs = np.cumsum(a)
        # invert the CDF: draw a single uniform sample and count how many
        # cumulative probabilities fall below it
        idx = int(np.sum(cs < np.random.rand()))
        return idx

    def train(self, observations, actions, advantages):
        loss, _ = self._sess.run(
            [self._loss_op, self._train_op],
            feed_dict={self._observations: observations,
                       self._actions: actions,
                       self._advantages: advantages})
        return loss
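

# Hedged sketch of the missing baseline (the first TODO in the module docstring): a
# ridge-regularized least-squares fit of the discounted returns against simple
# observation features, similar in spirit to the linear feature baselines used in
# other policy-gradient codebases. The class name, feature choice, and interface
# (predict(path) / fit(paths)) are assumptions; PolicyOptimizer below still runs
# with a baseline of 0.
class LinearBaseline(object):
    def __init__(self, reg=1e-5):
        self._coeffs = None
        self._reg = reg

    def _features(self, path):
        obs = path["observations"]
        t = np.arange(len(obs), dtype=np.float64).reshape(-1, 1) / 100.0
        return np.concatenate([obs, obs ** 2, t, t ** 2, np.ones_like(t)], axis=1)

    def predict(self, path):
        # before the first fit, fall back to a baseline of 0
        if self._coeffs is None:
            return np.zeros(len(path["rewards"]))
        return self._features(path).dot(self._coeffs)

    def fit(self, paths):
        # regress the already-computed discounted returns on the features
        feats = np.concatenate([self._features(p) for p in paths])
        returns = np.concatenate([p["returns"] for p in paths])
        reg_eye = self._reg * np.eye(feats.shape[1])
        self._coeffs = np.linalg.solve(feats.T.dot(feats) + reg_eye, feats.T.dot(returns))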


class PolicyOptimizer(object):
    def __init__(self, env, policy, baseline, n_iter, n_episode, path_length,
                 gamma=.99):
        self.policy = policy
        self.baseline = baseline
        self.env = env
        self.n_iter = n_iter
        self.n_episode = n_episode
        self.path_length = path_length
        self.gamma = gamma

    def sample_path(self):
        obs = []
        actions = []
        rewards = []
        ob = self.env.reset()

        for _ in range(self.path_length):
            a = self.policy.act(ob.reshape(1, -1))
            next_ob, r, done, _ = self.env.step(a)
            obs.append(ob)
            actions.append(a)
            rewards.append(r)
            ob = next_ob
            if done:
                break

        return dict(
            observations=np.array(obs),
            actions=np.array(actions),
            rewards=np.array(rewards),
        )

    def process_paths(self, paths):
        for p in paths:
            # TODO: compute baseline
            # b = self.baseline.predict(p)
            b = 0
            r = discount_cumsum(p["rewards"], self.gamma)
            a = r - b

            p["returns"] = r
            # p["advantages"] = (a - a.mean()) / (a.std() + 1e-8)  # normalize
            p["advantages"] = a
            p["baselines"] = b

        obs = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        rewards = np.concatenate([p["rewards"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])

        # TODO: fit baseline
        # self.baseline.fit(paths)

        return dict(
            observations=obs,
            actions=actions,
            rewards=rewards,
            advantages=advantages,
        )

    def train(self):
        for i in range(1, self.n_iter + 1):
            paths = []
            for _ in range(self.n_episode):
                paths.append(self.sample_path())

            data = self.process_paths(paths)
            loss = self.policy.train(data["observations"], data["actions"], data["advantages"])
            avg_return = np.mean([sum(p["rewards"]) for p in paths])
            print("Iteration {}: Loss = {}, Average Return = {}".format(i, loss, avg_return))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_iter', default=100, type=int, help='number of iterations')
    parser.add_argument('--n_episode', default=10, type=int, help='number of episodes per iteration')
    parser.add_argument('--path_length', default=1000, type=int, help='maximum number of steps per episode')
    parser.add_argument('--learning_rate', default=0.03, type=float, help='learning rate for the Adam optimizer')
    parser.add_argument('--env', default='CartPole-v0', help='gym environment for training (CategoricalPolicy requires a Discrete action space)')
    parser.add_argument('--algorithm', default='VPG', help='algorithm identifier')
    parser.add_argument('--outdir', default='vpg', type=str, help='output directory where results are saved (/tmp/{outdir}-{env})')
    parser.add_argument('--upload', action='store_true', help='upload results via the OpenAI Gym API')
    parser.add_argument('--seed', default=0, type=int, help='random seed')
    args = parser.parse_args()

    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    env = gym.make(args.env)
    outdir = '/tmp/' + args.outdir + '-' + args.env
    # env.monitor.start(outdir, force=True)
    print("******* WILL SAVE RESULTS TO", outdir, " *******")

    sess = tf.Session()

    in_dim = flatten_space(env.observation_space)
    out_dim = flatten_space(env.action_space)
    hidden_dim = 8

    opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
    policy = CategoricalPolicy(in_dim, out_dim, hidden_dim, opt, sess)
    po = PolicyOptimizer(env, policy, 0, args.n_iter, args.n_episode, args.path_length)

    sess.run(tf.global_variables_initializer())

    # train the policy optimizer
    po.train()
    # env.monitor.close()

    # make sure to set up your OPENAI_GYM_API_KEY environment variable
    if args.upload:
        gym.upload(outdir, algorithm_id=args.algorithm)