def __init__(self, env: gym.Env, actor: callable, critic: callable,
             preprocess: Model = None, memory=None, heads=2, replay_start=2000):
    self.replay_start = replay_start
    assert isinstance(env.observation_space, spaces.Box), 'observation space has to be continuous'
    assert isinstance(env.action_space, spaces.Box), 'action space has to be continuous'
    so = env.observation_space.shape
    sa = env.action_space.shape
    self.env = env
    self.memory = memory or ReplayMemory(1000000)
    self.heads = heads
    self.t = 0

    preprocess = preprocess or Model(
        lambda x: x,
        optimizer=tf.train.AdamOptimizer(.001),
        tracker=tf.train.ExponentialMovingAverage(1 - 0.001))

    def actors(x, noise=False):  # noise is accepted for API symmetry but unused here
        actions = [actor(x) for _ in range(heads)]
        return actions

    actors = Model(actors,
                   optimizer=tf.train.AdamOptimizer(0.0001),
                   tracker=tf.train.ExponentialMovingAverage(1 - 0.001))

    def critics(x, actions):
        qs = [critic(x, a) for a in actions]
        return qs

    critics = Model(critics,
                    optimizer=tf.train.AdamOptimizer(.001),
                    tracker=tf.train.ExponentialMovingAverage(1 - 0.001))

    def act(o: [so], noise=True):
        with arg_scope([layers.batch_norm], is_training=False):
            s = preprocess(o)
            a = actors(s, noise=noise)
            q = critics(s, a)
            layers.summarize_tensors([s, *a, *q])
            return a

    self.act = Function(act)

    def train_actor(o: [so]):
        s = preprocess(o)
        a0 = actors(s)
        q = critics(tf.stop_gradient(s), a0)
        loss = sum(-tf.reduce_mean(_, axis=0) for _ in q) / heads
        return loss

    bootstrap = False

    def train_critic(o: [so], a: [sa], r, t: tf.bool, o2: [so], i: tf.int32):
        s = preprocess(o)
        q2 = critics(s, [a for _ in range(heads)])
        s2 = preprocess.tracked(o2)
        qt = critics.tracked(s2, actors.tracked(s2))
        qtt = [tf.where(t, r, r + 0.99 * tf.stop_gradient(_)) for _ in qt]
        # def loss(_i, _q2, _qtt):
        #   sel = tf.equal(i, _i) if bootstrap else tf.fill(tf.shape(i), True)
        #   e = tf.where(sel, tf.square(_q2 - _qtt), tf.zeros_like(_q2))
        #   mse = tf.reduce_sum(e, axis=0) / tf.reduce_sum(tf.cast(sel, tf.float32), axis=0)
        #   return mse
        mse = sum(tf.reduce_mean(tf.square(_q2 - _qtt))
                  for _q2, _qtt in zip(q2, qtt)) / heads
        return mse

    def train(o: [so], a: [sa], r, t: tf.bool, o2: [so], i: tf.int32):
        al = train_actor(o)
        mse = train_critic(o, a, r, t, o2, i)
        return actors.minimize(al), critics.minimize(mse), preprocess.minimize([mse, al])

    self.train = Function(train,
                          prefetch_fctn=lambda: self.memory.sample_batch(),
                          prefetch_capacity=3,
                          async=True)

    def log_return(r: []):
        layers.summarize_tensor(r, 'Return')

    self.log_return = Function(log_return)
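# A minimal numpy sketch (not part of the class above) of how the multi-head
# critic loss is assembled: each head gets its own bootstrapped TD target
# r + 0.99 * qt (or just r at terminal transitions), and the per-head MSEs are
# averaged so the shared preprocessor sees a gradient of comparable scale
# regardless of the number of heads. All names here are illustrative only.
import numpy as np

def multi_head_critic_loss(q2s, qts, r, t, discount=0.99):
    """q2s, qts: lists (one entry per head) of per-sample Q estimates."""
    losses = []
    for q2, qt in zip(q2s, qts):
        target = np.where(t, r, r + discount * qt)  # no gradient flows into qt
        losses.append(np.mean((q2 - target) ** 2))
    return sum(losses) / len(losses)

# toy batch of 4 transitions, 2 heads
r = np.array([1., 0., 0., 1.])
t = np.array([False, False, True, False])
q2s = [np.array([.5, .2, .1, .9]), np.array([.4, .3, .0, 1.1])]
qts = [np.array([.6, .1, .3, .8]), np.array([.5, .2, .2, .7])]
print(multi_head_critic_loss(q2s, qts, r, t))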
class DdpgAgent:
    def __init__(self, env: gym.Env, actor: Model, critic: Model,
                 preprocess: Model = None, memory=None,
                 noise: callable = ornstein_uhlenbeck_noise,
                 replay_start=5000, training_repeats=1):
        self.warmup_time = replay_start
        self.training_repeats = training_repeats
        assert isinstance(env.action_space, spaces.Box), \
            "The environment's action space has to be continuous"
        sa = env.action_space.shape
        so = env.observation_space.shape
        self.memory = memory or ReplayMemory(1000000)
        self.env = env

        if not isinstance(actor, Model):
            actor = Model(actor,
                          optimizer=tf.train.AdamOptimizer(.0001),
                          tracker=tf.train.ExponentialMovingAverage(1 - .001))
        if not isinstance(critic, Model):
            critic = Model(critic,
                           optimizer=tf.train.AdamOptimizer(.001),
                           tracker=tf.train.ExponentialMovingAverage(1 - .001))
        preprocess = preprocess or Model(
            lambda x: x,
            optimizer=tf.train.AdamOptimizer(.001),
            tracker=tf.train.ExponentialMovingAverage(1 - 0.001))

        def act(o: [so], noisy=True):
            with arg_scope([layers.batch_norm], is_training=False):
                s = preprocess(o)
                a = actor(s, noise=noisy)
                a = smart_cond(noisy, lambda: noise(a), lambda: a)
                q = critic(s, a)
                layers.summarize_tensors([s, a, q])
                return a

        self.act = Function(act)

        def train_actor(o: [so]):
            s = preprocess(o)
            a0 = actor(s)
            q = critic(tf.stop_gradient(s), a0)  # stop gradients from critic to preprocessor
            loss = -tf.reduce_mean(q, axis=0)
            return loss

        def train_critic(o: [so], a: [sa], r, t: tf.bool, o2: [so]):
            s = preprocess(o)
            q2 = critic(s, a)
            s2 = preprocess.tracked(o2)
            qt = critic.tracked(s2, actor.tracked(s2))
            qtt = tf.where(t, r, r + 0.99 * qt)
            qtt = tf.stop_gradient(qtt)
            mse = tf.reduce_mean(tf.square(q2 - qtt), axis=0)
            return mse

        def train(o: [so], a: [sa], r, t: tf.bool, o2: [so]):
            al = train_actor(o)
            mse = train_critic(o, a, r, t, o2)
            return actor.minimize(al), critic.minimize(mse), preprocess.minimize([mse, al])

        self.train = Function(
            train,
            prefetch_fctn=lambda: self.memory.sample_batch()[:-1],
            prefetch_capacity=training_repeats,
            async=True)

        def log_return(r: []):
            layers.summarize_tensor(r, 'Return')

        self.log_return = Function(log_return, async=True)

        self.t = 0

    def play_episode(self):
        ob = self.env.reset()
        done = False
        R = 0
        self.act.initialize_local()  # reset local variables (e.g. the noise state)
        while not done:
            # a = act(ob, False) if np.random.rand() > .1 else acsp.sample()
            a = self.act(ob)
            ob2, r, done, info = self.env.step(a)
            self.memory.enqueue(ob, a, r, done)
            ob = ob2
            R += info.get('unwrapped_reward', r)

            debug_training = self.t == 512  # fail fast ;)
            if self.t > self.warmup_time or debug_training:
                for _ in range(1 if debug_training else self.training_repeats):
                    self.train()

            self.t += 1

        self.log_return(R)
        return R, {}
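# Hedged usage sketch for DdpgAgent, not taken from this file: it assumes the
# Model/Function machinery used above wraps plain callables (as the
# constructor's isinstance checks suggest) and that the Model call consumes
# the `noise` keyword passed in `act`. Network sizes and the environment are
# arbitrary illustrative choices.
import gym

env = gym.make('Pendulum-v0')

def actor_net(s):
    h = layers.fully_connected(s, 300)
    return layers.fully_connected(h, env.action_space.shape[0],
                                  activation_fn=tf.nn.tanh)

def critic_net(s, a):
    h = layers.fully_connected(tf.concat([s, a], axis=1), 300)
    return tf.squeeze(layers.fully_connected(h, 1, activation_fn=None), 1)

agent = DdpgAgent(env, actor_net, critic_net)
while agent.t < 100000:  # play_episode advances agent.t once per environment step
    R, _ = agent.play_episode()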
def __init__(self, n_actions, observation_shape, q_network: tt.Model,
             double_dqn=True, replay_start=50000, clip_td=False,
             logdir="", clip_gradients=10):
    self.logdir = logdir
    self.replay_start = replay_start
    self.n_actions = n_actions
    self.observation_shape = observation_shape
    self.memory = ShardedMemory()
    self.discount = .99
    self.step = 0

    @tt.model(
        tracker=tf.train.ExponentialMovingAverage(1 - .0005),  # TODO: replace with original weight freeze
        optimizer=tf.train.RMSPropOptimizer(6.25e-5, .95, .95, .01))
    def q_network(x):  # note: this local definition shadows the q_network argument
        # dueling architecture: shared conv torso, separate value and advantage streams
        x /= 255
        x = layers.conv2d(x, 32, 8, 4)
        x = layers.conv2d(x, 64, 4, 2)
        x = layers.conv2d(x, 64, 3, 1)
        x = layers.flatten(x)

        xv = layers.fully_connected(x, 512)
        val = layers.fully_connected(xv, 1, activation_fn=None)
        # val = tf.squeeze(val, 1)

        xa = layers.fully_connected(x, 512)
        adv = layers.fully_connected(xa, n_actions, activation_fn=None)

        q = val + adv - tf.reduce_mean(adv, axis=1, keep_dims=True)
        q = tf.identity(q, name='Q')
        return q, x

    def act(x: [observation_shape]):
        qs = q_network(x)
        a = tf.argmax(qs, axis=1)
        # qm = tf.reduce_max(qs, axis=1)
        return a, qs

    self.act = Function(act)

    def train_step(o: [observation_shape], a: (tf.int32, [[]]),
                   r, t: tf.bool, o2: [observation_shape]):
        q = q_network(o)
        # ac = tf.argmax(q, axis=1)

        # compute targets
        q2 = q_network.tracked(o2)

        if double_dqn:
            a2 = tf.argmax(q_network(o2), axis=1)  # yep, that's really the only difference
        else:
            a2 = tf.argmax(q2, axis=1)

        mask2 = tf.one_hot(a2, n_actions, 1.0, 0.0, axis=1)
        q_target = tf.where(t, r, r + self.discount * tf.reduce_sum(q2 * mask2, axis=1))
        q_target = tf.stop_gradient(q_target)

        # compute loss
        mask = tf.one_hot(a, n_actions, 1.0, 0.0, axis=1)
        qs = tf.reduce_sum(q * mask, axis=1, name='q_max')
        td = tf.subtract(q_target, qs, name='td')
        if clip_td:
            td = tf.clip_by_value(td, -.5, .5, name='clipped_td')
        # loss = tf.reduce_mean(tf.abs(td), axis=0, name='mae')
        # loss = tf.where(tf.abs(td) < 1.0, 0.5 * tf.square(td), tf.abs(td) - 0.5, name='mse_huber')
        loss = tf.reduce_mean(tf.square(td), axis=0, name='mse')

        gav = q_network.compute_gradients(loss)
        if clip_gradients:
            gav = [(tf.clip_by_norm(g, clip_gradients), v) for g, v in gav]
        loss_update = q_network.apply_gradients(gav)

        # logging
        layers.summarize_tensors([
            td, loss, r, o, a,
            tf.subtract(o2, o, name='state_dif'),
            tf.reduce_mean(tf.cast(t, tf.float32), name='frac_terminal'),
            tf.subtract(tf.reduce_max(q, 1, True), q, name='av_advantage')
        ])
        # layers.summarize_tensors(chi.activations())
        # layers.summarize_tensors(chi.gradients())
        return loss_update

    self.train_step = Function(
        train_step,
        prefetch_fctn=lambda: self.memory.sample_batch()[:-1],
        prefetch_capacity=10,
        prefetch_threads=3)

    def log_weights():
        v = q_network.trainable_variables()
        # print(f'log weights {v}')
        f = q_network.tracker_variables
        # print(f'log weights EMA {f}')
        difs = []
        for g in v:
            a = q_network.tracker.average(g)
            difs.append(tf.subtract(g, a, name=f'ema/dif{g.name[:-2]}'))
        layers.summarize_tensors(v + f + difs)

    self.log_weights = Function(log_weights, async=True)
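# As the comment in train_step says, the only difference between DQN and
# double DQN is which network picks the argmax action for the target. A
# self-contained numpy illustration (names are illustrative, not from this
# file): the online net selects, the target (tracked) net evaluates, which
# avoids the upward bias of letting one noisy estimate do both.
import numpy as np

def dqn_target(q2_target, q2_online, r, t, discount=0.99, double=True):
    a2 = np.argmax(q2_online if double else q2_target, axis=1)
    q_next = q2_target[np.arange(len(a2)), a2]
    return np.where(t, r, r + discount * q_next)

r = np.array([0., 1.])
t = np.array([False, True])
q2_target = np.array([[1.0, 2.0], [0.5, 0.1]])
q2_online = np.array([[3.0, 0.1], [0.2, 0.4]])
print(dqn_target(q2_target, q2_online, r, t, double=True))   # [0.99 1.  ] - online net picks action 0
print(dqn_target(q2_target, q2_online, r, t, double=False))  # [1.98 1.  ] - target net picks action 1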
def __init__(self, env: gym.Env, q_network: tt.Model, memory=None,
             double_dqn=True, replay_start=50000, annealing_time=1000000):
    self.annealing_time = annealing_time
    self.replay_start = replay_start
    so = env.observation_space.shape
    self.env = env
    self.memory = memory or chi.rl.ReplayMemory(1000000)

    def act(x: [so]):
        qs = q_network(x)
        a = tf.argmax(qs, axis=1)
        # qm = tf.reduce_max(qs, axis=1)
        layers.summarize_tensor(a)
        return a, qs

    self.act = Function(act)

    def train(o: [so], a: (tf.int32, [[]]), r, t: tf.bool, o2: [so]):
        q = q_network(o)
        # ac = tf.argmax(q, axis=1)

        # compute targets
        q2 = q_network.tracked(o2)

        if double_dqn:
            a2 = tf.argmax(q_network(o2), axis=1)  # yep, that's really the only difference
        else:
            a2 = tf.argmax(q2, axis=1)

        mask2 = tf.one_hot(a2, env.action_space.n, 1.0, 0.0, axis=1)
        q_target = tf.where(t, r, r + 0.99 * tf.reduce_sum(q2 * mask2, axis=1))
        q_target = tf.stop_gradient(q_target)

        # compute loss
        mask = tf.one_hot(a, env.action_space.n, 1.0, 0.0, axis=1)
        qs = tf.reduce_sum(q * mask, axis=1, name='q_max')
        td = tf.subtract(q_target, qs, name='td')
        # td = tf.clip_by_value(td, -10, 10)
        # loss = tf.reduce_mean(tf.abs(td), axis=0, name='mae')
        # loss = tf.where(tf.abs(td) < 1.0, 0.5 * tf.square(td), tf.abs(td) - 0.5, name='mse_huber')
        loss = tf.reduce_mean(tf.square(td), axis=0, name='mse')
        loss = q_network.minimize(loss)

        # logging
        layers.summarize_tensors([
            td, loss, r, o, a,
            tf.subtract(o2, o, name='state_dif'),
            tf.reduce_mean(tf.cast(t, tf.float32), name='frac_terminal'),
            tf.subtract(tf.reduce_max(q, 1, True), q, name='av_advantage')
        ])
        # layers.summarize_tensors(chi.activations())
        # layers.summarize_tensors(chi.gradients())
        return loss

    self.train = Function(
        train,
        prefetch_fctn=lambda: self.memory.sample_batch()[:-1],
        prefetch_capacity=3,
        async=True)

    def log_weights():
        v = q_network.trainable_variables()
        # print(f'log weights {v}')
        f = q_network.tracker_variables
        # print(f'log weights EMA {f}')
        difs = []
        for g in v:
            a = q_network.tracker.average(g)
            difs.append(tf.subtract(g, a, name=f'ema/dif{g.name[:-2]}'))
        layers.summarize_tensors(v + f + difs)

    self.log_weights = Function(log_weights, async=True)

    def log_returns(real_return: [], ret: [], qs):
        layers.summarize_tensors(
            [real_return, ret, qs, tf.subtract(ret, qs, name='R-Q')])

    self.log_returns = Function(log_returns, async=True)

    self.t = 0
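# `annealing_time` above is stored but never consumed within this snippet; the
# usual pattern it supports is a linearly annealed epsilon-greedy policy during
# acting. A hedged sketch of that schedule (an assumption, not taken from this
# file):
import numpy as np

def epsilon(step, replay_start=50000, annealing_time=1000000,
            eps_start=1.0, eps_final=0.1):
    # act fully at random until the replay buffer has warmed up,
    # then interpolate linearly from eps_start down to eps_final
    if step < replay_start:
        return eps_start
    frac = min(1.0, (step - replay_start) / annealing_time)
    return eps_start + frac * (eps_final - eps_start)

# e.g. greedy action with probability 1 - epsilon(t):
# a = env.action_space.sample() if np.random.rand() < epsilon(t) else int(act(ob)[0])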
def __init__(self, n_actions, observation_shape, pp: Model, heads: Model,
             double_dqn=True, replay_start=50000, logdir=None, clip_gradients=10.):
    self.logdir = logdir
    self.observation_shape = observation_shape
    self.n_actions = n_actions
    self.replay_start = replay_start
    self.n_heads = None
    self.memory = ShardedMemory()
    self.discount = .99
    self.step = 0
    self.n_state = None

    def act(x: [observation_shape]):
        s = pp(x)
        self.n_state = int(s.shape[1])
        qs = heads(s)
        self.n_heads = len(qs)
        return qs, s

    self.act = Function(act)

    @tt.model(optimizer=tf.train.RMSPropOptimizer(6.25e-5, .95, .95, .01))
    def pred(x, a: tf.int32):
        # transition model: predict the next preprocessed state from (state, action)
        x = tf.concat((x, layers.one_hot_encoding(a, self.n_actions)), axis=1)
        x = layers.fully_connected(x, 100)
        x = layers.fully_connected(x, 50)
        x = layers.fully_connected(x, 50)
        x = layers.fully_connected(x, 100)
        x = layers.fully_connected(x, self.n_state, None)
        return x

    def train_step(o: [observation_shape], a: (tf.int32, [[]]),
                   r, t: tf.bool, o2: [observation_shape]):
        s = pp(o)
        qs = heads(s)
        self.n_heads = len(qs)

        # compute targets
        q2s = heads.tracked(pp.tracked(o2))
        s2 = pp(o2)

        # transition model
        sp = pred(s, a)
        loss_pred = tf.reduce_mean(tf.square(sp - s2))

        if double_dqn:
            a2s = [tf.argmax(_, axis=1) for _ in heads(s2)]
        else:
            a2s = [tf.argmax(_, axis=1) for _ in q2s]

        losses = []
        for a2, q2, q in zip(a2s, q2s, qs):
            mask2 = tf.one_hot(a2, n_actions, 1.0, 0.0, axis=1)
            q_target = tf.stop_gradient(
                tf.where(t, r, r + self.discount * tf.reduce_sum(q2 * mask2, axis=1)))

            # compute loss
            mask = tf.one_hot(a, n_actions, 1.0, 0.0, axis=1)
            q = tf.reduce_sum(q * mask, axis=1, name='q_max')
            td = tf.subtract(q_target, q, name='td')
            # td = tf.clip_by_value(td, -10, 10)
            # loss = tf.reduce_mean(tf.abs(td), axis=0, name='mae')
            # loss = tf.where(tf.abs(td) < 1.0, 0.5 * tf.square(td), tf.abs(td) - 0.5, name='mse_huber')
            losses.append(tf.reduce_mean(tf.square(td), axis=0, name='mse'))

        loss = tf.add_n(losses)

        gav = heads.compute_gradients(loss)
        if clip_gradients:
            gav = [(tf.clip_by_norm(g, clip_gradients), v) for g, v in gav]
        th = heads.apply_gradients(gav)

        gav = pp.compute_gradients(loss)
        if clip_gradients:
            # the shared torso receives gradients from all heads, so rescale before clipping
            gav = [(tf.clip_by_norm(g / self.n_heads, clip_gradients), v) for g, v in gav]
        tp = pp.apply_gradients(gav)

        gav = pred.compute_gradients(loss_pred)
        if clip_gradients:
            gav = [(tf.clip_by_norm(g, clip_gradients), v) for g, v in gav]
        tpred = pred.apply_gradients(gav)

        return th, tp, tpred

    self.train_step = Function(
        train_step,
        prefetch_fctn=lambda: self.memory.sample_batch()[:-1],
        prefetch_capacity=5,
        prefetch_threads=3,
        async=False)

    assert self.n_state  # act has been built above, so the state size is known here

    def predict(s: [[self.n_state]], a: (tf.int32, [[]])):
        sp = pred(s, a)
        return sp

    self.predict = Function(predict)
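# The multiple heads returned by `act` enable bootstrapped (deep) exploration:
# a typical driver samples one head per episode and follows it greedily, so a
# single head's Q function steers behaviour for the whole episode. Hedged
# sketch, assuming `agent.act(ob)` returns `(qs, s)` with one Q row per head
# as above; the driver itself is not part of this file.
import numpy as np

def play_bootstrapped_episode(env, agent):
    ob = env.reset()
    head = np.random.randint(agent.n_heads)  # committed to for the episode
    done, R = False, 0.
    while not done:
        qs, _ = agent.act(ob)
        a = int(np.argmax(qs[head]))
        ob, r, done, _ = env.step(a)
        R += r
    return R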