def policy_fn(name, ob_space, ac_space):
    # build an MLP policy from the spaces passed in by the caller
    return MlpPolicy(name=name,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     hid_size=32,
                     num_hid_layers=2)
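# A minimal usage sketch (not from the source): assuming the policy_fn above follows
# the OpenAI baselines.ppo1 convention (MlpPolicy from baselines.ppo1.mlp_policy),
# it would typically be handed to pposgd_simple.learn together with a gym environment.
# The environment id and all hyperparameter values below are illustrative placeholders.
import gym
from baselines.common import tf_util as U
from baselines.ppo1 import pposgd_simple

U.make_session(num_cpu=1).__enter__()
env = gym.make("CartPole-v1")
pposgd_simple.learn(env, policy_fn,
                    max_timesteps=100000,
                    timesteps_per_actorbatch=2048,
                    clip_param=0.2, entcoeff=0.0,
                    optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                    gamma=0.99, lam=0.95, schedule='linear')
env.close()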
# Reward function
reward_fun = rf_info2d_pos

# Policy network hyperparameters
pi_non_linear = F.relu
pi_hid_layers = 1
pi_hid_dim = 20
pi_noise = 0.2
pi_noise_clip = 0.5
policy = MlpPolicy(state_dim,
                   action_dim=action_dim,
                   act_min=act_min,
                   act_max=act_max,
                   non_linearity=pi_non_linear,
                   hidden_layers=pi_hid_layers,
                   hidden_dim=pi_hid_dim,
                   output_non_linearity=None,
                   noise=pi_noise,
                   noise_clip=pi_noise_clip)

# Baseline (state-value) function
bl = True
bl_lr = 1e-4
bl_non_linear = F.relu
bl_hid_layers = 1
bl_hid_dim = 20
bl_fun = StateValueFunction(state_dim,
                            non_linearity=bl_non_linear,   # remaining keyword arguments are
                            hidden_layers=bl_hid_layers,   # assumed by analogy with the
                            hidden_dim=bl_hid_dim)         # policy constructor above
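# A minimal forward-pass sketch (not from the source): assuming MlpPolicy and
# StateValueFunction above are torch.nn.Modules mapping a flat state vector to a
# bounded action and a scalar value, respectively. state_dim comes from the
# surrounding script; the zero state is a placeholder.
import torch

state = torch.zeros(1, state_dim)        # dummy batch containing one state
action = policy(state)                   # clipped-noise action from the MLP policy
value = bl_fun(state) if bl else None    # baseline estimate used for variance reduction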
def __init__(self, ob_space, ac_space, c_entropy, c_vf, session, max_grad_norm=0.5):
    sess = session
    agent_model = MlpPolicy('Mlp_agent', ob_space, ac_space, session)
    pi = agent_model.pi
    old_pi = agent_model.oldpi
    v = agent_model.vf
    critic = agent_model.critic

    #r = tf.placeholder(dtype=tf.float32, shape=[None, 1], name="reward")
    a = tf.placeholder(dtype=tf.float32, shape=[None] + list(ac_space.shape), name="a")
    adv = tf.placeholder(dtype=tf.float32, shape=[None, 1], name="advantage")
    target_v = tf.placeholder(dtype=tf.float32, shape=[None, 1], name="target_v")
    old_v = tf.placeholder(dtype=tf.float32, shape=[None, 1], name="old_v")
    LR = tf.placeholder(dtype=tf.float32, name="lr")
    CLIP_RANGE = tf.placeholder(dtype=tf.float32, shape=(), name="cliprange")
    TAU_LOCAL = tf.placeholder(dtype=tf.float32, shape=(), name="TAU_LOCAL")
    TAU_GLOBAL = tf.placeholder(dtype=tf.float32, shape=(), name="TAU_GLOBAL")

    with tf.variable_scope('losses'):
        # PPO clipped surrogate objective
        NegLogPac = pi.neglogp(a)
        OldNegLogPac = old_pi.neglogp(a)
        ratio = tf.exp(OldNegLogPac - NegLogPac)
        surr1 = adv * ratio
        surr2 = adv * tf.clip_by_value(ratio, 1.0 - CLIP_RANGE, 1.0 + CLIP_RANGE)
        pg_loss = -tf.reduce_mean(tf.minimum(surr1, surr2))
        entropy = tf.reduce_mean(pi.entropy())

        # clipped and simple value-function losses
        v_clipped = old_v + tf.clip_by_value(v - old_v, -CLIP_RANGE, CLIP_RANGE)
        vf_losses1 = tf.square(v - target_v)
        vf_losses2 = tf.square(v_clipped - target_v)
        vf_loss = 0.5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
        simple_vf_loss = tf.reduce_mean(vf_losses1)

        # diagnostics
        approxkl = 0.5 * tf.reduce_mean(tf.square(NegLogPac - OldNegLogPac))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIP_RANGE)))

        # critic loss
        q_loss = tf.reduce_mean(tf.square(critic - target_v))

        # actor loss: maximize the batch-normalized critic value
        m = tf.reduce_mean(critic, keepdims=True)
        devs_squared = tf.square(critic - m)
        reduced_var = tf.reduce_mean(devs_squared)
        reduced_std = tf.sqrt(reduced_var)
        normalized_q = (critic - m) / reduced_std
        actor_loss = -tf.reduce_mean(normalized_q)

        #loss = pg_loss - entropy * c_entropy + vf_loss * c_vf
        loss = pg_loss + simple_vf_loss - entropy * c_entropy + actor_loss * 0.5

    #tf.summary.scalar('total_loss', loss)
    #tf.summary.scalar('pol_loss', pg_loss)
    #tf.summary.scalar('vf_loss', simple_vf_loss)

    def _grads_placeholder_trainopt(los, para):
        # build: flattened gradient tensor, a placeholder to feed (possibly averaged)
        # gradients back in, and the Adam apply op that consumes them
        grads = tf.gradients(los, para)
        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        flatten_grads = tf.concat(axis=0, values=[
            tf.reshape(gg, shape=[int(np.prod(gg.shape))]) for gg in grads])
        feed_grads = tf.placeholder(dtype=tf.float32, shape=flatten_grads.shape,
                                    name='feed_grads')
        with tf.name_scope("Apply_grads"):
            update_list = []
            start = 0
            for p in para:
                end = start + int(np.prod(p.shape))
                update_list.append(tf.reshape(feed_grads[start:end], shape=p.shape))
                start = end
            # create grad-params pair list
            grads_list = list(zip(update_list, para))
            optimizer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
            train = optimizer.apply_gradients(grads_list)
        return flatten_grads, feed_grads, train

    # update old pi
    pi_params = agent_model.get_pol_variables()
    oldpi_params = agent_model.get_oldpol_variables()
    with tf.variable_scope('update_old_pi'):
        _updatepi = [old.assign(old * (1.0 - TAU_LOCAL) + new * TAU_LOCAL)
                     for old, new in zip(oldpi_params, pi_params)]

    self.cri_params = agent_model.get_critic_variables()
    self.pol_and_v_params = agent_model.get_ppo_variables()
    self.all_params = agent_model.get_variables()
    #self.train_params = tf.trainable_variables(scope=agent_model.scope)

    with tf.name_scope("critic_grads"):
        critic_grads, feed_critic, c_train = _grads_placeholder_trainopt(q_loss, self.cri_params)
    with tf.name_scope("pol_grads"):
        pol_grads, feed_pol, p_train = _grads_placeholder_trainopt(loss, self.pol_and_v_params)

    # get flattened agent parameters
    self.flat_params = tf.concat(axis=0, values=[
        tf.reshape(ap, shape=[int(np.prod(ap.shape))]) for ap in self.all_params])
    # placeholder for flattened params
    feed_params = tf.placeholder(dtype=tf.float32, shape=self.flat_params.shape,
                                 name='feed_params')

    ## op for params assignment
    with tf.name_scope("Apply_params"):
        p_list = []
        start = 0
        for p in self.all_params:
            end = start + int(np.prod(p.shape))
            p_list.append(tf.reshape(feed_params[start:end], shape=p.shape))
            start = end
        _apply_params = [old.assign(new * TAU_GLOBAL + (1 - TAU_GLOBAL) * old)
                         for old, new in zip(self.all_params, p_list)]

    # minibatch train
    def train(lr, cliprange, mb_obs, mb_acs, mb_adv, mb_vs, mb_targv,
              use_global_grad, apply_noise, scale_by_procs=True):
        mb_adv = (mb_adv - mb_adv.mean()) / mb_adv.std()

        def _train(grads, grads_placeholder, opt, feeddict):
            local_grad = sess.run(grads, feed_dict=feeddict)
            assert local_grad.ndim == 1
            if apply_noise:
                local_grad += np.random.normal(loc=0, scale=0.05, size=local_grad.shape)
            final_grad = local_grad.copy()
            if use_global_grad:
                # average gradients across MPI workers
                MPI.COMM_WORLD.Allreduce(local_grad, final_grad, op=MPI.SUM)
                if scale_by_procs:
                    final_grad = final_grad / MPI.COMM_WORLD.Get_size()
            sess.run(opt, feed_dict={LR: lr, grads_placeholder: final_grad})

        c_train_dict = {agent_model.ob: mb_obs, agent_model.pi: mb_acs, target_v: mb_targv}
        _train(critic_grads, feed_critic, c_train, c_train_dict)

        pol_train_dict = {agent_model.ob: mb_obs, a: mb_acs, adv: mb_adv,
                          target_v: mb_targv, old_v: mb_vs, CLIP_RANGE: cliprange}
        # get loss
        ploss, vloss = sess.run([pg_loss, simple_vf_loss], feed_dict=pol_train_dict)
        _train(pol_grads, feed_pol, p_train, pol_train_dict)
        return ploss, vloss

    # update old pi with pi
    def update_old_pi(tau=1.0):
        sess.run(_updatepi, feed_dict={TAU_LOCAL: tau})

    def sync_params(tau=1.0):
        # get local params
        local_p = sess.run(self.flat_params)
        # prepare global buffer
        global_p = np.zeros_like(local_p)
        # sync
        MPI.COMM_WORLD.Allreduce(local_p, global_p, op=MPI.SUM)
        # scale params with agent number
        global_p = global_p / MPI.COMM_WORLD.Get_size()
        sess.run(_apply_params, feed_dict={feed_params: global_p, TAU_GLOBAL: tau})

    def apply_noise(sd=0.05):
        p = sess.run(self.flat_params)
        p += np.random.normal(loc=0, scale=sd, size=p.shape)
        sess.run(_apply_params, feed_dict={feed_params: p, TAU_GLOBAL: 1.0})

    def get_params():
        return sess.run(self.flat_params)

    def apply_params(p, tau=1.0):
        sess.run(_apply_params, feed_dict={feed_params: p, TAU_GLOBAL: tau})

    self.train = train
    self.update_old_pi = update_old_pi
    self.sync_params = sync_params
    self.agent_model = agent_model
    self.get_params = get_params
    self.apply_params = apply_params
    self.apply_noise = apply_noise
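# A hedged usage sketch (not from the source). The __init__ above belongs to an
# unnamed agent class; "PPOModel" is a hypothetical name used only for illustration,
# and the minibatch arrays are random placeholders shaped for Pendulum-v1
# (3-dimensional observations, 1-dimensional actions).
import gym
import numpy as np
import tensorflow as tf

env = gym.make("Pendulum-v1")
sess = tf.Session()
model = PPOModel(env.observation_space, env.action_space,
                 c_entropy=0.01, c_vf=0.5, session=sess)
sess.run(tf.global_variables_initializer())

model.sync_params(tau=1.0)               # start every MPI worker from the same weights
obs = np.random.randn(64, 3).astype(np.float32)
acs = np.random.randn(64, 1).astype(np.float32)
adv = np.random.randn(64, 1).astype(np.float32)
vpreds = np.random.randn(64, 1).astype(np.float32)
returns = np.random.randn(64, 1).astype(np.float32)
ploss, vloss = model.train(lr=3e-4, cliprange=0.2,
                           mb_obs=obs, mb_acs=acs, mb_adv=adv,
                           mb_vs=vpreds, mb_targv=returns,
                           use_global_grad=True, apply_noise=False)
model.update_old_pi(tau=1.0)             # move oldpi toward pi after the update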
class DQN:
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Networks and optimizer
        self.local_network = MlpPolicy(state_size, action_size, seed).to(device)
        self.target_network = MlpPolicy(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.local_network.parameters(), lr=LR)

        # Replay memory
        self.replay_buffer = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.t_step = 0
        self.ok = 1

    def step(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        self.t_step += 1
        if self.t_step % UPDATE_EVERY == 0:
            if len(self.replay_buffer) > BATCH_SIZE:  # if enough samples
                self.learn(self.replay_buffer.sample(), GAMMA)

    def predict(self, state, eps=0.):
        """Returns action from e-greedy policy."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Forward pass (no gradient)
        self.local_network.eval()
        with torch.no_grad():
            action_values = self.local_network(state)
        self.local_network.train()

        # Action pick
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experience_batch, gamma):
        states, actions, rewards, next_states, dones = experience_batch

        # Get max predicted Q values for next states (from target model):
        # 'max(1)[0]' gives shape [batch_size], 'unsqueeze(1)' gives [batch_size, 1]
        Q_targets_next = self.target_network(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get predicted Q values from local model
        Q_predictions = self.local_network(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_predictions, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        self.soft_update(self.local_network, self.target_network, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Slowly update target model's parameters."""
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
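# A minimal training-loop sketch (not from the source): assuming a classic gym
# environment (4-tuple step API) and that BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR,
# UPDATE_EVERY and device are defined elsewhere in the surrounding module, as the
# class above already requires. Episode count and epsilon schedule are placeholders.
import gym

env = gym.make("CartPole-v1")
agent = DQN(state_size=env.observation_space.shape[0],
            action_size=env.action_space.n,
            seed=0)

eps = 1.0
for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        action = agent.predict(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
    eps = max(0.01, eps * 0.995)          # decay exploration over episodes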
def __init__(self, ob_space, ac_space, c_entropy, c_vf, session, max_grad_norm=0.5):
    sess = session
    agent_model = MlpPolicy('Mlp_agent', ob_space, ac_space, session)
    pi = agent_model.pi
    old_pi = agent_model.oldpi
    v = agent_model.vf

    a = tf.placeholder(dtype=tf.float32, shape=[None] + list(ac_space.shape), name="a")
    adv = tf.placeholder(dtype=tf.float32, shape=[None, 1], name="advantage")
    target_v = tf.placeholder(dtype=tf.float32, shape=[None, 1], name="target_v")
    old_v = tf.placeholder(dtype=tf.float32, shape=[None, 1], name="old_v")
    LR = tf.placeholder(dtype=tf.float32, name="lr")
    CLIP_RANGE = tf.placeholder(dtype=tf.float32, shape=(), name="cliprange")
    TAU = tf.placeholder(dtype=tf.float32, shape=(), name="TAU")

    with tf.variable_scope('losses'):
        NegLogPac = pi.neglogp(a)
        OldNegLogPac = old_pi.neglogp(a)
        ratio = tf.exp(OldNegLogPac - NegLogPac)
        surr1 = adv * ratio
        surr2 = adv * tf.clip_by_value(ratio, 1.0 - CLIP_RANGE, 1.0 + CLIP_RANGE)
        pg_loss = -tf.reduce_mean(tf.minimum(surr1, surr2))
        entropy = tf.reduce_mean(pi.entropy())
        v_clipped = old_v + tf.clip_by_value(v - old_v, -CLIP_RANGE, CLIP_RANGE)
        vf_losses1 = tf.square(v - target_v)
        vf_losses2 = tf.square(v_clipped - target_v)
        vf_loss = 0.5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
        simple_vf_loss = tf.reduce_mean(vf_losses1)
        approxkl = 0.5 * tf.reduce_mean(tf.square(NegLogPac - OldNegLogPac))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIP_RANGE)))

        #loss = pg_loss - entropy * c_entropy + vf_loss * c_vf
        loss = pg_loss + simple_vf_loss - entropy * c_entropy

    pi_params = agent_model.get_pol_variables()
    oldpi_params = agent_model.get_oldpol_variables()
    with tf.variable_scope('update_old_pi'):
        _updatepi = [old.assign(old * (1.0 - TAU) + new * TAU)
                     for old, new in zip(oldpi_params, pi_params)]

    params = tf.trainable_variables(scope=agent_model.scope)
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    self.optimizer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
    _train = self.optimizer.apply_gradients(grads)

    """
    def global_train(*, lr, cliprange, bobs, bacs, badv, bvs, btargv, scale_by_procs=True):
        badv = (badv - badv.mean()) / (badv.std() + 1e-8)
        feeddict = {pi.ob: bobs, old_pi.ob: bobs, a: bacs, adv: badv,
                    old_v: bvs, target_v: btargv, LR: lr, CLIP_RANGE: cliprange}
        localg = sess.run(tf.gradients(loss, params), feed_dict=feeddict)
        globalg = np.zeros_like(localg)
        MPI.COMM_WORLD.Allreduce(localg, globalg, op=MPI.SUM)
        if scale_by_procs:
            globalg /= MPI.COMM_WORLD.Get_size()
        if max_grad_norm is not None:
            globalg, _grad_norm = tf.clip_by_global_norm(globalg, max_grad_norm)
        grads = list(zip(globalg, params))
        sess.run(optimizer.apply_gradients(grads))
    """

    def train(lr, cliprange, mb_obs, mb_acs, mb_adv, mb_vs, mb_targv):
        mb_adv = (mb_adv - mb_adv.mean()) / mb_adv.std()
        feeddict = {
            agent_model.ob: mb_obs,
            a: mb_acs,
            adv: mb_adv,
            target_v: mb_targv,
            old_v: mb_vs,
            LR: lr,
            CLIP_RANGE: cliprange
        }
        sess.run(_train, feed_dict=feeddict)
        return sess.run([pg_loss, simple_vf_loss], feed_dict=feeddict)

    def update_old_pi(tau=0.5):
        sess.run(_updatepi, feed_dict={TAU: tau})

    self.train = train
    #self.global_train = global_train
    self.update_old_pi = update_old_pi
    self.agent_model = agent_model
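# A NumPy illustration (not from the source) of the clipped PPO surrogate built in
# the 'losses' scope above: ratio = exp(OldNegLogPac - NegLogPac) = pi(a|s) / pi_old(a|s),
# and the objective keeps the elementwise minimum of the unclipped and clipped terms.
import numpy as np

def clipped_surrogate(neglogp, old_neglogp, adv, clip_range=0.2):
    ratio = np.exp(old_neglogp - neglogp)
    surr1 = adv * ratio
    surr2 = adv * np.clip(ratio, 1.0 - clip_range, 1.0 + clip_range)
    return -np.mean(np.minimum(surr1, surr2))   # corresponds to pg_loss

# example: a positive advantage with a ratio above 1 + clip_range gets clipped
print(clipped_surrogate(neglogp=np.array([0.5]), old_neglogp=np.array([1.2]),
                        adv=np.array([1.0])))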