def update():
    # Prepare hessian func, gradient eval
    inputs = {k: v for k, v in zip(all_phs, buf.get())}
    Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x}))
    g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss],
                                    feed_dict=inputs)
    g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old)

    # Core calculations for TRPO or NPG
    x = cg(Hx, g)
    alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS))
    old_params = sess.run(get_pi_params)
    old_penalty = env.penalty(env.s)

    def set_and_eval(step):
        sess.run(set_pi_params,
                 feed_dict={v_ph: old_params - alpha * x * step})
        return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs))

    if algo == 'npg':
        # npg has no backtracking or hard kl constraint enforcement
        kl, pi_l_new = set_and_eval(step=1.)
    elif algo == 'trpo':
        # trpo augments npg with backtracking line search, hard kl
        for j in range(backtrack_iters):
            old_penalty = env.penalty(env.s)
            kl, pi_l_new = set_and_eval(step=backtrack_coeff**j)
            penalty = env.penalty(env.s)
            # print("Old Penalty {}, Penalty {}".format(old_penalty, penalty))
            # if kl <= delta and pi_l_new <= pi_l_old:
            if penalty == 0 or penalty < old_penalty:
                logger.log('Accepting new params at step %d of line search.' % j)
                logger.store(BacktrackIters=j)
                logger.store(penalty=penalty, old_penalty=old_penalty)
                break
            if j == backtrack_iters - 1:
                logger.log('Line search failed! Keeping old params.')
                logger.store(BacktrackIters=j)
                logger.store(penalty=penalty, old_penalty=old_penalty)
                kl, pi_l_new = set_and_eval(step=0.)

    # Value function updates
    for _ in range(train_v_iters):
        sess.run(train_vf, feed_dict=inputs)
    v_l_new = sess.run(v_loss, feed_dict=inputs)

    # Log changes from update
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl,
                 DeltaLossPi=(pi_l_new - pi_l_old),
                 DeltaLossV=(v_l_new - v_l_old))
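The snippet above calls a conjugate-gradient helper as x = cg(Hx, g) but does not define it; the sketch below is a generic CG solver matching that call signature (the cg_iters and residual_tol defaults are assumptions, not taken from the source).

import numpy as np

def cg(Ax, b, cg_iters=10, residual_tol=1e-10):
    """Approximately solve A x = b given only a matrix-vector product Ax(v).
    Generic sketch of the helper assumed by `x = cg(Hx, g)` above."""
    x = np.zeros_like(b)
    r = b.copy()              # residual b - A @ x, with x = 0
    p = r.copy()              # search direction
    r_dot_old = np.dot(r, r)
    for _ in range(cg_iters):
        z = Ax(p)
        alpha = r_dot_old / (np.dot(p, z) + 1e-8)
        x += alpha * p
        r -= alpha * z
        r_dot_new = np.dot(r, r)
        if r_dot_new < residual_tol:
            break
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return x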
def update():
    inputs = {k: v for k, v in zip(all_phs, buf.get())}
    # TODO: Next step is to calculate the hessian using safe distance
    # Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x}))
    g, pi_l_old, v_l_old, ent = sess.run(
        [gradient, pi_loss, v_loss, approx_ent], feed_dict=inputs)
    g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old)

    # x = cg(Hx, g)
    # x = optimize.fmin_cg(pi_l_old, x0, fprime=g)
    x = g
    old_params = sess.run(get_pi_params)
    old_penalty = env.penalty(env.s)
    alpha = np.sqrt(2 * delta / (np.dot(x, g) + EPS))

    # backtracking line search, hard constraint check on env penalty
    for j in range(backtrack_iters):
        step = backtrack_coeff**j
        sess.run(set_pi_params,
                 feed_dict={v_ph: old_params - alpha * x * step})
        # fetch the scalar loss (not a single-element list) so the delta below works
        pi_l_new = sess.run(pi_loss, feed_dict=inputs)
        penalty = env.penalty(env.s)
        # print("Old Penalty {}, Penalty {}".format(old_penalty, penalty))
        if penalty == 0 or penalty < old_penalty:
            # if pi_l_new <= pi_l_old:
            logger.log('Accepting new params at step %d of line search.' % j)
            logger.store(BacktrackIters=j)
            logger.store(penalty=penalty, old_penalty=old_penalty)
            break
        if j == backtrack_iters - 1:
            logger.log('Line search failed! Keeping old params.')
            logger.store(BacktrackIters=j)
            logger.store(penalty=penalty, old_penalty=old_penalty)

    # Policy gradient step
    # sess.run(train_pi, feed_dict=inputs)

    # Value function learning
    for _ in range(train_v_iters):
        sess.run(train_v, feed_dict=inputs)

    # Log changes from update
    # pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl],
    #                                  feed_dict={v_ph: old_params - alpha * x * step})
    logger.store(LossPi=pi_l_old, Entropy=ent,
                 DeltaLossPi=(pi_l_new - pi_l_old))
def update(self):
    logger = self.logger
    inputs = {k: v for k, v in zip(self.all_phs, self.buf.get())}
    pi_l_old, v_l_old, ent = self.sess.run(
        [self.pi_loss, self.v_loss, self.approx_ent], feed_dict=inputs)

    # Training
    for i in range(self.train_pi_iters):
        _, kl = self.sess.run([self.train_pi, self.approx_kl],
                              feed_dict=inputs)
        kl = mpi_avg(kl)
        if kl > 1.5 * self.target_kl:
            logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break
    logger.store(StopIter=i)

    for _ in range(self.train_v_iters):
        self.sess.run(self.train_v, feed_dict=inputs)

    # Log changes from update
    pi_l_new, v_l_new, kl, cf = self.sess.run(
        [self.pi_loss, self.v_loss, self.approx_kl, self.clipfrac],
        feed_dict=inputs)
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent,
                 ClipFrac=cf,
                 DeltaLossPi=(pi_l_new - pi_l_old),
                 DeltaLossV=(v_l_new - v_l_old))
def demo_update():
    data = buf.get()

    pi_l_old, pi_info_old = compute_loss_pi(data)
    pi_l_old = pi_l_old.item()
    v_l_old = compute_loss_v_pi(data).item()

    for i in range(train_pi_iters):
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        kl = mpi_avg(pi_info['kl'])
        if kl > 1.5 * target_kl:
            # logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break
        loss_pi.backward()
        mpi_avg_grads(ac.pi)    # average grads across MPI processes
        pi_optimizer.step()
    logger.store(StopIter=i)

    for i in range(train_v_iters):
        vf_pi_optimizer.zero_grad()
        loss_v = compute_loss_v_pi(data)
        loss_v.backward()
        mpi_avg_grads(ac.v_pi)
        vf_pi_optimizer.step()

    print("Pi loss: {}".format(pi_l_old))
    kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent,
                 ClipFrac=cf,
                 DeltaLossPi=(loss_pi.item() - pi_l_old),
                 DeltaLossV=(loss_v.item() - v_l_old))
def update():
    inputs = {k: v for k, v in zip(all_phs, buf.get())}
    pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                      feed_dict=inputs)

    # Training
    for i in range(train_pi_iters):
        if args.alpha == 'auto':
            sess.run(train_alpha_op, feed_dict=inputs)
        _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
        kl = mpi_avg(kl)
        if kl > 1.5 * target_kl:
            logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break
    logger.store(StopIter=i)

    # for _ in range(train_v_iters):
    #     sess.run(train_v, feed_dict=inputs)

    # Log changes from update
    pi_l_new, v_l_new, kl, cf = sess.run(
        [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent,
                 ClipFrac=cf,
                 DeltaLossPi=(pi_l_new - pi_l_old),
                 DeltaLossV=(v_l_new - v_l_old),
                 Alpha=sess.run(alpha) if args.alpha == 'auto' else alpha)
def update(epoch):
    # inputs = {k: v for k, v in zip(all_phs, buf.get())}
    pi_l_old, v_l_old, ent = sess.run(
        [pi_loss, v_loss, approx_ent],
        feed_dict={
            logp_old_ph: buf.logp_buf,
            x_ph: o,
            a_ph: a,
            adv_ph: buf.adv_buf,
            ret_ph: buf.ret_buf
        })
    # pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)
    summary = tf.Summary(
        value=[tf.Summary.Value(tag="loss", simple_value=pi_l_old)])
    test_writer.add_summary(summary, epoch)

    # Training
    for i in range(train_pi_iters):
        _, kl = sess.run(
            [train_pi, approx_kl],
            feed_dict={
                logp_old_ph: buf.logp_buf,
                x_ph: o,
                a_ph: a,
                adv_ph: buf.adv_buf,
                ret_ph: buf.ret_buf
            })
        kl = mpi_avg(kl)
        if kl > 1.5 * target_kl:
            logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break
    logger.store(StopIter=i)

    for _ in range(train_v_iters):
        sess.run(train_v,
                 feed_dict={
                     logp_old_ph: buf.logp_buf,
                     x_ph: o,
                     a_ph: a,
                     adv_ph: buf.adv_buf,
                     ret_ph: buf.ret_buf
                 })

    # Log changes from update
    pi_l_new, v_l_new, kl, cf = sess.run(
        [pi_loss, v_loss, approx_kl, clipfrac],
        feed_dict={
            logp_old_ph: buf.logp_buf,
            x_ph: o,
            a_ph: a,
            adv_ph: buf.adv_buf,
            ret_ph: buf.ret_buf
        })
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent,
                 ClipFrac=cf,
                 DeltaLossPi=(pi_l_new - pi_l_old),
                 DeltaLossV=(v_l_new - v_l_old))
def update():
    inputs = {k: v for k, v in zip(all_phs, buf.get())}
    pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                      feed_dict=inputs)
    # lllogp, mmmu, llog_std = sess.run([logp, mu, log_std], feed_dict=inputs)
    # logp is basically the same as logp_old_ph; the discrepancy starts around
    # 1e-6, and this error is a little strange...

    # Training
    for i in range(train_pi_iters):
        _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
        kl = mpi_avg(kl)
        if kl > 1.5 * target_kl:
            logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break
    logger.store(StopIter=i)

    for _ in range(train_v_iters):
        sess.run(train_v, feed_dict=inputs)

    # Log changes from update
    pi_l_new, v_l_new, kl, cf = sess.run(
        [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent,
                 ClipFrac=cf,
                 DeltaLossPi=(pi_l_new - pi_l_old),
                 DeltaLossV=(v_l_new - v_l_old))
def update():
    inputs = {k: v for k, v in zip(all_phs, buf.get())}  # all_phs is the list of placeholders matching each buffer
    pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                      feed_dict=inputs)

    # Training  (does this need to change as well? probably not)
    for i in range(train_pi_iters):
        _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
        kl = mpi_avg(kl)
        if kl > 1.5 * target_kl:
            # if the KL of this update exceeds 1.5x the target, log it and exit the training loop
            logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break
    logger.store(StopIter=i)

    for _ in range(train_v_iters):  # update v
        sess.run(train_v, feed_dict=inputs)

    # Log changes from update (compute the new losses)
    pi_l_new, v_l_new, kl, cf = sess.run(
        [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
    std, std_ent = sess.run([pi_std, entropy], feed_dict=inputs)
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=std_ent,
                 ClipFrac=cf,
                 DeltaLossPi=(pi_l_new - pi_l_old),  # improvement from the update
                 DeltaLossV=(v_l_new - v_l_old),
                 Std=std)
def update():
    # Build the inputs dict so the collected data is easy to feed below,
    # then use it to run gradient descent and update the parameters.
    inputs = {k: v for k, v in zip(all_phs, buf.get())}
    pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                      feed_dict=inputs)

    # Training
    for i in range(train_pi_iters):
        _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
        kl = mpi_avg(kl)
        if kl > 1.5 * target_kl:
            logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break
    logger.store(StopIter=i)

    # The loop above trains the policy; this one trains the value function
    for _ in range(train_v_iters):
        sess.run(train_v, feed_dict=inputs)

    # Log changes from update
    pi_l_new, v_l_new, kl, cf = sess.run(
        [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent,
                 ClipFrac=cf,
                 DeltaLossPi=(pi_l_new - pi_l_old),
                 DeltaLossV=(v_l_new - v_l_old))
def update():
    data = buf.get()

    pi_l_old, pi_info_old = compute_loss_pi(data)
    pi_l_old = pi_l_old.item()
    v_l_old = compute_loss_v(data).item()

    # Train policy with multiple steps of gradient descent
    for i in range(train_pi_iters):
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        kl = mpi_avg(pi_info['kl'])
        if kl > 1.5 * target_kl:
            logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break
        loss_pi.backward()
        mpi_avg_grads(ac.pi)    # average grads across MPI processes
        pi_optimizer.step()
    logger.store(StopIter=i)

    # Value function learning
    for i in range(train_v_iters):
        vf_optimizer.zero_grad()
        loss_v = compute_loss_v(data)
        loss_v.backward()
        mpi_avg_grads(ac.v)    # average grads across MPI processes
        vf_optimizer.step()

    # Log changes from update
    kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent,
                 ClipFrac=cf,
                 DeltaLossPi=(loss_pi.item() - pi_l_old),
                 DeltaLossV=(loss_v.item() - v_l_old))
def update():
    obs, act, adv, ret, logp_old = [torch.Tensor(x) for x in buf.get()]

    # Training policy
    _, logp, _ = actor_critic.policy(obs, act)
    ratio = (logp - logp_old).exp()
    min_adv = torch.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
    pi_l_old = -(torch.min(ratio * adv, min_adv)).mean()
    ent = (-logp).mean()    # a sample estimate for entropy

    for i in range(train_pi_iters):
        # Output from policy function graph
        _, logp, _ = actor_critic.policy(obs, act)
        # PPO policy objective
        ratio = (logp - logp_old).exp()
        min_adv = torch.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
        pi_loss = -(torch.min(ratio * adv, min_adv)).mean()

        # Policy gradient step
        train_pi.zero_grad()
        pi_loss.backward()
        average_gradients(train_pi.param_groups)
        train_pi.step()

        _, logp, _ = actor_critic.policy(obs, act)
        kl = (logp_old - logp).mean()
        kl = mpi_avg(kl.item())
        if kl > 1.5 * target_kl:
            logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break
    logger.store(StopIter=i)

    # Training value function
    v = actor_critic.value_function(obs)
    v_l_old = F.mse_loss(v, ret)
    for _ in range(train_v_iters):
        # Output from value function graph
        v = actor_critic.value_function(obs)
        # PPO value function objective
        v_loss = F.mse_loss(v, ret)

        # Value function gradient step
        train_v.zero_grad()
        v_loss.backward()
        average_gradients(train_v.param_groups)
        train_v.step()

    # Log changes from update
    _, logp, _, v = actor_critic(obs, act)
    ratio = (logp - logp_old).exp()
    min_adv = torch.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
    pi_l_new = -(torch.min(ratio * adv, min_adv)).mean()
    v_l_new = F.mse_loss(v, ret)
    kl = (logp_old - logp).mean()    # a sample estimate for KL-divergence
    clipped = (ratio > (1 + clip_ratio)) | (ratio < (1 - clip_ratio))
    cf = (clipped.float()).mean()
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent,
                 ClipFrac=cf,
                 DeltaLossPi=(pi_l_new - pi_l_old),
                 DeltaLossV=(v_l_new - v_l_old))
def update():
    data = buf.get()

    v_l_old, pi_l_old, pi_info_old = compute_loss(data)
    pi_l_old = pi_l_old.item()
    v_l_old = v_l_old.item()

    # Train policy with multiple steps of gradient descent
    for i in range(train_iters):
        optimizer.zero_grad()
        loss_v, loss_pi, pi_info = compute_loss(data)
        kl = mpi_avg(pi_info['kl'])
        if kl > 1.5 * target_kl:
            logger.log(f'Early stopping at step {i} due to reaching max kl.')
            break
        loss = loss_pi + loss_v * v_loss_coeff
        loss.backward()
        mpi_avg_grads(ac.ac)    # average grads across MPI processes
        optimizer.step()
    logger.store(StopIter=i)

    # Log changes from update
    kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent,
                 ClipFrac=cf,
                 DeltaLossPi=(loss_pi.item() - pi_l_old),
                 DeltaLossV=(loss_v.item() - v_l_old))
def update():
    inputs = {k: v for k, v in zip(all_phs, buf.get())}
    # zip([x_ph, a_ph, adv_ph, ret_ph, logp_old_ph],
    #     [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf, self.logp_buf])
    pi_l_old, v_l_old, ent = sess.run(
        [pi_loss, v_loss, approx_ent],
        feed_dict=inputs)  # feed the data above, compute both losses and the entropy

    # Training
    for i in range(train_pi_iters):  # policy iterations
        _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)  # compute kl
        kl = mpi_avg(kl)
        if kl > 1.5 * target_kl:
            logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break  # stop policy training early
    logger.store(StopIter=i)

    for _ in range(train_v_iters):
        sess.run(train_v, feed_dict=inputs)  # train the critic network

    # Log changes from update
    pi_l_new, v_l_new, kl, cf = sess.run(
        [pi_loss, v_loss, approx_kl, clipfrac],
        feed_dict=inputs)  # recompute the losses, kl and cf
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent,
                 ClipFrac=cf,
                 DeltaLossPi=(pi_l_new - pi_l_old),
                 DeltaLossV=(v_l_new - v_l_old))  # log old losses, kl, cf and the delta losses
def update():
    print("======= update!")
    # get aux data from the buffer and match it with its respective placeholders
    buf_data = buf.get(aux_vars_only=True)
    aux_inputs = {k: v for k, v in zip(new_phs, buf_data)}

    # for the training, the actions taken during the experience loop are also
    # inputs to the network
    extra_dict = {k: v for k, v in buf.act_buf.items() if k != 'vpred'}
    for k, v in extra_dict.items():
        if k == 'action_movement':
            extra_dict[k] = np.expand_dims(v, 1)

    # actions and aux variables from the buffer are joined and passed to
    # compute_metrics (observations are joined within the functions)
    extra_dict.update(aux_inputs)
    pi_l_old, v_l_old, ent, kl, cf = compute_metrics(extra_dict)

    # Policy training loop
    for i in range(train_pi_iters):
        if i % 10 == 0:
            print("training pi iter ", i)
        kl = train('pi', extra_dict)
        kl = mpi_avg(kl)
        if kl > 1.5 * target_kl:
            logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break
    logger.store(StopIter=i)
    print("")

    # Value training loop
    for j in range(train_v_iters):
        if j % 10 == 0:
            print("training v iter ", j)
        train('v', extra_dict)

    # Log changes from update with a new run on compute_metrics
    pi_l_new, v_l_new, ent, kl, cf = compute_metrics(extra_dict)

    # Store information
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent,
                 ClipFrac=cf,
                 DeltaLossPi=(pi_l_new - pi_l_old),
                 DeltaLossV=(v_l_new - v_l_old))

    # Reset experience variables
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Reset policy
    for policy in policies:
        policy.reset()
    print("======= update finished!")
def set_and_eval(step):
    # set pi params to old_params - alpha * x * step via v_ph
    sess.run(set_pi_params, feed_dict={v_ph: old_params - alpha * x * step})
    # return average of d_kl and pi_loss operation
    return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs))
def mpi_avg_grads(module):
    """ Average contents of gradient buffers across MPI processes. """
    if num_procs() == 1:
        return
    for p in module.parameters():
        p_grad_numpy = p.grad.detach().numpy()    # numpy view of tensor data
        avg_p_grad = mpi_avg(p.grad.detach())
        p_grad_numpy[:] = avg_p_grad[:]
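mpi_avg and num_procs are used throughout these snippets but defined elsewhere; below is a minimal mpi4py-based sketch consistent with how they are called here. The function bodies are assumptions about the project's MPI utilities, not the project's actual code.

import numpy as np
from mpi4py import MPI

def num_procs():
    """Number of MPI processes in the world communicator."""
    return MPI.COMM_WORLD.Get_size()

def mpi_sum(x):
    """Element-wise sum of a scalar or array over all MPI processes."""
    x, scalar = ([x], True) if np.isscalar(x) else (x, False)
    x = np.asarray(x, dtype=np.float32)
    buff = np.zeros_like(x)
    MPI.COMM_WORLD.Allreduce(x, buff, op=MPI.SUM)
    return buff[0] if scalar else buff

def mpi_avg(x):
    """Average a scalar or array over all MPI processes."""
    return mpi_sum(x) / num_procs()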
def update():
    epsilon = 0.1
    data = buf.get()

    pi_l_old, pi_info_old = compute_loss_pi(data)
    pi_l_old = pi_l_old.item()
    v_l_old = compute_loss_v(data).item()

    # Train policy with multiple steps of gradient descent
    for i in range(train_pi_iters):
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        kl = mpi_avg(pi_info['kl'])
        if kl > 1.5 * target_kl:
            logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break

        # Manually update pi.parameters
        # loss_pi.backward()
        for l in ac.pi.logits_net:
            for x in l.parameters():
                y, = torch.autograd.grad(loss_pi, x, create_graph=True,
                                         retain_graph=True)
                w = torch.zeros(y.size(), requires_grad=True)
                g, = torch.autograd.grad(y, x, grad_outputs=w, create_graph=True)
                r, = torch.autograd.grad(g, w, grad_outputs=y, create_graph=False)
                x.grad = y - epsilon * r

        mpi_avg_grads(ac.pi)    # average grads across MPI processes
        pi_optimizer.step()
    logger.store(StopIter=i)

    # Value function learning
    for i in range(train_v_iters):
        vf_optimizer.zero_grad()
        loss_v = compute_loss_v(data)
        loss_v.backward()
        mpi_avg_grads(ac.v)    # average grads across MPI processes
        vf_optimizer.step()

    # Log changes from update
    kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent,
                 ClipFrac=cf,
                 DeltaLossPi=(loss_pi.item() - pi_l_old),
                 DeltaLossV=(loss_v.item() - v_l_old))
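The double torch.autograd.grad pattern in the loop above appears to be the standard double-backward trick for Hessian-vector products: with y the gradient of the loss, the dummy zero vector w makes g a graph that is linear in w, and differentiating the pairing of g with y with respect to w yields H @ y, so x.grad = y - epsilon * r is a curvature-corrected gradient. A tiny self-contained check on a quadratic with known Hessian (made-up numbers, not from the source):

import torch

A = torch.tensor([[3.0, 1.0], [1.0, 2.0]])      # symmetric, so the Hessian of the loss is A
x = torch.tensor([1.0, -1.0], requires_grad=True)
loss = 0.5 * x @ A @ x                          # gradient A @ x, Hessian A

y, = torch.autograd.grad(loss, x, create_graph=True, retain_graph=True)
w = torch.zeros_like(y, requires_grad=True)
g, = torch.autograd.grad(y, x, grad_outputs=w, create_graph=True)
r, = torch.autograd.grad(g, w, grad_outputs=y, create_graph=False)
print(torch.allclose(r, A @ y))                 # True: r equals H @ grad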
def update(self):
    inputs = {k: v for k, v in zip(self.all_phs, self.buf.get())}

    # Training
    for i in range(self.train_pi_iters):
        _, kl = self.sess.run([self.train_pi, self.approx_kl],
                              feed_dict=inputs)
        kl = mpi_avg(kl)
        if kl > 1.5 * self.target_kl:
            print('process %d: Early stopping at step %d due to reaching max kl.'
                  % (proc_id(), i))
            break

    for _ in range(self.train_v_iters):
        self.sess.run(self.train_v, feed_dict=inputs)
def update_tmp(self, sess, data):
    inputs = {k: v for k, v in zip(self.all_phs, data)}

    # Training
    for i in range(self.config["train_pi_iters"]):
        _, kl, entropy = sess.run(
            [self.train_pi_tmp, self.approx_kl_tmp, self.approx_ent_tmp],
            feed_dict=inputs,
        )
        kl = mpi_avg(kl)
        if kl > 1.5 * self.config["target_kl"]:
            print("Early stopping at step %d due to reaching max kl." % i)
            break
        if entropy < 1.5:
            self.config["entropy_coeff"] = 0.02
        if entropy < 1.3:
            self.config["entropy_coeff"] = 0.04
        if entropy < 1.1:
            self.config["entropy_coeff"] = 0.08
    # self.logger.store(StopIterTmp=i)

    # Log changes from update
    total_l_new, pi_l_new, v_l_new, kl, ratio_tmp, cf = sess.run(
        [
            self.loss_tmp,
            self.pi_loss_tmp,
            self.mean_vf_loss_tmp,
            self.approx_kl_tmp,
            self.ratio_tmp,
            self.clipfrac_tmp,
        ],
        feed_dict=inputs,
    )
    return (
        total_l_new,
        pi_l_new,
        v_l_new,
        kl,
        entropy,
        ratio_tmp,
        cf,
    )
def update():
    inputs = {}
    for k, v in zip(all_phs, buf.get()):
        if type(k) is not dict:
            inputs[k] = v
        else:
            for k_, v_ in zip(k.values(), v.values()):
                inputs[k_] = v_

    pi_l_old, v_l_old, ent = sess.run([pi_loss_sum, v_loss, approx_ent],
                                      feed_dict=inputs)

    # Training
    for i in range(train_pi_iters):
        _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
        for k in kl:
            kl[k] = mpi_avg(kl[k])
        if max(list(kl.values())) > 1.5 * target_kl:
            logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break
    logger.store(StopIter=i)

    for _ in range(train_v_iters):
        sess.run(train_v, feed_dict=inputs)

    # Log changes from update
    pi_l_new, v_l_new, kl, cf = sess.run(
        [pi_loss_sum, v_loss, approx_kl, clipfrac], feed_dict=inputs)
    sum_dict = lambda x: x if type(x) is not dict else np.sum(list(x.values()))
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=sum_dict(kl),
                 Entropy=sum_dict(ent), ClipFrac=sum_dict(cf),
                 DeltaLossPi=(pi_l_new - pi_l_old),
                 DeltaLossV=(v_l_new - v_l_old))
def update():
    inputs = {k: v for k, v in zip(all_phs, buf.get())}
    pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                      feed_dict=inputs)

    for i in range(train_pi_iters):
        _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
        kl = mpi_avg(kl)
        if kl > 1.5 * target_kl:
            logger.log('Max reached at step %d ' % i)
            break
    logger.store(StopIter=i)

    for _ in range(train_v_iters):
        sess.run(train_v, feed_dict=inputs)

    pi_l_new, v_l_new, kl, cf = sess.run(
        [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent,
                 ClipFrac=cf,
                 DeltaLossPi=(pi_l_new - pi_l_old),
                 DeltaLossV=(v_l_new - v_l_old))
def update():
    # create a dictionary of values, which specify to tensorflow what
    # to input for the placeholders: tensors containing the data from
    # the trajectory we have stored in buf
    inputs = {k: v for k, v in zip(all_phs, buf.get())}

    # calculate these for logging later
    pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                      feed_dict=inputs)

    # Training
    for i in range(train_pi_iters):
        # run a training step for the policy, and estimate the kl-divergence
        # (ie. how much the policy changed) on this step
        _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
        kl = mpi_avg(kl)

        # if the kl divergence is too high, stop training on this step
        # TODO: understand better why it is important to do this
        if kl > 1.5 * target_kl:
            logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break
    logger.store(StopIter=i)

    # train our value function mlp
    for _ in range(train_v_iters):
        sess.run(train_v, feed_dict=inputs)

    # "Log changes from update" -OpenAI
    # TODO: This could be made a bit more computationally efficient by not
    # recalculating pi_l_old each loop after having calculated the same thing
    # as pi_l_new the previous run through the loop!
    # Plus, does it really make the most sense to output pi_l_old and v_l_old
    # as LossPi and LossV instead of pi_l_new and v_l_new?
    pi_l_new, v_l_new, kl, cf = sess.run(
        [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent,
                 ClipFrac=cf,
                 DeltaLossPi=(pi_l_new - pi_l_old),
                 DeltaLossV=(v_l_new - v_l_old))
def update_original_policy(self, sess, data):
    inputs = {k: v for k, v in zip(self.all_phs, data)}

    # Training
    for i in range(self.config["train_pi_iters"]):
        mini_batch = sample_batch(data, self.config["batch_size"])
        mini_batch_input = {k: v for k, v in zip(self.all_phs, mini_batch)}
        _, kl = sess.run([self.train_pi, self.approx_kl],
                         feed_dict=mini_batch_input)
        kl = mpi_avg(kl)
        if kl > 1.5 * self.config["target_kl"]:
            print("Early stopping at step %d due to reaching max kl." % i)
            break
    # self.logger.store(StopIter=i)

    # Log changes from update
    total_l_new, pi_l_new, v_l_new, kl, ratio, ent, cf = sess.run(
        [
            self.loss,
            self.pi_loss,
            self.mean_vf_loss,
            self.approx_kl,
            self.ratio,
            self.approx_ent,
            self.clipfrac,
        ],
        feed_dict=inputs,
    )
    return (
        total_l_new,
        pi_l_new,
        v_l_new,
        kl,
        ent,
        ratio,
        cf,
    )
def update():
    # Prepare hessian func, gradient eval
    # get inputs as a dictionary, zipping all placeholders with the buffer data
    inputs = {k: v for k, v in zip(all_phs, buf.get())}
    # Hessian-vector product function
    Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x}))
    # get g, pi_l_old, v_l_old
    g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss],
                                    feed_dict=inputs)
    # average g and pi_l_old across processes
    g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old)

    # Core calculations for TRPO or NPG
    # solve Hx = g approximately with conjugate gradient
    x = cg(Hx, g)
    # largest step size that stays inside the KL trust region
    alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS))
    # stash the old parameters
    old_params = sess.run(get_pi_params)

    def set_and_eval(step):
        # set pi params to old_params - alpha * x * step via v_ph
        sess.run(set_pi_params,
                 feed_dict={v_ph: old_params - alpha * x * step})
        # return average of d_kl and pi_loss operation
        return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs))

    if algo == 'npg':
        # npg has no backtracking or hard kl constraint enforcement
        kl, pi_l_new = set_and_eval(step=1.)
    elif algo == 'trpo':
        # trpo augments npg with backtracking line search, hard kl
        # for backtrack iterations
        for j in range(backtrack_iters):
            kl, pi_l_new = set_and_eval(step=backtrack_coeff**j)
            if kl <= delta and pi_l_new <= pi_l_old:
                logger.log('Accepting new params at step %d of line search.' % j)
                logger.store(BacktrackIters=j)
                break
            if j == backtrack_iters - 1:
                logger.log('Line search failed! Keeping old params.')
                logger.store(BacktrackIters=j)
                kl, pi_l_new = set_and_eval(step=0.)

    # Value function updates
    # for train_v_iters iterations
    for _ in range(train_v_iters):
        sess.run(train_vf, feed_dict=inputs)
    # update v_l_new with v_loss operation
    v_l_new = sess.run(v_loss, feed_dict=inputs)

    # Log changes from update
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl,
                 DeltaLossPi=(pi_l_new - pi_l_old),
                 DeltaLossV=(v_l_new - v_l_old))
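The step size alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS)) follows from the quadratic approximation of the KL constraint: for an update direction -alpha * x the approximate KL is 0.5 * alpha**2 * x^T H x, and setting that equal to delta gives the formula. A small numeric sanity check with made-up values (illustrative only, not from the source):

import numpy as np

H = np.array([[2.0, 0.3], [0.3, 1.0]])   # made-up positive-definite curvature matrix
g = np.array([1.0, -0.5])                # made-up policy gradient
delta = 0.01                             # KL trust-region radius

x = np.linalg.solve(H, g)                # exact solve of H x = g (what cg(Hx, g) approximates)
alpha = np.sqrt(2 * delta / (x @ H @ x))
d = alpha * x                            # the full step (step=1. in set_and_eval)
print(0.5 * d @ H @ d)                   # ~0.01: the full step sits on the KL boundary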
def update():
    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp']

        # Policy loss
        pi, logp = ac.pi(obs, act)
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    data = buf.get()

    pi_l_old, pi_info_old = compute_loss_pi(data)
    pi_l_old = pi_l_old.item()
    v_l_old = compute_loss_v(data).item()

    # Train policy with multiple steps of gradient descent
    for i in range(train_pi_iters):
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        kl = mpi_avg(pi_info['kl'])
        if kl > 1.5 * target_kl:
            logger.log('Early stopping at step %d due to reaching max kl.' % i)
            break
        loss_pi.backward()
        mpi_avg_grads(ac.pi)    # average grads across MPI processes
        pi_optimizer.step()
    logger.store(StopIter=i)

    # Value function learning
    for i in range(train_v_iters):
        vf_optimizer.zero_grad()
        loss_v = compute_loss_v(data)
        loss_v.backward()
        mpi_avg_grads(ac.v)    # average grads across MPI processes
        vf_optimizer.step()

    # Log changes from update
    kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent,
                 ClipFrac=cf,
                 DeltaLossPi=(loss_pi.item() - pi_l_old),
                 DeltaLossV=(loss_v.item() - v_l_old))
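For reference, a tiny standalone check of the clipped surrogate used in compute_loss_pi above, with made-up probabilities chosen so the clipping is visible (illustrative numbers only):

import torch

clip_ratio = 0.2
logp_old = torch.log(torch.tensor([0.5, 0.5]))
logp     = torch.log(torch.tensor([0.9, 0.1]))   # new policy moved a lot on both actions
adv      = torch.tensor([1.0, 1.0])

ratio = torch.exp(logp - logp_old)                                    # [1.8, 0.2]
clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv   # [1.2, 0.8]
loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()
# min keeps the clipped 1.2 for the first action and the unclipped 0.2 for the
# second, so loss_pi = -(1.2 + 0.2) / 2 = -0.7
print(loss_pi)                                                        # tensor(-0.7000)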
def set_and_eval(step):
    sess.run(set_pi_params, feed_dict={v_ph: old_params - alpha * x * step})
    return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs))
def average_gradients(param_groups):
    for param_group in param_groups:
        for p in param_group['params']:
            if p.requires_grad:
                p.grad.data.copy_(torch.Tensor(mpi_avg(p.grad.data.numpy())))
def ppo(env_fn, actor_critic=core_2.mlp_actor_critic, beta=1, ac_kwargs=dict(),
        seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2,
        pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97,
        max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10):
    """
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() # game environment obs_dim = env.observation_space.shape # get the observe dimension from environment act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph #print(env.action_space) x_ph, a_ph = core_2.placeholders_from_spaces(env.observation_space, env.action_space) # 构建神经网络的时候,a_ph还没有 adv_ph, ret_ph, logp_old_ph, log_old_ph_all = core_2.placeholders(None, None, None, 18) #print(logp_old_ph) #print(log_old_ph_all) # Main outputs from computation graph pi, logp, logp_pi, v, logp_all = actor_critic(x_ph, a_ph, **ac_kwargs) # 目前这里的状态和action都还是放的placeholder # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, log_old_ph_all] # Every step, get: action, value, and logprob # 每一步都需要得到action(这里的pi似乎表示action) get_action_ops = [pi, v, logp_pi, logp_all] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core_2.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) #print((tf.exp(log_old_ph_all) * (logp - logp_old_ph))) kl = tf.reduce_mean(tf.multiply(tf.exp(log_old_ph_all),tf.transpose([logp - logp_old_ph]))) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) #pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) # 两部分的loss pi_loss = -tf.reduce_mean(ratio * adv_ph - beta * kl) v_loss = tf.reduce_mean((ret_ph - v) ** 2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean(logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean(-logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes # 同步参数 sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # 主循环 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t, logp_all = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log # 把数据放进 buffer pool 里 buf.store(o, a, r, v_t, logp_t, logp_all) logger.store(VVals=v_t) # o 应该代表observation o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' 
                          % ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        # after collecting one epoch of experience, perform one update
        # update()
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kld = sess.run([train_pi, kl], feed_dict=inputs)
            kld = mpi_avg(kld)
            if kld > 1.5 * target_kl:
                beta = 2 * beta
            if kld < target_kl / 1.5:
                beta = beta / 2
            # logger.log('Early stopping at step %d due to reaching max kl.' % i)
            # break
        logger.store(StopIter=i)

        # The loop above trains the policy; this one trains the value function
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        # fetch into kl_new so the graph tensor `kl` is not shadowed for the next epoch
        pi_l_new, v_l_new, kl_new, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl_new, Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()