    def update():
        # Prepare hessian func, gradient eval
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x}))
        g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss],
                                        feed_dict=inputs)
        g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old)

        # Core calculations for TRPO or NPG
        x = cg(Hx, g)
        alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS))
        old_params = sess.run(get_pi_params)
        old_penalty = env.penalty(env.s)

        def set_and_eval(step):
            sess.run(set_pi_params,
                     feed_dict={v_ph: old_params - alpha * x * step})
            return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs))

        if algo == 'npg':
            # npg has no backtracking or hard kl constraint enforcement
            kl, pi_l_new = set_and_eval(step=1.)

        elif algo == 'trpo':
            # trpo augments npg with backtracking line search, hard kl
            for j in range(backtrack_iters):
                old_penalty = env.penalty(env.s)
                kl, pi_l_new = set_and_eval(step=backtrack_coeff**j)
                penalty = env.penalty(env.s)
                #print("Old Penalty {}, Penalty {}".format(old_penalty,penalty))
                #if kl <= delta and pi_l_new <= pi_l_old:
                if penalty == 0 or penalty < old_penalty:
                    logger.log(
                        'Accepting new params at step %d of line search.' % j)
                    logger.store(BacktrackIters=j)
                    logger.store(penalty=penalty, old_penalty=old_penalty)
                    break

                if j == backtrack_iters - 1:
                    logger.log('Line search failed! Keeping old params.')
                    logger.store(BacktrackIters=j)
                    logger.store(penalty=penalty, old_penalty=old_penalty)
                    kl, pi_l_new = set_and_eval(step=0.)

        # Value function updates
        for _ in range(train_v_iters):
            sess.run(train_vf, feed_dict=inputs)
        v_l_new = sess.run(v_loss, feed_dict=inputs)

        # Log changes from update
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))
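The TRPO snippets on this page call a cg(Hx, g) helper to obtain the search direction x ≈ H⁻¹g without ever forming the Hessian. For reference, here is a minimal sketch of such a conjugate-gradient solver, modeled on the Spinning Up implementation; the cg_iters default and the EPS constant are assumptions, not part of the snippets above.

import numpy as np

EPS = 1e-8  # assumed small constant, matching the EPS used in the snippets


def cg(Ax, b, cg_iters=10):
    """Approximately solve Ax(x) = b by conjugate gradient, where Ax is a
    function returning Hessian-vector products (the Hx lambda above)."""
    x = np.zeros_like(b)
    r = b.copy()               # residual; with x = 0, r = b - A x = b
    p = r.copy()               # search direction
    r_dot_old = np.dot(r, r)
    for _ in range(cg_iters):
        z = Ax(p)
        alpha = r_dot_old / (np.dot(p, z) + EPS)
        x += alpha * p
        r -= alpha * z
        r_dot_new = np.dot(r, r)
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return x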
Example #2
    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        #TODO: Next step is to calculate the hessian using safe distance
        #Hx = lambda x : mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x}))
        g, pi_l_old, v_l_old, ent = sess.run(
            [gradient, pi_loss, v_loss, approx_ent], feed_dict=inputs)
        g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old)
        #x = cg(Hx, g)
        #x = optimize.fmin_cg(pi_l_old, x0, fprime=g)
        x = g
        old_params = sess.run(get_pi_params)
        old_penalty = env.penalty(env.s)
        alpha = np.sqrt(2 * delta / (np.dot(x, g) + EPS))

        # backtracking line search, hard constraint check on env penalty
        for j in range(backtrack_iters):
            step = backtrack_coeff**j
            sess.run(set_pi_params,
                     feed_dict={v_ph: old_params - alpha * x * step})
            pi_l_new = sess.run(pi_loss, feed_dict=inputs)
            penalty = env.penalty(env.s)
            #print("Old Penalty {}, Penalty {}".format(old_penalty,penalty))

            if penalty == 0 or penalty < old_penalty:
                #if pi_l_new <= pi_l_old:
                logger.log('Accepting new params at step %d of line search.' %
                           j)
                logger.store(BacktrackIters=j)
                logger.store(penalty=penalty, old_penalty=old_penalty)
                break

            if j == backtrack_iters - 1:
                logger.log('Line search failed! Keeping old params.')
                logger.store(BacktrackIters=j)
                logger.store(penalty=penalty, old_penalty=old_penalty)

        # Policy gradient step
        #sess.run(train_pi, feed_dict=inputs)

        # Value function learning
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        #pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl], feed_dict={v_ph: old_params - alpha * x * step})
        logger.store(LossPi=pi_l_old,
                     Entropy=ent,
                     DeltaLossPi=(pi_l_new - pi_l_old))
    def update(self):
        logger = self.logger

        inputs = {k: v for k, v in zip(self.all_phs, self.buf.get())}
        pi_l_old, v_l_old, ent = self.sess.run(
            [self.pi_loss, self.v_loss, self.approx_ent], feed_dict=inputs)

        # Training
        for i in range(self.train_pi_iters):
            _, kl = self.sess.run([self.train_pi, self.approx_kl],
                                  feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * self.target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(self.train_v_iters):
            self.sess.run(self.train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = self.sess.run(
            [self.pi_loss, self.v_loss, self.approx_kl, self.clipfrac],
            feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))
Example #4
 def demo_update():
     data = buf.get()
     pi_l_old, pi_info_old = compute_loss_pi(data)
     pi_l_old = pi_l_old.item()
     v_l_old = compute_loss_v_pi(data).item()
     for i in range(train_pi_iters):
         pi_optimizer.zero_grad()
         loss_pi, pi_info = compute_loss_pi(data)
         kl = mpi_avg(pi_info['kl'])
         if kl > 1.5 * target_kl:
             # logger.log('Early stopping at step %d due to reaching max kl.' % i)
             break
         loss_pi.backward()
         mpi_avg_grads(ac.pi)  # average grads across MPI processes
         pi_optimizer.step()
     logger.store(StopIter=i)
     for i in range(train_v_iters):
         vf_pi_optimizer.zero_grad()
         loss_v = compute_loss_v_pi(data)
         loss_v.backward()
         mpi_avg_grads(ac.v_pi)
         vf_pi_optimizer.step()
     print("Pi loss:     {}".format(pi_l_old))
     kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
     logger.store(LossPi=pi_l_old,
                  LossV=v_l_old,
                  KL=kl,
                  Entropy=ent,
                  ClipFrac=cf,
                  DeltaLossPi=(loss_pi.item() - pi_l_old),
                  DeltaLossV=(loss_v.item() - v_l_old))
Example #5
    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            if args.alpha == 'auto':
                sess.run(train_alpha_op, feed_dict=inputs)
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        # for _ in range(train_v_iters):
        #     sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old),
                     Alpha=sess.run(alpha) if args.alpha == 'auto' else alpha)
Example #6
    def update(epoch):
        #inputs = {k:v for k,v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run(
            [pi_loss, v_loss, approx_ent],
            feed_dict={
                logp_old_ph: buf.logp_buf,
                x_ph: o,
                a_ph: a,
                adv_ph: buf.adv_buf,
                ret_ph: buf.ret_buf
            })
        #pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)
        summary = tf.Summary(
            value=[tf.Summary.Value(tag="loss", simple_value=pi_l_old)])
        test_writer.add_summary(summary, epoch)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run(
                [train_pi, approx_kl],
                feed_dict={
                    logp_old_ph: buf.logp_buf,
                    x_ph: o,
                    a_ph: a,
                    adv_ph: buf.adv_buf,
                    ret_ph: buf.ret_buf
                })
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v,
                     feed_dict={
                         logp_old_ph: buf.logp_buf,
                         x_ph: o,
                         a_ph: a,
                         adv_ph: buf.adv_buf,
                         ret_ph: buf.ret_buf
                     })

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac],
            feed_dict={
                logp_old_ph: buf.logp_buf,
                x_ph: o,
                a_ph: a,
                adv_ph: buf.adv_buf,
                ret_ph: buf.ret_buf
            })
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))
    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)
        # lllogp, mmmu, llog_std = sess.run([logp, mu, log_std], feed_dict=inputs)

        # logp is essentially the same as logp_old_ph; the discrepancy starts around 1e-6,
        # which is a little strange...

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))
    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}  # all_phs holds the placeholders corresponding to each buffer
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training  # does this also need changing? probably not
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:  # if the kl at this update exceeds 1.5x the target, log it and break out of the training loop
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):  # update v
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update (compute the new losses)
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)

        std, std_ent = sess.run([pi_std, entropy], feed_dict=inputs)
        logger.store(
            LossPi=pi_l_old,
            LossV=v_l_old,
            KL=kl,
            Entropy=std_ent,
            ClipFrac=cf,
            DeltaLossPi=(pi_l_new - pi_l_old),  # improvement from the update
            DeltaLossV=(v_l_new - v_l_old),
            Std=std)
Example #9
    def update():
        # form the inputs into a dict for convenient use below
        # run gradient descent on the collected data to update the parameters
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        # the training above is for the policy; this part trains the value function
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))
Example #10
    def update():
        data = buf.get()

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)  # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)  # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))
Example #11
    def update():
        obs, act, adv, ret, logp_old = [torch.Tensor(x) for x in buf.get()]

        # Training policy
        _, logp, _ = actor_critic.policy(obs, act)
        ratio = (logp - logp_old).exp()
        min_adv = torch.where(adv>0, (1+clip_ratio)*adv, (1-clip_ratio)*adv)
        pi_l_old = -(torch.min(ratio * adv, min_adv)).mean()
        ent = (-logp).mean()  # a sample estimate for entropy

        for i in range(train_pi_iters):
            # Output from policy function graph
            _, logp, _ = actor_critic.policy(obs, act)
            # PPO policy objective
            ratio = (logp - logp_old).exp()
            min_adv = torch.where(adv>0, (1+clip_ratio)*adv, (1-clip_ratio)*adv)
            pi_loss = -(torch.min(ratio * adv, min_adv)).mean()

            # Policy gradient step
            train_pi.zero_grad()
            pi_loss.backward()
            average_gradients(train_pi.param_groups)
            train_pi.step()

            _, logp, _ = actor_critic.policy(obs, act)
            kl = (logp_old - logp).mean()
            kl = mpi_avg(kl.item())
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.'%i)
                break
        logger.store(StopIter=i)

        # Training value function
        v = actor_critic.value_function(obs)
        v_l_old = F.mse_loss(v, ret)
        for _ in range(train_v_iters):
            # Output from value function graph
            v = actor_critic.value_function(obs)
            # PPO value function objective
            v_loss = F.mse_loss(v, ret)

            # Value function gradient step
            train_v.zero_grad()
            v_loss.backward()
            average_gradients(train_v.param_groups)
            train_v.step()

        # Log changes from update
        _, logp, _, v = actor_critic(obs, act)
        ratio = (logp - logp_old).exp()
        min_adv = torch.where(adv>0, (1+clip_ratio)*adv, (1-clip_ratio)*adv)
        pi_l_new = -(torch.min(ratio * adv, min_adv)).mean()
        v_l_new = F.mse_loss(v, ret)
        kl = (logp_old - logp).mean() # a sample estimate for KL-divergence
        clipped = (ratio > (1+clip_ratio)) | (ratio < (1-clip_ratio))
        cf = (clipped.float()).mean()
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))
Example #12
    def update():
        data = buf.get()

        v_l_old, pi_l_old, pi_info_old = compute_loss(data)
        pi_l_old = pi_l_old.item()
        v_l_old = v_l_old.item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_iters):
            optimizer.zero_grad()
            loss_v, loss_pi, pi_info = compute_loss(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log(
                    f'Early stopping at step {i} due to reaching max kl.')
                break

            loss = loss_pi + loss_v * v_loss_coeff
            loss.backward()
            mpi_avg_grads(ac.ac)  # average grads across MPI processes
            optimizer.step()

        logger.store(StopIter=i)

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))
Example #13
    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        # zip([x_ph, a_ph, adv_ph, ret_ph, logp_old_ph],
        #     [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf, self.logp_buf])
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)  # feed the data above; compute both losses and the entropy

        # Training
        for i in range(train_pi_iters):  # policy iterations
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)  # compute kl
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break  # stop policy training early
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)  # train the critic network

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac],
            feed_dict=inputs)  # recompute the losses, kl, and cf
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))  # log the old losses, kl, cf, and the delta losses
Example #14
    def update():

        print("======= update!")

        #get aux data from the buffer and match it with its respective placeholders
        buf_data = buf.get(aux_vars_only=True)
        aux_inputs = {k: v for k, v in zip(new_phs, buf_data)}

        #for the training, the actions taken during the experience loop are also inputs to the network
        extra_dict = {k: v for k, v in buf.act_buf.items() if k != 'vpred'}

        for k, v in extra_dict.items():
            if k == 'action_movement':
                extra_dict[k] = np.expand_dims(v, 1)

        #actions and aux variables from the buffer are joined and passed to compute_metrics (observations are joined within the functions)
        extra_dict.update(aux_inputs)
        pi_l_old, v_l_old, ent, kl, cf = compute_metrics(extra_dict)

        # Policy training loop
        for i in range(train_pi_iters):
            if i % 10 == 0:
                print("training pi iter ", i)
            kl = train('pi', extra_dict)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break

        logger.store(StopIter=i)
        print("")

        # Value training loop
        for j in range(train_v_iters):
            if j % 10 == 0:
                print("training v iter ", j)
            train('v', extra_dict)

        # Log changes from update with a new run on compute_metrics
        pi_l_new, v_l_new, ent, kl, cf = compute_metrics(extra_dict)

        #Store information
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

        #Reset experience variables
        o, ep_ret, ep_len = env.reset(), 0, 0

        #Reset policy
        for policy in policies:
            policy.reset()

        print("======= update finished!")
Example #15
        def set_and_eval(step):
            # set pi params with v_ph
            # old_params - alpha * x * step
            sess.run(set_pi_params,
                     feed_dict={v_ph: old_params - alpha * x * step})

            # return average of d_kl and pi_loss operation
            return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs))
Example #16
def mpi_avg_grads(module):
    """ Average contents of gradient buffers across MPI processes. """
    if num_procs() == 1:
        return
    for p in module.parameters():
        p_grad_numpy = p.grad.detach().numpy()  # numpy view of tensor data
        avg_p_grad = mpi_avg(p.grad.detach())
        p_grad_numpy[:] = avg_p_grad[:]
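The mpi_avg helper is used by every snippet on this page but never shown. Below is a minimal sketch of what it plausibly looks like, mirroring the Spinning Up mpi_tools helper; the float32 cast and the scalar handling are assumptions.

import numpy as np
from mpi4py import MPI


def mpi_avg(x):
    """Average a scalar or numpy array over all MPI processes."""
    comm = MPI.COMM_WORLD
    if comm.Get_size() == 1:
        return x
    x, scalar = ([x], True) if np.isscalar(x) else (x, False)
    x = np.asarray(x, dtype=np.float32)
    buf = np.zeros_like(x)
    comm.Allreduce(x, buf, op=MPI.SUM)   # elementwise sum across processes
    avg = buf / comm.Get_size()
    return avg[0] if scalar else avg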
Example #17
    def update():
        epsilon = 0.1
        data = buf.get()

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
            # Manually update pi.parameters
            # loss_pi.backward()
            for l in ac.pi.logits_net:
                for x in l.parameters():
                    y, = torch.autograd.grad(loss_pi,
                                             x,
                                             create_graph=True,
                                             retain_graph=True)
                    w = torch.zeros(y.size(), requires_grad=True)
                    g, = torch.autograd.grad(y,
                                             x,
                                             grad_outputs=w,
                                             create_graph=True)
                    r, = torch.autograd.grad(g,
                                             w,
                                             grad_outputs=y,
                                             create_graph=False)
                    x.grad = y - epsilon * r

            mpi_avg_grads(ac.pi)  # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)  # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))
Example #18
    def update(self):
        inputs = {k: v for k, v in zip(self.all_phs, self.buf.get())}

        # Training
        for i in range(self.train_pi_iters):
            _, kl = self.sess.run([self.train_pi, self.approx_kl],
                                  feed_dict=inputs)
            kl = mpi_avg(kl)

            if kl > 1.5 * self.target_kl:
                print(
                    'process %d: Early stopping at step %d due to reaching max kl.'
                    % (proc_id(), i))
                break

        for _ in range(self.train_v_iters):
            self.sess.run(self.train_v, feed_dict=inputs)
Example #19
    def update_tmp(self, sess, data):
        inputs = {k: v for k, v in zip(self.all_phs, data)}
        # Training
        for i in range(self.config["train_pi_iters"]):
            _, kl, entropy = sess.run(
                [self.train_pi_tmp, self.approx_kl_tmp, self.approx_ent_tmp],
                feed_dict=inputs,
            )
            kl = mpi_avg(kl)
            if kl > 1.5 * self.config["target_kl"]:
                print("Early stopping at step %d due to reaching max kl." % i)
                break
            if entropy < 1.5:
                self.config["entropy_coeff"] = 0.02
            if entropy < 1.3:
                self.config["entropy_coeff"] = 0.04
            if entropy < 1.1:
                self.config["entropy_coeff"] = 0.08
        # self.logger.store(StopIterTmp=i)

        # Log changes from update
        total_l_new, pi_l_new, v_l_new, kl, ratio_tmp, cf = sess.run(
            [
                self.loss_tmp,
                self.pi_loss_tmp,
                self.mean_vf_loss_tmp,
                self.approx_kl_tmp,
                self.ratio_tmp,
                self.clipfrac_tmp,
            ],
            feed_dict=inputs,
        )
        return (
            total_l_new,
            pi_l_new,
            v_l_new,
            kl,
            entropy,
            ratio_tmp,
            cf,
        )
Example #20
    def update():

        inputs = {}
        for k, v in zip(all_phs, buf.get()):
            if type(k) is not dict:
                inputs[k] = v
            else:
                for k_, v_ in zip(k.values(), v.values()):
                    inputs[k_] = v_

        pi_l_old, v_l_old, ent = sess.run([pi_loss_sum, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            for k in kl:
                kl[k] = mpi_avg(kl[k])
            if max(list(kl.values())) > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss_sum, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        sum_dict = lambda x: x if type(x) is not dict else np.sum(
            list(x.values()))

        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=sum_dict(kl),
                     Entropy=sum_dict(ent),
                     ClipFrac=sum_dict(cf),
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))
Example #21
 def update():
     inputs = {k: v for k, v in zip(all_phs, buf.get())}
     pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                       feed_dict=inputs)
     for i in range(train_pi_iters):
         _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
         kl = mpi_avg(kl)
         if kl > 1.5 * target_kl:
              logger.log('Max kl reached at step %d, stopping early.' % i)
             break
     logger.store(StopIter=i)
     for _ in range(train_v_iters):
         sess.run(train_v, feed_dict=inputs)
     pi_l_new, v_l_new, kl, cf = sess.run(
         [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
     logger.store(LossPi=pi_l_old,
                  LossV=v_l_old,
                  KL=kl,
                  Entropy=ent,
                  ClipFrac=cf,
                  DeltaLossPi=(pi_l_new - pi_l_old),
                  DeltaLossV=(v_l_new - v_l_old))
Example #22
  def update():
    # create a dictionary of values, which specify to tensorflow what
    # to input for the placeholders: tensors containing the data from
    # the trajectory we have stored in buf
    inputs = {k:v for k, v in zip(all_phs, buf.get())}

    # calculate these for logging later
    pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

    # Training
    for i in range(train_pi_iters):
      # run a training step for the policy, and estimate the kl-divergence
      # (ie. how much the policy changed) on this step
      _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
      kl = mpi_avg(kl)

      # if the kl divergence is too high, stop training on this step
      # TODO: understand better why it is important to do this
      if kl > 1.5 * target_kl:
        logger.log('Early stopping at step %d due to reaching max kl.'%i)
        break

    logger.store(StopIter=i)

    # train our value function mlp
    for _ in range(train_v_iters):
      sess.run(train_v, feed_dict=inputs)

    # "Log changes from update" -OpenAI
    # TODO: This could be made a bit more computationally efficient by not recalculating pi_l_old each loop
    # after having calculated the same thing as pi_l_new the previous run through the loop!
    # Plus, does it really make the most sense to output pi_l_old and v_l_old as LossPi and LossV
    # instead of pi_l_new and v_l_new?
    pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
    logger.store(LossPi=pi_l_old, LossV=v_l_old,
        KL=kl, Entropy=ent, ClipFrac=cf,
        DeltaLossPi=(pi_l_new - pi_l_old),
        DeltaLossV=(v_l_new - v_l_old))
Example #23
    def update_original_policy(self, sess, data):

        inputs = {k: v for k, v in zip(self.all_phs, data)}
        # Training
        for i in range(self.config["train_pi_iters"]):
            mini_batch = sample_batch(data, self.config["batch_size"])
            mini_batch_input = {k: v for k, v in zip(self.all_phs, mini_batch)}
            _, kl = sess.run([self.train_pi, self.approx_kl],
                             feed_dict=mini_batch_input)
            kl = mpi_avg(kl)
            if kl > 1.5 * self.config["target_kl"]:
                print("Early stopping at step %d due to reaching max kl." % i)
                break
        # self.logger.store(StopIter=i)

        # Log changes from update
        total_l_new, pi_l_new, v_l_new, kl, ratio, ent, cf = sess.run(
            [
                self.loss,
                self.pi_loss,
                self.mean_vf_loss,
                self.approx_kl,
                self.ratio,
                self.approx_ent,
                self.clipfrac,
            ],
            feed_dict=inputs,
        )

        return (
            total_l_new,
            pi_l_new,
            v_l_new,
            kl,
            ent,
            ratio,
            cf,
        )
Example #24
    def update():
        # Prepare hessian func, gradient eval
        # get inputs as a dictionary, all phs and buffer
        inputs = {k: v for k, v in zip(all_phs, buf.get())}

        # calculate Hx
        Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x}))

        # get g, pi_l_old, v_l_old
        g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss],
                                        feed_dict=inputs)

        # get g and pi_l_old averages
        g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old)

        # Core calculations for TRPO or NPG
        # get x
        x = cg(Hx, g)

        # get alpha
        alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS))

        # get old parameters
        old_params = sess.run(get_pi_params)

        def set_and_eval(step):
            # set pi params with v_ph
            # old_params - alpha * x * step
            sess.run(set_pi_params,
                     feed_dict={v_ph: old_params - alpha * x * step})

            # return average of d_kl and pi_loss operation
            return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs))

        if algo == 'npg':
            # npg has no backtracking or hard kl constraint enforcement
            kl, pi_l_new = set_and_eval(step=1.)

        elif algo == 'trpo':
            # trpo augments npg with backtracking line search, hard kl
            # for backtrack iterations
            for j in range(backtrack_iters):
                kl, pi_l_new = set_and_eval(step=backtrack_coeff**j)
                if kl <= delta and pi_l_new <= pi_l_old:
                    logger.log(
                        'Accepting new params at step %d of line search.' % j)
                    logger.store(BacktrackIters=j)
                    break

                if j == backtrack_iters - 1:
                    logger.log('Line search failed! Keeping old params.')
                    logger.store(BacktrackIters=j)
                    kl, pi_l_new = set_and_eval(step=0.)

        # Value function updates
        # for train_v_iterations
        for _ in range(train_v_iters):
            sess.run(train_vf, feed_dict=inputs)

        # update v_l_new with v_loss operation
        v_l_new = sess.run(v_loss, feed_dict=inputs)

        # Log changes from update
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))
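The step size used in these TRPO snippets, alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS)), follows from the second-order approximation of the KL constraint along the direction x returned by cg. A short derivation:

% With a step s = alpha * x, the KL between old and new policies is approximated
% to second order by D_KL ~ (1/2) s^T H s = (1/2) alpha^2 x^T H x.
% Setting this equal to the trust-region radius delta and solving for alpha gives
\[
    \alpha = \sqrt{\frac{2\delta}{x^{\top} H x + \epsilon}},
\]
% which is exactly  alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS)).

Backtracking then shrinks this step by backtrack_coeff**j until the sampled KL and surrogate loss pass the acceptance test in the line search above.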
Example #25
        def update():

            # Set up function for computing PPO policy loss
            def compute_loss_pi(data):
                obs, act, adv, logp_old = data['obs'], data['act'], data[
                    'adv'], data['logp']

                # Policy loss
                pi, logp = ac.pi(obs, act)
                ratio = torch.exp(logp - logp_old)
                clip_adv = torch.clamp(ratio, 1 - clip_ratio,
                                       1 + clip_ratio) * adv
                loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

                # Useful extra info
                approx_kl = (logp_old - logp).mean().item()
                ent = pi.entropy().mean().item()
                clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
                clipfrac = torch.as_tensor(clipped,
                                           dtype=torch.float32).mean().item()
                pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

                return loss_pi, pi_info

            # Set up function for computing value loss
            def compute_loss_v(data):
                obs, ret = data['obs'], data['ret']
                return ((ac.v(obs) - ret)**2).mean()

            data = buf.get()

            pi_l_old, pi_info_old = compute_loss_pi(data)
            pi_l_old = pi_l_old.item()
            v_l_old = compute_loss_v(data).item()

            # Train policy with multiple steps of gradient descent
            for i in range(train_pi_iters):
                pi_optimizer.zero_grad()
                loss_pi, pi_info = compute_loss_pi(data)
                kl = mpi_avg(pi_info['kl'])
                if kl > 1.5 * target_kl:
                    logger.log(
                        'Early stopping at step %d due to reaching max kl.' %
                        i)
                    break
                loss_pi.backward()
                mpi_avg_grads(ac.pi)  # average grads across MPI processes
                pi_optimizer.step()

            logger.store(StopIter=i)

            # Value function learning
            for i in range(train_v_iters):
                vf_optimizer.zero_grad()
                loss_v = compute_loss_v(data)
                loss_v.backward()
                mpi_avg_grads(ac.v)  # average grads across MPI processes
                vf_optimizer.step()

            # Log changes from update
            kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
            logger.store(LossPi=pi_l_old,
                         LossV=v_l_old,
                         KL=kl,
                         Entropy=ent,
                         ClipFrac=cf,
                         DeltaLossPi=(loss_pi.item() - pi_l_old),
                         DeltaLossV=(loss_v.item() - v_l_old))
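For reference, the compute_loss_pi defined in the example above is the standard PPO-Clip surrogate (the code returns its negative so the optimizer can minimize), with epsilon playing the role of clip_ratio:

\[
  L^{\mathrm{CLIP}}(\theta) =
  \mathbb{E}_t\Big[\min\big(r_t(\theta)\,\hat{A}_t,\;
  \mathrm{clip}(r_t(\theta),\,1-\epsilon,\,1+\epsilon)\,\hat{A}_t\big)\Big],
  \qquad
  r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\mathrm{old}}}(a_t \mid s_t)}.
\]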
Example #26
 def set_and_eval(step):
     sess.run(set_pi_params,
              feed_dict={v_ph: old_params - alpha * x * step})
     return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs))
Example #27
def average_gradients(param_groups):
    for param_group in param_groups:
        for p in param_group['params']:
            if p.requires_grad:
                p.grad.data.copy_(torch.Tensor(mpi_avg(p.grad.data.numpy())))
Example #28
def ppo(env_fn, actor_critic=core_2.mlp_actor_critic, beta=1, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4,
        vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000,
        target_kl=0.01, logger_kwargs=dict(), save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()  # game environment
    obs_dim = env.observation_space.shape  # get the observation dimension from the environment
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    #print(env.action_space)
    x_ph, a_ph = core_2.placeholders_from_spaces(env.observation_space, env.action_space)  # when the network is built, a_ph is not filled in yet
    adv_ph, ret_ph, logp_old_ph, log_old_ph_all = core_2.placeholders(None, None, None, 18)
    #print(logp_old_ph)
    #print(log_old_ph_all)
    # Main outputs from computation graph
    pi, logp, logp_pi, v, logp_all = actor_critic(x_ph, a_ph, **ac_kwargs)  # at this point the states and actions are still just placeholders

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, log_old_ph_all]

    # Every step, get: action, value, and logprob (an action is needed at every step; here pi appears to denote the sampled action)
    get_action_ops = [pi, v, logp_pi, logp_all]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core_2.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    #print((tf.exp(log_old_ph_all) * (logp - logp_old_ph)))
    kl = tf.reduce_mean(tf.multiply(tf.exp(log_old_ph_all),tf.transpose([logp - logp_old_ph])))
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)
    #pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))  # the two-part (clipped) loss
    pi_loss = -tf.reduce_mean(ratio * adv_ph - beta * kl)

    v_loss = tf.reduce_mean((ret_ph - v) ** 2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(logp_old_ph - logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t, logp_all = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            # put the data into the buffer pool
            buf.store(o, a, r, v_t, logp_t, logp_all)
            logger.store(VVals=v_t)
            # o stands for the observation
            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        # after finishing one round of the game, perform an update
        #update()

        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kld = sess.run([train_pi, kl], feed_dict=inputs)
            kld = mpi_avg(kld)
            if kld > 1.5 * target_kl:
                beta = 2 * beta
            if kld < target_kl / 1.5:
                beta = beta / 2
                # logger.log('Early stopping at step %d due to reaching max kl.' % i)
                # break
        logger.store(StopIter=i)
        # the training above is for the policy; this part is for the value function
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
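A minimal, hypothetical driver for the ppo() function defined in Example #28. The environment id and logger settings are placeholders, and the environment must match the 18-way discrete logp placeholder hard-coded in the graph (core_2.placeholders(None, None, None, 18)).

# Hypothetical usage sketch; 'YourEnv-v0' is a placeholder id, not a real environment.
import gym

if __name__ == '__main__':
    ppo(env_fn=lambda: gym.make('YourEnv-v0'),
        beta=1,
        steps_per_epoch=4000,
        epochs=50,
        target_kl=0.01,
        logger_kwargs=dict(output_dir='/tmp/ppo_kl_penalty', exp_name='ppo_kl_penalty'))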