Example #1
    def update():
        data = buf.get()

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)  # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)  # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))
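All of these update() variants are driven by an outer data-collection loop that fills buf and then calls update() once per epoch. Below is a minimal sketch of that wiring, assuming a Spinning Up-style PPO script; env, ac, buf, steps_per_epoch, max_ep_len and epochs are hypothetical names mirroring the snippets, and the exact collection logic varies by project:

    import torch

    o, ep_len = env.reset(), 0
    for epoch in range(epochs):
        for t in range(steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))
            next_o, r, d, _ = env.step(a)
            buf.store(o, a, r, v, logp)   # store the transition for this epoch
            o = next_o
            ep_len += 1

            timeout = ep_len == max_ep_len
            epoch_ended = t == steps_per_epoch - 1
            if d or timeout or epoch_ended:
                # Bootstrap the value if the trajectory was cut off, else use 0
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                o, ep_len = env.reset(), 0

        update()  # one gradient update per epoch, as in Example #1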
Example #2
    def update():
        data = buf.get()

        # Get loss and info values before update
        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with a single step of gradient descent
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        mpi_avg_grads(ac.pi)  # average grads across MPI processes
        pi_optimizer.step()

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)  # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent = pi_info['kl'], pi_info_old['ent']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))
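These examples all call mpi_avg_grads(...) between the backward pass and the optimizer step so that every MPI process applies the same averaged gradient. A minimal sketch of what such a helper could look like, modeled on the Spinning Up mpi_pytorch utilities (an assumption; the projects' actual helpers may differ):

    from mpi4py import MPI
    import numpy as np

    def mpi_avg(x):
        """Average a numpy array elementwise over all MPI processes."""
        comm = MPI.COMM_WORLD
        x = np.asarray(x, dtype=np.float32)
        out = np.zeros_like(x)
        comm.Allreduce(x, out, op=MPI.SUM)
        return out / comm.Get_size()

    def mpi_avg_grads(module):
        """Average the gradient buffers of a torch.nn.Module in place."""
        if MPI.COMM_WORLD.Get_size() == 1:
            return
        for p in module.parameters():
            if p.grad is None:
                continue
            grad_numpy = p.grad.numpy()   # shares memory with p.grad (CPU tensors)
            grad_numpy[:] = mpi_avg(grad_numpy)[:]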
Example #3
    def update(self):
        data = self.buffer.get()

        actor_loss_old, actor_info_old = self.compute_actor_loss(data)
        actor_loss_old = actor_loss_old.item()
        critic_loss_old = self.compute_critic_loss(data).item()

        # train policy
        self.actor_optimizer.zero_grad()
        actor_loss, actor_info = self.compute_actor_loss(data)
        actor_loss.backward()
        mpi_avg_grads(self.ac.actor)
        self.actor_optimizer.step()

        # train critic
        for i in range(self.num_iter_train_critic):
            self.critic_optimizer.zero_grad()
            critic_loss = self.compute_critic_loss(data)
            critic_loss.backward()
            mpi_avg_grads(self.ac.critic)
            self.critic_optimizer.step()

        # Log changes from update
        kl, entropy = actor_info['kl'], actor_info['entropy']
        self.logger.store(LossPi=actor_loss_old,
                          LossV=critic_loss_old,
                          KL=kl,
                          Entropy=entropy,
                          DeltaLossV=(critic_loss.item() - critic_loss_old),
                          DeltaLossPi=(actor_loss.item() - actor_loss_old))
Example #4
def update(buf, ac, train_v_iters, pi_optimizer, vf_optimizer, logger):
    # Fetch one epoch of trajectories; data is a dict with keys obs, act, ret, adv, logp
    data = buf.get()  # data['obs'].shape=(4000, obs_dim), adv.shape=(4000,)

    # Compute the losses before the update
    pi_l_old, pi_info_old = compute_loss_pi(data=data, actor=ac.pi)
    pi_l_old = pi_l_old.item()
    v_l_old = compute_loss_v(data, critic=ac.v).item()

    # Update the policy network parameters
    pi_optimizer.zero_grad()
    loss_pi, pi_info = compute_loss_pi(data=data, actor=ac.pi)
    loss_pi.backward()
    mpi_avg_grads(ac.pi)  # average grads across MPI processes
    pi_optimizer.step()

    # Update the value network parameters
    for i in range(train_v_iters):
        vf_optimizer.zero_grad()
        loss_v = compute_loss_v(data=data, critic=ac.v)
        loss_v.backward()
        mpi_avg_grads(ac.v)  # average grads across MPI processes
        vf_optimizer.step()

    # Log changes from update
    kl, ent = pi_info['kl'], pi_info_old['ent']
    logger.store(LossPi=pi_l_old,
                 LossV=v_l_old,
                 KL=kl,
                 Entropy=ent,
                 DeltaLossPi=(loss_pi.item() - pi_l_old),
                 DeltaLossV=(loss_v.item() - v_l_old))
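The comments in Example #4 spell out the contract every update() here relies on: buf.get() returns one epoch of trajectories as a dict of tensors keyed obs, act, ret, adv, logp. A hypothetical minimal sketch of such a buffer's get(), loosely modeled on Spinning Up's PPOBuffer (advantage normalization is the main detail the examples depend on):

    import numpy as np
    import torch

    class PPOBufferSketch:
        """Illustrative only: shows the data dict the update() examples consume."""

        def __init__(self, obs_dim, act_dim, size):
            self.obs_buf = np.zeros((size, obs_dim), dtype=np.float32)
            self.act_buf = np.zeros((size, act_dim), dtype=np.float32)
            self.adv_buf = np.zeros(size, dtype=np.float32)
            self.ret_buf = np.zeros(size, dtype=np.float32)
            self.logp_buf = np.zeros(size, dtype=np.float32)
            self.ptr = 0

        def get(self):
            # Normalize advantages to zero mean / unit std, then return torch
            # tensors keyed exactly as the update() functions expect.
            self.ptr = 0
            adv_mean, adv_std = self.adv_buf.mean(), self.adv_buf.std()
            self.adv_buf = (self.adv_buf - adv_mean) / (adv_std + 1e-8)
            data = dict(obs=self.obs_buf, act=self.act_buf, ret=self.ret_buf,
                        adv=self.adv_buf, logp=self.logp_buf)
            return {k: torch.as_tensor(v, dtype=torch.float32) for k, v in data.items()}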
Example #5
    def update():
        data = buf.get()

        v_l_old, pi_l_old, pi_info_old = compute_loss(data)
        pi_l_old = pi_l_old.item()
        v_l_old = v_l_old.item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_iters):
            optimizer.zero_grad()
            loss_v, loss_pi, pi_info = compute_loss(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log(
                    f'Early stopping at step {i} due to reaching max kl.')
                break

            loss = loss_pi + loss_v * v_loss_coeff
            loss.backward()
            mpi_avg_grads(ac.ac)  # average grads across MPI processes
            optimizer.step()

        logger.store(StopIter=i)

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))
Example #6
 def demo_update():
     data = buf.get()
     pi_l_old, pi_info_old = compute_loss_pi(data)
     pi_l_old = pi_l_old.item()
     v_l_old = compute_loss_v_pi(data).item()
     for i in range(train_pi_iters):
         pi_optimizer.zero_grad()
         loss_pi, pi_info = compute_loss_pi(data)
         kl = mpi_avg(pi_info['kl'])
         if kl > 1.5 * target_kl:
             # logger.log('Early stopping at step %d due to reaching max kl.' % i)
             break
         loss_pi.backward()
         mpi_avg_grads(ac.pi)  # average grads across MPI processes
         pi_optimizer.step()
     logger.store(StopIter=i)
     for i in range(train_v_iters):
         vf_pi_optimizer.zero_grad()
         loss_v = compute_loss_v_pi(data)
         loss_v.backward()
         mpi_avg_grads(ac.v_pi)
         vf_pi_optimizer.step()
     print("Pi loss:     {}".format(pi_l_old))
     kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
     logger.store(LossPi=pi_l_old,
                  LossV=v_l_old,
                  KL=kl,
                  Entropy=ent,
                  ClipFrac=cf,
                  DeltaLossPi=(loss_pi.item() - pi_l_old),
                  DeltaLossV=(loss_v.item() - v_l_old))
Example #7
    def update():
        epsilon = 0.1
        data = buf.get()

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
            # Manually update pi.parameters
            # loss_pi.backward()
            # Instead of calling backward(), build each parameter's gradient by
            # double backward: y = dL/dx, then a Hessian-vector product r = H y
            # via the zero-vector trick below, giving the corrected gradient
            # x.grad = y - epsilon * H y
            for l in ac.pi.logits_net:
                for x in l.parameters():
                    y, = torch.autograd.grad(loss_pi,
                                             x,
                                             create_graph=True,
                                             retain_graph=True)
                    w = torch.zeros(y.size(), requires_grad=True)
                    g, = torch.autograd.grad(y,
                                             x,
                                             grad_outputs=w,
                                             create_graph=True)
                    r, = torch.autograd.grad(g,
                                             w,
                                             grad_outputs=y,
                                             create_graph=False)
                    x.grad = y - epsilon * r

            mpi_avg_grads(ac.pi)  # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)  # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))
Example #8
 def update_vf():
     data = buf.get()
     v_l_old = compute_loss_v(data).item()
     print("Loss for Value function: {}".format(v_l_old))
     for i in range(train_v_iters):
         vf_optimizer.zero_grad()
         loss_v = compute_loss_v(data)
         loss_v.backward()
         mpi_avg_grads(ac.v)
         vf_optimizer.step()
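Example #8 fits only the value function. The compute_loss_v it calls is the plain mean-squared regression of the critic onto the empirical returns, the same form that Example #11 below defines inline:

    def compute_loss_v(data):
        # Mean-squared error between value estimates and returns
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret) ** 2).mean()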
Example #9
    def update(episode_buffer):
        # Update
        if episode_buffer.dones[-1]:
            next_value = 0.0
        else:
            last_obs = episode_buffer.next_observations[-1]
            previous_reward = episode_buffer.rewards[-1]
            last_obs_tensor = torch.tensor(last_obs, dtype=torch.float32).unsqueeze(0)
            previous_reward_tensor = torch.tensor([previous_reward], dtype=torch.float32).unsqueeze(0)
            context = agent.get_context()
            next_value = target_agent.predict_value(obs_tensor=last_obs_tensor,
                                                    previous_reward_tensor=previous_reward_tensor,
                                                    goal_grid_code_tensor=goal_grid_code_tensor,
                                                    context=context).cpu().item()

        # Super critical!!
        optimizer.zero_grad()

        # Compute value and policy losses
        loss, info = agent.compute_loss(rewards=np.array(episode_buffer.rewards),
                                        dones=np.array(episode_buffer.dones),
                                        next_value=next_value,
                                        discount_factor=gamma,
                                        use_gae=use_gae,
                                        tau=tau,
                                        value_loss_coef=value_loss_coef,
                                        policy_loss_coef=policy_loss_coef,
                                        entropy_reg_coef=entropy_loss_coef,
                                        grid_layer_wreg_loss_coef=grid_layer_weight_reg_loss_coef)
        loss.backward()
        if use_MPI:
            mpi_pytorch.mpi_avg_grads(agent)

        # Optimize
        if max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
        optimizer.step()

        # Log losses and info
        logger.store(**info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(agent.parameters(), target_agent.parameters()):
                # NB: We use in-place operations "mul_" and "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)
        if use_MPI:
            mpi_pytorch.sync_params(target_agent)
Example #10
    def update():
        data = buf.get()

        # Get loss and info values before update
        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with a single step of gradient descent
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)

        # Manually update pi.parameters
        # loss_pi.backward()
        # Same double-backward trick as in Example #7: y = dL/dx, r = H y via the
        # zero-vector trick, and a corrected gradient x.grad = y - epsilon * r
        epsilon = 0.1  # correction coefficient; not set in this snippet, value taken from Example #7
        for l in ac.pi.logits_net:
            for x in l.parameters():
                y, = torch.autograd.grad(loss_pi,
                                         x,
                                         create_graph=True,
                                         retain_graph=True)
                w = torch.zeros(y.size(), requires_grad=True)
                g, = torch.autograd.grad(y,
                                         x,
                                         grad_outputs=w,
                                         create_graph=True)
                r, = torch.autograd.grad(g,
                                         w,
                                         grad_outputs=y,
                                         create_graph=False)
                x.grad = y - epsilon * r

        mpi_avg_grads(ac.pi)  # average grads across MPI processes
        pi_optimizer.step()

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)  # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent = pi_info['kl'], pi_info_old['ent']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))
Example #11
        def update():

            # Set up function for computing PPO policy loss
            def compute_loss_pi(data):
                obs, act, adv, logp_old = (data['obs'], data['act'],
                                           data['adv'], data['logp'])

                # Policy loss
                pi, logp = ac.pi(obs, act)
                ratio = torch.exp(logp - logp_old)
                clip_adv = torch.clamp(ratio, 1 - clip_ratio,
                                       1 + clip_ratio) * adv
                loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

                # Useful extra info
                approx_kl = (logp_old - logp).mean().item()
                ent = pi.entropy().mean().item()
                clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
                clipfrac = torch.as_tensor(clipped,
                                           dtype=torch.float32).mean().item()
                pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

                return loss_pi, pi_info

            # Set up function for computing value loss
            def compute_loss_v(data):
                obs, ret = data['obs'], data['ret']
                return ((ac.v(obs) - ret)**2).mean()

            data = buf.get()

            pi_l_old, pi_info_old = compute_loss_pi(data)
            pi_l_old = pi_l_old.item()
            v_l_old = compute_loss_v(data).item()

            # Train policy with multiple steps of gradient descent
            for i in range(train_pi_iters):
                pi_optimizer.zero_grad()
                loss_pi, pi_info = compute_loss_pi(data)
                kl = mpi_avg(pi_info['kl'])
                if kl > 1.5 * target_kl:
                    logger.log(
                        'Early stopping at step %d due to reaching max kl.' %
                        i)
                    break
                loss_pi.backward()
                mpi_avg_grads(ac.pi)  # average grads across MPI processes
                pi_optimizer.step()

            logger.store(StopIter=i)

            # Value function learning
            for i in range(train_v_iters):
                vf_optimizer.zero_grad()
                loss_v = compute_loss_v(data)
                loss_v.backward()
                mpi_avg_grads(ac.v)  # average grads across MPI processes
                vf_optimizer.step()

            # Log changes from update
            kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
            logger.store(LossPi=pi_l_old,
                         LossV=v_l_old,
                         KL=kl,
                         Entropy=ent,
                         ClipFrac=cf,
                         DeltaLossPi=(loss_pi.item() - pi_l_old),
                         DeltaLossV=(loss_v.item() - v_l_old))
Example #12
    def update():
        data = buf.get()

        # compute old pi distribution
        obs, act = data['obs'], data['act']
        with torch.no_grad():
            old_pi, _ = ac.pi(obs, act)

        pi_loss = compute_loss_pi(data)
        pi_l_old = pi_loss.item()
        v_l_old = compute_loss_v(data).item()

        grads = core.flat_grads(
            torch.autograd.grad(pi_loss, ac.pi.parameters()))

        # Core calculations for TRPO or NPG
        Hx = lambda v: hessian_vector_product(data, old_pi, v)
        x = core.conjugate_gradients(Hx, grads, cg_iters)

        alpha = torch.sqrt(2 * delta / (torch.matmul(x, Hx(x)) + EPS))

        old_params = core.get_flat_params_from(ac.pi)

        def set_and_eval(step):
            new_params = old_params - alpha * x * step
            core.set_flat_params_to(ac.pi, new_params)
            loss_pi, kl_loss = compute_kl_loss_pi(data, old_pi)
            return kl_loss.item(), loss_pi.item()

        if algo == 'npg':
            # npg has no backtracking or hard kl constraint enforcement
            kl, pi_l_new = set_and_eval(step=1.)

        elif algo == 'trpo':
            # trpo augments npg with backtracking line search, hard kl
            for j in range(backtrack_iters):
                kl, pi_l_new = set_and_eval(step=backtrack_coeff**j)
                if kl <= delta and pi_l_new <= pi_l_old:
                    logger.log(
                        'Accepting new params at step %d of line search.' % j)
                    logger.store(BacktrackIters=j)
                    break

                if j == backtrack_iters - 1:
                    logger.log('Line search failed! Keeping old params.')
                    logger.store(BacktrackIters=j)
                    kl, pi_l_new = set_and_eval(step=0.)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)  # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))
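Example #12's TRPO/NPG step solves H x = g with core.conjugate_gradients, using only the Hessian-vector products supplied by Hx. A minimal sketch of what such a routine could look like (an assumption; the project's core module may add damping or other refinements):

    import torch

    def conjugate_gradients(Hx, b, iters, residual_tol=1e-10):
        """Approximately solve H x = b given only the product function Hx(v)."""
        x = torch.zeros_like(b)
        r = b.clone()               # residual b - H x (x starts at zero)
        p = b.clone()               # search direction
        rdotr = torch.dot(r, r)
        for _ in range(iters):
            Ap = Hx(p)
            alpha = rdotr / (torch.dot(p, Ap) + 1e-8)
            x = x + alpha * p
            r = r - alpha * Ap
            new_rdotr = torch.dot(r, r)
            if new_rdotr < residual_tol:
                break
            p = r + (new_rdotr / rdotr) * p
            rdotr = new_rdotr
        return x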