Example #1
from contextlib import contextmanager


@contextmanager
def measure(name, log_enable=True):
    # Time the body of the `with` block and optionally log the elapsed seconds.
    import time
    s = time.time()
    yield
    e = time.time()
    if log_enable:
        logger.log("{}: {:.4f}sec".format(name, e - s))
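A minimal usage sketch, assuming the @contextmanager decorator added above and a logger that is already imported; the same pattern appears as with measure('train') in the later script excerpts:

import time

with measure("example_block"):
    time.sleep(0.1)  # stand-in for the timed work; roughly "example_block: 0.1000sec" is logged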
Example #2
    def __init__(self,
                 env,
                 pol,
                 num_parallel=8,
                 prepro=None,
                 seed=256,
                 worker_cls=None,
                 node_info={}):
        if not ray.is_initialized():
            logger.log(
                "Ray is not initialized. Initializing Ray with no GPU resources.")
            init_ray()

        pol = copy.deepcopy(pol)
        pol.to('cpu')

        pol = ray.put(pol)
        env = ray.put(env)

        resources = []
        for k, v in node_info.items():
            for _ in range(v):
                resources.append({k: 1})
        assert len(resources) <= num_parallel
        if len(resources) < num_parallel:
            for _ in range(num_parallel - len(resources)):
                resources.append(None)

        if worker_cls is None:
            worker_cls = DefaultSampleWorker

        self.workers = [
            worker_cls.as_remote(resources=r).remote(pol, env, seed, i, prepro)
            for i, r in zip(range(num_parallel), resources)
        ]
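This constructor appears to belong to the episode sampler that the later scripts build as EpiSampler(env, pol, num_parallel=..., seed=...). A hedged usage sketch that reuses only calls appearing elsewhere in this document; env and pol are assumed to be constructed beforehand, and the numeric values are placeholders:

sampler = EpiSampler(env, pol, num_parallel=4, seed=256)
epis = sampler.sample(pol, max_epis=16)  # list of episode dicts with keys such as 'obs' and 'rews'
del sampler  # the later scripts delete the sampler when done with it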
Example #3
def train(traj,
          qf, lagged_qf, targ_qf1, targ_qf2,
          optim_qf,
          epoch, batch_size,  # optimization hypers
          tau=0.9999, gamma=0.9,  # advantage estimation
          loss_type='mse'
          ):
    """
    Train function for QT-Opt.

    Parameters
    ----------
    traj : Traj
        Off policy trajectory.
    qf : SAVfunction
        Q function.
    lagged_qf : SAVfunction
        Lagged Q function.
    targ_qf1 : CEMSAVfunction
        Target Q function.
    targ_qf2 : CEMSAVfunction
        Lagged Target Q function.
    optim_qf : torch.optim.Optimizer
        Optimizer for Q function.
    epoch : int
        Number of iterations.
    batch_size : int
        Size of each batch.
    tau : float
        Target updating rate.
    gamma : float
        Discounting rate.
    loss_type : string
        Type of Bellman loss.

    Returns
    -------
    result_dict : dict
        Dictionary which contains losses information.
    """

    qf_losses = []
    logger.log("Optimizing...")

    iterator = traj.random_batch(batch_size, epoch)
    for batch in iterator:
        qf_bellman_loss = lf.clipped_double_bellman(
            qf, targ_qf1, targ_qf2, batch, gamma, loss_type=loss_type)
        optim_qf.zero_grad()
        qf_bellman_loss.backward()
        optim_qf.step()

        for q, targ_q1 in zip(qf.parameters(), targ_qf1.parameters()):
            targ_q1.detach().copy_((1 - tau) * targ_q1.detach() + tau * q.detach())

        for lagged_q, targ_q2 in zip(lagged_qf.parameters(), targ_qf2.parameters()):
            targ_q2.detach().copy_((1 - tau) * targ_q2.detach() + tau * lagged_q.detach())

        qf_losses.append(qf_bellman_loss.detach().cpu().numpy())
    logger.log("Optimization finished!")
    return {'QfLoss': qf_losses}
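The loops over parameters above perform a soft (Polyak-style) target update. As a sketch only, not part of the library, the repeated pattern could be factored into a helper that makes the arithmetic explicit:

import torch


def soft_update(net, targ_net, tau):
    # targ <- (1 - tau) * targ + tau * net, matching the in-place updates above.
    with torch.no_grad():
        for p, targ_p in zip(net.parameters(), targ_net.parameters()):
            targ_p.copy_((1 - tau) * targ_p + tau * p)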
Example #4
def train_dm(traj,
             dyn_model,
             optim_dm,
             epoch=60,
             batch_size=512,
             target='next_obs',
             td=True,
             num_epi_per_seq=1,
             log_enable=True):
    """
    Train function for dynamics model.

    Parameters
    ----------
    traj : Traj
        On policy trajectory.
    dyn_model : Model
        Dynamics model.
    optim_dm : torch.optim.Optimizer
        Optimizer for dynamics model.
    epoch : int
        Number of iterations.
    batch_size : int
        Size of each batch.
    target : str
        Target of the prediction: 'next_obs' or 'rews'.
    td : bool
        If True, dyn_model learns the temporal difference of the target.
    num_epi_per_seq : int
        Number of episodes in one sequence for rnn.
    log_enable : bool
        If True, enable logging.

    Returns
    -------
    result_dict : dict
        Dictionary which contains losses information.
    """

    dm_losses = []
    if log_enable:
        logger.log("Optimizing...")

    batch_size = min(batch_size, traj.num_epi)
    if dyn_model.rnn:
        iterator = traj.random_batch_rnn(batch_size=batch_size, epoch=epoch)
    else:
        iterator = traj.random_batch(batch_size, epoch)

    for batch in iterator:
        dm_loss = update_dm(dyn_model, optim_dm, batch, target=target, td=td)
        dm_losses.append(dm_loss)
    if log_enable:
        logger.log("Optimization finished!")

    return dict(DynModelLoss=dm_losses)
Example #5
def train(
    traj,
    pol,
    targ_pol,
    qf,
    targ_qf,
    optim_pol,
    optim_qf,
    epoch,
    batch_size,  # optimization hypers
    tau,
    gamma,
    log_enable=True,
):

    pol_losses = []
    qf_losses = []
    if log_enable:
        logger.log("Optimizing...")
    for batch, indices in traj.prioritized_random_batch(batch_size,
                                                        epoch,
                                                        return_indices=True):
        qf_bellman_loss = lf.bellman(qf,
                                     targ_qf,
                                     targ_pol,
                                     batch,
                                     gamma,
                                     reduction='none')
        td_loss = torch.sqrt(qf_bellman_loss * 2)
        qf_bellman_loss = torch.mean(qf_bellman_loss)
        optim_qf.zero_grad()
        qf_bellman_loss.backward()
        optim_qf.step()

        pol_loss = lf.ag(pol, qf, batch)
        optim_pol.zero_grad()
        pol_loss.backward()
        optim_pol.step()

        for p, targ_p in zip(pol.parameters(), targ_pol.parameters()):
            targ_p.detach().copy_((1 - tau) * targ_p.detach() +
                                  tau * p.detach())
        for q, targ_q in zip(qf.parameters(), targ_qf.parameters()):
            targ_q.detach().copy_((1 - tau) * targ_q.detach() +
                                  tau * q.detach())

        qf_losses.append(qf_bellman_loss.detach().cpu().numpy())
        pol_losses.append(pol_loss.detach().cpu().numpy())

        traj = tf.update_pris(traj, td_loss, indices)
    if log_enable:
        logger.log("Optimization finished!")

    return {'PolLoss': pol_losses, 'QfLoss': qf_losses}
Example #6
def update_pol(pol,
               batch,
               make_kl=make_kl,
               max_kl=0.01,
               damping=0.1,
               num_cg=10,
               ent_beta=0):
    pol_loss = lf.pg(pol, batch, ent_beta)
    grads = torch.autograd.grad(pol_loss, pol.parameters(), create_graph=True)
    grads = [g.contiguous() for g in grads]
    flat_pol_loss_grad = nn.utils.parameters_to_vector(grads).detach()

    def Fvp(v):
        kl = make_kl(pol, batch)
        kl = torch.mean(kl)

        grads = torch.autograd.grad(kl, pol.parameters(), create_graph=True)
        grads = [g.contiguous() for g in grads]
        flat_grad_kl = nn.utils.parameters_to_vector(grads)
        gvp = torch.sum(flat_grad_kl * v)
        grads = torch.autograd.grad(gvp, pol.parameters())
        grads = [g.contiguous() for g in grads]
        fvp = nn.utils.parameters_to_vector(grads).detach()

        return fvp + v * damping

    stepdir = conjugate_gradients(Fvp, -flat_pol_loss_grad, num_cg)

    shs = 0.5 * torch.sum(stepdir * Fvp(stepdir), 0, keepdim=True)
    if (shs < 0).any():
        logger.log('invalid shs')
        return pol_loss.data.cpu().numpy()

    lm = torch.sqrt(shs / max_kl)
    fullstep = stepdir / lm[0]

    neggdotstepdir = torch.sum(-flat_pol_loss_grad * stepdir, 0, keepdim=True)

    prev_params = nn.utils.parameters_to_vector(
        [p.contiguous() for p in pol.parameters()]).detach()
    success, new_params = linesearch(pol,
                                     batch,
                                     lf.pg,
                                     prev_params,
                                     fullstep,
                                     neggdotstepdir / lm[0],
                                     ent_beta=ent_beta)
    nn.utils.vector_to_parameters(new_params, pol.parameters())

    return pol_loss.detach().cpu().numpy()
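For reference, the scaling applied to stepdir is the standard TRPO step-size rule (stated here from the usual derivation, not from this snippet alone): with s the conjugate-gradient solution of F s = -g and delta = max_kl,

\[
\mathrm{shs} = \tfrac{1}{2}\, s^{\top} F s, \qquad
\lambda = \sqrt{\mathrm{shs} / \delta}, \qquad
\Delta\theta_{\mathrm{full}} = s / \lambda
\;\Rightarrow\;
\tfrac{1}{2}\, \Delta\theta_{\mathrm{full}}^{\top} F\, \Delta\theta_{\mathrm{full}} = \delta,
\]

so the full step saturates the quadratic KL model at max_kl, and the line search only backtracks from it.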
Example #7
    def __init__(self,
                 env,
                 record_video=False,
                 video_schedule=None,
                 log_dir=None,
                 force_reset=False):

        if isinstance(env, str):
            env = gym.envs.make(env)
        self.env = env
        if hasattr(env, 'original_env'):
            self.original_env = env.original_env
        else:
            self.original_env = env
        if self.env.spec is not None:
            self.env_id = env.spec.id
        else:
            self.env_id = None

        if log_dir is None:
            self.monitoring = False
        else:
            if not record_video:
                video_schedule = NoVideoSchedule()
            else:
                if video_schedule is None:
                    video_schedule = CappedCubicVideoSchedule()
            self.env = gym.wrappers.Monitor(self.env,
                                            log_dir,
                                            video_callable=video_schedule,
                                            force=True)
            self.monitoring = True

        self.observation_space = env.observation_space
        logger.log("observation space: {}".format(self.observation_space))
        self.action_space = env.action_space
        logger.log("action space: {}".format(self.action_space))
        if self.env.spec is not None:
            self._horizon = env.spec.tags[
                'wrapper_config.TimeLimit.max_episode_steps']
        else:
            self._horizon = None
        self._log_dir = log_dir
        self._force_reset = force_reset
Example #8
def train(traj,
          student_pol,
          teacher_pol,
          student_optim,
          epoch,
          batchsize,
          num_epi_per_seq=1):
    s_pol_losses = []
    logger.log("Optimizing...")
    iterator = traj.iterate(
        batchsize, epoch) if not student_pol.rnn else traj.iterate_rnn(
            batchsize=batchsize, num_epi_per_seq=num_epi_per_seq, epoch=epoch)
    for batch in iterator:
        s_pol_loss = update_pol(student_pol=student_pol,
                                teacher_pol=teacher_pol,
                                optim_pol=student_optim,
                                batch=batch)
        s_pol_losses.append(s_pol_loss)

    logger.log('Optimization finished')
    return dict(S_Pol_loss=s_pol_losses)
Example #9
def train(traj,
          pol, qfs, targ_qfs, log_alpha,
          optim_pol, optim_qfs, optim_alpha,
          epoch, batch_size,  # optimization hypers
          tau, gamma, sampling, reparam=True,
          log_enable=True,
          max_grad_norm=0.5,
          ):
    """
    Train function for soft actor critic.

    Parameters
    ----------
    traj : Traj
        Off policy trajectory.
    pol : Pol
        Policy.
    qfs : list of SAVfunction
        Q function.
    targ_qfs : list of SAVfunction
        Target Q function.
    log_alpha : torch.Tensor
        Temperature parameter of entropy.
    optim_pol : torch.optim.Optimizer
        Optimizer for Policy.
    optim_qfs : list of torch.optim.Optimizer
        Optimizer for Q function.
    optim_alpha : torch.optim.Optimizer
        Optimizer for alpha.
    epoch : int
        Number of iterations.
    batch_size : int
        Size of each batch.
    tau : float
        Target updating rate.
    gamma : float
        Discounting rate.
    sampling : int
        Number of samples used when estimating the expectation.
    reparam : bool
        If True, use the reparameterization trick for the policy gradient.
    log_enable : bool
        If True, enable logging.
    max_grad_norm : float
        Maximum gradient norm.

    Returns
    -------
    result_dict : dict
        Dictionary which contains losses information.
    """

    pol_losses = []
    _qf_losses = []
    alpha_losses = []
    if log_enable:
        logger.log("Optimizing...")
    for batch in traj.random_batch(batch_size, epoch):
        pol_loss, qf_losses, alpha_loss = lf.sac(
            pol, qfs, targ_qfs, log_alpha, batch, gamma, sampling, reparam)

        optim_pol.zero_grad()
        pol_loss.backward()
        torch.nn.utils.clip_grad_norm_(pol.parameters(), max_grad_norm)
        optim_pol.step()

        for qf, optim_qf, qf_loss in zip(qfs, optim_qfs, qf_losses):
            optim_qf.zero_grad()
            qf_loss.backward()
            torch.nn.utils.clip_grad_norm_(qf.parameters(), max_grad_norm)
            optim_qf.step()

        optim_alpha.zero_grad()
        alpha_loss.backward()
        optim_alpha.step()

        for qf, targ_qf in zip(qfs, targ_qfs):
            for q, targ_q in zip(qf.parameters(), targ_qf.parameters()):
                targ_q.detach().copy_((1 - tau) * targ_q.detach() + tau * q.detach())

        pol_losses.append(pol_loss.detach().cpu().numpy())
        _qf_losses.append(
            (sum(qf_losses) / len(qf_losses)).detach().cpu().numpy())
        alpha_losses.append(alpha_loss.detach().cpu().numpy())

    if log_enable:
        logger.log("Optimization finished!")

    return dict(
        PolLoss=pol_losses,
        QfLoss=_qf_losses,
        AlphaLoss=alpha_losses
    )
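A hedged sketch of how this train function is typically driven. The Traj construction mirrors the later script excerpts; sampler, pol, qfs, targ_qfs, log_alpha and the optimizers are assumed to have been built beforehand, and the hyperparameter values are placeholders:

epis = sampler.sample(pol, max_epis=1000)
traj = Traj()
traj.add_epis(epis)
traj = ef.add_next_obs(traj)  # same preprocessing as the expert Traj in the later excerpts
traj.register_epis()
result_dict = train(traj,
                    pol, qfs, targ_qfs, log_alpha,
                    optim_pol, optim_qfs, optim_alpha,
                    epoch=100, batch_size=256,
                    tau=0.01, gamma=0.99, sampling=1)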
Example #10
def train(
    traj,
    pol,
    targ_pol,
    qfs,
    targ_qfs,
    optim_pol,
    optim_qfs,
    epoch,
    batch_size,  # optimization hypers
    tau,
    gamma,  # advantage estimation
    pol_update=True,
    log_enable=True,
    max_grad_norm=0.5,
    target_policy_smoothing_func=None,
):

    pol_losses = []
    _qf_losses = []
    if log_enable:
        logger.log("Optimizing...")

    for batch in traj.random_batch(batch_size, epoch):

        if target_policy_smoothing_func is not None:
            qf_losses = lf.td3(
                qfs,
                targ_qfs,
                targ_pol,
                batch,
                gamma,
                continuous=True,
                deterministic=True,
                sampling=1,
                target_policy_smoothing_func=target_policy_smoothing_func)

        else:
            qf_losses = lf.td3(qfs,
                               targ_qfs,
                               targ_pol,
                               batch,
                               gamma,
                               continuous=True,
                               deterministic=True,
                               sampling=1)

        for qf, optim_qf, qf_loss in zip(qfs, optim_qfs, qf_losses):
            optim_qf.zero_grad()
            qf_loss.backward()
            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(qf.parameters(), max_grad_norm)
            optim_qf.step()

        _qf_losses.append(
            (sum(qf_losses) / len(qf_losses)).detach().cpu().numpy())

        if pol_update:
            pol_loss = lf.ag(pol, qfs[0], batch, no_noise=True)
            optim_pol.zero_grad()
            pol_loss.backward()
            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(pol.parameters(), max_grad_norm)
            optim_pol.step()

            for p, targ_p in zip(pol.parameters(), targ_pol.parameters()):
                targ_p.detach().copy_((1 - tau) * targ_p.detach() +
                                      tau * p.detach())

            for qf, targ_qf in zip(qfs, targ_qfs):
                for q, targ_q in zip(qf.parameters(), targ_qf.parameters()):
                    targ_q.detach().copy_((1 - tau) * targ_q.detach() +
                                          tau * q.detach())

            pol_losses.append(pol_loss.detach().cpu().numpy())

    if log_enable:
        logger.log("Optimization finished!")
    if pol_update:
        return dict(
            PolLoss=pol_losses,
            QfLoss=_qf_losses,
        )
    else:
        return dict(QfLoss=_qf_losses)
Example #11
device_name = 'cpu' if args.cuda < 0 else "cuda:{}".format(args.cuda)
device = torch.device(device_name)
set_device(device)

score_file = os.path.join(args.log, 'progress.csv')
logger.add_tabular_output(score_file)

env, center_env = create_env(args)

ob_space = env.observation_space
ac_space = env.action_space

pol_net = PolNetSNAILConstant(ob_space, ac_space, args.timestep, args.num_channels, num_keys=args.num_keys, num_tc_fils=args.num_tc_fils, no_attention=args.no_attention, use_pe=args.use_pe)

logger.log(str(len(torch.nn.utils.parameters_to_vector(pol_net.parameters()))))

if isinstance(ac_space, gym.spaces.Box):
    pol = GaussianPol(ob_space, ac_space, pol_net, data_parallel=args.data_parallel)
elif isinstance(ac_space, gym.spaces.Discrete):
    pol = CategoricalPol(ob_space, ac_space, pol_net, data_parallel=args.data_parallel)
elif isinstance(ac_space, gym.spaces.MultiDiscrete):
    pol = MultiCategoricalPol(ob_space, ac_space, pol_net, data_parallel=args.data_parallel)
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

if args.pol:
    pol.load_state_dict(torch.load(args.pol, map_location=lambda storage, loc: storage))

vf_net = VNetSNAILConstant(ob_space, args.timestep, args.num_channels, num_keys=args.num_keys, num_tc_fils=args.num_tc_fils, no_attention=args.no_attention, use_pe=args.use_pe)
vf = DeterministicSVfunc(ob_space, vf_net, data_parallel=args.data_parallel)
Example #12
def train(
        traj,
        pol,
        vf,
        optim_vf,
        epoch=5,
        batch_size=64,
        num_epi_per_seq=1,  # optimization hypers
        max_kl=0.01,
        num_cg=10,
        damping=0.1,
        ent_beta=0):
    """
    Train function for trust region policy optimization.

    Parameters
    ----------
    traj : Traj
        On policy trajectory.
    pol : Pol
        Policy.
    vf : SVfunction
        V function.
    optim_vf : torch.optim.Optimizer
        Optimizer for V function.
    epoch : int
        Number of iterations.
    batch_size : int
        Size of each batch.
    num_epi_per_seq : int
        Number of episodes in one sequence for rnn.
    max_kl : float
        Limit of KL divergence.
    num_cg : int
        Number of iterations in the conjugate gradient computation.
    damping : float
        Damping parameter for the Hessian-vector product.
    ent_beta : float
        Entropy coefficient.

    Returns
    -------
    result_dict : dict
        Dictionary which contains losses information.
    """

    pol_losses = []
    vf_losses = []
    logger.log("Optimizing...")
    iterator = traj.full_batch(1) if not pol.rnn else traj.iterate_rnn(
        batch_size=traj.num_epi)
    for batch in iterator:
        pol_loss = update_pol(pol,
                              batch,
                              max_kl=max_kl,
                              num_cg=num_cg,
                              damping=damping,
                              ent_beta=ent_beta)
        pol_losses.append(pol_loss)

    iterator = traj.iterate(batch_size,
                            epoch) if not pol.rnn else traj.iterate_rnn(
                                batch_size=batch_size,
                                num_epi_per_seq=num_epi_per_seq,
                                epoch=epoch)
    for batch in iterator:
        vf_loss = update_vf(vf, optim_vf, batch)
        vf_losses.append(vf_loss)

    logger.log("Optimization finished!")

    return dict(PolLoss=pol_losses, VfLoss=vf_losses)
Example #13
def train(traj,
          pol, qfs, targ_qfs, log_alpha,
          optim_pol, optim_qfs, optim_alpha,
          epoch, batch_size, seq_length, burn_in_length,  # optimization hypers
          tau, gamma, sampling, reparam=True,
          log_enable=True,
          ):
    """
    Train function for soft actor critic.

    Parameters
    ----------
    traj : Traj
        Off policy trajectory.
    pol : Pol
        Policy.
    qfs : list of SAVfunction
        Q function.
    targ_qfs : list of SAVfunction
        Target Q function.
    log_alpha : torch.Tensor
        Temperature parameter of entropy.
    optim_pol : torch.optim.Optimizer
        Optimizer for Policy.
    optim_qfs : list of torch.optim.Optimizer
        Optimizer for Q function.
    optim_alpha : torch.optim.Optimizer
        Optimizer for alpha.
    epoch : int
        Number of iterations.
    batch_size : int
        Size of each batch.
    seq_length : int
        Length of each sampled sequence.
    burn_in_length : int
        Length of the burn-in prefix of each sequence.
    tau : float
        Target updating rate.
    gamma : float
        Discounting rate.
    sampling : int
        Number of samples used when estimating the expectation.
    reparam : bool
        If True, use the reparameterization trick for the policy gradient.
    log_enable : bool
        If True, enable logging.

    Returns
    -------
    result_dict : dict
        Dictionary which contains losses information.
    """

    pol_losses = []
    _qf_losses = []
    alpha_losses = []
    if log_enable:
        logger.log("Optimizing...")
    for batch, start_indices in traj.prioritized_random_batch_rnn(batch_size, seq_length, epoch, return_indices=True):
        batch, pol_loss, qf_losses, alpha_loss, td_losses = lf.r2d2_sac(
            pol, qfs, targ_qfs, log_alpha, batch, gamma, sampling, burn_in_length, reparam)

        optim_pol.zero_grad()
        pol_loss.backward()
        optim_pol.step()

        for optim_qf, qf_loss in zip(optim_qfs, qf_losses):
            optim_qf.zero_grad()
            qf_loss.backward()
            optim_qf.step()

        optim_alpha.zero_grad()
        alpha_loss.backward()
        optim_alpha.step()

        for qf, targ_qf in zip(qfs, targ_qfs):
            for q, targ_q in zip(qf.parameters(), targ_qf.parameters()):
                targ_q.detach().copy_((1 - tau) * targ_q.detach() + tau * q.detach())

        pol_losses.append(pol_loss.detach().cpu().numpy())
        _qf_losses.append(
            (sum(qf_losses) / len(qf_losses)).detach().cpu().numpy())
        alpha_losses.append(alpha_loss.detach().cpu().numpy())

        # update seq_pris
        train_length = seq_length - burn_in_length
        for i in range(batch_size):
            start = start_indices[i] + burn_in_length
            seq_indices = torch.arange(start, start+train_length-1)
            traj = tf.update_pris(
                traj, td_losses[:, i], seq_indices, update_epi_pris=True, seq_length=seq_length)

    if log_enable:
        logger.log("Optimization finished!")

    return dict(
        PolLoss=pol_losses,
        QfLoss=_qf_losses,
        AlphaLoss=alpha_losses
    )
Example #14
def train(
        agent_traj,
        expert_traj,
        pol,
        vf,
        optim_vf,
        optim_discim,
        rewf=None,
        shaping_vf=None,
        advf=None,
        rew_type='rew',
        rl_type='trpo',
        pol_ent_beta=0,
        discrim_ent_beta=0,
        epoch=1,
        batch_size=64,
        discrim_batch_size=32,
        num_epi_per_seq=1,
        discrim_step=1,  # optimization hypers
        damping=0.1,
        max_kl=0.01,
        num_cg=10,  # trpo hypers
        optim_pol=None,
        clip_param=0.2,
        max_grad_norm=0.5,
        clip_vfunc=False,
        kl_beta=1,
        kl_targ=0.01,  # ppo hypers
        gamma=0.995):

    pol_losses = []
    vf_losses = []
    discrim_losses = []

    logger.log("Optimizing...")
    if rl_type == 'trpo':
        iterator = agent_traj.full_batch(
            1) if not pol.rnn else agent_traj.iterate_rnn(
                batch_size=agent_traj.num_epi)
        for batch in iterator:
            pol_loss = trpo.update_pol(pol,
                                       batch,
                                       max_kl=max_kl,
                                       num_cg=num_cg,
                                       damping=damping,
                                       ent_beta=pol_ent_beta)
            pol_losses.append(pol_loss)

        iterator = agent_traj.iterate(
            batch_size, epoch) if not pol.rnn else agent_traj.iterate_rnn(
                batch_size=batch_size,
                num_epi_per_seq=num_epi_per_seq,
                epoch=epoch)
        for batch in iterator:
            vf_loss = trpo.update_vf(vf, optim_vf, batch)
            vf_losses.append(vf_loss)
        new_kl_beta = 0
        kl_mean = 0
    elif rl_type == 'ppo_clip':
        iterator = agent_traj.iterate(
            batch_size, epoch) if not pol.rnn else agent_traj.iterate_rnn(
                batch_size=batch_size,
                num_epi_per_seq=num_epi_per_seq,
                epoch=epoch)
        for batch in iterator:
            pol_loss = ppo_clip.update_pol(pol, optim_pol, batch, clip_param,
                                           pol_ent_beta, max_grad_norm)
            vf_loss = ppo_clip.update_vf(vf, optim_vf, batch, clip_param,
                                         clip_vfunc, max_grad_norm)

            pol_losses.append(pol_loss)
            vf_losses.append(vf_loss)
        new_kl_beta = 0
        kl_mean = 0
    elif rl_type == 'ppo_kl':
        iterator = agent_traj.iterate(
            batch_size, epoch) if not pol.rnn else agent_traj.iterate_rnn(
                batch_size=batch_size,
                num_epi_per_seq=num_epi_per_seq,
                epoch=epoch)
        for batch in iterator:
            pol_loss = ppo_kl.update_pol(pol, optim_pol, batch, kl_beta,
                                         max_grad_norm, pol_ent_beta)
            vf_loss = ppo_kl.update_vf(vf, optim_vf, batch)

            pol_losses.append(pol_loss)
            vf_losses.append(vf_loss)

        iterator = agent_traj.full_batch(
            1) if not pol.rnn else agent_traj.iterate_rnn(
                batch_size=agent_traj.num_epi)
        batch = next(iterator)
        with torch.no_grad():
            pol.reset()
            if pol.rnn:
                _, _, pd_params = pol(batch['obs'], h_masks=batch['h_masks'])
            else:
                _, _, pd_params = pol(batch['obs'])
            kl_mean = torch.mean(pol.pd.kl_pq(batch, pd_params)).item()
        if kl_mean > 1.3 * kl_targ:
            new_kl_beta = 1.5 * kl_beta
        elif kl_mean < 0.7 * kl_targ:
            new_kl_beta = kl_beta / 1.5
        else:
            new_kl_beta = kl_beta
    else:
        raise ValueError('Only trpo, ppo_clip and ppo_kl are supported')

    agent_iterator = agent_traj.iterate_step(batch_size=discrim_batch_size,
                                             step=discrim_step)
    expert_iterator = expert_traj.iterate_step(batch_size=discrim_batch_size,
                                               step=discrim_step)
    for agent_batch, expert_batch in zip(agent_iterator, expert_iterator):
        discrim_loss = update_discrim(rewf, shaping_vf, advf, pol, rew_type,
                                      optim_discim, agent_batch, expert_batch,
                                      gamma)
        discrim_losses.append(discrim_loss)
    logger.log("Optimization finished!")

    return dict(PolLoss=pol_losses,
                VfLoss=vf_losses,
                DiscrimLoss=discrim_losses,
                new_kl_beta=new_kl_beta,
                kl_mean=kl_mean)
Example #15
def train(
        traj,
        pol,
        targ_pol,
        qf,
        targ_qf,
        optim_pol,
        optim_qf,
        epoch,
        batch_size,  # optimization hypers
        tau,
        gamma  # advantage estimation
):
    """
    Train function for deep deterministic policy gradient

    Parameters
    ----------
    traj : Traj
        Off policy trajectory.
    pol : Pol
        Policy.
    targ_pol : Pol
        Target Policy.
    qf : SAVfunction
        Q function.
    targ_qf : SAVfunction
        Target Q function.
    optim_pol : torch.optim.Optimizer
        Optimizer for Policy.
    optim_qf : torch.optim.Optimizer
        Optimizer for Q function.
    epoch : int
        Number of iterations.
    batch_size : int
        Size of each batch.
    tau : float
        Target updating rate.
    gamma : float
        Discounting rate.

    Returns
    -------
    result_dict : dict
        Dictionary which contains losses information.
    """

    pol_losses = []
    qf_losses = []
    logger.log("Optimizing...")
    for batch in traj.random_batch(batch_size, epoch):
        qf_bellman_loss = lf.bellman(qf, targ_qf, targ_pol, batch, gamma)
        optim_qf.zero_grad()
        qf_bellman_loss.backward()
        optim_qf.step()

        pol_loss = lf.ag(pol, qf, batch, no_noise=True)
        optim_pol.zero_grad()
        pol_loss.backward()
        optim_pol.step()

        for p, targ_p in zip(pol.parameters(), targ_pol.parameters()):
            targ_p.detach().copy_((1 - tau) * targ_p.detach() +
                                  tau * p.detach())
        for q, targ_q in zip(qf.parameters(), targ_qf.parameters()):
            targ_q.detach().copy_((1 - tau) * targ_q.detach() +
                                  tau * q.detach())

        qf_losses.append(qf_bellman_loss.detach().cpu().numpy())
        pol_losses.append(pol_loss.detach().cpu().numpy())
    logger.log("Optimization finished!")

    return {'PolLoss': pol_losses, 'QfLoss': qf_losses}
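In standard DDPG notation, the two losses optimized above are presumably the Bellman (critic) loss and the deterministic actor loss:

\[
L_{Q} = \mathbb{E}\Big[\big(Q_{\phi}(s, a) - \big(r + \gamma\, Q_{\phi'}(s', \pi_{\theta'}(s'))\big)\big)^{2}\Big],
\qquad
L_{\pi} = -\,\mathbb{E}\big[\, Q_{\phi}\big(s, \pi_{\theta}(s)\big) \big],
\]

where the primed parameters denote the target networks maintained by the soft update in the loop above.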
Example #16
def train(
    traj,
    pol,
    targ_pol,
    qf,
    targ_qf,
    optim_pol,
    optim_qf,
    epoch,
    batch_size,  # optimization hypers
    tau,
    gamma,  # advantage estimation
    sampling,
    log_enable=True,
):
    """
    Train function for deep deterministic policy gradient

    Parameters
    ----------
    traj : Traj
        Off policy trajectory.
    pol : Pol
        Policy.
    targ_pol : Pol
        Target Policy.
    qf : SAVfunction
        Q function.
    targ_qf : SAVfunction
        Target Q function.
    optim_pol : torch.optim.Optimizer
        Optimizer for Policy.
    optim_qf : torch.optim.Optimizer
        Optimizer for Q function.
    epoch : int
        Number of iterations.
    batch_size : int
        Size of each batch.
    tau : float
        Target updating rate.
    gamma : float
        Discounting rate.
    sampling : int
        Number of samples used when estimating the expectation.
    log_enable : bool
        If True, enable logging.

    Returns
    -------
    result_dict : dict
        Dictionary which contains losses information.
    """

    pol_losses = []
    qf_losses = []
    if log_enable:
        logger.log("Optimizing...")
    for batch in traj.iterate(batch_size, epoch):
        qf_bellman_loss = lf.bellman(qf,
                                     targ_qf,
                                     targ_pol,
                                     batch,
                                     gamma,
                                     sampling=sampling)
        optim_qf.zero_grad()
        qf_bellman_loss.backward()
        optim_qf.step()

        pol_loss = lf.ag(pol, qf, batch, sampling)
        optim_pol.zero_grad()
        pol_loss.backward()
        optim_pol.step()

        for q, targ_q, p, targ_p in zip(qf.parameters(), targ_qf.parameters(),
                                        pol.parameters(),
                                        targ_pol.parameters()):
            targ_p.detach().copy_((1 - tau) * targ_p.detach() +
                                  tau * p.detach())
            targ_q.detach().copy_((1 - tau) * targ_q.detach() +
                                  tau * q.detach())
        qf_losses.append(qf_bellman_loss.detach().cpu().numpy())
        pol_losses.append(pol_loss.detach().cpu().numpy())

    if log_enable:
        logger.log("Optimization finished!")

    return dict(
        PolLoss=pol_losses,
        QfLoss=qf_losses,
    )
Example #17
def train(
        traj,
        pol,
        qfs,
        targ_qfs,
        log_alpha,
        optim_pol,
        optim_qfs,
        optim_alpha,
        epoch,
        batch_size,  # optimization hypers
        tau,
        gamma,
        sampling,
        discrim,
        num_skill,
        reparam=True):
    """
    Train function for soft actor critic.

    Parameters
    ----------
    traj : Traj
        Off policy trajectory.
    pol : Pol
        Policy.
    qfs : list of SAVfunction
        Q function.
    targ_qfs : list of SAVfunction
        Target Q function.
    log_alpha : torch.Tensor
        Temperature parameter of entropy.
    optim_pol : torch.optim.Optimizer
        Optimizer for Policy.
    optim_qfs : list of torch.optim.Optimizer
        Optimizer for Q function.
    optim_alpha : torch.optim.Optimizer
        Optimizer for alpha.
    epoch : int
        Number of iterations.
    batch_size : int
        Size of each batch.
    tau : float
        Target updating rate.
    gamma : float
        Discounting rate.
    sampling : int
        Number of samples used when estimating the expectation.
    reparam : bool
        If True, use the reparameterization trick for the policy gradient.
    discrim : SVfunction
        Discriminator used to compute the skill rewards.
    num_skill : int
        The number of skills.

    Returns
    -------
    result_dict : dict
        Dictionary which contains losses information.
    """

    pol_losses = []
    _qf_losses = []
    alpha_losses = []
    logger.log("Optimizing...")
    for batch in traj.random_batch(batch_size, epoch):
        with torch.no_grad():
            rews, info = calc_rewards(batch['obs'], num_skill, discrim)
            batch['rews'] = rews

        pol_loss, qf_losses, alpha_loss = lf.sac(pol, qfs, targ_qfs, log_alpha,
                                                 batch, gamma, sampling,
                                                 reparam)

        optim_pol.zero_grad()
        pol_loss.backward()
        optim_pol.step()

        for optim_qf, qf_loss in zip(optim_qfs, qf_losses):
            optim_qf.zero_grad()
            qf_loss.backward()
            optim_qf.step()

        optim_alpha.zero_grad()
        alpha_loss.backward()
        optim_alpha.step()

        for qf, targ_qf in zip(qfs, targ_qfs):
            for q, targ_q in zip(qf.parameters(), targ_qf.parameters()):
                targ_q.detach().copy_((1 - tau) * targ_q.detach() +
                                      tau * q.detach())

        pol_losses.append(pol_loss.detach().cpu().numpy())
        _qf_losses.append(
            (sum(qf_losses) / len(qf_losses)).detach().cpu().numpy())
        alpha_losses.append(alpha_loss.detach().cpu().numpy())

    logger.log("Optimization finished!")

    return dict(PolLoss=pol_losses, QfLoss=_qf_losses, AlphaLoss=alpha_losses)
Example #18
from contextlib import contextmanager


@contextmanager
def measure(name):
    # Time the body of the `with` block and log the elapsed seconds.
    import time
    s = time.time()
    yield
    e = time.time()
    logger.log("{}: {:.4f}sec".format(name, e - s))
Example #19
def train(traj, pol, vf,
          kl_beta, kl_targ,
          optim_pol, optim_vf,
          epoch, batch_size, max_grad_norm,
          num_epi_per_seq=1, ent_beta=0,  # optimization hypers
          log_enable=True,
          ):
    """
    Train function for proximal policy optimization (kl).

    Parameters
    ----------
    traj : Traj
        On policy trajectory.
    pol : Pol
        Policy.
    vf : SVfunction
        V function.
    kl_beta : float
        KL divergence coefficient.
    kl_targ : float
        Target of KL divergence.
    optim_pol : torch.optim.Optimizer
        Optimizer for Policy.
    optim_vf : torch.optim.Optimizer
        Optimizer for V function.
    epoch : int
        Number of iterations.
    batch_size : int
        Size of each batch.
    max_grad_norm : float
        Maximum gradient norm.
    num_epi_per_seq : int
        Number of episodes in one sequence for rnn.
    log_enable : bool
        If True, enable logging.

    Returns
    -------
    result_dict : dict
        Dictionary which contains losses information.
    """

    pol_losses = []
    vf_losses = []
    if log_enable:
        logger.log("Optimizing...")
    iterator = traj.iterate(batch_size, epoch) if not pol.rnn else traj.iterate_rnn(
        batch_size=batch_size, num_epi_per_seq=num_epi_per_seq, epoch=epoch)
    for batch in iterator:
        pol_loss = update_pol(pol, optim_pol, batch,
                              kl_beta, max_grad_norm, ent_beta)
        vf_loss = update_vf(vf, optim_vf, batch)

        pol_losses.append(pol_loss)
        vf_losses.append(vf_loss)

    iterator = traj.full_batch(1) if not pol.rnn else traj.iterate_rnn(
        batch_size=traj.num_epi)
    batch = next(iterator)
    with torch.no_grad():
        pol.reset()
        if pol.rnn:
            _, _, pd_params = pol(batch['obs'], h_masks=batch['h_masks'])
        else:
            _, _, pd_params = pol(batch['obs'])
        kl_mean = torch.mean(
            pol.pd.kl_pq(
                batch,
                pd_params
            )
        ).item()
    if kl_mean > 1.3 * kl_targ:
        new_kl_beta = 1.5 * kl_beta
    elif kl_mean < 0.7 * kl_targ:
        new_kl_beta = kl_beta / 1.5
    else:
        new_kl_beta = kl_beta
    if log_enable:
        logger.log("Optimization finished!")

    return dict(PolLoss=pol_losses, VfLoss=vf_losses, new_kl_beta=new_kl_beta, kl_mean=kl_mean)
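The beta update at the end of this function implements the usual adaptive-KL heuristic; written out, with the measured mean KL on the left-hand side of each condition and KL_targ = kl_targ:

\[
\beta \leftarrow
\begin{cases}
1.5\,\beta, & \overline{\mathrm{KL}} > 1.3\,\mathrm{KL}_{\mathrm{targ}},\\
\beta / 1.5, & \overline{\mathrm{KL}} < 0.7\,\mathrm{KL}_{\mathrm{targ}},\\
\beta, & \text{otherwise.}
\end{cases}
\]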
Example #20
            epis = student_sampler.sample(
                s_pol, max_epis=args.max_epis_per_iter)
    with measure('train'):
        traj = Traj()
        traj.add_epis(epis)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()
        result_dict = on_pol_teacher_distill.train(
            traj=traj,
            student_pol=s_pol,
            teacher_pol=t_pol,
            student_optim=optim_pol,
            epoch=args.epoch_per_iter,
            batchsize=args.batch_size)

    logger.log('Testing Student-policy')
    with measure('sample'):
        epis_measure = student_sampler.sample(
            s_pol, max_epis=args.max_epis_per_iter)

    with measure('measure'):
        traj_measure = Traj()
        traj_measure.add_epis(epis_measure)
        traj_measure = ef.compute_h_masks(traj_measure)
        traj_measure.register_epis()

    total_epi += traj_measure.num_epi
    step = traj_measure.num_step
    total_step += step
    rewards = [np.sum(epi['rews']) for epi in epis_measure]
    mean_rew = np.mean(rewards)
Example #21
    if args.rnn:
        pol_net = PolNetLSTM(ob_space, ac_space, h_size=256, cell_size=256)
    else:
        pol_net = PolNet(ob_space, ac_space)
    if isinstance(ac_space, gym.spaces.Box):
        pol = GaussianPol(ob_space, ac_space, pol_net, args.rnn)
    elif isinstance(ac_space, gym.spaces.Discrete):
        pol = CategoricalPol(ob_space, ac_space, pol_net, args.rnn)
    elif isinstance(ac_space, gym.spaces.MultiDiscrete):
        pol = MultiCategoricalPol(ob_space, ac_space, pol_net, args.rnn)
    else:
        raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)

with open(os.path.join(args.pol_dir, args.pol_fname), 'rb') as f:
    pol.load_state_dict(
        torch.load(f, map_location=lambda storage, location: storage))

epis = sampler.sample(pol, max_epis=args.num_epis)

filename = args.epis_fname if len(
    args.epis_fname) != 0 else env.env.spec.id + '_{}epis.pkl'.format(
        len(epis))
with open(os.path.join(args.epis_dir, filename), 'wb') as f:
    pickle.dump(epis, f)
rewards = [np.sum(epi['rews']) for epi in epis]
mean_rew = np.mean(rewards)
logger.log('expert_score={}'.format(mean_rew))
del sampler
Example #22
                         data_parallel=args.data_parallel)

sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)

optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr)
optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr)

with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f:
    expert_epis = pickle.load(f)
expert_traj = Traj()
expert_traj.add_epis(expert_epis)
expert_traj = ef.add_next_obs(expert_traj)
expert_traj.register_epis()
expert_rewards = [np.sum(epi['rews']) for epi in expert_epis]
expert_mean_rew = np.mean(expert_rewards)
logger.log('expert_score={}'.format(expert_mean_rew))
logger.log('expert_num_epi={}'.format(expert_traj.num_epi))

total_epi = 0
total_step = 0
max_rew = -1e6
kl_beta = args.init_kl_beta

if args.pretrain:
    with measure('bc pretrain'):
        for _ in range(args.bc_epoch):
            _ = behavior_clone.train(expert_traj, pol, optim_pol,
                                     args.bc_batch_size)
    torch.save(pol.state_dict(), os.path.join(args.log, 'models',
                                              'pol_bc.pkl'))
Example #23
    if args.rnn:
        pol_net = PolNetLSTM(observation_space,
                             action_space,
                             h_size=256,
                             cell_size=256)
    else:
        pol_net = PolNet(observation_space, action_space)
    if isinstance(action_space, gym.spaces.Box):
        pol = GaussianPol(observation_space, action_space, pol_net, args.rnn)
    elif isinstance(action_space, gym.spaces.Discrete):
        pol = CategoricalPol(observation_space, action_space, pol_net,
                             args.rnn)
    elif isinstance(action_space, gym.spaces.MultiDiscrete):
        pol = MultiCategoricalPol(observation_space, action_space, pol_net,
                                  args.rnn)
    else:
        raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

sampler = EpiSampler(env, pol, num_parallel=1, seed=args.seed)

with open(os.path.join(args.pol_dir, 'models', args.pol_fname), 'rb') as f:
    pol.load_state_dict(
        torch.load(f, map_location=lambda storage, location: storage))

epis = sampler.sample(pol, max_epis=args.num_epis)

rewards = [np.sum(epi['rews']) for epi in epis]
mean_rew = np.mean(rewards)
logger.log('score={}'.format(mean_rew))
del sampler
Example #24
                                  epoch,
                                  args.batch_size,
                                  args.tau,
                                  args.gamma,
                                  loss_type=args.loss_type)

        # multi-agent parallel processing; dp_run = data_parallel run
        if args.data_parallel:
            qf.dp_run = False
            lagged_qf.dp_run = False
            targ_qf1.dp_run = False
            targ_qf2.dp_run = False

    total_grad_step += epoch
    if total_grad_step >= args.lag * num_update_lagged:  # update the lagged net every 6000 steps
        logger.log('Updated lagged qf!!')
        lagged_qf_net.load_state_dict(qf_net.state_dict())
        num_update_lagged += 1

    rewards = [np.sum(epi['rews']) for epi in epis]
    mean_rew = np.mean(rewards)
    # save the logs
    logger.record_results(args.log,
                          result_dict,
                          score_file,
                          total_epi,
                          step,
                          total_step,
                          rewards,
                          plot_title=args.env_name)
Example #25
def train(
    traj,
    pol,
    vf,
    optim_pol,
    optim_vf,
    epoch,
    batch_size,
    num_epi_per_seq=1,  # optimization hypers
    clip_param=0.2,
    ent_beta=1e-3,
    max_grad_norm=0.5,
    clip_vfunc=False,
    log_enable=True,
):
    """
    Train function for proximal policy optimization (clip).

    Parameters
    ----------
    traj : Traj
        On policy trajectory.
    pol : Pol
        Policy.
    vf : SVfunction
        V function.
    optim_pol : torch.optim.Optimizer
        Optimizer for Policy.
    optim_vf : torch.optim.Optimizer
        Optimizer for V function.
    epoch : int
        Number of iterations.
    batch_size : int
        Size of each batch.
    num_epi_per_seq : int
        Number of episodes in one sequence for rnn.
    clip_param : float
        Clipping ratio of objective function.
    ent_beta : float
        Entropy coefficient.
    max_grad_norm : float
        Maximum gradient norm.
    clip_vfunc : bool
        If True, vfunc is also updated with the clipped objective function.
    log_enable : bool
        If True, enable logging.

    Returns
    -------
    result_dict : dict
        Dictionary which contains losses information.
    """

    pol_losses = []
    vf_losses = []
    if log_enable:
        logger.log("Optimizing...")
    iterator = traj.iterate(batch_size,
                            epoch) if not pol.rnn else traj.iterate_rnn(
                                batch_size=batch_size,
                                num_epi_per_seq=num_epi_per_seq,
                                epoch=epoch)
    for batch in iterator:
        pol_loss = update_pol(pol, optim_pol, batch, clip_param, ent_beta,
                              max_grad_norm)
        vf_loss = update_vf(vf, optim_vf, batch, clip_param, clip_vfunc,
                            max_grad_norm)

        pol_losses.append(pol_loss)
        vf_losses.append(vf_loss)
    if log_enable:
        logger.log("Optimization finished!")

    return dict(PolLoss=pol_losses, VfLoss=vf_losses)
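For reference, clip_param is the epsilon of the standard clipped surrogate objective, which is presumably the quantity update_pol optimizes here:

\[
L^{\mathrm{CLIP}}(\theta) = \mathbb{E}_{t}\Big[\min\big(r_{t}(\theta)\,\hat{A}_{t},\;
\operatorname{clip}(r_{t}(\theta),\, 1-\epsilon,\, 1+\epsilon)\,\hat{A}_{t}\big)\Big],
\qquad
r_{t}(\theta) = \frac{\pi_{\theta}(a_{t}\mid s_{t})}{\pi_{\theta_{\mathrm{old}}}(a_{t}\mid s_{t})}.
\]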