Example #1
expert_rewards = [np.sum(epi['rews']) for epi in expert_epis]
expert_mean_rew = np.mean(expert_rewards)
logger.log('expert_score={}'.format(expert_mean_rew))
logger.log('expert_num_epi={}'.format(expert_traj.num_epi))

total_epi = 0
total_step = 0
max_rew = -1e6
kl_beta = args.init_kl_beta

# Optionally warm-start the policy with behavior cloning on the expert trajectories.
if args.pretrain:
    with measure('bc pretrain'):
        for _ in range(args.bc_epoch):
            _ = behavior_clone.train(expert_traj, pol, optim_pol,
                                     args.bc_batch_size)
    torch.save(pol.state_dict(), os.path.join(args.log, 'models',
                                              'pol_bc.pkl'))

while args.max_epis > total_epi:
    with measure('sample'):
        epis = sampler.sample(pol, max_steps=args.max_steps_per_iter)
    with measure('train'):
        traj = Traj()
        traj.add_epis(epis)

        # Attach value estimates, discounted returns, and (centered) advantages to the batch.
        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, args.gamma)
        traj = ef.compute_advs(traj, args.gamma, args.lam)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()
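
The advantage pipeline above is delegated to the library's `ef` helpers. As a point of reference only, here is a minimal, self-contained sketch of generalized advantage estimation, which is the quantity `compute_advs(traj, args.gamma, args.lam)` is expected to attach to each episode; the array names `rews` and `vs` mirror the episode keys used above, and the helper function itself is hypothetical, not the library's implementation.

import numpy as np

def gae_advantages(rews, vs, last_v, gamma, lam):
    """Generalized advantage estimation for a single episode.

    rews   : per-step rewards, shape (T,)
    vs     : value estimates V(s_t), shape (T,)
    last_v : value estimate for the state after the final step (0.0 if terminal)
    """
    T = len(rews)
    vs_ext = np.append(vs, last_v)
    advs = np.zeros(T)
    gae = 0.0
    # Walk backwards through the episode, accumulating discounted TD residuals.
    for t in reversed(range(T)):
        delta = rews[t] + gamma * vs_ext[t + 1] - vs_ext[t]
        gae = delta + gamma * lam * gae
        advs[t] = gae
    return advs

Under that reading, `centerize_advs` would simply normalize these advantages before the policy update.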
Example #2
    total_epi += traj.num_epi
    step = traj.num_step
    total_step += step
    rewards = [np.sum(epi['rews']) for epi in epis]
    mean_rew = np.mean(rewards)
    logger.record_results(args.log,
                          result_dict,
                          score_file,
                          total_epi,
                          step,
                          total_step,
                          rewards,
                          plot_title=args.env_name)

    if mean_rew > max_rew:
        torch.save(pol.state_dict(),
                   os.path.join(args.log, 'models', 'pol_max.pkl'))
        torch.save(vf.state_dict(),
                   os.path.join(args.log, 'models', 'vf_max.pkl'))
        torch.save(optim_pol.state_dict(),
                   os.path.join(args.log, 'models', 'optim_pol_max.pkl'))
        torch.save(optim_vf.state_dict(),
                   os.path.join(args.log, 'models', 'optim_vf_max.pkl'))
        max_rew = mean_rew

    torch.save(pol.state_dict(),
               os.path.join(args.log, 'models', 'pol_last.pkl'))
    torch.save(vf.state_dict(), os.path.join(args.log, 'models',
                                             'vf_last.pkl'))
    torch.save(optim_pol.state_dict(),
               os.path.join(args.log, 'models', 'optim_pol_last.pkl'))
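
The `*_max.pkl` and `*_last.pkl` files written above are plain `state_dict` checkpoints, so restoring them later (for evaluation or to resume training) only requires the matching `load_state_dict` calls once the networks and optimizers have been rebuilt. The helper below is a hypothetical usage sketch, not part of the example itself:

import os
import torch

def load_max_checkpoint(log_dir, pol, vf, optim_pol, optim_vf):
    """Restore the best-scoring checkpoint saved by the loop above."""
    models_dir = os.path.join(log_dir, 'models')
    pol.load_state_dict(torch.load(os.path.join(models_dir, 'pol_max.pkl')))
    vf.load_state_dict(torch.load(os.path.join(models_dir, 'vf_max.pkl')))
    optim_pol.load_state_dict(
        torch.load(os.path.join(models_dir, 'optim_pol_max.pkl')))
    optim_vf.load_state_dict(
        torch.load(os.path.join(models_dir, 'optim_vf_max.pkl')))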
Example #3
            off_traj,
            pol, qfs, targ_qfs, log_alpha,
            optim_pol, optim_qfs, optim_alpha,
            step//50, args.rnn_batch_size, args.seq_length, args.burn_in_length,
            args.tau, args.gamma, args.sampling, not args.no_reparam
        )

    rewards = [np.sum(epi['rews']) for epi in epis]
    mean_rew = np.mean(rewards)
    logger.record_results(args.log, result_dict, score_file,
                          total_epi, step, total_step,
                          rewards,
                          plot_title=args.env_name)

    if mean_rew > max_rew:
        torch.save(pol.state_dict(), os.path.join(
            args.log, 'models', 'pol_max.pkl'))
        torch.save(qf1.state_dict(), os.path.join(
            args.log, 'models', 'qf1_max.pkl'))
        torch.save(qf2.state_dict(), os.path.join(
            args.log, 'models', 'qf2_max.pkl'))
        torch.save(optim_pol.state_dict(), os.path.join(
            args.log, 'models', 'optim_pol_max.pkl'))
        torch.save(optim_qf1.state_dict(), os.path.join(
            args.log, 'models', 'optim_qf1_max.pkl'))
        torch.save(optim_qf2.state_dict(), os.path.join(
            args.log, 'models', 'optim_qf2_max.pkl'))
        max_rew = mean_rew

    torch.save(pol.state_dict(), os.path.join(
        args.log, 'models', 'pol_last.pkl'))
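
Example #3 passes `args.tau` together with the target networks `targ_qfs`, which points to the usual soft (Polyak) target update for the twin Q-functions inside the training call. The snippet below is only a sketch of that standard update, not the library's actual code:

import torch

def soft_update(qfs, targ_qfs, tau):
    """Polyak-average online Q-network weights into their target networks."""
    with torch.no_grad():
        for qf, targ_qf in zip(qfs, targ_qfs):
            for p, targ_p in zip(qf.parameters(), targ_qf.parameters()):
                # targ <- tau * online + (1 - tau) * targ
                targ_p.data.mul_(1.0 - tau).add_(tau * p.data)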
Example #4
                          score_file,
                          total_epi,
                          step,
                          total_step,
                          rewards,
                          plot_title=args.env_name)

    # Save models at regular intervals: steps_as is the next multiple of
    # args.steps_per_save_models above total_step, so its value changes
    # exactly once per interval.
    steps_as = str(
        int(
            int(total_step / args.steps_per_save_models + 1) *
            args.steps_per_save_models))
    if 'prev_as' in locals():
        if prev_as != steps_as:
            torch.save(
                pol.state_dict(),
                os.path.join(args.log, 'models', 'pol_' + steps_as + '.pkl'))
            torch.save(
                qf1.state_dict(),
                os.path.join(args.log, 'models', 'qf1_' + steps_as + '.pkl'))
            torch.save(
                qf2.state_dict(),
                os.path.join(args.log, 'models', 'qf2_' + steps_as + '.pkl'))
            torch.save(
                discrim.state_dict(),
                os.path.join(args.log, 'models',
                             'discrim_' + steps_as + '.pkl'))
            torch.save(
                optim_pol.state_dict(),
                os.path.join(args.log, 'models',
                             'optim_pol_' + steps_as + '.pkl'))
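
The interval check above reads `prev_as`, but the excerpt ends before showing where it is assigned; for the guard to fire once per interval, the loop presumably updates it after the save block, along the lines of the (assumed) line below:

    # Remember the boundary we last handled so the next iteration only saves
    # again after total_step crosses into a new interval.
    prev_as = steps_as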