Example no. 1
    def test_learning(self):
        # Two-hidden-layer MLP (32 units each) wrapped in a Gaussian policy.
        pol_net = PolNet(self.env.observation_space,
                         self.env.action_space, h1=32, h2=32)
        pol = GaussianPol(self.env.observation_space,
                          self.env.action_space, pol_net)

        sampler = EpiSampler(self.env, pol, num_parallel=1)  # single rollout worker

        optim_pol = torch.optim.Adam(pol_net.parameters(), lr=3e-4)

        # Load pre-recorded expert episodes and split them 70/30 into
        # training and held-out test sets.
        with open(os.path.join('data/expert_epis', 'Pendulum-v0_2epis.pkl'), 'rb') as f:
            expert_epis = pickle.load(f)
        train_epis, test_epis = ef.train_test_split(
            expert_epis, train_size=0.7)
        train_traj = Traj()
        train_traj.add_epis(train_epis)
        train_traj.register_epis()
        test_traj = Traj()
        test_traj.add_epis(test_epis)
        test_traj.register_epis()

        # One pass of behavior cloning over the training trajectory;
        # the positional 256 is the batch size.
        result_dict = behavior_clone.train(
            train_traj, pol, optim_pol,
            256
        )

        del sampler  # release the sampler's rollout worker
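The test method above is shown without its surrounding class. A minimal sketch of the scaffolding it assumes follows; the machina import paths, the `GymEnv` usage, and the class name are assumptions here, not the repository's exact code:

# Hedged sketch of the scaffolding the test above assumes; the exact
# machina import paths and GymEnv signature are assumptions.
import os
import pickle
import unittest

import torch

from machina.pols import GaussianPol
from machina.algos import behavior_clone
from machina.envs import GymEnv
from machina.traj import Traj
from machina.traj import epi_functional as ef
from machina.samplers import EpiSampler
from simple_net import PolNet  # MLP definition shipped with machina's examples


class TestBehaviorClone(unittest.TestCase):
    def setUp(self):
        # GymEnv wraps the raw gym environment for machina's samplers.
        self.env = GymEnv('Pendulum-v0')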
Example no. 2

# Excerpt from a GAIL training script. `expert_traj` is a Traj already
# filled with the loaded expert episodes `expert_epis` (setup omitted).
expert_traj = ef.add_next_obs(expert_traj)
expert_traj.register_epis()
expert_rewards = [np.sum(epi['rews']) for epi in expert_epis]
expert_mean_rew = np.mean(expert_rewards)
logger.log('expert_score={}'.format(expert_mean_rew))
logger.log('expert_num_epi={}'.format(expert_traj.num_epi))

total_epi = 0
total_step = 0
max_rew = -1e6  # best mean reward seen so far
kl_beta = args.init_kl_beta  # coefficient for the KL-penalized policy update

# Optional warm start: behavior-clone the policy on the expert data
# before adversarial training begins.
if args.pretrain:
    with measure('bc pretrain'):
        for _ in range(args.bc_epoch):
            _ = behavior_clone.train(expert_traj, pol, optim_pol,
                                     args.bc_batch_size)
    torch.save(pol.state_dict(), os.path.join(args.log, 'models',
                                              'pol_bc.pkl'))

# Main loop: sample episodes with the current policy, then prepare the
# trajectory (values, returns, advantages) for the policy update.
while args.max_epis > total_epi:
    with measure('sample'):
        epis = sampler.sample(pol, max_steps=args.max_steps_per_iter)
    with measure('train'):
        traj = Traj()
        traj.add_epis(epis)

        traj = ef.compute_vs(traj, vf)  # value estimates V(s)
        traj = ef.compute_rets(traj, args.gamma)  # discounted returns
        traj = ef.compute_advs(traj, args.gamma, args.lam)  # GAE advantages
        traj = ef.centerize_advs(traj)  # normalize advantages
        traj = ef.compute_h_masks(traj)  # masks for recurrent policies
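For reference, `compute_rets` and `compute_advs` implement standard discounted returns and generalized advantage estimation (GAE). A self-contained sketch of those two computations for a single episode follows (NumPy only; the function and variable names are illustrative, not machina's internals):

import numpy as np

def discounted_returns(rews, gamma):
    # R_t = r_t + gamma * R_{t+1}, computed backwards over one episode.
    rews = np.asarray(rews, dtype=np.float64)
    rets = np.zeros_like(rews)
    running = 0.0
    for t in reversed(range(len(rews))):
        running = rews[t] + gamma * running
        rets[t] = running
    return rets

def gae_advantages(rews, vs, gamma, lam, last_v=0.0):
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    # A_t = delta_t + gamma * lam * A_{t+1}, computed backwards.
    rews = np.asarray(rews, dtype=np.float64)
    vs = np.asarray(vs, dtype=np.float64)
    vs_next = np.append(vs[1:], last_v)  # bootstrap with last_v at episode end
    deltas = rews + gamma * vs_next - vs
    advs = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        advs[t] = running
    return advs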
Example no. 3

# Excerpt from a behavior-cloning script. The expert episodes have already
# been loaded and split, and `train_traj` filled via add_epis (setup omitted).
train_traj.register_epis()
test_traj = Traj()
test_traj.add_epis(test_epis)
test_traj.register_epis()
# Log the expert's average episode reward as a reference score.
expert_rewards = [np.sum(epi['rews']) for epi in expert_epis]
expert_mean_rew = np.mean(expert_rewards)
logger.log('expert_score={}'.format(expert_mean_rew))
logger.log('num_train_epi={}'.format(train_traj.num_epi))

max_rew = -1e6

# Alternate one training epoch on the expert data with an evaluation
# pass on the held-out episodes.
for curr_epoch in range(args.epoch):
    if args.data_parallel:
        pol.dp_run = True  # run the policy net with DataParallel

    result_dict = behavior_clone.train(train_traj, pol, optim_pol,
                                       args.batch_size)
    test_result_dict = behavior_clone.test(test_traj, pol)

    if args.data_parallel:
        pol.dp_run = False

    # Merge test metrics into the training results for logging.
    for key in test_result_dict.keys():
        result_dict[key] = test_result_dict[key]

    # Periodically roll out the cloned policy to track its actual reward.
    # (This check belongs at the epoch level, not inside the key loop.)
    if curr_epoch % int(
            args.check_rate * args.epoch) == 0 or curr_epoch == 0:
        with measure('sample'):
            paths = sampler.sample(pol, max_epis=args.max_epis_per_iter)
        rewards = [np.sum(path['rews']) for path in paths]
        mean_rew = np.mean(rewards)
        logger.record_results_bc(args.log,
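`behavior_clone.train` fits the policy to the expert actions by maximum likelihood. A standalone sketch of that objective for a diagonal Gaussian policy follows (plain PyTorch; the tensor shapes and names are illustrative, not machina's internals):

import torch

def bc_loss(mean, log_std, expert_acs):
    # Negative log-likelihood of the expert actions under a diagonal
    # Gaussian policy with the given mean and log standard deviation.
    dist = torch.distributions.Normal(mean, log_std.exp())
    return -dist.log_prob(expert_acs).sum(dim=-1).mean()

# Example: one gradient step on a hypothetical batch of 256 expert actions.
# In practice, mean and log_std would come from the policy network.
mean = torch.zeros(256, 1, requires_grad=True)
log_std = torch.zeros(256, 1, requires_grad=True)
expert_acs = torch.randn(256, 1)
loss = bc_loss(mean, log_std, expert_acs)
loss.backward()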