def test_learning_rnn(self): pol_net = PolNetLSTM( self.env.observation_space, self.env.action_space, h_size=32, cell_size=32) pol = GaussianPol(self.env.observation_space, self.env.action_space, pol_net, rnn=True) vf_net = VNetLSTM(self.env.observation_space, h_size=32, cell_size=32) vf = DeterministicSVfunc(self.env.observation_space, vf_net, rnn=True) sampler = EpiSampler(self.env, pol, num_parallel=1) optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4) optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4) epis = sampler.sample(pol, max_steps=400) traj = Traj() traj.add_epis(epis) traj = ef.compute_vs(traj, vf) traj = ef.compute_rets(traj, 0.99) traj = ef.compute_advs(traj, 0.99, 0.95) traj = ef.centerize_advs(traj) traj = ef.compute_h_masks(traj) traj.register_epis() result_dict = ppo_clip.train(traj=traj, pol=pol, vf=vf, clip_param=0.2, optim_pol=optim_pol, optim_vf=optim_vf, epoch=1, batch_size=2) result_dict = ppo_kl.train(traj=traj, pol=pol, vf=vf, kl_beta=0.1, kl_targ=0.2, optim_pol=optim_pol, optim_vf=optim_vf, epoch=1, batch_size=2, max_grad_norm=20) del sampler
def test_learning(self): pol_net = PolNet(self.env.ob_space, self.env.ac_space, h1=32, h2=32) pol = CategoricalPol(self.env.ob_space, self.env.ac_space, pol_net) vf_net = VNet(self.env.ob_space, h1=32, h2=32) vf = DeterministicSVfunc(self.env.ob_space, vf_net) sampler = EpiSampler(self.env, pol, num_parallel=1) optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4) optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4) epis = sampler.sample(pol, max_steps=32) traj = Traj() traj.add_epis(epis) traj = ef.compute_vs(traj, vf) traj = ef.compute_rets(traj, 0.99) traj = ef.compute_advs(traj, 0.99, 0.95) traj = ef.centerize_advs(traj) traj = ef.compute_h_masks(traj) traj.register_epis() result_dict = ppo_clip.train(traj=traj, pol=pol, vf=vf, clip_param=0.2, optim_pol=optim_pol, optim_vf=optim_vf, epoch=1, batch_size=32) result_dict = ppo_kl.train(traj=traj, pol=pol, vf=vf, kl_beta=0.1, kl_targ=0.2, optim_pol=optim_pol, optim_vf=optim_vf, epoch=1, batch_size=32, max_grad_norm=10) del sampler
pol=pol, vf=vf, clip_param=args.clip_param, optim_pol=optim_pol, optim_vf=optim_vf, epoch=args.epoch_per_iter, batch_size=args.batch_size if not args.rnn else args.rnn_batch_size, max_grad_norm=args.max_grad_norm) else: result_dict = ppo_kl.train(traj=traj, pol=pol, vf=vf, kl_beta=kl_beta, kl_targ=args.kl_targ, optim_pol=optim_pol, optim_vf=optim_vf, epoch=args.epoch_per_iter, batch_size=args.batch_size if not args.rnn else args.rnn_batch_size, max_grad_norm=args.max_grad_norm) kl_beta = result_dict['new_kl_beta'] if args.data_parallel: pol.dp_run = False vf.dp_run = False total_epi += traj.num_epi step = traj.num_step total_step += step rewards = [np.sum(epi['rews']) for epi in epis]