def test_learning(self):
    # Teacher and student policy networks with slightly different hidden sizes.
    t_pol_net = PolNet(self.env.observation_space,
                       self.env.action_space, h1=200, h2=100)
    s_pol_net = PolNet(self.env.observation_space,
                       self.env.action_space, h1=190, h2=90)
    t_pol = GaussianPol(
        self.env.observation_space, self.env.action_space, t_pol_net)
    s_pol = GaussianPol(
        self.env.observation_space, self.env.action_space, s_pol_net)

    student_sampler = EpiSampler(self.env, s_pol, num_parallel=1)

    optim_pol = torch.optim.Adam(s_pol.parameters(), 3e-4)

    # Collect a small batch of episodes with the student policy.
    epis = student_sampler.sample(s_pol, max_steps=32)

    traj = Traj()
    traj.add_epis(epis)
    traj = ef.compute_h_masks(traj)
    traj.register_epis()

    # One epoch of on-policy distillation from teacher to student.
    result_dict = on_pol_teacher_distill.train(
        traj=traj,
        student_pol=s_pol,
        teacher_pol=t_pol,
        student_optim=optim_pol,
        epoch=1,
        batchsize=32)

    del student_sampler
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

# Value function network (LSTM variant when an RNN policy is used).
if args.rnn:
    vf_net = VNetLSTM(observation_space, h_size=256, cell_size=256)
else:
    vf_net = VNet(observation_space)
vf = DeterministicSVfunc(observation_space, vf_net, args.rnn)

# Only the rank-0 process owns the sampler.
if rank == 0:
    sampler = EpiSampler(env, pol, num_parallel=args.num_parallel,
                         seed=args.seed)

optim_pol = torch.optim.Adam(pol.parameters(), args.pol_lr)
optim_vf = torch.optim.Adam(vf.parameters(), args.vf_lr)

# Wrap policy and value function for distributed (optionally Apex mixed-precision) training.
ddp_pol, optim_pol = make_model_distributed(
    pol, optim_pol,
    args.use_apex, args.apex_opt_level,
    args.apex_keep_batchnorm_fp32, args.apex_sync_bn,
    args.apex_loss_scale,
    device_ids=[args.local_rank],
    output_device=args.local_rank)
ddp_vf, optim_vf = make_model_distributed(
    vf, optim_vf,
    args.use_apex, args.apex_opt_level,