def test_learning_rnn(self):
    """Smoke-test both PPO variants with LSTM-based policy and value nets.

    Builds a recurrent Gaussian policy and a recurrent state-value function
    for ``self.env``, samples episodes, runs the trajectory preprocessing
    steps, and then trains for one epoch with ``ppo_clip`` followed by one
    epoch with ``ppo_kl``. Nothing is asserted about the training results;
    the test succeeds as long as no step raises.
    """
    ob_space = self.env.observation_space
    ac_space = self.env.action_space

    # Recurrent policy: LSTM network wrapped by the Gaussian policy class.
    policy_net = PolNetLSTM(ob_space, ac_space, h_size=32, cell_size=32)
    policy = GaussianPol(ob_space, ac_space, policy_net, rnn=True)

    # Recurrent state-value function.
    value_net = VNetLSTM(ob_space, h_size=32, cell_size=32)
    value_fn = DeterministicSVfunc(ob_space, value_net, rnn=True)

    sampler = EpiSampler(self.env, policy, num_parallel=1)

    policy_optim = torch.optim.Adam(policy_net.parameters(), lr=3e-4)
    value_optim = torch.optim.Adam(value_net.parameters(), lr=3e-4)

    episodes = sampler.sample(policy, max_steps=400)

    trajectory = Traj()
    trajectory.add_epis(episodes)

    # Each preprocessing step receives the trajectory (plus any extra
    # positional arguments) and returns the trajectory to use next.
    preprocessing = (
        (ef.compute_vs, (value_fn,)),
        (ef.compute_rets, (0.99,)),
        (ef.compute_advs, (0.99, 0.95)),
        (ef.centerize_advs, ()),
        (ef.compute_h_masks, ()),
    )
    for step, extra_args in preprocessing:
        trajectory = step(trajectory, *extra_args)
    trajectory.register_epis()

    # Keyword arguments common to both PPO flavours.
    shared_kwargs = dict(
        traj=trajectory,
        pol=policy,
        vf=value_fn,
        optim_pol=policy_optim,
        optim_vf=value_optim,
        epoch=1,
        batch_size=2,
    )

    result = ppo_clip.train(clip_param=0.2, **shared_kwargs)
    result = ppo_kl.train(
        kl_beta=0.1, kl_targ=0.2, max_grad_norm=20, **shared_kwargs)

    # Drop the sampler explicitly at the end of the test
    # (presumably to release its workers -- confirm against EpiSampler).
    del sampler
def test_learning_rnn(self):
    """Smoke-test TRPO training with LSTM-based policy and value nets.

    Builds a recurrent categorical policy and a recurrent state-value
    function for ``self.env``, samples episodes, runs the trajectory
    preprocessing steps, and calls ``trpo.train`` once. Nothing is asserted
    about the result; the test succeeds as long as no step raises.
    """
    env = self.env

    # --- models -----------------------------------------------------------
    policy_net = PolNetLSTM(
        env.observation_space, env.action_space, h_size=32, cell_size=32)
    policy = CategoricalPol(
        env.observation_space, env.action_space, policy_net, rnn=True)

    value_net = VNetLSTM(env.observation_space, h_size=32, cell_size=32)
    value_fn = DeterministicSVfunc(env.observation_space, value_net, rnn=True)

    # --- sampling ---------------------------------------------------------
    sampler = EpiSampler(env, policy, num_parallel=1)

    # Only the value function gets an optimizer here; no policy optimizer
    # is passed to trpo.train.
    value_optim = torch.optim.Adam(value_net.parameters(), lr=3e-4)

    episodes = sampler.sample(policy, max_steps=400)

    # --- trajectory preprocessing ----------------------------------------
    trajectory = Traj()
    trajectory.add_epis(episodes)
    trajectory = ef.compute_vs(trajectory, value_fn)
    trajectory = ef.compute_rets(trajectory, 0.99)
    trajectory = ef.compute_advs(trajectory, 0.99, 0.95)
    trajectory = ef.centerize_advs(trajectory)
    trajectory = ef.compute_h_masks(trajectory)
    trajectory.register_epis()

    # --- training ---------------------------------------------------------
    # The trailing positional arguments (1, 2) are passed through unchanged;
    # presumably epoch and batch size -- confirm against trpo.train.
    result = trpo.train(trajectory, policy, value_fn, value_optim, 1, 2)

    # Drop the sampler explicitly at the end of the test
    # (presumably to release its workers -- confirm against EpiSampler).
    del sampler
s_pol = CategoricalPol( observation_space, action_space, s_pol_net, args.rnn) elif isinstance(action_space, gym.spaces.MultiDiscrete): t_pol = MultiCategoricalPol( observation_space, action_space, t_pol_net, args.rnn) s_pol = MultiCategoricalPol( observation_space, action_space, s_pol_net, args.rnn) else: raise ValueError('Only Box, Discrete and Multidiscrete are supported') if args.teacher_pol: t_pol.load_state_dict(torch.load( os.path.join(args.teacher_dir, args.teacher_fname))) if args.rnn: s_vf_net = VNetLSTM(observation_space, h_size=256, cell_size=256) else: s_vf_net = VNet(observation_space) if args.sampling_policy == 'teacher': teacher_sampler = EpiSampler( env, t_pol, num_parallel=args.num_parallel, seed=args.seed) student_sampler = EpiSampler( env, s_pol, num_parallel=args.num_parallel, seed=args.seed)
s_pol = GaussianPol(ob_space, ac_space, s_pol_net, args.rnn) elif isinstance(ac_space, gym.spaces.Discrete): t_pol = CategoricalPol(ob_space, ac_space, t_pol_net, args.rnn) s_pol = CategoricalPol(ob_space, ac_space, s_pol_net, args.rnn) elif isinstance(ac_space, gym.spaces.MultiDiscrete): t_pol = MultiCategoricalPol(ob_space, ac_space, t_pol_net, args.rnn) s_pol = MultiCategoricalPol(ob_space, ac_space, s_pol_net, args.rnn) else: raise ValueError('Only Box, Discrete and Multidiscrete are supported') if args.teacher_pol: t_pol.load_state_dict( torch.load(os.path.join(args.teacher_dir, args.teacher_fname))) if args.rnn: s_vf_net = VNetLSTM(ob_space, h_size=256, cell_size=256) else: s_vf_net = VNet(ob_space) if args.sampling_policy == 'teacher': teacher_sampler = EpiSampler(env, t_pol, num_parallel=args.num_parallel, seed=args.seed) student_sampler = EpiSampler(env, s_pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(s_pol_net.parameters(), args.pol_lr)
pol_net, args.rnn, data_parallel=args.data_parallel, parallel_dim=1 if args.rnn else 0) elif isinstance(ac_space, gym.spaces.MultiDiscrete): pol = MultiCategoricalPol(ob_space, ac_space, pol_net, args.rnn, data_parallel=args.data_parallel, parallel_dim=1 if args.rnn else 0) else: raise ValueError('Only Box, Discrete, and MultiDiscrete are supported') if args.rnn: vf_net = VNetLSTM(ob_space, h_size=256, cell_size=256) else: vf_net = VNet(ob_space) vf = DeterministicSVfunc(ob_space, vf_net, args.rnn, data_parallel=args.data_parallel, parallel_dim=1 if args.rnn else 0) sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr) total_epi = 0 total_step = 0
def setup_nets(self):
    """Build the policy and state-value function and store them on ``self``.

    The policy network is chosen from ``self.args``:

    * ``mirror is True``  -> ``SymmetricNet`` (hidden size divided by 4)
    * ``rnn``             -> ``PolNetLSTM`` with 256 hidden/cell units
    * ``net_version == 1``-> ``PolNet``
    * otherwise           -> ``PolNetB``

    When ``mirror == "new"`` the chosen network is additionally wrapped in
    ``SymNet``. The value network follows the same scheme with
    ``SymmetricValue`` / ``VNetLSTM`` / ``VNet`` / ``VNetB`` and an optional
    ``SymVNet`` wrapper.

    The policy class depends on the type of the environment's action space
    (Box, Discrete or MultiDiscrete). The results are stored as ``self.pol``
    and ``self.vf``.

    Raises:
        ValueError: if the action space is none of the supported types.
    """
    ob_space = self.env.observation_space
    ac_space = self.env.action_space

    args = self.args
    mirror_mode = args.mirror
    legacy_symmetric = mirror_mode is True
    wrap_symmetric = mirror_mode == "new"

    # ---- policy network --------------------------------------------------
    if legacy_symmetric:
        print("Initiating a symmetric network")
        policy_net = SymmetricNet(
            *self.env.unwrapped.mirror_sizes,
            hidden_size=int(args.hidden_size / 4),
            num_layers=args.num_layers,
            varying_std=args.varying_std,
            tanh_finish=args.tanh_finish,
            log_std=args.log_stdev,
        )
    elif args.rnn:
        policy_net = PolNetLSTM(ob_space, ac_space, h_size=256, cell_size=256)
    elif args.net_version == 1:
        policy_net = PolNet(ob_space, ac_space, log_std=args.log_stdev)
    else:
        policy_net = PolNetB(
            ob_space,
            ac_space,
            hidden_size=args.hidden_size,
            num_layers=args.num_layers,
            varying_std=args.varying_std,
            tanh_finish=args.tanh_finish,
            log_std=args.log_stdev,
        )

    if wrap_symmetric:
        print("Initiating a new symmetric network")
        # TODO: in this case the action_space for the previous pol_net is incorrect, but it isn't easy to fix ...
        # we can use this for now which just ignores some of the final indices
        policy_net = SymNet(
            policy_net,
            ob_space.shape[0],
            *self.env.unwrapped.sym_act_inds,
            varying_std=args.varying_std,
            log_std=args.log_stdev,
            deterministic=False,
        )

    # ---- policy wrapper, selected by action-space type -------------------
    if isinstance(ac_space, gym.spaces.Box):
        policy_cls = GaussianPol
    elif isinstance(ac_space, gym.spaces.Discrete):
        policy_cls = CategoricalPol
    elif isinstance(ac_space, gym.spaces.MultiDiscrete):
        policy_cls = MultiCategoricalPol
    else:
        raise ValueError(
            "Only Box, Discrete, and MultiDiscrete are supported")

    # The same rnn flag and parallelism settings are handed to both the
    # policy and the value function.
    recurrent = args.rnn
    if recurrent:
        parallel_dim = 1
    else:
        parallel_dim = 0
    wrapper_kwargs = {
        "data_parallel": args.data_parallel,
        "parallel_dim": parallel_dim,
    }

    policy = policy_cls(
        ob_space, ac_space, policy_net, recurrent, **wrapper_kwargs)

    # ---- value network ---------------------------------------------------
    if legacy_symmetric:
        value_net = SymmetricValue(
            *self.env.unwrapped.mirror_sizes[:3],
            hidden_size=args.hidden_size,
            num_layers=args.num_layers,
        )
    elif recurrent:
        value_net = VNetLSTM(ob_space, h_size=256, cell_size=256)
    elif args.net_version == 1:
        value_net = VNet(ob_space)
    else:
        value_net = VNetB(
            ob_space,
            hidden_size=args.hidden_size,
            num_layers=args.num_layers,
        )

    if wrap_symmetric:
        print("Initiating a new symmetric value network")
        value_net = SymVNet(value_net, ob_space.shape[0])

    value_fn = DeterministicSVfunc(
        ob_space, value_net, recurrent, **wrapper_kwargs)

    # Publish both objects only after everything was built successfully.
    self.pol = policy
    self.vf = value_fn
# Both environments must expose identically shaped action spaces, since a
# single policy is used to sample from each of them below.
assert env1.action_space.shape == env2.action_space.shape

# The first environment's spaces are used to build the networks.
observation_space = env1.observation_space
action_space = env1.action_space

# Policy network: LSTM variant when --rnn is set, plain network otherwise.
pol_net = (
    PolNetLSTM(observation_space, action_space,
               h_size=args.h_size, cell_size=args.cell_size)
    if args.rnn
    else PolNet(observation_space, action_space)
)

# Parallelism settings handed to both the policy and the value function.
parallel_opts = dict(
    data_parallel=args.data_parallel,
    parallel_dim=1 if args.rnn else 0,
)
pol = MultiCategoricalPol(
    observation_space, action_space, pol_net, args.rnn, **parallel_opts)

# Value network, following the same rnn switch as the policy network.
vf_net = (
    VNetLSTM(observation_space,
             h_size=args.h_size, cell_size=args.cell_size)
    if args.rnn
    else VNet(observation_space)
)
vf = DeterministicSVfunc(
    observation_space, vf_net, args.rnn, **parallel_opts)

# One sampler per environment; both use the same policy and settings.
sampler_opts = dict(num_parallel=args.num_parallel, seed=args.seed)
sampler1 = EpiSampler(env1, pol, **sampler_opts)
sampler2 = EpiSampler(env2, pol, **sampler_opts)

optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr)
optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr)

# Running counters, both starting at zero.
total_epi = total_step = 0