def test_learning(self):
    """Smoke-test one DIAYN iteration: a SAC update and a discriminator update.

    Builds a Gaussian policy, two Q-functions with target copies and a
    skill discriminator, samples episodes with the policy, relabels their
    rewards through ``diayn_sac.calc_rewards`` and then calls
    ``diayn_sac.train`` and ``diayn.train`` once each.  The test succeeds
    when no step raises.
    """
    # Unused below; kept so the test still touches the attribute.
    ob_space = self.env.real_observation_space
    skill_space = self.env.skill_space
    ob_skill_space = self.env.observation_space
    ac_space = self.env.action_space
    # NOTE(review): the literal 4 used here and in the calls below is
    # presumably the number of skills appended to the observation - confirm.
    ob_dim = ob_skill_space.shape[0] - 4
    f_dim = ob_dim

    def discrim_f(x):
        # Identity feature map handed to the discriminator network.
        return x

    pol_net = PolNet(ob_skill_space, ac_space)
    pol = GaussianPol(ob_skill_space, ac_space, pol_net)

    # Two Q-functions, each with a target network initialised to the same weights.
    qf_net1 = QNet(ob_skill_space, ac_space)
    qf1 = DeterministicSAVfunc(ob_skill_space, ac_space, qf_net1)
    targ_qf_net1 = QNet(ob_skill_space, ac_space)
    targ_qf_net1.load_state_dict(qf_net1.state_dict())
    targ_qf1 = DeterministicSAVfunc(ob_skill_space, ac_space, targ_qf_net1)

    qf_net2 = QNet(ob_skill_space, ac_space)
    qf2 = DeterministicSAVfunc(ob_skill_space, ac_space, qf_net2)
    targ_qf_net2 = QNet(ob_skill_space, ac_space)
    targ_qf_net2.load_state_dict(qf_net2.state_dict())
    targ_qf2 = DeterministicSAVfunc(ob_skill_space, ac_space, targ_qf_net2)

    qfs = [qf1, qf2]
    targ_qfs = [targ_qf1, targ_qf2]
    log_alpha = nn.Parameter(torch.ones(()))

    # Effectively unbounded Box describing the discriminator's input features.
    high = np.array([np.finfo(np.float32).max]*f_dim)
    f_space = gym.spaces.Box(-high, high, dtype=np.float32)
    discrim_net = DiaynDiscrimNet(
        f_space, skill_space, h_size=100, discrim_f=discrim_f)
    discrim = DeterministicSVfunc(f_space, discrim_net)

    optim_pol = torch.optim.Adam(pol_net.parameters(), 1e-4)
    optim_qf1 = torch.optim.Adam(qf_net1.parameters(), 3e-4)
    optim_qf2 = torch.optim.Adam(qf_net2.parameters(), 3e-4)
    optim_qfs = [optim_qf1, optim_qf2]
    optim_alpha = torch.optim.Adam([log_alpha], 1e-4)
    optim_discrim = torch.optim.SGD(discrim.parameters(),
                                    lr=0.001, momentum=0.9)

    off_traj = Traj()
    sampler = EpiSampler(self.env, pol, num_parallel=1)
    try:
        epis = sampler.sample(pol, max_steps=200)

        on_traj = Traj()
        on_traj.add_epis(epis)
        on_traj = ef.add_next_obs(on_traj)
        on_traj = ef.compute_diayn_rews(
            on_traj, lambda x: diayn_sac.calc_rewards(x, 4, discrim))
        on_traj.register_epis()
        off_traj.add_traj(on_traj)
        step = on_traj.num_step

        # Rebinding log_alpha after optim_alpha was built means the optimizer
        # still holds the previous parameter, so this new value is never
        # stepped by it: alpha stays fixed at 0.1.
        log_alpha = nn.Parameter(np.log(0.1)*torch.ones(()))  # fix alpha

        result_dict = diayn_sac.train(
            off_traj, pol, qfs, targ_qfs, log_alpha,
            optim_pol, optim_qfs, optim_alpha,
            step, 128, 5e-3, 0.99, 1, discrim, 4, True)
        discrim_losses = diayn.train(
            discrim, optim_discrim, on_traj, 32, 100, 4)
    finally:
        # Drop the sampler reference even when a step above raises.
        del sampler
def test_learning(self):
    """Smoke-test one AIRL update with a TRPO policy step.

    Builds a Gaussian policy, a value function, a reward function and a
    shaping value function, loads the expert episodes from
    ``data/expert_epis/Pendulum-v0_2epis.pkl``, samples agent episodes,
    annotates them with pseudo rewards, values, returns and advantages,
    and calls ``airl.train`` once.  The test succeeds when nothing raises.
    """
    ob_space = self.env.observation_space
    ac_space = self.env.action_space

    pol_net = PolNet(ob_space, ac_space, h1=32, h2=32)
    pol = GaussianPol(ob_space, ac_space, pol_net)

    vf_net = VNet(ob_space)
    vf = DeterministicSVfunc(ob_space, vf_net)

    rewf_net = VNet(ob_space, h1=32, h2=32)
    rewf = DeterministicSVfunc(ob_space, rewf_net)
    shaping_vf_net = VNet(ob_space, h1=32, h2=32)
    shaping_vf = DeterministicSVfunc(ob_space, shaping_vf_net)

    sampler = EpiSampler(self.env, pol, num_parallel=1)
    try:
        optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)
        # A single optimizer updates both discriminator components.
        optim_discrim = torch.optim.Adam(
            list(rewf_net.parameters()) + list(shaping_vf_net.parameters()),
            3e-4)

        # The pickle is a fixture path hard-coded in this test; do not point
        # this at untrusted files.
        with open(os.path.join('data/expert_epis',
                               'Pendulum-v0_2epis.pkl'), 'rb') as f:
            expert_epis = pickle.load(f)
        expert_traj = Traj()
        expert_traj.add_epis(expert_epis)
        expert_traj = ef.add_next_obs(expert_traj)
        expert_traj.register_epis()

        epis = sampler.sample(pol, max_steps=32)

        agent_traj = Traj()
        agent_traj.add_epis(epis)
        agent_traj = ef.add_next_obs(agent_traj)
        agent_traj = ef.compute_pseudo_rews(
            agent_traj, rew_giver=rewf, state_only=True)
        agent_traj = ef.compute_vs(agent_traj, vf)
        agent_traj = ef.compute_rets(agent_traj, 0.99)
        agent_traj = ef.compute_advs(agent_traj, 0.99, 0.95)
        agent_traj = ef.centerize_advs(agent_traj)
        agent_traj = ef.compute_h_masks(agent_traj)
        agent_traj.register_epis()

        result_dict = airl.train(
            agent_traj, expert_traj, pol, vf, optim_vf, optim_discrim,
            rewf=rewf, shaping_vf=shaping_vf, rl_type='trpo',
            epoch=1, batch_size=32,
            discrim_batch_size=32, discrim_step=1,
            pol_ent_beta=1e-3, gamma=0.99)
    finally:
        # Drop the sampler reference even when a step above raises.
        del sampler
def test_learning(self):
    """Smoke-test one PPO-clip and one PPO-KL update with a categorical policy.

    Samples episodes, annotates them with values, returns, centred
    advantages and hidden-state masks, then calls ``ppo_clip.train`` and
    ``ppo_kl.train`` once each on the same trajectory.  The test succeeds
    when nothing raises.
    """
    ob_space = self.env.ob_space
    ac_space = self.env.ac_space

    pol_net = PolNet(ob_space, ac_space, h1=32, h2=32)
    pol = CategoricalPol(ob_space, ac_space, pol_net)

    vf_net = VNet(ob_space, h1=32, h2=32)
    vf = DeterministicSVfunc(ob_space, vf_net)

    sampler = EpiSampler(self.env, pol, num_parallel=1)
    try:
        optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
        optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)

        epis = sampler.sample(pol, max_steps=32)

        traj = Traj()
        traj.add_epis(epis)
        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, 0.99)
        traj = ef.compute_advs(traj, 0.99, 0.95)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()

        result_dict = ppo_clip.train(
            traj=traj, pol=pol, vf=vf, clip_param=0.2,
            optim_pol=optim_pol, optim_vf=optim_vf,
            epoch=1, batch_size=32)
        result_dict = ppo_kl.train(
            traj=traj, pol=pol, vf=vf, kl_beta=0.1, kl_targ=0.2,
            optim_pol=optim_pol, optim_vf=optim_vf,
            epoch=1, batch_size=32, max_grad_norm=10)
    finally:
        # Drop the sampler reference even when a step above raises.
        del sampler
def test_learning(self):
    """Smoke-test one TRPO update with a feed-forward Gaussian policy.

    Samples episodes, annotates them with values, returns, centred
    advantages and hidden-state masks, then calls ``trpo.train`` once.
    The test succeeds when nothing raises.
    """
    ob_space = self.env.observation_space
    ac_space = self.env.action_space

    pol_net = PolNet(ob_space, ac_space, h1=32, h2=32)
    pol = GaussianPol(ob_space, ac_space, pol_net)

    vf_net = VNet(ob_space, h1=32, h2=32)
    vf = DeterministicSVfunc(ob_space, vf_net)

    sampler = EpiSampler(self.env, pol, num_parallel=1)
    try:
        # Only the value function gets an optimizer; the policy is passed to
        # trpo.train without one.
        optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)

        epis = sampler.sample(pol, max_steps=32)

        traj = Traj()
        traj.add_epis(epis)
        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, 0.99)
        traj = ef.compute_advs(traj, 0.99, 0.95)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()

        result_dict = trpo.train(traj, pol, vf, optim_vf, 1, 24)
    finally:
        # Drop the sampler reference even when a step above raises.
        del sampler
def test_learning_rnn(self):
    """Smoke-test one TRPO update with LSTM policy and value networks.

    Uses a categorical policy built with ``rnn=True``, samples episodes,
    annotates them with values, returns, centred advantages and
    hidden-state masks, then calls ``trpo.train`` once.  The test succeeds
    when nothing raises.
    """
    ob_space = self.env.observation_space
    ac_space = self.env.action_space

    pol_net = PolNetLSTM(ob_space, ac_space, h_size=32, cell_size=32)
    pol = CategoricalPol(ob_space, ac_space, pol_net, rnn=True)

    vf_net = VNetLSTM(ob_space, h_size=32, cell_size=32)
    vf = DeterministicSVfunc(ob_space, vf_net, rnn=True)

    sampler = EpiSampler(self.env, pol, num_parallel=1)
    try:
        optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)

        epis = sampler.sample(pol, max_steps=400)

        traj = Traj()
        traj.add_epis(epis)
        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, 0.99)
        traj = ef.compute_advs(traj, 0.99, 0.95)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()

        result_dict = trpo.train(traj, pol, vf, optim_vf, 1, 2)
    finally:
        # Drop the sampler reference even when a step above raises.
        del sampler
def test_learning_rnn(self):
    """Smoke-test one PPO-clip and one PPO-KL update with LSTM networks.

    Uses a Gaussian policy built with ``rnn=True``, samples episodes,
    annotates them with values, returns, centred advantages and
    hidden-state masks, then calls ``ppo_clip.train`` and ``ppo_kl.train``
    once each on the same trajectory.  The test succeeds when nothing
    raises.
    """
    ob_space = self.env.observation_space
    ac_space = self.env.action_space

    pol_net = PolNetLSTM(ob_space, ac_space, h_size=32, cell_size=32)
    pol = GaussianPol(ob_space, ac_space, pol_net, rnn=True)

    vf_net = VNetLSTM(ob_space, h_size=32, cell_size=32)
    vf = DeterministicSVfunc(ob_space, vf_net, rnn=True)

    sampler = EpiSampler(self.env, pol, num_parallel=1)
    try:
        optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
        optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)

        epis = sampler.sample(pol, max_steps=400)

        traj = Traj()
        traj.add_epis(epis)
        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, 0.99)
        traj = ef.compute_advs(traj, 0.99, 0.95)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()

        result_dict = ppo_clip.train(
            traj=traj, pol=pol, vf=vf, clip_param=0.2,
            optim_pol=optim_pol, optim_vf=optim_vf,
            epoch=1, batch_size=2)
        result_dict = ppo_kl.train(
            traj=traj, pol=pol, vf=vf, kl_beta=0.1, kl_targ=0.2,
            optim_pol=optim_pol, optim_vf=optim_vf,
            epoch=1, batch_size=2, max_grad_norm=20)
    finally:
        # Drop the sampler reference even when a step above raises.
        del sampler
def test_learning(self):
    """Smoke-test one GAIL update with a TRPO policy step.

    Builds a Gaussian policy, a value function and a state-action
    discriminator, loads the expert episodes from
    ``data/expert_epis/Pendulum-v0_2epis.pkl``, samples agent episodes,
    annotates them with discriminator pseudo rewards, values, returns and
    advantages, and calls ``gail.train`` once.  The test succeeds when
    nothing raises.
    """
    ob_space = self.env.ob_space
    ac_space = self.env.ac_space

    pol_net = PolNet(ob_space, ac_space, h1=32, h2=32)
    pol = GaussianPol(ob_space, ac_space, pol_net)

    vf_net = VNet(ob_space)
    vf = DeterministicSVfunc(ob_space, vf_net)

    discrim_net = DiscrimNet(ob_space, ac_space, h1=32, h2=32)
    discrim = DeterministicSAVfunc(ob_space, ac_space, discrim_net)

    sampler = EpiSampler(self.env, pol, num_parallel=1)
    try:
        optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)
        optim_discrim = torch.optim.Adam(discrim_net.parameters(), 3e-4)

        # The pickle is a fixture path hard-coded in this test; do not point
        # this at untrusted files.
        with open(os.path.join('data/expert_epis',
                               'Pendulum-v0_2epis.pkl'), 'rb') as f:
            expert_epis = pickle.load(f)
        expert_traj = Traj()
        expert_traj.add_epis(expert_epis)
        expert_traj.register_epis()

        epis = sampler.sample(pol, max_steps=32)

        agent_traj = Traj()
        agent_traj.add_epis(epis)
        agent_traj = ef.compute_pseudo_rews(agent_traj, discrim)
        agent_traj = ef.compute_vs(agent_traj, vf)
        agent_traj = ef.compute_rets(agent_traj, 0.99)
        agent_traj = ef.compute_advs(agent_traj, 0.99, 0.95)
        agent_traj = ef.centerize_advs(agent_traj)
        agent_traj = ef.compute_h_masks(agent_traj)
        agent_traj.register_epis()

        result_dict = gail.train(
            agent_traj, expert_traj, pol, vf, discrim,
            optim_vf, optim_discrim,
            rl_type='trpo', epoch=1, batch_size=32,
            discrim_batch_size=32, discrim_step=1,
            pol_ent_beta=1e-3, discrim_ent_beta=1e-5)
    finally:
        # Drop the sampler reference even when a step above raises.
        del sampler
def setUpClass(cls):
    """Populate the redis store with an env, a trajectory and several models.

    Samples a short trajectory with a random policy, records its step
    count on ``cls.num_step``, connects to redis at ``localhost:6379`` and
    stores the environment, the trajectory and cloudpickled policies and
    value functions under fixed keys.
    """
    env = GymEnv('Pendulum-v0')
    # The original read ``cls.env`` and ``pol`` here; neither is assigned
    # anywhere in this method, so use the local env and the random policy.
    random_pol = RandomPol(env.observation_space, env.action_space)
    sampler = EpiSampler(env, random_pol, num_parallel=1)
    try:
        epis = sampler.sample(random_pol, max_steps=32)
    finally:
        # The sampler is only needed for this one batch of episodes.
        del sampler
    traj = Traj()
    traj.add_epis(epis)
    traj.register_epis()

    cls.num_step = traj.num_step

    make_redis('localhost', '6379')
    cls.r = get_redis()

    # NOTE(review): unlike the models below, env and traj are stored without
    # cloudpickle.dumps - confirm the readers of these keys expect that.
    cls.r.set('env', env)
    cls.r.set('traj', traj)

    pol_net = PolNet(env.observation_space, env.action_space)
    gpol = GaussianPol(env.observation_space, env.action_space, pol_net)
    pol_net = PolNet(env.observation_space,
                     env.action_space, deterministic=True)
    dpol = DeterministicActionNoisePol(
        env.observation_space, env.action_space, pol_net)
    model_net = ModelNet(env.observation_space, env.action_space)
    # rew_func is not defined in this method; it is expected at module level.
    mpcpol = MPCPol(env.observation_space,
                    env.action_space, model_net, rew_func)
    q_net = QNet(env.observation_space, env.action_space)
    qfunc = DeterministicSAVfunc(
        env.observation_space, env.action_space, q_net)
    aqpol = ArgmaxQfPol(env.observation_space, env.action_space, qfunc)
    v_net = VNet(env.observation_space)
    vfunc = DeterministicSVfunc(env.observation_space, v_net)

    cls.r.set('gpol', cloudpickle.dumps(gpol))
    cls.r.set('dpol', cloudpickle.dumps(dpol))
    cls.r.set('mpcpol', cloudpickle.dumps(mpcpol))
    cls.r.set('qfunc', cloudpickle.dumps(qfunc))
    cls.r.set('aqpol', cloudpickle.dumps(aqpol))
    cls.r.set('vfunc', cloudpickle.dumps(vfunc))

    # The multi-categorical policy must be built from the same wrapped
    # spaces as its network, not from the unwrapped env's spaces.
    c2d = C2DEnv(env)
    pol_net = PolNet(c2d.observation_space, c2d.action_space)
    mcpol = MultiCategoricalPol(
        c2d.observation_space, c2d.action_space, pol_net)

    cls.r.set('mcpol', cloudpickle.dumps(mcpol))
# Send tabular results to the score file and scalars to tensorboard.
logger.add_tabular_output(score_file)
logger.add_tensorboard_output(args.log)

# Environment; video output (when enabled) goes under <log>/movie.
env = GymEnv(args.env_name, log_dir=os.path.join(args.log, 'movie'), record_video=args.record)
env.env.seed(args.seed)

observation_space = env.observation_space
action_space = env.action_space

# Gaussian policy.
pol_net = PolNet(observation_space, action_space)
pol = GaussianPol(observation_space, action_space, pol_net)

# State-value function.
vf_net = VNet(observation_space)
vf = DeterministicSVfunc(observation_space, vf_net)

# Q-function and a target copy that starts from identical weights.
qf_net = QNet(observation_space, action_space)
qf = DeterministicSAVfunc(observation_space, action_space, qf_net)
targ_qf_net = QNet(observation_space, action_space)
targ_qf_net.load_state_dict(qf_net.state_dict())
targ_qf = DeterministicSAVfunc(observation_space, action_space, targ_qf_net)

# Scalar parameter initialised to 0 on `device`; it has its own optimizer below.
log_alpha = nn.Parameter(torch.zeros((), device=device))

sampler = EpiSampler(env, pol, args.num_parallel, seed=args.seed)

# One Adam optimizer per trainable component.
optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr)
optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr)
optim_qf = torch.optim.Adam(qf_net.parameters(), args.qf_lr)
# log_alpha reuses the policy learning rate.
optim_alpha = torch.optim.Adam([log_alpha], args.pol_lr)
pol_net = PolNet(ob_space, ac_space) if isinstance(ac_space, gym.spaces.Box): pol = GaussianPol(ob_space, ac_space, pol_net, data_parallel=args.data_parallel) elif isinstance(ac_space, gym.spaces.Discrete): pol = CategoricalPol(ob_space, ac_space, pol_net, data_parallel=args.data_parallel) elif isinstance(ac_space, gym.spaces.MultiDiscrete): pol = MultiCategoricalPol( ob_space, ac_space, pol_net, data_parallel=args.data_parallel) else: raise ValueError('Only Box, Discrete, and MultiDiscrete are supported') vf_net = VNet(ob_space) vf = DeterministicSVfunc(ob_space, vf_net, data_parallel=args.data_parallel) if args.rew_type == 'rew': rewf_net = VNet(ob_space, h1=args.discrim_h1, h2=args.discrim_h2) rewf = DeterministicSVfunc( ob_space, rewf_net, data_parallel=args.data_parallel) shaping_vf_net = VNet(ob_space, h1=args.discrim_h1, h2=args.discrim_h2) shaping_vf = DeterministicSVfunc( ob_space, shaping_vf_net, data_parallel=args.data_parallel) optim_discrim = torch.optim.Adam( list(rewf_net.parameters()) + list(shaping_vf_net.parameters()), args.discrim_lr) advf = None elif args.rew_type == 'adv': advf_net = DiscrimNet(ob_space, ac_space, h1=args.discrim_h1, h2=args.discrim_h2) advf = DeterministicSAVfunc(
action_space = env.action_space if args.rnn: pol_net = PolNetLSTM(observation_space, action_space, h_size=256, cell_size=256) else: pol_net = PolNet(observation_space, action_space) pol = GaussianPol(observation_space, action_space, pol_net, args.rnn, data_parallel=args.data_parallel, parallel_dim=1 if args.rnn else 0) if args.rnn: vf_net = VNetLSTM(observation_space, h_size=256, cell_size=256) else: vf_net = VNet(observation_space) vf = DeterministicSVfunc(observation_space, vf_net, args.rnn, data_parallel=args.data_parallel, parallel_dim=1 if args.rnn else 0) sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr) total_epi = 0 total_step = 0 max_rew = -1e6 while args.max_epis > total_epi: with measure('sample'): epis = sampler.sample(pol, max_steps=args.max_steps_per_iter) with measure('train'): traj = Traj() traj.add_epis(epis)
else: pol_net = PolNet(ob_space, ac_space) if isinstance(ac_space, gym.spaces.Box): pol = GaussianPol(ob_space, ac_space, pol_net, args.rnn) elif isinstance(ac_space, gym.spaces.Discrete): pol = CategoricalPol(ob_space, ac_space, pol_net, args.rnn) elif isinstance(ac_space, gym.spaces.MultiDiscrete): pol = MultiCategoricalPol(ob_space, ac_space, pol_net, args.rnn) else: raise ValueError('Only Box, Discrete, and MultiDiscrete are supported') if args.rnn: vf_net = VNetLSTM(ob_space, h_size=256, cell_size=256) else: vf_net = VNet(ob_space) vf = DeterministicSVfunc(ob_space, vf_net, args.rnn) sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed) optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr) total_epi = 0 total_step = 0 max_rew = -1e6 while args.max_epis > total_epi: with measure('sample'): epis = sampler.sample(pol, max_steps=args.max_steps_per_iter) with measure('train'): traj = Traj() traj.add_epis(epis) traj = ef.compute_vs(traj, vf)
pol = MultiCategoricalPol(ob_space, ac_space, pol_net, args.rnn, data_parallel=args.data_parallel, parallel_dim=1 if args.rnn else 0) else: raise ValueError('Only Box, Discrete, and MultiDiscrete are supported') if args.rnn: vf_net = VNetLSTM(ob_space, h_size=256, cell_size=256) else: vf_net = VNet(ob_space) vf = DeterministicSVfunc(ob_space, vf_net, args.rnn, data_parallel=args.data_parallel, parallel_dim=1 if args.rnn else 0) sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr) total_epi = 0 total_step = 0 max_rew = -1e6 kl_beta = args.init_kl_beta while args.max_epis > total_epi: with measure('sample'): epis = sampler.sample(pol, max_steps=args.max_steps_per_iter)
data_parallel=args.data_parallel, parallel_dim=0) targ_qf_net2 = QNet(ob_skill_space, action_space) targ_qf_net2.load_state_dict(qf_net2.state_dict()) targ_qf2 = DeterministicSAVfunc( ob_skill_space, action_space, targ_qf_net2, data_parallel=args.data_parallel, parallel_dim=0) qfs = [qf1, qf2] targ_qfs = [targ_qf1, targ_qf2] log_alpha = nn.Parameter(torch.ones((), device=device)) high = np.array([np.finfo(np.float32).max]*f_dim) f_space = gym.spaces.Box(-high, high, dtype=np.float32) discrim_net = DiaynDiscrimNet( f_space, skill_space, h_size=args.discrim_h_size, discrim_f=discrim_f).to(device) discrim = DeterministicSVfunc( f_space, discrim_net, rnn=False, data_parallel=False, parallel_dim=0) # set optimizer to both models optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) optim_qf1 = torch.optim.Adam(qf_net1.parameters(), args.qf_lr) optim_qf2 = torch.optim.Adam(qf_net2.parameters(), args.qf_lr) optim_qfs = [optim_qf1, optim_qf2] optim_alpha = torch.optim.Adam([log_alpha], args.pol_lr) optim_discrim = torch.optim.SGD(discrim.parameters( ), lr=args.discrim_lr, momentum=args.discrim_momentum) off_traj = Traj() sampler = EpiSampler( env, pol, num_parallel=args.num_parallel, seed=args.seed)
pol_net = PolNet(observation_space, action_space) if isinstance(action_space, gym.spaces.Box): pol = GaussianPol(observation_space, action_space, pol_net, args.rnn) elif isinstance(action_space, gym.spaces.Discrete): pol = CategoricalPol(observation_space, action_space, pol_net, args.rnn) elif isinstance(action_space, gym.spaces.MultiDiscrete): pol = MultiCategoricalPol(observation_space, action_space, pol_net, args.rnn) else: raise ValueError('Only Box, Discrete, and MultiDiscrete are supported') if args.rnn: vf_net = VNetLSTM(observation_space, h_size=256, cell_size=256) else: vf_net = VNet(observation_space) vf = DeterministicSVfunc(observation_space, vf_net, args.rnn) if rank == 0: sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol.parameters(), args.pol_lr) optim_vf = torch.optim.Adam(vf.parameters(), args.vf_lr) ddp_pol, optim_pol = make_model_distributed(pol, optim_pol, args.use_apex, args.apex_opt_level, args.apex_keep_batchnorm_fp32,
def setup_nets(self):
    """Create the policy and value function and store them on ``self``.

    The network architecture is chosen from ``self.args``:

    * ``mirror is True`` selects ``SymmetricNet`` / ``SymmetricValue``;
    * otherwise ``rnn`` selects the LSTM networks;
    * otherwise ``net_version == 1`` selects ``PolNet`` / ``VNet``;
    * otherwise ``PolNetB`` / ``VNetB`` are used.

    When ``mirror == "new"`` the networks chosen above are additionally
    wrapped in ``SymNet`` / ``SymVNet``.  The policy class follows the
    type of the action space.  Results are written to ``self.pol`` and
    ``self.vf``.

    Raises:
        ValueError: if the action space is not Box, Discrete or
            MultiDiscrete.
    """
    cfg = self.args
    obs_sp = self.env.observation_space
    act_sp = self.env.action_space

    # ---- policy network ----
    if cfg.mirror is True:
        print("Initiating a symmetric network")
        actor_net = SymmetricNet(
            *self.env.unwrapped.mirror_sizes,
            hidden_size=int(cfg.hidden_size / 4),
            num_layers=cfg.num_layers,
            varying_std=cfg.varying_std,
            tanh_finish=cfg.tanh_finish,
            log_std=cfg.log_stdev,
        )
    elif cfg.rnn:
        actor_net = PolNetLSTM(obs_sp, act_sp, h_size=256, cell_size=256)
    elif cfg.net_version == 1:
        actor_net = PolNet(obs_sp, act_sp, log_std=cfg.log_stdev)
    else:
        actor_net = PolNetB(
            obs_sp,
            act_sp,
            hidden_size=cfg.hidden_size,
            num_layers=cfg.num_layers,
            varying_std=cfg.varying_std,
            tanh_finish=cfg.tanh_finish,
            log_std=cfg.log_stdev,
        )

    if cfg.mirror == "new":
        print("Initiating a new symmetric network")
        # TODO: the action space given to the wrapped network is not correct
        # in this case and is not easy to fix; for now this wrapper simply
        # ignores some of the final indices.
        actor_net = SymNet(
            actor_net,
            obs_sp.shape[0],
            *self.env.unwrapped.sym_act_inds,
            varying_std=cfg.varying_std,
            log_std=cfg.log_stdev,
            deterministic=False,
        )

    # ---- policy wrapper, picked by action-space type ----
    if isinstance(act_sp, gym.spaces.Box):
        pol_class = GaussianPol
    elif isinstance(act_sp, gym.spaces.Discrete):
        pol_class = CategoricalPol
    elif isinstance(act_sp, gym.spaces.MultiDiscrete):
        pol_class = MultiCategoricalPol
    else:
        raise ValueError(
            "Only Box, Discrete, and MultiDiscrete are supported")

    built_pol = pol_class(
        obs_sp,
        act_sp,
        actor_net,
        cfg.rnn,
        data_parallel=cfg.data_parallel,
        parallel_dim=1 if cfg.rnn else 0,
    )

    # ---- value network ----
    if cfg.mirror is True:
        critic_net = SymmetricValue(
            *self.env.unwrapped.mirror_sizes[:3],
            hidden_size=cfg.hidden_size,
            num_layers=cfg.num_layers,
        )
    elif cfg.rnn:
        critic_net = VNetLSTM(obs_sp, h_size=256, cell_size=256)
    elif cfg.net_version == 1:
        critic_net = VNet(obs_sp)
    else:
        critic_net = VNetB(
            obs_sp,
            hidden_size=cfg.hidden_size,
            num_layers=cfg.num_layers,
        )

    if cfg.mirror == "new":
        print("Initiating a new symmetric value network")
        critic_net = SymVNet(critic_net, obs_sp.shape[0])

    built_vf = DeterministicSVfunc(
        obs_sp,
        critic_net,
        cfg.rnn,
        data_parallel=cfg.data_parallel,
        parallel_dim=1 if cfg.rnn else 0,
    )

    # Publish both only after everything was constructed successfully.
    self.pol = built_pol
    self.vf = built_vf
# Log the total number of scalar parameters in the policy network
# (length of the flattened parameter vector).
logger.log(str(len(torch.nn.utils.parameters_to_vector(pol_net.parameters()))))

# Pick the policy class that matches the action-space type.
if isinstance(ac_space, gym.spaces.Box):
    pol = GaussianPol(ob_space, ac_space, pol_net, data_parallel=args.data_parallel)
elif isinstance(ac_space, gym.spaces.Discrete):
    pol = CategoricalPol(ob_space, ac_space, pol_net, data_parallel=args.data_parallel)
elif isinstance(ac_space, gym.spaces.MultiDiscrete):
    pol = MultiCategoricalPol(ob_space, ac_space, pol_net, data_parallel=args.data_parallel)
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

# Optionally restore policy weights; the map_location lambda returns the
# storage unchanged instead of moving it to the device it was saved from.
if args.pol:
    pol.load_state_dict(torch.load(args.pol, map_location=lambda storage, loc: storage))

vf_net = VNetSNAILConstant(ob_space, args.timestep, args.num_channels, num_keys=args.num_keys, num_tc_fils=args.num_tc_fils, no_attention=args.no_attention, use_pe=args.use_pe)
vf = DeterministicSVfunc(ob_space, vf_net, data_parallel=args.data_parallel)

# Optionally restore value-function weights.
if args.vf:
    vf.load_state_dict(torch.load(args.vf, map_location=lambda storage, loc: storage))

sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)
# A second, single-process sampler is created only when center_env is set.
if center_env is not None:
    center_sampler = EpiSampler(center_env, pol, num_parallel=1, seed=args.seed)

# Optimizers, each optionally restored from a saved state.
optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr)
if args.optim_pol:
    optim_pol.load_state_dict(torch.load(args.optim_pol, map_location=lambda storage, loc: storage))

optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr)
if args.optim_vf:
    optim_vf.load_state_dict(torch.load(args.optim_vf, map_location=lambda storage, loc: storage))
pol_net = PolNet(observation_space, action_space) if isinstance(action_space, gym.spaces.Box): pol = GaussianPol(observation_space, action_space, pol_net, args.rnn) elif isinstance(action_space, gym.spaces.Discrete): pol = CategoricalPol(observation_space, action_space, pol_net, args.rnn) elif isinstance(action_space, gym.spaces.MultiDiscrete): pol = MultiCategoricalPol( observation_space, action_space, pol_net, args.rnn) else: raise ValueError('Only Box, Discrete, and MultiDiscrete are supported') if args.rnn: vf_net = VNetLSTM(observation_space, h_size=256, cell_size=256) else: vf_net = VNet(observation_space) vf = DeterministicSVfunc(observation_space, vf_net, args.rnn) sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr) total_epi = 0 total_step = 0 max_rew = -1e6 kl_beta = args.init_kl_beta while args.max_epis > total_epi: with measure('sample'): epis = sampler.sample(pol, max_steps=args.max_steps_per_iter) with measure('train'): traj = Traj()
pol = MultiCategoricalPol(observation_space, action_space, pol_net, args.rnn, data_parallel=args.ddp, parallel_dim=1 if args.rnn else 0) else: raise ValueError('Only Box, Discrete, and MultiDiscrete are supported') if args.rnn: vf_net = VNetLSTM(observation_space, h_size=256, cell_size=256) else: vf_net = VNet(observation_space) vf = DeterministicSVfunc(observation_space, vf_net, args.rnn, data_parallel=args.ddp, parallel_dim=1 if args.rnn else 0) if dist.get_rank() == 0: sampler = DistributedEpiSampler(args.sampler_world_size, env=env, pol=pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr) total_epi = 0 total_step = 0
observation_space = env.observation_space action_space = env.action_space pol_net = PolNet(observation_space, action_space) if isinstance(action_space, gym.spaces.Box): pol = GaussianPol(observation_space, action_space, pol_net) elif isinstance(action_space, gym.spaces.Discrete): pol = CategoricalPol(observation_space, action_space, pol_net) elif isinstance(action_space, gym.spaces.MultiDiscrete): pol = MultiCategoricalPol(observation_space, action_space, pol_net) else: raise ValueError('Only Box, Discrete, and MultiDiscrete are supported') vf_net = VNet(observation_space) vf = DeterministicSVfunc(observation_space, vf_net) if args.rew_type == 'rew': rewf_net = VNet(observation_space, h1=args.discrim_h1, h2=args.discrim_h2) rewf = DeterministicSVfunc(observation_space, rewf_net) shaping_vf_net = VNet(observation_space, h1=args.discrim_h1, h2=args.discrim_h2) shaping_vf = DeterministicSVfunc(observation_space, shaping_vf_net) optim_discrim = torch.optim.Adam( list(rewf_net.parameters()) + list(shaping_vf_net.parameters()), args.discrim_lr) advf = None elif args.rew_type == 'adv': advf_net = DiscrimNet(observation_space, action_space,
record_video=args.record) env.env.seed(args.seed) ob_space = env.observation_space ac_space = env.action_space pol_net = PolNet(ob_space, ac_space) pol = GaussianPol(ob_space, ac_space, pol_net, data_parallel=args.data_parallel, parallel_dim=0) vf_net = VNet(ob_space) vf = DeterministicSVfunc(ob_space, vf_net, data_parallel=args.data_parallel, parallel_dim=0) qf_net = QNet(ob_space, ac_space) qf = DeterministicSAVfunc(ob_space, ac_space, qf_net, data_parallel=args.data_parallel, parallel_dim=0) targ_qf_net = QNet(ob_space, ac_space) targ_qf_net.load_state_dict(qf_net.state_dict()) targ_qf = DeterministicSAVfunc(ob_space, ac_space, targ_qf_net, data_parallel=args.data_parallel, parallel_dim=0)
def main(args):
    """Run the sample/train loop until ``args.max_epis`` episodes are consumed.

    Sets up logging under ``args.log``, builds the environment, policy and
    value function, then alternates between sampling episodes with the
    trainer's current policy state and calling ``trainer.train``.  After
    every iteration the latest policy, value-function and optimizer states
    are saved under ``<log>/models``; a separate ``*_max.pkl`` set is
    saved whenever the mean episode reward improves.

    Args:
        args: parsed options; the attributes read here include ``log``,
            ``env_name``, ``seed``, ``c2d``, ``num_trainer``,
            ``master_address``, ``num_parallel``, ``max_epis`` and
            ``max_steps_per_iter``, plus the ray settings.

    Raises:
        ValueError: if the action space is not Box, Discrete or
            MultiDiscrete.
    """
    init_ray(args.num_cpus, args.num_gpus, args.ray_redis_address)

    # exist_ok avoids the check-then-create race and creates <log> as well.
    models_dir = os.path.join(args.log, 'models')
    os.makedirs(models_dir, exist_ok=True)

    score_file = os.path.join(args.log, 'progress.csv')
    logger.add_tabular_output(score_file)
    logger.add_tensorboard_output(args.log)

    with open(os.path.join(args.log, 'args.json'), 'w') as f:
        json.dump(vars(args), f)
    pprint(vars(args))

    # when doing the distributed training, disable video recordings
    env = GymEnv(args.env_name)
    env.env.seed(args.seed)
    if args.c2d:
        env = C2DEnv(env)

    observation_space = env.observation_space
    action_space = env.action_space

    pol_net = PolNet(observation_space, action_space)
    rnn = False
    # To try a recurrent policy instead, build PolNetLSTM(observation_space,
    # action_space) above and set rnn = True.
    if isinstance(action_space, gym.spaces.Box):
        pol = GaussianPol(observation_space, action_space, pol_net, rnn=rnn)
    elif isinstance(action_space, gym.spaces.Discrete):
        pol = CategoricalPol(observation_space, action_space, pol_net)
    elif isinstance(action_space, gym.spaces.MultiDiscrete):
        pol = MultiCategoricalPol(observation_space, action_space, pol_net)
    else:
        raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

    vf_net = VNet(observation_space)
    vf = DeterministicSVfunc(observation_space, vf_net)

    trainer = TrainManager(Trainer, args.num_trainer, args.master_address,
                           args=args, vf=vf, pol=pol)
    sampler = EpiSampler(env, pol, args.num_parallel, seed=args.seed)

    try:
        total_epi = 0
        total_step = 0
        max_rew = -1e6
        start_time = time.time()

        while args.max_epis > total_epi:
            with measure('sample'):
                # Sample with the policy weights currently held by the trainer.
                sampler.set_pol_state(trainer.get_state("pol"))
                epis = sampler.sample(max_steps=args.max_steps_per_iter)

            with measure('train'):
                result_dict = trainer.train(epis=epis)

            step = result_dict["traj_num_step"]
            total_step += step
            total_epi += result_dict["traj_num_epi"]

            rewards = [np.sum(epi['rews']) for epi in epis]
            mean_rew = np.mean(rewards)
            elapsed_time = time.time() - start_time
            logger.record_tabular('ElapsedTime', elapsed_time)
            logger.record_results(args.log, result_dict, score_file,
                                  total_epi, step, total_step,
                                  rewards,
                                  plot_title=args.env_name)

            with measure('save'):
                # Insertion order fixes the order in which files are written.
                states = {
                    'pol': trainer.get_state("pol"),
                    'vf': trainer.get_state("vf"),
                    'optim_pol': trainer.get_state("optim_pol"),
                    'optim_vf': trainer.get_state("optim_vf"),
                }
                for name, state in states.items():
                    torch.save(state,
                               os.path.join(models_dir, name + '_last.pkl'))

                if mean_rew > max_rew:
                    for name, state in states.items():
                        torch.save(state,
                                   os.path.join(models_dir, name + '_max.pkl'))
                    max_rew = mean_rew
    finally:
        # Drop the sampler and trainer references even when the loop raises.
        del sampler
        del trainer
parallel_dim=0) qfs = [qf1, qf2] targ_qfs = [targ_qf1, targ_qf2] log_alpha = nn.Parameter(torch.ones((), device=device)) high = np.array([np.finfo(np.float32).max] * f_dim) f_space = gym.spaces.Box(-high, high, dtype=np.float32) discrim_net = DiaynDiscrimNet(f_space, skill_space, h_size=args.discrim_h_size, discrim_f=discrim_f).to(device) discrim = DeterministicSVfunc(f_space, discrim_net, rnn=False, data_parallel=False, parallel_dim=0) # set optimizer to both models optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) optim_qf1 = torch.optim.Adam(qf_net1.parameters(), args.qf_lr) optim_qf2 = torch.optim.Adam(qf_net2.parameters(), args.qf_lr) optim_qfs = [optim_qf1, optim_qf2] optim_alpha = torch.optim.Adam([log_alpha], args.pol_lr) optim_discrim = torch.optim.SGD(discrim.parameters(), lr=args.discrim_lr, momentum=args.discrim_momentum) off_traj = Traj() sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)
elif isinstance(action_space, gym.spaces.Discrete): pol = CategoricalPol(observation_space, action_space, pol_net, data_parallel=args.data_parallel) elif isinstance(action_space, gym.spaces.MultiDiscrete): pol = MultiCategoricalPol(observation_space, action_space, pol_net, data_parallel=args.data_parallel) else: raise ValueError('Only Box, Discrete, and MultiDiscrete are supported') vf_net = VNet(observation_space) vf = DeterministicSVfunc(observation_space, vf_net, data_parallel=args.data_parallel) sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr) with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f: expert_epis = pickle.load(f) expert_traj = Traj() expert_traj.add_epis(expert_epis) expert_traj = ef.add_next_obs(expert_traj) expert_traj.register_epis() expert_rewards = [np.sum(epi['rews']) for epi in expert_epis] expert_mean_rew = np.mean(expert_rewards)
# Environment: video recording is always on and written under <log_dir_name>/movie.
env_name = 'RoboschoolPremaidAIWalker-v0'
env = GymEnv(env_name, log_dir=os.path.join(log_dir_name, 'movie'), record_video=True)
env.env.seed(seed)

# Observation and action spaces used to size the networks below.
observation_space = env.observation_space
action_space = env.action_space

# Gaussian policy.
pol_net = PolNet(observation_space, action_space)
pol = GaussianPol(observation_space, action_space, pol_net)

# State-value function.
vf_net = VNet(observation_space)
vf = DeterministicSVfunc(observation_space, vf_net)

# One Adam optimizer per network, each with its own learning rate.
optim_pol = torch.optim.Adam(pol_net.parameters(), lr=1e-4)
optim_vf = torch.optim.Adam(vf_net.parameters(), lr=3e-4)

# PPO hyperparameters.
# NOTE(review): gamma/lam are presumably the discount and GAE lambda, as in
# the compute_rets / compute_advs calls elsewhere - confirm at the call site.
gamma = 0.99
lam = 0.95
clip_param = 0.2
epoch_per_iter = 4
batch_size = 64
max_grad_norm = 0.5

# Number of sampler workers.
num_parallel = 16
sampler = EpiSampler(env, pol, num_parallel=num_parallel, seed=seed)