def setUpClass(cls):
    """Create the shared fixtures and publish them to a local redis server.

    A short random-policy rollout on ``Pendulum-v0`` is collected into a
    ``Traj``; its step count is kept on the class as ``cls.num_step`` and
    the redis client as ``cls.r``.  The environment, the trajectory and one
    instance of each policy / value-function type are then stored in redis
    under the keys ``env``, ``traj``, ``gpol``, ``dpol``, ``mpcpol``,
    ``qfunc``, ``aqpol``, ``vfunc`` and ``mcpol``.

    Every value is serialized with ``cloudpickle.dumps`` so readers can
    restore it with ``cloudpickle.loads``.

    Requires a redis server reachable at ``localhost:6379``.
    """
    env = GymEnv('Pendulum-v0')
    ob_space = env.observation_space
    ac_space = env.action_space

    # The environment and the random policy only exist as locals here, so
    # they are referenced directly (``cls.env`` and ``pol`` are never
    # defined in this method).
    random_pol = RandomPol(ob_space, ac_space)
    sampler = EpiSampler(env, random_pol, num_parallel=1)
    epis = sampler.sample(random_pol, max_steps=32)

    traj = Traj()
    traj.add_epis(epis)
    traj.register_epis()
    cls.num_step = traj.num_step

    make_redis('localhost', '6379')
    cls.r = get_redis()

    # Arbitrary Python objects cannot be stored as redis values as-is;
    # pickle them like every other object stored below.
    cls.r.set('env', cloudpickle.dumps(env))
    cls.r.set('traj', cloudpickle.dumps(traj))

    # Each network gets its own name instead of rebinding ``pol_net``.
    gauss_net = PolNet(ob_space, ac_space)
    gpol = GaussianPol(ob_space, ac_space, gauss_net)

    det_net = PolNet(ob_space, ac_space, deterministic=True)
    dpol = DeterministicActionNoisePol(ob_space, ac_space, det_net)

    model_net = ModelNet(ob_space, ac_space)
    # NOTE(review): ``rew_func`` is not defined in this method; it is
    # presumably a module-level helper -- confirm it is in scope.
    mpcpol = MPCPol(ob_space, ac_space, model_net, rew_func)

    q_net = QNet(ob_space, ac_space)
    qfunc = DeterministicSAVfunc(ob_space, ac_space, q_net)
    aqpol = ArgmaxQfPol(ob_space, ac_space, qfunc)

    v_net = VNet(ob_space)
    vfunc = DeterministicSVfunc(ob_space, v_net)

    # The multi-categorical policy must be built from the same (wrapped)
    # spaces as its network, not from the continuous spaces of ``env``.
    c2d = C2DEnv(env)
    mc_net = PolNet(c2d.observation_space, c2d.action_space)
    mcpol = MultiCategoricalPol(
        c2d.observation_space, c2d.action_space, mc_net)

    shared_objects = (
        ('gpol', gpol),
        ('dpol', dpol),
        ('mpcpol', mpcpol),
        ('qfunc', qfunc),
        ('aqpol', aqpol),
        ('vfunc', vfunc),
        ('mcpol', mcpol),
    )
    for key, obj in shared_objects:
        cls.r.set(key, cloudpickle.dumps(obj))
action_space = env.action_space pol_net = PolNet(observation_space, action_space) if isinstance(action_space, gym.spaces.Box): pol = GaussianPol(observation_space, action_space, pol_net, data_parallel=args.data_parallel) elif isinstance(action_space, gym.spaces.Discrete): pol = CategoricalPol(observation_space, action_space, pol_net, data_parallel=args.data_parallel) elif isinstance(action_space, gym.spaces.MultiDiscrete): pol = MultiCategoricalPol(observation_space, action_space, pol_net, data_parallel=args.data_parallel) else: raise ValueError('Only Box, Discrete, and MultiDiscrete are supported') vf_net = VNet(observation_space) vf = DeterministicSVfunc(observation_space, vf_net, data_parallel=args.data_parallel) sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr) with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f:
h_size=256, cell_size=256) s_pol_net = PolNetLSTM(observation_space, action_space, h_size=256, cell_size=256) else: t_pol_net = PolNet(observation_space, action_space) s_pol_net = PolNet(observation_space, action_space, h1=190, h2=90) if isinstance(action_space, gym.spaces.Box): t_pol = GaussianPol(observation_space, action_space, t_pol_net, args.rnn) s_pol = GaussianPol(observation_space, action_space, s_pol_net, args.rnn) elif isinstance(action_space, gym.spaces.Discrete): t_pol = CategoricalPol( observation_space, action_space, t_pol_net, args.rnn) s_pol = CategoricalPol( observation_space, action_space, s_pol_net, args.rnn) elif isinstance(action_space, gym.spaces.MultiDiscrete): t_pol = MultiCategoricalPol( observation_space, action_space, t_pol_net, args.rnn) s_pol = MultiCategoricalPol( observation_space, action_space, s_pol_net, args.rnn) else: raise ValueError('Only Box, Discrete and Multidiscrete are supported') if args.teacher_pol: t_pol.load_state_dict(torch.load( os.path.join(args.teacher_dir, args.teacher_fname))) if args.rnn: s_vf_net = VNetLSTM(observation_space, h_size=256, cell_size=256) else: s_vf_net = VNet(observation_space) if args.sampling_policy == 'teacher':
# Please note that the two policies do not have to have the same hidden architecture if args.rnn: t_pol_net = PolNetLSTM(ob_space, ac_space, h_size=256, cell_size=256) s_pol_net = PolNetLSTM(ob_space, ac_space, h_size=256, cell_size=256) else: t_pol_net = PolNet(ob_space, ac_space) s_pol_net = PolNet(ob_space, ac_space, h1=190, h2=90) if isinstance(ac_space, gym.spaces.Box): t_pol = GaussianPol(ob_space, ac_space, t_pol_net, args.rnn) s_pol = GaussianPol(ob_space, ac_space, s_pol_net, args.rnn) elif isinstance(ac_space, gym.spaces.Discrete): t_pol = CategoricalPol(ob_space, ac_space, t_pol_net, args.rnn) s_pol = CategoricalPol(ob_space, ac_space, s_pol_net, args.rnn) elif isinstance(ac_space, gym.spaces.MultiDiscrete): t_pol = MultiCategoricalPol(ob_space, ac_space, t_pol_net, args.rnn) s_pol = MultiCategoricalPol(ob_space, ac_space, s_pol_net, args.rnn) else: raise ValueError('Only Box, Discrete and Multidiscrete are supported') if args.teacher_pol: t_pol.load_state_dict( torch.load(os.path.join(args.teacher_dir, args.teacher_fname))) if args.rnn: s_vf_net = VNetLSTM(ob_space, h_size=256, cell_size=256) else: s_vf_net = VNet(ob_space) if args.sampling_policy == 'teacher': teacher_sampler = EpiSampler(env,
ac_space, pol_net, args.rnn, data_parallel=args.data_parallel, parallel_dim=1 if args.rnn else 0) elif isinstance(ac_space, gym.spaces.Discrete): pol = CategoricalPol(ob_space, ac_space, pol_net, args.rnn, data_parallel=args.data_parallel, parallel_dim=1 if args.rnn else 0) elif isinstance(ac_space, gym.spaces.MultiDiscrete): pol = MultiCategoricalPol(ob_space, ac_space, pol_net, args.rnn, data_parallel=args.data_parallel, parallel_dim=1 if args.rnn else 0) else: raise ValueError('Only Box, Discrete, and MultiDiscrete are supported') if args.rnn: vf_net = VNetLSTM(ob_space, h_size=256, cell_size=256) else: vf_net = VNet(ob_space) vf = DeterministicSVfunc(ob_space, vf_net, args.rnn, data_parallel=args.data_parallel, parallel_dim=1 if args.rnn else 0)
def main(args):
    """Train a policy and a state-value function with distributed trainers.

    Each iteration pushes the trainers' current policy state to the
    sampler, samples episodes, trains on them, logs the results and writes
    checkpoints under ``<args.log>/models``.  The ``*_last.pkl`` files are
    rewritten every iteration; the ``*_max.pkl`` files are rewritten
    whenever the mean episode return exceeds the best seen so far.

    Args:
        args: parsed command-line namespace.  Attributes read here:
            ``num_cpus``, ``num_gpus``, ``ray_redis_address``, ``log``,
            ``env_name``, ``seed``, ``c2d``, ``num_trainer``,
            ``master_address``, ``num_parallel``, ``max_epis`` and
            ``max_steps_per_iter``.  The whole namespace is also forwarded
            to the trainers and dumped to ``args.json``.

    Raises:
        ValueError: if the action space is not Box, Discrete or
            MultiDiscrete.
    """
    init_ray(args.num_cpus, args.num_gpus, args.ray_redis_address)

    # A single call creates the log directory and its 'models' child and
    # tolerates either already existing (no check-then-create race).
    model_dir = os.path.join(args.log, 'models')
    os.makedirs(model_dir, exist_ok=True)

    score_file = os.path.join(args.log, 'progress.csv')
    logger.add_tabular_output(score_file)
    logger.add_tensorboard_output(args.log)

    with open(os.path.join(args.log, 'args.json'), 'w') as f:
        json.dump(vars(args), f)
    pprint(vars(args))

    # when doing the distributed training, disable video recordings
    env = GymEnv(args.env_name)
    env.env.seed(args.seed)
    if args.c2d:
        env = C2DEnv(env)

    observation_space = env.observation_space
    action_space = env.action_space

    pol_net = PolNet(observation_space, action_space)
    rnn = False
    # NOTE: for a recurrent policy, build ``pol_net`` with
    # PolNetLSTM(observation_space, action_space) and set ``rnn = True``.
    # The value function below is not recurrent and would need the
    # matching change.
    if isinstance(action_space, gym.spaces.Box):
        pol_cls = GaussianPol
    elif isinstance(action_space, gym.spaces.Discrete):
        pol_cls = CategoricalPol
    elif isinstance(action_space, gym.spaces.MultiDiscrete):
        pol_cls = MultiCategoricalPol
    else:
        raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')
    # The recurrent flag is forwarded for every policy type, not only for
    # the Gaussian one, so the three branches stay consistent.
    pol = pol_cls(observation_space, action_space, pol_net, rnn=rnn)

    vf_net = VNet(observation_space)
    vf = DeterministicSVfunc(observation_space, vf_net)

    def save_states(states, tag):
        """Write each ``(name, state)`` pair to ``<name>_<tag>.pkl``."""
        for name, state in states:
            torch.save(
                state, os.path.join(model_dir, '{}_{}.pkl'.format(name, tag)))

    trainer = TrainManager(Trainer, args.num_trainer, args.master_address,
                           args=args, vf=vf, pol=pol)
    try:
        sampler = EpiSampler(env, pol, args.num_parallel, seed=args.seed)
        try:
            total_epi = 0
            total_step = 0
            # Any finite first result counts as an improvement.
            max_rew = -np.inf
            start_time = time.time()

            while args.max_epis > total_epi:
                with measure('sample'):
                    sampler.set_pol_state(trainer.get_state("pol"))
                    epis = sampler.sample(max_steps=args.max_steps_per_iter)

                with measure('train'):
                    result_dict = trainer.train(epis=epis)

                step = result_dict["traj_num_step"]
                total_step += step
                total_epi += result_dict["traj_num_epi"]

                rewards = [np.sum(epi['rews']) for epi in epis]
                mean_rew = np.mean(rewards)

                elapsed_time = time.time() - start_time
                logger.record_tabular('ElapsedTime', elapsed_time)
                logger.record_results(args.log, result_dict, score_file,
                                      total_epi, step, total_step,
                                      rewards,
                                      plot_title=args.env_name)

                with measure('save'):
                    states = [
                        (name, trainer.get_state(name))
                        for name in ('pol', 'vf', 'optim_pol', 'optim_vf')
                    ]
                    save_states(states, 'last')
                    if mean_rew > max_rew:
                        save_states(states, 'max')
                        max_rew = mean_rew
        finally:
            # Drop the sampler reference even when an iteration raises, so
            # its cleanup is not postponed until the traceback is released.
            del sampler
    finally:
        del trainer
log_dir=os.path.join(args.log, 'movie'), record_video=args.record) env.env.seed(args.seed) if args.c2d: env = C2DEnv(env) observation_space = env.observation_space action_space = env.action_space pol_net = PolNet(observation_space, action_space) if isinstance(action_space, gym.spaces.Box): pol = GaussianPol(observation_space, action_space, pol_net) elif isinstance(action_space, gym.spaces.Discrete): pol = CategoricalPol(observation_space, action_space, pol_net) elif isinstance(action_space, gym.spaces.MultiDiscrete): pol = MultiCategoricalPol(observation_space, action_space, pol_net) else: raise ValueError('Only Box, Discrete, and MultiDiscrete are supported') sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f: expert_epis = pickle.load(f) train_epis, test_epis = ef.train_test_split(expert_epis, train_size=args.train_size) train_traj = Traj() train_traj.add_epis(train_epis) train_traj.register_epis() test_traj = Traj() test_traj.add_epis(test_epis)
if args.c2d: env = C2DEnv(env) ob_space = env.observation_space ac_space = env.action_space pol_net = PolNet(ob_space, ac_space) if isinstance(ac_space, gym.spaces.Box): pol = GaussianPol(ob_space, ac_space, pol_net, data_parallel=args.data_parallel) elif isinstance(ac_space, gym.spaces.Discrete): pol = CategoricalPol(ob_space, ac_space, pol_net, data_parallel=args.data_parallel) elif isinstance(ac_space, gym.spaces.MultiDiscrete): pol = MultiCategoricalPol( ob_space, ac_space, pol_net, data_parallel=args.data_parallel) else: raise ValueError('Only Box, Discrete, and MultiDiscrete are supported') vf_net = VNet(ob_space) vf = DeterministicSVfunc(ob_space, vf_net, data_parallel=args.data_parallel) if args.rew_type == 'rew': rewf_net = VNet(ob_space, h1=args.discrim_h1, h2=args.discrim_h2) rewf = DeterministicSVfunc( ob_space, rewf_net, data_parallel=args.data_parallel) shaping_vf_net = VNet(ob_space, h1=args.discrim_h1, h2=args.discrim_h2) shaping_vf = DeterministicSVfunc( ob_space, shaping_vf_net, data_parallel=args.data_parallel) optim_discrim = torch.optim.Adam(
env2 = C2DEnv(env2) assert env1.ob_space == env2.ob_space assert env1.ac_space.shape == env2.ac_space.shape ob_space = env1.observation_space ac_space = env1.action_space pol_net = PolNetLSTM(ob_space, ac_space, h_size=args.h_size, cell_size=args.cell_size) pol = MultiCategoricalPol(ob_space, ac_space, pol_net, True, data_parallel=args.data_parallel, parallel_dim=1) vf_net = VNetLSTM(ob_space, h_size=args.h_size, cell_size=args.cell_size) vf = DeterministicSVfunc(ob_space, vf_net, True, data_parallel=args.data_parallel, parallel_dim=1) sampler1 = EpiSampler(env1, pol, num_parallel=args.num_parallel, seed=args.seed) sampler2 = EpiSampler(env2,
env2 = RewInObEnv(env2) env2 = C2DEnv(env2) assert env1.observation_space == env2.observation_space assert env1.action_space.shape == env2.action_space.shape observation_space = env1.observation_space action_space = env1.action_space if args.rnn: pol_net = PolNetLSTM(observation_space, action_space, h_size=args.h_size, cell_size=args.cell_size) else: pol_net = PolNet(observation_space, action_space) pol = MultiCategoricalPol(observation_space, action_space, pol_net, args.rnn, data_parallel=args.data_parallel, parallel_dim=1 if args.rnn else 0) if args.rnn: vf_net = VNetLSTM(observation_space, h_size=args.h_size, cell_size=args.cell_size) else: vf_net = VNet(observation_space) vf = DeterministicSVfunc(observation_space, vf_net, args.rnn, data_parallel=args.data_parallel, parallel_dim=1 if args.rnn else 0) sampler1 = EpiSampler( env1, pol, num_parallel=args.num_parallel, seed=args.seed) sampler2 = EpiSampler( env2, pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr)
env2 = C2DEnv(env2) assert env1.observation_space == env2.observation_space assert env1.action_space.shape == env2.action_space.shape observation_space = env1.observation_space action_space = env1.action_space if args.rnn: pol_net = PolNetLSTM(observation_space, action_space, h_size=args.h_size, cell_size=args.cell_size) else: pol_net = PolNet(observation_space, action_space) pol = MultiCategoricalPol(observation_space, action_space, pol_net, args.rnn) if args.rnn: vf_net = VNetLSTM(observation_space, h_size=args.h_size, cell_size=args.cell_size) else: vf_net = VNet(observation_space) vf = DeterministicSVfunc(observation_space, vf_net, args.rnn) sampler1 = EpiSampler(env1, pol, num_parallel=args.num_parallel, seed=args.seed) sampler2 = EpiSampler(env2, pol,
action_space, pol_net, args.rnn, data_parallel=args.ddp, parallel_dim=1 if args.rnn else 0) elif isinstance(action_space, gym.spaces.Discrete): pol = CategoricalPol(observation_space, action_space, pol_net, args.rnn, data_parallel=args.ddp, parallel_dim=1 if args.rnn else 0) elif isinstance(action_space, gym.spaces.MultiDiscrete): pol = MultiCategoricalPol(observation_space, action_space, pol_net, args.rnn, data_parallel=args.ddp, parallel_dim=1 if args.rnn else 0) else: raise ValueError('Only Box, Discrete, and MultiDiscrete are supported') if args.rnn: vf_net = VNetLSTM(observation_space, h_size=256, cell_size=256) else: vf_net = VNet(observation_space) vf = DeterministicSVfunc(observation_space, vf_net, args.rnn, data_parallel=args.ddp, parallel_dim=1 if args.rnn else 0)