def test_learning_rnn(self):
    def rew_func(next_obs, acs, mean_obs=0., std_obs=1.,
                 mean_acs=0., std_acs=1.):
        # undo the normalization before computing the reward
        next_obs = next_obs * std_obs + mean_obs
        acs = acs * std_acs + mean_acs
        # Pendulum: angle cost + angular-velocity cost + action cost
        rews = -(torch.acos(next_obs[:, 0].clamp(min=-1, max=1))**2 +
                 0.1 * (next_obs[:, 2].clamp(min=-8, max=8)**2) +
                 0.001 * acs.squeeze(-1)**2)
        rews = rews.squeeze(0)
        return rews

    # init models
    dm_net = ModelNetLSTM(self.env.observation_space, self.env.action_space)
    dm = DeterministicSModel(self.env.observation_space,
                             self.env.action_space, dm_net, rnn=True,
                             data_parallel=False, parallel_dim=0)
    mpc_pol = MPCPol(self.env.observation_space, self.env.action_space,
                     dm_net, rew_func, 1, 1,
                     mean_obs=0., std_obs=1., mean_acs=0., std_acs=1.,
                     rnn=True)
    optim_dm = torch.optim.Adam(dm_net.parameters(), 1e-3)

    # sample with mpc policy
    sampler = EpiSampler(self.env, mpc_pol, num_parallel=1)
    epis = sampler.sample(mpc_pol, max_epis=1)

    traj = Traj()
    traj.add_epis(epis)
    traj = ef.add_next_obs(traj)
    traj = ef.compute_h_masks(traj)
    traj.register_epis()
    # aggregate the sampled data into the training traj
    traj.add_traj(traj)

    # train
    result_dict = mpc.train_dm(traj, dm, optim_dm, epoch=1, batch_size=1)

    del sampler
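# A minimal sketch (not part of the test) of what rew_func computes: the
# negative of Pendulum's cost theta^2 + 0.1*theta_dot^2 + 0.001*torque^2,
# with theta recovered from the [cos(theta), sin(theta), theta_dot]
# observation. The input tensors below are hypothetical, chosen only to show
# the expected batch shapes.
import torch

next_obs = torch.tensor([[1.0, 0.0, 0.0],    # upright, at rest
                         [0.0, 1.0, 4.0]])   # 90 degrees over, spinning
acs = torch.tensor([[0.0], [2.0]])           # applied torques
rews = -(torch.acos(next_obs[:, 0].clamp(min=-1, max=1))**2 +
         0.1 * next_obs[:, 2].clamp(min=-8, max=8)**2 +
         0.001 * acs.squeeze(-1)**2)
# rews[0] == 0 (zero cost at the goal state); rews[1] is roughly -4.07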
@classmethod
def setUpClass(cls):
    env = GymEnv('Pendulum-v0')
    random_pol = RandomPol(env.observation_space, env.action_space)
    sampler = EpiSampler(env, random_pol, num_parallel=1)
    epis = sampler.sample(random_pol, max_steps=32)
    traj = Traj()
    traj.add_epis(epis)
    traj.register_epis()
    cls.num_step = traj.num_step

    make_redis('localhost', '6379')
    cls.r = get_redis()

    # redis stores bytes, so everything that goes in must be pickled
    cls.r.set('env', cloudpickle.dumps(env))
    cls.r.set('traj', cloudpickle.dumps(traj))

    def rew_func(next_obs, acs, mean_obs=0., std_obs=1.,
                 mean_acs=0., std_acs=1.):
        # same Pendulum reward as in the learning tests
        next_obs = next_obs * std_obs + mean_obs
        acs = acs * std_acs + mean_acs
        rews = -(torch.acos(next_obs[:, 0].clamp(min=-1, max=1))**2 +
                 0.1 * (next_obs[:, 2].clamp(min=-8, max=8)**2) +
                 0.001 * acs.squeeze(-1)**2)
        return rews.squeeze(0)

    pol_net = PolNet(env.observation_space, env.action_space)
    gpol = GaussianPol(env.observation_space, env.action_space, pol_net)
    pol_net = PolNet(env.observation_space, env.action_space,
                     deterministic=True)
    dpol = DeterministicActionNoisePol(
        env.observation_space, env.action_space, pol_net)
    model_net = ModelNet(env.observation_space, env.action_space)
    mpcpol = MPCPol(env.observation_space, env.action_space,
                    model_net, rew_func)
    q_net = QNet(env.observation_space, env.action_space)
    qfunc = DeterministicSAVfunc(
        env.observation_space, env.action_space, q_net)
    aqpol = ArgmaxQfPol(env.observation_space, env.action_space, qfunc)
    v_net = VNet(env.observation_space)
    vfunc = DeterministicSVfunc(env.observation_space, v_net)

    cls.r.set('gpol', cloudpickle.dumps(gpol))
    cls.r.set('dpol', cloudpickle.dumps(dpol))
    cls.r.set('mpcpol', cloudpickle.dumps(mpcpol))
    cls.r.set('qfunc', cloudpickle.dumps(qfunc))
    cls.r.set('aqpol', cloudpickle.dumps(aqpol))
    cls.r.set('vfunc', cloudpickle.dumps(vfunc))

    # the discretized env has different spaces, so build the pol from c2d
    c2d = C2DEnv(env)
    pol_net = PolNet(c2d.observation_space, c2d.action_space)
    mcpol = MultiCategoricalPol(
        c2d.observation_space, c2d.action_space, pol_net)
    cls.r.set('mcpol', cloudpickle.dumps(mcpol))
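# A minimal sketch, not part of the test, of how a consumer on the same redis
# instance would recover the stored objects; cloudpickle.loads is the inverse
# of the dumps calls above.
import cloudpickle

r = get_redis()
env = cloudpickle.loads(r.get('env'))
gpol = cloudpickle.loads(r.get('gpol'))
traj = cloudpickle.loads(r.get('traj'))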
### Train Dynamics Model ###

# initialize dynamics model and mpc policy
if args.rnn:
    dm_net = ModelNetLSTM(ob_space, ac_space)
else:
    dm_net = ModelNet(ob_space, ac_space)
dm = DeterministicSModel(ob_space, ac_space, dm_net, args.rnn,
                         data_parallel=args.data_parallel,
                         parallel_dim=1 if args.rnn else 0)
mpc_pol = MPCPol(ob_space, ac_space, dm_net, rew_func,
                 args.n_samples, args.horizon_of_samples,
                 mean_obs, std_obs, mean_acs, std_acs, args.rnn)
optim_dm = torch.optim.Adam(dm_net.parameters(), args.dm_lr)

rl_sampler = EpiSampler(
    env, mpc_pol, num_parallel=args.num_parallel, seed=args.seed)

# train loop
total_epi = 0
total_step = 0
counter_agg_iters = 0
max_rew = -1e+6
while args.max_epis > total_epi:
    with measure('train model'):
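# A minimal sketch (not machina's MPCPol implementation) of the random-shooting
# MPC that args.n_samples and args.horizon_of_samples parameterize: sample
# n_samples candidate action sequences of length horizon, roll each out through
# the learned dynamics model, score the imagined rollouts with rew_func, and
# execute the first action of the best sequence. The dyn_model(obs, acs) call
# signature is an assumption, as is a 1-D Box action space with shared bounds.
import torch


def random_shooting_action(obs, dyn_model, rew_func,
                           n_samples, horizon, ac_space):
    obs = obs.expand(n_samples, -1)                       # (n_samples, ob_dim)
    acs = torch.empty(horizon, n_samples, ac_space.shape[0]).uniform_(
        float(ac_space.low[0]), float(ac_space.high[0]))  # candidate plans
    total_rews = torch.zeros(n_samples)
    for t in range(horizon):
        obs = dyn_model(obs, acs[t])                      # predicted next obs
        total_rews += rew_func(obs, acs[t])               # per-plan reward
    return acs[0, total_rews.argmax()]                    # best first action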