Exemple #1
0
    def test_learning_rnn(self):
        def rew_func(next_obs,
                     acs,
                     mean_obs=0.,
                     std_obs=1.,
                     mean_acs=0.,
                     std_acs=1.):
            next_obs = next_obs * std_obs + mean_obs
            acs = acs * std_acs + mean_acs
            # Pendulum
            rews = -(torch.acos(next_obs[:, 0].clamp(min=-1, max=1))**2 + 0.1 *
                     (next_obs[:, 2].clamp(min=-8, max=8)**2) +
                     0.001 * acs.squeeze(-1)**2)
            rews = rews.squeeze(0)

            return rews

        # init models
        dm_net = ModelNetLSTM(self.env.observation_space,
                              self.env.action_space)
        dm = DeterministicSModel(self.env.observation_space,
                                 self.env.action_space,
                                 dm_net,
                                 rnn=True,
                                 data_parallel=False,
                                 parallel_dim=0)

        mpc_pol = MPCPol(self.env.observation_space,
                         self.env.action_space,
                         dm_net,
                         rew_func,
                         1,
                         1,
                         mean_obs=0.,
                         std_obs=1.,
                         mean_acs=0.,
                         std_acs=1.,
                         rnn=True)
        optim_dm = torch.optim.Adam(dm_net.parameters(), 1e-3)

        # sample with mpc policy
        sampler = EpiSampler(self.env, mpc_pol, num_parallel=1)
        epis = sampler.sample(mpc_pol, max_epis=1)

        traj = Traj()
        traj.add_epis(epis)
        traj = ef.add_next_obs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()
        traj.add_traj(traj)

        # train
        result_dict = mpc.train_dm(traj, dm, optim_dm, epoch=1, batch_size=1)

        del sampler
Exemple #2
0
    def setUpClass(cls):
        env = GymEnv('Pendulum-v0')
        random_pol = RandomPol(cls.env.observation_space, cls.env.action_space)
        sampler = EpiSampler(cls.env, pol, num_parallel=1)
        epis = sampler.sample(pol, max_steps=32)
        traj = Traj()
        traj.add_epis(epis)
        traj.register_epis()

        cls.num_step = traj.num_step

        make_redis('localhost', '6379')
        cls.r = get_redis()

        cls.r.set('env', env)
        cls.r.set('traj', traj)

        pol_net = PolNet(env.observation_space, env.action_space)
        gpol = GaussianPol(env.observation_space, env.action_space, pol_net)
        pol_net = PolNet(env.observation_space,
                         env.action_space, deterministic=True)
        dpol = DeterministicActionNoisePol(
            env.observation_space, env.action_space, pol_net)
        model_net = ModelNet(env.observation_space, env.action_space)
        mpcpol = MPCPol(env.observation_space,
                        env.action_space, model_net, rew_func)
        q_net = QNet(env.observation_space, env.action_space)
        qfunc = DeterministicSAVfunc(
            env.observation_space, env.action_space, q_net)
        aqpol = ArgmaxQfPol(env.observation_space, env.action_space, qfunc)
        v_net = VNet(env.observation_space)
        vfunc = DeterministicSVfunc(env.observation_space, v_net)

        cls.r.set('gpol', cloudpickle.dumps(gpol))
        cls.r.set('dpol', cloudpickle.dumps(dpol))
        cls.r.set('mpcpol', cloudpickle.dumps(mpcpol))
        cls.r.set('qfunc', cloudpickle.dumps(qfunc))
        cls.r.set('aqpol', cloudpickle.dumps(aqpol))
        cls.r.set('vfunc', cloudpickle.dumps(vfunc))

        c2d = C2DEnv(env)
        pol_net = PolNet(c2d.observation_space, c2d.action_space)
        mcpol = MultiCategoricalPol(
            env.observation_space, env.action_space, pol_net)

        cls.r.set('mcpol', cloudpickle.dumps(mcpol))
Exemple #3
0
### Train Dynamics Model ###

# initialize dynamics model and mpc policy
if args.rnn:
    dm_net = ModelNetLSTM(ob_space, ac_space)
else:
    dm_net = ModelNet(ob_space, ac_space)
dm = DeterministicSModel(ob_space,
                         ac_space,
                         dm_net,
                         args.rnn,
                         data_parallel=args.data_parallel,
                         parallel_dim=1 if args.rnn else 0)
mpc_pol = MPCPol(ob_space, ac_space, dm_net, rew_func, args.n_samples,
                 args.horizon_of_samples, mean_obs, std_obs, mean_acs, std_acs,
                 args.rnn)
optim_dm = torch.optim.Adam(dm_net.parameters(), args.dm_lr)

rl_sampler = EpiSampler(env,
                        mpc_pol,
                        num_parallel=args.num_parallel,
                        seed=args.seed)

# train loop
total_epi = 0
total_step = 0
counter_agg_iters = 0
max_rew = -1e+6
while args.max_epis > total_epi:
    with measure('train model'):