Beispiel #1
0
    def test_learning_rnn(self):
        """Smoke-test one PPO-clip and one PPO-KL update with LSTM networks.

        Builds a recurrent Gaussian policy and a recurrent state-value
        function for ``self.env``, samples episodes with ``max_steps=400``,
        prepares the trajectory (values, returns, advantages, hidden-state
        masks) and runs a single epoch of each trainer.

        No assertion is made on the training results: the test passes as
        long as nothing raises.
        """
        pol_net = PolNetLSTM(
            self.env.observation_space, self.env.action_space, h_size=32, cell_size=32)
        pol = GaussianPol(self.env.observation_space,
                          self.env.action_space, pol_net, rnn=True)

        vf_net = VNetLSTM(self.env.observation_space, h_size=32, cell_size=32)
        vf = DeterministicSVfunc(self.env.observation_space, vf_net, rnn=True)

        sampler = EpiSampler(self.env, pol, num_parallel=1)
        try:
            optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
            optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)

            epis = sampler.sample(pol, max_steps=400)

            traj = Traj()
            traj.add_epis(epis)

            # Each helper returns the trajectory it was given, so the calls
            # are chained through the same name.
            # NOTE(review): 0.99 / 0.95 are presumably the discount factor
            # and GAE lambda - confirm against the ef.* signatures.
            traj = ef.compute_vs(traj, vf)
            traj = ef.compute_rets(traj, 0.99)
            traj = ef.compute_advs(traj, 0.99, 0.95)
            traj = ef.centerize_advs(traj)
            traj = ef.compute_h_masks(traj)
            traj.register_epis()

            # Return values are intentionally ignored; only "runs without
            # raising" is being checked here.
            ppo_clip.train(traj=traj, pol=pol, vf=vf, clip_param=0.2,
                           optim_pol=optim_pol, optim_vf=optim_vf, epoch=1, batch_size=2)
            ppo_kl.train(traj=traj, pol=pol, vf=vf, kl_beta=0.1, kl_targ=0.2,
                         optim_pol=optim_pol, optim_vf=optim_vf, epoch=1, batch_size=2, max_grad_norm=20)
        finally:
            # Drop the sampler reference even when a step above raises, so a
            # failing test does not keep it alive through the traceback frame.
            # NOTE(review): presumably EpiSampler shuts down its worker
            # processes when it is collected - confirm.
            del sampler
Beispiel #2
0
    def test_learning_rnn(self):
        """Smoke-test one TRPO update with LSTM policy and value networks.

        Builds a recurrent categorical policy and a recurrent state-value
        function for ``self.env``, samples episodes with ``max_steps=400``,
        prepares the trajectory (values, returns, advantages, hidden-state
        masks) and calls ``trpo.train`` once.

        No assertion is made on the training result: the test passes as
        long as nothing raises.
        """
        pol_net = PolNetLSTM(
            self.env.observation_space, self.env.action_space, h_size=32, cell_size=32)
        pol = CategoricalPol(
            self.env.observation_space, self.env.action_space, pol_net, rnn=True)

        vf_net = VNetLSTM(self.env.observation_space, h_size=32, cell_size=32)
        vf = DeterministicSVfunc(self.env.observation_space, vf_net, rnn=True)

        sampler = EpiSampler(self.env, pol, num_parallel=1)
        try:
            # Only the value function gets an optimizer here; trpo.train is
            # not handed one for the policy.
            optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)

            epis = sampler.sample(pol, max_steps=400)

            traj = Traj()
            traj.add_epis(epis)

            # Each helper returns the trajectory it was given, so the calls
            # are chained through the same name.
            # NOTE(review): 0.99 / 0.95 are presumably the discount factor
            # and GAE lambda - confirm against the ef.* signatures.
            traj = ef.compute_vs(traj, vf)
            traj = ef.compute_rets(traj, 0.99)
            traj = ef.compute_advs(traj, 0.99, 0.95)
            traj = ef.centerize_advs(traj)
            traj = ef.compute_h_masks(traj)
            traj.register_epis()

            # NOTE(review): the trailing positional 1, 2 are presumably
            # epoch and batch_size - confirm against trpo.train.
            # The return value is intentionally ignored; only "runs without
            # raising" is being checked here.
            trpo.train(traj, pol, vf, optim_vf, 1, 2)
        finally:
            # Drop the sampler reference even when a step above raises, so a
            # failing test does not keep it alive through the traceback frame.
            # NOTE(review): presumably EpiSampler shuts down its worker
            # processes when it is collected - confirm.
            del sampler
    s_pol = CategoricalPol(
        observation_space, action_space, s_pol_net, args.rnn)
elif isinstance(action_space, gym.spaces.MultiDiscrete):
    t_pol = MultiCategoricalPol(
        observation_space, action_space, t_pol_net, args.rnn)
    s_pol = MultiCategoricalPol(
        observation_space, action_space, s_pol_net, args.rnn)
else:
    raise ValueError('Only Box, Discrete and Multidiscrete are supported')

# Restore teacher weights from <teacher_dir>/<teacher_fname> when requested.
if args.teacher_pol:
    t_pol.load_state_dict(
        torch.load(os.path.join(args.teacher_dir, args.teacher_fname)))

# Student value network: recurrent variant when --rnn is set.
s_vf_net = (
    VNetLSTM(observation_space, h_size=256, cell_size=256)
    if args.rnn
    else VNet(observation_space)
)

# The teacher sampler only exists when episodes are drawn from the teacher;
# the student sampler is always created.
if args.sampling_policy == 'teacher':
    teacher_sampler = EpiSampler(
        env, t_pol, num_parallel=args.num_parallel, seed=args.seed)

student_sampler = EpiSampler(
    env, s_pol, num_parallel=args.num_parallel, seed=args.seed)
    s_pol = GaussianPol(ob_space, ac_space, s_pol_net, args.rnn)
elif isinstance(ac_space, gym.spaces.Discrete):
    t_pol = CategoricalPol(ob_space, ac_space, t_pol_net, args.rnn)
    s_pol = CategoricalPol(ob_space, ac_space, s_pol_net, args.rnn)
elif isinstance(ac_space, gym.spaces.MultiDiscrete):
    t_pol = MultiCategoricalPol(ob_space, ac_space, t_pol_net, args.rnn)
    s_pol = MultiCategoricalPol(ob_space, ac_space, s_pol_net, args.rnn)
else:
    raise ValueError('Only Box, Discrete and Multidiscrete are supported')

# Restore teacher weights from <teacher_dir>/<teacher_fname> when requested.
if args.teacher_pol:
    t_pol.load_state_dict(torch.load(
        os.path.join(args.teacher_dir, args.teacher_fname)))

# Student value network: recurrent variant when --rnn is set.
s_vf_net = (
    VNetLSTM(ob_space, h_size=256, cell_size=256)
    if args.rnn
    else VNet(ob_space)
)

# The teacher sampler only exists when episodes are drawn from the teacher;
# the student sampler is always created.
if args.sampling_policy == 'teacher':
    teacher_sampler = EpiSampler(
        env, t_pol, num_parallel=args.num_parallel, seed=args.seed)

student_sampler = EpiSampler(
    env, s_pol, num_parallel=args.num_parallel, seed=args.seed)

# Only the student policy network's parameters are optimized here.
optim_pol = torch.optim.Adam(s_pol_net.parameters(), lr=args.pol_lr)
Beispiel #5
0
                         pol_net,
                         args.rnn,
                         data_parallel=args.data_parallel,
                         parallel_dim=1 if args.rnn else 0)
elif isinstance(ac_space, gym.spaces.MultiDiscrete):
    pol = MultiCategoricalPol(ob_space,
                              ac_space,
                              pol_net,
                              args.rnn,
                              data_parallel=args.data_parallel,
                              parallel_dim=1 if args.rnn else 0)
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

# Value network: recurrent variant when --rnn is set.
vf_net = (
    VNetLSTM(ob_space, h_size=256, cell_size=256)
    if args.rnn
    else VNet(ob_space)
)
vf = DeterministicSVfunc(
    ob_space,
    vf_net,
    args.rnn,
    data_parallel=args.data_parallel,
    # Recurrent models are parallelized along dim 1, others along dim 0.
    parallel_dim=1 if args.rnn else 0,
)

sampler = EpiSampler(
    env, pol, num_parallel=args.num_parallel, seed=args.seed)

# Separate Adam optimizers (and learning rates) for policy and value nets.
optim_pol = torch.optim.Adam(pol_net.parameters(), lr=args.pol_lr)
optim_vf = torch.optim.Adam(vf_net.parameters(), lr=args.vf_lr)

# Both counters start at zero.
# NOTE(review): presumably updated by the training loop that follows.
total_epi = total_step = 0
Beispiel #6
0
    def setup_nets(self):
        """Build the policy and state-value function and store them on ``self``.

        The network pair is selected from ``self.args`` in this order:

        * ``mirror is True``   -> ``SymmetricNet`` / ``SymmetricValue``
        * ``rnn``              -> ``PolNetLSTM`` / ``VNetLSTM``
        * ``net_version == 1`` -> ``PolNet`` / ``VNet``
        * otherwise            -> ``PolNetB`` / ``VNetB``

        When ``mirror == "new"`` the networks chosen above are additionally
        wrapped in ``SymNet`` / ``SymVNet``.

        The policy class follows the action-space type (Box, Discrete or
        MultiDiscrete). On success ``self.pol`` and ``self.vf`` are set.

        Raises:
            ValueError: if the action space is none of the supported types.
        """
        obs_space = self.env.observation_space
        act_space = self.env.action_space
        args = self.args

        # ---- policy network -------------------------------------------
        if args.mirror is True:
            print("Initiating a symmetric network")
            pol_net = SymmetricNet(
                *self.env.unwrapped.mirror_sizes,
                hidden_size=int(args.hidden_size / 4),
                num_layers=args.num_layers,
                varying_std=args.varying_std,
                tanh_finish=args.tanh_finish,
                log_std=args.log_stdev,
            )
        elif args.rnn:
            pol_net = PolNetLSTM(obs_space, act_space, h_size=256, cell_size=256)
        elif args.net_version == 1:
            pol_net = PolNet(obs_space, act_space, log_std=args.log_stdev)
        else:
            pol_net = PolNetB(
                obs_space,
                act_space,
                hidden_size=args.hidden_size,
                num_layers=args.num_layers,
                varying_std=args.varying_std,
                tanh_finish=args.tanh_finish,
                log_std=args.log_stdev,
            )

        if args.mirror == "new":
            print("Initiating a new symmetric network")
            # TODO: in this case the action_space for the previous pol_net is incorrect, but it isn't easy to fix ...
            # we can use this for now which just ignores some of the final indices
            pol_net = SymNet(
                pol_net,
                obs_space.shape[0],
                *self.env.unwrapped.sym_act_inds,
                varying_std=args.varying_std,
                log_std=args.log_stdev,
                deterministic=False,
            )

        # ---- policy wrapper -------------------------------------------
        # First matching space type wins; falling off the end is an error.
        for space_type, candidate in (
            (gym.spaces.Box, GaussianPol),
            (gym.spaces.Discrete, CategoricalPol),
            (gym.spaces.MultiDiscrete, MultiCategoricalPol),
        ):
            if isinstance(act_space, space_type):
                pol_class = candidate
                break
        else:
            raise ValueError(
                "Only Box, Discrete, and MultiDiscrete are supported")

        # Policy and value function share the same parallelism settings.
        recurrent = args.rnn
        parallel_kwargs = {
            "data_parallel": args.data_parallel,
            "parallel_dim": 1 if recurrent else 0,
        }

        policy = pol_class(
            obs_space, act_space, pol_net, recurrent, **parallel_kwargs)

        # ---- value network --------------------------------------------
        if args.mirror is True:
            vf_net = SymmetricValue(
                *self.env.unwrapped.mirror_sizes[:3],
                hidden_size=args.hidden_size,
                num_layers=args.num_layers,
            )
        elif recurrent:
            vf_net = VNetLSTM(obs_space, h_size=256, cell_size=256)
        elif args.net_version == 1:
            vf_net = VNet(obs_space)
        else:
            vf_net = VNetB(
                obs_space,
                hidden_size=args.hidden_size,
                num_layers=args.num_layers,
            )

        if args.mirror == "new":
            print("Initiating a new symmetric value network")
            vf_net = SymVNet(vf_net, obs_space.shape[0])

        value_fn = DeterministicSVfunc(
            obs_space, vf_net, recurrent, **parallel_kwargs)

        # Publish only once both objects were built successfully.
        self.pol = policy
        self.vf = value_fn
Beispiel #7
0
# One policy is sampled in both environments below, so their action spaces
# must have the same shape.
assert env1.action_space.shape == env2.action_space.shape

# env1 provides the space definitions used to build the networks.
observation_space = env1.observation_space
action_space = env1.action_space

# Policy network: recurrent variant when --rnn is set.
pol_net = (
    PolNetLSTM(observation_space, action_space,
               h_size=args.h_size, cell_size=args.cell_size)
    if args.rnn
    else PolNet(observation_space, action_space)
)
pol = MultiCategoricalPol(
    observation_space,
    action_space,
    pol_net,
    args.rnn,
    data_parallel=args.data_parallel,
    # Recurrent models are parallelized along dim 1, others along dim 0.
    parallel_dim=1 if args.rnn else 0,
)

# Value network mirrors the policy network's recurrent / feed-forward choice.
vf_net = (
    VNetLSTM(observation_space,
             h_size=args.h_size, cell_size=args.cell_size)
    if args.rnn
    else VNet(observation_space)
)
vf = DeterministicSVfunc(
    observation_space,
    vf_net,
    args.rnn,
    data_parallel=args.data_parallel,
    parallel_dim=1 if args.rnn else 0,
)

# One sampler per environment, both driven by the same policy and seed.
sampler1 = EpiSampler(
    env1, pol, num_parallel=args.num_parallel, seed=args.seed)
sampler2 = EpiSampler(
    env2, pol, num_parallel=args.num_parallel, seed=args.seed)

# Separate Adam optimizers (and learning rates) for policy and value nets.
optim_pol = torch.optim.Adam(pol_net.parameters(), lr=args.pol_lr)
optim_vf = torch.optim.Adam(vf_net.parameters(), lr=args.vf_lr)

# Both counters start at zero.
# NOTE(review): presumably updated by the training loop that follows.
total_epi = total_step = 0