Example 1
    def test_learning(self):
        qf_net = QNet(self.env.observation_space, self.env.action_space, 32,
                      32)
        lagged_qf_net = QNet(self.env.observation_space, self.env.action_space,
                             32, 32)
        lagged_qf_net.load_state_dict(qf_net.state_dict())
        targ_qf1_net = QNet(self.env.observation_space, self.env.action_space,
                            32, 32)
        targ_qf1_net.load_state_dict(qf_net.state_dict())
        targ_qf2_net = QNet(self.env.observation_space, self.env.action_space,
                            32, 32)
        targ_qf2_net.load_state_dict(lagged_qf_net.state_dict())
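        # Wrap the networks: targ_qf1 maximizes over actions with the
        # cross-entropy method (CEM), while targ_qf2 is a plain deterministic
        # Q-function initialized from the lagged network (QT-Opt style).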
        qf = DeterministicSAVfunc(self.env.observation_space,
                                  self.env.action_space, qf_net)
        lagged_qf = DeterministicSAVfunc(self.env.observation_space,
                                         self.env.action_space, lagged_qf_net)
        targ_qf1 = CEMDeterministicSAVfunc(self.env.observation_space,
                                           self.env.action_space,
                                           targ_qf1_net,
                                           num_sampling=60,
                                           num_best_sampling=6,
                                           num_iter=2,
                                           multivari=False)
        targ_qf2 = DeterministicSAVfunc(self.env.observation_space,
                                        self.env.action_space, targ_qf2_net)

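        # epsilon-greedy policy that picks the (approximate) argmax action of targ_qf1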
        pol = ArgmaxQfPol(self.env.observation_space,
                          self.env.action_space,
                          targ_qf1,
                          eps=0.2)

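        # sampler that runs the policy in the environment to collect episodes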
        sampler = EpiSampler(self.env, pol, num_parallel=1)

        optim_qf = torch.optim.Adam(qf_net.parameters(), 3e-4)

        epis = sampler.sample(pol, max_steps=32)

        traj = Traj()
        traj.add_epis(epis)
        traj = ef.add_next_obs(traj)
        traj.register_epis()

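        # One QT-Opt training run; the positional arguments after optim_qf are
        # presumably the number of epochs, batch size, target-update coefficient,
        # discount factor and the loss type ('mse').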
        result_dict = qtopt.train(traj, qf, lagged_qf, targ_qf1, targ_qf2,
                                  optim_qf, 1000, 32, 0.9999, 0.995, 'mse')

        del sampler
Example 2
    @classmethod
    def setUpClass(cls):
        env = GymEnv('Pendulum-v0')
        cls.env = env
        random_pol = RandomPol(cls.env.observation_space, cls.env.action_space)
        sampler = EpiSampler(cls.env, random_pol, num_parallel=1)
        epis = sampler.sample(random_pol, max_steps=32)
        traj = Traj()
        traj.add_epis(epis)
        traj.register_epis()

        cls.num_step = traj.num_step

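        # connect to a local Redis server; the objects below are shared through it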
        make_redis('localhost', '6379')
        cls.r = get_redis()

        cls.r.set('env', cloudpickle.dumps(env))
        cls.r.set('traj', cloudpickle.dumps(traj))

        pol_net = PolNet(env.observation_space, env.action_space)
        gpol = GaussianPol(env.observation_space, env.action_space, pol_net)
        pol_net = PolNet(env.observation_space,
                         env.action_space, deterministic=True)
        dpol = DeterministicActionNoisePol(
            env.observation_space, env.action_space, pol_net)
        model_net = ModelNet(env.observation_space, env.action_space)
        mpcpol = MPCPol(env.observation_space,
                        env.action_space, model_net, rew_func)
        q_net = QNet(env.observation_space, env.action_space)
        qfunc = DeterministicSAVfunc(
            env.observation_space, env.action_space, q_net)
        aqpol = ArgmaxQfPol(env.observation_space, env.action_space, qfunc)
        v_net = VNet(env.observation_space)
        vfunc = DeterministicSVfunc(env.observation_space, v_net)

        cls.r.set('gpol', cloudpickle.dumps(gpol))
        cls.r.set('dpol', cloudpickle.dumps(dpol))
        cls.r.set('mpcpol', cloudpickle.dumps(mpcpol))
        cls.r.set('qfunc', cloudpickle.dumps(qfunc))
        cls.r.set('aqpol', cloudpickle.dumps(aqpol))
        cls.r.set('vfunc', cloudpickle.dumps(vfunc))

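        # wrap the continuous env so its action space becomes discrete,
        # as a multi-categorical policy requires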
        c2d = C2DEnv(env)
        pol_net = PolNet(c2d.observation_space, c2d.action_space)
        mcpol = MultiCategoricalPol(
            c2d.observation_space, c2d.action_space, pol_net)

        cls.r.set('mcpol', cloudpickle.dumps(mcpol))
Example 3
targ_qf2_net = QTOptNet(observation_space, action_space)
targ_qf2_net.load_state_dict(
    lagged_qf_net.state_dict())  # load the model weights (copied from the lagged net every 6000 steps)
targ_qf2 = DeterministicSAVfunc(flattend_observation_space,
                                action_space,
                                targ_qf2_net,
                                data_parallel=args.data_parallel)

# optimizer for the Q-network
print('optimizer')
optim_qf = torch.optim.Adam(qf_net.parameters(), args.qf_lr)

# epsilon-greedy policy
print('Policy')
pol = ArgmaxQfPol(flattend_observation_space,
                  action_space,
                  targ_qf1,
                  eps=args.eps)

# episode sampler: runs the policy in the environment to collect experience for the replay buffer
print('sampler')
sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)

# off-policy experience. Traj=(s,a,r,s')
off_traj = Traj(args.max_steps_off, traj_device='cpu')

total_epi = 0
total_step = 0
total_grad_step = 0  # number of parameter (gradient) updates
num_update_lagged = 0  # number of lagged-net updates
max_rew = -1000
Example 4
                                 data_parallel=args.data_parallel)
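# target Q-function 1: its maximum over actions is computed with the cross-entropy method (CEM)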
targ_qf1 = CEMDeterministicSAVfunc(ob_space,
                                   ac_space,
                                   targ_qf1_net,
                                   num_sampling=args.num_sampling,
                                   num_best_sampling=args.num_best_sampling,
                                   num_iter=args.num_iter,
                                   multivari=args.multivari,
                                   data_parallel=args.data_parallel,
                                   save_memory=args.save_memory)
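# target Q-function 2: a plain deterministic Q-function (QT-Opt combines the two targets)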
targ_qf2 = DeterministicSAVfunc(ob_space,
                                ac_space,
                                targ_qf2_net,
                                data_parallel=args.data_parallel)

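# epsilon-greedy policy over the argmax of targ_qf1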
pol = ArgmaxQfPol(ob_space, ac_space, targ_qf1, eps=args.eps)

sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)

optim_qf = torch.optim.Adam(qf_net.parameters(), args.qf_lr)

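# off-policy trajectory (replay) buffer, capped at max_steps_off transitions and kept on CPU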
off_traj = Traj(args.max_steps_off, traj_device='cpu')

total_epi = 0
total_step = 0
total_grad_step = 0
num_update_lagged = 0
max_rew = -1e6

while args.max_epis > total_epi:
    with measure('sample'):
Example 5
                                 data_parallel=args.data_parallel)

# target network theta2
targ_qf2_net = QNet(observation_space, action_space, args.h1, args.h2)
targ_qf2_net.load_state_dict(
    lagged_qf_net.state_dict())  # load the model weights (copied from the lagged net every 6000 steps)
targ_qf2 = DeterministicSAVfunc(observation_space,
                                action_space,
                                targ_qf2_net,
                                data_parallel=args.data_parallel)

# optimizer for the Q-network
optim_qf = torch.optim.Adam(qf_net.parameters(), args.qf_lr)

# epsilon-greedy policy
pol = ArgmaxQfPol(observation_space, action_space, targ_qf1, eps=args.eps)

# episode sampler: runs the policy in the environment to collect experience for the replay buffer
sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)

# off-policy experience. Traj=(s,a,r,s')
off_traj = Traj(args.max_steps_off, traj_device='cpu')

total_epi = 0
total_step = 0
total_grad_step = 0  # number of parameter (gradient) updates
num_update_lagged = 0  # number of lagged-net updates
max_rew = -1e6

while args.max_epis > total_epi:
    with measure('sample'):