Exemple #1
0
    def test_learning(self):
        """Smoke-test a single QT-Opt training run on freshly sampled data.

        Builds one online Q-network plus three copies of it (lagged and two
        targets), samples episodes with an epsilon-greedy argmax policy
        driven by the CEM target Q-function, and runs ``qtopt.train`` once.
        The test passes as long as nothing raises; no values are asserted.
        """
        ob_space = self.env.observation_space
        ac_space = self.env.action_space

        def build_net(init_from=None):
            # Every network shares the same architecture arguments (32, 32);
            # copies start from the weights of `init_from`.
            net = QNet(ob_space, ac_space, 32, 32)
            if init_from is not None:
                net.load_state_dict(init_from.state_dict())
            return net

        # Construction order is kept: online net first, then its copies.
        qf_net = build_net()
        lagged_qf_net = build_net(qf_net)
        targ_qf1_net = build_net(qf_net)
        targ_qf2_net = build_net(lagged_qf_net)

        qf = DeterministicSAVfunc(ob_space, ac_space, qf_net)
        lagged_qf = DeterministicSAVfunc(ob_space, ac_space, lagged_qf_net)

        # The first target Q-function uses the CEM variant with these
        # sampling settings.
        cem_settings = dict(num_sampling=60,
                            num_best_sampling=6,
                            num_iter=2,
                            multivari=False)
        targ_qf1 = CEMDeterministicSAVfunc(ob_space, ac_space, targ_qf1_net,
                                           **cem_settings)
        targ_qf2 = DeterministicSAVfunc(ob_space, ac_space, targ_qf2_net)

        # The behaviour policy acts through the CEM target Q-function.
        pol = ArgmaxQfPol(ob_space, ac_space, targ_qf1, eps=0.2)
        sampler = EpiSampler(self.env, pol, num_parallel=1)

        # Only the online network's parameters are optimised.
        optim_qf = torch.optim.Adam(qf_net.parameters(), lr=3e-4)

        # Collect episodes and turn them into a trajectory that also carries
        # next observations.
        traj = Traj()
        traj.add_epis(sampler.sample(pol, max_steps=32))
        traj = ef.add_next_obs(traj)
        traj.register_epis()

        # Positional hyper-parameters handed to qtopt.train after optim_qf.
        epoch, batch_size = 1000, 32
        tau, gamma = 0.9999, 0.995
        train_result = qtopt.train(traj, qf, lagged_qf, targ_qf1, targ_qf2,
                                   optim_qf, epoch, batch_size, tau, gamma,
                                   'mse')

        # Drop the sampler explicitly at the end of the test
        # (presumably so its worker resources are released -- confirm).
        del sampler
Exemple #2
0
        # Add `step` to the running total; the same value is reused below as
        # the `epoch` argument passed to qtopt.train.
        total_step += step
        epoch = step

        # When data-parallel mode is requested, raise the dp_run flag on every
        # Q-function wrapper before training.
        if args.data_parallel:
            qf.dp_run = True
            lagged_qf.dp_run = True
            targ_qf1.dp_run = True
            targ_qf2.dp_run = True
        # train
        print('train')
        result_dict = qtopt.train(off_traj,
                                  qf,
                                  lagged_qf,
                                  targ_qf1,
                                  targ_qf2,
                                  optim_qf,
                                  epoch,
                                  args.batch_size,
                                  args.tau,
                                  args.gamma,
                                  loss_type=args.loss_type)

        # Multi-agent parallel processing (dp_run = "data_parallel run"):
        # lower the flag again once training has finished.
        if args.data_parallel:
            qf.dp_run = False
            lagged_qf.dp_run = False
            targ_qf1.dp_run = False
            targ_qf2.dp_run = False

    total_grad_step += epoch
    if total_grad_step >= args.lag * num_update_lagged:  # 6000stepsごとにlagged netを更新