Example 1
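    # Distills a teacher Gaussian policy into a smaller student policy via on-policy distillation.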
    def test_learning(self):
        t_pol_net = PolNet(self.env.observation_space,
                           self.env.action_space, h1=200, h2=100)
        s_pol_net = PolNet(self.env.observation_space,
                           self.env.action_space, h1=190, h2=90)

        t_pol = GaussianPol(
            self.env.observation_space, self.env.action_space, t_pol_net)
        s_pol = GaussianPol(
            self.env.observation_space, self.env.action_space, s_pol_net)

        student_sampler = EpiSampler(self.env, s_pol, num_parallel=1)

        optim_pol = torch.optim.Adam(s_pol.parameters(), 3e-4)

        epis = student_sampler.sample(s_pol, max_steps=32)

        traj = Traj()
        traj.add_epis(epis)

        traj = ef.compute_h_masks(traj)
        traj.register_epis()
        result_dict = on_pol_teacher_distill.train(
            traj=traj,
            student_pol=s_pol,
            teacher_pol=t_pol,
            student_optim=optim_pol,
            epoch=1,
            batchsize=32)

        del student_sampler
Example 2
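    # Trains a Gaussian policy with SVG, using target policy and target Q-function networks.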
    def test_learning(self):
        pol_net = PolNet(self.env.ob_space, self.env.ac_space, h1=32, h2=32)
        pol = GaussianPol(self.env.ob_space, self.env.ac_space, pol_net)

        targ_pol_net = PolNet(self.env.ob_space, self.env.ac_space, 32, 32)
        targ_pol_net.load_state_dict(pol_net.state_dict())
        targ_pol = GaussianPol(
            self.env.ob_space, self.env.ac_space, targ_pol_net)

        qf_net = QNet(self.env.ob_space, self.env.ac_space, h1=32, h2=32)
        qf = DeterministicSAVfunc(self.env.ob_space, self.env.ac_space, qf_net)

        targ_qf_net = QNet(self.env.ob_space, self.env.ac_space, 32, 32)
        targ_qf_net.load_state_dict(qf_net.state_dict())
        targ_qf = DeterministicSAVfunc(
            self.env.ob_space, self.env.ac_space, targ_qf_net)

        sampler = EpiSampler(self.env, pol, num_parallel=1)

        optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
        optim_qf = torch.optim.Adam(qf_net.parameters(), 3e-4)

        epis = sampler.sample(pol, max_steps=32)

        traj = Traj()
        traj.add_epis(epis)

        traj = ef.add_next_obs(traj)
        traj.register_epis()

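        # The positional args after the optimizers are presumably
        # epoch=1, batch_size=32, tau=0.01, gamma=0.9, sampling=1.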
        result_dict = svg.train(
            traj, pol, targ_pol, qf, targ_qf, optim_pol, optim_qf, 1, 32, 0.01, 0.9, 1)

        del sampler
Example 3
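    # Trains a feed-forward Gaussian policy with TRPO using GAE advantages.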
    def test_learning(self):
        pol_net = PolNet(self.env.ob_space, self.env.ac_space, h1=32, h2=32)
        pol = GaussianPol(self.env.ob_space, self.env.ac_space, pol_net)

        vf_net = VNet(self.env.ob_space, h1=32, h2=32)
        vf = DeterministicSVfunc(self.env.ob_space, vf_net)

        sampler = EpiSampler(self.env, pol, num_parallel=1)

        optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)

        epis = sampler.sample(pol, max_steps=32)

        traj = Traj()
        traj.add_epis(epis)

        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, 0.99)
        traj = ef.compute_advs(traj, 0.99, 0.95)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()

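        # The trailing positional args are presumably epoch=1 and batch_size=24.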
        result_dict = trpo.train(traj, pol, vf, optim_vf, 1, 24)

        del sampler
Example 4
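    # Trains LSTM policy and value networks with TRPO (rnn=True).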
    def test_learning_rnn(self):
        pol_net = PolNetLSTM(
            self.env.observation_space, self.env.action_space, h_size=32, cell_size=32)
        pol = GaussianPol(self.env.observation_space,
                          self.env.action_space, pol_net, rnn=True)

        vf_net = VNetLSTM(self.env.observation_space, h_size=32, cell_size=32)
        vf = DeterministicSVfunc(self.env.observation_space, vf_net, rnn=True)

        sampler = EpiSampler(self.env, pol, num_parallel=1)

        optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
        optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)

        epis = sampler.sample(pol, max_steps=400)

        traj = Traj()
        traj.add_epis(epis)

        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, 0.99)
        traj = ef.compute_advs(traj, 0.99, 0.95)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()

        result_dict = trpo.train(traj, pol, vf, optim_vf, 1, 2)

        del sampler
Example 5
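    # Behavior cloning from pickled expert episodes on Pendulum-v0.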
    def test_learning(self):
        pol_net = PolNet(self.env.observation_space,
                         self.env.action_space, h1=32, h2=32)
        pol = GaussianPol(self.env.observation_space,
                          self.env.action_space, pol_net)

        sampler = EpiSampler(self.env, pol, num_parallel=1)

        optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)

        with open(os.path.join('data/expert_epis', 'Pendulum-v0_2epis.pkl'), 'rb') as f:
            expert_epis = pickle.load(f)
        train_epis, test_epis = ef.train_test_split(
            expert_epis, train_size=0.7)
        train_traj = Traj()
        train_traj.add_epis(train_epis)
        train_traj.register_epis()
        test_traj = Traj()
        test_traj.add_epis(test_epis)
        test_traj.register_epis()

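        # The trailing 256 is presumably the batch size.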
        result_dict = behavior_clone.train(
            train_traj, pol, optim_pol,
            256
        )

        del sampler
Example 6
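    # Runs both PPO variants (clipped objective and adaptive KL penalty) on LSTM networks.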
    def test_learning_rnn(self):
        pol_net = PolNetLSTM(
            self.env.observation_space, self.env.action_space, h_size=32, cell_size=32)
        pol = GaussianPol(self.env.observation_space,
                          self.env.action_space, pol_net, rnn=True)

        vf_net = VNetLSTM(self.env.observation_space, h_size=32, cell_size=32)
        vf = DeterministicSVfunc(self.env.observation_space, vf_net, rnn=True)

        sampler = EpiSampler(self.env, pol, num_parallel=1)

        optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
        optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)

        epis = sampler.sample(pol, max_steps=400)

        traj = Traj()
        traj.add_epis(epis)

        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, 0.99)
        traj = ef.compute_advs(traj, 0.99, 0.95)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()

        result_dict = ppo_clip.train(traj=traj, pol=pol, vf=vf, clip_param=0.2,
                                     optim_pol=optim_pol, optim_vf=optim_vf, epoch=1, batch_size=2)
        result_dict = ppo_kl.train(traj=traj, pol=pol, vf=vf, kl_beta=0.1, kl_targ=0.2,
                                   optim_pol=optim_pol, optim_vf=optim_vf, epoch=1, batch_size=2, max_grad_norm=20)

        del sampler
Example 7
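    # DIAYN + SAC: trains a skill-conditioned policy against a diversity discriminator.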
    def test_learning(self):
        ob_space = self.env.real_observation_space
        skill_space = self.env.skill_space
        ob_skill_space = self.env.observation_space
        ac_space = self.env.action_space
        ob_dim = ob_skill_space.shape[0] - 4
        f_dim = ob_dim
        def discrim_f(x): return x

        pol_net = PolNet(ob_skill_space, ac_space)
        pol = GaussianPol(ob_skill_space, ac_space, pol_net)
        qf_net1 = QNet(ob_skill_space, ac_space)
        qf1 = DeterministicSAVfunc(ob_skill_space, ac_space, qf_net1)
        targ_qf_net1 = QNet(ob_skill_space, ac_space)
        targ_qf_net1.load_state_dict(qf_net1.state_dict())
        targ_qf1 = DeterministicSAVfunc(ob_skill_space, ac_space, targ_qf_net1)
        qf_net2 = QNet(ob_skill_space, ac_space)
        qf2 = DeterministicSAVfunc(ob_skill_space, ac_space, qf_net2)
        targ_qf_net2 = QNet(ob_skill_space, ac_space)
        targ_qf_net2.load_state_dict(qf_net2.state_dict())
        targ_qf2 = DeterministicSAVfunc(ob_skill_space, ac_space, targ_qf_net2)
        qfs = [qf1, qf2]
        targ_qfs = [targ_qf1, targ_qf2]
        log_alpha = nn.Parameter(torch.ones(()))

        high = np.array([np.finfo(np.float32).max]*f_dim)
        f_space = gym.spaces.Box(-high, high, dtype=np.float32)
        discrim_net = DiaynDiscrimNet(
            f_space, skill_space, h_size=100, discrim_f=discrim_f)
        discrim = DeterministicSVfunc(f_space, discrim_net)

        optim_pol = torch.optim.Adam(pol_net.parameters(), 1e-4)
        optim_qf1 = torch.optim.Adam(qf_net1.parameters(), 3e-4)
        optim_qf2 = torch.optim.Adam(qf_net2.parameters(), 3e-4)
        optim_qfs = [optim_qf1, optim_qf2]
        optim_alpha = torch.optim.Adam([log_alpha], 1e-4)
        optim_discrim = torch.optim.SGD(discrim.parameters(),
                                        lr=0.001, momentum=0.9)

        off_traj = Traj()
        sampler = EpiSampler(self.env, pol, num_parallel=1)

        epis = sampler.sample(pol, max_steps=200)
        on_traj = Traj()
        on_traj.add_epis(epis)
        on_traj = ef.add_next_obs(on_traj)
        on_traj = ef.compute_diayn_rews(
            on_traj, lambda x: diayn_sac.calc_rewards(x, 4, discrim))
        on_traj.register_epis()
        off_traj.add_traj(on_traj)
        step = on_traj.num_step
        log_alpha = nn.Parameter(np.log(0.1)*torch.ones(()))  # fix alpha
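        # The trailing args presumably map to: training steps, batch_size=128,
        # tau=5e-3, gamma=0.99, sampling=1, then the discriminator, the number
        # of skills (4), and a boolean option.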
        result_dict = diayn_sac.train(
            off_traj, pol, qfs, targ_qfs, log_alpha,
            optim_pol, optim_qfs, optim_alpha,
            step, 128, 5e-3, 0.99, 1, discrim, 4, True)
        discrim_losses = diayn.train(
            discrim, optim_discrim, on_traj, 32, 100, 4)

        del sampler
Example 8
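    # SAC with twin Q-functions, target networks, and a learned temperature (log_alpha).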
    def test_learning(self):
        pol_net = PolNet(self.env.ob_space, self.env.ac_space, h1=32, h2=32)
        pol = GaussianPol(self.env.ob_space, self.env.ac_space, pol_net)

        qf_net1 = QNet(self.env.ob_space, self.env.ac_space)
        qf1 = DeterministicSAVfunc(self.env.ob_space, self.env.ac_space,
                                   qf_net1)
        targ_qf_net1 = QNet(self.env.ob_space, self.env.ac_space)
        targ_qf_net1.load_state_dict(qf_net1.state_dict())
        targ_qf1 = DeterministicSAVfunc(self.env.ob_space, self.env.ac_space,
                                        targ_qf_net1)

        qf_net2 = QNet(self.env.ob_space, self.env.ac_space)
        qf2 = DeterministicSAVfunc(self.env.ob_space, self.env.ac_space,
                                   qf_net2)
        targ_qf_net2 = QNet(self.env.ob_space, self.env.ac_space)
        targ_qf_net2.load_state_dict(qf_net2.state_dict())
        targ_qf2 = DeterministicSAVfunc(self.env.ob_space, self.env.ac_space,
                                        targ_qf_net2)

        qfs = [qf1, qf2]
        targ_qfs = [targ_qf1, targ_qf2]

        log_alpha = nn.Parameter(torch.zeros(()))

        sampler = EpiSampler(self.env, pol, num_parallel=1)

        optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
        optim_qf1 = torch.optim.Adam(qf_net1.parameters(), 3e-4)
        optim_qf2 = torch.optim.Adam(qf_net2.parameters(), 3e-4)
        optim_qfs = [optim_qf1, optim_qf2]
        optim_alpha = torch.optim.Adam([log_alpha], 3e-4)

        epis = sampler.sample(pol, max_steps=32)

        traj = Traj()
        traj.add_epis(epis)

        traj = ef.add_next_obs(traj)
        traj.register_epis()

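        # Trailing positional args are presumably epoch=2, batch_size=32,
        # tau=0.01, gamma=0.99, sampling=2.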
        result_dict = sac.train(
            traj,
            pol,
            qfs,
            targ_qfs,
            log_alpha,
            optim_pol,
            optim_qfs,
            optim_alpha,
            2,
            32,
            0.01,
            0.99,
            2,
        )

        del sampler
Example 9
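    # GAIL: alternates TRPO policy updates with discriminator training on expert data.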
    def test_learning(self):
        pol_net = PolNet(self.env.observation_space,
                         self.env.action_space,
                         h1=32,
                         h2=32)
        pol = GaussianPol(self.env.observation_space, self.env.action_space,
                          pol_net)

        vf_net = VNet(self.env.observation_space)
        vf = DeterministicSVfunc(self.env.observation_space, vf_net)

        discrim_net = DiscrimNet(self.env.observation_space,
                                 self.env.action_space,
                                 h1=32,
                                 h2=32)
        discrim = DeterministicSAVfunc(self.env.observation_space,
                                       self.env.action_space, discrim_net)

        sampler = EpiSampler(self.env, pol, num_parallel=1)

        optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)
        optim_discrim = torch.optim.Adam(discrim_net.parameters(), 3e-4)

        with open(os.path.join('data/expert_epis', 'Pendulum-v0_2epis.pkl'),
                  'rb') as f:
            expert_epis = pickle.load(f)
        expert_traj = Traj()
        expert_traj.add_epis(expert_epis)
        expert_traj.register_epis()

        epis = sampler.sample(pol, max_steps=32)

        agent_traj = Traj()
        agent_traj.add_epis(epis)
        agent_traj = ef.compute_pseudo_rews(agent_traj, discrim)
        agent_traj = ef.compute_vs(agent_traj, vf)
        agent_traj = ef.compute_rets(agent_traj, 0.99)
        agent_traj = ef.compute_advs(agent_traj, 0.99, 0.95)
        agent_traj = ef.centerize_advs(agent_traj)
        agent_traj = ef.compute_h_masks(agent_traj)
        agent_traj.register_epis()

        result_dict = gail.train(agent_traj,
                                 expert_traj,
                                 pol,
                                 vf,
                                 discrim,
                                 optim_vf,
                                 optim_discrim,
                                 rl_type='trpo',
                                 epoch=1,
                                 batch_size=32,
                                 discrim_batch_size=32,
                                 discrim_step=1,
                                 pol_ent_beta=1e-3,
                                 discrim_ent_beta=1e-5)

        del sampler
Example 10
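    # AIRL with a state-only reward function and a shaping value function.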
    def test_learning(self):
        pol_net = PolNet(self.env.ob_space, self.env.ac_space, h1=32, h2=32)
        pol = GaussianPol(self.env.ob_space, self.env.ac_space, pol_net)

        vf_net = VNet(self.env.ob_space)
        vf = DeterministicSVfunc(self.env.ob_space, vf_net)

        rewf_net = VNet(self.env.ob_space, h1=32, h2=32)
        rewf = DeterministicSVfunc(self.env.ob_space, rewf_net)
        shaping_vf_net = VNet(self.env.ob_space, h1=32, h2=32)
        shaping_vf = DeterministicSVfunc(self.env.ob_space, shaping_vf_net)

        sampler = EpiSampler(self.env, pol, num_parallel=1)

        optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)
        optim_discrim = torch.optim.Adam(
            list(rewf_net.parameters()) + list(shaping_vf_net.parameters()), 3e-4)

        with open(os.path.join('data/expert_epis', 'Pendulum-v0_2epis.pkl'), 'rb') as f:
            expert_epis = pickle.load(f)
        expert_traj = Traj()
        expert_traj.add_epis(expert_epis)
        expert_traj = ef.add_next_obs(expert_traj)
        expert_traj.register_epis()

        epis = sampler.sample(pol, max_steps=32)

        agent_traj = Traj()
        agent_traj.add_epis(epis)
        agent_traj = ef.add_next_obs(agent_traj)
        agent_traj = ef.compute_pseudo_rews(
            agent_traj, rew_giver=rewf, state_only=True)
        agent_traj = ef.compute_vs(agent_traj, vf)
        agent_traj = ef.compute_rets(agent_traj, 0.99)
        agent_traj = ef.compute_advs(agent_traj, 0.99, 0.95)
        agent_traj = ef.centerize_advs(agent_traj)
        agent_traj = ef.compute_h_masks(agent_traj)
        agent_traj.register_epis()

        result_dict = airl.train(agent_traj, expert_traj, pol, vf, optim_vf, optim_discrim,
                                 rewf=rewf, shaping_vf=shaping_vf,
                                 rl_type='trpo',
                                 epoch=1,
                                 batch_size=32, discrim_batch_size=32,
                                 discrim_step=1,
                                 pol_ent_beta=1e-3, gamma=0.99)

        del sampler
Example 11
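    # Shared fixture: samples random episodes and stores pickled policies and value functions in Redis.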
    def setUpClass(cls):
        env = GymEnv('Pendulum-v0')
        cls.env = env
        random_pol = RandomPol(env.observation_space, env.action_space)
        sampler = EpiSampler(env, random_pol, num_parallel=1)
        epis = sampler.sample(random_pol, max_steps=32)
        traj = Traj()
        traj.add_epis(epis)
        traj.register_epis()

        cls.num_step = traj.num_step

        make_redis('localhost', '6379')
        cls.r = get_redis()

        cls.r.set('env', cloudpickle.dumps(env))
        cls.r.set('traj', cloudpickle.dumps(traj))

        pol_net = PolNet(env.observation_space, env.action_space)
        gpol = GaussianPol(env.observation_space, env.action_space, pol_net)
        pol_net = PolNet(env.observation_space,
                         env.action_space, deterministic=True)
        dpol = DeterministicActionNoisePol(
            env.observation_space, env.action_space, pol_net)
        model_net = ModelNet(env.observation_space, env.action_space)
        mpcpol = MPCPol(env.observation_space,
                        env.action_space, model_net, rew_func)
        q_net = QNet(env.observation_space, env.action_space)
        qfunc = DeterministicSAVfunc(
            env.observation_space, env.action_space, q_net)
        aqpol = ArgmaxQfPol(env.observation_space, env.action_space, qfunc)
        v_net = VNet(env.observation_space)
        vfunc = DeterministicSVfunc(env.observation_space, v_net)

        cls.r.set('gpol', cloudpickle.dumps(gpol))
        cls.r.set('dpol', cloudpickle.dumps(dpol))
        cls.r.set('mpcpol', cloudpickle.dumps(mpcpol))
        cls.r.set('qfunc', cloudpickle.dumps(qfunc))
        cls.r.set('aqpol', cloudpickle.dumps(aqpol))
        cls.r.set('vfunc', cloudpickle.dumps(vfunc))

        c2d = C2DEnv(env)
        pol_net = PolNet(c2d.observation_space, c2d.action_space)
        mcpol = MultiCategoricalPol(
            c2d.observation_space, c2d.action_space, pol_net)

        cls.r.set('mcpol', cloudpickle.dumps(mcpol))
Example 12
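    # PPO (clipped) with a dict observation space policy network.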
    def test_learning(self):
        pol_net = PolDictNet(self.dict_observation_space,
                             self.env.action_space,
                             h1=32,
                             h2=32)
        pol = GaussianPol(self.env.observation_space, self.env.action_space,
                          pol_net)

        vf_net = VNet(self.env.observation_space, h1=32, h2=32)
        vf = DeterministicSVfunc(self.env.observation_space, vf_net)

        sampler = EpiSampler(self.env, pol, num_parallel=1)

        optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
        optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)

        epis = sampler.sample(pol, max_steps=32)

        traj = Traj()
        traj.add_epis(epis)

        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, 0.99)
        traj = ef.compute_advs(traj, 0.99, 0.95)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()

        result_dict = ppo_clip.train(traj=traj,
                                     pol=pol,
                                     vf=vf,
                                     clip_param=0.2,
                                     optim_pol=optim_pol,
                                     optim_vf=optim_vf,
                                     epoch=1,
                                     batch_size=32)

        del sampler
Example 13
env = GymEnv(args.env_name,
             log_dir=os.path.join(args.log, 'movie'),
             record_video=args.record)
env.env.seed(args.seed)
if args.c2d:
    env = C2DEnv(env)

ob_space = env.observation_space
ac_space = env.action_space

if args.rnn:
    pol_net = PolNetLSTM(ob_space, ac_space, h_size=256, cell_size=256)
else:
    pol_net = PolNet(ob_space, ac_space)
if isinstance(ac_space, gym.spaces.Box):
    pol = GaussianPol(ob_space,
                      ac_space,
                      pol_net,
                      args.rnn,
                      data_parallel=args.data_parallel,
                      parallel_dim=1 if args.rnn else 0)
elif isinstance(ac_space, gym.spaces.Discrete):
    pol = CategoricalPol(ob_space,
                         ac_space,
                         pol_net,
                         args.rnn,
                         data_parallel=args.data_parallel,
                         parallel_dim=1 if args.rnn else 0)
elif isinstance(ac_space, gym.spaces.MultiDiscrete):
    pol = MultiCategoricalPol(ob_space,
                              ac_space,
                              pol_net,
                              args.rnn,
                              data_parallel=args.data_parallel,
                              parallel_dim=1 if args.rnn else 0)
Example 14
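# Distributed training entry point: a Ray-based TrainManager trains while an EpiSampler collects episodes.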
def main(args):
    init_ray(args.num_cpus, args.num_gpus, args.ray_redis_address)

    if not os.path.exists(args.log):
        os.makedirs(args.log)
    if not os.path.exists(os.path.join(args.log, 'models')):
        os.mkdir(os.path.join(args.log, 'models'))
    score_file = os.path.join(args.log, 'progress.csv')
    logger.add_tabular_output(score_file)
    logger.add_tensorboard_output(args.log)
    with open(os.path.join(args.log, 'args.json'), 'w') as f:
        json.dump(vars(args), f)
    pprint(vars(args))

    # when doing the distributed training, disable video recordings
    env = GymEnv(args.env_name)
    env.env.seed(args.seed)
    if args.c2d:
        env = C2DEnv(env)

    observation_space = env.observation_space
    action_space = env.action_space
    pol_net = PolNet(observation_space, action_space)
    rnn = False
    # pol_net = PolNetLSTM(observation_space, action_space)
    # rnn = True
    if isinstance(action_space, gym.spaces.Box):
        pol = GaussianPol(observation_space, action_space, pol_net, rnn=rnn)
    elif isinstance(action_space, gym.spaces.Discrete):
        pol = CategoricalPol(observation_space, action_space, pol_net)
    elif isinstance(action_space, gym.spaces.MultiDiscrete):
        pol = MultiCategoricalPol(observation_space, action_space, pol_net)
    else:
        raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

    vf_net = VNet(observation_space)
    vf = DeterministicSVfunc(observation_space, vf_net)

    trainer = TrainManager(Trainer,
                           args.num_trainer,
                           args.master_address,
                           args=args,
                           vf=vf,
                           pol=pol)
    sampler = EpiSampler(env, pol, args.num_parallel, seed=args.seed)

    total_epi = 0
    total_step = 0
    max_rew = -1e6
    start_time = time.time()

    while args.max_epis > total_epi:

        with measure('sample'):
            sampler.set_pol_state(trainer.get_state("pol"))
            epis = sampler.sample(max_steps=args.max_steps_per_iter)

        with measure('train'):
            result_dict = trainer.train(epis=epis)

        step = result_dict["traj_num_step"]
        total_step += step
        total_epi += result_dict["traj_num_epi"]

        rewards = [np.sum(epi['rews']) for epi in epis]
        mean_rew = np.mean(rewards)
        elapsed_time = time.time() - start_time
        logger.record_tabular('ElapsedTime', elapsed_time)
        logger.record_results(args.log,
                              result_dict,
                              score_file,
                              total_epi,
                              step,
                              total_step,
                              rewards,
                              plot_title=args.env_name)

        with measure('save'):
            pol_state = trainer.get_state("pol")
            vf_state = trainer.get_state("vf")
            optim_pol_state = trainer.get_state("optim_pol")
            optim_vf_state = trainer.get_state("optim_vf")

            torch.save(pol_state,
                       os.path.join(args.log, 'models', 'pol_last.pkl'))
            torch.save(vf_state, os.path.join(args.log, 'models',
                                              'vf_last.pkl'))
            torch.save(optim_pol_state,
                       os.path.join(args.log, 'models', 'optim_pol_last.pkl'))
            torch.save(optim_vf_state,
                       os.path.join(args.log, 'models', 'optim_vf_last.pkl'))

            if mean_rew > max_rew:
                torch.save(pol_state,
                           os.path.join(args.log, 'models', 'pol_max.pkl'))
                torch.save(vf_state,
                           os.path.join(args.log, 'models', 'vf_max.pkl'))
                torch.save(
                    optim_pol_state,
                    os.path.join(args.log, 'models', 'optim_pol_max.pkl'))
                torch.save(
                    optim_vf_state,
                    os.path.join(args.log, 'models', 'optim_vf_max.pkl'))
                max_rew = mean_rew
    del sampler
    del trainer
Example 15
env = GymEnv(args.env_name,
             log_dir=os.path.join(args.log, 'movie'),
             record_video=args.record)
env.env.seed(args.seed)
if args.c2d:
    env = C2DEnv(env)

ob_space = env.observation_space
ac_space = env.action_space

if args.rnn:
    pol_net = PolNetLSTM(ob_space, ac_space, h_size=256, cell_size=256)
else:
    pol_net = PolNet(ob_space, ac_space)
if isinstance(ac_space, gym.spaces.Box):
    pol = GaussianPol(ob_space, ac_space, pol_net, args.rnn)
elif isinstance(ac_space, gym.spaces.Discrete):
    pol = CategoricalPol(ob_space, ac_space, pol_net, args.rnn)
elif isinstance(ac_space, gym.spaces.MultiDiscrete):
    pol = MultiCategoricalPol(ob_space, ac_space, pol_net, args.rnn)
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

if args.rnn:
    vf_net = VNetLSTM(ob_space, h_size=256, cell_size=256)
else:
    vf_net = VNet(ob_space)
vf = DeterministicSVfunc(ob_space, vf_net, args.rnn)

sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)
optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr)
logger.add_tabular_output(score_file)

Example 16

env = GymEnv(args.env_name,
             log_dir=os.path.join(args.log, 'movie'),
             record_video=args.record)
env.env.seed(args.seed)
if args.c2d:
    env = C2DEnv(env)

observation_space = env.observation_space
action_space = env.action_space

pol_net = PolNet(observation_space, action_space)
if isinstance(action_space, gym.spaces.Box):
    pol = GaussianPol(observation_space,
                      action_space,
                      pol_net,
                      data_parallel=args.data_parallel)
elif isinstance(action_space, gym.spaces.Discrete):
    pol = CategoricalPol(observation_space,
                         action_space,
                         pol_net,
                         data_parallel=args.data_parallel)
elif isinstance(action_space, gym.spaces.MultiDiscrete):
    pol = MultiCategoricalPol(observation_space,
                              action_space,
                              pol_net,
                              data_parallel=args.data_parallel)
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)
Example 17
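# Loads the best saved policy checkpoint and replays it with rendering.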
log_dir_name = 'garbage_airl_20190915_2'
env_name = 'RoboschoolPremaidAIWalker-v0'
env = gym.make(env_name)
env.seed(seed)

# check dimension of observation space and action space
observation_space = env.observation_space
action_space = env.action_space

# policy
pol_net = PolNet(observation_space, action_space)

# load best policy
best_path = f'{log_dir_name}/models/pol_max.pkl'
best_pol = GaussianPol(observation_space, action_space, pol_net)
best_pol.load_state_dict(torch.load(best_path))

# show trained policy's behavior
done = False
o = env.reset()
for _ in range(1000):  # show 16.5 sec (0.0165 * 1000)
    if done:
        time.sleep(1)  # pause at the episode boundary
        o = env.reset()
    ac_real, ac, a_i = best_pol.deterministic_ac_real(
        torch.tensor(o, dtype=torch.float))
    ac_real = ac_real.reshape(best_pol.action_space.shape)
    next_o, r, done, e_i = env.step(np.array(ac_real))
    o = next_o
    env.render()
Example 18
score_file = os.path.join(args.log, 'progress.csv')
logger.add_tabular_output(score_file)

env = GymEnv(args.env_name, log_dir=os.path.join(
    args.log, 'movie'), record_video=args.record)
env.env.seed(args.seed)
if args.c2d:
    env = C2DEnv(env)

ob_space = env.observation_space
ac_space = env.action_space


pol_net = PolNet(ob_space, ac_space)
if isinstance(ac_space, gym.spaces.Box):
    pol = GaussianPol(ob_space, ac_space, pol_net,
                      data_parallel=args.data_parallel)
elif isinstance(ac_space, gym.spaces.Discrete):
    pol = CategoricalPol(ob_space, ac_space, pol_net,
                         data_parallel=args.data_parallel)
elif isinstance(ac_space, gym.spaces.MultiDiscrete):
    pol = MultiCategoricalPol(
        ob_space, ac_space, pol_net, data_parallel=args.data_parallel)
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

vf_net = VNet(ob_space)
vf = DeterministicSVfunc(ob_space, vf_net,
                         data_parallel=args.data_parallel)

if args.rew_type == 'rew':
    rewf_net = VNet(ob_space, h1=args.discrim_h1, h2=args.discrim_h2)
Example 19
observation_space = env.observation_space
action_space = env.action_space

# Create the teacher (t) policy and the student (s) policy, then load the teacher policy.
# Note that the two policies do not have to share the same hidden architecture.

if args.rnn:
    t_pol_net = PolNetLSTM(observation_space, action_space,
                           h_size=256, cell_size=256)
    s_pol_net = PolNetLSTM(observation_space, action_space,
                           h_size=256, cell_size=256)
else:
    t_pol_net = PolNet(observation_space, action_space)
    s_pol_net = PolNet(observation_space, action_space, h1=190, h2=90)
if isinstance(action_space, gym.spaces.Box):
    t_pol = GaussianPol(observation_space, action_space, t_pol_net, args.rnn)
    s_pol = GaussianPol(observation_space, action_space, s_pol_net, args.rnn)
elif isinstance(action_space, gym.spaces.Discrete):
    t_pol = CategoricalPol(
        observation_space, action_space, t_pol_net, args.rnn)
    s_pol = CategoricalPol(
        observation_space, action_space, s_pol_net, args.rnn)
elif isinstance(action_space, gym.spaces.MultiDiscrete):
    t_pol = MultiCategoricalPol(
        observation_space, action_space, t_pol_net, args.rnn)
    s_pol = MultiCategoricalPol(
        observation_space, action_space, s_pol_net, args.rnn)
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

if args.teacher_pol:
Example 20
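    # SAC with LSTM Q-functions and sequence priorities (R2D2-style replay).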
    def test_learning(self):
        pol_net = PolNetLSTM(
            self.env.observation_space, self.env.action_space, h_size=32, cell_size=32)
        pol = GaussianPol(self.env.observation_space,
                          self.env.action_space, pol_net, rnn=True)

        qf_net1 = QNetLSTM(self.env.observation_space,
                           self.env.action_space, h_size=32, cell_size=32)
        qf1 = DeterministicSAVfunc(
            self.env.observation_space, self.env.action_space, qf_net1, rnn=True)
        targ_qf_net1 = QNetLSTM(
            self.env.observation_space, self.env.action_space, h_size=32, cell_size=32)
        targ_qf_net1.load_state_dict(qf_net1.state_dict())
        targ_qf1 = DeterministicSAVfunc(
            self.env.observation_space, self.env.action_space, targ_qf_net1, rnn=True)

        qf_net2 = QNetLSTM(self.env.observation_space,
                           self.env.action_space, h_size=32, cell_size=32)
        qf2 = DeterministicSAVfunc(
            self.env.observation_space, self.env.action_space, qf_net2, rnn=True)
        targ_qf_net2 = QNetLSTM(
            self.env.observation_space, self.env.action_space, h_size=32, cell_size=32)
        targ_qf_net2.load_state_dict(qf_net2.state_dict())
        targ_qf2 = DeterministicSAVfunc(
            self.env.observation_space, self.env.action_space, targ_qf_net2, rnn=True)

        qfs = [qf1, qf2]
        targ_qfs = [targ_qf1, targ_qf2]

        log_alpha = nn.Parameter(torch.zeros(()))

        sampler = EpiSampler(self.env, pol, num_parallel=1)

        optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
        optim_qf1 = torch.optim.Adam(qf_net1.parameters(), 3e-4)
        optim_qf2 = torch.optim.Adam(qf_net2.parameters(), 3e-4)
        optim_qfs = [optim_qf1, optim_qf2]
        optim_alpha = torch.optim.Adam([log_alpha], 3e-4)

        epis = sampler.sample(pol, max_steps=32)

        traj = Traj()
        traj.add_epis(epis)

        traj = ef.add_next_obs(traj)
        max_pri = traj.get_max_pri()
        traj = ef.set_all_pris(traj, max_pri)
        traj = ef.compute_seq_pris(traj, 4)
        traj = ef.compute_h_masks(traj)
        for i in range(len(qfs)):
            traj = ef.compute_hs(
                traj, qfs[i], hs_name='q_hs'+str(i), input_acs=True)
            traj = ef.compute_hs(
                traj, targ_qfs[i], hs_name='targ_q_hs'+str(i), input_acs=True)
        traj.register_epis()

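        # Trailing positional args are presumably epoch=2, batch_size=32,
        # seq_length=4, burn_in_length=2, tau=0.01, gamma=0.99, sampling=2.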
        result_dict = r2d2_sac.train(
            traj,
            pol, qfs, targ_qfs, log_alpha,
            optim_pol, optim_qfs, optim_alpha,
            2, 32, 4, 2,
            0.01, 0.99, 2,
        )

        del sampler
Example 21
env.env.seed(args.seed)
if args.c2d:
    env = C2DEnv(env)

observation_space = env.observation_space
action_space = env.action_space

if args.rnn:
    pol_net = PolNetLSTM(observation_space,
                         action_space,
                         h_size=256,
                         cell_size=256)
else:
    pol_net = PolNet(observation_space, action_space)
if isinstance(action_space, gym.spaces.Box):
    pol = GaussianPol(observation_space, action_space, pol_net, args.rnn)
elif isinstance(action_space, gym.spaces.Discrete):
    pol = CategoricalPol(observation_space, action_space, pol_net, args.rnn)
elif isinstance(action_space, gym.spaces.MultiDiscrete):
    pol = MultiCategoricalPol(observation_space, action_space, pol_net,
                              args.rnn)
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

if args.rnn:
    vf_net = VNetLSTM(observation_space, h_size=256, cell_size=256)
else:
    vf_net = VNet(observation_space)
vf = DeterministicSVfunc(observation_space, vf_net, args.rnn)

if rank == 0:
Example 22
env = gym.make(args.env_name)
env = SkillEnv(env, num_skill=args.num_skill)
obs = env.reset()
observation_space = env.real_observation_space
skill_space = env.skill_space
ob_skill_space = env.observation_space
action_space = env.action_space
ob_dim = ob_skill_space.shape[0] - args.num_skill
device_name = 'cpu' if args.cuda < 0 else "cuda:{}".format(args.cuda)
device = torch.device(device_name)
set_device(device)

# policy
pol_net = PolNet(ob_skill_space, action_space)
pol = GaussianPol(ob_skill_space, action_space, pol_net,
                  data_parallel=args.data_parallel, parallel_dim=0)

# q-function
qf_net1 = QNet(ob_skill_space, action_space)
qf1 = DeterministicSAVfunc(ob_skill_space, action_space, qf_net1,
                           data_parallel=args.data_parallel, parallel_dim=0)
targ_qf_net1 = QNet(ob_skill_space, action_space)
targ_qf_net1.load_state_dict(qf_net1.state_dict())
targ_qf1 = DeterministicSAVfunc(
    ob_skill_space, action_space, targ_qf_net1, data_parallel=args.data_parallel, parallel_dim=0)
qf_net2 = QNet(ob_skill_space, action_space)
qf2 = DeterministicSAVfunc(ob_skill_space, action_space, qf_net2,
                           data_parallel=args.data_parallel, parallel_dim=0)
targ_qf_net2 = QNet(ob_skill_space, action_space)
targ_qf_net2.load_state_dict(qf_net2.state_dict())
targ_qf2 = DeterministicSAVfunc(
    ob_skill_space, action_space, targ_qf_net2, data_parallel=args.data_parallel, parallel_dim=0)
Example 23
device = torch.device(device_name)
set_device(device)

score_file = os.path.join(args.log, 'progress.csv')
logger.add_tabular_output(score_file)

env = GymEnv(args.env_name,
             log_dir=os.path.join(args.log, 'movie'),
             record_video=args.record)
env.env.seed(args.seed)

ob_space = env.observation_space
ac_space = env.action_space

pol_net = PolNet(ob_space, ac_space)
pol = GaussianPol(ob_space, ac_space, pol_net)

targ_pol_net = PolNet(ob_space, ac_space)
targ_pol_net.load_state_dict(pol_net.state_dict())
targ_pol = GaussianPol(ob_space, ac_space, targ_pol_net)

qf_net = QNet(ob_space, ac_space)
qf = DeterministicSAVfunc(ob_space, ac_space, qf_net)

targ_qf_net = QNet(ob_space, ac_space)
targ_qf_net.load_state_dict(qf_net.state_dict())
targ_qf = DeterministicSAVfunc(ob_space, ac_space, targ_qf_net)

sampler = EpiSampler(env, pol, args.num_parallel, seed=args.seed)

optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr)
Example 24
if args.ddpg:
    pol_net = PolNet(ob_space,
                     ac_space,
                     args.pol_h1,
                     args.pol_h2,
                     deterministic=True)
    noise = OUActionNoise(ac_space.shape)
    pol = DeterministicActionNoisePol(ob_space, ac_space, pol_net, noise)
else:
    if args.rnn:
        pol_net = PolNetLSTM(ob_space, ac_space, h_size=256, cell_size=256)
    else:
        pol_net = PolNet(ob_space, ac_space)
    if isinstance(ac_space, gym.spaces.Box):
        pol = GaussianPol(ob_space, ac_space, pol_net, args.rnn)
    elif isinstance(ac_space, gym.spaces.Discrete):
        pol = CategoricalPol(ob_space, ac_space, pol_net, args.rnn)
    elif isinstance(ac_space, gym.spaces.MultiDiscrete):
        pol = MultiCategoricalPol(ob_space, ac_space, pol_net, args.rnn)
    else:
        raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)

with open(os.path.join(args.pol_dir, args.pol_fname), 'rb') as f:
    pol.load_state_dict(
        torch.load(f, map_location=lambda storage, location: storage))

epis = sampler.sample(pol, max_epis=args.num_epis)
Example 25
device = torch.device(device_name)
set_device(device)

score_file = os.path.join(args.log, 'progress.csv')
logger.add_tabular_output(score_file)
logger.add_tensorboard_output(args.log)

env = GymEnv(args.env_name, log_dir=os.path.join(
    args.log, 'movie'), record_video=args.record)
env.env.seed(args.seed)

observation_space = env.observation_space
action_space = env.action_space

pol_net = PolNetLSTM(observation_space, action_space)
pol = GaussianPol(observation_space, action_space, pol_net, rnn=True)

qf_net1 = QNetLSTM(observation_space, action_space)
qf1 = DeterministicSAVfunc(observation_space, action_space, qf_net1, rnn=True)
targ_qf_net1 = QNetLSTM(observation_space, action_space)
targ_qf_net1.load_state_dict(qf_net1.state_dict())
targ_qf1 = DeterministicSAVfunc(
    observation_space, action_space, targ_qf_net1, rnn=True)

qf_net2 = QNetLSTM(observation_space, action_space)
qf2 = DeterministicSAVfunc(observation_space, action_space, qf_net2, rnn=True)
targ_qf_net2 = QNetLSTM(observation_space, action_space)
targ_qf_net2.load_state_dict(qf_net2.state_dict())
targ_qf2 = DeterministicSAVfunc(
    observation_space, action_space, targ_qf_net2, rnn=True)
Example 26
if args.ddpg:
    pol_net = PolNet(observation_space,
                     action_space,
                     args.pol_h1,
                     args.pol_h2,
                     deterministic=True)
    noise = OUActionNoise(action_space)
    pol = DeterministicActionNoisePol(observation_space, action_space, pol_net,
                                      noise)
else:
    if args.rnn:
        pol_net = PolNetLSTM(observation_space,
                             action_space,
                             h_size=256,
                             cell_size=256)
    else:
        pol_net = PolNet(observation_space, action_space)
    if isinstance(action_space, gym.spaces.Box):
        pol = GaussianPol(observation_space, action_space, pol_net, args.rnn)
    elif isinstance(action_space, gym.spaces.Discrete):
        pol = CategoricalPol(observation_space, action_space, pol_net,
                             args.rnn)
    elif isinstance(action_space, gym.spaces.MultiDiscrete):
        pol = MultiCategoricalPol(observation_space, action_space, pol_net,
                                  args.rnn)
    else:
        raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

sampler = EpiSampler(env, pol, num_parallel=1, seed=args.seed)

with open(os.path.join(args.pol_dir, 'models', args.pol_fname), 'rb') as f:
    pol.load_state_dict(
        torch.load(f, map_location=lambda storage, location: storage))
Example 27
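# Runs a trained LSTM policy as a service, polling Redis for a 'start' signal.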
parser.add_argument('--cell_size', type=int, default=70)
args = parser.parse_args()

device_name = 'cpu' if args.cuda < 0 else "cuda:{}".format(args.cuda)
device = torch.device(device_name)
set_device(device)

r = redis.StrictRedis()

high = np.inf * np.ones(4)
observation_space = Box(low=-high, high=high, dtype=np.float32)
high = np.ones(1)
action_space = Box(low=-high, high=high)

pol_net = PolNetLSTM(observation_space, action_space, h_size=args.h_size, cell_size=args.cell_size)
pol = GaussianPol(observation_space, action_space, pol_net, data_parallel=args.data_parallel, rnn=True)

if args.pol:
    pol.load_state_dict(torch.load(args.pol, map_location=lambda storage, loc: storage))
else:
    raise ValueError('a policy file must be specified via --pol')

pol.to(device)

pol.dp_run = False

pol.reset()

r.set('start', 'false')
while True:
    if r.get('start').decode('utf-8') == 'true':
Example 28
logger.add_tabular_output(score_file)
logger.add_tensorboard_output(args.log)

env = GymEnv(args.env_name,
             log_dir=os.path.join(args.log, 'movie'),
             record_video=args.record)
env.env.seed(args.seed)
if args.c2d:
    env = C2DEnv(env)

observation_space = env.observation_space
action_space = env.action_space

pol_net = PolNet(observation_space, action_space)
if isinstance(action_space, gym.spaces.Box):
    pol = GaussianPol(observation_space, action_space, pol_net)
elif isinstance(action_space, gym.spaces.Discrete):
    pol = CategoricalPol(observation_space, action_space, pol_net)
elif isinstance(action_space, gym.spaces.MultiDiscrete):
    pol = MultiCategoricalPol(observation_space, action_space, pol_net)
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)
optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr)

with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f:
    expert_epis = pickle.load(f)
train_epis, test_epis = ef.train_test_split(expert_epis,
                                            train_size=args.train_size)
train_traj = Traj()
Example 29

logger.add_tabular_output(score_file)

env = GymEnv(args.env_name,
             log_dir=os.path.join(args.log, 'movie'),
             record_video=args.record)
env.env.seed(args.seed)
if args.c2d:
    env = C2DEnv(env)

observation_space = env.observation_space
action_space = env.action_space

pol_net = PolNet(observation_space, action_space)
if isinstance(action_space, gym.spaces.Box):
    pol = GaussianPol(observation_space,
                      action_space,
                      pol_net,
                      data_parallel=args.data_parallel)
elif isinstance(action_space, gym.spaces.Discrete):
    pol = CategoricalPol(observation_space,
                         action_space,
                         pol_net,
                         data_parallel=args.data_parallel)
elif isinstance(action_space, gym.spaces.MultiDiscrete):
    pol = MultiCategoricalPol(observation_space,
                              action_space,
                              pol_net,
                              data_parallel=args.data_parallel)
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

vf_net = VNet(observation_space)
Example 30
device_name = 'cpu' if args.cuda < 0 else "cuda:{}".format(args.cuda)
device = torch.device(device_name)
set_device(device)

score_file = os.path.join(args.log, 'progress.csv')
logger.add_tabular_output(score_file)

env = GymEnv(args.env_name, log_dir=os.path.join(
    args.log, 'movie'), record_video=args.record)
env.env.seed(args.seed)

observation_space = env.observation_space
action_space = env.action_space

pol_net = PolNet(observation_space, action_space)
pol = GaussianPol(observation_space, action_space, pol_net,
                  data_parallel=args.data_parallel, parallel_dim=0)

vf_net = VNet(observation_space)
vf = DeterministicSVfunc(
    observation_space, vf_net, data_parallel=args.data_parallel, parallel_dim=0)

qf_net = QNet(observation_space, action_space)
qf = DeterministicSAVfunc(observation_space, action_space, qf_net,
                          data_parallel=args.data_parallel, parallel_dim=0)
targ_qf_net = QNet(observation_space, action_space)
targ_qf_net.load_state_dict(qf_net.state_dict())
targ_qf = DeterministicSAVfunc(
    observation_space, action_space, targ_qf_net, data_parallel=args.data_parallel, parallel_dim=0)

log_alpha = nn.Parameter(torch.zeros((), device=device))