Beispiel #1
0
    def test_learning_rnn(self):
        """Smoke-test a single TRPO update with LSTM policy and value nets.

        Samples episodes with a recurrent categorical policy, runs the
        value / return / advantage / hidden-mask preprocessing on the
        resulting trajectory and calls ``trpo.train`` once.  There are no
        assertions: the test passes when the pipeline runs without raising.
        """
        pol_net = PolNetLSTM(
            self.env.observation_space, self.env.action_space, h_size=32, cell_size=32)
        pol = CategoricalPol(
            self.env.observation_space, self.env.action_space, pol_net, rnn=True)

        vf_net = VNetLSTM(self.env.observation_space, h_size=32, cell_size=32)
        vf = DeterministicSVfunc(self.env.observation_space, vf_net, rnn=True)

        sampler = EpiSampler(self.env, pol, num_parallel=1)
        try:
            optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)

            epis = sampler.sample(pol, max_steps=400)

            traj = Traj()
            traj.add_epis(epis)

            traj = ef.compute_vs(traj, vf)
            # NOTE(review): 0.99 / 0.95 are presumably the discount factor
            # and the GAE lambda -- confirm against the ``ef`` helpers.
            traj = ef.compute_rets(traj, 0.99)
            traj = ef.compute_advs(traj, 0.99, 0.95)
            traj = ef.centerize_advs(traj)
            traj = ef.compute_h_masks(traj)
            traj.register_epis()

            # NOTE(review): trailing positional args are presumably
            # (epoch, batch_size) -- confirm against ``trpo.train``.
            trpo.train(traj, pol, vf, optim_vf, 1, 2)
        finally:
            # Drop the sampler reference even when a step above raises, so
            # whatever it holds (presumably worker processes -- not visible
            # here) is not kept alive by a failing test.
            del sampler
Beispiel #2
0
    def test_learning(self):
        """Smoke-test one PPO-clip and one PPO-KL update with MLP networks.

        Samples episodes with a feed-forward categorical policy, runs the
        value / return / advantage preprocessing on the trajectory and then
        calls ``ppo_clip.train`` followed by ``ppo_kl.train`` on the same
        data.  There are no assertions: the test passes when both calls
        complete without raising.
        """
        pol_net = PolNet(self.env.observation_space,
                         self.env.action_space, h1=32, h2=32)
        pol = CategoricalPol(self.env.observation_space,
                             self.env.action_space, pol_net)

        vf_net = VNet(self.env.observation_space, h1=32, h2=32)
        vf = DeterministicSVfunc(self.env.observation_space, vf_net)

        sampler = EpiSampler(self.env, pol, num_parallel=1)
        try:
            optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
            optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)

            epis = sampler.sample(pol, max_steps=32)

            traj = Traj()
            traj.add_epis(epis)

            traj = ef.compute_vs(traj, vf)
            # NOTE(review): 0.99 / 0.95 are presumably the discount factor
            # and the GAE lambda -- confirm against the ``ef`` helpers.
            traj = ef.compute_rets(traj, 0.99)
            traj = ef.compute_advs(traj, 0.99, 0.95)
            traj = ef.centerize_advs(traj)
            traj = ef.compute_h_masks(traj)
            traj.register_epis()

            # Both trainers run on the same trajectory and share optimizers.
            ppo_clip.train(traj=traj, pol=pol, vf=vf, clip_param=0.2,
                           optim_pol=optim_pol, optim_vf=optim_vf, epoch=1, batch_size=32)
            ppo_kl.train(traj=traj, pol=pol, vf=vf, kl_beta=0.1, kl_targ=0.2,
                         optim_pol=optim_pol, optim_vf=optim_vf, epoch=1, batch_size=32, max_grad_norm=10)
        finally:
            # Drop the sampler reference even when a step above raises, so
            # whatever it holds (presumably worker processes -- not visible
            # here) is not kept alive by a failing test.
            del sampler
# Seed the wrapped environment (``env`` is created before this excerpt).
env.env.seed(args.seed)
# Optionally wrap the environment in C2DEnv.
# NOTE(review): presumably converts continuous actions to discrete -- confirm.
if args.c2d:
    env = C2DEnv(env)

observation_space = env.observation_space
action_space = env.action_space

# Build the policy network, then wrap it in the policy class that matches
# the type of the action space; any other space type is rejected.
pol_net = PolNet(observation_space, action_space)
if isinstance(action_space, gym.spaces.Box):
    pol = GaussianPol(observation_space,
                      action_space,
                      pol_net,
                      data_parallel=args.data_parallel)
elif isinstance(action_space, gym.spaces.Discrete):
    pol = CategoricalPol(observation_space,
                         action_space,
                         pol_net,
                         data_parallel=args.data_parallel)
elif isinstance(action_space, gym.spaces.MultiDiscrete):
    pol = MultiCategoricalPol(observation_space,
                              action_space,
                              pol_net,
                              data_parallel=args.data_parallel)
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

# State-value function built on its own network.
vf_net = VNet(observation_space)
vf = DeterministicSVfunc(observation_space,
                         vf_net,
                         data_parallel=args.data_parallel)

# Episode sampler over ``env`` using ``pol``, seeded like the environment.
sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)
# Generate the teacher (t) policy and the student (s) policy, then load the
# teacher's weights.
# Note that the two policies do not need to share the same hidden architecture.

# Recurrent runs use identically sized LSTM nets for both; otherwise the
# student MLP is given explicit hidden sizes (h1=190, h2=90) while the
# teacher uses PolNet's defaults.
if args.rnn:
    t_pol_net = PolNetLSTM(observation_space, action_space,
                           h_size=256, cell_size=256)
    s_pol_net = PolNetLSTM(observation_space, action_space,
                           h_size=256, cell_size=256)
else:
    t_pol_net = PolNet(observation_space, action_space)
    s_pol_net = PolNet(observation_space, action_space, h1=190, h2=90)
# Wrap both networks in the policy class matching the action-space type.
# ``args.rnn`` is passed positionally as the fourth argument.
if isinstance(action_space, gym.spaces.Box):
    t_pol = GaussianPol(observation_space, action_space, t_pol_net, args.rnn)
    s_pol = GaussianPol(observation_space, action_space, s_pol_net, args.rnn)
elif isinstance(action_space, gym.spaces.Discrete):
    t_pol = CategoricalPol(
        observation_space, action_space, t_pol_net, args.rnn)
    s_pol = CategoricalPol(
        observation_space, action_space, s_pol_net, args.rnn)
elif isinstance(action_space, gym.spaces.MultiDiscrete):
    t_pol = MultiCategoricalPol(
        observation_space, action_space, t_pol_net, args.rnn)
    s_pol = MultiCategoricalPol(
        observation_space, action_space, s_pol_net, args.rnn)
else:
    raise ValueError('Only Box, Discrete and Multidiscrete are supported')

# Load teacher weights only when requested.
# NOTE(review): torch.load unpickles the file, so the checkpoint must come
# from a trusted source.
if args.teacher_pol:
    t_pol.load_state_dict(torch.load(
        os.path.join(args.teacher_dir, args.teacher_fname)))

if args.rnn:
# ``ob_space`` and ``env`` are defined before this excerpt.
ac_space = env.action_space

# Generate the teacher (t) policy and the student (s) policy, then load the
# teacher's weights.
# Note that the two policies do not need to share the same hidden architecture.

# Recurrent runs use identically sized LSTM nets for both; otherwise the
# student MLP is given explicit hidden sizes (h1=190, h2=90) while the
# teacher uses PolNet's defaults.
if args.rnn:
    t_pol_net = PolNetLSTM(ob_space, ac_space, h_size=256, cell_size=256)
    s_pol_net = PolNetLSTM(ob_space, ac_space, h_size=256, cell_size=256)
else:
    t_pol_net = PolNet(ob_space, ac_space)
    s_pol_net = PolNet(ob_space, ac_space, h1=190, h2=90)
# Wrap both networks in the policy class matching the action-space type.
# ``args.rnn`` is passed positionally as the fourth argument.
if isinstance(ac_space, gym.spaces.Box):
    t_pol = GaussianPol(ob_space, ac_space, t_pol_net, args.rnn)
    s_pol = GaussianPol(ob_space, ac_space, s_pol_net, args.rnn)
elif isinstance(ac_space, gym.spaces.Discrete):
    t_pol = CategoricalPol(ob_space, ac_space, t_pol_net, args.rnn)
    s_pol = CategoricalPol(ob_space, ac_space, s_pol_net, args.rnn)
elif isinstance(ac_space, gym.spaces.MultiDiscrete):
    t_pol = MultiCategoricalPol(ob_space, ac_space, t_pol_net, args.rnn)
    s_pol = MultiCategoricalPol(ob_space, ac_space, s_pol_net, args.rnn)
else:
    raise ValueError('Only Box, Discrete and Multidiscrete are supported')

# Load teacher weights only when requested.
# NOTE(review): torch.load unpickles the file, so the checkpoint must come
# from a trusted source.
if args.teacher_pol:
    t_pol.load_state_dict(
        torch.load(os.path.join(args.teacher_dir, args.teacher_fname)))

# Value network for the student: LSTM variant for recurrent runs, MLP otherwise.
if args.rnn:
    s_vf_net = VNetLSTM(ob_space, h_size=256, cell_size=256)
else:
    s_vf_net = VNet(ob_space)
Beispiel #6
0
# Choose the policy network: LSTM for recurrent runs, MLP otherwise.
# ``ob_space`` and ``ac_space`` are defined before this excerpt.
if args.rnn:
    pol_net = PolNetLSTM(ob_space, ac_space, h_size=256, cell_size=256)
else:
    pol_net = PolNet(ob_space, ac_space)
# Wrap the network in the policy class matching the action-space type.
# ``parallel_dim`` is 1 for recurrent runs and 0 otherwise.
# NOTE(review): presumably because recurrent inputs are time-major, so the
# batch axis that data-parallelism splits on is axis 1 -- confirm.
if isinstance(ac_space, gym.spaces.Box):
    pol = GaussianPol(ob_space,
                      ac_space,
                      pol_net,
                      args.rnn,
                      data_parallel=args.data_parallel,
                      parallel_dim=1 if args.rnn else 0)
elif isinstance(ac_space, gym.spaces.Discrete):
    pol = CategoricalPol(ob_space,
                         ac_space,
                         pol_net,
                         args.rnn,
                         data_parallel=args.data_parallel,
                         parallel_dim=1 if args.rnn else 0)
elif isinstance(ac_space, gym.spaces.MultiDiscrete):
    pol = MultiCategoricalPol(ob_space,
                              ac_space,
                              pol_net,
                              args.rnn,
                              data_parallel=args.data_parallel,
                              parallel_dim=1 if args.rnn else 0)
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

if args.rnn:
    vf_net = VNetLSTM(ob_space, h_size=256, cell_size=256)
else:
Beispiel #7
0
def _save_states(states, model_dir, suffix):
    """Save every entry of *states* as ``<model_dir>/<name>_<suffix>.pkl``.

    Args:
        states: mapping from a short name (e.g. ``'pol'``) to the object
            handed to ``torch.save``.
        model_dir: directory the files are written to.
        suffix: tag appended to each file name (``'last'`` or ``'max'``).
    """
    for name, state in states.items():
        torch.save(state,
                   os.path.join(model_dir, '{}_{}.pkl'.format(name, suffix)))


def main(args):
    """Run the distributed sample/train loop until ``args.max_epis`` episodes.

    Sets up ray, the log directory and loggers, builds the environment,
    policy and value function, then alternates between sampling episodes
    with the trainer's current policy state and calling ``trainer.train``.
    After every iteration the latest policy / value / optimizer states are
    saved as ``*_last.pkl``; whenever the mean episode reward improves they
    are additionally saved as ``*_max.pkl``.

    Args:
        args: parsed command-line namespace.  Fields read here: ``num_cpus``,
            ``num_gpus``, ``ray_redis_address``, ``log``, ``env_name``,
            ``seed``, ``c2d``, ``num_trainer``, ``master_address``,
            ``num_parallel``, ``max_epis`` and ``max_steps_per_iter``.

    Raises:
        ValueError: if the action space is not Box, Discrete or
            MultiDiscrete.
    """
    init_ray(args.num_cpus, args.num_gpus, args.ray_redis_address)

    # One call creates both args.log and args.log/models, and exist_ok
    # avoids the check-then-create race of a separate os.path.exists test.
    model_dir = os.path.join(args.log, 'models')
    os.makedirs(model_dir, exist_ok=True)
    score_file = os.path.join(args.log, 'progress.csv')
    logger.add_tabular_output(score_file)
    logger.add_tensorboard_output(args.log)
    with open(os.path.join(args.log, 'args.json'), 'w') as f:
        json.dump(vars(args), f)
    pprint(vars(args))

    # when doing the distributed training, disable video recordings
    env = GymEnv(args.env_name)
    env.env.seed(args.seed)
    if args.c2d:
        env = C2DEnv(env)

    observation_space = env.observation_space
    action_space = env.action_space
    pol_net = PolNet(observation_space, action_space)
    rnn = False
    # pol_net = PolNetLSTM(observation_space, action_space)
    # rnn = True
    # Pass ``rnn`` to every policy class so that switching to the LSTM
    # lines above takes effect for all action-space types, not only Box.
    if isinstance(action_space, gym.spaces.Box):
        pol = GaussianPol(observation_space, action_space, pol_net, rnn=rnn)
    elif isinstance(action_space, gym.spaces.Discrete):
        pol = CategoricalPol(observation_space, action_space, pol_net,
                             rnn=rnn)
    elif isinstance(action_space, gym.spaces.MultiDiscrete):
        pol = MultiCategoricalPol(observation_space, action_space, pol_net,
                                  rnn=rnn)
    else:
        raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

    vf_net = VNet(observation_space)
    vf = DeterministicSVfunc(observation_space, vf_net)

    trainer = TrainManager(Trainer,
                           args.num_trainer,
                           args.master_address,
                           args=args,
                           vf=vf,
                           pol=pol)
    sampler = EpiSampler(env, pol, args.num_parallel, seed=args.seed)

    total_epi = 0
    total_step = 0
    max_rew = -1e6
    start_time = time.time()

    try:
        while args.max_epis > total_epi:

            with measure('sample'):
                # Sample with the policy weights the trainer currently holds.
                sampler.set_pol_state(trainer.get_state("pol"))
                epis = sampler.sample(max_steps=args.max_steps_per_iter)

            with measure('train'):
                result_dict = trainer.train(epis=epis)

            step = result_dict["traj_num_step"]
            total_step += step
            total_epi += result_dict["traj_num_epi"]

            rewards = [np.sum(epi['rews']) for epi in epis]
            mean_rew = np.mean(rewards)
            elapsed_time = time.time() - start_time
            logger.record_tabular('ElapsedTime', elapsed_time)
            logger.record_results(args.log,
                                  result_dict,
                                  score_file,
                                  total_epi,
                                  step,
                                  total_step,
                                  rewards,
                                  plot_title=args.env_name)

            with measure('save'):
                states = {
                    name: trainer.get_state(name)
                    for name in ('pol', 'vf', 'optim_pol', 'optim_vf')
                }
                _save_states(states, model_dir, 'last')

                # Keep a separate copy of the best-scoring iteration.
                if mean_rew > max_rew:
                    _save_states(states, model_dir, 'max')
                    max_rew = mean_rew
    finally:
        # Release sampler and trainer even when the loop is interrupted
        # or raises, not only on normal completion.
        del sampler
        del trainer
Beispiel #8
0
# Create the environment; videos are written under <log>/movie when
# ``args.record`` is set.
env = GymEnv(args.env_name,
             log_dir=os.path.join(args.log, 'movie'),
             record_video=args.record)
env.env.seed(args.seed)
# Optionally wrap the environment in C2DEnv.
# NOTE(review): presumably converts continuous actions to discrete -- confirm.
if args.c2d:
    env = C2DEnv(env)

observation_space = env.observation_space
action_space = env.action_space

# Build the policy network and wrap it in the policy class matching the
# action-space type; any other space type is rejected.
pol_net = PolNet(observation_space, action_space)
if isinstance(action_space, gym.spaces.Box):
    pol = GaussianPol(observation_space, action_space, pol_net)
elif isinstance(action_space, gym.spaces.Discrete):
    pol = CategoricalPol(observation_space, action_space, pol_net)
elif isinstance(action_space, gym.spaces.MultiDiscrete):
    pol = MultiCategoricalPol(observation_space, action_space, pol_net)
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)
optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr)

# Load the expert episodes and split them into train / test sets.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load expert data from a trusted source.
with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f:
    expert_epis = pickle.load(f)
train_epis, test_epis = ef.train_test_split(expert_epis,
                                            train_size=args.train_size)
# Only the training split is turned into a trajectory in this excerpt.
train_traj = Traj()
train_traj.add_epis(train_epis)
train_traj.register_epis()
Beispiel #9
0
# Create the environment; videos are written under <log>/movie when
# ``args.record`` is set.
env = GymEnv(args.env_name, log_dir=os.path.join(
    args.log, 'movie'), record_video=args.record)
env.env.seed(args.seed)
# Optionally wrap the environment in C2DEnv.
# NOTE(review): presumably converts continuous actions to discrete -- confirm.
if args.c2d:
    env = C2DEnv(env)

ob_space = env.observation_space
ac_space = env.action_space


# Build the policy network and wrap it in the policy class matching the
# action-space type; any other space type is rejected.
pol_net = PolNet(ob_space, ac_space)
if isinstance(ac_space, gym.spaces.Box):
    pol = GaussianPol(ob_space, ac_space, pol_net,
                      data_parallel=args.data_parallel)
elif isinstance(ac_space, gym.spaces.Discrete):
    pol = CategoricalPol(ob_space, ac_space, pol_net,
                         data_parallel=args.data_parallel)
elif isinstance(ac_space, gym.spaces.MultiDiscrete):
    pol = MultiCategoricalPol(
        ob_space, ac_space, pol_net, data_parallel=args.data_parallel)
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

# State-value function built on its own network.
vf_net = VNet(ob_space)
vf = DeterministicSVfunc(ob_space, vf_net,
                         data_parallel=args.data_parallel)

if args.rew_type == 'rew':
    rewf_net = VNet(ob_space, h1=args.discrim_h1, h2=args.discrim_h2)
    rewf = DeterministicSVfunc(
        ob_space, rewf_net, data_parallel=args.data_parallel)
    shaping_vf_net = VNet(ob_space, h1=args.discrim_h1, h2=args.discrim_h2)
Beispiel #10
0
def main():
    """Run the interactive Sikamaru pygame loop.

    Opens a 480x320 fullscreen window and constructs a categorical policy,
    twin Q-functions with target copies, and their optimizers (none of which
    are trained yet -- see the TODOs).  Each frame the character is moved by
    the offset ``rule_act`` returns for the last left-clicked position and
    clamped to the window; while ``food.life`` is truthy the food is drawn
    and stepped.  The process exits through ``sys.exit()`` on ESC or on a
    QUIT event.
    """
    pygame.init()  # initialise pygame
    width, height = 480, 320
    screen = pygame.display.set_mode((width, height), FULLSCREEN)  # window size
    pygame.display.set_caption("Sikamaru")  # window bar

    # initialization
    target_x, target_y = 0, 0
    player = Sikamaru((width / 2, height / 2))
    sleep_count = 5
    eat_mode = 100
    food = Food()
    pause_frame = True
    seed = 42

    # TODO define RL agent
    '''
    state : 4D (sikaposi, esaposi)
    action : 2D (-20,+20)^2
    SAC
    simple_net : 30,30
    '''
    np.random.seed(seed)
    torch.manual_seed(seed)

    ob_space = gym.spaces.Box(low=np.zeros(4), high=width * np.ones(4))
    ac_space = gym.spaces.Discrete(4)
    # Discrete action index -> (dx, dy) displacement; only referenced by the
    # commented-out policy-driven action below.
    ac_dict = {
        0: np.array([-20, 0]),
        1: np.array([20, 0]),
        2: np.array([0, -20]),
        3: np.array([0, 20])
    }
    pol_net = PolNet(ob_space, ac_space)
    pol = CategoricalPol(ob_space, ac_space, pol_net)

    def build_q_pair():
        """Create a Q-net, its function wrapper and a weight-synced target."""
        net = QNet(ob_space, ac_space)
        qf = DeterministicSAVfunc(ob_space, ac_space, net)
        targ_net = QNet(ob_space, ac_space)
        targ_net.load_state_dict(net.state_dict())
        targ_qf = DeterministicSAVfunc(ob_space, ac_space, targ_net)
        return net, qf, targ_qf

    qf_net1, qf1, targ_qf1 = build_q_pair()
    qf_net2, qf2, targ_qf2 = build_q_pair()
    qfs = [qf1, qf2]
    targ_qfs = [targ_qf1, targ_qf2]
    log_alpha = nn.Parameter(torch.ones(()))

    optim_pol = torch.optim.Adam(pol_net.parameters(), 1e-4)
    optim_qfs = [
        torch.optim.Adam(qf_net1.parameters(), 3e-4),
        torch.optim.Adam(qf_net2.parameters(), 3e-4),
    ]
    optim_alpha = torch.optim.Adam([log_alpha], 1e-4)

    # off_traj = Traj()

    background = (0, 100, 0)
    while True:
        screen.fill(background)  # background color

        # my procedure
        ## env
        target = (target_x, target_y)
        obs = make_obs(target, player.posi, width, height)
        # The policy output is computed but not used yet; movement comes
        # from the rule-based controller below.
        obs_tensor = torch.tensor(obs, dtype=torch.float)
        ac_real, ac, a_i = pol.deterministic_ac_real(obs_tensor)
        # ac_real = ac_real.reshape(pol.ac_space.shape)
        move = rule_act(target, player.posi)
        # move = ac_dict[int(ac_real)]

        # Apply the displacement and clamp the new position to the window.
        new_x = max(min(player.posi[0] + move[0], width), 0)
        new_y = max(min(player.posi[1] + move[1], height), 0)

        player.move((new_x, new_y))
        screen.blit(player.get_im(), player.rect)

        if food.life:  # RL
            # TODO: record as epi

            screen.blit(food.im, food.rect)
            # scr
            if food.life_step(player) > 0:
                player.bigup()
            if food.life == 0:
                # TODO add one epi and learn
                pause_frame = False

        # Skip the 500 ms delay only on the frame where the food ran out.
        if pause_frame:
            pygame.time.wait(500)
        pause_frame = True
        pygame.display.update()  # refresh the screen

        ## event
        for event in pygame.event.get():
            if event.type == MOUSEBUTTONDOWN and event.button == 1:
                # Left click: place the food at the clicked position.
                target_x, target_y = event.pos
                food.set((target_x, target_y))
            elif event.type == KEYDOWN and event.key == K_ESCAPE:
                sys.exit()
            elif event.type == QUIT:  # shutdown handling
                pygame.quit()
                sys.exit()
Beispiel #11
0
                         action_space,
                         h_size=256,
                         cell_size=256)
else:
    pol_net = PolNet(observation_space, action_space)
# Wrap ``pol_net`` (built before this point) in the policy class matching
# the action-space type; any other space type is rejected.
# ``data_parallel`` is driven by ``args.ddp`` and ``parallel_dim`` is 1 for
# recurrent runs, 0 otherwise.
# NOTE(review): presumably because recurrent inputs are time-major, so the
# batch axis that data-parallelism splits on is axis 1 -- confirm.
if isinstance(action_space, gym.spaces.Box):
    pol = GaussianPol(observation_space,
                      action_space,
                      pol_net,
                      args.rnn,
                      data_parallel=args.ddp,
                      parallel_dim=1 if args.rnn else 0)
elif isinstance(action_space, gym.spaces.Discrete):
    pol = CategoricalPol(observation_space,
                         action_space,
                         pol_net,
                         args.rnn,
                         data_parallel=args.ddp,
                         parallel_dim=1 if args.rnn else 0)
elif isinstance(action_space, gym.spaces.MultiDiscrete):
    pol = MultiCategoricalPol(observation_space,
                              action_space,
                              pol_net,
                              args.rnn,
                              data_parallel=args.ddp,
                              parallel_dim=1 if args.rnn else 0)
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

if args.rnn:
    vf_net = VNetLSTM(observation_space, h_size=256, cell_size=256)
else: