Esempio n. 1
0
# NOTE(review): excerpt of a training script. env, pol, args, measure, Traj,
# EpiSampler and epi_functional are defined outside the visible lines, and the
# while-loop body appears to continue past the end of this excerpt.
# Sample from the replay buffer?
print('sampler')
sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)

# off-policy experience. Traj=(s,a,r,s')
off_traj = Traj(args.max_steps_off, traj_device='cpu')

total_epi = 0
total_step = 0
total_grad_step = 0  # number of parameter updates
num_update_lagged = 0  # number of lagged-network updates
max_rew = -1000

print('start')
while args.max_epis > total_epi:
    with measure('sample'):
        print('sampling')
        # Act according to the policy and accumulate experience
        # (original author's note: env.step is called inside one_epi's
        # __init__ -- not verifiable from this excerpt).
        # off-policy
        epis = sampler.sample(pol, max_steps=args.max_steps_per_iter)
    with measure('train'):
        # on-policy sampling: wrap the freshly sampled episodes in their own Traj
        print('on-policy')
        on_traj = Traj(traj_device='cpu')
        on_traj.add_epis(epis)
        on_traj = epi_functional.add_next_obs(on_traj)
        on_traj.register_epis()
        off_traj.add_traj(on_traj)  # add to the off-policy data

        # count episodes and steps (only the episode count is visible in this
        # excerpt)
        total_epi += on_traj.num_epi
Esempio n. 2
0
def main(args):
    """Train a policy and a value function with a distributed trainer.

    Sets up logging under ``args.log``, builds the environment, policy and
    value function, then alternates between sampling episodes and calling
    ``trainer.train`` while ``args.max_epis > total_epi``. After every
    iteration the current model/optimizer states are written to
    ``<args.log>/models/<name>_last.pkl``; whenever the mean episode reward of
    the iteration exceeds the best one seen so far they are also written to
    ``<args.log>/models/<name>_max.pkl``.

    Args:
        args: Parsed argument namespace. Attributes read here: ``num_cpus``,
            ``num_gpus``, ``ray_redis_address``, ``log``, ``env_name``,
            ``seed``, ``c2d``, ``num_trainer``, ``master_address``,
            ``num_parallel``, ``max_epis`` and ``max_steps_per_iter``. The
            whole namespace is also dumped to ``args.json`` and forwarded to
            the trainer.

    Raises:
        ValueError: If the action space is not ``Box``, ``Discrete`` or
            ``MultiDiscrete``.
    """
    init_ray(args.num_cpus, args.num_gpus, args.ray_redis_address)

    # makedirs creates args.log as an intermediate directory, and exist_ok
    # avoids the check-then-create race of os.path.exists + mkdir.
    model_dir = os.path.join(args.log, 'models')
    os.makedirs(model_dir, exist_ok=True)

    score_file = os.path.join(args.log, 'progress.csv')
    logger.add_tabular_output(score_file)
    logger.add_tensorboard_output(args.log)
    with open(os.path.join(args.log, 'args.json'), 'w') as f:
        json.dump(vars(args), f)
    pprint(vars(args))

    # Original note: when doing the distributed training, disable video
    # recordings (GymEnv is built with its default options here).
    env = GymEnv(args.env_name)
    env.env.seed(args.seed)
    if args.c2d:
        env = C2DEnv(env)

    observation_space = env.observation_space
    action_space = env.action_space
    pol_net = PolNet(observation_space, action_space)
    rnn = False
    # Recurrent alternative kept from the original script:
    # pol_net = PolNetLSTM(observation_space, action_space)
    # rnn = True
    if isinstance(action_space, gym.spaces.Box):
        pol = GaussianPol(observation_space, action_space, pol_net, rnn=rnn)
    elif isinstance(action_space, gym.spaces.Discrete):
        pol = CategoricalPol(observation_space, action_space, pol_net)
    elif isinstance(action_space, gym.spaces.MultiDiscrete):
        pol = MultiCategoricalPol(observation_space, action_space, pol_net)
    else:
        raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

    vf_net = VNet(observation_space)
    vf = DeterministicSVfunc(observation_space, vf_net)

    trainer = TrainManager(Trainer,
                           args.num_trainer,
                           args.master_address,
                           args=args,
                           vf=vf,
                           pol=pol)
    sampler = EpiSampler(env, pol, args.num_parallel, seed=args.seed)

    # Names passed to trainer.get_state(); they double as checkpoint file
    # prefixes (<name>_last.pkl / <name>_max.pkl).
    state_names = ('pol', 'vf', 'optim_pol', 'optim_vf')

    total_epi = 0
    total_step = 0
    max_rew = -1e6
    start_time = time.time()

    while args.max_epis > total_epi:

        with measure('sample'):
            # Push the trainer's current policy state to the sampler before
            # collecting new episodes.
            sampler.set_pol_state(trainer.get_state("pol"))
            epis = sampler.sample(max_steps=args.max_steps_per_iter)

        with measure('train'):
            result_dict = trainer.train(epis=epis)

        step = result_dict["traj_num_step"]
        total_step += step
        total_epi += result_dict["traj_num_epi"]

        rewards = [np.sum(epi['rews']) for epi in epis]
        mean_rew = np.mean(rewards)
        elapsed_time = time.time() - start_time
        logger.record_tabular('ElapsedTime', elapsed_time)
        logger.record_results(args.log,
                              result_dict,
                              score_file,
                              total_epi,
                              step,
                              total_step,
                              rewards,
                              plot_title=args.env_name)

        with measure('save'):
            # Fetch every state once, then reuse it for both checkpoints.
            states = [(name, trainer.get_state(name)) for name in state_names]

            for name, state in states:
                torch.save(state,
                           os.path.join(model_dir, name + '_last.pkl'))

            if mean_rew > max_rew:
                for name, state in states:
                    torch.save(state,
                               os.path.join(model_dir, name + '_max.pkl'))
                max_rew = mean_rew
    del sampler
    del trainer
    # NOTE(review): from here on the code references names that main() never
    # defines (train_traj, test_traj, optim_pol, curr_epoch) and uses `sampler`
    # after `del sampler`. It looks like the body of a separate
    # behavior-cloning epoch loop whose header is missing -- confirm before
    # running.
    if args.data_parallel:
        pol.dp_run = True

    # One training pass followed by one evaluation pass on the test data.
    result_dict = behavior_clone.train(train_traj, pol, optim_pol,
                                       args.batch_size)
    test_result_dict = behavior_clone.test(test_traj, pol)

    if args.data_parallel:
        pol.dp_run = False

    # Copy every test metric into result_dict (same-named keys are overwritten).
    for key in test_result_dict.keys():
        result_dict[key] = test_result_dict[key]

        # NOTE(review): this check is indented inside the for-loop over test
        # metrics, so it runs once per key -- confirm the indentation is
        # intended. If int(args.check_rate * args.epoch) evaluates to 0 the
        # modulo raises ZeroDivisionError.
        if curr_epoch % int(
                args.check_rate * args.epoch) == 0 or curr_epoch == 0:
            with measure('sample'):
                paths = sampler.sample(pol, max_epis=args.max_epis_per_iter)
            # Per-episode reward sums; mean_rew recomputes the same sums.
            rewards = [np.sum(path['rews']) for path in paths]
            mean_rew = np.mean([np.sum(path['rews']) for path in paths])
            logger.record_results_bc(args.log,
                                     result_dict,
                                     score_file,
                                     curr_epoch,
                                     rewards,
                                     plot_title=args.env_name)

        # Save the policy and its optimizer when the mean reward beats max_rew.
        # NOTE(review): mean_rew is only assigned in the evaluation branch
        # above, and max_rew is not updated in the visible lines.
        if mean_rew > max_rew:
            torch.save(pol.state_dict(),
                       os.path.join(args.log, 'models', 'pol_max.pkl'))
            torch.save(optim_pol.state_dict(),
                       os.path.join(args.log, 'models', 'optim_pol_max.pkl'))
Esempio n. 4
0
                 args.horizon_of_samples, mean_obs, std_obs, mean_acs, std_acs,
                 args.rnn)
# NOTE(review): excerpt of a script; dm_net, dm, traj, env, mpc_pol, rew_func,
# ob_space, ac_space and the mean_*/std_* statistics are defined outside the
# visible lines, and the loop body appears to continue past this excerpt.
# Adam optimizer over dm_net's parameters with learning rate args.dm_lr.
optim_dm = torch.optim.Adam(dm_net.parameters(), args.dm_lr)

rl_sampler = EpiSampler(env,
                        mpc_pol,
                        num_parallel=args.num_parallel,
                        seed=args.seed)

# train loop
total_epi = 0
total_step = 0
counter_agg_iters = 0
max_rew = -1e+6
while args.max_epis > total_epi:
    with measure('train model'):
        # batch_size is args.rnn_batch_size when args.rnn is truthy, otherwise
        # args.batch_size.
        result_dict = mpc.train_dm(traj,
                                   dm,
                                   optim_dm,
                                   epoch=args.epoch_per_iter,
                                   batch_size=args.batch_size
                                   if not args.rnn else args.rnn_batch_size)
    with measure('sample'):
        # Build a new MPCPol from dm.net after the training call above, then
        # sample episodes with it.
        mpc_pol = MPCPol(ob_space, ac_space, dm.net, rew_func, args.n_samples,
                         args.horizon_of_samples, mean_obs, std_obs, mean_acs,
                         std_acs, args.rnn)
        epis = rl_sampler.sample(mpc_pol, max_epis=args.max_epis_per_iter)

        # Collect the episodes of this iteration in their own Traj.
        curr_traj = Traj(traj_device='cpu')
        curr_traj.add_epis(epis)
Esempio n. 5
0
# NOTE(review): excerpt of a script; vf, pol, sampler, rank, args, ef and tf
# are defined outside the visible lines, and the loop body appears to continue
# past this excerpt.
# Presumably wraps the value function and its optimizer for distributed
# (optionally apex-based) training -- confirm against make_model_distributed.
ddp_vf, optim_vf = make_model_distributed(vf,
                                          optim_vf,
                                          args.use_apex,
                                          args.apex_opt_level,
                                          args.apex_keep_batchnorm_fp32,
                                          args.apex_sync_bn,
                                          args.apex_loss_scale,
                                          device_ids=[args.local_rank],
                                          output_device=args.local_rank)

total_epi = 0
total_step = 0
max_rew = -1e6
kl_beta = args.init_kl_beta
while args.max_epis > total_epi:
    # Timing is logged on rank 0 only (log_enable=rank == 0).
    with measure('sample', log_enable=rank == 0):
        # Only rank 0 samples episodes; `epis` is not assigned on other ranks.
        if rank == 0:
            epis = sampler.sample(pol, max_steps=args.max_steps_per_iter)
    with measure('train', log_enable=rank == 0):
        # Every rank creates a Traj, but only rank 0 fills it.
        traj = Traj(ddp=True, traj_device="cpu")
        if rank == 0:
            traj.add_epis(epis)

            # NOTE(review): uses `vf`, not the `ddp_vf` wrapper -- confirm
            # this is intended.
            traj = ef.compute_vs(traj, vf)
            traj = ef.compute_rets(traj, args.gamma)
            traj = ef.compute_advs(traj, args.gamma, args.lam)
            traj = ef.centerize_advs(traj)
            traj = ef.compute_h_masks(traj)
            traj.register_epis()
        # Called on every rank; presumably distributes rank 0's data to the
        # other ranks -- confirm against tf.sync.
        traj = tf.sync(traj)
Esempio n. 6
0
    def train(self):
        """Run the sample/train loop until the frame budget is used up.

        Each iteration:

        1. asks ``self.curriculum_handler`` whether the curriculum changed and
           (re)creates the ``EpiSampler`` on the first iteration or on a
           change;
        2. samples episodes with ``self.pol``;
        3. builds a ``Traj`` and runs ``ef.compute_vs``, ``compute_rets``,
           ``compute_advs``, ``centerize_advs`` and ``compute_h_masks`` on it,
           optionally appending a mirrored copy;
        4. calls ``ppo_clip.train`` and merges ``self.get_extra_metrics(epis)``
           into the result;
        5. logs the results, saves the "last" models (and the "max" models on
           a new best mean reward) and steps ``self.scheduler_pol`` and
           ``self.scheduler_vf``.

        Reads from ``self``: ``args``, ``env``, ``pol``, ``vf``, ``optim_pol``,
        ``optim_vf``, ``scheduler_pol``, ``scheduler_vf``, ``logger``,
        ``curriculum_handler``, ``get_extra_metrics`` and ``save_models``.
        Sets ``self.train_start_time``.
        """
        args = self.args

        # TODO: cuda seems to be broken, I don't care about it right now
        # if args.cuda:
        #     # current_obs = current_obs.cuda()
        #     rollouts.cuda()

        self.train_start_time = time.time()
        total_epi = 0
        total_step = 0
        max_rew = -1e6
        sampler = None

        # Note: the path comes from self.logger, while the tabular output is
        # registered on the module-level `logger`.
        score_file = os.path.join(self.logger.get_logdir(), "progress.csv")
        logger.add_tabular_output(score_file)

        num_total_frames = args.num_total_frames

        # Mirroring is only enabled when requested and when the unwrapped
        # environment exposes `mirror_indices`.
        mirror_function = None
        if args.mirror_tuples and hasattr(self.env.unwrapped,
                                          "mirror_indices"):
            mirror_function = get_mirror_function(
                **self.env.unwrapped.mirror_indices)
            # The mirrored copy is appended to every traj below, so the frame
            # budget is doubled (presumably because traj.num_step then counts
            # the mirrored frames too -- confirm).
            num_total_frames *= 2
            if not args.tanh_finish:
                warnings.warn(
                    "When `mirror_tuples` is `True`,"
                    " `tanh_finish` should be set to `True` as well."
                    " Otherwise there is a chance of the training blowing up.")

        while num_total_frames > total_step:
            # setup the correct curriculum learning environment/parameters
            # NOTE(review): the progress fraction divides by
            # args.num_total_frames, not by the local num_total_frames that is
            # doubled when mirroring, so it can exceed 1.0 in that case --
            # confirm this is intended.
            new_curriculum = self.curriculum_handler(total_step /
                                                     args.num_total_frames)

            if total_step == 0 or new_curriculum:
                # Drop the old sampler before building the replacement.
                if sampler is not None:
                    del sampler
                sampler = EpiSampler(
                    self.env,
                    self.pol,
                    num_parallel=self.args.num_processes,
                    seed=self.args.seed + total_step,  # TODO: better fix?
                )

            with measure("sample"):
                epis = sampler.sample(self.pol,
                                      max_steps=args.num_steps *
                                      args.num_processes)

            with measure("train"):
                with measure("epis"):
                    traj = Traj()
                    traj.add_epis(epis)

                    traj = ef.compute_vs(traj, self.vf)
                    traj = ef.compute_rets(traj, args.decay_gamma)
                    traj = ef.compute_advs(traj, args.decay_gamma,
                                           args.gae_lambda)
                    traj = ef.centerize_advs(traj)
                    traj = ef.compute_h_masks(traj)
                    traj.register_epis()

                    # Append the mirrored version of the trajectory.
                    if mirror_function:
                        traj.add_traj(mirror_function(traj))

                # Data-parallel toggling is currently disabled:
                # if args.data_parallel:
                #     self.pol.dp_run = True
                #     self.vf.dp_run = True

                # batch_size is args.rnn_batch_size when args.rnn is truthy,
                # otherwise args.batch_size.
                result_dict = ppo_clip.train(
                    traj=traj,
                    pol=self.pol,
                    vf=self.vf,
                    clip_param=args.clip_eps,
                    optim_pol=self.optim_pol,
                    optim_vf=self.optim_vf,
                    epoch=args.epoch_per_iter,
                    batch_size=args.batch_size
                    if not args.rnn else args.rnn_batch_size,
                    max_grad_norm=args.max_grad_norm,
                )

                # if args.data_parallel:
                #     self.pol.dp_run = False
                #     self.vf.dp_run = False

            # Append the extra metrics to `result_dict` (reported in the
            # progress.csv).
            result_dict.update(self.get_extra_metrics(epis))

            # Counters use the traj (after any mirrored data was added), while
            # the rewards are computed from the originally sampled episodes.
            total_epi += traj.num_epi
            step = traj.num_step
            total_step += step
            rewards = [np.sum(epi["rews"]) for epi in epis]
            mean_rew = np.mean(rewards)
            logger.record_results(
                self.logger.get_logdir(),
                result_dict,
                score_file,
                total_epi,
                step,
                total_step,
                rewards,
                plot_title=args.env,
            )

            # Keep a separate "max" checkpoint for the best mean reward so far.
            if mean_rew > max_rew:
                self.save_models("max")
                max_rew = mean_rew

            self.save_models("last")

            self.scheduler_pol.step()
            self.scheduler_vf.step()

            # Release this iteration's trajectory before the next one is built.
            del traj