# Sampler that collects episodes from the environment; the experience is
# later pushed into the off-policy replay buffer below.
print('sampler')
sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)

# Off-policy experience: the Traj stores (s, a, r, s') transitions.
off_traj = Traj(args.max_steps_off, traj_device='cpu')

total_epi = 0
total_step = 0
total_grad_step = 0  # number of parameter updates
num_update_lagged = 0  # number of lagged (target) net updates
max_rew = -1000

print('start')
while args.max_epis > total_epi:
    with measure('sample'):
        print('sampling')
        # Act according to the current policy and accumulate experience
        # (env.step is called inside one_epi).
        epis = sampler.sample(pol, max_steps=args.max_steps_per_iter)
    with measure('train'):
        # Wrap the freshly sampled (on-policy) episodes in a Traj.
        print('on-policy')
        on_traj = Traj(traj_device='cpu')
        on_traj.add_epis(epis)
        on_traj = epi_functional.add_next_obs(on_traj)
        on_traj.register_epis()
        off_traj.add_traj(on_traj)  # add to the off-policy buffer

        # Count episodes and steps.
        total_epi += on_traj.num_epi
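# --- Hedged sketch (not part of the original script) ---
# The counters above (total_grad_step, num_update_lagged) suggest an off-policy
# update against a lagged (target) network. A minimal, discrete-action sketch of
# that idea in plain PyTorch; `qf`, `lagged_qf`, `optim_qf` and the batch layout
# are illustrative assumptions, not machina's actual training call.
import torch
import torch.nn.functional as F


def off_policy_update(qf, lagged_qf, optim_qf, batch, gamma=0.99):
    obs, acs, rews, next_obs, dones = batch  # tensors drawn from the replay buffer
    with torch.no_grad():
        # Bootstrap from the lagged network to keep the TD target stable.
        next_q = lagged_qf(next_obs).max(dim=1).values
        target = rews + gamma * (1.0 - dones) * next_q
    q = qf(obs).gather(1, acs.long().unsqueeze(1)).squeeze(1)
    loss = F.mse_loss(q, target)
    optim_qf.zero_grad()
    loss.backward()
    optim_qf.step()
    return loss.item()

# After each update total_grad_step would be incremented; periodically the lagged
# network is refreshed, which is what num_update_lagged counts, e.g.:
#     lagged_qf.load_state_dict(qf.state_dict())
#     num_update_lagged += 1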
def main(args):
    init_ray(args.num_cpus, args.num_gpus, args.ray_redis_address)

    if not os.path.exists(args.log):
        os.makedirs(args.log)
    if not os.path.exists(os.path.join(args.log, 'models')):
        os.mkdir(os.path.join(args.log, 'models'))
    score_file = os.path.join(args.log, 'progress.csv')
    logger.add_tabular_output(score_file)
    logger.add_tensorboard_output(args.log)
    with open(os.path.join(args.log, 'args.json'), 'w') as f:
        json.dump(vars(args), f)
    pprint(vars(args))

    # When doing the distributed training, disable video recordings.
    env = GymEnv(args.env_name)
    env.env.seed(args.seed)
    if args.c2d:
        env = C2DEnv(env)

    observation_space = env.observation_space
    action_space = env.action_space

    pol_net = PolNet(observation_space, action_space)
    rnn = False
    # pol_net = PolNetLSTM(observation_space, action_space)
    # rnn = True
    if isinstance(action_space, gym.spaces.Box):
        pol = GaussianPol(observation_space, action_space, pol_net, rnn=rnn)
    elif isinstance(action_space, gym.spaces.Discrete):
        pol = CategoricalPol(observation_space, action_space, pol_net)
    elif isinstance(action_space, gym.spaces.MultiDiscrete):
        pol = MultiCategoricalPol(observation_space, action_space, pol_net)
    else:
        raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

    vf_net = VNet(observation_space)
    vf = DeterministicSVfunc(observation_space, vf_net)

    trainer = TrainManager(Trainer, args.num_trainer, args.master_address,
                           args=args, vf=vf, pol=pol)
    sampler = EpiSampler(env, pol, args.num_parallel, seed=args.seed)

    total_epi = 0
    total_step = 0
    max_rew = -1e6
    start_time = time.time()

    while args.max_epis > total_epi:
        with measure('sample'):
            sampler.set_pol_state(trainer.get_state("pol"))
            epis = sampler.sample(max_steps=args.max_steps_per_iter)
        with measure('train'):
            result_dict = trainer.train(epis=epis)
        step = result_dict["traj_num_step"]
        total_step += step
        total_epi += result_dict["traj_num_epi"]

        rewards = [np.sum(epi['rews']) for epi in epis]
        mean_rew = np.mean(rewards)
        elapsed_time = time.time() - start_time
        logger.record_tabular('ElapsedTime', elapsed_time)
        logger.record_results(args.log, result_dict, score_file,
                              total_epi, step, total_step,
                              rewards, plot_title=args.env_name)

        with measure('save'):
            pol_state = trainer.get_state("pol")
            vf_state = trainer.get_state("vf")
            optim_pol_state = trainer.get_state("optim_pol")
            optim_vf_state = trainer.get_state("optim_vf")

            torch.save(pol_state,
                       os.path.join(args.log, 'models', 'pol_last.pkl'))
            torch.save(vf_state,
                       os.path.join(args.log, 'models', 'vf_last.pkl'))
            torch.save(optim_pol_state,
                       os.path.join(args.log, 'models', 'optim_pol_last.pkl'))
            torch.save(optim_vf_state,
                       os.path.join(args.log, 'models', 'optim_vf_last.pkl'))

            if mean_rew > max_rew:
                torch.save(pol_state,
                           os.path.join(args.log, 'models', 'pol_max.pkl'))
                torch.save(vf_state,
                           os.path.join(args.log, 'models', 'vf_max.pkl'))
                torch.save(optim_pol_state,
                           os.path.join(args.log, 'models', 'optim_pol_max.pkl'))
                torch.save(optim_vf_state,
                           os.path.join(args.log, 'models', 'optim_vf_max.pkl'))
                max_rew = mean_rew

    del sampler
    del trainer
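# --- Hedged sketch (not part of the original script) ---
# How the 'pol_max.pkl' checkpoint written above could be restored for evaluation.
# `log_dir` must match args.log and `pol` the policy construction in main(); this
# helper is illustrative, not part of machina.
def load_max_policy(log_dir, pol):
    state = torch.load(os.path.join(log_dir, 'models', 'pol_max.pkl'),
                       map_location='cpu')
    pol.load_state_dict(state)
    pol.eval()
    return pol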
if args.data_parallel:
    pol.dp_run = True
result_dict = behavior_clone.train(train_traj, pol, optim_pol, args.batch_size)
test_result_dict = behavior_clone.test(test_traj, pol)
if args.data_parallel:
    pol.dp_run = False

result_dict.update(test_result_dict)

if curr_epoch % int(args.check_rate * args.epoch) == 0 or curr_epoch == 0:
    with measure('sample'):
        paths = sampler.sample(pol, max_epis=args.max_epis_per_iter)
    rewards = [np.sum(path['rews']) for path in paths]
    mean_rew = np.mean(rewards)
    logger.record_results_bc(args.log, result_dict, score_file,
                             curr_epoch, rewards,
                             plot_title=args.env_name)
    if mean_rew > max_rew:
        torch.save(pol.state_dict(),
                   os.path.join(args.log, 'models', 'pol_max.pkl'))
        torch.save(optim_pol.state_dict(),
                   os.path.join(args.log, 'models', 'optim_pol_max.pkl'))
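# --- Hedged sketch (not part of the original script) ---
# One way `train_traj` / `test_traj` above could be built from a list of expert
# episodes, reusing only the Traj calls seen in these examples. `expert_epis` and
# the 90/10 split are assumptions for illustration.
import random

random.shuffle(expert_epis)
n_train = int(0.9 * len(expert_epis))
train_traj = Traj(traj_device='cpu')
train_traj.add_epis(expert_epis[:n_train])
train_traj.register_epis()
test_traj = Traj(traj_device='cpu')
test_traj.add_epis(expert_epis[n_train:])
test_traj.register_epis()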
    args.horizon_of_samples, mean_obs, std_obs, mean_acs, std_acs, args.rnn)
optim_dm = torch.optim.Adam(dm_net.parameters(), args.dm_lr)

rl_sampler = EpiSampler(env, mpc_pol, num_parallel=args.num_parallel,
                        seed=args.seed)

# train loop
total_epi = 0
total_step = 0
counter_agg_iters = 0
max_rew = -1e+6
while args.max_epis > total_epi:
    with measure('train model'):
        result_dict = mpc.train_dm(
            traj, dm, optim_dm, epoch=args.epoch_per_iter,
            batch_size=args.batch_size if not args.rnn else args.rnn_batch_size)
    with measure('sample'):
        mpc_pol = MPCPol(ob_space, ac_space, dm.net, rew_func,
                         args.n_samples, args.horizon_of_samples,
                         mean_obs, std_obs, mean_acs, std_acs, args.rnn)
        epis = rl_sampler.sample(mpc_pol, max_epis=args.max_epis_per_iter)

        curr_traj = Traj(traj_device='cpu')
        curr_traj.add_epis(epis)
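# --- Hedged sketch (not part of the original script) ---
# The loop above retrains the dynamics model on `traj`, rebuilds the MPC policy,
# and collects `curr_traj` with it. Presumably the fresh data is aggregated back
# into the training set before the next iteration; a sketch of that step, reusing
# only calls seen in these examples (epi_functional is assumed to be imported as
# in the off-policy example above):
curr_traj = epi_functional.add_next_obs(curr_traj)
curr_traj.register_epis()
traj.add_traj(curr_traj)  # aggregate the new experience into the model dataset
counter_agg_iters += 1
total_epi += curr_traj.num_epi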
ddp_vf, optim_vf = make_model_distributed(
    vf, optim_vf, args.use_apex, args.apex_opt_level,
    args.apex_keep_batchnorm_fp32, args.apex_sync_bn,
    args.apex_loss_scale,
    device_ids=[args.local_rank], output_device=args.local_rank)

total_epi = 0
total_step = 0
max_rew = -1e6
kl_beta = args.init_kl_beta

while args.max_epis > total_epi:
    with measure('sample', log_enable=rank == 0):
        if rank == 0:
            epis = sampler.sample(pol, max_steps=args.max_steps_per_iter)
    with measure('train', log_enable=rank == 0):
        traj = Traj(ddp=True, traj_device="cpu")
        if rank == 0:
            traj.add_epis(epis)
            traj = ef.compute_vs(traj, vf)
            traj = ef.compute_rets(traj, args.gamma)
            traj = ef.compute_advs(traj, args.gamma, args.lam)
            traj = ef.centerize_advs(traj)
            traj = ef.compute_h_masks(traj)
            traj.register_epis()
        traj = tf.sync(traj)
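# --- Hedged sketch (not part of the original script) ---
# ef.compute_rets / ef.compute_advs above correspond to discounted returns and
# GAE(lambda). A minimal single-episode version of the advantage recursion, with
# `rews` and `vs` as 1-D arrays and `last_v` the bootstrap value (0 for a
# terminal state); names are illustrative, not machina's internals.
import numpy as np


def gae_advantages(rews, vs, last_v, gamma, lam):
    advs = np.zeros(len(rews))
    vs_next = np.append(vs[1:], last_v)
    running = 0.0
    for t in reversed(range(len(rews))):
        # TD residual, then exponentially weighted backward accumulation.
        delta = rews[t] + gamma * vs_next[t] - vs[t]
        running = delta + gamma * lam * running
        advs[t] = running
    return advs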
def train(self):
    args = self.args

    # TODO: cuda seems to be broken; not a priority right now.
    # if args.cuda:
    #     # current_obs = current_obs.cuda()
    #     rollouts.cuda()

    self.train_start_time = time.time()
    total_epi = 0
    total_step = 0
    max_rew = -1e6
    sampler = None

    score_file = os.path.join(self.logger.get_logdir(), "progress.csv")
    logger.add_tabular_output(score_file)

    num_total_frames = args.num_total_frames

    mirror_function = None
    if args.mirror_tuples and hasattr(self.env.unwrapped, "mirror_indices"):
        mirror_function = get_mirror_function(
            **self.env.unwrapped.mirror_indices)
        num_total_frames *= 2
        if not args.tanh_finish:
            warnings.warn(
                "When `mirror_tuples` is `True`,"
                " `tanh_finish` should be set to `True` as well."
                " Otherwise there is a chance of the training blowing up.")

    while num_total_frames > total_step:
        # Set up the correct curriculum-learning environment/parameters.
        new_curriculum = self.curriculum_handler(
            total_step / args.num_total_frames)

        if total_step == 0 or new_curriculum:
            if sampler is not None:
                del sampler
            sampler = EpiSampler(
                self.env,
                self.pol,
                num_parallel=self.args.num_processes,
                seed=self.args.seed + total_step,  # TODO: better fix?
            )

        with measure("sample"):
            epis = sampler.sample(
                self.pol, max_steps=args.num_steps * args.num_processes)

        with measure("train"):
            with measure("epis"):
                traj = Traj()
                traj.add_epis(epis)

                traj = ef.compute_vs(traj, self.vf)
                traj = ef.compute_rets(traj, args.decay_gamma)
                traj = ef.compute_advs(traj, args.decay_gamma, args.gae_lambda)
                traj = ef.centerize_advs(traj)
                traj = ef.compute_h_masks(traj)
                traj.register_epis()

                if mirror_function:
                    traj.add_traj(mirror_function(traj))

            # if args.data_parallel:
            #     self.pol.dp_run = True
            #     self.vf.dp_run = True

            result_dict = ppo_clip.train(
                traj=traj,
                pol=self.pol,
                vf=self.vf,
                clip_param=args.clip_eps,
                optim_pol=self.optim_pol,
                optim_vf=self.optim_vf,
                epoch=args.epoch_per_iter,
                batch_size=args.batch_size if not args.rnn else args.rnn_batch_size,
                max_grad_norm=args.max_grad_norm,
            )

            # if args.data_parallel:
            #     self.pol.dp_run = False
            #     self.vf.dp_run = False

        # Append the extra metrics to `result_dict` (reported in progress.csv).
        result_dict.update(self.get_extra_metrics(epis))

        total_epi += traj.num_epi
        step = traj.num_step
        total_step += step
        rewards = [np.sum(epi["rews"]) for epi in epis]
        mean_rew = np.mean(rewards)
        logger.record_results(
            self.logger.get_logdir(),
            result_dict,
            score_file,
            total_epi,
            step,
            total_step,
            rewards,
            plot_title=args.env,
        )

        if mean_rew > max_rew:
            self.save_models("max")
            max_rew = mean_rew

        self.save_models("last")
        self.scheduler_pol.step()
        self.scheduler_vf.step()

        del traj
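# --- Hedged sketch (not part of the original class) ---
# What ppo_clip.train optimizes for the policy, reduced to a single-batch loss in
# plain PyTorch: the clipped surrogate objective. `new_llh` / `old_llh` are
# per-sample action log-likelihoods and `advs` the centered advantages; the names
# and this helper are illustrative, not machina's internals.
import torch


def ppo_clip_policy_loss(new_llh, old_llh, advs, clip_param=0.2):
    ratio = torch.exp(new_llh - old_llh)
    clipped = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param)
    # Negative sign because optimizers minimize while PPO maximizes the surrogate.
    return -torch.mean(torch.min(ratio * advs, clipped * advs))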