from contextlib import contextmanager


@contextmanager
def measure(name, log_enable=True):
    # Used as a context manager (e.g. ``with measure('train'):``), hence the
    # contextlib.contextmanager decoration around the timing generator.
    import time
    s = time.time()
    yield
    e = time.time()
    if log_enable:
        logger.log("{}: {:.4f}sec".format(name, e - s))
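# A minimal usage sketch for the measure() context manager above; slow_step is a
# hypothetical stand-in for any timed block, and the timing line is emitted via
# logger.log only when log_enable is True.
def slow_step():
    import time
    time.sleep(0.1)


with measure('slow_step', log_enable=True):
    slow_step()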
def __init__(self, env, pol, num_parallel=8, prepro=None, seed=256,
             worker_cls=None, node_info={}):
    if not ray.is_initialized():
        logger.log(
            "Ray is not initialized. Initialize ray with no GPU resources")
        init_ray()
    pol = copy.deepcopy(pol)
    pol.to('cpu')
    pol = ray.put(pol)
    env = ray.put(env)

    resources = []
    for k, v in node_info.items():
        for _ in range(v):
            resources.append({k: 1})
    assert len(resources) <= num_parallel
    if len(resources) < num_parallel:
        for _ in range(num_parallel - len(resources)):
            resources.append(None)

    if worker_cls is None:
        worker_cls = DefaultSampleWorker
    self.workers = [
        worker_cls.as_remote(resources=r).remote(pol, env, seed, i, prepro)
        for i, r in zip(range(num_parallel), resources)
    ]
def train(traj,
          qf, lagged_qf, targ_qf1, targ_qf2,
          optim_qf,
          epoch, batch_size,  # optimization hypers
          tau=0.9999, gamma=0.9,  # advantage estimation
          loss_type='mse'
          ):
    """
    Train function for QT-Opt.

    Parameters
    ----------
    traj : Traj
        Off policy trajectory.
    qf : SAVfunction
        Q function.
    lagged_qf : SAVfunction
        Lagged Q function.
    targ_qf1 : CEMSAVfunction
        Target Q function.
    targ_qf2 : CEMSAVfunction
        Lagged Target Q function.
    optim_qf : torch.optim.Optimizer
        Optimizer for Q function.
    epoch : int
        Number of iterations.
    batch_size : int
        Batch size.
    tau : float
        Target update rate.
    gamma : float
        Discount rate.
    loss_type : string
        Type of Bellman loss.

    Returns
    -------
    result_dict : dict
        Dictionary which contains losses information.
    """
    qf_losses = []
    logger.log("Optimizing...")
    iterator = traj.random_batch(batch_size, epoch)
    for batch in iterator:
        qf_bellman_loss = lf.clipped_double_bellman(
            qf, targ_qf1, targ_qf2, batch, gamma, loss_type=loss_type)
        optim_qf.zero_grad()
        qf_bellman_loss.backward()
        optim_qf.step()

        for q, targ_q1 in zip(qf.parameters(), targ_qf1.parameters()):
            targ_q1.detach().copy_((1 - tau) * targ_q1.detach() + tau * q.detach())
        for lagged_q, targ_q2 in zip(lagged_qf.parameters(), targ_qf2.parameters()):
            targ_q2.detach().copy_((1 - tau) * targ_q2.detach() + tau * lagged_q.detach())
        qf_losses.append(qf_bellman_loss.detach().cpu().numpy())

    logger.log("Optimization finished!")

    return {'QfLoss': qf_losses}
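# The target update inside the loop above is the soft (Polyak) update
# targ <- (1 - tau) * targ + tau * online. The same rule as a standalone helper,
# shown only as an illustrative sketch (soft_update is not part of this codebase):
def soft_update(online_net, targ_net, tau):
    """In-place Polyak averaging of target parameters toward the online parameters."""
    for p, targ_p in zip(online_net.parameters(), targ_net.parameters()):
        targ_p.detach().copy_((1 - tau) * targ_p.detach() + tau * p.detach())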
def train_dm(traj, dyn_model, optim_dm, epoch=60, batch_size=512,
             target='next_obs', td=True, num_epi_per_seq=1, log_enable=True):
    """
    Train function for dynamics model.

    Parameters
    ----------
    traj : Traj
        On policy trajectory.
    dyn_model : Model
        Dynamics model.
    optim_dm : torch.optim.Optimizer
        Optimizer for dynamics model.
    epoch : int
        Number of iterations.
    batch_size : int
        Batch size.
    target : str
        Target of prediction, either next_obs or rews.
    td : bool
        If True, dyn_model learns the temporal difference of the target.
    num_epi_per_seq : int
        Number of episodes in one sequence for rnn.
    log_enable : bool
        If True, enable logging.

    Returns
    -------
    result_dict : dict
        Dictionary which contains losses information.
    """
    dm_losses = []
    if log_enable:
        logger.log("Optimizing...")
    batch_size = min(batch_size, traj.num_epi)
    if dyn_model.rnn:
        iterator = traj.random_batch_rnn(batch_size=batch_size, epoch=epoch)
    else:
        iterator = traj.random_batch(batch_size, epoch)
    for batch in iterator:
        dm_loss = update_dm(dyn_model, optim_dm, batch, target=target, td=td)
        dm_losses.append(dm_loss)
    if log_enable:
        logger.log("Optimization finished!")

    return dict(DynModelLoss=dm_losses)
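# Rough usage sketch for train_dm above; dyn_model, optim_dm and epis are assumed
# to exist. When target='next_obs', next_obs must already be present in the
# trajectory, e.g. via ef.add_next_obs as used in the scripts later in this section.
traj = Traj()
traj.add_epis(epis)
traj = ef.add_next_obs(traj)
traj.register_epis()
result_dict = train_dm(traj, dyn_model, optim_dm,
                       epoch=60, batch_size=512, target='next_obs', td=True)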
def train(traj,
          pol, targ_pol, qf, targ_qf,
          optim_pol, optim_qf,
          epoch, batch_size,  # optimization hypers
          tau, gamma,
          log_enable=True,
          ):
    pol_losses = []
    qf_losses = []
    if log_enable:
        logger.log("Optimizing...")
    for batch, indices in traj.prioritized_random_batch(batch_size, epoch,
                                                        return_indices=True):
        qf_bellman_loss = lf.bellman(
            qf, targ_qf, targ_pol, batch, gamma, reduction='none')
        td_loss = torch.sqrt(qf_bellman_loss * 2)
        qf_bellman_loss = torch.mean(qf_bellman_loss)
        optim_qf.zero_grad()
        qf_bellman_loss.backward()
        optim_qf.step()

        pol_loss = lf.ag(pol, qf, batch)
        optim_pol.zero_grad()
        pol_loss.backward()
        optim_pol.step()

        for p, targ_p in zip(pol.parameters(), targ_pol.parameters()):
            targ_p.detach().copy_((1 - tau) * targ_p.detach() + tau * p.detach())
        for q, targ_q in zip(qf.parameters(), targ_qf.parameters()):
            targ_q.detach().copy_((1 - tau) * targ_q.detach() + tau * q.detach())
        qf_losses.append(qf_bellman_loss.detach().cpu().numpy())
        pol_losses.append(pol_loss.detach().cpu().numpy())

        traj = tf.update_pris(traj, td_loss, indices)
    if log_enable:
        logger.log("Optimization finished!")

    return {'PolLoss': pol_losses, 'QfLoss': qf_losses}
def update_pol(pol, batch, make_kl=make_kl, max_kl=0.01, damping=0.1,
               num_cg=10, ent_beta=0):
    pol_loss = lf.pg(pol, batch, ent_beta)
    grads = torch.autograd.grad(pol_loss, pol.parameters(), create_graph=True)
    grads = [g.contiguous() for g in grads]
    flat_pol_loss_grad = nn.utils.parameters_to_vector(grads).detach()

    def Fvp(v):
        kl = make_kl(pol, batch)
        kl = torch.mean(kl)

        grads = torch.autograd.grad(kl, pol.parameters(), create_graph=True)
        grads = [g.contiguous() for g in grads]
        flat_grad_kl = nn.utils.parameters_to_vector(grads)

        gvp = torch.sum(flat_grad_kl * v)
        grads = torch.autograd.grad(gvp, pol.parameters())
        grads = [g.contiguous() for g in grads]
        fvp = nn.utils.parameters_to_vector(grads).detach()

        return fvp + v * damping

    stepdir = conjugate_gradients(Fvp, -flat_pol_loss_grad, num_cg)

    shs = 0.5 * torch.sum(stepdir * Fvp(stepdir), 0, keepdim=True)
    if (shs < 0).any():
        logger.log('invalid shs')
        return pol_loss.data.cpu().numpy()
    lm = torch.sqrt(shs / max_kl)
    fullstep = stepdir / lm[0]

    neggdotstepdir = torch.sum(-flat_pol_loss_grad * stepdir, 0, keepdim=True)

    prev_params = nn.utils.parameters_to_vector(
        [p.contiguous() for p in pol.parameters()]).detach()
    success, new_params = linesearch(pol, batch, lf.pg, prev_params, fullstep,
                                     neggdotstepdir / lm[0], ent_beta=ent_beta)
    nn.utils.vector_to_parameters(new_params, pol.parameters())

    return pol_loss.detach().cpu().numpy()
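# update_pol above depends on a conjugate_gradients helper that is not shown in
# this section. Below is a standard CG solver for Fvp(x) = b of the kind commonly
# used in TRPO implementations, given as an assumed sketch rather than this
# repository's exact code.
def conjugate_gradients(Avp, b, nsteps, residual_tol=1e-10):
    x = torch.zeros_like(b)
    r = b.clone()          # residual b - A x, with x initialized to zero
    p = b.clone()          # search direction
    rdotr = torch.dot(r, r)
    for _ in range(nsteps):
        Ap = Avp(p)
        alpha = rdotr / torch.dot(p, Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = torch.dot(r, r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x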
def __init__(self, env, record_video=False, video_schedule=None,
             log_dir=None, force_reset=False):
    if isinstance(env, str):
        env = gym.envs.make(env)
    self.env = env
    if hasattr(env, 'original_env'):
        self.original_env = env.original_env
    else:
        self.original_env = env
    if self.env.spec is not None:
        self.env_id = env.spec.id
    else:
        self.env_id = None

    if log_dir is None:
        self.monitoring = False
    else:
        if not record_video:
            video_schedule = NoVideoSchedule()
        else:
            if video_schedule is None:
                video_schedule = CappedCubicVideoSchedule()
        self.env = gym.wrappers.Monitor(self.env, log_dir,
                                        video_callable=video_schedule,
                                        force=True)
        self.monitoring = True

    self.observation_space = env.observation_space
    logger.log("observation space: {}".format(self.observation_space))
    self.action_space = env.action_space
    logger.log("action space: {}".format(self.action_space))
    if self.env.spec is not None:
        self._horizon = env.spec.tags[
            'wrapper_config.TimeLimit.max_episode_steps']
    else:
        self._horizon = None
    self._log_dir = log_dir
    self._force_reset = force_reset
def train(traj, student_pol, teacher_pol, student_optim,
          epoch, batchsize, num_epi_per_seq=1):
    s_pol_losses = []
    logger.log("Optimizing...")
    iterator = traj.iterate(batchsize, epoch) if not student_pol.rnn \
        else traj.iterate_rnn(batchsize=batchsize,
                              num_epi_per_seq=num_epi_per_seq, epoch=epoch)
    for batch in iterator:
        s_pol_loss = update_pol(student_pol=student_pol,
                                teacher_pol=teacher_pol,
                                optim_pol=student_optim, batch=batch)
        s_pol_losses.append(s_pol_loss)
    logger.log('Optimization finished')

    return dict(S_Pol_loss=s_pol_losses)
def train(traj,
          pol, qfs, targ_qfs, log_alpha,
          optim_pol, optim_qfs, optim_alpha,
          epoch, batch_size,  # optimization hypers
          tau, gamma, sampling,
          reparam=True,
          log_enable=True,
          max_grad_norm=0.5,
          ):
    """
    Train function for soft actor critic.

    Parameters
    ----------
    traj : Traj
        Off policy trajectory.
    pol : Pol
        Policy.
    qfs : list of SAVfunction
        Q function.
    targ_qfs : list of SAVfunction
        Target Q function.
    log_alpha : torch.Tensor
        Temperature parameter of entropy.
    optim_pol : torch.optim.Optimizer
        Optimizer for Policy.
    optim_qfs : list of torch.optim.Optimizer
        Optimizer for Q function.
    optim_alpha : torch.optim.Optimizer
        Optimizer for alpha.
    epoch : int
        Number of iterations.
    batch_size : int
        Batch size.
    tau : float
        Target update rate.
    gamma : float
        Discount rate.
    sampling : int
        Number of samples used in calculating the expectation.
    reparam : bool
        If True, use the reparameterization trick.
    log_enable : bool
        If True, enable logging.
    max_grad_norm : float
        Maximum gradient norm.

    Returns
    -------
    result_dict : dict
        Dictionary which contains losses information.
    """
    pol_losses = []
    _qf_losses = []
    alpha_losses = []
    if log_enable:
        logger.log("Optimizing...")
    for batch in traj.random_batch(batch_size, epoch):
        pol_loss, qf_losses, alpha_loss = lf.sac(
            pol, qfs, targ_qfs, log_alpha, batch, gamma, sampling, reparam)

        optim_pol.zero_grad()
        pol_loss.backward()
        torch.nn.utils.clip_grad_norm_(pol.parameters(), max_grad_norm)
        optim_pol.step()

        for qf, optim_qf, qf_loss in zip(qfs, optim_qfs, qf_losses):
            optim_qf.zero_grad()
            qf_loss.backward()
            torch.nn.utils.clip_grad_norm_(qf.parameters(), max_grad_norm)
            optim_qf.step()

        optim_alpha.zero_grad()
        alpha_loss.backward()
        optim_alpha.step()

        for qf, targ_qf in zip(qfs, targ_qfs):
            for q, targ_q in zip(qf.parameters(), targ_qf.parameters()):
                targ_q.detach().copy_((1 - tau) * targ_q.detach() + tau * q.detach())

        pol_losses.append(pol_loss.detach().cpu().numpy())
        _qf_losses.append(
            (sum(qf_losses) / len(qf_losses)).detach().cpu().numpy())
        alpha_losses.append(alpha_loss.detach().cpu().numpy())
    if log_enable:
        logger.log("Optimization finished!")

    return dict(
        PolLoss=pol_losses,
        QfLoss=_qf_losses,
        AlphaLoss=alpha_losses,
    )
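# Hedged usage sketch of the SAC train() above inside an off-policy loop; pol, qfs,
# targ_qfs, optim_pol, optim_qfs and off_traj are hypothetical objects built
# elsewhere. log_alpha is the learnable entropy temperature expected by train().
log_alpha = torch.zeros((), requires_grad=True)
optim_alpha = torch.optim.Adam([log_alpha], 1e-4)
result_dict = train(off_traj, pol, qfs, targ_qfs, log_alpha,
                    optim_pol, optim_qfs, optim_alpha,
                    epoch=100, batch_size=256,
                    tau=0.005, gamma=0.99, sampling=1)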
def train(traj,
          pol, targ_pol, qfs, targ_qfs,
          optim_pol, optim_qfs,
          epoch, batch_size,  # optimization hypers
          tau, gamma,  # advantage estimation
          pol_update=True,
          log_enable=True,
          max_grad_norm=0.5,
          target_policy_smoothing_func=None,
          ):
    pol_losses = []
    _qf_losses = []
    if log_enable:
        logger.log("Optimizing...")
    for batch in traj.random_batch(batch_size, epoch):
        if target_policy_smoothing_func is not None:
            qf_losses = lf.td3(
                qfs, targ_qfs, targ_pol, batch, gamma,
                continuous=True, deterministic=True, sampling=1,
                target_policy_smoothing_func=target_policy_smoothing_func)
        else:
            qf_losses = lf.td3(qfs, targ_qfs, targ_pol, batch, gamma,
                               continuous=True, deterministic=True, sampling=1)
        for qf, optim_qf, qf_loss in zip(qfs, optim_qfs, qf_losses):
            optim_qf.zero_grad()
            qf_loss.backward()
            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(qf.parameters(), max_grad_norm)
            optim_qf.step()
        _qf_losses.append(
            (sum(qf_losses) / len(qf_losses)).detach().cpu().numpy())

        if pol_update:
            pol_loss = lf.ag(pol, qfs[0], batch, no_noise=True)
            optim_pol.zero_grad()
            pol_loss.backward()
            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(pol.parameters(), max_grad_norm)
            optim_pol.step()

            for p, targ_p in zip(pol.parameters(), targ_pol.parameters()):
                targ_p.detach().copy_((1 - tau) * targ_p.detach() + tau * p.detach())
            for qf, targ_qf in zip(qfs, targ_qfs):
                for q, targ_q in zip(qf.parameters(), targ_qf.parameters()):
                    targ_q.detach().copy_((1 - tau) * targ_q.detach() + tau * q.detach())
            pol_losses.append(pol_loss.detach().cpu().numpy())
    if log_enable:
        logger.log("Optimization finished!")

    if pol_update:
        return dict(
            PolLoss=pol_losses,
            QfLoss=_qf_losses,
        )
    else:
        return dict(QfLoss=_qf_losses)
device_name = 'cpu' if args.cuda < 0 else "cuda:{}".format(args.cuda)
device = torch.device(device_name)
set_device(device)

score_file = os.path.join(args.log, 'progress.csv')
logger.add_tabular_output(score_file)

env, center_env = create_env(args)

ob_space = env.observation_space
ac_space = env.action_space

pol_net = PolNetSNAILConstant(ob_space, ac_space, args.timestep, args.num_channels,
                              num_keys=args.num_keys, num_tc_fils=args.num_tc_fils,
                              no_attention=args.no_attention, use_pe=args.use_pe)
logger.log(str(len(torch.nn.utils.parameters_to_vector(pol_net.parameters()))))
if isinstance(ac_space, gym.spaces.Box):
    pol = GaussianPol(ob_space, ac_space, pol_net,
                      data_parallel=args.data_parallel)
elif isinstance(ac_space, gym.spaces.Discrete):
    pol = CategoricalPol(ob_space, ac_space, pol_net,
                         data_parallel=args.data_parallel)
elif isinstance(ac_space, gym.spaces.MultiDiscrete):
    pol = MultiCategoricalPol(ob_space, ac_space, pol_net,
                              data_parallel=args.data_parallel)
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

if args.pol:
    pol.load_state_dict(torch.load(
        args.pol, map_location=lambda storage, loc: storage))

vf_net = VNetSNAILConstant(ob_space, args.timestep, args.num_channels,
                           num_keys=args.num_keys, num_tc_fils=args.num_tc_fils,
                           no_attention=args.no_attention, use_pe=args.use_pe)
vf = DeterministicSVfunc(ob_space, vf_net, data_parallel=args.data_parallel)
def train(traj,
          pol, vf,
          optim_vf,
          epoch=5, batch_size=64, num_epi_per_seq=1,  # optimization hypers
          max_kl=0.01, num_cg=10, damping=0.1,
          ent_beta=0):
    """
    Train function for trust region policy optimization.

    Parameters
    ----------
    traj : Traj
        On policy trajectory.
    pol : Pol
        Policy.
    vf : SVfunction
        V function.
    optim_vf : torch.optim.Optimizer
        Optimizer for V function.
    epoch : int
        Number of iterations.
    batch_size : int
        Batch size.
    num_epi_per_seq : int
        Number of episodes in one sequence for rnn.
    max_kl : float
        Limit of KL divergence.
    num_cg : int
        Number of iterations in conjugate gradient computation.
    damping : float
        Damping parameter for Hessian Vector Product.
    ent_beta : float
        Entropy coefficient.

    Returns
    -------
    result_dict : dict
        Dictionary which contains losses information.
    """
    pol_losses = []
    vf_losses = []
    logger.log("Optimizing...")

    iterator = traj.full_batch(1) if not pol.rnn else traj.iterate_rnn(
        batch_size=traj.num_epi)
    for batch in iterator:
        pol_loss = update_pol(pol, batch, max_kl=max_kl, num_cg=num_cg,
                              damping=damping, ent_beta=ent_beta)
        pol_losses.append(pol_loss)

    iterator = traj.iterate(batch_size, epoch) if not pol.rnn else traj.iterate_rnn(
        batch_size=batch_size, num_epi_per_seq=num_epi_per_seq, epoch=epoch)
    for batch in iterator:
        vf_loss = update_vf(vf, optim_vf, batch)
        vf_losses.append(vf_loss)

    logger.log("Optimization finished!")

    return dict(PolLoss=pol_losses, VfLoss=vf_losses)
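# Minimal on-policy usage sketch for the TRPO train() above; epis comes from a
# sampler as in the scripts later in this section, and the advantage/return
# preprocessing that TRPO expects on traj is assumed to have been applied and is
# elided here.
traj = Traj()
traj.add_epis(epis)
traj.register_epis()
result_dict = train(traj, pol, vf, optim_vf,
                    epoch=5, batch_size=64,
                    max_kl=0.01, num_cg=10, damping=0.1)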
def train(traj,
          pol, qfs, targ_qfs, log_alpha,
          optim_pol, optim_qfs, optim_alpha,
          epoch, batch_size, seq_length, burn_in_length,  # optimization hypers
          tau, gamma, sampling,
          reparam=True,
          log_enable=True,
          ):
    """
    Train function for soft actor critic.

    Parameters
    ----------
    traj : Traj
        Off policy trajectory.
    pol : Pol
        Policy.
    qfs : list of SAVfunction
        Q function.
    targ_qfs : list of SAVfunction
        Target Q function.
    log_alpha : torch.Tensor
        Temperature parameter of entropy.
    optim_pol : torch.optim.Optimizer
        Optimizer for Policy.
    optim_qfs : list of torch.optim.Optimizer
        Optimizer for Q function.
    optim_alpha : torch.optim.Optimizer
        Optimizer for alpha.
    epoch : int
        Number of iterations.
    batch_size : int
        Batch size.
    seq_length : int
        Length of each sampled sequence.
    burn_in_length : int
        Length of the burn-in prefix of each sequence.
    tau : float
        Target update rate.
    gamma : float
        Discount rate.
    sampling : int
        Number of samples used in calculating the expectation.
    reparam : bool
        If True, use the reparameterization trick.
    log_enable : bool
        If True, enable logging.

    Returns
    -------
    result_dict : dict
        Dictionary which contains losses information.
    """
    pol_losses = []
    _qf_losses = []
    alpha_losses = []
    if log_enable:
        logger.log("Optimizing...")
    for batch, start_indices in traj.prioritized_random_batch_rnn(
            batch_size, seq_length, epoch, return_indices=True):
        batch, pol_loss, qf_losses, alpha_loss, td_losses = lf.r2d2_sac(
            pol, qfs, targ_qfs, log_alpha, batch, gamma, sampling,
            burn_in_length, reparam)

        optim_pol.zero_grad()
        pol_loss.backward()
        optim_pol.step()

        for optim_qf, qf_loss in zip(optim_qfs, qf_losses):
            optim_qf.zero_grad()
            qf_loss.backward()
            optim_qf.step()

        optim_alpha.zero_grad()
        alpha_loss.backward()
        optim_alpha.step()

        for qf, targ_qf in zip(qfs, targ_qfs):
            for q, targ_q in zip(qf.parameters(), targ_qf.parameters()):
                targ_q.detach().copy_((1 - tau) * targ_q.detach() + tau * q.detach())

        pol_losses.append(pol_loss.detach().cpu().numpy())
        _qf_losses.append(
            (sum(qf_losses) / len(qf_losses)).detach().cpu().numpy())
        alpha_losses.append(alpha_loss.detach().cpu().numpy())

        # update seq_pris
        train_length = seq_length - burn_in_length
        for i in range(batch_size):
            start = start_indices[i] + burn_in_length
            seq_indices = torch.arange(start, start + train_length - 1)
            traj = tf.update_pris(
                traj, td_losses[:, i], seq_indices,
                update_epi_pris=True, seq_length=seq_length)
    if log_enable:
        logger.log("Optimization finished!")

    return dict(
        PolLoss=pol_losses,
        QfLoss=_qf_losses,
        AlphaLoss=alpha_losses,
    )
def train(agent_traj, expert_traj,
          pol, vf,
          optim_vf, optim_discim,
          rewf=None, shaping_vf=None, advf=None,
          rew_type='rew', rl_type='trpo',
          pol_ent_beta=0, discrim_ent_beta=0,
          epoch=1, batch_size=64, discrim_batch_size=32,
          num_epi_per_seq=1, discrim_step=1,  # optimization hypers
          damping=0.1, max_kl=0.01, num_cg=10,  # trpo hypers
          optim_pol=None, clip_param=0.2, max_grad_norm=0.5,
          clip_vfunc=False, kl_beta=1, kl_targ=0.01,  # ppo hypers
          gamma=0.995):
    pol_losses = []
    vf_losses = []
    discrim_losses = []
    logger.log("Optimizing...")

    if rl_type == 'trpo':
        iterator = agent_traj.full_batch(1) if not pol.rnn \
            else agent_traj.iterate_rnn(batch_size=agent_traj.num_epi)
        for batch in iterator:
            pol_loss = trpo.update_pol(pol, batch, max_kl=max_kl,
                                       num_cg=num_cg, damping=damping,
                                       ent_beta=pol_ent_beta)
            pol_losses.append(pol_loss)

        iterator = agent_traj.iterate(batch_size, epoch) if not pol.rnn \
            else agent_traj.iterate_rnn(batch_size=batch_size,
                                        num_epi_per_seq=num_epi_per_seq,
                                        epoch=epoch)
        for batch in iterator:
            vf_loss = trpo.update_vf(vf, optim_vf, batch)
            vf_losses.append(vf_loss)
        new_kl_beta = 0
        kl_mean = 0
    elif rl_type == 'ppo_clip':
        iterator = agent_traj.iterate(batch_size, epoch) if not pol.rnn \
            else agent_traj.iterate_rnn(batch_size=batch_size,
                                        num_epi_per_seq=num_epi_per_seq,
                                        epoch=epoch)
        for batch in iterator:
            pol_loss = ppo_clip.update_pol(pol, optim_pol, batch, clip_param,
                                           pol_ent_beta, max_grad_norm)
            vf_loss = ppo_clip.update_vf(vf, optim_vf, batch, clip_param,
                                         clip_vfunc, max_grad_norm)
            pol_losses.append(pol_loss)
            vf_losses.append(vf_loss)
        new_kl_beta = 0
        kl_mean = 0
    elif rl_type == 'ppo_kl':
        iterator = agent_traj.iterate(batch_size, epoch) if not pol.rnn \
            else agent_traj.iterate_rnn(batch_size=batch_size,
                                        num_epi_per_seq=num_epi_per_seq,
                                        epoch=epoch)
        for batch in iterator:
            pol_loss = ppo_kl.update_pol(pol, optim_pol, batch, kl_beta,
                                         max_grad_norm, pol_ent_beta)
            vf_loss = ppo_kl.update_vf(vf, optim_vf, batch)
            pol_losses.append(pol_loss)
            vf_losses.append(vf_loss)

        iterator = agent_traj.full_batch(1) if not pol.rnn \
            else agent_traj.iterate_rnn(batch_size=agent_traj.num_epi)
        batch = next(iterator)
        with torch.no_grad():
            pol.reset()
            if pol.rnn:
                _, _, pd_params = pol(batch['obs'], h_masks=batch['h_masks'])
            else:
                _, _, pd_params = pol(batch['obs'])
            kl_mean = torch.mean(pol.pd.kl_pq(batch, pd_params)).item()
        if kl_mean > 1.3 * kl_targ:
            new_kl_beta = 1.5 * kl_beta
        elif kl_mean < 0.7 * kl_targ:
            new_kl_beta = kl_beta / 1.5
        else:
            new_kl_beta = kl_beta
    else:
        raise ValueError('Only trpo, ppo_clip and ppo_kl are supported')

    agent_iterator = agent_traj.iterate_step(batch_size=discrim_batch_size,
                                             step=discrim_step)
    expert_iterator = expert_traj.iterate_step(batch_size=discrim_batch_size,
                                               step=discrim_step)
    for agent_batch, expert_batch in zip(agent_iterator, expert_iterator):
        discrim_loss = update_discrim(rewf, shaping_vf, advf, pol, rew_type,
                                      optim_discim, agent_batch, expert_batch,
                                      gamma)
        discrim_losses.append(discrim_loss)

    logger.log("Optimization finished!")

    return dict(PolLoss=pol_losses, VfLoss=vf_losses,
                DiscrimLoss=discrim_losses,
                new_kl_beta=new_kl_beta, kl_mean=kl_mean)
def train(traj,
          pol, targ_pol, qf, targ_qf,
          optim_pol, optim_qf,
          epoch, batch_size,  # optimization hypers
          tau, gamma  # advantage estimation
          ):
    """
    Train function for deep deterministic policy gradient.

    Parameters
    ----------
    traj : Traj
        Off policy trajectory.
    pol : Pol
        Policy.
    targ_pol : Pol
        Target Policy.
    qf : SAVfunction
        Q function.
    targ_qf : SAVfunction
        Target Q function.
    optim_pol : torch.optim.Optimizer
        Optimizer for Policy.
    optim_qf : torch.optim.Optimizer
        Optimizer for Q function.
    epoch : int
        Number of iterations.
    batch_size : int
        Batch size.
    tau : float
        Target update rate.
    gamma : float
        Discount rate.

    Returns
    -------
    result_dict : dict
        Dictionary which contains losses information.
    """
    pol_losses = []
    qf_losses = []
    logger.log("Optimizing...")
    for batch in traj.random_batch(batch_size, epoch):
        qf_bellman_loss = lf.bellman(qf, targ_qf, targ_pol, batch, gamma)
        optim_qf.zero_grad()
        qf_bellman_loss.backward()
        optim_qf.step()

        pol_loss = lf.ag(pol, qf, batch, no_noise=True)
        optim_pol.zero_grad()
        pol_loss.backward()
        optim_pol.step()

        for p, targ_p in zip(pol.parameters(), targ_pol.parameters()):
            targ_p.detach().copy_((1 - tau) * targ_p.detach() + tau * p.detach())
        for q, targ_q in zip(qf.parameters(), targ_qf.parameters()):
            targ_q.detach().copy_((1 - tau) * targ_q.detach() + tau * q.detach())
        qf_losses.append(qf_bellman_loss.detach().cpu().numpy())
        pol_losses.append(pol_loss.detach().cpu().numpy())

    logger.log("Optimization finished!")

    return {'PolLoss': pol_losses, 'QfLoss': qf_losses}
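# Sketch of the setup commonly used with the DDPG train() above: target networks
# are deep copies of the online networks and change only through the soft updates
# inside train(). pol, qf and off_traj are hypothetical objects; this is an assumed
# setup, not code taken from the repository.
targ_pol = copy.deepcopy(pol)
targ_qf = copy.deepcopy(qf)
optim_pol = torch.optim.Adam(pol.parameters(), 1e-4)
optim_qf = torch.optim.Adam(qf.parameters(), 1e-3)
result_dict = train(off_traj, pol, targ_pol, qf, targ_qf,
                    optim_pol, optim_qf,
                    epoch=100, batch_size=256, tau=0.005, gamma=0.99)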
def train(traj,
          pol, targ_pol, qf, targ_qf,
          optim_pol, optim_qf,
          epoch, batch_size,  # optimization hypers
          tau, gamma,  # advantage estimation
          sampling,
          log_enable=True,
          ):
    """
    Train function for deep deterministic policy gradient.

    Parameters
    ----------
    traj : Traj
        Off policy trajectory.
    pol : Pol
        Policy.
    targ_pol : Pol
        Target Policy.
    qf : SAVfunction
        Q function.
    targ_qf : SAVfunction
        Target Q function.
    optim_pol : torch.optim.Optimizer
        Optimizer for Policy.
    optim_qf : torch.optim.Optimizer
        Optimizer for Q function.
    epoch : int
        Number of iterations.
    batch_size : int
        Batch size.
    tau : float
        Target update rate.
    gamma : float
        Discount rate.
    sampling : int
        Number of samples used in calculating the expectation.
    log_enable : bool
        If True, enable logging.

    Returns
    -------
    result_dict : dict
        Dictionary which contains losses information.
    """
    pol_losses = []
    qf_losses = []
    if log_enable:
        logger.log("Optimizing...")
    for batch in traj.iterate(batch_size, epoch):
        qf_bellman_loss = lf.bellman(
            qf, targ_qf, targ_pol, batch, gamma, sampling=sampling)
        optim_qf.zero_grad()
        qf_bellman_loss.backward()
        optim_qf.step()

        pol_loss = lf.ag(pol, qf, batch, sampling)
        optim_pol.zero_grad()
        pol_loss.backward()
        optim_pol.step()

        for q, targ_q, p, targ_p in zip(qf.parameters(), targ_qf.parameters(),
                                        pol.parameters(), targ_pol.parameters()):
            targ_p.detach().copy_((1 - tau) * targ_p.detach() + tau * p.detach())
            targ_q.detach().copy_((1 - tau) * targ_q.detach() + tau * q.detach())
        qf_losses.append(qf_bellman_loss.detach().cpu().numpy())
        pol_losses.append(pol_loss.detach().cpu().numpy())
    if log_enable:
        logger.log("Optimization finished!")

    return dict(
        PolLoss=pol_losses,
        QfLoss=qf_losses,
    )
def train(traj,
          pol, qfs, targ_qfs, log_alpha,
          optim_pol, optim_qfs, optim_alpha,
          epoch, batch_size,  # optimization hypers
          tau, gamma, sampling,
          discrim, num_skill,
          reparam=True):
    """
    Train function for soft actor critic.

    Parameters
    ----------
    traj : Traj
        Off policy trajectory.
    pol : Pol
        Policy.
    qfs : list of SAVfunction
        Q function.
    targ_qfs : list of SAVfunction
        Target Q function.
    log_alpha : torch.Tensor
        Temperature parameter of entropy.
    optim_pol : torch.optim.Optimizer
        Optimizer for Policy.
    optim_qfs : list of torch.optim.Optimizer
        Optimizer for Q function.
    optim_alpha : torch.optim.Optimizer
        Optimizer for alpha.
    epoch : int
        Number of iterations.
    batch_size : int
        Batch size.
    tau : float
        Target update rate.
    gamma : float
        Discount rate.
    sampling : int
        Number of samples used in calculating the expectation.
    discrim : SVfunction
        Discriminator.
    num_skill : int
        The number of skills.
    reparam : bool
        If True, use the reparameterization trick.

    Returns
    -------
    result_dict : dict
        Dictionary which contains losses information.
    """
    pol_losses = []
    _qf_losses = []
    alpha_losses = []
    logger.log("Optimizing...")
    for batch in traj.random_batch(batch_size, epoch):
        with torch.no_grad():
            rews, info = calc_rewards(batch['obs'], num_skill, discrim)
            batch['rews'] = rews

        pol_loss, qf_losses, alpha_loss = lf.sac(
            pol, qfs, targ_qfs, log_alpha, batch, gamma, sampling, reparam)

        optim_pol.zero_grad()
        pol_loss.backward()
        optim_pol.step()

        for optim_qf, qf_loss in zip(optim_qfs, qf_losses):
            optim_qf.zero_grad()
            qf_loss.backward()
            optim_qf.step()

        optim_alpha.zero_grad()
        alpha_loss.backward()
        optim_alpha.step()

        for qf, targ_qf in zip(qfs, targ_qfs):
            for q, targ_q in zip(qf.parameters(), targ_qf.parameters()):
                targ_q.detach().copy_((1 - tau) * targ_q.detach() + tau * q.detach())

        pol_losses.append(pol_loss.detach().cpu().numpy())
        _qf_losses.append(
            (sum(qf_losses) / len(qf_losses)).detach().cpu().numpy())
        alpha_losses.append(alpha_loss.detach().cpu().numpy())
    logger.log("Optimization finished!")

    return dict(PolLoss=pol_losses, QfLoss=_qf_losses, AlphaLoss=alpha_losses)
from contextlib import contextmanager


@contextmanager
def measure(name):
    # Timing context manager (same pattern as the variant with log_enable above).
    import time
    s = time.time()
    yield
    e = time.time()
    logger.log("{}: {:.4f}sec".format(name, e - s))
def train(traj,
          pol, vf,
          kl_beta, kl_targ,
          optim_pol, optim_vf,
          epoch, batch_size, max_grad_norm,
          num_epi_per_seq=1, ent_beta=0,  # optimization hypers
          log_enable=True,
          ):
    """
    Train function for proximal policy optimization (kl).

    Parameters
    ----------
    traj : Traj
        On policy trajectory.
    pol : Pol
        Policy.
    vf : SVfunction
        V function.
    kl_beta : float
        KL divergence coefficient.
    kl_targ : float
        Target of KL divergence.
    optim_pol : torch.optim.Optimizer
        Optimizer for Policy.
    optim_vf : torch.optim.Optimizer
        Optimizer for V function.
    epoch : int
        Number of iterations.
    batch_size : int
        Batch size.
    max_grad_norm : float
        Maximum gradient norm.
    num_epi_per_seq : int
        Number of episodes in one sequence for rnn.
    ent_beta : float
        Entropy coefficient.
    log_enable : bool
        If True, enable logging.

    Returns
    -------
    result_dict : dict
        Dictionary which contains losses information.
    """
    pol_losses = []
    vf_losses = []
    if log_enable:
        logger.log("Optimizing...")
    iterator = traj.iterate(batch_size, epoch) if not pol.rnn else traj.iterate_rnn(
        batch_size=batch_size, num_epi_per_seq=num_epi_per_seq, epoch=epoch)
    for batch in iterator:
        pol_loss = update_pol(pol, optim_pol, batch, kl_beta,
                              max_grad_norm, ent_beta)
        vf_loss = update_vf(vf, optim_vf, batch)
        pol_losses.append(pol_loss)
        vf_losses.append(vf_loss)

    iterator = traj.full_batch(1) if not pol.rnn else traj.iterate_rnn(
        batch_size=traj.num_epi)
    batch = next(iterator)
    with torch.no_grad():
        pol.reset()
        if pol.rnn:
            _, _, pd_params = pol(batch['obs'], h_masks=batch['h_masks'])
        else:
            _, _, pd_params = pol(batch['obs'])
        kl_mean = torch.mean(pol.pd.kl_pq(batch, pd_params)).item()
    if kl_mean > 1.3 * kl_targ:
        new_kl_beta = 1.5 * kl_beta
    elif kl_mean < 0.7 * kl_targ:
        new_kl_beta = kl_beta / 1.5
    else:
        new_kl_beta = kl_beta
    if log_enable:
        logger.log("Optimization finished!")

    return dict(PolLoss=pol_losses, VfLoss=vf_losses,
                new_kl_beta=new_kl_beta, kl_mean=kl_mean)
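# Usage sketch showing how the adaptive KL coefficient returned by the PPO-KL
# train() above feeds the next iteration; the sampling/traj construction and
# num_iterations are assumed and elided.
kl_beta = args.init_kl_beta
for _ in range(num_iterations):
    # ... sample epis and build traj ...
    result_dict = train(traj, pol, vf, kl_beta, kl_targ=0.01,
                        optim_pol=optim_pol, optim_vf=optim_vf,
                        epoch=10, batch_size=64, max_grad_norm=0.5)
    kl_beta = result_dict['new_kl_beta']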
epis = student_sampler.sample(s_pol, max_epis=args.max_epis_per_iter)

with measure('train'):
    traj = Traj()
    traj.add_epis(epis)
    traj = ef.compute_h_masks(traj)
    traj.register_epis()

    result_dict = on_pol_teacher_distill.train(
        traj=traj,
        student_pol=s_pol,
        teacher_pol=t_pol,
        student_optim=optim_pol,
        epoch=args.epoch_per_iter,
        batchsize=args.batch_size)

logger.log('Testing Student-policy')
with measure('sample'):
    epis_measure = student_sampler.sample(
        s_pol, max_epis=args.max_epis_per_iter)

with measure('measure'):
    traj_measure = Traj()
    traj_measure.add_epis(epis_measure)
    traj_measure = ef.compute_h_masks(traj_measure)
    traj_measure.register_epis()

    total_epi += traj_measure.num_epi
    step = traj_measure.num_step
    total_step += step
    rewards = [np.sum(epi['rews']) for epi in epis_measure]
    mean_rew = np.mean(rewards)
if args.rnn:
    pol_net = PolNetLSTM(ob_space, ac_space, h_size=256, cell_size=256)
else:
    pol_net = PolNet(ob_space, ac_space)

if isinstance(ac_space, gym.spaces.Box):
    pol = GaussianPol(ob_space, ac_space, pol_net, args.rnn)
elif isinstance(ac_space, gym.spaces.Discrete):
    pol = CategoricalPol(ob_space, ac_space, pol_net, args.rnn)
elif isinstance(ac_space, gym.spaces.MultiDiscrete):
    pol = MultiCategoricalPol(ob_space, ac_space, pol_net, args.rnn)
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)

with open(os.path.join(args.pol_dir, args.pol_fname), 'rb') as f:
    pol.load_state_dict(
        torch.load(f, map_location=lambda storage, location: storage))

epis = sampler.sample(pol, max_epis=args.num_epis)

filename = args.epis_fname if len(args.epis_fname) != 0 \
    else env.env.spec.id + '_{}epis.pkl'.format(len(epis))
with open(os.path.join(args.epis_dir, filename), 'wb') as f:
    pickle.dump(epis, f)
rewards = [np.sum(epi['rews']) for epi in epis]
mean_rew = np.mean(rewards)
logger.log('expert_score={}'.format(mean_rew))

del sampler
                          data_parallel=args.data_parallel)

sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)

optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr)
optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr)

with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f:
    expert_epis = pickle.load(f)
expert_traj = Traj()
expert_traj.add_epis(expert_epis)
expert_traj = ef.add_next_obs(expert_traj)
expert_traj.register_epis()
expert_rewards = [np.sum(epi['rews']) for epi in expert_epis]
expert_mean_rew = np.mean(expert_rewards)
logger.log('expert_score={}'.format(expert_mean_rew))
logger.log('expert_num_epi={}'.format(expert_traj.num_epi))

total_epi = 0
total_step = 0
max_rew = -1e6
kl_beta = args.init_kl_beta

if args.pretrain:
    with measure('bc pretrain'):
        for _ in range(args.bc_epoch):
            _ = behavior_clone.train(expert_traj, pol, optim_pol,
                                     args.bc_batch_size)
        torch.save(pol.state_dict(),
                   os.path.join(args.log, 'models', 'pol_bc.pkl'))
if args.rnn:
    pol_net = PolNetLSTM(observation_space, action_space,
                         h_size=256, cell_size=256)
else:
    pol_net = PolNet(observation_space, action_space)

if isinstance(action_space, gym.spaces.Box):
    pol = GaussianPol(observation_space, action_space, pol_net, args.rnn)
elif isinstance(action_space, gym.spaces.Discrete):
    pol = CategoricalPol(observation_space, action_space, pol_net, args.rnn)
elif isinstance(action_space, gym.spaces.MultiDiscrete):
    pol = MultiCategoricalPol(observation_space, action_space, pol_net, args.rnn)
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

sampler = EpiSampler(env, pol, num_parallel=1, seed=args.seed)

with open(os.path.join(args.pol_dir, 'models', args.pol_fname), 'rb') as f:
    pol.load_state_dict(
        torch.load(f, map_location=lambda storage, location: storage))

epis = sampler.sample(pol, max_epis=args.num_epis)

rewards = [np.sum(epi['rews']) for epi in epis]
mean_rew = np.mean(rewards)
logger.log('score={}'.format(mean_rew))

del sampler
    epoch, args.batch_size,
    args.tau, args.gamma,
    loss_type=args.loss_type)

# multi-agent parallel processing. dp_run = data_parallel run
if args.data_parallel:
    qf.dp_run = False
    lagged_qf.dp_run = False
    targ_qf1.dp_run = False
    targ_qf2.dp_run = False

total_grad_step += epoch
if total_grad_step >= args.lag * num_update_lagged:
    # update the lagged net every 6000 steps
    logger.log('Updated lagged qf!!')
    lagged_qf_net.load_state_dict(qf_net.state_dict())
    num_update_lagged += 1

rewards = [np.sum(epi['rews']) for epi in epis]
mean_rew = np.mean(rewards)

# save the log
logger.record_results(args.log, result_dict, score_file,
                      total_epi, step, total_step,
                      rewards,
                      plot_title=args.env_name)
def train(traj,
          pol, vf,
          optim_pol, optim_vf,
          epoch, batch_size, num_epi_per_seq=1,  # optimization hypers
          clip_param=0.2, ent_beta=1e-3,
          max_grad_norm=0.5,
          clip_vfunc=False,
          log_enable=True,
          ):
    """
    Train function for proximal policy optimization (clip).

    Parameters
    ----------
    traj : Traj
        On policy trajectory.
    pol : Pol
        Policy.
    vf : SVfunction
        V function.
    optim_pol : torch.optim.Optimizer
        Optimizer for Policy.
    optim_vf : torch.optim.Optimizer
        Optimizer for V function.
    epoch : int
        Number of iterations.
    batch_size : int
        Batch size.
    num_epi_per_seq : int
        Number of episodes in one sequence for rnn.
    clip_param : float
        Clipping ratio of objective function.
    ent_beta : float
        Entropy coefficient.
    max_grad_norm : float
        Maximum gradient norm.
    clip_vfunc : bool
        If True, vfunc is also updated by the clipped objective function.
    log_enable : bool
        If True, enable logging.

    Returns
    -------
    result_dict : dict
        Dictionary which contains losses information.
    """
    pol_losses = []
    vf_losses = []
    if log_enable:
        logger.log("Optimizing...")
    iterator = traj.iterate(batch_size, epoch) if not pol.rnn else traj.iterate_rnn(
        batch_size=batch_size, num_epi_per_seq=num_epi_per_seq, epoch=epoch)
    for batch in iterator:
        pol_loss = update_pol(pol, optim_pol, batch, clip_param,
                              ent_beta, max_grad_norm)
        vf_loss = update_vf(vf, optim_vf, batch, clip_param,
                            clip_vfunc, max_grad_norm)

        pol_losses.append(pol_loss)
        vf_losses.append(vf_loss)
    if log_enable:
        logger.log("Optimization finished!")

    return dict(PolLoss=pol_losses, VfLoss=vf_losses)
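# Usage sketch for the PPO-clip train() above, following the Traj construction
# pattern used in the scripts in this section; epis, pol, vf and the optimizers
# are assumed to exist, and advantage-related preprocessing of traj is elided.
traj = Traj()
traj.add_epis(epis)
traj = ef.compute_h_masks(traj)
traj.register_epis()
result_dict = train(traj, pol, vf, optim_pol, optim_vf,
                    epoch=10, batch_size=64,
                    clip_param=0.2, ent_beta=1e-3, max_grad_norm=0.5)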