def _process_agents(
    self,
    actor_loss_params: Dict = None,
    critic_loss_params: Dict = None,
    actor_optimizer_params: Dict = None,
    critic_optimizer_params: Dict = None,
    actor_scheduler_params: Dict = None,
    critic_scheduler_params: Dict = None,
    actor_grad_clip_params: Dict = None,
    critic_grad_clip_params: Dict = None,
):
    # actor preparation
    actor_components = utils.get_trainer_components(
        agent=self.actor,
        loss_params=actor_loss_params,
        optimizer_params=actor_optimizer_params,
        scheduler_params=actor_scheduler_params,
        grad_clip_params=actor_grad_clip_params
    )
    # criterion
    self._actor_loss_params = actor_components["loss_params"]
    self.actor_criterion = actor_components["criterion"]
    # optimizer
    self._actor_optimizer_params = actor_components["optimizer_params"]
    self.actor_optimizer = actor_components["optimizer"]
    # scheduler
    self._actor_scheduler_params = actor_components["scheduler_params"]
    self.actor_scheduler = actor_components["scheduler"]
    # grad clipping
    self._actor_grad_clip_params = actor_components["grad_clip_params"]
    self.actor_grad_clip_fn = actor_components["grad_clip_fn"]

    # critic preparation
    critic_components = utils.get_trainer_components(
        agent=self.critic,
        loss_params=critic_loss_params,
        optimizer_params=critic_optimizer_params,
        scheduler_params=critic_scheduler_params,
        grad_clip_params=critic_grad_clip_params
    )
    # criterion
    self._critic_loss_params = critic_components["loss_params"]
    self.critic_criterion = critic_components["criterion"]
    # optimizer
    self._critic_optimizer_params = critic_components["optimizer_params"]
    self.critic_optimizer = critic_components["optimizer"]
    # scheduler
    self._critic_scheduler_params = critic_components["scheduler_params"]
    self.critic_scheduler = critic_components["scheduler"]
    # grad clipping
    self._critic_grad_clip_params = critic_components["grad_clip_params"]
    self.critic_grad_clip_fn = critic_components["grad_clip_fn"]
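
# NOTE: illustrative sketch, not part of the original module. Judging from how
# its result is consumed above, `utils.get_trainer_components` returns a dict
# pairing each "*_params" config with the object built from it. A minimal
# stand-in honoring that contract (the criterion, optimizer choice, and
# defaults below are assumptions) might look like this:
def get_trainer_components_sketch(
    agent, loss_params=None, optimizer_params=None,
    scheduler_params=None, grad_clip_params=None,
):
    import torch

    criterion = torch.nn.MSELoss()  # assumed; the real helper builds it from loss_params
    optimizer = torch.optim.Adam(
        agent.parameters(), **(optimizer_params or {"lr": 3e-4})
    )
    return {
        "loss_params": loss_params, "criterion": criterion,
        "optimizer_params": optimizer_params, "optimizer": optimizer,
        "scheduler_params": scheduler_params, "scheduler": None,
        "grad_clip_params": grad_clip_params, "grad_clip_fn": None,
    }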
def _init(self, critics: List[CriticSpec], reward_scale: float = 1.0):
    self.reward_scale = reward_scale
    # @TODO: policy regularization
    critics = [x.to(self._device) for x in critics]
    target_critics = [copy.deepcopy(x).to(self._device) for x in critics]
    critics_optimizer = []
    critics_scheduler = []

    for critic in critics:
        critic_components = utils.get_trainer_components(
            agent=critic,
            loss_params=self._critic_loss_params,
            optimizer_params=self._critic_optimizer_params,
            scheduler_params=self._critic_scheduler_params,
            grad_clip_params=self._critic_grad_clip_params
        )
        critics_optimizer.append(critic_components["optimizer"])
        critics_scheduler.append(critic_components["scheduler"])

    self.critics = [self.critic] + critics
    self.critics_optimizer = [self.critic_optimizer] + critics_optimizer
    self.critics_scheduler = [self.critic_scheduler] + critics_scheduler
    self.target_critics = [self.target_critic] + target_critics

    # value distribution approximation
    critic_distribution = self.critic.distribution
    self._loss_fn = self._base_loss
    self._num_heads = self.critic.num_heads
    self._num_critics = len(self.critics)
    self._hyperbolic_constant = self.critic.hyperbolic_constant
    # one discount factor per value head (hyperbolic discounting)
    self._gammas = utils.hyperbolic_gammas(
        self._gamma,
        self._hyperbolic_constant,
        self._num_heads
    )
    self._gammas = utils.any2device(self._gammas, device=self._device)
    assert critic_distribution in [None, "categorical", "quantile"]

    if critic_distribution == "categorical":
        # C51-style: fixed support of num_atoms values in [v_min, v_max]
        self.num_atoms = self.critic.num_atoms
        values_range = self.critic.values_range
        self.v_min, self.v_max = values_range
        self.delta_z = (self.v_max - self.v_min) / (self.num_atoms - 1)
        z = torch.linspace(
            start=self.v_min, end=self.v_max, steps=self.num_atoms
        )
        self.z = utils.any2device(z, device=self._device)
        self._loss_fn = self._categorical_loss
    elif critic_distribution == "quantile":
        # QR-DQN-style: quantile midpoints tau_i = (2i + 1) / (2 * num_atoms)
        self.num_atoms = self.critic.num_atoms
        tau_min = 1 / (2 * self.num_atoms)
        tau_max = 1 - tau_min
        tau = torch.linspace(
            start=tau_min, end=tau_max, steps=self.num_atoms
        )
        self.tau = utils.any2device(tau, device=self._device)
        self._loss_fn = self._quantile_loss
    else:
        # plain scalar values need an explicit criterion
        assert self.critic_criterion is not None
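
# NOTE: illustrative only. The distributional branches above build fixed
# supports for C51-style ("categorical") and QR-DQN-style ("quantile")
# critics. A quick standalone check of the same arithmetic:
import torch

num_atoms, v_min, v_max = 51, -10.0, 10.0
delta_z = (v_max - v_min) / (num_atoms - 1)            # atom spacing, 0.4 here
z = torch.linspace(v_min, v_max, steps=num_atoms)      # categorical support

tau_min = 1 / (2 * num_atoms)                          # quantile midpoints:
tau = torch.linspace(tau_min, 1 - tau_min, num_atoms)  # (2i + 1) / (2N), i = 0..N-1
assert torch.isclose(z[1] - z[0], torch.tensor(delta_z))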
def __init__(
    self,
    critic: CriticSpec,
    gamma: float,
    n_step: int,
    critic_loss_params: Dict = None,
    critic_optimizer_params: Dict = None,
    critic_scheduler_params: Dict = None,
    critic_grad_clip_params: Dict = None,
    critic_tau: float = 1.0,
    **kwargs
):
    self._device = utils.get_device()
    self.critic = critic.to(self._device)
    self.target_critic = copy.deepcopy(critic).to(self._device)

    # critic preparation
    critic_components = utils.get_trainer_components(
        agent=self.critic,
        loss_params=critic_loss_params,
        optimizer_params=critic_optimizer_params,
        scheduler_params=critic_scheduler_params,
        grad_clip_params=critic_grad_clip_params
    )
    # criterion
    self._critic_loss_params = critic_components["loss_params"]
    self.critic_criterion = critic_components["criterion"]
    # optimizer
    self._critic_optimizer_params = critic_components["optimizer_params"]
    self.critic_optimizer = critic_components["optimizer"]
    # scheduler
    self._critic_scheduler_params = critic_components["scheduler_params"]
    self.critic_scheduler = critic_components["scheduler"]
    # grad clipping
    self._critic_grad_clip_params = critic_components["grad_clip_params"]
    self.critic_grad_clip_fn = critic_components["grad_clip_fn"]

    # other hyperparameters
    self._n_step = n_step
    self._gamma = gamma
    self.critic_tau = critic_tau

    # other init
    self._init(**kwargs)
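
# NOTE: `critic_tau` stored above is the soft-update coefficient for the
# target network. A common Polyak-averaging update, shown here as an
# illustrative sketch rather than this class's actual method:
def soft_update_sketch(target_net, source_net, tau: float):
    """target <- tau * source + (1 - tau) * target (tau=1.0 is a hard copy)."""
    for tgt, src in zip(target_net.parameters(), source_net.parameters()):
        tgt.data.copy_(tau * src.data + (1.0 - tau) * tgt.data)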
def __init__(
    self,
    actor: ActorSpec,
    gamma: float,
    n_step: int,
    actor_loss_params: Dict = None,
    actor_optimizer_params: Dict = None,
    actor_scheduler_params: Dict = None,
    actor_grad_clip_params: Dict = None,
    **kwargs
):
    self._device = utils.get_device()
    self.actor = actor.to(self._device)

    # actor preparation
    actor_components = utils.get_trainer_components(
        agent=self.actor,
        loss_params=actor_loss_params,
        optimizer_params=actor_optimizer_params,
        scheduler_params=actor_scheduler_params,
        grad_clip_params=actor_grad_clip_params
    )
    # criterion
    self._actor_loss_params = actor_components["loss_params"]
    self.actor_criterion = actor_components["criterion"]
    # optimizer
    self._actor_optimizer_params = actor_components["optimizer_params"]
    self.actor_optimizer = actor_components["optimizer"]
    # scheduler
    self._actor_scheduler_params = actor_components["scheduler_params"]
    self.actor_scheduler = actor_components["scheduler"]
    # grad clipping
    self._actor_grad_clip_params = actor_components["grad_clip_params"]
    self.actor_grad_clip_fn = actor_components["grad_clip_fn"]

    # other hyperparameters
    self._n_step = n_step
    self._gamma = gamma

    # other init
    self._init(**kwargs)
def __init__(
    self,
    actor: ActorSpec,
    critic: CriticSpec,
    gamma: float,
    n_step: int,
    actor_loss_params: Dict = None,
    critic_loss_params: Dict = None,
    actor_optimizer_params: Dict = None,
    critic_optimizer_params: Dict = None,
    actor_scheduler_params: Dict = None,
    critic_scheduler_params: Dict = None,
    actor_grad_clip_params: Dict = None,
    critic_grad_clip_params: Dict = None,
    **kwargs
):
    self._device = utils.get_device()
    self.actor = actor.to(self._device)
    self.critic = critic.to(self._device)

    # actor preparation
    actor_components = utils.get_trainer_components(
        agent=self.actor,
        loss_params=actor_loss_params,
        optimizer_params=actor_optimizer_params,
        scheduler_params=actor_scheduler_params,
        grad_clip_params=actor_grad_clip_params
    )
    # criterion
    self._actor_loss_params = actor_components["loss_params"]
    self.actor_criterion = actor_components["criterion"]
    # optimizer
    self._actor_optimizer_params = actor_components["optimizer_params"]
    self.actor_optimizer = actor_components["optimizer"]
    # scheduler
    self._actor_scheduler_params = actor_components["scheduler_params"]
    self.actor_scheduler = actor_components["scheduler"]
    # grad clipping
    self._actor_grad_clip_params = actor_components["grad_clip_params"]
    self.actor_grad_clip_fn = actor_components["grad_clip_fn"]

    # critic preparation
    critic_components = utils.get_trainer_components(
        agent=self.critic,
        loss_params=critic_loss_params,
        optimizer_params=critic_optimizer_params,
        scheduler_params=critic_scheduler_params,
        grad_clip_params=critic_grad_clip_params
    )
    # criterion
    self._critic_loss_params = critic_components["loss_params"]
    self.critic_criterion = critic_components["criterion"]
    # optimizer
    self._critic_optimizer_params = critic_components["optimizer_params"]
    self.critic_optimizer = critic_components["optimizer"]
    # scheduler
    self._critic_scheduler_params = critic_components["scheduler_params"]
    self.critic_scheduler = critic_components["scheduler"]
    # grad clipping
    self._critic_grad_clip_params = critic_components["grad_clip_params"]
    self.critic_grad_clip_fn = critic_components["grad_clip_fn"]

    # other hyperparameters
    assert n_step == 1, "For now, on-policy setup works only with n_step=1"
    self._n_step = n_step
    self._gamma = gamma

    # other init
    self._init(**kwargs)
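
# NOTE: illustrative sketch, not this class's API. With the n_step == 1
# restriction asserted above, the on-policy bootstrapped target reduces to a
# one-step TD estimate; the helper name below is an assumption:
def one_step_advantage(reward, done, value, next_value, gamma):
    # A(s, a) = r + gamma * V(s') * (1 - done) - V(s)
    return reward + gamma * next_value * (1.0 - done) - value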
def __init__(
    self,
    actor: ActorSpec,
    critic: CriticSpec,
    gamma: float,
    n_step: int,
    actor_loss_params: Dict = None,
    critic_loss_params: Dict = None,
    actor_optimizer_params: Dict = None,
    critic_optimizer_params: Dict = None,
    actor_scheduler_params: Dict = None,
    critic_scheduler_params: Dict = None,
    actor_grad_clip_params: Dict = None,
    critic_grad_clip_params: Dict = None,
    actor_tau: float = 1.0,
    critic_tau: float = 1.0,
    action_boundaries: tuple = None,
    **kwargs
):
    self._device = utils.get_device()
    self.actor = actor.to(self._device)
    self.critic = critic.to(self._device)
    self.target_actor = copy.deepcopy(actor).to(self._device)
    self.target_critic = copy.deepcopy(critic).to(self._device)

    # actor preparation
    actor_components = utils.get_trainer_components(
        agent=self.actor,
        loss_params=actor_loss_params,
        optimizer_params=actor_optimizer_params,
        scheduler_params=actor_scheduler_params,
        grad_clip_params=actor_grad_clip_params
    )
    # criterion
    self._actor_loss_params = actor_components["loss_params"]
    self.actor_criterion = actor_components["criterion"]
    # optimizer
    self._actor_optimizer_params = actor_components["optimizer_params"]
    self.actor_optimizer = actor_components["optimizer"]
    # scheduler
    self._actor_scheduler_params = actor_components["scheduler_params"]
    self.actor_scheduler = actor_components["scheduler"]
    # grad clipping
    self._actor_grad_clip_params = actor_components["grad_clip_params"]
    self.actor_grad_clip_fn = actor_components["grad_clip_fn"]

    # critic preparation
    critic_components = utils.get_trainer_components(
        agent=self.critic,
        loss_params=critic_loss_params,
        optimizer_params=critic_optimizer_params,
        scheduler_params=critic_scheduler_params,
        grad_clip_params=critic_grad_clip_params
    )
    # criterion
    self._critic_loss_params = critic_components["loss_params"]
    self.critic_criterion = critic_components["criterion"]
    # optimizer
    self._critic_optimizer_params = critic_components["optimizer_params"]
    self.critic_optimizer = critic_components["optimizer"]
    # scheduler
    self._critic_scheduler_params = critic_components["scheduler_params"]
    self.critic_scheduler = critic_components["scheduler"]
    # grad clipping
    self._critic_grad_clip_params = critic_components["grad_clip_params"]
    self.critic_grad_clip_fn = critic_components["grad_clip_fn"]

    # other hyperparameters
    self._n_step = n_step
    self._gamma = gamma
    self._actor_tau = actor_tau
    self._critic_tau = critic_tau
    # validate first, then store unconditionally so the attribute always exists
    if action_boundaries is not None:
        assert len(action_boundaries) == 2, \
            "Should be min and max action boundaries"
    self._action_boundaries = action_boundaries

    # other init
    self._init(**kwargs)
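
# NOTE: `action_boundaries` is only validated and stored above; a typical use
# (an assumption, not necessarily this class's behavior) is clipping sampled
# continuous actions back into the valid range before stepping the environment:
import torch

def clip_action_sketch(action: torch.Tensor, boundaries: tuple) -> torch.Tensor:
    low, high = boundaries  # (min, max), as asserted in __init__
    return action.clamp(min=low, max=high)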