Example #1: _process_agents builds criterion, optimizer, scheduler, and gradient-clipping components for both the actor and the critic.
    def _process_agents(
        self,
        actor_loss_params: Dict = None,
        critic_loss_params: Dict = None,
        actor_optimizer_params: Dict = None,
        critic_optimizer_params: Dict = None,
        actor_scheduler_params: Dict = None,
        critic_scheduler_params: Dict = None,
        actor_grad_clip_params: Dict = None,
        critic_grad_clip_params: Dict = None,
    ):
        # actor preparation
        actor_components = utils.get_trainer_components(
            agent=self.actor,
            loss_params=actor_loss_params,
            optimizer_params=actor_optimizer_params,
            scheduler_params=actor_scheduler_params,
            grad_clip_params=actor_grad_clip_params)
        # criterion
        self._actor_loss_params = actor_components["loss_params"]
        self.actor_criterion = actor_components["criterion"]
        # optimizer
        self._actor_optimizer_params = actor_components["optimizer_params"]
        self.actor_optimizer = actor_components["optimizer"]
        # scheduler
        self._actor_scheduler_params = actor_components["scheduler_params"]
        self.actor_scheduler = actor_components["scheduler"]
        # grad clipping
        self._actor_grad_clip_params = actor_components["grad_clip_params"]
        self.actor_grad_clip_fn = actor_components["grad_clip_fn"]

        # critic preparation
        critic_components = utils.get_trainer_components(
            agent=self.critic,
            loss_params=critic_loss_params,
            optimizer_params=critic_optimizer_params,
            scheduler_params=critic_scheduler_params,
            grad_clip_params=critic_grad_clip_params)
        # criterion
        self._critic_loss_params = critic_components["loss_params"]
        self.critic_criterion = critic_components["criterion"]
        # optimizer
        self._critic_optimizer_params = critic_components["optimizer_params"]
        self.critic_optimizer = critic_components["optimizer"]
        # scheduler
        self._critic_scheduler_params = critic_components["scheduler_params"]
        self.critic_scheduler = critic_components["scheduler"]
        # grad clipping
        self._critic_grad_clip_params = critic_components["grad_clip_params"]
        self.critic_grad_clip_fn = critic_components["grad_clip_fn"]
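
All of these examples consume the same dictionary returned by utils.get_trainer_components, whose implementation is not shown here. As a reading aid, below is a minimal self-contained sketch of a helper with that return shape. Only the returned keys are known from the call sites; the parameter schemas (the "criterion", "optimizer", "scheduler", and "func" entries) are assumptions for illustration, not the library's actual API.

import functools
from typing import Dict, Optional

import torch


def get_trainer_components(
    agent: torch.nn.Module,
    loss_params: Optional[Dict] = None,
    optimizer_params: Optional[Dict] = None,
    scheduler_params: Optional[Dict] = None,
    grad_clip_params: Optional[Dict] = None,
) -> Dict:
    # Hypothetical reconstruction; only the returned keys below
    # are known from the call sites in these examples.

    # criterion, e.g. loss_params={"criterion": "MSELoss"}
    criterion = None
    if loss_params is not None:
        kwargs = dict(loss_params)
        criterion = getattr(torch.nn, kwargs.pop("criterion"))(**kwargs)

    # optimizer over the agent's parameters,
    # e.g. optimizer_params={"optimizer": "Adam", "lr": 3e-4}
    optimizer = None
    if optimizer_params is not None:
        kwargs = dict(optimizer_params)
        optimizer = getattr(torch.optim, kwargs.pop("optimizer"))(
            agent.parameters(), **kwargs)

    # scheduler wraps the optimizer,
    # e.g. scheduler_params={"scheduler": "StepLR", "step_size": 10}
    scheduler = None
    if scheduler_params is not None and optimizer is not None:
        kwargs = dict(scheduler_params)
        scheduler = getattr(torch.optim.lr_scheduler, kwargs.pop("scheduler"))(
            optimizer, **kwargs)

    # gradient clipping as a callable over parameters,
    # e.g. grad_clip_params={"func": "clip_grad_norm_", "max_norm": 1.0};
    # a train step would then call grad_clip_fn(agent.parameters())
    grad_clip_fn = None
    if grad_clip_params is not None:
        kwargs = dict(grad_clip_params)
        grad_clip_fn = functools.partial(
            getattr(torch.nn.utils, kwargs.pop("func")), **kwargs)

    return {
        "loss_params": loss_params, "criterion": criterion,
        "optimizer_params": optimizer_params, "optimizer": optimizer,
        "scheduler_params": scheduler_params, "scheduler": scheduler,
        "grad_clip_params": grad_clip_params, "grad_clip_fn": grad_clip_fn,
    }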
Example #2: _init for a multi-critic setup with target copies; the loss function is selected from the critic's value-distribution type (categorical, quantile, or none).
    def _init(self, critics: List[CriticSpec], reward_scale: float = 1.0):
        self.reward_scale = reward_scale
        # @TODO: policy regularization

        critics = [x.to(self._device) for x in critics]
        target_critics = [copy.deepcopy(x).to(self._device) for x in critics]
        critics_optimizer = []
        critics_scheduler = []

        for critic in critics:
            critic_components = utils.get_trainer_components(
                agent=critic,
                loss_params=self._critic_loss_params,
                optimizer_params=self._critic_optimizer_params,
                scheduler_params=self._critic_scheduler_params,
                grad_clip_params=self._critic_grad_clip_params)
            critics_optimizer.append(critic_components["optimizer"])
            critics_scheduler.append(critic_components["scheduler"])

        self.critics = [self.critic] + critics
        self.critics_optimizer = [self.critic_optimizer] + critics_optimizer
        self.critics_scheduler = [self.critic_scheduler] + critics_scheduler
        self.target_critics = [self.target_critic] + target_critics

        # value distribution approximation
        critic_distribution = self.critic.distribution
        self._loss_fn = self._base_loss
        self._num_heads = self.critic.num_heads
        self._num_critics = len(self.critics)
        self._hyperbolic_constant = self.critic.hyperbolic_constant
        self._gammas = utils.hyperbolic_gammas(
            self._gamma,
            self._hyperbolic_constant,
            self._num_heads,
        )
        self._gammas = utils.any2device(self._gammas, device=self._device)
        assert critic_distribution in [None, "categorical", "quantile"]

        if critic_distribution == "categorical":
            self.num_atoms = self.critic.num_atoms
            values_range = self.critic.values_range
            self.v_min, self.v_max = values_range
            self.delta_z = (self.v_max - self.v_min) / (self.num_atoms - 1)
            z = torch.linspace(start=self.v_min,
                               end=self.v_max,
                               steps=self.num_atoms)
            self.z = utils.any2device(z, device=self._device)
            self._loss_fn = self._categorical_loss
        elif critic_distribution == "quantile":
            self.num_atoms = self.critic.num_atoms
            tau_min = 1 / (2 * self.num_atoms)
            tau_max = 1 - tau_min
            tau = torch.linspace(start=tau_min,
                                 end=tau_max,
                                 steps=self.num_atoms)
            self.tau = utils.any2device(tau, device=self._device)
            self._loss_fn = self._quantile_loss
        else:
            assert self.critic_criterion is not None
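
For reference, the fixed supports built by the two distributional branches can be computed standalone. The sketch below repeats the same computations: the evenly spaced atom grid z for the categorical head and the quantile midpoints tau_i = (2i + 1) / (2 * num_atoms) for the quantile head. num_atoms and the value range are assumed example values; in the excerpt they come from the critic.

import torch

num_atoms, v_min, v_max = 51, -10.0, 10.0  # assumed example hyperparameters

# categorical head: num_atoms evenly spaced atoms over [v_min, v_max]
delta_z = (v_max - v_min) / (num_atoms - 1)
z = torch.linspace(start=v_min, end=v_max, steps=num_atoms)

# quantile head: midpoints tau_i = (2 * i + 1) / (2 * num_atoms)
tau_min = 1 / (2 * num_atoms)
tau = torch.linspace(start=tau_min, end=1 - tau_min, steps=num_atoms)

assert torch.isclose(z[1] - z[0], torch.tensor(delta_z))
assert torch.isclose(tau[1] - tau[0], torch.tensor(1 / num_atoms))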
Example #3: __init__ for a critic-only algorithm with a target critic and a soft-update coefficient (critic_tau).
    def __init__(
        self,
        critic: CriticSpec,
        gamma: float,
        n_step: int,
        critic_loss_params: Dict = None,
        critic_optimizer_params: Dict = None,
        critic_scheduler_params: Dict = None,
        critic_grad_clip_params: Dict = None,
        critic_tau: float = 1.0,
        **kwargs
    ):
        self._device = utils.get_device()
        self.critic = critic.to(self._device)
        self.target_critic = copy.deepcopy(critic).to(self._device)

        # preparation
        agent_stuff = utils.get_trainer_components(
            agent=self.critic,
            loss_params=critic_loss_params,
            optimizer_params=critic_optimizer_params,
            scheduler_params=critic_scheduler_params,
            grad_clip_params=critic_grad_clip_params
        )
        # criterion
        self._critic_loss_params = agent_stuff["loss_params"]
        self.critic_criterion = agent_stuff["criterion"]
        # optimizer
        self._critic_optimizer_params = agent_stuff["optimizer_params"]
        self.critic_optimizer = agent_stuff["optimizer"]
        # scheduler
        self._critic_scheduler_params = agent_stuff["scheduler_params"]
        self.critic_scheduler = agent_stuff["scheduler"]
        # grad clipping
        self._critic_grad_clip_params = agent_stuff["grad_clip_params"]
        self.critic_grad_clip_fn = agent_stuff["grad_clip_fn"]

        # other hyperparameters
        self._n_step = n_step
        self._gamma = gamma
        self.critic_tau = critic_tau

        # other init
        self._init(**kwargs)
Example #4: __init__ for an actor-only algorithm; the same preparation pattern without a critic.
    def __init__(
        self,
        actor: ActorSpec,
        gamma: float,
        n_step: int,
        actor_loss_params: Dict = None,
        actor_optimizer_params: Dict = None,
        actor_scheduler_params: Dict = None,
        actor_grad_clip_params: Dict = None,
        **kwargs
    ):
        self._device = utils.get_device()
        self.actor = actor.to(self._device)

        # actor preparation
        actor_components = utils.get_trainer_components(
            agent=self.actor,
            loss_params=actor_loss_params,
            optimizer_params=actor_optimizer_params,
            scheduler_params=actor_scheduler_params,
            grad_clip_params=actor_grad_clip_params
        )
        # criterion
        self._actor_loss_params = actor_components["loss_params"]
        self.actor_criterion = actor_components["criterion"]
        # optimizer
        self._actor_optimizer_params = actor_components["optimizer_params"]
        self.actor_optimizer = actor_components["optimizer"]
        # scheduler
        self._actor_scheduler_params = actor_components["scheduler_params"]
        self.actor_scheduler = actor_components["scheduler"]
        # grad clipping
        self._actor_grad_clip_params = actor_components["grad_clip_params"]
        self.actor_grad_clip_fn = actor_components["grad_clip_fn"]

        # other hyperparameters
        self._n_step = n_step
        self._gamma = gamma

        # other init
        self._init(**kwargs)
Example #5: __init__ for an on-policy actor-critic setup (asserts n_step == 1, no target networks).
    def __init__(self,
                 actor: ActorSpec,
                 critic: CriticSpec,
                 gamma: float,
                 n_step: int,
                 actor_loss_params: Dict = None,
                 critic_loss_params: Dict = None,
                 actor_optimizer_params: Dict = None,
                 critic_optimizer_params: Dict = None,
                 actor_scheduler_params: Dict = None,
                 critic_scheduler_params: Dict = None,
                 actor_grad_clip_params: Dict = None,
                 critic_grad_clip_params: Dict = None,
                 **kwargs):
        self._device = utils.get_device()

        self.actor = actor.to(self._device)
        self.critic = critic.to(self._device)

        # actor preparation
        actor_components = utils.get_trainer_components(
            agent=self.actor,
            loss_params=actor_loss_params,
            optimizer_params=actor_optimizer_params,
            scheduler_params=actor_scheduler_params,
            grad_clip_params=actor_grad_clip_params)
        # criterion
        self._actor_loss_params = actor_components["loss_params"]
        self.actor_criterion = actor_components["criterion"]
        # optimizer
        self._actor_optimizer_params = actor_components["optimizer_params"]
        self.actor_optimizer = actor_components["optimizer"]
        # scheduler
        self._actor_scheduler_params = actor_components["scheduler_params"]
        self.actor_scheduler = actor_components["scheduler"]
        # grad clipping
        self._actor_grad_clip_params = actor_components["grad_clip_params"]
        self.actor_grad_clip_fn = actor_components["grad_clip_fn"]

        # critic preparation
        critic_components = utils.get_trainer_components(
            agent=self.critic,
            loss_params=critic_loss_params,
            optimizer_params=critic_optimizer_params,
            scheduler_params=critic_scheduler_params,
            grad_clip_params=critic_grad_clip_params)
        # criterion
        self._critic_loss_params = critic_components["loss_params"]
        self.critic_criterion = critic_components["criterion"]
        # optimizer
        self._critic_optimizer_params = critic_components["optimizer_params"]
        self.critic_optimizer = critic_components["optimizer"]
        # scheduler
        self._critic_scheduler_params = critic_components["scheduler_params"]
        self.critic_scheduler = critic_components["scheduler"]
        # grad clipping
        self._critic_grad_clip_params = critic_components["grad_clip_params"]
        self.critic_grad_clip_fn = critic_components["grad_clip_fn"]

        # other hyperparameters
        assert n_step == 1, "For now, on-policy setup works only with n-step=1"
        self._n_step = n_step
        self._gamma = gamma

        # other init
        self._init(**kwargs)
Example #6: __init__ for an actor-critic setup with target networks, soft-update coefficients, and optional action boundaries.
    def __init__(
        self,
        actor: ActorSpec,
        critic: CriticSpec,
        gamma: float,
        n_step: int,
        actor_loss_params: Dict = None,
        critic_loss_params: Dict = None,
        actor_optimizer_params: Dict = None,
        critic_optimizer_params: Dict = None,
        actor_scheduler_params: Dict = None,
        critic_scheduler_params: Dict = None,
        actor_grad_clip_params: Dict = None,
        critic_grad_clip_params: Dict = None,
        actor_tau: float = 1.0,
        critic_tau: float = 1.0,
        action_boundaries: tuple = None,
        **kwargs
    ):
        self._device = utils.get_device()

        self.actor = actor.to(self._device)
        self.critic = critic.to(self._device)

        self.target_actor = copy.deepcopy(actor).to(self._device)
        self.target_critic = copy.deepcopy(critic).to(self._device)

        # actor preparation
        actor_components = utils.get_trainer_components(
            agent=self.actor,
            loss_params=actor_loss_params,
            optimizer_params=actor_optimizer_params,
            scheduler_params=actor_scheduler_params,
            grad_clip_params=actor_grad_clip_params
        )
        # criterion
        self._actor_loss_params = actor_components["loss_params"]
        self.actor_criterion = actor_components["criterion"]
        # optimizer
        self._actor_optimizer_params = actor_components["optimizer_params"]
        self.actor_optimizer = actor_components["optimizer"]
        # scheduler
        self._actor_scheduler_params = actor_components["scheduler_params"]
        self.actor_scheduler = actor_components["scheduler"]
        # grad clipping
        self._actor_grad_clip_params = actor_components["grad_clip_params"]
        self.actor_grad_clip_fn = actor_components["grad_clip_fn"]

        # critic preparation
        critic_components = utils.get_trainer_components(
            agent=self.critic,
            loss_params=critic_loss_params,
            optimizer_params=critic_optimizer_params,
            scheduler_params=critic_scheduler_params,
            grad_clip_params=critic_grad_clip_params
        )
        # criterion
        self._critic_loss_params = critic_components["loss_params"]
        self.critic_criterion = critic_components["criterion"]
        # optimizer
        self._critic_optimizer_params = critic_components["optimizer_params"]
        self.critic_optimizer = critic_components["optimizer"]
        # scheduler
        self._critic_scheduler_params = critic_components["scheduler_params"]
        self.critic_scheduler = critic_components["scheduler"]
        # grad clipping
        self._critic_grad_clip_params = critic_components["grad_clip_params"]
        self.critic_grad_clip_fn = critic_components["grad_clip_fn"]

        # other hyperparameters
        self._n_step = n_step
        self._gamma = gamma
        self._actor_tau = actor_tau
        self._critic_tau = critic_tau

        if action_boundaries is not None:
            assert len(action_boundaries) == 2, \
                "Should be min and max action boundaries"
            self._action_boundaries = action_boundaries

        # other init
        self._init(**kwargs)
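
The deep-copied target networks together with the actor_tau and critic_tau coefficients above (and critic_tau in Example #3) are the standard ingredients of soft target updates; the update itself is not part of these excerpts. A minimal sketch, assuming the usual Polyak-averaging convention in which tau = 1.0 reduces to a hard copy:

import torch


def soft_update(target: torch.nn.Module,
                source: torch.nn.Module,
                tau: float) -> None:
    # target <- tau * source + (1 - tau) * target; tau=1.0 is a hard copy
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.mul_(1.0 - tau).add_(s_param, alpha=tau)

# hypothetical usage inside a train step:
#   soft_update(self.target_actor, self.actor, self._actor_tau)
#   soft_update(self.target_critic, self.critic, self._critic_tau)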