def __init__(self, dt: float, gamma: float, lr: float, tau: float, optimizer: str,
             val_function: ParametricFunction, adv_function: ParametricFunction) -> None:
    CompoundStateful.__init__(self)
    self._reference_obs: Tensor = None
    self._val_function = val_function
    self._adv_function = adv_function
    # target copies used for stable bootstrapping, softly tracked at rate tau
    self._target_val_function = copy.deepcopy(val_function)
    self._target_adv_function = copy.deepcopy(adv_function)

    self._adv_optimizer = setup_optimizer(
        self._adv_function.parameters(),
        opt_name=optimizer, lr=lr, dt=dt,
        inverse_gradient_magnitude=1,
        weight_decay=0)
    self._val_optimizer = setup_optimizer(
        self._val_function.parameters(),
        opt_name=optimizer, lr=lr, dt=dt,
        inverse_gradient_magnitude=dt,
        weight_decay=0)

    self._dt = dt
    self._gamma = gamma
    self._tau = tau

    info(
        f"setup> using AdvantageCritic, the provided gamma and rewards are scaled,"
        f" actual values: gamma={gamma ** dt}, rewards=original_rewards * {dt}")

    self._device = 'cpu'
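# Both networks above have deep-copied target versions and a mixing rate `tau`.
# A minimal sketch of the usual Polyak (soft) target update, assuming the functions
# expose `parameters()` like a torch.nn.Module; `soft_update` is an illustrative
# helper name, not part of the original code:
import torch


def soft_update(target: torch.nn.Module, source: torch.nn.Module, tau: float) -> None:
    """Move every target parameter a fraction `tau` towards its online counterpart."""
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.mul_(1.0 - tau)
            t_param.add_(tau * s_param)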
def __init__(self, dt: float, gamma: float, lr: float, optimizer: str,
             q_function: ParametricFunction, tau: float, noscale: bool) -> None:
    CompoundStateful.__init__(self)
    self._reference_obs: Tensor = None
    self._q_function = q_function
    self._target_q_function = copy.deepcopy(self._q_function)
    self._tau = tau

    # when noscale is set, rescale gamma to the reference time step and train
    # as if the environment ran at ref_dt
    ref_dt = 0.02
    if noscale:
        self._gamma = gamma ** (dt / ref_dt)
        dt = ref_dt
        self._dt = ref_dt
    else:
        self._gamma = gamma
        self._dt = dt

    info(
        f"setup> using ValueCritic, the provided gamma and rewards are scaled,"
        f" actual values: gamma={gamma ** self._dt},"
        f" rewards=original_rewards * {self._dt}")

    self._q_optimizer = setup_optimizer(
        self._q_function.parameters(),
        opt_name=optimizer, lr=lr, dt=self._dt,
        inverse_gradient_magnitude=self._dt,
        weight_decay=0)

    self._device = 'cpu'
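# A small worked example of the `noscale` branch above (numbers are illustrative,
# not taken from the original code): with gamma=0.99, dt=0.005 and ref_dt=0.02,
# the critic uses gamma ** (dt / ref_dt) = 0.99 ** 0.25 ≈ 0.9975 and then behaves
# as if dt were 0.02; with noscale=False it keeps gamma=0.99 and dt=0.005 unchanged.
gamma_in, dt_in, ref_dt_in = 0.99, 0.005, 0.02
scaled_gamma = gamma_in ** (dt_in / ref_dt_in)  # ≈ 0.9975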
def __init__(self, gamma: float, dt: float, v_function: ParametricFunction) -> None:
    CompoundStateful.__init__(self)
    self._reference_obs: Tensor = None
    self._v_function = v_function
    self._gamma = gamma ** dt
    self._device = 'cpu'
    self._dt = dt

    info(
        f"setup> using OnlineCritic, the provided gamma and rewards are scaled,"
        f" actual values: gamma={gamma ** dt}, rewards=original_rewards * {dt}")
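# Given the scaling described in the log message above, a TD target for this critic
# takes the form r * dt + (gamma ** dt) * V(next_obs). A hedged sketch, assuming the
# value function behaves like a torch.nn.Module; the names are illustrative only:
import torch


def td_target(v_function: torch.nn.Module, reward: torch.Tensor,
              next_obs: torch.Tensor, done: torch.Tensor,
              gamma: float, dt: float) -> torch.Tensor:
    """Discretized TD target: scaled reward plus discounted bootstrap value."""
    with torch.no_grad():
        next_v = v_function(next_obs).squeeze(-1)
    return reward * dt + (gamma ** dt) * (1.0 - done) * next_v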
def __init__(self, policy_function: ParametricFunction, noise: Noise, lr: float,
             tau: float, opt_name: str, dt: float, weight_decay: float) -> None:
    CompoundStateful.__init__(self)
    self._policy_function = policy_function
    self._target_policy_function = copy.deepcopy(self._policy_function)

    self._optimizer = setup_optimizer(
        self._policy_function.parameters(),
        opt_name=opt_name, lr=lr, dt=dt,
        inverse_gradient_magnitude=1,
        weight_decay=weight_decay)

    self._noise = noise
    self._tau = tau
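# This actor keeps a deterministic policy, a target copy updated at rate `tau`, and
# an exploration `Noise` object. A minimal sketch of noisy action selection, assuming
# the policy maps observation tensors to action tensors; Gaussian noise stands in for
# the repository's `Noise` abstraction and the names are hypothetical:
import torch


def act_with_exploration(policy: torch.nn.Module, obs: torch.Tensor,
                         noise_scale: float) -> torch.Tensor:
    """Deterministic action perturbed by Gaussian exploration noise (illustrative)."""
    with torch.no_grad():
        action = policy(obs)
    return action + noise_scale * torch.randn_like(action)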
def __init__(self, T: int, actor: OnlineActor, critic: OnlineCritic) -> None:
    CompoundStateful.__init__(self)

    # set in train mode
    self.train()

    # define learning components
    self._actor = actor
    self._critic = critic
    self._count = 0
    self._T = T
    self._device = "cpu"
    self.reset()

    # init _nb_train_env to None and _current_trajectories to an empty list
    self._nb_train_env: Optional[int] = None
    self._current_trajectories: List[Trajectory] = []
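# The agent above counts environment steps and trains once every `T` steps on the
# trajectories gathered since the last update. A self-contained sketch of that
# control flow with a hypothetical `learn` callback (not the original implementation):
from typing import Any, Callable, List


class RolloutScheduler:
    """Buffer transitions and trigger a learning phase every T steps."""

    def __init__(self, T: int) -> None:
        self._T = T
        self._count = 0
        self._buffer: List[Any] = []

    def step(self, transition: Any, learn: Callable[[List[Any]], None]) -> None:
        self._buffer.append(transition)
        self._count += 1
        if self._count % self._T == 0:
            learn(self._buffer)       # one learning phase per T-step rollout
            self._buffer.clear()      # start the next rollout from scratch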
def __init__(self, memory_size: int, batch_size: int, steps_btw_train: int,
             learn_per_step: int, alpha: Optional[float], beta: Optional[float],
             actor: Actor, critic: Critic) -> None:
    CompoundStateful.__init__(self)

    # reset and set in train mode
    self.reset()
    self.train()

    # define learning components
    self._actor = actor
    self._critic = critic
    self._sampler = setup_memory(
        alpha=alpha, beta=beta,
        memory_size=memory_size, batch_size=batch_size)

    # counter and parameters
    self._count = 0
    self._warm_up = 10  # prevents learning from a near empty buffer
    self._steps_btw_train = steps_btw_train
    self._learn_per_step = learn_per_step
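# The agent above learns from replayed experience: it waits for a warm-up period so
# the buffer is not nearly empty, then every `steps_btw_train` environment steps it
# performs `learn_per_step` sampled updates. A hedged sketch of that schedule with
# hypothetical helper names, not the original code:
from typing import Any, Callable


def maybe_train(count: int, warm_up: int, steps_btw_train: int, learn_per_step: int,
                sample: Callable[[], Any], update: Callable[[Any], None]) -> None:
    """Run learn_per_step updates when past warm-up and on a training boundary."""
    if count < warm_up or count % steps_btw_train != 0:
        return
    for _ in range(learn_per_step):
        update(sample())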
def __init__(self, policy_function: ParametricFunction, dt: float,
             c_entropy: float) -> None:
    CompoundStateful.__init__(self)
    self._policy_function = policy_function
    self._c_entropy = c_entropy
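# The `c_entropy` coefficient above weights an entropy bonus in the policy loss.
# A minimal sketch of such a loss for a Gaussian policy, assuming advantages are
# computed elsewhere; this is a generic illustration, not the repository's exact loss:
import torch
from torch.distributions import Normal


def entropy_regularized_loss(mean: torch.Tensor, std: torch.Tensor,
                             actions: torch.Tensor, advantages: torch.Tensor,
                             c_entropy: float) -> torch.Tensor:
    """Policy-gradient loss minus a c_entropy-weighted entropy bonus."""
    dist = Normal(mean, std)
    logp = dist.log_prob(actions).sum(-1)    # log-likelihood of the taken actions
    entropy = dist.entropy().sum(-1)         # exploration bonus
    return -(logp * advantages.detach() + c_entropy * entropy).mean()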