def update_states(
    self, env_states: StatesEnv = None, model_states: StatesModel = None, **kwargs
):
    """
    Update the States variables that do not contain internal data and \
    accumulate the rewards in the internal states if applicable.

    Args:
        env_states: States containing the data associated with the Environment.
        model_states: States containing the data associated with the Model.
        **kwargs: Internal states will be updated via keyword arguments.

    """
    if kwargs:
        if kwargs.get("rewards") is not None:
            self._accumulate_and_update_rewards(kwargs["rewards"])
            del kwargs["rewards"]
        self.states.update(**kwargs)
    if isinstance(env_states, StatesEnv):
        # Advance the walker times by the model's dt (defaults to 1.0 when
        # no model states are provided).
        dt = model_states.get("dt", 1.0) if model_states is not None else 1.0
        times = self._env_states.get("times") + dt
        self._env_states.update(env_states)
        self._env_states.update(times=times)
        if hasattr(env_states, "rewards"):
            self._accumulate_and_update_rewards(env_states.rewards)
    if isinstance(model_states, StatesModel):
        self._model_states.update(model_states)
    self.update_ids()
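# A minimal standalone sketch of the bookkeeping performed above, using plain
# numpy arrays in place of the States objects. It assumes, as the code above
# suggests, that walker times advance by the model's "dt" and that rewards are
# accumulated by summation; the names below are placeholders, not library API.
import numpy

times = numpy.zeros(4)                    # one entry per walker
cum_rewards = numpy.zeros(4)

dt = 1.0                                  # default used when model_states is None
step_rewards = numpy.array([0.5, 1.0, 0.0, 2.0])

times = times + dt                        # mirrors: times = env_states.times + dt
cum_rewards = cum_rewards + step_rewards  # mirrors _accumulate_and_update_rewards
print(times)                              # [1. 1. 1. 1.]
print(cum_rewards)                        # [0.5 1.  0.  2. ]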
def sample(
    self,
    batch_size: int,
    model_states: StatesModel = None,
    env_states: StatesEnv = None,
    walkers_states: StatesWalkers = None,
    **kwargs,
) -> StatesModel:
    """
    Calculate the corresponding data to interact with the Environment and \
    store it in model states.

    Args:
        batch_size: Number of new points to be sampled.
        model_states: States corresponding to the model data.
        env_states: States corresponding to the environment data.
        walkers_states: States corresponding to the walkers data.
        kwargs: Passed to the :class:`Critic` if any.

    Returns:
        New model states containing the sampled actions.

    """
    # Fall back to the parent's sampling when no previous states are available.
    if model_states is None or walkers_states is None:
        return super(CMAES, self).sample(
            batch_size=batch_size,
            model_states=model_states,
            env_states=env_states,
            walkers_states=walkers_states,
            **kwargs,
        )
    # Use the observed points once enough evaluations have accumulated;
    # otherwise reuse the last sampled actions.
    actions = (
        env_states.get("observs")
        if self._count_eval > self.pop_size * 2
        else model_states.get("actions")
    )
    fitness = (
        walkers_states.get("virtual_rewards")
        if self.virtual_reward_fitness
        else walkers_states.get("cum_rewards")
    )
    # Rank the walkers by fitness and keep the mu best actions as columns.
    sorted_fitness = numpy.argsort(fitness)[: self.mu_const]
    selected_actions = actions[sorted_fitness].T
    # CMA-ES update: evolution paths, covariance and sigma adaptation, and
    # diagonalization of the covariance matrix.
    self._update_evolution_paths(selected_actions)
    self._adapt_covariance_matrix(selected_actions)
    self._adapt_sigma()
    self._cov_matrix_diagonalization()
    actions = self._sample_actions()
    return self.update_states_with_critic(
        actions=actions, batch_size=batch_size, model_states=model_states, **kwargs
    )
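# A standalone sketch of the selection step inside sample(): rank the walkers
# by fitness (argsort is ascending, so lower values are treated as better) and
# keep the mu best actions, transposed to the (n_dims, mu) layout the CMA-ES
# updates expect. All names here are illustrative, not the library's API.
import numpy

rng = numpy.random.default_rng(0)
pop_size, n_dims, mu = 8, 3, 4

actions = rng.normal(size=(pop_size, n_dims))  # one action per walker
fitness = rng.normal(size=pop_size)            # one fitness value per walker

best = numpy.argsort(fitness)[:mu]             # indices of the mu best walkers
selected_actions = actions[best].T             # shape: (n_dims, mu)
print(selected_actions.shape)                  # (3, 4)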