def calculate_virtual_reward(self): """Apply the virtual reward formula to account for all the different goal scores.""" rewards = -1 * self.states.cum_rewards if self.minimize else self.states.cum_rewards processed_rewards = relativize(rewards) score_reward = processed_rewards**self.reward_scale score_dist = self.states.distances**self.distance_scale virt_rw = score_reward * score_dist dist_prob = score_dist / score_dist.sum() reward_prob = score_reward / score_reward.sum() total_entropy = numpy.prod(2 - dist_prob**reward_prob) self._min_entropy = numpy.prod(2 - reward_prob**reward_prob) self.efficiency = self._min_entropy / total_entropy self.update_states(virtual_rewards=virt_rw, processed_rewards=processed_rewards) if self.critic is not None: critic_states = self.critic.calculate( walkers_states=self.states, model_states=self.model_states, env_states=self.env_states, ) self.states.update(other=critic_states) virt_rew = self.states.virtual_rewards * self.states.critic else: virt_rew = self.states.virtual_rewards self.states.update(virtual_rewards=virt_rew)
def calculate_virtual_reward(self) -> None:
    """
    Calculate the virtual reward and update the internal state.

    The cumulative_reward is transformed with the relativize function. \
    The distances stored in the :class:`StatesWalkers` are already transformed.
    """
    processed_rewards = relativize(self.states.cum_rewards)
    virt_rw = processed_rewards**self.reward_scale * self.states.distances**self.distance_scale
    self.update_states(virtual_rewards=virt_rw, processed_rewards=processed_rewards)
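# A hedged usage sketch of the formula above: both the reward term and the
# diversity (distance) term are relativized and balanced through their
# exponents. The arrays and scale values are illustrative, not library defaults.
def _virtual_reward_demo() -> numpy.ndarray:
    cum_rewards = numpy.array([0.0, 1.0, 5.0, 2.0])
    raw_distances = numpy.array([0.3, 1.2, 0.1, 0.8])
    reward_scale, distance_scale = 1.0, 1.0  # illustrative exponents
    # Walkers with high virtual rewards are the likely clone targets.
    return relativize(cum_rewards) ** reward_scale * relativize(raw_distances) ** distance_scale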
def calculate_distances(self) -> None:
    """Calculate the corresponding distance function for each observation with \
    respect to another observation chosen at random.

    The internal :class:`StatesWalkers` is updated with the relativized distance values.
    """
    # TODO(guillemdb): Check if self.get_in_bounds_compas() works better.
    # Pair every walker with a random companion and measure how far apart
    # their flattened observations are.
    compas_ix = numpy.random.permutation(numpy.arange(self.n))
    obs = self.env_states.observs.reshape(self.n, -1)
    distances = self.distance_function(obs, obs[compas_ix])
    distances = relativize(distances.flatten())
    self.update_states(distances=distances, compas_dist=compas_ix)
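# ``self.distance_function`` is injected at construction time. A plausible
# choice, consistent with how it is called above (two batches of flattened
# observations in, one distance per row out), is the Euclidean norm. This is
# an assumption, not necessarily the library's default.
def _l2_distance_sketch(x: numpy.ndarray, y: numpy.ndarray) -> numpy.ndarray:
    """Euclidean distance between paired rows of two observation batches."""
    return numpy.linalg.norm(x - y, axis=1)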
def test_update_clone_probs(self, walkers):
    walkers.reset()
    # Seed the walkers with distinct virtual rewards before updating the probabilities.
    walkers.states.update(virtual_rewards=relativize(numpy.arange(walkers.n)))
    walkers.update_clone_probs()
    assert 0 < numpy.sum(walkers.states.clone_probs == walkers.states.clone_probs[0]), (
        walkers.states.virtual_rewards,
        walkers.states.clone_probs,
    )
    # After a reset all virtual rewards match, so every clone probability is identical.
    walkers.reset()
    walkers.update_clone_probs()
    assert numpy.sum(walkers.states.clone_probs == walkers.states.clone_probs[0]) == walkers.n
    assert walkers.states.clone_probs.shape[0] == walkers.n
    assert len(walkers.states.clone_probs.shape) == 1
def get_z_coords(self, swarm: Swarm, X: numpy.ndarray = None):
    """Get the values assigned by the :class:`Critic` to the regions of the state space."""
    if swarm is None:
        # Placeholder values matching the size of the interpolation grid.
        return numpy.ones(self.n_points**2)
    if swarm.critic.bounds is None:
        swarm.critic.bounds = Bounds.from_array(X, scale=1.1)
    # Target grid to interpolate to.
    xi = numpy.linspace(swarm.critic.bounds.low[0], swarm.critic.bounds.high[0], self.n_points)
    yi = numpy.linspace(swarm.critic.bounds.low[1], swarm.critic.bounds.high[1], self.n_points)
    xx, yy = numpy.meshgrid(xi, yi)
    grid = numpy.c_[xx.ravel(), yy.ravel()]
    if swarm.critic.warmed:
        memory_values = swarm.critic.predict(grid)
        memory_values = relativize(-memory_values)
    else:
        memory_values = numpy.arange(grid.shape[0])
    return memory_values
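# Hedged sketch of how the z-coordinates returned above could be rendered. The
# reshape mirrors the ``meshgrid`` construction inside ``get_z_coords``; the
# matplotlib calls are an illustrative choice and not part of this class.
def _plot_critic_grid_sketch(xx, yy, memory_values, n_points: int) -> None:
    import matplotlib.pyplot as plt

    zz = memory_values.reshape(n_points, n_points)
    plt.pcolormesh(xx, yy, zz, shading="auto")
    plt.colorbar(label="critic value (relativized)")
    plt.show()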
def get_z_coords(self, swarm: Swarm, X: numpy.ndarray = None):
    """Return the normalized ``cum_rewards`` of the walkers."""
    rewards: numpy.ndarray = relativize(swarm.get("cum_rewards"))
    return rewards