Example #1
 def calculate_virtual_reward(self):
     """Apply the virtual reward formula to account for all the different goal scores."""
     rewards = -1 * self.states.cum_rewards if self.minimize else self.states.cum_rewards
     processed_rewards = relativize(rewards)
     score_reward = processed_rewards**self.reward_scale
     score_dist = self.states.distances**self.distance_scale
     virt_rw = score_reward * score_dist
     dist_prob = score_dist / score_dist.sum()
     reward_prob = score_reward / score_reward.sum()
     total_entropy = numpy.prod(2 - dist_prob**reward_prob)
     self._min_entropy = numpy.prod(2 - reward_prob**reward_prob)
     self.efficiency = self._min_entropy / total_entropy
     self.update_states(virtual_rewards=virt_rw,
                        processed_rewards=processed_rewards)
     if self.critic is not None:
         critic_states = self.critic.calculate(
             walkers_states=self.states,
             model_states=self.model_states,
             env_states=self.env_states,
         )
         self.states.update(other=critic_states)
         virt_rew = self.states.virtual_rewards * self.states.critic
     else:
         virt_rew = self.states.virtual_rewards
     self.states.update(virtual_rewards=virt_rew)
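
All of the examples on this page call relativize, whose implementation is not shown here. The snippet below is only a plausible sketch of such a relativization step, consistent with how the function is used in these examples (standardize the values, then map them to strictly positive numbers so they can safely be raised to a power); the library's actual implementation may differ.

import numpy

def relativize(x: numpy.ndarray) -> numpy.ndarray:
    """Sketch: standardize the values and compress them into a positive range."""
    std = x.std()
    if std == 0:
        # All values are identical, so every walker gets the same score.
        return numpy.ones(len(x))
    standard = (x - x.mean()) / std
    positive = standard > 0
    # Compress positive deviations logarithmically and negative ones exponentially,
    # keeping every output strictly positive and continuous at zero.
    standard[positive] = numpy.log(1.0 + standard[positive]) + 1.0
    standard[~positive] = numpy.exp(standard[~positive])
    return standard
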
Example #2
    def calculate_virtual_reward(self) -> None:
        """
        Calculate the virtual reward and update the internal state.

        The cumulative_reward is transformed with the relativize function. \
        The distances stored in the :class:`StatesWalkers` are already transformed.
        """
        processed_rewards = relativize(self.states.cum_rewards)
        virt_rw = (processed_rewards**self.reward_scale *
                   self.states.distances**self.distance_scale)
        self.update_states(virtual_rewards=virt_rw,
                           processed_rewards=processed_rewards)
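
As a quick illustration of the formula above, here is a toy computation with made-up numbers (it reuses the relativize sketch from Example #1); reward_scale and distance_scale control how much each term weighs in the final virtual reward.

import numpy

# Made-up walker data: cumulative rewards and already-relativized distances.
cum_rewards = numpy.array([0.0, 1.0, 5.0, 2.0])
distances = numpy.array([1.2, 0.8, 0.5, 1.5])
reward_scale, distance_scale = 1.0, 1.0

processed_rewards = relativize(cum_rewards)
virtual_rewards = processed_rewards ** reward_scale * distances ** distance_scale
# Walkers that combine a high relative reward with a large distance to their
# random companion end up with the largest virtual reward.
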
Example #3
    def calculate_distances(self) -> None:
        """Calculate the corresponding distance function for each observation with \
        respect to another observation chosen at random.

        The internal :class:`StatesWalkers` is updated with the relativized distance values.
        """
        # TODO(guillemdb): Check if self.get_in_bounds_compas() works better.
        compas_ix = numpy.random.permutation(numpy.arange(self.n))
        obs = self.env_states.observs.reshape(self.n, -1)
        distances = self.distance_function(obs, obs[compas_ix])
        distances = relativize(distances.flatten())
        self.update_states(distances=distances, compas_dist=compas_ix)
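
The distance_function used above is injected from elsewhere and is not part of this example. A common choice compatible with the call signature is a row-wise Euclidean distance, sketched here with a hypothetical name (euclidean_distance does not appear in the example):

import numpy

def euclidean_distance(x: numpy.ndarray, y: numpy.ndarray) -> numpy.ndarray:
    # Distance between each flattened observation and its paired companion.
    return numpy.sqrt(numpy.sum((x - y) ** 2, axis=1))

# Same pairing scheme as calculate_distances, on random data.
n, dim = 8, 4
observs = numpy.random.random((n, dim))
compas_ix = numpy.random.permutation(numpy.arange(n))
distances = relativize(euclidean_distance(observs, observs[compas_ix]))
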
Example #4
 def test_update_clone_probs(self, walkers):
     walkers.reset()
     walkers.states.update(
         virtual_rewards=relativize(numpy.arange(walkers.n)))
     walkers.update_clone_probs()
     assert 0 < numpy.sum(
         walkers.states.clone_probs == walkers.states.clone_probs[0]), (
             walkers.states.virtual_rewards,
             walkers.states.clone_probs,
         )
     walkers.reset()
     walkers.update_clone_probs()
     assert numpy.sum(walkers.states.clone_probs ==
                      walkers.states.clone_probs[0]) == walkers.n
     assert walkers.states.clone_probs.shape[0] == walkers.n
     assert len(walkers.states.clone_probs.shape) == 1
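
update_clone_probs itself is not shown on this page. The function below is a hypothetical way of deriving clone probabilities from the virtual rewards that matches what the test asserts: identical virtual rewards produce identical probabilities, while non-uniform virtual rewards generally produce differing values.

import numpy

def clone_probabilities(virtual_rewards: numpy.ndarray) -> numpy.ndarray:
    # Hypothetical sketch, not the library implementation.
    n = len(virtual_rewards)
    compas_ix = numpy.random.permutation(numpy.arange(n))
    # A walker is more likely to clone when its random companion has a higher
    # virtual reward than its own.
    probs = (virtual_rewards[compas_ix] - virtual_rewards) / numpy.maximum(
        virtual_rewards, 1e-8
    )
    return numpy.clip(probs, 0.0, 1.0)
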
Example #5
 def get_z_coords(self, swarm: Swarm, X: numpy.ndarray = None):
     """Get the values assigned by the :class:`Critic` to the regions of the state space."""
     if swarm is None:
          return numpy.ones(self.n_points**2)
     if swarm.critic.bounds is None:
         swarm.critic.bounds = Bounds.from_array(X, scale=1.1)
     # target grid to interpolate to
     xi = numpy.linspace(swarm.critic.bounds.low[0],
                         swarm.critic.bounds.high[0], self.n_points)
     yi = numpy.linspace(swarm.critic.bounds.low[1],
                         swarm.critic.bounds.high[1], self.n_points)
     xx, yy = numpy.meshgrid(xi, yi)
     grid = numpy.c_[xx.ravel(), yy.ravel()]
     if swarm.swarm.critic.warmed:
         memory_values = swarm.swarm.critic.predict(grid)
         memory_values = relativize(-memory_values)
     else:
         memory_values = numpy.arange(grid.shape[0])
     return memory_values
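
A hypothetical usage note: the flat array returned above follows the same ordering as numpy.c_[xx.ravel(), yy.ravel()], so it can be folded back onto the mesh when plotting the critic values. The bounds and sizes below are made up.

import numpy

n_points = 50
xi = numpy.linspace(-1.0, 1.0, n_points)   # made-up bounds
yi = numpy.linspace(-1.0, 1.0, n_points)
xx, yy = numpy.meshgrid(xi, yi)
z = numpy.arange(n_points ** 2)            # stand-in for get_z_coords(swarm)
zz = z.reshape(xx.shape)                   # 2-D grid of values, ready to plot
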
Example #6
 def get_z_coords(self, swarm: Swarm, X: numpy.ndarray = None):
     """Return the normalized ``cum_rewards`` of the walkers."""
     rewards: numpy.ndarray = relativize(swarm.get("cum_rewards"))
     return rewards