def _clone_to_walker(self, state, obs, reward):
    if obs is None or state is None:
        return
    # Virtual reward of the candidate state with respect to a random sample of walkers
    indexes = np.random.choice(np.arange(self.swarm.walkers.n), size=self.n_comp_add)
    n_walkers = len(indexes)
    assert n_walkers == self.n_comp_add
    w_rewards = self.swarm.walkers.states.cum_rewards[indexes]
    walkers_obs = self.swarm.walkers.env_states.observs[indexes].reshape(n_walkers, -1)
    distances = np.linalg.norm(walkers_obs - obs.reshape(1, -1), axis=1)
    distances = relativize(distances.flatten()) ** self.swarm.walkers.dist_scale
    distances = distances / distances.sum()
    rewards = relativize(np.concatenate([w_rewards, [reward]])) ** self.swarm.walkers.reward_scale
    rewards = rewards / rewards.sum()
    w_virt_rew = 2 - distances ** rewards[:-1]
    other_ix = np.random.permutation(np.arange(n_walkers))
    other_virt_rew = 2 - distances[other_ix] ** rewards[-1]
    # Clone probabilities with respect to the new state
    all_virtual_rewards_are_equal = (w_virt_rew == other_virt_rew).all()
    if all_virtual_rewards_are_equal:
        clone_probs = np.zeros(n_walkers, dtype=float_type)
    else:
        clone_probs = (other_virt_rew - w_virt_rew) / w_virt_rew
        clone_probs = np.sqrt(np.clip(clone_probs, 0, 1.1))
    # Clone the new state to the selected walkers
    will_clone = clone_probs > self.swarm.walkers.random_state.random_sample(n_walkers)
    if will_clone.sum() == 0:
        return
    new_rewards = np.full(will_clone.sum(), reward, dtype=float)
    try:
        # Index with indexes[will_clone] so the assignment writes into the original array;
        # chaining two fancy indexes would only modify a temporary copy.
        self.swarm.walkers.states.cum_rewards[indexes[will_clone]] = new_rewards
        for ix, wc in zip(indexes, will_clone):
            if wc:
                self.swarm.walkers.env_states.states[ix] = copy.deepcopy(state)
                self.swarm.walkers.env_states.observs[ix] = copy.deepcopy(obs)
        self.swarm.walkers.update_best()
    except Exception as e:
        # Dump debugging information about the failed cloning before re-raising.
        orig_states = self.swarm.walkers.env_states.states
        msg = "indexes: %s will_clone: %s new_states: %s states shape: %s\n"
        data = (indexes, will_clone, [], orig_states.shape)
        msg_2 = "clone_probs: %s rewards: %s reward: %s state: %s\n"
        data_2 = (clone_probs, rewards, reward, state)
        x = orig_states[indexes][will_clone]
        msg_3 = "will_clone shape: %s clone_probs shape: %s SHAPE: %s DATA: %s" % (
            will_clone.shape,
            clone_probs.shape,
            type(x),
            x,
        )
        print((msg % data) + (msg_2 % data_2) + msg_3)
        raise e
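# The clone-probability rule inside _clone_to_walker is a relative-improvement test
# between each walker's virtual reward and a companion's. The numbers below are made
# up and only illustrate the clipping and square-root steps; this is a sketch, not
# the swarm's actual update.
import numpy as np

w_virt_rew = np.array([1.0, 2.0, 0.5])       # virtual rewards of the sampled walkers
other_virt_rew = np.array([2.0, 1.0, 3.0])   # virtual rewards of their companions
clone_probs = np.sqrt(np.clip((other_virt_rew - w_virt_rew) / w_virt_rew, 0, 1.1))
will_clone = clone_probs > np.random.random(3)   # stochastic cloning decision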
def _evaluate_model(self, points):
    # Evaluate every point of the batch in parallel.
    func = partial(evaluate_one_reward, y=np.array(self.y), data=pd.DataFrame(self.X))
    result = self.pool.map(func, points.tolist())
    # result = self.pool.map(func, points.tolist(), chunksize=points.shape[0] // multiprocessing.cpu_count())
    model_score, scores, ends = tuple(zip(*result))
    ends = np.zeros_like(np.array(ends))
    scores = np.array(scores)
    # Combine the model score with the per-column scores into a single relativized score.
    score = relativize(np.array(model_score)) ** 0.5
    for i in range(scores.shape[1]):
        score = score * relativize(scores[:, i])
    entropy = np.array(score)
    # ends = score < score.mean()
    self.entropy = max(self.entropy, entropy.max())
    return score, ends
def calculate_virtual_reward(self):
    """Apply the virtual reward formula to account for all the different goal scores."""
    rewards = -1 * self.states.cum_rewards if self.minimize else self.states.cum_rewards
    processed_rewards = relativize(rewards)
    score_reward = processed_rewards ** self.reward_scale
    score_dist = self.states.distances ** self.dist_scale
    virt_rw = score_reward * score_dist
    dist_prob = score_dist / score_dist.sum()
    reward_prob = score_reward / score_reward.sum()
    total_entropy = numpy.prod(2 - dist_prob ** reward_prob)
    self._min_entropy = numpy.prod(2 - reward_prob ** reward_prob)
    self.efficiency = self._min_entropy / total_entropy
    self.update_states(virtual_rewards=virt_rw, processed_rewards=processed_rewards)
    if self.critic is not None:
        critic_states = self.critic.calculate(
            walkers_states=self.states,
            model_states=self.model_states,
            env_states=self.env_states,
        )
        self.states.update(other=critic_states)
        virt_rew = self.states.virtual_rewards * self.states.critic
    else:
        virt_rew = self.states.virtual_rewards
    self.states.update(virtual_rewards=virt_rew)
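# Every snippet in this section calls a shared relativize helper that lives elsewhere
# in the codebase. The function below is only a minimal sketch of the usual
# FractalAI-style normalization (standardize, squash negative z-scores with exp,
# compress positive ones with log1p); it is an assumption for readability, not the
# library's definitive implementation.
import numpy as np

def relativize(x: np.ndarray) -> np.ndarray:
    """Map an arbitrary array to strictly positive scores that preserve its ordering."""
    std = x.std()
    if std == 0:
        # All values are equal: every walker gets the same unit score.
        return np.ones(len(x), dtype=float)
    standard = (x - x.mean()) / std
    positive = standard > 0
    standard[positive] = np.log1p(standard[positive]) + 1.0
    standard[~positive] = np.exp(standard[~positive])
    return standard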
def calculate_virtual_reward(self) -> None:
    """
    Calculate the virtual reward and update the internal state.

    The cumulative_reward is transformed with the relativize function. \
    The distances stored in the :class:`StatesWalkers` are already transformed.
    """
    processed_rewards = relativize(self.states.cum_rewards)
    virt_rw = processed_rewards ** self.reward_scale * self.states.distances ** self.dist_scale
    self.update_states(virtual_rewards=virt_rw, processed_rewards=processed_rewards)
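# A self-contained toy run of the formula used by both calculate_virtual_reward
# variants above. The rewards, distances and scale exponents are arbitrary example
# values; the distances are assumed to be already relativized, as the docstring states.
import numpy as np

cum_rewards = np.array([1.0, 3.0, 2.0, 10.0])
distances = np.array([0.5, 1.2, 0.8, 1.5])
reward_scale, dist_scale = 1.0, 1.0

processed_rewards = relativize(cum_rewards)  # sketch defined above
virtual_rewards = processed_rewards ** reward_scale * distances ** dist_scale
# Walkers that are both high-reward and far from their companions get the largest values.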
def calculate_distances(self) -> None:
    """Calculate the corresponding distance function for each observation with \
    respect to another observation chosen at random.

    The internal :class:`StatesWalkers` is updated with the relativized distance values.
    """
    compas_ix = numpy.random.permutation(numpy.arange(self.n))  # self.get_alive_compas()
    obs = self.env_states.observs.reshape(self.n, -1)
    distances = self.distance_function(obs, obs[compas_ix])
    distances = relativize(distances.flatten())
    self.update_states(distances=distances, compas_dist=compas_ix)
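# The pairing trick used in calculate_distances (compare each walker against a
# randomly permuted companion) can be reproduced in a few lines. Here the
# distance_function is assumed to be a plain Euclidean norm, which is just one
# possible choice.
import numpy as np

observs = np.random.random((8, 3))               # 8 walkers with 3-dimensional observations
compas_ix = np.random.permutation(np.arange(8))  # each walker is paired with a random companion
raw_dist = np.linalg.norm(observs - observs[compas_ix], axis=1)
distances = relativize(raw_dist)                 # normalized as in the walkers code above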
def test_update_clone_probs(self, walkers):
    walkers.reset()
    walkers.states.update(virtual_rewards=relativize(np.arange(walkers.n)))
    walkers.update_clone_probs()
    assert 0 < np.sum(walkers.states.clone_probs == walkers.states.clone_probs[0]), (
        walkers.states.virtual_rewards,
        walkers.states.clone_probs,
    )
    walkers.reset()
    walkers.update_clone_probs()
    assert np.sum(walkers.states.clone_probs == walkers.states.clone_probs[0]) == walkers.n
    assert walkers.states.clone_probs.shape[0] == walkers.n
    assert len(walkers.states.clone_probs.shape) == 1
def calculate_distances(self) -> None:
    """Calculate the corresponding distance function for each state with \
    respect to another state chosen at random.

    The internal state is updated with the relativized distance values.
    The distance is computed on the RAM memory of the Atari emulator.
    """
    compas_ix = np.random.permutation(np.arange(self.n))
    # This unpacks the RAMs from the Uber Go-Explore custom Montezuma environment.
    rams = self.env_states.states.reshape(self.n, -1)[:, :-12].astype(np.uint8)
    # Cast to a signed type before subtracting so the byte differences do not wrap around.
    vec = rams.astype(np.int16) - rams[compas_ix].astype(np.int16)
    dist_ram = self.distance_function(vec, axis=1).flatten()
    distances = relativize(dist_ram)
    self.update_states(distances=distances, compas_dist=compas_ix)
def get_z_coords(self, swarm: Swarm, X: numpy.ndarray = None):
    if swarm is None:
        # Flat placeholder surface matching the n_points x n_points grid.
        return numpy.ones(self.n_points ** 2)
    if swarm.critic.bounds is None:
        swarm.critic.bounds = Bounds.from_array(X, scale=1.1)
    # Target grid to interpolate to.
    xi = numpy.linspace(swarm.critic.bounds.low[0], swarm.critic.bounds.high[0], self.n_points)
    yi = numpy.linspace(swarm.critic.bounds.low[1], swarm.critic.bounds.high[1], self.n_points)
    xx, yy = numpy.meshgrid(xi, yi)
    grid = numpy.c_[xx.ravel(), yy.ravel()]
    if swarm.critic.warmed:
        memory_values = swarm.critic.predict(grid)
        memory_values = relativize(-memory_values)
    else:
        memory_values = numpy.arange(grid.shape[0])
    return memory_values
def get_z_coords(self, swarm: Swarm, X: numpy.ndarray = None):
    """Return the normalized ``cum_rewards`` of the walkers."""
    rewards: numpy.ndarray = relativize(swarm.walkers.states.cum_rewards)
    return rewards
def get_z_coords(self, swarm: Swarm, X: numpy.ndarray = None):
    rewards: numpy.ndarray = relativize(swarm.walkers.states.cum_rewards)
    return rewards