import numpy as np

# `softmax` is assumed to be a numerically stable softmax utility provided by the
# project (a minimal sketch follows the function below).


def score_goals(self, sampled_ags, info):
    """ Lower is better """
    density_module = getattr(self, self.density_module)
    if not density_module.ready:
        density_module._optimize(force=True)

    interest_module = None
    if hasattr(self, self.interest_module):
        interest_module = getattr(self, self.interest_module)
        if not interest_module.ready:
            interest_module = None

    # sampled_ags is an np.array of shape NUM_ENVS x NUM_SAMPLED_GOALS (both arbitrary)
    num_envs, num_sampled_ags = sampled_ags.shape[:2]

    # Score the sampled_ags to get log densities (the softmax below exponentiates them)
    flattened_sampled_ags = sampled_ags.reshape(num_envs * num_sampled_ags, -1)
    sampled_ag_scores = density_module.evaluate_log_density(flattened_sampled_ags)

    if interest_module:
        # Interest is ~ det(feature_transform), so we subtract it in order to add
        # ~ det(inverse feature_transform) for the change of variables (COV).
        sampled_ag_scores -= interest_module.evaluate_log_interest(flattened_sampled_ags)  # add in log interest
    sampled_ag_scores = sampled_ag_scores.reshape(num_envs, num_sampled_ags)  # these are log densities

    # Take the softmax of alpha * log density:
    #   if alpha = -1, this gives normalized inverse densities (higher is rarer);
    #   if alpha < -1, this skews the distribution toward low-density samples.
    normalized_inverse_densities = softmax(sampled_ag_scores * self.alpha)
    normalized_inverse_densities *= -1.  # negate / reverse order so that lower is better
    return normalized_inverse_densities
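# --- Illustration: a minimal sketch of the assumed `softmax` utility, plus a toy
# check of the alpha weighting used above. With alpha = -1,
# softmax(alpha * log p) = (1/p) / sum(1/p), i.e. normalized inverse densities.
# The axis=-1 default is an assumption; the project's actual utility may differ.
def softmax(x, axis=-1):
    z = x - x.max(axis=axis, keepdims=True)  # subtract the max for numerical stability
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)


# Toy check: densities [0.5, 0.25, 0.25] for one env, with alpha = -1.
_p = np.array([[0.5, 0.25, 0.25]])
_inv = softmax(np.log(_p) * -1.0)
assert np.allclose(_inv, (1. / _p) / (1. / _p).sum())  # normalized inverse densities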
def score_goals(self, sampled_ags, info):
    """ Higher entropy gain is better """
    if not self.ag_kde.ready:
        self.ag_kde._optimize(force=True)
    if not self.bg_kde.ready:
        self.bg_kde._optimize(force=True)
    if not self.bgag_kde.ready:
        self.bgag_kde._optimize(force=True)

    # sampled_ags is an np.array of shape NUM_ENVS x NUM_SAMPLED_GOALS (both arbitrary)
    num_envs, num_sampled_ags = sampled_ags.shape[:2]

    # Get samples of the predicted achieved goal from the mixture density network
    candidate_bgs = sampled_ags.reshape(num_envs * num_sampled_ags, -1)

    # Reuse the candidate bgs as potential ags: a sliding window pairs each bg with
    # the next num_ags candidates (wrapping around), under the prior that each bg has
    # one ag identical to it, i.e., that the agent reaches the bg.
    num_ags = 10  # TODO: avoid hard-coding this
    indexer = np.arange(num_envs * num_sampled_ags).reshape(-1, 1) + np.arange(num_ags).reshape(1, -1)
    indexer %= num_envs * num_sampled_ags  # wrap around to the beginning
    ags_samples = candidate_bgs[indexer]  # shape (num_envs * num_sampled_ags, num_ags, dim)

    # Pair every bg with each of its candidate ags
    candidate_bgs_repeat = np.repeat(candidate_bgs[:, np.newaxis, :], num_ags, axis=1)  # shape (num_envs * num_sampled_ags, num_ags, dim)
    joint_candidate_bgags = np.concatenate([candidate_bgs_repeat, ags_samples], axis=-1)
    joint_candidate_bgags = joint_candidate_bgags.reshape(num_envs * num_sampled_ags * num_ags, -1)

    # Score the pairs under the joint and marginal KDEs to get log densities, then use
    # log p(ag | bg) = log p(bg, ag) - log p(bg) and softmax over the candidate ags.
    joint_candidate_score = self.bgag_kde.evaluate_log_density(joint_candidate_bgags)
    joint_candidate_score = joint_candidate_score.reshape(num_envs * num_sampled_ags, num_ags)  # these are log densities
    candidate_bgs_score = self.bg_kde.evaluate_log_density(
        candidate_bgs_repeat.reshape(num_envs * num_sampled_ags * num_ags, -1))
    candidate_bgs_score = candidate_bgs_score.reshape(num_envs * num_sampled_ags, num_ags)  # these are log densities
    cond_candidate_score = joint_candidate_score - candidate_bgs_score
    cond_candidate_score = softmax(cond_candidate_score, axis=1)

    # Compute the entropy gain for each predicted achieved goal: mix the candidate into
    # the achieved-goal KDE with weight beta and take the finite difference in entropy.
    beta = 1 / len(self.replay_buffer.buffer)
    sampled_ag_entr_new = self.ag_kde.evaluate_elementwise_entropy(candidate_bgs, beta=beta)
    sampled_ag_entr_old = self.ag_kde.evaluate_elementwise_entropy(candidate_bgs, beta=0.)
    sampled_ag_entr_gain = sampled_ag_entr_new - sampled_ag_entr_old
    sampled_ag_entr_gain /= beta  # normalize by beta

    # TODO: Get rid of this part if not necessary
    # Align the gains with the sliding-window ag candidates, then average them
    # weighted by the conditional scores p(ag | bg).
    sampled_ag_entr_gain = sampled_ag_entr_gain[indexer]  # shape (num_envs * num_sampled_ags, num_ags)
    sampled_ag_entr_gain *= cond_candidate_score
    sampled_ag_entr_gain = sampled_ag_entr_gain.mean(axis=1)

    scores = sampled_ag_entr_gain.reshape(num_envs, num_sampled_ags)
    scores *= -1.  # negate / reverse order so that lower is better
    return scores
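# --- Illustration: a toy check of the sliding-window indexer used above. Each
# candidate bg is paired with itself and the next (num_ags - 1) candidates, wrapping
# around at the end of the flattened array; the fancy indexing `candidate_bgs[indexer]`
# gathers each bg's whole window of candidate ags in one step. Names prefixed with
# `_` are local to this sketch.
_num_candidates, _num_ags = 5, 3
_indexer = np.arange(_num_candidates).reshape(-1, 1) + np.arange(_num_ags).reshape(1, -1)
_indexer %= _num_candidates  # wrap around to the beginning
assert (_indexer[0] == [0, 1, 2]).all()  # first bg pairs with candidates 0, 1, 2
assert (_indexer[4] == [4, 0, 1]).all()  # last bg wraps around to candidates 0 and 1

_candidate_bgs = np.arange(10, dtype=float).reshape(_num_candidates, 2)  # 5 toy goals, dim 2
assert _candidate_bgs[_indexer].shape == (_num_candidates, _num_ags, 2)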