import numpy as np
import torch
import torch.nn as nn

# RunningNorm, RewardForwardFilter, VecEnvWrapper, and EpisodeBuffer are
# assumed to be importable from the surrounding codebase; their import
# statements are omitted here.


class VecRewardNormWrapper(VecEnvWrapper):
    """Reward normalization for vectorized environments."""

    def __init__(self, venv, gamma, eps=1e-5):
        """Init."""
        super().__init__(venv)
        self.rn = RunningNorm((1,), eps=eps)
        self.reward_filter = RewardForwardFilter(gamma)
        self._dones = np.zeros(self.num_envs, dtype=bool)  # np.bool is deprecated
        self._eval = False

    def state_dict(self):
        """State dict."""
        return {'rn': self.rn.state_dict()}

    def load_state_dict(self, state_dict):
        """Load state dict."""
        self.rn.load_state_dict(state_dict['rn'])

    def eval(self):
        """Set the environment to eval mode.

        Eval mode doesn't update the running norm of returns.
        """
        self._eval = True

    def train(self):
        """Set the environment to train mode."""
        self._eval = False

    def step(self, action):
        """Step."""
        obs, rews, dones, infos = self.venv.step(action)
        if not self._eval:
            # Only update the running norm with envs whose episode hasn't
            # already ended.
            updates = np.logical_not(self._dones)
            rets = self.reward_filter(rews, updates)[updates]
            var = 0 if rets.shape[0] <= 1 else rets.var()
            self.rn.update(rets.mean(), var, rets.shape[0])
            self._dones = np.logical_or(dones, self._dones)
        if self.rn.std > self.rn.eps:
            rews = rews / (self.rn.std.numpy() + self.rn.eps)
        return obs, rews, dones, infos

    def reset(self, force=True):
        """Reset."""
        obs = self.venv.reset(force=force)
        self._dones[:] = False
        return obs

    def step_wait(self):
        return self.venv.step_wait()
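# Usage sketch for VecRewardNormWrapper. This is illustrative only: `venv` is
# assumed to be any vectorized env exposing the VecEnv interface used above
# (num_envs, action_space, step, reset); the wrapper and its behavior come
# from the class above, and the step count here is arbitrary.
def _example_reward_norm_usage(venv, gamma=0.99, steps=128):
    venv = VecRewardNormWrapper(venv, gamma)
    obs = venv.reset()
    for _ in range(steps):
        actions = np.stack([venv.action_space.sample()
                            for _ in range(venv.num_envs)])
        # Returned rewards are divided by the running std of discounted
        # returns once enough statistics have accumulated.
        obs, rews, dones, infos = venv.step(actions)
    state = venv.state_dict()   # running-norm statistics for checkpointing
    venv.eval()                 # freeze the norm for evaluation
    venv.load_state_dict(state)
    return venv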
class NGU(nn.Module):
    """Never Give Up intrinsic reward: episodic novelty modulated by RND."""

    def __init__(self, rnd, ide, capacity, device, gamma=0.99, k=10, L=5,
                 eps=0.001, min_dist=0.008, max_similarity=4, c=1.0):
        super().__init__()
        self.rnd = rnd
        # Change the reward normalization of RND to normalize reward instead
        # of return. This allows RND to modify intrinsic rewards meaningfully
        # (rewards will have a std of 1, instead of approximately 1 - gamma).
        self.rnd.reward_filter = lambda r, updates: r
        self.ide = ide
        self.capacity = capacity
        self.device = device
        self.k = k
        self.L = L
        self.eps = eps
        self.min_dist = min_dist
        self.max_similarity = np.sqrt(max_similarity)
        self.c = c
        self.buffer = None
        self.dist_running_avg = 0.
        self.dist_running_count = 0.
        self.reward_filter = RewardForwardFilter(gamma)
        self.err_norm = RunningNorm((1,)).to(device)

    def forward(self, obs, update_norm=False, updates=None):
        with torch.no_grad():
            embeddings = self.ide(obs)
            if self.buffer is None:
                self.buffer = EpisodeBuffer(self.capacity,
                                            embeddings.shape[0],
                                            embeddings.shape[1],
                                            self.device)
            dists = self.buffer.get_nearest_neighbors(embeddings, self.k)

            # Normalize distances by the running average of the kth-neighbor
            # distance.
            self._update_dist_running_mean(dists[:, -1])
            if self.dist_running_avg > 1e-5:
                dists /= self.dist_running_avg

            # Set close distances to 0.
            dists = torch.max(dists - self.min_dist, torch.zeros_like(dists))
            # Compute inverse kernel of distances.
            k = self.eps / (dists + self.eps)
            # Compute similarity.
            s = torch.sqrt(self.c + k.sum(dim=1))
            # Compute short term intrinsic reward.
            r = torch.where(s > self.max_similarity, torch.zeros_like(s),
                            1 / s)

            # Combine with the long term intrinsic reward.
            # RND divides its error by a running estimate of the error std.
            # NGU also subtracts the mean and adds 1.
            r_rnd = self.rnd(obs, update_norm, updates)
            rnd_mean = self.rnd.err_norm.mean / (self.rnd.err_norm.std + 1e-5)
            r_rnd += 1.0 - rnd_mean
            modifier = torch.clamp(r_rnd, 1, self.L)
            reward = r * modifier
            self.buffer.insert(embeddings)

            # Update the running norm of intrinsic rewards.
            if update_norm and (updates is None or updates.sum() > 0):
                rets = self.reward_filter(reward, updates)[updates]
                var = 0 if rets.shape[0] == 1 else rets.var()
                self.err_norm.update(rets.mean(), var, rets.shape[0])
            return reward / (self.err_norm.std + 1e-5)

    def _update_dist_running_mean(self, max_dists):
        # Only count envs whose episodic buffer has at least k entries.
        inds = self.buffer.n_in_buffer >= self.k
        max_dists = max_dists[inds]
        if max_dists.shape[0] == 0:
            return
        d = max_dists.mean()
        c = len(max_dists)
        new_c = self.dist_running_count + c
        self.dist_running_avg *= self.dist_running_count / new_c
        self.dist_running_avg += (c / new_c) * d
        self.dist_running_count += c

    def reset(self, dones=None):
        if self.buffer:
            self.buffer.reset(dones)

    def update_rnd(self, obs):
        return self.rnd.update(obs)

    def update_ide(self, obs, next_obs, actions):
        return self.ide.update(obs, next_obs, actions)

    def state_dict(self):
        return {
            'rnd': self.rnd.state_dict(),
            'ide': self.ide.state_dict(),
        }

    def load_state_dict(self, state_dict):
        self.rnd.load_state_dict(state_dict['rnd'])
        self.ide.load_state_dict(state_dict['ide'])
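# Sketch of how NGU is wired together. Assumptions (not shown above): `rnd`
# is an instance of the RND class below, `ide` is an inverse-dynamics
# embedding module exposing `ide(obs)` and `ide.update(obs, next_obs,
# actions)` as used above, and `obs`/`next_obs`/`actions` are batched tensors
# with one row per environment.
def _example_ngu_usage(rnd, ide, obs, next_obs, actions, device):
    ngu = NGU(rnd, ide, capacity=10000, device=device)
    r_int = ngu(obs, update_norm=True)      # combined intrinsic reward per env
    ngu.update_rnd(obs)                     # train the RND prediction net
    ngu.update_ide(obs, next_obs, actions)  # train the embedding network
    ngu.reset()                             # clear episodic buffers at episode end
    return r_int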
class RND(nn.Module):
    """Implementation of Random Network Distillation."""

    def __init__(self, net, opt, gamma, shape, device):
        """
        Args:
            net (function): a function mapping input shape to a pytorch module.
            opt: PyTorch optimizer constructor; called on the prediction
                net's parameters.
            gamma (float): discount factor for the reward forward filter.
            shape (tuple): input shape for the networks.
            device: the device to place the networks on.
        """
        super().__init__()
        self.target_net = net(shape).to(device)
        self.prediction_net = net(shape).to(device)
        self.device = device
        self.opt = opt(self.prediction_net.parameters())
        self.ob_norm = RunningNorm(shape).to(device)
        self.err_norm = RunningNorm((1,)).to(device)
        self.reward_filter = RewardForwardFilter(gamma)

    def forward(self, obs, update_norm=False, updates=None):
        """Get intrinsic reward."""
        should_update = update_norm and (updates is None or updates.sum() > 0)
        if should_update:
            obs_to_update = obs if updates is None else obs[updates]
            if obs_to_update.shape[0] == 1:
                var = torch.zeros_like(obs[0])
            else:
                var = obs_to_update.var(dim=0)
            self.ob_norm.update(obs_to_update.mean(dim=0), var,
                                obs_to_update.shape[0])
        obs = torch.clamp(self.ob_norm(obs), -5, 5)
        with torch.no_grad():
            err = torch.norm(self.target_net(obs) - self.prediction_net(obs),
                             dim=1)
        if should_update:
            rets = self.reward_filter(err, updates)[updates]
            var = 0 if rets.shape[0] == 1 else rets.var()
            self.err_norm.update(rets.mean(), var, rets.shape[0])
        return err / (self.err_norm.std + 1e-5)

    def update(self, obs):
        """Train the prediction net to match the fixed target net."""
        obs = torch.clamp(self.ob_norm(obs), -5, 5)
        self.opt.zero_grad()
        loss = torch.norm(self.target_net(obs) - self.prediction_net(obs),
                          dim=1).mean()
        loss.backward()
        self.opt.step()
        return loss

    def state_dict(self, *args, **kwargs):
        return {
            'rnd': super().state_dict(*args, **kwargs),
            'opt': self.opt.state_dict()
        }

    def load_state_dict(self, state_dict):
        self.opt.load_state_dict(state_dict['opt'])
        super().load_state_dict(state_dict['rnd'])
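# Sketch of constructing and training RND. Assumptions (not shown above): the
# target/prediction networks are small MLPs built by a local factory, Adam is
# the optimizer, and `obs` is a float tensor of shape (batch, ob_dim). The
# real networks and optimizer are whatever the surrounding config supplies.
def _example_rnd_usage(obs, device):
    from functools import partial

    def net(shape):
        return nn.Sequential(nn.Linear(shape[0], 64), nn.ReLU(),
                             nn.Linear(64, 32))

    rnd = RND(net, partial(torch.optim.Adam, lr=1e-4), gamma=0.99,
              shape=(obs.shape[1],), device=device)
    obs = obs.to(device)
    r_int = rnd(obs, update_norm=True)  # intrinsic reward per observation
    loss = rnd.update(obs)              # regress prediction net toward target net
    return r_int, loss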