Code example #1
    def __init__(self,
                 rnd,
                 ide,
                 capacity,
                 device,
                 gamma=0.99,
                 k=10,
                 L=5,
                 eps=0.001,
                 min_dist=0.008,
                 max_similarity=4,
                 c=1.0):
        super().__init__()
        self.rnd = rnd
        # Change the reward normalization of RND to normalize reward instead of
        # return. This will allow rnd to modify intrinsic rewards meaningfully.
        # (rewards will have std of 1, instead of approximately 1-gamma)
        self.rnd.reward_filter = lambda r, updates: r
        self.ide = ide
        self.capacity = capacity
        self.device = device
        self.k = k
        self.L = L
        self.eps = eps
        self.min_dist = min_dist
        self.max_similarity = np.sqrt(max_similarity)
        self.c = c
        self.buffer = None
        self.dist_running_avg = 0.
        self.dist_running_count = 0.
        self.reward_filter = RewardForwardFilter(gamma)
        self.err_norm = RunningNorm((1, )).to(device)
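
RewardForwardFilter is referenced in every snippet here but never defined. As a rough sketch only, assuming the interface implied by the calls above (a discounted running return kept per environment, with an optional boolean mask selecting which environments to update), it could look like this:

class RewardForwardFilter:
    """Running discounted sum of rewards, i.e. a per-environment return estimate.

    Sketch only: the real class is not shown in these snippets, and the handling
    of the `updates` mask is an assumption based on how it is called.
    """

    def __init__(self, gamma):
        self.gamma = gamma
        self.rets = None

    def __call__(self, rews, updates=None):
        if self.rets is None:
            # Works for both numpy arrays and torch tensors.
            self.rets = rews * 0
        if updates is None:
            self.rets = self.gamma * self.rets + rews
        else:
            self.rets[updates] = self.gamma * self.rets[updates] + rews[updates]
        return self.rets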
Code example #2
    def __init__(self, venv, gamma, eps=1e-5):
        """Init."""
        super().__init__(venv)
        self.rn = RunningNorm((1, ), eps=eps)
        self.reward_filter = RewardForwardFilter(gamma)
        self._dones = np.zeros(self.num_envs, dtype=bool)
        self._eval = False
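
RunningNorm is likewise assumed rather than shown. A sketch consistent with how it is used above (an nn.Module that tracks a running mean and variance, merges batch statistics through update(mean, var, count), and normalizes its input when called) might be:

import torch
from torch import nn


class RunningNorm(nn.Module):
    """Running mean/variance tracker (sketch; the real implementation is not shown)."""

    def __init__(self, shape, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.register_buffer('mean', torch.zeros(shape))
        self.register_buffer('var', torch.ones(shape))
        self.register_buffer('count', torch.zeros(1))

    @property
    def std(self):
        return torch.sqrt(self.var + self.eps)

    def update(self, batch_mean, batch_var, batch_count):
        # Chan et al. parallel update for combining mean/variance estimates.
        total = self.count + batch_count
        delta = batch_mean - self.mean
        new_var = (self.var * self.count + batch_var * batch_count
                   + delta ** 2 * self.count * batch_count / total) / total
        self.mean += delta * batch_count / total
        self.var.copy_(new_var)
        self.count.copy_(total)

    def forward(self, x):
        return (x - self.mean) / self.std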
Code example #3
class VecRewardNormWrapper(VecEnvWrapper):
    """Reward normalization for vecorized environments."""
    def __init__(self, venv, gamma, eps=1e-5):
        """Init."""
        super().__init__(venv)
        self.rn = RunningNorm((1, ), eps=eps)
        self.reward_filter = RewardForwardFilter(gamma)
        self._dones = np.zeros(self.num_envs, dtype=bool)
        self._eval = False

    def state_dict(self):
        """State dict."""
        return {'rn': self.rn.state_dict()}

    def load_state_dict(self, state_dict):
        """Load state dict."""
        self.rn.load_state_dict(state_dict['rn'])

    def eval(self):
        """Set the environment to eval mode.

        Eval mode doesn't update the running norm of returns.
        """
        self._eval = True

    def train(self):
        """Set the environment to train mode."""
        self._eval = False

    def step(self, action):
        """Step."""
        obs, rews, dones, infos = self.venv.step(action)
        if not self._eval:
            updates = np.logical_not(self._dones)
            rets = self.reward_filter(rews, updates)[updates]
            var = 0 if rets.shape[0] <= 1 else rets.var()
            self.rn.update(rets.mean(), var, rets.shape[0])
        self._dones = np.logical_or(dones, self._dones)
        if self.rn.std > self.rn.eps:
            rews = rews / (self.rn.std.numpy() + self.rn.eps)
        return obs, rews, dones, infos

    def reset(self, force=True):
        """Reset."""
        obs = self.venv.reset(force=force)
        self._dones[:] = False
        return obs

    def step_wait(self):
        return self.venv.step_wait()
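
A minimal usage sketch of the wrapper. make_training_venv and policy below are hypothetical placeholders for however the vectorized environment and the agent are constructed elsewhere; only the wrapper calls come from the class above.

# Sketch only: make_training_venv and policy are hypothetical placeholders.
venv = VecRewardNormWrapper(make_training_venv(), gamma=0.99)
venv.train()                   # running return statistics are updated during rollouts
obs = venv.reset()
for _ in range(128):
    actions = policy(obs)      # hypothetical policy
    obs, rews, dones, infos = venv.step(actions)  # rews come back divided by the running return std
venv.eval()                    # freeze the statistics for evaluation episodes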
Code example #4
    def __init__(self, net, opt, gamma, shape, device):
        """
        Args:
            net (function): a function mapping input shape to a pytorch module.
            opt : PyTorch optimizer
            shape (tuple) : input shape for networks
            device : The device to place the networks on
        """

        super().__init__()
        self.target_net = net(shape).to(device)
        self.prediction_net = net(shape).to(device)
        self.device = device
        self.opt = opt(self.prediction_net.parameters())
        self.ob_norm = RunningNorm(shape).to(device)
        self.err_norm = RunningNorm((1,)).to(device)
        self.reward_filter = RewardForwardFilter(gamma)
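
Since opt is invoked with the prediction network's parameters, it is expected to be an optimizer constructor rather than an optimizer instance. Purely for illustration (the architecture and optimizer below are assumptions, not part of the snippet), the two factory arguments could be built like this:

import functools

import torch
from torch import nn


def net(shape):
    # Purely illustrative: a small MLP over flat observations.
    return nn.Sequential(nn.Linear(shape[0], 128), nn.ReLU(), nn.Linear(128, 32))


opt = functools.partial(torch.optim.Adam, lr=1e-4)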
Code example #5
class NGU(nn.Module):
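    """Intrinsic reward module in the style of Never Give Up (NGU).

    Combines an episodic novelty bonus, computed from k-nearest-neighbor
    distances over embeddings stored in a per-episode buffer, with an
    RND-based lifelong novelty bonus that modulates it.
    """
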
    def __init__(self,
                 rnd,
                 ide,
                 capacity,
                 device,
                 gamma=0.99,
                 k=10,
                 L=5,
                 eps=0.001,
                 min_dist=0.008,
                 max_similarity=4,
                 c=1.0):
        super().__init__()
        self.rnd = rnd
        # Change the reward normalization of RND to normalize reward instead of
        # return. This will allow rnd to modify intrinsic rewards meaningfully.
        # (rewards will have std of 1, instead of approximately 1-gamma)
        self.rnd.reward_filter = lambda r, updates: r
        self.ide = ide
        self.capacity = capacity
        self.device = device
        self.k = k
        self.L = L
        self.eps = eps
        self.min_dist = min_dist
        self.max_similarity = np.sqrt(max_similarity)
        self.c = c
        self.buffer = None
        self.dist_running_avg = 0.
        self.dist_running_count = 0.
        self.reward_filter = RewardForwardFilter(gamma)
        self.err_norm = RunningNorm((1, )).to(device)

    def forward(self, obs, update_norm=False, updates=None):
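        """Compute the combined intrinsic reward for a batch of observations."""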
        with torch.no_grad():
            embeddings = self.ide(obs)
            if self.buffer is None:
                self.buffer = EpisodeBuffer(self.capacity, embeddings.shape[0],
                                            embeddings.shape[1], self.device)

            dists = self.buffer.get_nearest_neighbors(embeddings, self.k)
            # Normalize distances by the running average of the k-th neighbor distance
            self._update_dist_running_mean(dists[:, -1])
            if self.dist_running_avg > 1e-5:
                dists /= self.dist_running_avg

            # Set close distances to 0
            dists = torch.max(dists - self.min_dist, torch.zeros_like(dists))

            # Compute inverse kernel of distances
            k = self.eps / (dists + self.eps)

            # Compute similarity
            s = torch.sqrt(self.c + k.sum(dim=1))

            # Compute short term intrinsic reward
            r = torch.where(s > self.max_similarity, torch.zeros_like(s),
                            1 / s)

            # Combine with the long-term intrinsic reward.
            # RND divides error by a running estimate of error std.
            # NGU also subtracts the mean and adds 1.
            r_rnd = self.rnd(obs, update_norm, updates)
            rnd_mean = self.rnd.err_norm.mean / (self.rnd.err_norm.std + 1e-5)
            r_rnd += 1.0 - rnd_mean
            modifier = torch.clamp(r_rnd, 1, self.L)
            reward = r * modifier

            self.buffer.insert(embeddings)

            # update running norm
            if update_norm and (updates is None or updates.sum() > 0):
                rets = self.reward_filter(reward, updates)[updates]
                var = 0 if rets.shape[0] == 1 else rets.var()
                self.err_norm.update(rets.mean(), var, rets.shape[0])

            return reward / (self.err_norm.std + 1e-5)

    def _update_dist_running_mean(self, max_dists):
        inds = self.buffer.n_in_buffer >= self.k
        max_dists = max_dists[inds]
        if max_dists.shape[0] == 0:
            return
        d = max_dists.mean()
        c = len(max_dists)
        new_c = self.dist_running_count + c
        self.dist_running_avg *= self.dist_running_count / new_c
        self.dist_running_avg += (c / new_c) * d
        self.dist_running_count += c

    def reset(self, dones=None):
        if self.buffer:
            self.buffer.reset(dones)

    def update_rnd(self, obs):
        return self.rnd.update(obs)

    def update_ide(self, obs, next_obs, actions):
        return self.ide.update(obs, next_obs, actions)

    def state_dict(self):
        return {
            'rnd': self.rnd.state_dict(),
            'ide': self.ide.state_dict(),
        }

    def load_state_dict(self, state_dict):
        self.rnd.load_state_dict(state_dict['rnd'])
        self.ide.load_state_dict(state_dict['ide'])
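
A sketch of how NGU might be driven during rollout collection. The rnd, ide, obs, next_obs, actions, dones, and device names are placeholders for objects produced elsewhere in the training loop; only the NGU calls themselves come from the class above.

import torch

# Assumptions: rnd is an RND instance, ide is an inverse-dynamics embedding
# network exposing __call__(obs) and update(obs, next_obs, actions), and
# device is the torch device used for the episodic buffer.
ngu = NGU(rnd, ide, capacity=30000, device=device)

# Per environment step: compute the combined intrinsic reward for a batch of
# observations (one per parallel environment), then clear episodic memory for
# environments whose episodes just ended (dones is the terminal mask).
updates = torch.ones(obs.shape[0], dtype=torch.bool, device=device)
intrinsic_reward = ngu(obs, update_norm=True, updates=updates)
ngu.reset(dones)

# Per optimization step: train the RND predictor and the embedding network.
rnd_loss = ngu.update_rnd(obs)
ide_loss = ngu.update_ide(obs, next_obs, actions)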
Code example #6
class RND(nn.Module):
    """Implementation of Random Network Distillation."""

    def __init__(self, net, opt, gamma, shape, device):
        """
        Args:
            net (function): a function mapping input shape to a pytorch module.
            opt : PyTorch optimizer
            shape (tuple) : input shape for networks
            device : The device to place the networks on
        """

        super().__init__()
        self.target_net = net(shape).to(device)
        self.prediction_net = net(shape).to(device)
        self.device = device
        self.opt = opt(self.prediction_net.parameters())
        self.ob_norm = RunningNorm(shape).to(device)
        self.err_norm = RunningNorm((1,)).to(device)
        self.reward_filter = RewardForwardFilter(gamma)

    def forward(self, obs, update_norm=False, updates=None):
        """Get intrinsic reward."""
        should_update = update_norm and (updates is None or updates.sum() > 0)
        if should_update:
            obs_to_update = obs if updates is None else obs[updates]
            if obs_to_update.shape[0] == 1:
                var = torch.zeros_like(obs[0])
            else:
                var = obs_to_update.var(dim=0)
            self.ob_norm.update(obs_to_update.mean(dim=0), var,
                                obs_to_update.shape[0])

        obs = torch.clamp(self.ob_norm(obs), -5, 5)
        with torch.no_grad():
            err = torch.norm(self.target_net(obs) - self.prediction_net(obs),
                             dim=1)
            if should_update:
                rets = self.reward_filter(err, updates)[updates]
                var = 0 if rets.shape[0] == 1 else rets.var()
                self.err_norm.update(rets.mean(), var, rets.shape[0])
            return err / (self.err_norm.std + 1e-5)

    def update(self, obs):
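        """Update the prediction network to better predict the fixed target network."""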
        obs = torch.clamp(self.ob_norm(obs), -5, 5)
        self.opt.zero_grad()
        loss = torch.norm(self.target_net(obs) - self.prediction_net(obs),
                          dim=1).mean()
        loss.backward()
        self.opt.step()
        return loss

    def state_dict(self, *args, **kwargs):
        return {
            'rnd': super().state_dict(*args, **kwargs),
            'opt': self.opt.state_dict()
        }

    def load_state_dict(self, state_dict):
        self.opt.load_state_dict(state_dict['opt'])
        super().load_state_dict(state_dict['rnd'])
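
A small end-to-end sketch of wiring RND up. The network architecture, optimizer, observation shape, and batch below are assumptions made for illustration; only the RND calls come from the class above.

import functools

import torch
from torch import nn


def make_net(shape):
    # Illustrative feature network; the real architecture is not specified above.
    return nn.Sequential(nn.Linear(shape[0], 256), nn.ReLU(), nn.Linear(256, 64))


opt = functools.partial(torch.optim.Adam, lr=1e-4)
rnd = RND(make_net, opt, gamma=0.99, shape=(8,), device='cpu')

obs = torch.randn(16, 8)                      # a batch of 16 observations
updates = torch.ones(16, dtype=torch.bool)    # all environments active
r_int = rnd(obs, update_norm=True, updates=updates)  # normalized intrinsic reward, shape (16,)
loss = rnd.update(obs)                        # one gradient step on the prediction network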