Ejemplo n.º 1
0
    def _encode_sample(self, idxes):
        """Assemble a batch of transitions for the given buffer indices.

        Every leaf of ``self.data`` is indexed with ``idxes``. The encoded
        observations at ``idx`` and ``idx + 1`` are batched along a new
        leading axis and stored under ``'obs'`` and ``'next_obs'``.
        """
        def _stack(leaves):
            # Give each leaf a leading batch axis, then join along it.
            return np.concatenate([leaf[np.newaxis, :] for leaf in leaves], 0)

        def _collate(encoded):
            return nest.map_structure(_stack, nest.zip_structure(*encoded))

        batch = nest.map_structure(lambda leaf: leaf[idxes], self.data)
        batch['obs'] = _collate(
            [self._encode_observation(i) for i in idxes])
        batch['next_obs'] = _collate(
            [self._encode_observation(i + 1) for i in idxes])
        return batch
Ejemplo n.º 2
0
def _get_env_ob_norm(env, steps, eps):
    """Estimate observation mean and std from a random-action rollout.

    Takes ``steps`` random actions in ``env`` (resetting whenever an episode
    ends), concatenates the collected observations leaf by leaf, pairs them
    with the unpacked observation space, and applies ``_compute_mean`` and
    ``_compute_std(eps)`` to every pair.

    Returns
    -------
    (mean, std): nests with the same structure as the observations.
    """
    collected = [env.reset()]
    for _step in range(steps):
        ob, _reward, done, _info = env.step(env.action_space.sample())
        if done:
            # Store the first observation of the new episode instead.
            ob = env.reset()
        collected.append(ob)

    stacked = nest.map_structure(np.concatenate,
                                 nest.zip_structure(*collected))
    paired = nest.zip_structure(stacked, unpack_space(env.observation_space))
    mean = nest.map_structure(_compute_mean, paired)
    std_fn = _compute_std(eps)
    std = nest.map_structure(std_fn, paired)
    return mean, std
Ejemplo n.º 3
0
    def _encode_sample(self, idxes):
        """Build a batch dict for the given buffer indices.

        The batch holds the encoded observations at ``idxes`` under
        ``'obs'``, every entry of ``self.data`` indexed with ``idxes``, and
        the encoded observations at ``idx + 1`` under ``'next_obs'``.
        """
        def _stack(leaves):
            # Give each leaf a leading batch axis, then join along it.
            return np.concatenate([leaf[np.newaxis, :] for leaf in leaves], 0)

        def _collate(encoded):
            return nest.map_structure(_stack, nest.zip_structure(*encoded))

        batch = {
            'obs': _collate([self._encode_observation(i) for i in idxes]),
        }
        batch.update(
            (key, self.data[key][idxes]) for key in self.data.keys())
        batch['next_obs'] = _collate(
            [self._encode_observation(i + 1) for i in idxes])
        return batch
Ejemplo n.º 4
0
def _get_venv_ob_norm(env, steps, eps):
    """Estimate observation mean and std from a vectorized random rollout.

    Runs ``steps // env.num_envs`` random-action steps, calling
    ``env.reset(force=False)`` whenever any environment reports done.
    The collected observations are concatenated leaf by leaf, paired with
    the unpacked observation space, and passed to ``_compute_mean`` and
    ``_compute_std(eps)``.

    Returns
    -------
    (mean, std): nests with the same structure as the observations.
    """
    collected = [env.reset()]
    n_iters = steps // env.num_envs
    for _iter in range(n_iters):
        actions = np.array(
            [env.action_space.sample() for _env in range(env.num_envs)])
        ob, _reward, done, _info = env.step(actions)
        if np.any(done):
            ob = env.reset(force=False)
        collected.append(ob)

    stacked = nest.map_structure(np.concatenate,
                                 nest.zip_structure(*collected))
    paired = nest.zip_structure(stacked, unpack_space(env.observation_space))
    mean = nest.map_structure(_compute_mean, paired)
    std_fn = _compute_std(eps)
    std = nest.map_structure(std_fn, paired)
    return mean, std
Ejemplo n.º 5
0
    def store_observation(self, obs):
        """Write one observation into the buffer at the next free slot.

        Old observations are overwritten once the buffer wraps around.

        Parameters
        ----------
        obs: nest of np.array
            The observation to store.

        Returns
        -------
        idx: int
            Slot the observation was written to. Pass it to `store_effect`
            later.

        """
        if self.obs is None:
            # Storage is created lazily from the first observation seen.
            self.obs = nest.map_structure(self._init_obs_data, obs)

        slot = self.next_idx

        def _write(pair):
            storage, value = pair
            storage[slot] = value

        nest.map_structure(_write, nest.zip_structure(self.obs, obs))

        # Advance the write position circularly and cap the fill count.
        self.next_idx = (slot + 1) % self.size
        self.num_in_buffer = min(self.size, self.num_in_buffer + 1)
        return slot
Ejemplo n.º 6
0
    def store_effect(self, idx, step_data):
        """Store the effects of the action taken after observing obs at idx.

        `store_observation` and `store_effect` are separate functions so
        that `encode_recent_observation` can be called in between.

        Parameters
        ----------
        idx: int
            Index in buffer of recent observation
            (returned by `store_observation`).
        step_data: dict
            The data to store in the buffer. It must have the same structure
            on every call.

        Raises
        ------
        ValueError
            If the structure of `step_data` differs from the stored data.
        """
        if self.data == {}:
            # First call: allocate storage shaped after this step's data.
            self._init_replay_data(step_data)

        structures_match = nest.has_same_structure(self.data, step_data)
        if not structures_match:
            raise ValueError("The data passed to ReplayBuffer must the same"
                             " at all time steps.")

        def _write(pair):
            storage, value = pair
            storage[idx] = value

        pairs = nest.zip_structure(self.data, step_data)
        nest.map_structure(_write, pairs)
Ejemplo n.º 7
0
 def _reset_done_envs(self):
     """Reset only the environments that need it and return stacked obs.

     An environment is reset when it has no stored transition or when
     element 2 of its stored transition is truthy (presumably the done
     flag). Otherwise element 0 of the stored transition is reused as its
     observation.
     """
     collected = []
     for e in range(self.num_envs):
         transition = self.transitions[e]
         if transition is not None and not transition[2]:
             collected.append(transition[0])
             continue
         self.transitions[e] = None
         collected.append(self.envs[e].reset())
     zipped = nest.zip_structure(*collected)
     return nest.map_structure(np.stack, zipped)
Ejemplo n.º 8
0
    def log_prob(self, ac):
        """Return the sum of per-distribution log probs of ``ac``.

        ``ac`` is zipped with ``self.dists``, so it must share their
        structure.
        """
        paired = nest.zip_structure(self.dists, ac)
        per_dist = nest.map_structure(
            lambda pair: pair[0].log_prob(pair[1]), paired)
        return sum(nest.flatten(per_dist))
Ejemplo n.º 9
0
 def forward(self, ob, ac):
     """Evaluate the network on ``ob`` with the residual ``ac`` applied.

     The action stored in the observation is unnormalized, its ``'torque'``
     entry is shifted by ``self.params.max_torque * ac``, and the result is
     normalized again before the observation is embedded and passed to
     ``self.net``.
     """
     filtered = self.obs_filter.get_value_fn_observation(ob)
     ob = nest.map_structure(lambda leaf: leaf.float(), filtered)
     if self.device is None:
         # First call: adopt the device of the inputs and move the action
         # statistics with ``self._to_torch``.
         self.device = nest.flatten(ob)[0].device
         self.ac_mean = nest.map_structure(self._to_torch, self.ac_mean)
         self.ac_std = nest.map_structure(self._to_torch, self.ac_std)

     def _with_stats(actions):
         return nest.zip_structure(actions, self.ac_mean, self.ac_std)

     # Action observations are normalized, so unnormalize them before
     # combining with the new action.
     combined_ac = nest.map_structure(self._unnorm_action,
                                      _with_stats(ob['action']))
     residual = self.params.max_torque * ac
     combined_ac['torque'] = combined_ac['torque'] + residual
     ob['action'] = nest.map_structure(self._norm_action,
                                       _with_stats(combined_ac))
     embedded = self.embedding(ob)
     return self.net(embedded)
Ejemplo n.º 10
0
    def _encode_sample(self, idxes):
        """Build an n-step batch for the given start indices.

        ``batch['obs']`` is a list of ``self.n + 1`` batched observations
        (offsets ``0 .. self.n``). Every key of ``self.data`` maps to a list
        of ``self.n`` arrays, one per offset ``0 .. self.n - 1``.
        """
        def _stack(leaves):
            # Give each leaf a leading batch axis, then join along it.
            return np.concatenate([leaf[np.newaxis, :] for leaf in leaves], 0)

        def _encode_at(offset):
            encoded = [self._encode_observation(idx + offset)
                       for idx in idxes]
            return nest.map_structure(_stack, nest.zip_structure(*encoded))

        batch = {'obs': []}
        for key in self.data.keys():
            batch[key] = []

        for i in range(self.n):
            batch['obs'].append(_encode_at(i))
            shifted = [idx + i for idx in idxes]
            for key in self.data.keys():
                batch[key].append(self.data[key][shifted])

        # One extra observation closes the n-step window.
        batch['obs'].append(_encode_at(self.n))
        return batch
Ejemplo n.º 11
0
def _get_env_ob_norm(env, steps):
    """Estimate per-leaf observation mean and std from a random rollout.

    Takes ``steps`` random actions in ``env`` (resetting whenever an episode
    ends), stacks the collected observations leaf by leaf, and reduces over
    the stacking axis.

    Returns
    -------
    (mean, std): nests with the same structure as the observations.
    """
    collected = [env.reset()]
    for _step in range(steps):
        ob, _reward, done, _info = env.step(env.action_space.sample())
        if done:
            ob = env.reset()
        collected.append(ob)

    stacked = nest.map_structure(np.stack, nest.zip_structure(*collected))

    def _mean(x):
        return np.mean(x, axis=0)

    def _std(x):
        return np.std(x, axis=0)

    return (nest.map_structure(_mean, stacked),
            nest.map_structure(_std, stacked))
Ejemplo n.º 12
0
    def insert(self, step_data):
        """Insert one step of data into storage.

        Each value of ``step_data`` is copied into the slot for the current
        step, transferring it to ``self.device`` first if it lives on a
        different device. The running ``'return'`` entry is then incremented
        by this step's reward for processes whose previous step was not
        done, and the sequence lengths, step counter and rollout-completion
        flag are updated.

        Parameters
        ----------
        step_data: dict
            Data for the current step. Its keys must equal ``self.keys`` and
            it is read for ``'reward'`` and ``'done'``. Every leaf must have
            a 0th dimension equal to ``self.num_processes``. Leaves are
            presumably torch tensors (``.device``, ``.to`` and ``.shape``
            are used) -- TODO confirm.

        Raises
        ------
        ValueError
            If the rollout is already complete, if the keys of ``step_data``
            differ from ``self.keys``, or if a leaf has the wrong 0th
            dimension.
        """
        # Storage is created lazily from the first step's data.
        if self.data is None:
            self.init_data(step_data)

        if self.rollout_complete:
            raise ValueError("Tried to insert data when the rollout is "
                             " complete. Call rollout.reset() to reset.")

        # Grow the storage once the current step index reaches num_steps.
        if self.step >= self.num_steps:
            self.extend_storage()

        if set(step_data.keys()) != self.keys:
            raise ValueError("The same data must be provided at every step.")

        def _copy_data(item):
            # item is a (storage, value) pair produced by nest.zip_structure.
            storage, step_data = item
            if step_data.device != self.device:
                storage[self.step].copy_(step_data.to(self.device))
            else:
                storage[self.step].copy_(step_data)

        def _check_shape(data, key):
            if data.shape[0] != self.num_processes:
                raise ValueError(f"data '{key}' is expected to have its "
                                 f"0th dimension equal to the number "
                                 f"of processes: {self.num_processes}")

        # Validate, then copy, every leaf of every key into the current slot.
        for k in self.keys:
            nest.map_structure(partial(_check_shape, key=k), step_data[k])
            nest.map_structure(_copy_data,
                               nest.zip_structure(self.data[k], step_data[k]))

        if self.step == 0:
            # First step of a rollout: clear the accumulators; no process
            # counts as done yet.
            self.data['return'].fill_(0.)
            self.data['q_mc'].fill_(0.)
            done = torch.zeros_like(self.data['done'][0])
        else:
            # Use the done flags stored at the previous step.
            done = self.data['done'][self.step - 1]
        # Mask out the reward of processes that were already done; for a
        # 2-D reward the mask gets a trailing axis so it broadcasts.
        if len(step_data['reward'].shape) == 2:
            r = torch.logical_not(done.unsqueeze(-1)) * step_data['reward'].to(
                self.device)
        else:
            r = torch.logical_not(done) * step_data['reward'].to(self.device)
        self.data['return'] += r

        # Sequence lengths grow only for processes not done at this step.
        self.sequence_lengths += torch.logical_not(step_data['done'].cpu())
        self.step = self.step + 1
        # The rollout is complete once every process reports done.
        self.rollout_complete = bool(torch.all(step_data['done']))
Ejemplo n.º 13
0
    def update(self, mean, var, count):
        """Fold a batch's statistics into the running statistics.

        When nothing has been accumulated yet (``self.count == 0``) the
        batch statistics are adopted directly. Otherwise ``self._update`` is
        applied to every ``(self.mean, self.var, mean, var)`` leaf tuple,
        with ``self.batch_count`` and ``self.new_count`` set beforehand,
        after which ``self.count`` becomes ``self.new_count``.
        """
        if self.count != 0:
            self.batch_count = count
            self.new_count = count + self.count
            leaves = nest.zip_structure(self.mean, self.var, mean, var)
            nest.map_structure(self._update, leaves)
            self.count = self.new_count
            return

        self.mean = mean
        self.var = var
        self.count = count
Ejemplo n.º 14
0
def _get_venv_ob_norm(env, steps):
    """Estimate per-leaf observation mean and std from a vectorized rollout.

    Runs ``steps // env.num_envs`` random-action steps, calling
    ``env.reset(force=False)`` whenever any environment reports done. The
    collected observations are concatenated leaf by leaf and reduced over
    axis 0.

    Returns
    -------
    (mean, std): nests with the same structure as the observations.
    """
    collected = [env.reset()]
    n_iters = steps // env.num_envs
    for _iter in range(n_iters):
        actions = np.array(
            [env.action_space.sample() for _env in range(env.num_envs)])
        ob, _reward, done, _info = env.step(actions)
        if np.any(done):
            ob = env.reset(force=False)
        collected.append(ob)

    joined = nest.map_structure(np.concatenate,
                                nest.zip_structure(*collected))

    def _mean(x):
        return np.mean(x, axis=0)

    def _std(x):
        return np.std(x, axis=0)

    return (nest.map_structure(_mean, joined),
            nest.map_structure(_std, joined))
Ejemplo n.º 15
0
    def step_wait(self):
        """Step every environment that is still running and collect results.

        An environment is stepped when it has no stored transition or when
        element 2 of its stored transition (unpacked below as ``dones``) is
        falsy. Other environments keep their stored transition. Each info
        dict gets an ``'active'`` flag telling whether its environment was
        stepped in this call.

        Returns
        -------
        (obs, rewards, dones, infos) with observations stacked leaf by leaf.
        """
        active = [False] * self.num_envs

        for e in range(self.num_envs):
            transition = self.transitions[e]
            # Step only when the episode is still in progress.
            if transition is None or not transition[2]:
                action = nest.map_structure(lambda ac: ac[e], self.actions)
                self.transitions[e] = self.envs[e].step(action)
                active[e] = True

        obs, rs, dones, infos = zip(*self.transitions)
        for env_idx, info in enumerate(infos):
            info['active'] = active[env_idx]
        stacked_obs = nest.map_structure(np.stack, nest.zip_structure(*obs))
        return stacked_obs, np.stack(rs), np.stack(dones), infos
Ejemplo n.º 16
0
def _get_venv_ob_norm(env, steps):
    """Estimate observation mean and std using only the first environment.

    Only observations from environment 0 are collected. This is hacky and
    inefficient, but it is the simplest solution given that the
    environments sync their resets.

    Returns
    -------
    (mean, std): nests with the same structure as a single observation.
    """
    def _first_env(ob):
        return nest.map_structure(lambda x: x[0], ob)

    collected = [_first_env(env.reset())]
    for _step in range(steps):
        actions = np.array(
            [env.action_space.sample() for _env in range(env.num_envs)])
        ob, _reward, done, _info = env.step(actions)
        if done[0]:
            ob = env.reset()
        collected.append(_first_env(ob))

    stacked = nest.map_structure(np.stack, nest.zip_structure(*collected))

    def _mean(x):
        return np.mean(x, axis=0)

    def _std(x):
        return np.std(x, axis=0)

    return (nest.map_structure(_mean, stacked),
            nest.map_structure(_std, stacked))
Ejemplo n.º 17
0
    def _normalize(self, obs):
        if not self.should_norm:
            return obs
        if self.mean is None or self.std is None:
            self.find_norm_params()
        obs = nest.map_structure(np.asarray, obs)
        obs = nest.map_structure(np.float32, obs)
        if not nest.has_same_structure(self.mean, obs):
            raise ValueError("mean and obs do not have the same structure!")

        def norm(item):
            ob, mean, std = item
            return (ob - mean) / std

        return nest.map_structure(norm,
                                  nest.zip_structure(obs, self.mean, self.std))
Ejemplo n.º 18
0
    def step(self, action):
        """Step the wrapped env and return the updated frame stack.

        Frames of environments that report done are zeroed before the new
        observation is added with ``self._add_new_observation``. The
        returned observation is a copy of ``self.frames``.
        """
        ob, reward, done, info = self.venv.step(action)

        def _clear_finished(frames):
            for env_idx, finished in enumerate(done):
                if finished:
                    frames[env_idx] = 0
            return frames

        self.frames = nest.map_structure(_clear_finished, self.frames)
        paired = nest.zip_structure(self.frames, ob)
        self.frames = nest.map_structure(
            lambda pair: self._add_new_observation(*pair), paired)
        stacked = nest.map_structure(lambda x: x.copy(), self.frames)
        # Accumulate which environments have reported done.
        self._dones = np.logical_or(done, self._dones)
        return stacked, reward, done, info
Ejemplo n.º 19
0
    def reset(self, force=True):
        """Reset the wrapped env and return the refreshed frame stack.

        With ``force`` set, every frame is zeroed. Otherwise only the frames
        selected by ``self._dones`` are zeroed. The new observation is then
        added with ``self._add_new_observation`` and ``self._dones`` is
        cleared. The returned observation is a copy of ``self.frames``.
        """
        ob = self.venv.reset(force=force)

        def _clear(frames):
            selection = slice(None) if force else self._dones
            frames[selection] = 0
            return frames

        self.frames = nest.map_structure(_clear, self.frames)
        paired = nest.zip_structure(self.frames, ob)
        self.frames = nest.map_structure(
            lambda pair: self._add_new_observation(*pair), paired)
        self._dones[:] = False
        return nest.map_structure(lambda x: x.copy(), self.frames)
Ejemplo n.º 20
0
    def sample(self, batch_size):
        """Sample `batch_size` transitions spread across the buffers.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.

        Returns
        -------
        batched data: dict
            a dictionary containing batched observations, next_observations,
            action, reward, done, and other data stored in the replay buffer.

        """
        sizes = self._get_sizes(batch_size)

        # Buffers assigned a size of zero are skipped.
        batches = []
        for buffer, size in zip(self.buffers, sizes):
            if size > 0:
                batches.append(buffer.sample(size))

        def _join(parts):
            return np.concatenate(parts, axis=0)

        return nest.map_structure(_join, nest.zip_structure(*batches))
Ejemplo n.º 21
0
    def get_observation(self, obs, base_action):
        """Prepare an observation for the policy as torch tensors.

        The base action is added to ``obs``, every leaf gets a leading batch
        axis, leaves whose mean in ``self.ob_norm`` is not ``None`` are
        normalized, and the result is moved to ``self.device``.
        """
        with_action = self._add_action_to_obs(obs, base_action)
        batched = nest.map_structure(lambda x: x[None], with_action)

        def _standardize(triple):
            value, mean, std = triple
            if mean is None:
                # No statistics for this leaf: pass it through unchanged.
                return value
            return (value - mean) / std

        triples = nest.zip_structure(batched, self.ob_norm['mean'],
                                     self.ob_norm['std'])
        normalized = nest.map_structure(_standardize, triples)

        def _to_tensor(array):
            return torch.from_numpy(array).to(self.device)

        return nest.map_structure(_to_tensor, normalized)
Ejemplo n.º 22
0
def _flatten_obs(obs):
    assert isinstance(obs, (list, tuple))
    assert len(obs) > 0
    return nest.map_structure(np.stack, nest.zip_structure(*obs))
Ejemplo n.º 23
0
 def reset(self, force=True):
     """Reset the environments and return their stacked observations.

     With ``force`` set, every environment is reset and all stored
     transitions are cleared. Otherwise the work is delegated to
     ``self._reset_done_envs()``.
     """
     if force:
         obs = [self.envs[e].reset() for e in range(self.num_envs)]
         self.transitions = [None] * self.num_envs
         return nest.map_structure(np.stack, nest.zip_structure(*obs))
     return self._reset_done_envs()
Ejemplo n.º 24
0
 def encode_recent_observation(self):
     """Stack the most recent encoded observation of every buffer."""
     recent = []
     for buf in self.buffers:
         recent.append(buf.encode_recent_observation())
     zipped = nest.zip_structure(*recent)
     return nest.map_structure(np.stack, zipped)
Ejemplo n.º 25
0
 def kl(self, other):
     """Return the summed KL divergence against ``other``'s distributions.

     ``self.dists`` and ``other.dists`` are zipped, so they must share the
     same structure.
     """
     def _pair_kl(pair):
         mine, theirs = pair
         return mine.kl(theirs)

     paired = nest.zip_structure(self.dists, other.dists)
     return sum(nest.flatten(nest.map_structure(_pair_kl, paired)))