Code example #1
File: test_buffer.py Project: whoiszyc/imitation-1
def test_replay_buffer_from_data():
    obs = np.array([5, 2], dtype=int)
    acts = np.ones((2, 6), dtype=float)
    next_obs = np.array([7, 8], dtype=int)
    dones = np.array([True, False])
    infos = np.array([{}, {"a": "sdf"}])

    def _check_buf(buf):
        assert np.array_equal(buf._buffer._arrays["obs"], obs)
        assert np.array_equal(buf._buffer._arrays["next_obs"], next_obs)
        assert np.array_equal(buf._buffer._arrays["acts"], acts)
        assert np.array_equal(buf._buffer._arrays["infos"], infos)

    buf_std = ReplayBuffer.from_data(
        types.Transitions(
            obs=obs, acts=acts, next_obs=next_obs, dones=dones, infos=infos
        )
    )
    _check_buf(buf_std)

    rews = np.array([0.5, 1.0], dtype=float)
    buf_rew = ReplayBuffer.from_data(
        types.TransitionsWithRew(
            obs=obs,
            acts=acts,
            next_obs=next_obs,
            rews=rews,
            dones=dones,
            infos=infos,
        )
    )
    _check_buf(buf_rew)
Code example #2
File: rollout.py Project: robinbg/imitation
def flatten_trajectories(
    trajectories: Sequence[types.Trajectory],
) -> types.Transitions:
    """Flatten a series of trajectory dictionaries into arrays.

    Returns observations, actions, next observations, dones and infos.

    Args:
        trajectories: list of trajectories.

    Returns:
        The trajectories flattened into a single batch of Transitions.
    """
    keys = ["obs", "next_obs", "acts", "dones", "infos"]
    parts = {key: [] for key in keys}
    for traj in trajectories:
        parts["acts"].append(traj.acts)

        obs = traj.obs
        parts["obs"].append(obs[:-1])
        parts["next_obs"].append(obs[1:])

        dones = np.zeros(len(traj.acts), dtype=bool)
        dones[-1] = True
        parts["dones"].append(dones)

        if traj.infos is None:
            infos = np.array([{}] * len(traj))
        else:
            infos = traj.infos
        parts["infos"].append(infos)

    cat_parts = {
        key: np.concatenate(part_list, axis=0)
        for key, part_list in parts.items()
    }
    lengths = set(map(len, cat_parts.values()))
    assert len(lengths) == 1, f"expected one length, got {lengths}"
    return types.Transitions(**cat_parts)
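
The slicing above relies on each `types.Trajectory` storing one more observation than actions, so `obs[:-1]` and `obs[1:]` line up as aligned (obs, next_obs) pairs, with only the final transition marked done. A minimal sketch of that convention (hypothetical data, not from the repo):

import numpy as np

obs = np.arange(4)   # o_0, o_1, o_2, o_3
acts = np.arange(3)  # a_0, a_1, a_2: one fewer action than observation

# Each action is paired with the observation before and after it.
assert np.array_equal(obs[:-1], [0, 1, 2])  # per-transition obs
assert np.array_equal(obs[1:], [1, 2, 3])   # per-transition next_obs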
Code example #3
def mesh_evaluate_models_slow(
    models: Mapping[epic_sample.K, rewards.RewardModel],
    obs: np.ndarray,
    actions: np.ndarray,
    next_obs: np.ndarray,
) -> Mapping[epic_sample.K, np.ndarray]:
    """
    Evaluate models on the Cartesian product of `obs`, `actions`, `next_obs`.

    Same interface as `canonical_sample.mesh_evaluate_models`, but much simpler and much
    slower (around 20x). We use it in tests to verify the two implementations produce the same results.
    It might also be useful in the future for other optimisations (e.g. a JIT like Numba).
    """
    transitions = list(itertools.product(obs, actions, next_obs))
    tiled_obs, tiled_acts, tiled_next_obs = (
        np.array([m[i] for m in transitions]) for i in range(3)  # pylint:disable=not-an-iterable
    )
    dones = np.zeros(len(tiled_obs), dtype=bool)
    transitions = types.Transitions(
        obs=tiled_obs,
        acts=tiled_acts,
        next_obs=tiled_next_obs,
        dones=dones,
        infos=None,
    )
    rews = rewards.evaluate_models(models, transitions)
    rews = {
        k: v.reshape(len(obs), len(actions), len(next_obs))
        for k, v in rews.items()
    }
    return rews
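
The final reshape works because `itertools.product` varies its last argument fastest, which matches NumPy's C-order reshape. A quick self-contained check of that ordering (hypothetical data):

import itertools

import numpy as np

obs, acts, next_obs = [0, 1], [10, 20, 30], [100, 200]
flat = np.array([o + a + n for o, a, n in itertools.product(obs, acts, next_obs)])
grid = flat.reshape(len(obs), len(acts), len(next_obs))
# Index (i, j, k) recovers the triple (obs[i], acts[j], next_obs[k]).
assert grid[1, 2, 0] == obs[1] + acts[2] + next_obs[0]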
Code example #4
        def f(n: int) -> types.Transitions:
            target_size = max(min_buffer, n)
            delta = target_size - len(buf["obs"])
            if delta > 0:
                transitions = transitions_callable(delta)
                assert len(transitions.obs) == delta
                for k, v in buf.items():
                    new_v = getattr(transitions, k)
                    if len(v) > 0:
                        new_v = np.concatenate((v, new_v), axis=0)
                    buf[k] = new_v

                # Note this assert may not hold outside this branch: if f was previously called
                # with a larger `n`, then `len(buf["obs"])` may be greater than `target_size`.
                assert len(buf["obs"]) == target_size

            assert len(buf["obs"]) >= target_size
            idxs = {
                k: rng.choice(target_size, size=n, replace=False)
                for k in buf.keys()
            }
            res = {k: buf[k][idx] for k, idx in idxs.items()}
            res = types.Transitions(**res, infos=None)

            for k, idx in idxs.items():
                buf[k] = np.delete(buf[k], idx, axis=0)

            return res
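
Note that `f` draws an independent index set per key, so the returned fields are sampled independently rather than as aligned transitions; each draw also deletes the sampled rows, so entries are never reused. A minimal sketch of that consume-from-buffer pattern (hypothetical data):

import numpy as np

rng = np.random.default_rng(0)
buf = np.arange(10)
idx = rng.choice(len(buf), size=3, replace=False)  # sample without replacement
sample = buf[idx]
buf = np.delete(buf, idx, axis=0)  # sampled entries leave the buffer
assert len(buf) == 7 and not np.isin(sample, buf).any()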
Code example #5
def transitions(transitions_min: types.TransitionsMinimal,
                obs_space: gym.Space, length: int) -> types.Transitions:
    """Fixture to generate transitions of length `length` iid sampled from spaces."""
    next_obs = np.array([obs_space.sample() for _ in range(length)])
    dones = np.zeros(length, dtype=bool)
    return types.Transitions(**dataclasses.asdict(transitions_min),
                             next_obs=next_obs,
                             dones=dones)
Code example #6
    def step(self, obs, state=None, mask=None, deterministic=False):
        del deterministic
        # actions: (n_samples, ) + ac_space, obs: (batch_size, ) + ob_space
        actions = np.array(
            [self.ac_space.sample() for _ in range(self.n_samples)])
        # dup_actions: (1, n_samples) + ac_space
        # dup_obs: (batch_size, 1) + ob_space
        dup_actions = actions[np.newaxis, :]
        dup_obs = obs[:, np.newaxis]
        # dup_actions: (batch_size, n_samples) + ac_space
        # dup_obs: (batch_size, n_samples) + ob_space
        batch_size = obs.shape[0]
        dup_actions = dup_actions.repeat(batch_size, axis=0)
        dup_obs = dup_obs.repeat(self.n_samples, axis=1)
        # dup_actions: (batch_size * n_samples, ) + ac_space
        # dup_obs: (batch_size * n_samples, ) + ob_space
        dup_actions = dup_actions.reshape(batch_size * self.n_samples, -1)
        dup_obs = dup_obs.reshape(batch_size * self.n_samples, -1)

        try:
            # TODO(): vectorizing transition would improve performance
            next_obs = []
            for old_ob, act in zip(dup_obs, dup_actions):
                old_s = self.venv.env_method("state_from_obs",
                                             old_ob,
                                             indices=[0])[0]
                new_s = self.venv.env_method("transition",
                                             old_s,
                                             act,
                                             indices=[0])[0]
                next_ob = self.venv.env_method("obs_from_state",
                                               new_s,
                                               indices=[0])[0]
                next_obs.append(next_ob)
            next_obs = np.array(next_obs)
        except AttributeError:
            warnings.warn("Environment is not model-based: will assume next "
                          "observation is the same as current observation.")
            next_obs = dup_obs

        dones = np.zeros(batch_size * self.n_samples, dtype=bool)
        batch = types.Transitions(obs=dup_obs,
                                  acts=dup_actions,
                                  next_obs=next_obs,
                                  dones=dones,
                                  infos=None)
        feed_dict = base.make_feed_dict([self.reward_model], batch)
        # TODO(): add a function to RewardModel to compute this?
        reward = self.sess.run(self.reward_model.reward, feed_dict=feed_dict)

        reward = np.reshape(reward, (batch_size, self.n_samples))
        best_actions_idx = reward.argmax(axis=1)
        best_actions = actions[best_actions_idx]

        return best_actions, None, None, None
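
The tiling in `step` pairs every observation in the batch with every sampled action before scoring them with the reward model. A shape-only sketch of that broadcast/repeat/reshape pattern (hypothetical dimensions):

import numpy as np

batch_size, n_samples, ob_dim, ac_dim = 4, 3, 5, 2
obs = np.zeros((batch_size, ob_dim))
actions = np.zeros((n_samples, ac_dim))

dup_actions = actions[np.newaxis, :].repeat(batch_size, axis=0)  # (4, 3, 2)
dup_obs = obs[:, np.newaxis].repeat(n_samples, axis=1)           # (4, 3, 5)
assert dup_actions.reshape(batch_size * n_samples, -1).shape == (12, ac_dim)
assert dup_obs.reshape(batch_size * n_samples, -1).shape == (12, ob_dim)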
Code example #7
File: buffer.py Project: whoiszyc/imitation-1
    def sample(self, n_samples: int) -> types.Transitions:
        """Sample obs-act-obs triples.

        Args:
            n_samples: The number of samples.

        Returns:
            A `Transitions` instance containing `n_samples` transitions.
        """
        sample = self._buffer.sample(n_samples)
        return types.Transitions(**sample)
Code example #8
def mesh_input(
    env: point_mass.PointMassEnv,
    goal: np.ndarray,
    pos_lim: float = 1.0,
    pos_density: int = 9,
    vel_lim: float = 1.0,
    act_lim: float = 1.0,
    density: int = 21,
) -> Tuple[List[List[int]], types.Transitions]:
    """Computes a grid dataset of observation, actions and next observations.

    Specifically, it computes a grid of position, velocity and actions
    with the corresponding limits and density. It uses a fixed, specified goal.
    It then computes the next observation for each possible combination
    under the environment transition dynamics.

    Arguments:
        env: The PointMass environment.
        goal: A goal position, an (env.ndim)-dimensional vector.
        pos_lim: Position limit: the mesh will include range [-pos_lim, pos_lim].
        pos_density: The number of points in the position axis.
        vel_lim: Velocity limit: the mesh will include range [-vel_lim, vel_lim].
        act_lim: Action limit: the mesh will include range [-act_lim, act_lim].
        density: The number of points in the velocity and acceleration axes.

    Returns:
        Indexes (before turning into a grid) and a batch of resulting
        observation, action and next-observation triples.
    """
    n = env.ndim

    ranges = [(pos_lim, pos_density), (vel_lim, density), (act_lim, density)]
    idxs = [np.linspace(-lim, lim, density) for lim, density in ranges]
    idxs = list(itertools.chain(*[[idx for _ in range(n)] for idx in idxs]))
    mesh = np.meshgrid(*idxs, indexing="ij")

    pos = np.stack([x.flatten() for x in mesh[0:n]], axis=-1)
    vel = np.stack([x.flatten() for x in mesh[n:2 * n]], axis=-1)
    goal_obs = np.broadcast_to(goal, (pos.shape[0], n))
    obs = np.concatenate((pos, vel, goal_obs),
                         axis=-1).astype(env.observation_space.dtype)
    actions = np.stack([x.flatten() for x in mesh[2 * n:3 * n]], axis=-1)

    states = env.state_from_obs(obs)
    next_states = env.transition(states, actions)
    next_obs = env.obs_from_state(next_states)

    dones = np.zeros(len(obs), dtype=bool)
    dataset = types.Transitions(obs=obs,
                                acts=actions,
                                next_obs=next_obs,
                                dones=dones,
                                infos=None)
    return idxs, dataset
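
`np.meshgrid(..., indexing="ij")` followed by flatten-and-stack enumerates every grid combination, with earlier axes varying slowest. A reduced sketch with one position axis and one velocity axis (hypothetical values):

import numpy as np

pos_axis = np.linspace(-1.0, 1.0, 3)
vel_axis = np.linspace(-1.0, 1.0, 2)
mesh = np.meshgrid(pos_axis, vel_axis, indexing="ij")
pos = np.stack([x.flatten() for x in mesh[0:1]], axis=-1)
vel = np.stack([x.flatten() for x in mesh[1:2]], axis=-1)
assert pos.shape == vel.shape == (6, 1)  # 3 positions x 2 velocities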
Code example #9
File: test_data.py Project: tigerneil/imitation
def transitions(obs_space: gym.Space, act_space: gym.Space,
                length: int) -> types.Transitions:
    """Fixture to generate transitions of length `length` iid sampled from spaces."""
    obs = np.array([obs_space.sample() for _ in range(length)])
    next_obs = np.array([obs_space.sample() for _ in range(length)])
    acts = np.array([act_space.sample() for _ in range(length)])
    dones = np.zeros(length, dtype=bool)
    return types.Transitions(obs=obs,
                             acts=acts,
                             next_obs=next_obs,
                             dones=dones)
Code example #10
File: test_data.py Project: tigerneil/imitation
def test_zero_length_fails():
    """Check zero-length trajectory and transitions fail."""
    empty = np.array([])
    with pytest.raises(ValueError, match=r"Degenerate trajectory.*"):
        types.Trajectory(obs=np.array([42]), acts=empty, infos=None)
    with pytest.raises(ValueError, match=r"Must have non-zero number of.*"):
        types.Transitions(
            obs=empty,
            acts=empty,
            next_obs=empty,
            dones=empty.astype(bool),
        )
Code example #11
def f(total_timesteps: int) -> types.Transitions:
    obses = obs_dist(total_timesteps)
    acts = act_dist(total_timesteps)
    next_obses = obs_dist(total_timesteps)
    dones = np.zeros(total_timesteps, dtype=bool)
    return types.Transitions(
        obs=np.array(obses),
        acts=np.array(acts),
        next_obs=np.array(next_obses),
        dones=dones,
        infos=None,
    )
Code example #12
def reward_fn(obs: np.ndarray, actions: np.ndarray,
              next_obs: np.ndarray,
              steps: np.ndarray) -> np.ndarray:
    """Helper method computing reward for registered model."""
    del steps
    # TODO(adam): RewardFn should probably include dones?
    dones = np.zeros(len(obs), dtype=bool)
    transitions = types.Transitions(
        obs=obs,
        acts=actions,
        next_obs=next_obs,
        dones=dones,
        infos=None,
    )
    fd = rewards.make_feed_dict([reward_model], transitions)
    return sess.run(reward_model.reward, feed_dict=fd)
Code example #13
    def _make_feed_dict(self, preferences: List[TrajectoryPreference]):
        """Builds a feed dictionary.

        Args:
            preferences: A list of trajectory comparisons.

        Returns:
            A feed dict.
        """
        obs = _concatenate(preferences, "obs", slice(0, -1))
        acts = _concatenate(preferences, "acts", slice(None))
        next_obs = _concatenate(preferences, "obs", slice(1, None))
        dones = np.zeros(len(obs), dtype=bool)
        batch = types.Transitions(obs=obs,
                                  acts=acts,
                                  next_obs=next_obs,
                                  dones=dones,
                                  infos=None)
        feed_dict = base.make_feed_dict([self.model], batch)
        labels = np.array([p.label for p in preferences])
        feed_dict[self._preference_labels] = labels
        return feed_dict
Code example #14
    def f(total_timesteps: int) -> types.Transitions:
        """Helper function."""
        obses = []
        acts = []
        next_obses = []
        for _ in range(total_timesteps):
            old_state = env.state_space.sample()
            obs = env.obs_from_state(old_state)
            act = env.action_space.sample()
            new_state = env.transition(old_state,
                                       act)  # may be non-deterministic
            next_obs = env.obs_from_state(new_state)

            obses.append(obs)
            acts.append(act)
            next_obses.append(next_obs)
        dones = np.zeros(total_timesteps, dtype=bool)
        return types.Transitions(
            obs=np.array(obses),
            acts=np.array(acts),
            next_obs=np.array(next_obses),
            dones=dones,
            infos=None,
        )
Code example #15
def dataset_generator(total_timesteps):
    obs = np.array([obs_space.sample() for _ in range(total_timesteps)])
    actions = np.array([act_space.sample() for _ in range(total_timesteps)])
    next_obs = (obs + actions).clip(0.0, 1.0)
    dones = np.zeros(total_timesteps, dtype=bool)
    return types.Transitions(obs=obs, acts=actions, next_obs=next_obs, dones=dones, infos=None)
Code example #16
File: test_buffer.py Project: whoiszyc/imitation-1
def test_replay_buffer(capacity, chunk_len, obs_shape, act_shape, dtype):
    """Builds a ReplayBuffer with the provided `capacity` and inserts.

    `capacity * 3` observation-action-observation samples into the buffer in
    chunks of length `chunk_len`.

    All chunks are of the appropriate observation or action shape, and contain
    the value fill_val.

    `len(buffer)` should increase until we reach capacity.
    `buffer._idx` should loop between 0 and `capacity - 1`.
    After every insertion, samples should only contain 66.6.
    """
    buf = ReplayBuffer(
        capacity,
        obs_shape=obs_shape,
        act_shape=act_shape,
        obs_dtype=dtype,
        act_dtype=dtype,
    )

    for i in range(0, capacity * 3, chunk_len):
        assert buf.size() == min(i, capacity)
        assert buf._buffer._idx == i % capacity

        dones = np.arange(i, i + chunk_len, dtype=np.int32) % 2
        dones = dones.astype(bool)
        infos = _fill_chunk(9 * capacity + i, chunk_len, (), dtype=dtype)
        infos = np.array([{"a": val} for val in infos])
        batch = types.Transitions(
            obs=_fill_chunk(i, chunk_len, obs_shape, dtype=dtype),
            next_obs=_fill_chunk(3 * capacity + i, chunk_len, obs_shape, dtype=dtype),
            acts=_fill_chunk(6 * capacity + i, chunk_len, act_shape, dtype=dtype),
            dones=dones,
            infos=infos,
        )
        buf.store(batch)

        # Are samples right shape?
        sample = buf.sample(100)
        info_vals = np.array([info["a"] for info in sample.infos])

        assert sample.obs.shape == sample.next_obs.shape == (100,) + obs_shape
        assert sample.acts.shape == (100,) + act_shape
        assert sample.dones.shape == (100,)
        assert info_vals.shape == (100,)

        # Are samples right data type?
        assert sample.obs.dtype == dtype
        assert sample.acts.dtype == dtype
        assert sample.next_obs.dtype == dtype
        assert info_vals.dtype == dtype
        assert sample.dones.dtype == bool
        assert sample.infos.dtype == object

        # Are samples in range?
        _check_bound(i + chunk_len, capacity, sample.obs)
        _check_bound(i + chunk_len, capacity, sample.next_obs, 3 * capacity)
        _check_bound(i + chunk_len, capacity, sample.acts, 6 * capacity)
        _check_bound(i + chunk_len, capacity, info_vals, 9 * capacity)

        # Are samples in-order?
        obs_fill = _get_fill_from_chunk(sample.obs)
        next_obs_fill = _get_fill_from_chunk(sample.next_obs)
        act_fill = _get_fill_from_chunk(sample.acts)
        info_vals_fill = _get_fill_from_chunk(info_vals)

        assert np.all(next_obs_fill - obs_fill == 3 * capacity), "out of order"
        assert np.all(act_fill - next_obs_fill == 3 * capacity), "out of order"
        assert np.all(info_vals_fill - act_fill == 3 * capacity), "out of order"
        # Can't do much other than parity check for boolean values.
        # `samples.done` has the same parity as `obs_fill` by construction.
        assert np.all(obs_fill % 2 == sample.dones), "out of order"
Code example #17
def sample_mean_rews(
    models: Mapping[K, rewards.RewardModel],
    mean_from_obs: np.ndarray,
    act_samples: np.ndarray,
    next_obs_samples: np.ndarray,
    batch_size: int = 2**28,
) -> Mapping[K, np.ndarray]:
    """
    Estimates the mean reward from observations `mean_from_obs` using given samples.

    Evaluates in batches of at most `batch_size` bytes to avoid running out of memory. Note that
    the observations and actions, being vectors, often take up much more memory in RAM than the
    results, which are scalar values.

    Args:
        models: A mapping from keys to reward models.
        mean_from_obs: Observations to compute the mean starting from.
        act_samples: Actions to compute the mean with respect to.
        next_obs_samples: Next observations to compute the mean with respect to.
        batch_size: The maximum number of points to compute the reward with respect to in a single
            batch.

    Returns:
        A mapping from keys to NumPy array of shape `(len(mean_from_obs),)`, containing the
        mean reward of the model over triples:
            `(obs, act, next_obs) for act, next_obs in zip(act_samples, next_obs_samples)`
    """
    assert act_samples.shape[0] == next_obs_samples.shape[0]
    assert mean_from_obs.shape[1:] == next_obs_samples.shape[1:]

    # Compute indexes to not exceed batch size
    sample_mem_usage = act_samples.nbytes + mean_from_obs.nbytes
    obs_per_batch = batch_size // sample_mem_usage
    if obs_per_batch <= 0:
        msg = f"`batch_size` too small to compute a batch: {batch_size} < {sample_mem_usage}."
        raise ValueError(msg)
    idxs = np.arange(0, len(mean_from_obs), obs_per_batch)
    idxs = np.concatenate((idxs, [len(mean_from_obs)]))  # include end point

    # Compute mean rewards
    mean_rews = {k: [] for k in models.keys()}
    reps = min(obs_per_batch, len(mean_from_obs))
    act_tiled = _tile_first_dim(act_samples, reps)
    next_obs_tiled = _tile_first_dim(next_obs_samples, reps)
    for start, end in zip(idxs[:-1], idxs[1:]):
        obs = mean_from_obs[start:end]
        obs_repeated = np.repeat(obs, len(act_samples), axis=0)
        batch = types.Transitions(
            obs=obs_repeated,
            acts=act_tiled[:len(obs_repeated), :],
            next_obs=next_obs_tiled[:len(obs_repeated), :],
            dones=np.zeros(len(obs_repeated), dtype=bool),
            infos=None,
        )
        rews = rewards.evaluate_models(models, batch)
        rews = {k: v.reshape(len(obs), -1) for k, v in rews.items()}
        for k, m in mean_rews.items():
            means = np.mean(rews[k], axis=1)
            m.extend(means)

    mean_rews = {k: np.array(v) for k, v in mean_rews.items()}
    for v in mean_rews.values():
        assert v.shape == (len(mean_from_obs),)
    return mean_rews
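
The batching above pairs `np.repeat` on the observations with tiled copies of the samples, so every observation meets every `(act, next_obs)` sample exactly once. A minimal sketch of that pairing (hypothetical data; `np.tile` stands in for the repo's `_tile_first_dim` helper):

import numpy as np

mean_from_obs = np.array([[0.0], [1.0]])
act_samples = np.array([[10.0], [20.0], [30.0]])

obs_repeated = np.repeat(mean_from_obs, len(act_samples), axis=0)
act_tiled = np.tile(act_samples, (len(mean_from_obs), 1))
# Rows pair as (0,10), (0,20), (0,30), (1,10), (1,20), (1,30).
assert obs_repeated.shape == act_tiled.shape == (6, 1)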