def test_replay_buffer_from_data():
    """Check that `ReplayBuffer.from_data` stores the supplied arrays unchanged.

    The buffer is built once from `types.Transitions` and once from
    `types.TransitionsWithRew`; in both cases the stored `obs`, `next_obs`,
    `acts` and `infos` arrays must equal the inputs.
    """
    obs = np.array([5, 2], dtype=int)
    acts = np.ones((2, 6), dtype=float)
    next_obs = np.array([7, 8], dtype=int)
    dones = np.array([True, False])
    infos = np.array([{}, {"a": "sdf"}])
    rews = np.array([0.5, 1.0], dtype=float)

    # Fields whose stored copies are compared against the inputs.
    expected = {"obs": obs, "next_obs": next_obs, "acts": acts, "infos": infos}

    def _check_buf(buf):
        stored = buf._buffer._arrays
        for key, want in expected.items():
            assert np.array_equal(stored[key], want)

    # Keyword arguments shared by both transition flavours.
    shared = dict(obs=obs, acts=acts, next_obs=next_obs, dones=dones, infos=infos)

    buf_std = ReplayBuffer.from_data(types.Transitions(**shared))
    _check_buf(buf_std)

    buf_rew = ReplayBuffer.from_data(types.TransitionsWithRew(rews=rews, **shared))
    _check_buf(buf_rew)
def flatten_trajectories(
    trajectories: Sequence[types.Trajectory],
) -> types.Transitions:
    """Flatten a series of trajectories into a single batch of transitions.

    For every trajectory, step `i` pairs `traj.obs[i]` with `traj.acts[i]` and
    `traj.obs[i + 1]`. `dones` is False everywhere except the last step of
    each trajectory. Trajectories without infos get one fresh empty dict per
    step.

    Args:
        trajectories: list of trajectories.

    Returns:
        The trajectories flattened into a single batch of Transitions, with
        fields `obs`, `next_obs`, `acts`, `dones` and `infos`.
    """
    keys = ["obs", "next_obs", "acts", "dones", "infos"]
    parts = {key: [] for key in keys}
    for traj in trajectories:
        parts["acts"].append(traj.acts)

        obs = traj.obs
        parts["obs"].append(obs[:-1])
        parts["next_obs"].append(obs[1:])

        # `np.bool` was removed in NumPy 1.24; the builtin `bool` is equivalent.
        dones = np.zeros(len(traj.acts), dtype=bool)
        dones[-1] = True
        parts["dones"].append(dones)

        if traj.infos is None:
            # Build a distinct dict per step: `[{}] * n` would alias a single
            # dict, so mutating one info would silently change all of them.
            infos = np.array([{} for _ in range(len(traj))])
        else:
            infos = traj.infos
        parts["infos"].append(infos)

    cat_parts = {
        key: np.concatenate(part_list, axis=0) for key, part_list in parts.items()
    }
    # Every field must describe the same number of transitions.
    lengths = set(map(len, cat_parts.values()))
    assert len(lengths) == 1, f"expected one length, got {lengths}"
    return types.Transitions(**cat_parts)
def mesh_evaluate_models_slow(
    models: Mapping[epic_sample.K, rewards.RewardModel],
    obs: np.ndarray,
    actions: np.ndarray,
    next_obs: np.ndarray,
) -> Mapping[epic_sample.K, np.ndarray]:
    """
    Evaluate models on the Cartesian product of `obs`, `actions`, `next_obs`.

    Same interface as `canonical_sample.mesh_evaluate_models`. However, this is much
    simpler, but also much slower (around 20x). We use it for testing to verify they
    produce the same results. It might also be useful in the future for other
    optimisations (e.g. a JIT like Numba).

    Args:
        models: A mapping from keys to reward models.
        obs: Observations; iterated along the first axis.
        actions: Actions; iterated along the first axis.
        next_obs: Next observations; iterated along the first axis.

    Returns:
        A mapping from the keys of `models` to arrays reshaped to
        `(len(obs), len(actions), len(next_obs))`.
    """
    # Materialise every (obs, action, next_obs) combination explicitly.
    triples = list(itertools.product(obs, actions, next_obs))
    tiled_obs, tiled_acts, tiled_next_obs = (
        np.array([m[i] for m in triples])
        for i in range(3)  # pylint:disable=not-an-iterable
    )
    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is equivalent.
    dones = np.zeros(len(tiled_obs), dtype=bool)
    transitions = types.Transitions(
        obs=tiled_obs,
        acts=tiled_acts,
        next_obs=tiled_next_obs,
        dones=dones,
        infos=None,
    )
    rews = rewards.evaluate_models(models, transitions)
    return {
        k: v.reshape(len(obs), len(actions), len(next_obs)) for k, v in rews.items()
    }
def f(n: int) -> types.Transitions:
    """Draw `n` transitions from the enclosing buffer, refilling it if needed.

    The buffer is topped up via `transitions_callable` until it holds at least
    `max(min_buffer, n)` entries. Indices are then drawn without replacement
    separately for each buffered field, and the drawn rows are removed from
    the buffer.

    Args:
        n: number of transitions to return.

    Returns:
        A `types.Transitions` built from the sampled rows, with `infos=None`.
    """
    target_size = max(min_buffer, n)
    shortfall = target_size - len(buf["obs"])
    if shortfall > 0:
        fresh = transitions_callable(shortfall)
        assert len(fresh.obs) == shortfall
        for key in list(buf):
            stored = buf[key]
            incoming = getattr(fresh, key)
            if len(stored) > 0:
                incoming = np.concatenate((stored, incoming), axis=0)
            buf[key] = incoming
        # Note this assert may not hold outside this branch: if f was previously
        # called with a larger `n`, then `len(buf["obs"])` may be greater than
        # `target_size`.
        assert len(buf["obs"]) == target_size
    assert len(buf["obs"]) >= target_size

    # One independent draw of indices per field, in the buffer's key order.
    idxs = {}
    for key in buf.keys():
        idxs[key] = rng.choice(target_size, size=n, replace=False)

    sampled = {key: buf[key][idx] for key, idx in idxs.items()}
    res = types.Transitions(**sampled, infos=None)

    # Drop the rows that were just handed out.
    for key, idx in idxs.items():
        buf[key] = np.delete(buf[key], idx, axis=0)
    return res
def transitions(
    transitions_min: types.TransitionsMinimal, obs_space: gym.Space, length: int
) -> types.Transitions:
    """Fixture to generate transitions of length `length` iid sampled from spaces.

    Args:
        transitions_min: minimal transitions whose fields are copied over via
            `dataclasses.asdict`.
        obs_space: space that next observations are sampled from.
        length: number of transitions to generate.

    Returns:
        Transitions extending `transitions_min` with sampled `next_obs` and
        all-False `dones`.
    """
    next_obs = np.array([obs_space.sample() for _ in range(length)])
    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is equivalent.
    dones = np.zeros(length, dtype=bool)
    return types.Transitions(
        **dataclasses.asdict(transitions_min), next_obs=next_obs, dones=dones
    )
def step(self, obs, state=None, mask=None, deterministic=False):
    """Pick, per observation, the sampled action with the highest predicted reward.

    Samples `self.n_samples` actions from `self.ac_space`, pairs every
    observation with every sampled action, and predicts the next observation
    through the environment's `state_from_obs`, `transition` and
    `obs_from_state` methods. If that raises `AttributeError`, a warning is
    issued and the next observation is taken to be the current one. The
    reward model is then evaluated on all pairs and the arg-max action is
    chosen for each observation.

    Args:
        obs: batch of observations; the first axis is the batch.
        state: unused.
        mask: unused.
        deterministic: ignored.

    Returns:
        A 4-tuple `(best_actions, None, None, None)`.
    """
    del deterministic
    # actions: (n_samples, ) + ac_space, obs: (batch_size, ) + ob_space
    actions = np.array([self.ac_space.sample() for _ in range(self.n_samples)])

    # dup_actions: (1, n_samples) + ac_space
    # dup_obs: (batch_size, 1) + ob_space
    dup_actions = actions[np.newaxis, :]
    dup_obs = obs[:, np.newaxis]

    # dup_actions: (batch_size, n_samples) + ac_space
    # dup_obs: (batch_size, n_samples) + ob_space
    batch_size = obs.shape[0]
    dup_actions = dup_actions.repeat(batch_size, axis=0)
    dup_obs = dup_obs.repeat(self.n_samples, axis=1)

    # dup_actions: (batch_size * n_samples, ) + ac_space
    # dup_obs: (batch_size * n_samples, ) + ob_space
    dup_actions = dup_actions.reshape(batch_size * self.n_samples, -1)
    dup_obs = dup_obs.reshape(batch_size * self.n_samples, -1)

    try:
        # TODO(): vectorizing transition would improve performance
        next_obs = []
        for old_ob, act in zip(dup_obs, dup_actions):
            old_s = self.venv.env_method("state_from_obs", old_ob, indices=[0])[0]
            new_s = self.venv.env_method("transition", old_s, act, indices=[0])[0]
            next_ob = self.venv.env_method("obs_from_state", new_s, indices=[0])[0]
            next_obs.append(next_ob)
        next_obs = np.array(next_obs)
    except AttributeError:
        warnings.warn("Environment is not model-based: will assume next "
                      "observation is the same as current observation.")
        next_obs = dup_obs

    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is equivalent.
    dones = np.zeros(batch_size * self.n_samples, dtype=bool)
    batch = types.Transitions(
        obs=dup_obs,
        acts=dup_actions,
        next_obs=next_obs,
        dones=dones,
        infos=None,
    )
    feed_dict = base.make_feed_dict([self.reward_model], batch)
    # TODO(): add a function to RewardModel to compute this?
    reward = self.sess.run(self.reward_model.reward, feed_dict=feed_dict)

    # One row per observation, one column per sampled action.
    reward = np.reshape(reward, (batch_size, self.n_samples))
    best_actions_idx = reward.argmax(axis=1)
    best_actions = actions[best_actions_idx]
    return best_actions, None, None, None
def sample(self, n_samples: int) -> types.Transitions:
    """Draw a batch of transitions from the underlying buffer.

    Args:
        n_samples: The number of samples.

    Returns:
        A `types.Transitions` built from the `n_samples` entries returned by
        the underlying buffer's `sample` method.
    """
    return types.Transitions(**self._buffer.sample(n_samples))
def mesh_input(
    env: point_mass.PointMassEnv,
    goal: np.ndarray,
    pos_lim: float = 1.0,
    pos_density: int = 9,
    vel_lim: float = 1.0,
    act_lim: float = 1.0,
    density: int = 21,
) -> Tuple[List[List[int]], types.Transitions]:
    """Computes a grid dataset of observation, actions and next observations.

    Specifically, it computes a grid of position, velocity and actions
    with the corresponding limits and density. It uses a fixed, specified goal.
    It then computes the next observation for each possible combination
    under the environment transition dynamics.

    Arguments:
        env: The PointMass environment.
        goal: A goal position, an (env.ndim)-dimensional vector.
        pos_lim: Position limit: the mesh will include range [-pos_lim, pos_lim].
        pos_density: The number of points in the position axis.
        vel_lim: Velocity limit: the mesh will include range [-vel_lim, vel_lim].
        act_lim: Action limit: the mesh will include range [-act_lim, act_lim].
        density: The number of points in the velocity and acceleration axes.

    Returns:
        Indexes (before turning into a grid) and a batch of resulting observation,
        action and next-observation triples.
    """
    n = env.ndim

    # (limit, number of points) per quantity: position, velocity, action.
    ranges = [(pos_lim, pos_density), (vel_lim, density), (act_lim, density)]
    # Use `num` rather than `density` to avoid shadowing the parameter.
    idxs = [np.linspace(-lim, lim, num) for lim, num in ranges]
    # Repeat each axis `n` times: n position axes, n velocity axes, n action axes.
    idxs = list(itertools.chain(*[[idx for _ in range(n)] for idx in idxs]))
    mesh = np.meshgrid(*idxs, indexing="ij")

    pos = np.stack([x.flatten() for x in mesh[0:n]], axis=-1)
    vel = np.stack([x.flatten() for x in mesh[n:2 * n]], axis=-1)
    goal_obs = np.broadcast_to(goal, (pos.shape[0], n))
    obs = np.concatenate((pos, vel, goal_obs), axis=-1)
    obs = obs.astype(env.observation_space.dtype)
    actions = np.stack([x.flatten() for x in mesh[2 * n:3 * n]], axis=-1)

    states = env.state_from_obs(obs)
    next_states = env.transition(states, actions)
    next_obs = env.obs_from_state(next_states)

    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is equivalent.
    dones = np.zeros(len(obs), dtype=bool)
    dataset = types.Transitions(
        obs=obs,
        acts=actions,
        next_obs=next_obs,
        dones=dones,
        infos=None,
    )
    return idxs, dataset
def transitions(
    obs_space: gym.Space, act_space: gym.Space, length: int
) -> types.Transitions:
    """Fixture to generate transitions of length `length` iid sampled from spaces.

    Args:
        obs_space: space that observations and next observations are sampled from.
        act_space: space that actions are sampled from.
        length: number of transitions to generate.

    Returns:
        Transitions with independently sampled `obs`, `acts` and `next_obs`,
        and all-False `dones`.
    """
    obs = np.array([obs_space.sample() for _ in range(length)])
    next_obs = np.array([obs_space.sample() for _ in range(length)])
    acts = np.array([act_space.sample() for _ in range(length)])
    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is equivalent.
    dones = np.zeros(length, dtype=bool)
    return types.Transitions(obs=obs, acts=acts, next_obs=next_obs, dones=dones)
def test_zero_length_fails():
    """Check zero-length trajectory and transitions fail."""
    empty = np.array([])

    # A trajectory with one observation and no actions is degenerate.
    with pytest.raises(ValueError, match=r"Degenerate trajectory.*"):
        types.Trajectory(obs=np.array([42]), acts=empty, infos=None)

    with pytest.raises(ValueError, match=r"Must have non-zero number of.*"):
        types.Transitions(
            obs=empty,
            acts=empty,
            next_obs=empty,
            # `np.bool` was removed in NumPy 1.24; the builtin `bool` is equivalent.
            dones=empty.astype(bool),
        )
def f(total_timesteps: int) -> types.Transitions:
    """Sample `total_timesteps` transitions from the enclosing distributions.

    Observations and next observations come from separate calls to `obs_dist`;
    actions come from `act_dist`.

    Args:
        total_timesteps: number of transitions to sample.

    Returns:
        Transitions with all-False `dones` and `infos=None`.
    """
    obses = obs_dist(total_timesteps)
    acts = act_dist(total_timesteps)
    next_obses = obs_dist(total_timesteps)
    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is equivalent.
    dones = np.zeros(total_timesteps, dtype=bool)
    return types.Transitions(
        obs=np.array(obses),
        acts=np.array(acts),
        next_obs=np.array(next_obses),
        dones=dones,
        infos=None,
    )
def reward_fn(
    obs: np.ndarray,
    actions: np.ndarray,
    next_obs: np.ndarray,
    steps: np.ndarray,
) -> np.ndarray:
    """Helper method computing reward for registered model.

    Args:
        obs: batch of observations.
        actions: batch of actions.
        next_obs: batch of next observations.
        steps: ignored.

    Returns:
        The result of running `reward_model.reward` in `sess` on the batch.
    """
    del steps
    # TODO(adam): RewardFn should probably include dones?
    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is equivalent.
    dones = np.zeros(len(obs), dtype=bool)
    transitions = types.Transitions(
        obs=obs,
        acts=actions,
        next_obs=next_obs,
        dones=dones,
        infos=None,
    )
    fd = rewards.make_feed_dict([reward_model], transitions)
    return sess.run(reward_model.reward, feed_dict=fd)
def _make_feed_dict(self, preferences: List[TrajectoryPreference]):
    """Builds a feed dictionary.

    Observations, actions and next observations are gathered from the
    preferences with `_concatenate`; next observations are the observations
    shifted by one step. All `dones` are False.

    Args:
        preferences: A list of trajectory comparisons.

    Returns:
        A feed dict for `self.model`, with the preference labels added.
    """
    obs = _concatenate(preferences, "obs", slice(0, -1))
    acts = _concatenate(preferences, "acts", slice(None))
    next_obs = _concatenate(preferences, "obs", slice(1, None))
    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is equivalent.
    dones = np.zeros(len(obs), dtype=bool)
    batch = types.Transitions(
        obs=obs,
        acts=acts,
        next_obs=next_obs,
        dones=dones,
        infos=None,
    )
    feed_dict = base.make_feed_dict([self.model], batch)
    labels = np.array([p.label for p in preferences])
    feed_dict[self._preference_labels] = labels
    return feed_dict
def f(total_timesteps: int) -> types.Transitions:
    """Sample transitions by stepping the enclosing `env` from random states.

    Each transition samples a state from `env.state_space` and an action from
    `env.action_space`, then applies `env.transition`.

    Args:
        total_timesteps: number of transitions to sample.

    Returns:
        Transitions with all-False `dones` and `infos=None`.
    """
    obses = []
    acts = []
    next_obses = []
    for _ in range(total_timesteps):
        old_state = env.state_space.sample()
        obs = env.obs_from_state(old_state)
        act = env.action_space.sample()
        new_state = env.transition(old_state, act)  # may be non-deterministic
        next_obs = env.obs_from_state(new_state)

        obses.append(obs)
        acts.append(act)
        next_obses.append(next_obs)

    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is equivalent.
    dones = np.zeros(total_timesteps, dtype=bool)
    return types.Transitions(
        obs=np.array(obses),
        acts=np.array(acts),
        next_obs=np.array(next_obses),
        dones=dones,
        infos=None,
    )
def dataset_generator(total_timesteps):
    """Generate transitions whose next observation is `obs + action`, clipped.

    Args:
        total_timesteps: number of transitions to generate.

    Returns:
        Transitions with `obs` and `acts` sampled from the enclosing
        `obs_space` and `act_space`, `next_obs` equal to their sum clipped to
        [0, 1], all-False `dones` and `infos=None`.
    """
    obs = np.array([obs_space.sample() for _ in range(total_timesteps)])
    actions = np.array([act_space.sample() for _ in range(total_timesteps)])
    next_obs = (obs + actions).clip(0.0, 1.0)
    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is equivalent.
    dones = np.zeros(total_timesteps, dtype=bool)
    return types.Transitions(
        obs=obs,
        acts=actions,
        next_obs=next_obs,
        dones=dones,
        infos=None,
    )
def test_replay_buffer(capacity, chunk_len, obs_shape, act_shape, dtype):
    """Builds a ReplayBuffer with the provided `capacity` and inserts into it.

    `capacity * 3` observation-action-observation samples are stored in chunks
    of length `chunk_len`. Chunks are produced by `_fill_chunk` with a
    different offset per field (0 for obs, `3 * capacity` for next_obs,
    `6 * capacity` for acts, `9 * capacity` for info values) so that the
    checks below can tell the fields apart.

    `buf.size()` should increase until we reach capacity.
    `buf._buffer._idx` should loop between 0 and `capacity - 1`.
    After every insertion, samples are checked for shape, dtype, range and
    for consistency between the fields.
    """
    buf = ReplayBuffer(
        capacity,
        obs_shape=obs_shape,
        act_shape=act_shape,
        obs_dtype=dtype,
        act_dtype=dtype,
    )

    for i in range(0, capacity * 3, chunk_len):
        assert buf.size() == min(i, capacity)
        assert buf._buffer._idx == i % capacity

        # `np.bool` was removed in NumPy 1.24; the builtin `bool` is equivalent.
        dones = np.arange(i, i + chunk_len, dtype=np.int32) % 2
        dones = dones.astype(bool)
        infos = _fill_chunk(9 * capacity + i, chunk_len, (), dtype=dtype)
        infos = np.array([{"a": val} for val in infos])
        batch = types.Transitions(
            obs=_fill_chunk(i, chunk_len, obs_shape, dtype=dtype),
            next_obs=_fill_chunk(3 * capacity + i, chunk_len, obs_shape, dtype=dtype),
            acts=_fill_chunk(6 * capacity + i, chunk_len, act_shape, dtype=dtype),
            dones=dones,
            infos=infos,
        )
        buf.store(batch)

        # Are samples right shape?
        sample = buf.sample(100)
        info_vals = np.array([info["a"] for info in sample.infos])
        assert sample.obs.shape == sample.next_obs.shape == (100,) + obs_shape
        assert sample.acts.shape == (100,) + act_shape
        assert sample.dones.shape == (100,)
        assert info_vals.shape == (100,)

        # Are samples right data type?
        assert sample.obs.dtype == dtype
        assert sample.acts.dtype == dtype
        assert sample.next_obs.dtype == dtype
        assert info_vals.dtype == dtype
        # `np.bool` / `np.object` were removed in NumPy 1.24; use the builtins.
        assert sample.dones.dtype == bool
        assert sample.infos.dtype == object

        # Are samples in range?
        _check_bound(i + chunk_len, capacity, sample.obs)
        _check_bound(i + chunk_len, capacity, sample.next_obs, 3 * capacity)
        _check_bound(i + chunk_len, capacity, sample.acts, 6 * capacity)
        _check_bound(i + chunk_len, capacity, info_vals, 9 * capacity)

        # Are samples in-order?
        obs_fill = _get_fill_from_chunk(sample.obs)
        next_obs_fill = _get_fill_from_chunk(sample.next_obs)
        act_fill = _get_fill_from_chunk(sample.acts)
        info_vals_fill = _get_fill_from_chunk(info_vals)

        assert np.all(next_obs_fill - obs_fill == 3 * capacity), "out of order"
        assert np.all(act_fill - next_obs_fill == 3 * capacity), "out of order"
        assert np.all(info_vals_fill - act_fill == 3 * capacity), "out of order"
        # Can't do much other than parity check for boolean values.
        # `samples.done` has the same parity as `obs_fill` by construction.
        assert np.all(obs_fill % 2 == sample.dones), "out of order"
def sample_mean_rews(
    models: Mapping[K, rewards.RewardModel],
    mean_from_obs: np.ndarray,
    act_samples: np.ndarray,
    next_obs_samples: np.ndarray,
    batch_size: int = 2**28,
) -> Mapping[K, np.ndarray]:
    """
    Estimates the mean reward from observations `mean_from_obs` using given samples.

    Evaluates in batches of at most `batch_size` bytes to avoid running out of memory.
    Note that the observations and actions, being vectors, often take up much more
    memory in RAM than the results, a scalar value.

    Args:
        models: A mapping from keys to reward models.
        mean_from_obs: Observations to compute the mean starting from.
        act_samples: Actions to compute the mean with respect to.
        next_obs_samples: Next observations to compute the mean with respect to.
        batch_size: The maximum size in bytes of a single batch; it is compared
            against `act_samples.nbytes + mean_from_obs.nbytes`.

    Returns:
        A mapping from keys to NumPy array of shape `(len(mean_from_obs),)`,
        containing the mean reward of the model over triples:
            `(obs, act, next_obs) for act, next_obs in zip(act_samples, next_obs_samples)`

    Raises:
        ValueError: if `batch_size` is too small to hold even a single batch.
    """
    assert act_samples.shape[0] == next_obs_samples.shape[0]
    assert mean_from_obs.shape[1:] == next_obs_samples.shape[1:]

    # Compute indexes to not exceed batch size
    sample_mem_usage = act_samples.nbytes + mean_from_obs.nbytes
    obs_per_batch = batch_size // sample_mem_usage
    if obs_per_batch <= 0:
        msg = f"`batch_size` too small to compute a batch: {batch_size} < {sample_mem_usage}."
        raise ValueError(msg)
    idxs = np.arange(0, len(mean_from_obs), obs_per_batch)
    idxs = np.concatenate((idxs, [len(mean_from_obs)]))  # include end point

    # Compute mean rewards
    mean_rews = {k: [] for k in models.keys()}
    # Tile once for the largest batch; smaller batches slice a prefix of it.
    reps = min(obs_per_batch, len(mean_from_obs))
    act_tiled = _tile_first_dim(act_samples, reps)
    next_obs_tiled = _tile_first_dim(next_obs_samples, reps)
    for start, end in zip(idxs[:-1], idxs[1:]):
        obs = mean_from_obs[start:end]
        obs_repeated = np.repeat(obs, len(act_samples), axis=0)
        batch = types.Transitions(
            obs=obs_repeated,
            acts=act_tiled[:len(obs_repeated), :],
            next_obs=next_obs_tiled[:len(obs_repeated), :],
            # `np.bool` was removed in NumPy 1.24; the builtin `bool` is equivalent.
            dones=np.zeros(len(obs_repeated), dtype=bool),
            infos=None,
        )
        rews = rewards.evaluate_models(models, batch)
        # One row per observation, one column per (act, next_obs) sample.
        rews = {k: v.reshape(len(obs), -1) for k, v in rews.items()}
        for k, m in mean_rews.items():
            means = np.mean(rews[k], axis=1)
            m.extend(means)

    mean_rews = {k: np.array(v) for k, v in mean_rews.items()}
    for v in mean_rews.values():
        assert v.shape == (len(mean_from_obs), )
    return mean_rews