def summary_stats(
    observation_space: gym.Space,
    action_space: gym.Space,
    dataset: types.Transitions,
    reward_hids: Optional[Iterable[int]] = None,
    potential_hids: Optional[Iterable[int]] = None,
):
    """Compute summary statistics of a random reward and potential model.

    Builds a freshly (randomly) initialized MLP reward model and MLP potential
    shaping model, evaluates both on `dataset`, and summarizes the resulting
    distributions.

    Args:
        observation_space: Observation space of the environment.
        action_space: Action space of the environment.
        dataset: Transitions to evaluate the models on.
        reward_hids: Hidden layer sizes for the reward model.
        potential_hids: Hidden layer sizes for the potential shaping model.

    Returns:
        A mapping from quantity name ("reward", "shaping", "old_potential",
        "new_potential") to `scipy.stats.describe` summary statistics.
    """
    # Construct randomly initialized reward and potential-shaping networks.
    reward_net = rewards.MLPRewardModel(observation_space, action_space, reward_hids)
    shaping_net = rewards.MLPPotentialShaping(observation_space, action_space, potential_hids)
    tf.get_default_session().run(tf.global_variables_initializer())

    # Evaluate both networks on the dataset.
    outputs = rewards.evaluate_models({"reward": reward_net, "shaping": shaping_net}, dataset)
    (old_pot,), (new_pot,) = rewards.evaluate_potentials([shaping_net], dataset)

    # Describe the distribution of each predicted quantity.
    quantities = dict(outputs)
    quantities["old_potential"] = old_pot
    quantities["new_potential"] = new_pot
    return {name: sp.stats.describe(vals) for name, vals in quantities.items()}
def mesh_evaluate_models_slow(
    models: Mapping[epic_sample.K, rewards.RewardModel],
    obs: np.ndarray,
    actions: np.ndarray,
    next_obs: np.ndarray,
) -> Mapping[epic_sample.K, np.ndarray]:
    """
    Evaluate models on the Cartesian product of `obs`, `actions`, `next_obs`.

    Same interface as `canonical_sample.mesh_evaluate_models`. However, this is much
    simpler, but also much slower (around 20x). We use it for testing to verify they
    produce the same results. It might also be useful in the future for other
    optimisations (e.g. a JIT like Numba).

    Args:
        models: A mapping from keys to reward models.
        obs: An array of observations.
        actions: An array of actions.
        next_obs: An array of next observations.

    Returns:
        A mapping from keys to arrays of shape `(len(obs), len(actions), len(next_obs))`,
        containing each model's reward over the full transition mesh.
    """
    transitions = list(itertools.product(obs, actions, next_obs))
    tiled_obs, tiled_acts, tiled_next_obs = (
        np.array([m[i] for m in transitions])
        for i in range(3)  # pylint:disable=not-an-iterable
    )
    # `np.bool` was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # `bool` is the supported spelling and is what NumPy aliased it to anyway.
    dones = np.zeros(len(tiled_obs), dtype=bool)
    transitions = types.Transitions(
        obs=tiled_obs,
        acts=tiled_acts,
        next_obs=tiled_next_obs,
        dones=dones,
        infos=None,
    )
    rews = rewards.evaluate_models(models, transitions)
    # Restore the mesh structure: one axis per factor of the Cartesian product.
    rews = {k: v.reshape(len(obs), len(actions), len(next_obs)) for k, v in rews.items()}
    return rews
def constant_baseline(
    match: comparisons.RegressModel,
    target: rewards.RewardModel,
    dataset: datasets.TransitionsCallable,
    test_size: int = 4096,
) -> Dict[str, Any]:
    """Computes the error in predictions of the model matched and some baselines.

    Arguments:
        match: The (fitted) match object.
        target: The reward model we are trying to predict.
        dataset: The dataset to evaluate on.
        test_size: The number of samples to evaluate on.

    Returns:
        A dictionary containing summary statistics.
    """
    batch = dataset(test_size)
    preds = rewards.evaluate_models({"matched": match.model, "target": target}, batch)
    target_pred = preds["target"]
    delta = preds["matched"] - target_pred
    # Baselines are the best constant predictors: the median minimizes L1 error,
    # the mean minimizes L2 error.
    return {
        "int_l1": norm_diff(delta, target_pred, norm=1),
        "int_l2": norm_diff(delta, target_pred, norm=2),
        "baseline_l1": norm_diff(np.median(target_pred), target_pred, norm=1),
        "baseline_l2": norm_diff(np.mean(target_pred), target_pred, norm=2),
    }
def test_potential_shaping_invariants(
    graph, session, venv, potential_cls, discount: float, num_timesteps: int = 100
):
    """Test that potential shaping obeys several invariants.

    Specifically:
        1. new_potential must be constant when dones is true, and zero when
           `discount == 1.0`.
        2. new_potential depends only on next observation.
        3. old_potential depends only on current observation.
        4. Shaping is discount * new_potential - old_potential.
    """
    # Generate a batch of transitions from a random policy to evaluate on.
    policy = base.RandomPolicy(venv.observation_space, venv.action_space)
    transitions = rollout.generate_transitions(policy, venv, n_timesteps=num_timesteps)

    with graph.as_default(), session.as_default():
        potential = potential_cls(venv.observation_space, venv.action_space, discount=discount)
        session.run(tf.global_variables_initializer())
        (old_pot,), (new_pot,) = rewards.evaluate_potentials([potential], transitions)

    # Check invariant 1: new_potential must be constant when dones is true,
    # and zero in the undiscounted case.
    transitions_all_done = dataclasses.replace(
        transitions,
        # `np.bool` was removed in NumPy 1.24; builtin `bool` is the supported alias.
        dones=np.ones_like(transitions.dones, dtype=bool),
    )
    with session.as_default():
        _, new_pot_done = rewards.evaluate_potentials([potential], transitions_all_done)
    expected_new_pot_done = 0.0 if discount == 1.0 else np.mean(new_pot_done)
    assert np.allclose(new_pot_done, expected_new_pot_done)

    # Check invariants 2 and 3: {new,old}_potential depend only on {next,current} observation.
    # Shuffling the *other* observation field must leave the potential unchanged.
    def _shuffle(fld: str):
        arr = np.array(getattr(transitions, fld))
        np.random.shuffle(arr)
        trans = dataclasses.replace(transitions, **{fld: arr})
        with session.as_default():
            return rewards.evaluate_potentials([potential], trans)

    (old_pot_shuffled,), _ = _shuffle("next_obs")
    _, (new_pot_shuffled,) = _shuffle("obs")
    assert np.all(old_pot == old_pot_shuffled)
    assert np.all(new_pot == new_pot_shuffled)

    # Check invariant 4: that reward output is as expected given potentials.
    with session.as_default():
        rew = rewards.evaluate_models({"m": potential}, transitions)["m"]
    assert np.allclose(rew, discount * new_pot - old_pot)
def test_ground_truth_similar_to_gym(graph, session, venv, reward_id):
    """Checks that reward models predictions match those of Gym reward."""
    # Roll out a random policy, recording the environment's own reward signal.
    sampler = base.RandomPolicy(venv.observation_space, venv.action_space)
    batch = rollout.generate_transitions(sampler, venv, n_timesteps=1024)
    gym_reward = batch.rews

    # Load the reward model and predict rewards for the same transitions.
    with graph.as_default(), session.as_default():
        model = serialize.load_reward(reward_id, "dummy", venv, 1.0)
        pred_reward = rewards.evaluate_models({"m": model}, batch)["m"]

    # Predictions should reproduce the Gym reward to within a small absolute tolerance.
    np.testing.assert_allclose(gym_reward, pred_reward, rtol=0, atol=5e-5)
def test_potential_shaping_cycle(
    graph, session, venv, potential_cls, discount: float, num_episodes: int = 10
) -> None:
    """Test that potential shaping is constant on any fixed-length cycle.

    Specifically, performs rollouts of a random policy in the environment.
    Fixes the starting state for each trajectory at the all-zero state.
    Then computes episode return, and checks they're all equal.

    Requires environment be fixed length, otherwise the episode return will vary
    (except in the undiscounted case).
    """
    policy = base.RandomPolicy(venv.observation_space, venv.action_space)
    trajectories = rollout.generate_trajectories(
        policy, venv, sample_until=rollout.min_episodes(num_episodes)
    )
    transitions = rollout.flatten_trajectories(trajectories)

    # Pin every episode's initial observation to all-zero. The final state needs
    # no change: `dones` being `True` should force the potential to zero there.
    obs = np.array(transitions.obs)
    episode_starts = np.where(transitions.dones)[0] + 1
    episode_starts = np.pad(episode_starts[:-1], (1, 0), "constant")
    obs[episode_starts, :] = 0
    transitions = dataclasses.replace(transitions, obs=obs)

    with graph.as_default(), session.as_default():
        model = potential_cls(venv.observation_space, venv.action_space, discount=discount)
        session.run(tf.global_variables_initializer())
        rews = rewards.evaluate_models({"m": model}, transitions)
        rets = rewards.compute_return_from_rews(rews, transitions.dones, discount=discount)["m"]

    # Shaping telescopes over a fixed start/end, so returns must agree across episodes;
    # undiscounted shaping telescopes to exactly zero.
    if discount == 1.0:
        assert np.allclose(rets, 0.0, atol=1e-5)
    assert np.allclose(rets, np.mean(rets), atol=1e-5)
def summary_comparison(
    original: rewards.RewardModel,
    matched: rewards.RewardModel,
    target: rewards.RewardModel,
    test_set: types.Transitions,
    shaping: Optional[rewards.RewardModel] = None,
) -> Tuple[float, float, float]:
    """Compare rewards in terms of intrinsic and shaping difference.

    Args:
        original: The inferred reward model.
        matched: The reward model after trying to match target via shaping.
        target: The target reward model (e.g. ground truth, if available).
        test_set: A dataset to evaluate on.
        shaping: A reward model adding potential shaping to original.
            If unspecified, will return 0 for the shaping component.

    Returns:
        A tuple (intrinsic, shaping, extrinsic). The intrinsic difference is the
        approximation of the nearest point between the equivalence classes for
        original and target. Shaping is the magnitude of the potential shaping
        term we are adding. Extrinsic is the raw difference between original and
        target without any transformations.
    """
    to_eval = {"original": original, "matched": matched, "target": target}
    if shaping is not None:
        to_eval["shaping"] = shaping
    preds = rewards.evaluate_models(to_eval, test_set)

    intrinsic_l2 = _scaled_norm(preds["matched"] - preds["target"])
    # No shaping model supplied means the shaping component is exactly zero.
    shaping_l2 = _scaled_norm(preds["shaping"]) if "shaping" in preds else 0.0
    extrinsic_l2 = _scaled_norm(preds["original"] - preds["target"])
    return intrinsic_l2, shaping_l2, extrinsic_l2
def evaluate_reward_model(env: point_mass.PointMassEnv, model: rewards.RewardModel,
                          goal: np.ndarray, **kwargs) -> xarray.DataArray:
    """Computes the reward predicted by model on environment.

    Arguments:
        env: A point mass environment.
        model: A reward model.
        goal: The position of the goal in env.
        **kwargs: Passed through to `mesh_input`.

    Returns:
        A 3D-tensor of rewards, indexed by position, velocity and acceleration.
    """
    # The model must operate on the same spaces as the environment.
    assert model.observation_space == env.observation_space
    assert model.action_space == env.action_space

    idxs, dataset = mesh_input(env, goal=goal, **kwargs)
    flat_reward = rewards.evaluate_models({"m": model}, dataset)["m"]
    # Fold the flat predictions back into the mesh's (position, velocity, acceleration) grid.
    mesh_shape = [len(idx) for idx in idxs]
    cube = flat_reward.reshape(*mesh_shape)
    return xarray.DataArray(cube, coords=idxs, dims=["position", "velocity", "acceleration"])
def f(make_model):
    # Draw a batch of transitions from a random policy in the (closed-over) venv.
    policy = base.RandomPolicy(venv.observation_space, venv.action_space)
    with datasets.transitions_factory_from_policy(venv, policy) as dataset_callable:
        batch = dataset_callable(1024)

    with graph.as_default(), session.as_default():
        original = make_model(venv)
        session.run(tf.global_variables_initializer())

        # Round-trip the model through serialization, loading it back two ways.
        with tempfile.TemporaryDirectory(prefix="eval-rew-serialize") as tmpdir:
            original.save(tmpdir)

            with tf.variable_scope("loaded_direct"):
                loaded_direct = util_serialize.Serializable.load(tmpdir)

            model_name = "evaluating_rewards/RewardModel-v0"
            loaded_indirect = serialize.load_reward(model_name, tmpdir, venv)

        models = {"o": original, "ld": loaded_direct, "li": loaded_indirect}
        preds = rewards.evaluate_models(models, batch)

    # Loaded models must preserve the spaces of the original.
    for model in models.values():
        assert original.observation_space == model.observation_space
        assert original.action_space == model.action_space

    # All three models must produce identical predictions on the same batch.
    assert len(preds) == len(models)
    for pred in preds.values():
        assert np.allclose(preds["o"], pred)
def _compare_synthetic_eval(
    metrics: Mapping[str, List[Mapping[Tuple[float, float], Any]]],
    originals,
    matchings,
    test_set: types.Transitions,
    initial_constants: Mapping[Tuple[float, float], float],
    initial_scales: Mapping[Tuple[float, float], float],
    gt_constant: float,
    gt_scale: float,
    model_affine: bool,
    model_potential: bool,
    ground_truth: rewards.RewardModel,
    noise_reward: rewards.RewardModel,
    reward_noise: np.ndarray,
    potential_noise: np.ndarray,
):
    """Summarize synthetic-comparison results across a grid of noise levels.

    For every (reward noise, potential noise) pair, compares the matched model
    against `ground_truth` via `comparisons.summary_comparison`, and records the
    inferred affine parameters. Results are assembled into a DataFrame indexed
    by the noise pair; `metrics` is passed through unchanged.

    Returns:
        A tuple `(df, metrics)` where `df` is a `pd.DataFrame` with a
        ("Reward Noise", "Potential Noise") index.
    """
    intrinsics = {}
    shapings = {}
    extrinsics = {}
    # Scaled L2 norm of the pure-noise reward on the test set; scaled by the
    # reward-noise magnitude below, it upper-bounds the intrinsic difference.
    ub_intrinsic = rewards.evaluate_models({"n": noise_reward}, test_set)["n"]
    ub_intrinsic = np.linalg.norm(ub_intrinsic) / np.sqrt(len(ub_intrinsic))
    ub_intrinsics = {}
    final_constants = {}
    final_scales = {}
    # TODO(): this is a sequential bottleneck
    for rew_nm in reward_noise:
        for pot_nm in potential_noise:
            original = originals[(rew_nm, pot_nm)]
            matched = matchings[(rew_nm, pot_nm)]
            shaping_model = None
            if model_potential:
                # NOTE(review): digs into the fitted match's extra models to
                # extract the learned shaping network — structure presumably
                # mirrors how the matcher was built; verify against the matcher.
                shaping_model = matched.model_extra["shaping"].models["shaping"][0]
            res = comparisons.summary_comparison(
                original=original,
                matched=matched.model,
                target=ground_truth,
                shaping=shaping_model,
                test_set=test_set,
            )
            intrinsic, shaping, extrinsic = res
            intrinsics[(rew_nm, pot_nm)] = intrinsic
            shapings[(rew_nm, pot_nm)] = shaping
            extrinsics[(rew_nm, pot_nm)] = extrinsic
            ub_intrinsics[(rew_nm, pot_nm)] = rew_nm * ub_intrinsic
            if model_affine:
                final = matched.model_extra["affine"].get_weights()
            else:
                # No affine model fitted: report the identity transformation.
                final = rewards.AffineParameters(shift=0, scale=1.0)
            final_constants[(rew_nm, pot_nm)] = final.shift
            final_scales[(rew_nm, pot_nm)] = final.scale

    res = {
        "Intrinsic": intrinsics,
        "Intrinsic Upper Bound": ub_intrinsics,
        "Shaping": shapings,
        "Extrinsic": extrinsics,
        # Report scale from the perspective of the transformation needed to
        # map the generated reward model back to the target. So we need to
        # invert the gt_scale and gt_constant parameters, but can report the
        # parameters from the AffineTransform verbatim.
        "Real Scale": 1 / gt_scale,
        "Real Constant": -gt_constant / gt_scale,
        "Initial Scale": initial_scales,
        "Initial Constant": initial_constants,
        "Inferred Scale": final_scales,
        "Inferred Constant": final_constants,
    }
    df = pd.DataFrame(res)
    df.index.names = ["Reward Noise", "Potential Noise"]
    return df, metrics
def sample_canon_shaping(
    models: Mapping[K, rewards.RewardModel],
    batch: types.Transitions,
    act_dist: datasets.SampleDist,
    obs_dist: datasets.SampleDist,
    n_mean_samples: int,
    discount: float = 1.0,
    p: int = 1,
) -> Mapping[K, np.ndarray]:
    r"""
    Canonicalize `batch` for `models` using a sample-based estimate of mean reward.

    Specifically, the algorithm works by sampling `n_mean_samples` from `act_dist` and
    `obs_dist` to form a dataset of pairs $D = \{(a,s')\}$. We then consider a transition
    dynamics where, for any state $s$, the probability of transitioning to $s'$ after taking
    action $a$ is given by its measure in $D$. The policy takes actions $a$ independent of
    the state given by the measure of $(a,\cdot)$ in $D$.

    This gives value function:
        \[V(s) = \expectation_{(a,s') \sim D}\left[R(s,a,s') + \gamma V(s')\right]\].
    The resulting shaping works out to be:
        \[F(s,a,s') = \gamma \expectation_{(a',s'') \sim D}\left[R(s',a',s'')\right]
        - \expectation_{(a,s') \sim D}\left[R(s,a,s')\right]
        - \gamma \expectation_{(s, \cdot) \sim D, (a,s') \sim D}\left[R(s,a,s')\right]
        \].

    If `batch` was a mesh of $S \times A \times S$ and $D$ is a mesh on $A \times S$,
    where $S$ and $A$ are i.i.d. sampled from some observation and action distributions,
    then this is the same as discretizing the reward model by $S$ and $A$ and then using
    `tabular.fully_connected_random_canonical_reward`. The action and next-observation in
    $D$ are sampled i.i.d., but since we are not computing an entire mesh, the sampling
    process introduces a faux dependency. Additionally, `batch` may have an arbitrary
    distribution.

    Empirically, however, the two methods produce very similar results. The main advantage
    of this method is its computational efficiency, for similar reasons to why random search
    is often preferred over grid search when some unknown subset of parameters are
    relatively unimportant.

    Args:
        models: A mapping from keys to reward models.
        batch: A batch to evaluate the models with respect to.
        act_dist: The distribution to sample actions from.
        obs_dist: The distribution to sample next observations from.
        n_mean_samples: The number of samples to take.
        discount: The discount parameter to use for potential shaping.
        p: Controls power in the L^p norm used for normalization.

    Returns:
        A mapping from keys to NumPy arrays containing rewards from the model evaluated on
        batch and then canonicalized to be invariant to potential shaping and scale.
    """
    raw_rew = rewards.evaluate_models(models, batch)

    # Sample-based estimate of mean reward
    act_samples = act_dist(n_mean_samples)
    next_obs_samples = obs_dist(n_mean_samples)

    # Deduplicate observations before the (expensive) mean-reward computation,
    # then scatter results back to the original positions via `unique_inv`.
    all_obs = np.concatenate((next_obs_samples, batch.obs, batch.next_obs), axis=0)
    unique_obs, unique_inv = np.unique(all_obs, return_inverse=True, axis=0)
    mean_rews = sample_mean_rews(models, unique_obs, act_samples, next_obs_samples)
    mean_rews = {k: v[unique_inv] for k, v in mean_rews.items()}

    # First `n_mean_samples` entries correspond to `next_obs_samples` (the dataset D);
    # their mean estimates the D-marginal expectation.
    dataset_mean_rews = {k: v[0:n_mean_samples] for k, v in mean_rews.items()}
    total_mean = {k: np.mean(v) for k, v in dataset_mean_rews.items()}

    # Remaining entries are `batch.obs` then `batch.next_obs` (equal length), so
    # row 0 of the reshape is mean-from-obs and row 1 is mean-from-next-obs.
    batch_mean_rews = {k: v[n_mean_samples:].reshape(2, -1) for k, v in mean_rews.items()}

    # Use mean rewards to canonicalize reward up to shaping
    deshaped_rew = {}
    for k in models.keys():
        raw = raw_rew[k]
        mean = batch_mean_rews[k]
        total = total_mean[k]
        mean_obs = mean[0, :]
        mean_next_obs = mean[1, :]
        # Note this is the only part of the computation that depends on discount, so it'd be
        # cheap to evaluate for many values of `discount` if needed.
        deshaped = raw + discount * mean_next_obs - mean_obs - discount * total
        deshaped *= tabular.canonical_scale_normalizer(deshaped, p)
        deshaped_rew[k] = deshaped

    return deshaped_rew
def sample_mean_rews(
    models: Mapping[K, rewards.RewardModel],
    mean_from_obs: np.ndarray,
    act_samples: np.ndarray,
    next_obs_samples: np.ndarray,
    batch_size: int = 2**28,
) -> Mapping[K, np.ndarray]:
    """
    Estimates the mean reward from observations `mean_from_obs` using given samples.

    Evaluates in batches of at most `batch_size` bytes to avoid running out of memory. Note
    that the observations and actions, being vectors, often take up much more memory in RAM
    than the results, a scalar value.

    Args:
        models: A mapping from keys to reward models.
        mean_from_obs: Observations to compute the mean starting from.
        act_samples: Actions to compute the mean with respect to.
        next_obs_samples: Next observations to compute the mean with respect to.
        batch_size: The maximum number of points to compute the reward with respect to in
            a single batch.

    Returns:
        A mapping from keys to NumPy array of shape `(len(mean_from_obs),)`, containing the
        mean reward of the model over triples:
            `(obs, act, next_obs) for act, next_obs in zip(act_samples, next_obs_samples)`

    Raises:
        ValueError: If `batch_size` is too small to fit even a single observation's samples.
    """
    assert act_samples.shape[0] == next_obs_samples.shape[0]
    assert mean_from_obs.shape[1:] == next_obs_samples.shape[1:]

    # Compute batch boundaries so each batch stays under `batch_size` bytes.
    sample_mem_usage = act_samples.nbytes + mean_from_obs.nbytes
    obs_per_batch = batch_size // sample_mem_usage
    if obs_per_batch <= 0:
        msg = f"`batch_size` too small to compute a batch: {batch_size} < {sample_mem_usage}."
        raise ValueError(msg)
    idxs = np.arange(0, len(mean_from_obs), obs_per_batch)
    idxs = np.concatenate((idxs, [len(mean_from_obs)]))  # include end point

    # Compute mean rewards, accumulating per-batch means for each model key.
    mean_rews = {k: [] for k in models.keys()}
    reps = min(obs_per_batch, len(mean_from_obs))
    act_tiled = _tile_first_dim(act_samples, reps)
    next_obs_tiled = _tile_first_dim(next_obs_samples, reps)
    for start, end in zip(idxs[:-1], idxs[1:]):
        obs = mean_from_obs[start:end]
        # Pair every observation in the batch with every (action, next_obs) sample.
        obs_repeated = np.repeat(obs, len(act_samples), axis=0)
        batch = types.Transitions(
            obs=obs_repeated,
            acts=act_tiled[: len(obs_repeated), :],
            next_obs=next_obs_tiled[: len(obs_repeated), :],
            # `np.bool` was deprecated in NumPy 1.20 and removed in 1.24; the
            # builtin `bool` is the supported spelling.
            dones=np.zeros(len(obs_repeated), dtype=bool),
            infos=None,
        )
        rews = rewards.evaluate_models(models, batch)
        rews = {k: v.reshape(len(obs), -1) for k, v in rews.items()}
        for k, m in mean_rews.items():
            means = np.mean(rews[k], axis=1)
            m.extend(means)

    mean_rews = {k: np.array(v) for k, v in mean_rews.items()}
    for v in mean_rews.values():
        assert v.shape == (len(mean_from_obs),)
    return mean_rews