Example 1
def summary_stats(
    observation_space: gym.Space,
    action_space: gym.Space,
    dataset: types.Transitions,
    reward_hids: Optional[Iterable[int]] = None,
    potential_hids: Optional[Iterable[int]] = None,
):
    """Compute summary statistics of a random reward and potential model."""
    # Construct randomly initialized reward and potential
    rew_model = rewards.MLPRewardModel(observation_space, action_space,
                                       reward_hids)
    pot_model = rewards.MLPPotentialShaping(observation_space, action_space,
                                            potential_hids)
    tf.get_default_session().run(tf.global_variables_initializer())

    # Compute their predictions on dataset
    models = {"reward": rew_model, "shaping": pot_model}
    preds = rewards.evaluate_models(models, dataset)
    potentials = rewards.evaluate_potentials([pot_model], dataset)
    old_potential = potentials[0][0]
    new_potential = potentials[1][0]

    # Compute summary statistics
    res = dict(**preds,
               old_potential=old_potential,
               new_potential=new_potential)
    return {k: sp.stats.describe(v) for k, v in res.items()}
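
A hedged usage sketch, reusing the session and dataset pattern from Example 9 below; the
`graph`, `session` and `venv` fixtures and the hidden-layer sizes are assumptions, not part of
the original code:

policy = base.RandomPolicy(venv.observation_space, venv.action_space)
with datasets.transitions_factory_from_policy(venv, policy) as dataset_callable:
    batch = dataset_callable(1024)
with graph.as_default(), session.as_default():
    stats = summary_stats(
        venv.observation_space,
        venv.action_space,
        dataset=batch,
        reward_hids=[32, 32],
        potential_hids=[32, 32],
    )
# `stats` maps "reward", "shaping", "old_potential" and "new_potential" to
# scipy.stats DescribeResult summaries.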
Example 2
def mesh_evaluate_models_slow(
    models: Mapping[epic_sample.K, rewards.RewardModel],
    obs: np.ndarray,
    actions: np.ndarray,
    next_obs: np.ndarray,
) -> Mapping[epic_sample.K, np.ndarray]:
    """
    Evaluate models on the Cartesian product of `obs`, `actions`, `next_obs`.

    Same interface as `canonical_sample.mesh_evaluate_models`. This implementation is much simpler
    but also much slower (around 20x); we use it in tests to verify the two produce the same
    results. It may also be a useful starting point for other optimisations in the future
    (e.g. JIT compilation with Numba).
    """
    transitions = list(itertools.product(obs, actions, next_obs))
    tiled_obs, tiled_acts, tiled_next_obs = (
        np.array([m[i] for m in transitions]) for i in range(3)  # pylint:disable=not-an-iterable
    )
    dones = np.zeros(len(tiled_obs), dtype=bool)
    transitions = types.Transitions(
        obs=tiled_obs,
        acts=tiled_acts,
        next_obs=tiled_next_obs,
        dones=dones,
        infos=None,
    )
    rews = rewards.evaluate_models(models, transitions)
    rews = {
        k: v.reshape(len(obs), len(actions), len(next_obs))
        for k, v in rews.items()
    }
    return rews
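
The docstring notes this slow implementation exists to cross-check the faster one. A minimal
sketch of such a test, assuming `canonical_sample.mesh_evaluate_models` shares the interface
above and a TF session is active:

fast = canonical_sample.mesh_evaluate_models(models, obs, actions, next_obs)
slow = mesh_evaluate_models_slow(models, obs, actions, next_obs)
assert fast.keys() == slow.keys()
for k in models:
    # Both should return arrays of shape (len(obs), len(actions), len(next_obs)).
    assert np.allclose(fast[k], slow[k])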
Example 3
def constant_baseline(
    match: comparisons.RegressModel,
    target: rewards.RewardModel,
    dataset: datasets.TransitionsCallable,
    test_size: int = 4096,
) -> Dict[str, Any]:
    """Computes the error in predictions of the model matched and some baselines.

    Arguments:
        match: The (fitted) match object.
        target: The reward model we are trying to predict.
        dataset: The dataset to evaluate on.
        test_size: The number of samples to evaluate on.

    Returns:
        A dictionary containing summary statistics.
    """
    test_set = dataset(test_size)
    models = {"matched": match.model, "target": target}
    preds = rewards.evaluate_models(models, test_set)

    actual_delta = preds["matched"] - preds["target"]
    return {
        "int_l1": norm_diff(actual_delta, preds["target"], norm=1),
        "int_l2": norm_diff(actual_delta, preds["target"], norm=2),
        "baseline_l1": norm_diff(np.median(preds["target"]), preds["target"], norm=1),
        "baseline_l2": norm_diff(np.mean(preds["target"]), preds["target"], norm=2),
    }
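
The two baselines replace the matched model with the best constant predictor of the target:
the median minimizes expected L1 error and the mean minimizes expected L2 error. A hedged
sketch of how the returned statistics might be consumed (the ratios are illustrative, not from
the repository):

stats = constant_baseline(match, target, dataset)
# Values below 1.0 mean the matched model beats the corresponding constant baseline.
rel_l1 = stats["int_l1"] / stats["baseline_l1"]
rel_l2 = stats["int_l2"] / stats["baseline_l2"]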
Example 4
def test_potential_shaping_invariants(graph,
                                      session,
                                      venv,
                                      potential_cls,
                                      discount: float,
                                      num_timesteps: int = 100):
    """Test that potential shaping obeys several invariants.

    Specifically:
        1. new_potential must be constant when dones is true, and zero when `discount == 1.0`.
        2. new_potential depends only on next observation.
        3. old_potential depends only on current observation.
        4. Shaping is discount * new_potential - old_potential.
    """
    # Invariants:
    # When done, new_potential should be constant (and zero when discount == 1.0).
    # discount * new_potential - old_potential should equal the reward output.
    # Same old_obs should have same old_potential; same new_obs should have same new_potential.
    policy = base.RandomPolicy(venv.observation_space, venv.action_space)
    transitions = rollout.generate_transitions(policy,
                                               venv,
                                               n_timesteps=num_timesteps)

    with graph.as_default(), session.as_default():
        potential = potential_cls(venv.observation_space,
                                  venv.action_space,
                                  discount=discount)
        session.run(tf.global_variables_initializer())
        (old_pot, ), (new_pot, ) = rewards.evaluate_potentials([potential],
                                                               transitions)

    # Check invariant 1: new_potential must be constant when dones is true
    # (and zero when discount == 1.0)
    transitions_all_done = dataclasses.replace(
        transitions, dones=np.ones_like(transitions.dones, dtype=bool))
    with session.as_default():
        _, new_pot_done = rewards.evaluate_potentials([potential],
                                                      transitions_all_done)
    expected_new_pot_done = 0.0 if discount == 1.0 else np.mean(new_pot_done)
    assert np.allclose(new_pot_done, expected_new_pot_done)

    # Check invariants 2 and 3: {new,old}_potential depend only on {next,current} observation
    def _shuffle(fld: str):
        arr = np.array(getattr(transitions, fld))
        np.random.shuffle(arr)
        trans = dataclasses.replace(transitions, **{fld: arr})
        with session.as_default():
            return rewards.evaluate_potentials([potential], trans)

    (old_pot_shuffled, ), _ = _shuffle("next_obs")
    _, (new_pot_shuffled, ) = _shuffle("obs")
    assert np.all(old_pot == old_pot_shuffled)
    assert np.all(new_pot == new_pot_shuffled)

    # Check invariant 4: that reward output is as expected given potentials
    with session.as_default():
        rew = rewards.evaluate_models({"m": potential}, transitions)["m"]
    assert np.allclose(rew, discount * new_pot - old_pot)
Example 5
def test_ground_truth_similar_to_gym(graph, session, venv, reward_id):
    """Checks that reward models predictions match those of Gym reward."""
    # Generate rollouts, recording Gym reward
    policy = base.RandomPolicy(venv.observation_space, venv.action_space)
    transitions = rollout.generate_transitions(policy, venv, n_timesteps=1024)
    gym_reward = transitions.rews

    # Make predictions using reward model
    with graph.as_default(), session.as_default():
        reward_model = serialize.load_reward(reward_id, "dummy", venv, 1.0)
        pred_reward = rewards.evaluate_models({"m": reward_model},
                                              transitions)["m"]

    # Are the predictions close to true Gym reward?
    np.testing.assert_allclose(gym_reward, pred_reward, rtol=0, atol=5e-5)
Example 6
def test_potential_shaping_cycle(graph,
                                 session,
                                 venv,
                                 potential_cls,
                                 discount: float,
                                 num_episodes: int = 10) -> None:
    """Test that potential shaping is constant on any fixed-length cycle.

    Specifically, performs rollouts of a random policy in the environment,
    fixing the starting state of each trajectory at the all-zero state.
    Then computes the episode returns and checks they are all equal.

    Requires the environment to be fixed length; otherwise the episode returns
    will vary (except in the undiscounted case).
    """
    policy = base.RandomPolicy(venv.observation_space, venv.action_space)
    trajectories = rollout.generate_trajectories(
        policy, venv, sample_until=rollout.min_episodes(num_episodes))
    transitions = rollout.flatten_trajectories(trajectories)

    # Make initial state fixed as all-zero.
    # Note don't need to change final state, since `dones` being `True` should
    # force potential to be zero at those states.
    obs = np.array(transitions.obs)
    idxs = np.where(transitions.dones)[0] + 1
    idxs = np.pad(idxs[:-1], (1, 0), "constant")
    obs[idxs, :] = 0
    transitions = dataclasses.replace(transitions, obs=obs)

    with graph.as_default(), session.as_default():
        reward_model = potential_cls(venv.observation_space,
                                     venv.action_space,
                                     discount=discount)
        session.run(tf.global_variables_initializer())
        rews = rewards.evaluate_models({"m": reward_model}, transitions)

    rets = rewards.compute_return_from_rews(rews,
                                            transitions.dones,
                                            discount=discount)["m"]
    if discount == 1.0:
        assert np.allclose(rets, 0.0, atol=1e-5)
    assert np.allclose(rets, np.mean(rets), atol=1e-5)
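
The invariant under test follows from the standard telescoping argument for potential-based
shaping (Ng et al., 1999). For an episode $s_0, \dots, s_T$ the shaping-only return is

\[
\sum_{t=0}^{T-1} \gamma^t \left(\gamma \Phi(s_{t+1}) - \Phi(s_t)\right)
  = \gamma^T \Phi(s_T) - \Phi(s_0),
\]

so once every episode starts from the same (all-zero) state, has the same length, and has its
terminal potential forced to a constant via `dones` (invariant 1 of Example 4), all episode
returns coincide. The stricter check that undiscounted returns are approximately zero
additionally assumes the potential of the all-zero observation is (close to) zero for the
`potential_cls` under test; that is an assumption about its initialization, not something shown
in this excerpt.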
Example 7
def summary_comparison(
    original: rewards.RewardModel,
    matched: rewards.RewardModel,
    target: rewards.RewardModel,
    test_set: types.Transitions,
    shaping: Optional[rewards.RewardModel] = None,
) -> Tuple[float, float, float]:
    """Compare rewards in terms of intrinsic and shaping difference.

    Args:
        original: The inferred reward model.
        matched: The reward model after trying to match target via shaping.
        target: The target reward model (e.g. ground truth, if available).
        test_set: A dataset to evaluate on.
        shaping: A reward model adding potential shaping to original.
                If unspecified, will return 0 for the shaping component.

    Returns:
        A tuple (intrinsic, shaping, extrinsic). The intrinsic difference is the
        approximation of the nearest point between the equivalence classes for
        original and target. Shaping is the magnitude of the potential shaping
        term we are adding. Extrinsic is the raw difference between original and
        target without any transformations.
    """
    models = {"original": original, "matched": matched, "target": target}

    if shaping is not None:
        models["shaping"] = shaping

    preds = rewards.evaluate_models(models, test_set)
    intrinsic_l2 = _scaled_norm(preds["matched"] - preds["target"])
    if "shaping" in preds:
        shaping_l2 = _scaled_norm(preds["shaping"])
    else:
        shaping_l2 = 0.0
    extrinsic_l2 = _scaled_norm(preds["original"] - preds["target"])

    return intrinsic_l2, shaping_l2, extrinsic_l2
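
`_scaled_norm` is not shown in this excerpt. Based on the matching computation in Example 10
(`np.linalg.norm(ub_intrinsic) / np.sqrt(len(ub_intrinsic))`), a plausible definition is the
RMS-scaled L2 norm; this is an inference from the surrounding code, not a verified copy of the
repository's helper:

def _scaled_norm(x: np.ndarray) -> float:
    """L2 norm of `x`, rescaled by sqrt(len(x)) so values are comparable across batch sizes."""
    return np.linalg.norm(x) / np.sqrt(len(x))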
Example 8
def evaluate_reward_model(env: point_mass.PointMassEnv,
                          model: rewards.RewardModel, goal: np.ndarray,
                          **kwargs) -> xarray.DataArray:
    """Computes the reward predicted by model on environment.

    Arguments:
        env: A point mass environment.
        model: A reward model.
        goal: The position of the goal in env.
        **kwargs: Passed through to `mesh_input`.

    Returns:
        A 3D-tensor of rewards, indexed by position, velocity and acceleration.
    """
    assert model.observation_space == env.observation_space
    assert model.action_space == env.action_space
    idxs, dataset = mesh_input(env, goal=goal, **kwargs)
    reward = rewards.evaluate_models({"m": model}, dataset)["m"]
    reward = reward.reshape(*[len(idx) for idx in idxs])
    reward = xarray.DataArray(reward,
                              coords=idxs,
                              dims=["position", "velocity", "acceleration"])
    return reward
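
A hedged usage sketch; the pre-existing `env`, `model` and `goal` objects and the reliance on
`mesh_input`'s default keyword arguments are assumptions:

reward_da = evaluate_reward_model(env, model, goal=goal)
# Fix velocity to inspect reward as a function of position and acceleration.
slice_at_rest = reward_da.sel(velocity=0.0, method="nearest")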
Example 9
    def f(make_model):
        policy = base.RandomPolicy(venv.observation_space, venv.action_space)
        with datasets.transitions_factory_from_policy(
                venv, policy) as dataset_callable:
            batch = dataset_callable(1024)

            with graph.as_default(), session.as_default():
                original = make_model(venv)
                session.run(tf.global_variables_initializer())

                with tempfile.TemporaryDirectory(
                        prefix="eval-rew-serialize") as tmpdir:
                    original.save(tmpdir)

                    with tf.variable_scope("loaded_direct"):
                        loaded_direct = util_serialize.Serializable.load(
                            tmpdir)

                    model_name = "evaluating_rewards/RewardModel-v0"
                    loaded_indirect = serialize.load_reward(
                        model_name, tmpdir, venv)

                models = {
                    "o": original,
                    "ld": loaded_direct,
                    "li": loaded_indirect
                }
                preds = rewards.evaluate_models(models, batch)

            for model in models.values():
                assert original.observation_space == model.observation_space
                assert original.action_space == model.action_space

            assert len(preds) == len(models)
            for pred in preds.values():
                assert np.allclose(preds["o"], pred)
Example 10
def _compare_synthetic_eval(
    metrics: Mapping[str, List[Mapping[Tuple[float, float], Any]]],
    originals,
    matchings,
    test_set: types.Transitions,
    initial_constants: Mapping[Tuple[float, float], float],
    initial_scales: Mapping[Tuple[float, float], float],
    gt_constant: float,
    gt_scale: float,
    model_affine: bool,
    model_potential: bool,
    ground_truth: rewards.RewardModel,
    noise_reward: rewards.RewardModel,
    reward_noise: np.ndarray,
    potential_noise: np.ndarray,
):
    intrinsics = {}
    shapings = {}
    extrinsics = {}
    ub_intrinsic = rewards.evaluate_models({"n": noise_reward}, test_set)["n"]
    ub_intrinsic = np.linalg.norm(ub_intrinsic) / np.sqrt(len(ub_intrinsic))
    ub_intrinsics = {}
    final_constants = {}
    final_scales = {}
    # TODO(): this is a sequential bottleneck
    for rew_nm in reward_noise:
        for pot_nm in potential_noise:
            original = originals[(rew_nm, pot_nm)]
            matched = matchings[(rew_nm, pot_nm)]
            shaping_model = None
            if model_potential:
                shaping_model = matched.model_extra["shaping"].models[
                    "shaping"][0]

            res = comparisons.summary_comparison(
                original=original,
                matched=matched.model,
                target=ground_truth,
                shaping=shaping_model,
                test_set=test_set,
            )
            intrinsic, shaping, extrinsic = res
            intrinsics[(rew_nm, pot_nm)] = intrinsic
            shapings[(rew_nm, pot_nm)] = shaping
            extrinsics[(rew_nm, pot_nm)] = extrinsic
            ub_intrinsics[(rew_nm, pot_nm)] = rew_nm * ub_intrinsic

            if model_affine:
                final = matched.model_extra["affine"].get_weights()
            else:
                final = rewards.AffineParameters(shift=0, scale=1.0)
            final_constants[(rew_nm, pot_nm)] = final.shift
            final_scales[(rew_nm, pot_nm)] = final.scale

    res = {
        "Intrinsic": intrinsics,
        "Intrinsic Upper Bound": ub_intrinsics,
        "Shaping": shapings,
        "Extrinsic": extrinsics,
        # Report scale from the perspective of the transformation needed to
        # map the generated reward model back to the target. So we need to
        # invert the gt_scale and gt_constant parameters, but can report the
        # parameters from the AffineTransform verbatim.
        "Real Scale": 1 / gt_scale,
        "Real Constant": -gt_constant / gt_scale,
        "Initial Scale": initial_scales,
        "Initial Constant": initial_constants,
        "Inferred Scale": final_scales,
        "Inferred Constant": final_constants,
    }
    df = pd.DataFrame(res)
    df.index.names = ["Reward Noise", "Potential Noise"]
    return df, metrics
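
The "Real Scale" and "Real Constant" entries above invert the ground-truth affine parameters,
as the comment in the dictionary notes. Explicitly, if the generated reward was produced as

\[
R_{\text{original}} = \texttt{gt\_scale} \cdot R_{\text{target}} + \texttt{gt\_constant},
\]

then the transformation mapping it back to the target is

\[
R_{\text{target}} = \frac{1}{\texttt{gt\_scale}} \cdot R_{\text{original}}
                  - \frac{\texttt{gt\_constant}}{\texttt{gt\_scale}},
\]

i.e. scale $1/\texttt{gt\_scale}$ and constant $-\texttt{gt\_constant}/\texttt{gt\_scale}$.
(The direction of the ground-truth transformation is inferred from the comment, not shown in
this excerpt.)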
Example 11
def sample_canon_shaping(
    models: Mapping[K, rewards.RewardModel],
    batch: types.Transitions,
    act_dist: datasets.SampleDist,
    obs_dist: datasets.SampleDist,
    n_mean_samples: int,
    discount: float = 1.0,
    p: int = 1,
) -> Mapping[K, np.ndarray]:
    r"""
    Canonicalize `batch` for `models` using a sample-based estimate of mean reward.

    Specifically, the algorithm works by drawing `n_mean_samples` actions and next observations
    from `act_dist` and `obs_dist` to form a dataset of pairs $D = \{(a,s')\}$. We then consider
    a transition dynamics where,
    for any state $s$, the probability of transitioning to $s'$ after taking action $a$ is given by
    its measure in $D$. The policy takes actions $a$ independent of the state given by the measure
    of $(a,\cdot)$ in $D$.

    This gives value function:
        \[V(s) = \expectation_{(a,s') \sim D}\left[R(s,a,s') + \gamma V(s')\right]\].
    The resulting shaping works out to be:
        \[F(s,a,s') = \gamma \expectation_{(a',s'') \sim D}\left[R(s',a',s'')\right]
                    - \expectation_{(a,s') \sim D}\left[R(s,a,s')\right]
                    - \gamma \expectation_{(s, \cdot) \sim D, (a,s') \sim D}\left[R(s,a,s')\right]
        \].

    If `batch` was a mesh of $S \times A \times S$ and $D$ is a mesh on $A \times S$,
    where $S$ and $A$ are i.i.d. sampled from some observation and action distributions, then this
    is the same as discretizing the reward model by $S$ and $A$ and then using
    `tabular.fully_connected_random_canonical_reward`. The action and next-observation in $D$ are
    sampled i.i.d., but since we are not computing an entire mesh, the sampling process introduces a
    faux dependency. Additionally, `batch` may have an arbitrary distribution.

    Empirically, however, the two methods produce very similar results. The main advantage of this
    method is its computational efficiency, for similar reasons to why random search is often
    preferred over grid search when some unknown subset of parameters are relatively unimportant.

    Args:
        models: A mapping from keys to reward models.
        batch: A batch to evaluate the models with respect to.
        act_dist: The distribution to sample actions from.
        obs_dist: The distribution to sample next observations from.
        n_mean_samples: The number of samples to take.
        discount: The discount parameter to use for potential shaping.
        p: Controls power in the L^p norm used for normalization.

    Returns:
        A mapping from keys to NumPy arrays containing rewards from the model evaluated on batch
        and then canonicalized to be invariant to potential shaping and scale.
    """
    raw_rew = rewards.evaluate_models(models, batch)

    # Sample-based estimate of mean reward
    act_samples = act_dist(n_mean_samples)
    next_obs_samples = obs_dist(n_mean_samples)

    all_obs = np.concatenate((next_obs_samples, batch.obs, batch.next_obs),
                             axis=0)
    unique_obs, unique_inv = np.unique(all_obs, return_inverse=True, axis=0)
    mean_rews = sample_mean_rews(models, unique_obs, act_samples,
                                 next_obs_samples)
    mean_rews = {k: v[unique_inv] for k, v in mean_rews.items()}

    dataset_mean_rews = {k: v[0:n_mean_samples] for k, v in mean_rews.items()}
    total_mean = {k: np.mean(v) for k, v in dataset_mean_rews.items()}

    batch_mean_rews = {
        k: v[n_mean_samples:].reshape(2, -1)
        for k, v in mean_rews.items()
    }

    # Use mean rewards to canonicalize reward up to shaping
    deshaped_rew = {}
    for k in models.keys():
        raw = raw_rew[k]
        mean = batch_mean_rews[k]
        total = total_mean[k]
        mean_obs = mean[0, :]
        mean_next_obs = mean[1, :]
        # Note this is the only part of the computation that depends on discount, so it'd be
        # cheap to evaluate for many values of `discount` if needed.
        deshaped = raw + discount * mean_next_obs - mean_obs - discount * total
        deshaped *= tabular.canonical_scale_normalizer(deshaped, p)
        deshaped_rew[k] = deshaped

    return deshaped_rew
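
The loop body above implements the deshaping formula from the docstring. In the code's variable
names, for each model $R$:

\[
\texttt{deshaped}(s,a,s') = R(s,a,s')
  + \gamma \, \expectation_{(a',s'') \sim D}\left[R(s',a',s'')\right]
  - \expectation_{(a',s'') \sim D}\left[R(s,a',s'')\right]
  - \gamma \, \expectation_{\bar{s} \sim D,\, (a',s'') \sim D}\left[R(\bar{s},a',s'')\right],
\]

where the three expectation terms correspond to `mean_next_obs`, `mean_obs` and `total`
respectively, and the result is then rescaled by `tabular.canonical_scale_normalizer(deshaped, p)`
to remove scale as well as shaping.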
Example 12
def sample_mean_rews(
    models: Mapping[K, rewards.RewardModel],
    mean_from_obs: np.ndarray,
    act_samples: np.ndarray,
    next_obs_samples: np.ndarray,
    batch_size: int = 2**28,
) -> Mapping[K, np.ndarray]:
    """
    Estimates the mean reward from observations `mean_from_obs` using given samples.

    Evaluates in batches of at most `batch_size` bytes to avoid running out of memory. Note that
    the observations and actions, being vectors, often take up much more memory in RAM than the
    results, which are scalars.

    Args:
        models: A mapping from keys to reward models.
        mean_from_obs: Observations to compute the mean starting from.
        act_samples: Actions to compute the mean with respect to.
        next_obs_samples: Next observations to compute the mean with respect to.
        batch_size: The maximum memory, in bytes, that the input arrays of a single batch
            should consume (used as a heuristic to choose how many observations per batch).

    Returns:
        A mapping from keys to NumPy array of shape `(len(mean_from_obs),)`, containing the
        mean reward of the model over triples:
            `(obs, act, next_obs) for act, next_obs in zip(act_samples, next_obs_samples)`
    """
    assert act_samples.shape[0] == next_obs_samples.shape[0]
    assert mean_from_obs.shape[1:] == next_obs_samples.shape[1:]

    # Compute indexes to not exceed batch size
    sample_mem_usage = act_samples.nbytes + mean_from_obs.nbytes
    obs_per_batch = batch_size // sample_mem_usage
    if obs_per_batch <= 0:
        msg = f"`batch_size` too small to compute a batch: {batch_size} < {sample_mem_usage}."
        raise ValueError(msg)
    idxs = np.arange(0, len(mean_from_obs), obs_per_batch)
    idxs = np.concatenate((idxs, [len(mean_from_obs)]))  # include end point

    # Compute mean rewards
    mean_rews = {k: [] for k in models.keys()}
    reps = min(obs_per_batch, len(mean_from_obs))
    act_tiled = _tile_first_dim(act_samples, reps)
    next_obs_tiled = _tile_first_dim(next_obs_samples, reps)
    for start, end in zip(idxs[:-1], idxs[1:]):
        obs = mean_from_obs[start:end]
        obs_repeated = np.repeat(obs, len(act_samples), axis=0)
        batch = types.Transitions(
            obs=obs_repeated,
            acts=act_tiled[:len(obs_repeated), :],
            next_obs=next_obs_tiled[:len(obs_repeated), :],
            dones=np.zeros(len(obs_repeated), dtype=bool),
            infos=None,
        )
        rews = rewards.evaluate_models(models, batch)
        rews = {k: v.reshape(len(obs), -1) for k, v in rews.items()}
        for k, m in mean_rews.items():
            means = np.mean(rews[k], axis=1)
            m.extend(means)

    mean_rews = {k: np.array(v) for k, v in mean_rews.items()}
    for v in mean_rews.values():
        assert v.shape == (len(mean_from_obs), )
    return mean_rews
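
A small numeric illustration of the batching arithmetic above (shapes are arbitrary and purely
for illustration):

act_samples = np.zeros((1024, 2), dtype=np.float32)      # 8_192 bytes
mean_from_obs = np.zeros((10_000, 4), dtype=np.float32)  # 160_000 bytes
sample_mem_usage = act_samples.nbytes + mean_from_obs.nbytes  # 168_192 bytes
obs_per_batch = 2**28 // sample_mem_usage  # 1596 observations per batch
# Each batch then evaluates obs_per_batch * len(act_samples) = 1_634_304 transitions.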