Example #1
def plot_pm_reward(
    styles: Iterable[str],
    env_name: str,
    discount: float,
    models: Sequence[Tuple[str, str, str]],
    data_root: str,
    # Mesh parameters
    pos_lim: float,
    pos_density: int,
    vel_lim: float,
    act_lim: float,
    density: int,
    # Figure parameters
    ncols: int,
    cbar_kwargs: Mapping[str, Any],
    log_dir: str,
    fmt: str,
) -> xr.DataArray:
    """Entry-point into script to visualize a reward model for point mass."""
    with stylesheets.setup_styles(styles):
        env = gym.make(env_name)
        venv = vec_env.DummyVecEnv([lambda: env])
        goal = np.array([0.0])

        rewards = {}
        with networks.make_session():
            for model_name, reward_type, reward_path in models:
                reward_path = os.path.join(data_root, reward_path)
                model = serialize.load_reward(reward_type, reward_path, venv,
                                              discount)
                reward = point_mass_analysis.evaluate_reward_model(
                    env,
                    model,
                    goal=goal,
                    pos_lim=pos_lim,
                    pos_density=pos_density,
                    vel_lim=vel_lim,
                    act_lim=act_lim,
                    density=density,
                )
                rewards[model_name] = reward

        if len(rewards) == 1:
            reward = next(iter(rewards.values()))
            kwargs = {"col_wrap": ncols}
        else:
            reward = xr.Dataset(rewards).to_array("model")
            kwargs = {"row": "Model"}

        fig = point_mass_analysis.plot_reward(reward,
                                              cbar_kwargs=cbar_kwargs,
                                              **kwargs)
        save_path = os.path.join(log_dir, "reward")
        visualize.save_fig(save_path, fig, fmt=fmt)

        return reward
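# A hedged invocation sketch for the entry point above. The style name, model
# tuple, mesh resolution and output paths are illustrative assumptions rather
# than the script's real defaults; the reward type and "dummy" path follow the
# hardcoded-reward convention used in the other examples.
reward = plot_pm_reward(
    styles=["paper"],
    env_name="evaluating_rewards/PointMassLine-v0",
    discount=0.99,
    models=[("sparse", "evaluating_rewards/PointMassSparseWithCtrl-v0", "dummy")],
    data_root="/tmp/reward_data",
    pos_lim=1.0,
    pos_density=9,
    vel_lim=1.0,
    act_lim=1.0,
    density=21,
    ncols=3,
    cbar_kwargs={},
    log_dir="/tmp/plots",
    fmt="pdf",
)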
Example #2
def regress(
    seed: int,
    # Dataset
    env_name: str,
    discount: float,
    # Target specification
    target_reward_type: str,
    target_reward_path: str,
    # Model parameters
    make_source: MakeModelFn,
    source_init: bool,
    make_trainer: MakeTrainerFn,
    do_training: DoTrainingFn,
    # Logging
    log_dir: str,
    checkpoint_interval: int,
) -> V:
    """Train a model on target and save the results, reporting training stats."""
    # This venv is needed by serialize.load_reward, but is never stepped.
    venv = vec_env.DummyVecEnv([lambda: gym.make(env_name)])

    with networks.make_session() as (_, sess):
        tf.random.set_random_seed(seed)

        with tf.variable_scope("source") as model_scope:
            model = make_source(venv)

        with tf.variable_scope("target"):
            target = serialize.load_reward(target_reward_type,
                                           target_reward_path, venv, discount)

        with tf.variable_scope("train") as train_scope:
            trainer = make_trainer(model, model_scope, target)

        # Do not initialize any variables from target, which have already been
        # set during serialization.
        init_vars = train_scope.global_variables()
        if source_init:
            init_vars += model_scope.global_variables()
        sess.run(tf.initializers.variables(init_vars))

        def callback(epoch: int) -> None:
            if checkpoint_interval > 0 and epoch % checkpoint_interval == 0:
                trainer.model.save(
                    os.path.join(log_dir, "checkpoints", f"{epoch:05d}"))

        stats = do_training(target, trainer, callback)

        # Trainer may wrap source, so save `trainer.model` not source directly
        # (see e.g. RegressWrappedModel).
        trainer.model.save(os.path.join(log_dir, "checkpoints", "final"))

        with open(os.path.join(log_dir, "stats.pkl"), "wb") as f:
            pickle.dump(stats, f)

    return stats
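# Hedged sketch of the factory callables `regress` expects, reusing APIs that
# appear elsewhere in these examples (base.MLPRewardModel, comparisons.RegressModel,
# datasets.transitions_factory_from_random_model). Values and paths are illustrative.
def make_source(venv):
    return base.MLPRewardModel(venv.observation_space, venv.action_space)

def make_trainer(model, model_scope, target):
    del model_scope  # unused in this sketch
    return comparisons.RegressModel(model, target)

def do_training(target, trainer, callback):
    del target, callback  # checkpoint callbacks are ignored in this sketch
    with datasets.transitions_factory_from_random_model(
            "evaluating_rewards/PointMassLine-v0") as dataset_generator:
        return trainer.fit(dataset_generator, total_timesteps=1e5, batch_size=512)

stats = regress(
    seed=0,
    env_name="evaluating_rewards/PointMassLine-v0",
    discount=0.99,
    target_reward_type="evaluating_rewards/PointMassGroundTruth-v0",
    target_reward_path="dummy",
    make_source=make_source,
    source_init=True,
    make_trainer=make_trainer,
    do_training=do_training,
    log_dir="/tmp/regress_logs",
    checkpoint_interval=0,
)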
Example #3
def _sample_fixed_length_trajectories(
    episode_lengths: Sequence[int], min_episodes: int, **kwargs,
) -> Sequence[types.Trajectory]:
    venv = vec_env.DummyVecEnv(
        [functools.partial(TerminalSentinelEnv, length) for length in episode_lengths]
    )
    policy = RandomPolicy(venv.observation_space, venv.action_space)
    sample_until = rollout.min_episodes(min_episodes)
    trajectories = rollout.generate_trajectories(
        policy, venv, sample_until=sample_until, **kwargs,
    )
    return trajectories
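# Illustrative call: three sentinel environments of different lengths, sampling
# until at least four episodes complete. TerminalSentinelEnv and RandomPolicy
# are assumed to be test helpers defined in the same module.
trajs = _sample_fixed_length_trajectories([3, 5, 7], min_episodes=4)
assert len(trajs) >= 4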
Example #4
def get_affine_from_models(env_name: str, paths: Iterable[str]):
    """Extract affine parameters from reward model."""
    venv = vec_env.DummyVecEnv([lambda: gym.make(env_name)])
    res = {}
    with networks.make_session():
        for path in paths:
            model = serialize.load_reward(
                "evaluating_rewards/RewardModel-v0",
                os.path.join(path, "model"),
                venv,
            )
            res[path] = model.models["wrapped"][0].get_weights()
    return res
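# Illustrative use, assuming each path points at a directory containing a saved
# reward model under "model/"; the directories here are hypothetical.
affine_weights = get_affine_from_models(
    "evaluating_rewards/PointMassLine-v0",
    ["/tmp/models/run0", "/tmp/models/run1"],
)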
Example #5
def test_regress(
    graph: tf.Graph,
    session: tf.Session,
    target: str,
    loss_ub: float,
    rel_loss_lb: float,
    discount: float = 0.99,
):
    """Test regression onto target.

    Args:
        target: The target reward model type. Must be a hardcoded reward:
            we always load with a path "dummy".
        loss_ub: The maximum loss of the model at the end of training.
        rel_loss_lb: The minimum relative improvement to the initial loss.
    """
    env_name = "evaluating_rewards/PointMassLine-v0"
    venv = vec_env.DummyVecEnv([lambda: gym.make(env_name)])

    with datasets.transitions_factory_from_random_model(
            env_name) as dataset_generator:
        with graph.as_default():
            with session.as_default():
                with tf.variable_scope("source") as source_scope:
                    source = base.MLPRewardModel(venv.observation_space,
                                                 venv.action_space)

                with tf.variable_scope("target"):
                    target_model = serialize.load_reward(
                        target, "dummy", venv, discount)

                with tf.variable_scope("match") as match_scope:
                    match = comparisons.RegressModel(source, target_model)

                init_vars = source_scope.global_variables(
                ) + match_scope.global_variables()
                session.run(tf.initializers.variables(init_vars))

                stats = match.fit(dataset_generator,
                                  total_timesteps=1e5,
                                  batch_size=512)

        loss = pd.DataFrame(stats["loss"])["singleton"]
        logging.info(f"Loss: {loss.iloc[::10]}")
        initial_loss = loss.iloc[0]
        logging.info(f"Initial loss: {initial_loss}")
        final_loss = loss.iloc[-10:].mean()
        logging.info(f"Final loss: {final_loss}")

        assert initial_loss / final_loss > rel_loss_lb
        assert final_loss < loss_ub
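# Hedged sketch of how this test might be parametrized; `graph` and `session`
# are assumed to be pytest fixtures, and the target and loss bounds below are
# illustrative rather than the project's actual thresholds.
import pytest

@pytest.mark.parametrize(
    "target,loss_ub,rel_loss_lb",
    [("evaluating_rewards/PointMassGroundTruth-v0", 0.02, 10.0)],
)
def test_regress_ground_truth(graph, session, target, loss_ub, rel_loss_lb):
    test_regress(graph, session, target, loss_ub, rel_loss_lb)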
Example #6
def load_models(
    env_name: str,
    discount: float,
    reward_cfgs: Iterable[common_config.RewardCfg],
) -> Mapping[common_config.RewardCfg, base.RewardModel]:
    """Load models specified by the `reward_cfgs`.

    Args:
        env_name: The environment name in the Gym registry of the rewards to compare.
        discount: Discount to use for reward models (mostly for shaping).
        reward_cfgs: Iterable of reward configurations.

    Returns:
        A mapping from reward configurations to the loaded reward model.
    """
    venv = vec_env.DummyVecEnv([lambda: gym.make(env_name)])
    return {(kind, path): serialize.load_reward(kind, path, venv, discount)
            for kind, path in reward_cfgs}
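# Illustrative call: load two hardcoded PointMass rewards (path "dummy"), inside
# a session as the other examples do. The reward types reuse names shown above.
reward_cfgs = [
    ("evaluating_rewards/PointMassSparseWithCtrl-v0", "dummy"),
    ("evaluating_rewards/PointMassGroundTruth-v0", "dummy"),
]
with networks.make_session():
    models = load_models("evaluating_rewards/PointMassLine-v0", 0.99, reward_cfgs)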
Example #7
def test_rollout_stats():
    """Applying `ObsRewIncrementWrapper` halves the reward mean.

    `rollout_stats` should reflect this.
    """
    env = gym.make("CartPole-v1")
    env = bench.Monitor(env, None)
    env = ObsRewHalveWrapper(env)
    venv = vec_env.DummyVecEnv([lambda: env])

    with serialize.load_policy("zero", "UNUSED", venv) as policy:
        trajs = rollout.generate_trajectories(policy, venv, rollout.min_episodes(10))
    s = rollout.rollout_stats(trajs)

    np.testing.assert_allclose(s["return_mean"], s["monitor_return_mean"] / 2)
    np.testing.assert_allclose(s["return_std"], s["monitor_return_std"] / 2)
    np.testing.assert_allclose(s["return_min"], s["monitor_return_min"] / 2)
    np.testing.assert_allclose(s["return_max"], s["monitor_return_max"] / 2)
Example #8
def test_unwrap_traj():
    """Check that unwrap_traj reverses `ObsRewIncrementWrapper`.

    Also check that unwrapping twice is a no-op.
    """
    env = gym.make("CartPole-v1")
    env = wrappers.RolloutInfoWrapper(env)
    env = ObsRewHalveWrapper(env)
    venv = vec_env.DummyVecEnv([lambda: env])

    with serialize.load_policy("zero", "UNUSED", venv) as policy:
        trajs = rollout.generate_trajectories(policy, venv, rollout.min_episodes(10))
    trajs_unwrapped = [rollout.unwrap_traj(t) for t in trajs]
    trajs_unwrapped_twice = [rollout.unwrap_traj(t) for t in trajs_unwrapped]

    for t, t_unwrapped in zip(trajs, trajs_unwrapped):
        np.testing.assert_allclose(t.acts, t_unwrapped.acts)
        np.testing.assert_allclose(t.obs, t_unwrapped.obs / 2)
        np.testing.assert_allclose(t.rews, t_unwrapped.rews / 2)

    for t1, t2 in zip(trajs_unwrapped, trajs_unwrapped_twice):
        np.testing.assert_equal(t1.acts, t2.acts)
        np.testing.assert_equal(t1.obs, t2.obs)
        np.testing.assert_equal(t1.rews, t2.rews)
Example #9
def batch_reward_heatmaps(
    checkpoints_dir: Union[str, pathlib.Path],
    n_gen_trajs: int = 50,
    exp_trajs: Optional[List[types.Trajectory]] = None,
) -> Dict[pathlib.Path, plt.Figure]:
    """Build multiple mountain car reward heatmaps from a checkpoint directory.

    One plot is generated for every combination of action and checkpoint timestep.

    Args:
        checkpoints_dir: Path to `checkpoint/` directory from AIRL or GAIL output
            directory.
        n_gen_trajs: The number of trajectories to rollout using each generator
            checkpoint. The transitions in the trajectory are scatterplotted on top of
            the heatmap from the same checkpoint timestamp. Nonpositive indicates that
            no trajectories should be plotted.
        exp_trajs: Expert trajectories for scatterplotting. Generator trajectories
            are dynamically generated from generator checkpoints.

    Returns:
        A dictionary mapping relative paths to `plt.Figure`. Every key is of the
        form "{action_name}/{checkpoint_step}" where action_name is "left",
        "neutral", or "right".
    """
    result = {}
    venv = vec_env.DummyVecEnv([lambda: gym.make("MountainCar-v0")])
    checkpoints_dir = pathlib.Path(checkpoints_dir)
    for checkpoint_dir in sorted(checkpoints_dir.iterdir()):
        vec_normalize_path = checkpoint_dir / "gen_policy" / "vec_normalize.pkl"
        discrim_path = checkpoint_dir / "discrim"
        policy_path = checkpoint_dir / "gen_policy"

        if n_gen_trajs > 0:
            # `load_policy` automatically loads VecNormalize for policy evaluation.
            with policies_serialize.load_policy(
                "ppo2", str(policy_path), venv
            ) as gen_policy:
                gen_trajs = rollout.generate_trajectories(
                    gen_policy, venv, sample_until=rollout.min_episodes(n_gen_trajs)
                )
        else:
            gen_trajs = None

        # `gen_trajs` contains unnormalized observations.
        # Load VecNormalize for use in RewardFn, which doesn't automatically
        # normalize input observations.
        with open(vec_normalize_path, "rb") as f:
            vec_normalize = pickle.load(f)  # type: vec_env.VecNormalize
        vec_normalize.training = False

        reward_fn_ctx = rewards_serialize.load_reward("DiscrimNet", discrim_path, venv)
        with reward_fn_ctx as reward_fn:
            norm_rew_fn = common.build_norm_reward_fn(
                reward_fn=reward_fn, vec_normalize=vec_normalize
            )
            for act in range(MC_NUM_ACTS):
                fig = make_heatmap(
                    act=act,
                    reward_fn=norm_rew_fn,
                    gen_trajs=gen_trajs,
                    exp_trajs=exp_trajs,
                )
                path = pathlib.Path(ACT_NAMES[act], checkpoint_dir.name)
                result[path] = fig
    return result
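# Hypothetical use: render heatmaps for every checkpoint under an assumed AIRL
# output directory and save each figure under its relative key.
import pathlib

figures = batch_reward_heatmaps("/tmp/airl_output/checkpoints", n_gen_trajs=10)
for rel_path, fig in figures.items():
    out = pathlib.Path("/tmp/heatmaps", rel_path).with_suffix(".png")
    out.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out)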
Example #10
def make_venv(env_name):
    with make_env_ctx(env_name) as env:
        yield vec_env.DummyVecEnv([lambda: env])
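# The yield-based body suggests this helper is meant to be used as a context
# manager (e.g. via contextlib.contextmanager) or registered as a pytest
# fixture. Assuming the former, and an illustrative environment name:
import contextlib

with contextlib.contextmanager(make_venv)("CartPole-v1") as venv:
    obs = venv.reset()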
Example #11
def test_sample_canon_shaping(
    graph: tf.Graph,
    session: tf.Session,
    discount: float,
    eps: float = 1e-4,
):
    """Tests canonical_sample.sample_canon_shaping.

    Specifically, verifies that sparse, sparse affine-transformed and dense rewards in PointMass
    compare equal (distance < eps); and than sparse and the ground-truth (norm) reward are unequal
    (distance > 0.1).
    """
    venv = vec_env.DummyVecEnv(
        [lambda: gym.make("evaluating_rewards/PointMassLine-v0")])
    reward_types = [
        "evaluating_rewards/PointMassSparseWithCtrl-v0",
        "evaluating_rewards/PointMassDenseWithCtrl-v0",
        "evaluating_rewards/PointMassGroundTruth-v0",
    ]
    with graph.as_default():
        with session.as_default():
            models = {
                k: serialize.load_reward(k, "dummy", venv, discount)
                for k in reward_types
            }
            constant = rewards.ConstantReward(venv.observation_space,
                                              venv.action_space)
            constant.constant.set_constant(42.0)
            models["big_sparse"] = rewards.LinearCombinationModelWrapper({
                "model": (
                    models["evaluating_rewards/PointMassSparseWithCtrl-v0"],
                    tf.constant(10.0),
                ),
                "shift": (constant, tf.constant(1.0)),
            })

    with datasets.sample_dist_from_space(venv.observation_space) as obs_dist:
        with datasets.sample_dist_from_space(venv.action_space) as act_dist:
            with datasets.transitions_factory_iid_from_sample_dist(
                    obs_dist, act_dist) as iid_generator:
                batch = iid_generator(256)
    canon_rew = epic_sample.sample_canon_shaping(
        models,
        batch,
        act_dist,
        obs_dist,
        n_mean_samples=256,
        discount=discount,
    )

    sparse_vs_affine = tabular.direct_distance(
        canon_rew["evaluating_rewards/PointMassSparseWithCtrl-v0"],
        canon_rew["big_sparse"],
        p=1,
    )
    assert sparse_vs_affine < eps
    sparse_vs_dense = tabular.direct_distance(
        canon_rew["evaluating_rewards/PointMassSparseWithCtrl-v0"],
        canon_rew["evaluating_rewards/PointMassDenseWithCtrl-v0"],
        p=1,
    )
    assert sparse_vs_dense < eps
    sparse_vs_gt = tabular.direct_distance(
        canon_rew["evaluating_rewards/PointMassSparseWithCtrl-v0"],
        canon_rew["evaluating_rewards/PointMassGroundTruth-v0"],
        p=1,
    )
    assert sparse_vs_gt > 0.1