Example #1
    def f(make_model):
        policy = base_policies.RandomPolicy(venv.observation_space, venv.action_space)
        with datasets.transitions_factory_from_policy(venv, policy) as dataset_callable:
            batch = dataset_callable(1024)

            with graph.as_default(), session.as_default():
                original = make_model(venv)
                session.run(tf.global_variables_initializer())

                with tempfile.TemporaryDirectory(prefix="eval-rew-serialize") as tmpdir:
                    original.save(tmpdir)

                    with tf.variable_scope("loaded_direct"):
                        loaded_direct = util_serialize.Serializable.load(tmpdir)

                    model_name = "evaluating_rewards/RewardModel-v0"
                    loaded_indirect = serialize.load_reward(model_name, tmpdir, venv)

                models = {"o": original, "ld": loaded_direct, "li": loaded_indirect}
                preds = base.evaluate_models(models, batch)

            for model in models.values():
                assert original.observation_space == model.observation_space
                assert original.action_space == model.action_space

            assert len(preds) == len(models)
            for pred in preds.values():
                assert np.allclose(preds["o"], pred)
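Several examples on this page (including the one above and the test functions below) receive graph and session arguments from test fixtures that are not shown. The following is a minimal sketch, assuming plain pytest fixtures wrapping the TF1 API used throughout these examples; it is not the project's actual conftest.

import pytest
import tensorflow as tf

@pytest.fixture
def graph():
    # One fresh graph per test, so variable scopes do not collide across tests.
    yield tf.Graph()

@pytest.fixture
def session(graph):
    # TF1-style session bound to the test graph; closed automatically at teardown.
    with tf.Session(graph=graph) as sess:
        yield sess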
Example #2
def plot_pm_reward(
    styles: Iterable[str],
    env_name: str,
    discount: float,
    models: Sequence[Tuple[str, str, str]],
    data_root: str,
    # Mesh parameters
    pos_lim: float,
    pos_density: int,
    vel_lim: float,
    act_lim: float,
    density: int,
    # Figure parameters
    ncols: int,
    cbar_kwargs: Mapping[str, Any],
    log_dir: str,
    fmt: str,
) -> xr.DataArray:
    """Entry-point into script to visualize a reward model for point mass."""
    with stylesheets.setup_styles(styles):
        env = gym.make(env_name)
        venv = vec_env.DummyVecEnv([lambda: env])
        goal = np.array([0.0])

        rewards = {}
        with networks.make_session():
            for model_name, reward_type, reward_path in models:
                reward_path = os.path.join(data_root, reward_path)
                model = serialize.load_reward(reward_type, reward_path, venv,
                                              discount)
                reward = point_mass_analysis.evaluate_reward_model(
                    env,
                    model,
                    goal=goal,
                    pos_lim=pos_lim,
                    pos_density=pos_density,
                    vel_lim=vel_lim,
                    act_lim=act_lim,
                    density=density,
                )
                rewards[model_name] = reward

        if len(rewards) == 1:
            reward = next(iter(rewards.values()))
            kwargs = {"col_wrap": ncols}
        else:
            reward = xr.Dataset(rewards).to_array("model")
            kwargs = {"row": "Model"}

        fig = point_mass_analysis.plot_reward(reward,
                                              cbar_kwargs=cbar_kwargs,
                                              **kwargs)
        save_path = os.path.join(log_dir, "reward")
        visualize.save_fig(save_path, fig, fmt=fmt)

        return reward
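For reference, a hypothetical direct call might look as follows. Every argument value is a placeholder assumption (the entry point is normally driven by a script configuration); only the parameter names come from the signature above, and the reward type and "dummy" path are borrowed from other examples on this page.

reward = plot_pm_reward(
    styles=["paper"],  # placeholder stylesheet name
    env_name="evaluating_rewards/PointMassLine-v0",
    discount=0.99,
    models=[("ground_truth", "evaluating_rewards/PointMassGroundTruth-v0", "dummy")],
    data_root=".",
    pos_lim=1.0,
    pos_density=9,
    vel_lim=1.0,
    act_lim=1.0,
    density=21,
    ncols=3,
    cbar_kwargs={},
    log_dir="/tmp/plot_pm_reward",
    fmt="pdf",
)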
Example #3
def regress(
    seed: int,
    # Dataset
    env_name: str,
    discount: float,
    # Target specification
    target_reward_type: str,
    target_reward_path: str,
    # Model parameters
    make_source: MakeModelFn,
    source_init: bool,
    make_trainer: MakeTrainerFn,
    do_training: DoTrainingFn,
    # Logging
    log_dir: str,
    checkpoint_interval: int,
) -> V:
    """Train a model on target and save the results, reporting training stats."""
    # This venv is needed by serialize.load_reward, but is never stepped.
    venv = vec_env.DummyVecEnv([lambda: gym.make(env_name)])

    with networks.make_session() as (_, sess):
        tf.random.set_random_seed(seed)

        with tf.variable_scope("source") as model_scope:
            model = make_source(venv)

        with tf.variable_scope("target"):
            target = serialize.load_reward(target_reward_type,
                                           target_reward_path, venv, discount)

        with tf.variable_scope("train") as train_scope:
            trainer = make_trainer(model, model_scope, target)

        # Do not initialize variables from target: those were already restored
        # when the target model was deserialized.
        init_vars = train_scope.global_variables()
        if source_init:
            init_vars += model_scope.global_variables()
        sess.run(tf.initializers.variables(init_vars))

        def callback(epoch: int) -> None:
            if checkpoint_interval > 0 and epoch % checkpoint_interval == 0:
                trainer.model.save(
                    os.path.join(log_dir, "checkpoints", f"{epoch:05d}"))

        stats = do_training(target, trainer, callback)

        # Trainer may wrap source, so save `trainer.model` not source directly
        # (see e.g. RegressWrappedModel).
        trainer.model.save(os.path.join(log_dir, "checkpoints", "final"))

        with open(os.path.join(log_dir, "stats.pkl"), "wb") as f:
            pickle.dump(stats, f)

    return stats
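The factory arguments make_source, make_trainer and do_training are only called above, never defined. The sketches below are assumptions inferred from those call sites and from other examples on this page; they are not the project's actual implementations.

def make_source(venv):
    # Fresh reward model to be regressed onto the target.
    return base.MLPRewardModel(venv.observation_space, venv.action_space)

def make_trainer(model, model_scope, target):
    del model_scope  # unused in this sketch
    return comparisons.RegressModel(model, target)

def do_training(target, trainer, callback):
    del target, callback  # unused in this sketch
    with datasets.transitions_factory_from_random_model(
            "evaluating_rewards/PointMassLine-v0") as dataset_generator:
        return trainer.fit(dataset_generator, total_timesteps=1e5, batch_size=512)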
Example #4
def get_affine_from_models(env_name: str, paths: Iterable[str]):
    """Extract affine parameters from reward model."""
    venv = vec_env.DummyVecEnv([lambda: gym.make(env_name)])
    res = {}
    with networks.make_session():
        for path in paths:
            model = serialize.load_reward(
                "evaluating_rewards/RewardModel-v0",
                os.path.join(path, "model"),
                venv,
            )
            res[path] = model.models["wrapped"][0].get_weights()
    return res
Example #5
def test_regress(
    graph: tf.Graph,
    session: tf.Session,
    target: str,
    loss_ub: float,
    rel_loss_lb: float,
    discount: float = 0.99,
):
    """Test regression onto target.

    Args:
        graph: The TensorFlow graph to build the models in.
        session: The TensorFlow session used for training.
        target: The target reward model type. Must be a hardcoded reward:
            we always load with a path "dummy".
        loss_ub: The maximum loss of the model at the end of training.
        rel_loss_lb: The minimum ratio of initial to final loss.
        discount: The discount rate used when loading the target reward model.
    """
    env_name = "evaluating_rewards/PointMassLine-v0"
    venv = vec_env.DummyVecEnv([lambda: gym.make(env_name)])

    with datasets.transitions_factory_from_random_model(
            env_name) as dataset_generator:
        with graph.as_default():
            with session.as_default():
                with tf.variable_scope("source") as source_scope:
                    source = base.MLPRewardModel(venv.observation_space,
                                                 venv.action_space)

                with tf.variable_scope("target"):
                    target_model = serialize.load_reward(
                        target, "dummy", venv, discount)

                with tf.variable_scope("match") as match_scope:
                    match = comparisons.RegressModel(source, target_model)

                init_vars = (source_scope.global_variables() +
                             match_scope.global_variables())
                session.run(tf.initializers.variables(init_vars))

                stats = match.fit(dataset_generator,
                                  total_timesteps=1e5,
                                  batch_size=512)

        loss = pd.DataFrame(stats["loss"])["singleton"]
        logging.info(f"Loss: {loss.iloc[::10]}")
        initial_loss = loss.iloc[0]
        logging.info(f"Initial loss: {initial_loss}")
        final_loss = loss.iloc[-10:].mean()
        logging.info(f"Final loss: {final_loss}")

        assert initial_loss / final_loss > rel_loss_lb
        assert final_loss < loss_ub
Example #6
def test_ground_truth_similar_to_gym(graph, session, venv, reward_id):
    """Checks that reward models predictions match those of Gym reward."""
    # Generate rollouts, recording Gym reward
    policy = base_policies.RandomPolicy(venv.observation_space, venv.action_space)
    transitions = rollout.generate_transitions(policy, venv, n_timesteps=1024)
    gym_reward = transitions.rews

    # Make predictions using reward model
    with graph.as_default(), session.as_default():
        reward_model = serialize.load_reward(reward_id, "dummy", venv, 1.0)
        pred_reward = base.evaluate_models({"m": reward_model}, transitions)["m"]

    # Are the predictions close to true Gym reward?
    np.testing.assert_allclose(gym_reward, pred_reward, rtol=0, atol=5e-5)
Example #7
def load_models(
    env_name: str,
    discount: float,
    reward_cfgs: Iterable[common_config.RewardCfg],
) -> Mapping[common_config.RewardCfg, base.RewardModel]:
    """Load models specified by the `reward_cfgs`.

    Args:
        env_name: The environment name in the Gym registry of the rewards to compare.
        discount: Discount to use for reward models (mostly for shaping).
        reward_cfgs: Iterable of reward configurations.

    Returns:
        A mapping from reward configurations to the loaded reward model.
    """
    venv = vec_env.DummyVecEnv([lambda: gym.make(env_name)])
    return {(kind, path): serialize.load_reward(kind, path, venv, discount)
            for kind, path in reward_cfgs}
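A hypothetical call, reusing a reward type and the "dummy" path that appear elsewhere on this page; as in the other examples, loading happens inside a networks.make_session() block.

with networks.make_session():
    models = load_models(
        env_name="evaluating_rewards/PointMassLine-v0",
        discount=0.99,
        reward_cfgs=[("evaluating_rewards/PointMassGroundTruth-v0", "dummy")],
    )
    # Keys are (kind, path) tuples; values are the loaded reward models.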
Example #8
def make_source(venv):
    # `source_reward_cfg` and `discount` come from the enclosing scope.
    kind, path = source_reward_cfg
    return serialize.load_reward(kind, path, venv, discount)
Example #9
def load_monte_carlo_greedy(path: str,
                            env: vec_env.VecEnv) -> MonteCarloGreedyPolicy:
    """Load a greedy Monte Carlo policy from a `reward_type:reward_path:discount` spec."""
    reward_type, reward_path, discount = path.split(":")
    reward_model = serialize.load_reward(reward_type, reward_path, env,
                                         float(discount))
    return MonteCarloGreedyPolicy(env, reward_model=reward_model)
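The path argument packs the reward type, path and discount into a single colon-separated string. A hypothetical call, with values borrowed from other examples on this page:

venv = vec_env.DummyVecEnv(
    [lambda: gym.make("evaluating_rewards/PointMassLine-v0")])
policy = load_monte_carlo_greedy(
    "evaluating_rewards/PointMassGroundTruth-v0:dummy:0.99", venv)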
Example #10
def make_source(venv):
    # `source_reward_type`, `source_reward_path` and `discount` come from the enclosing scope.
    return serialize.load_reward(source_reward_type,
                                 source_reward_path, venv, discount)
Example #11
def test_sample_canon_shaping(
    graph: tf.Graph,
    session: tf.Session,
    discount: float,
    eps: float = 1e-4,
):
    """Tests canonical_sample.sample_canon_shaping.

    Specifically, verifies that the sparse, affine-transformed sparse, and dense rewards in
    PointMass compare equal (distance < eps), and that the sparse and ground-truth (norm)
    rewards are unequal (distance > 0.1).
    """
    venv = vec_env.DummyVecEnv(
        [lambda: gym.make("evaluating_rewards/PointMassLine-v0")])
    reward_types = [
        "evaluating_rewards/PointMassSparseWithCtrl-v0",
        "evaluating_rewards/PointMassDenseWithCtrl-v0",
        "evaluating_rewards/PointMassGroundTruth-v0",
    ]
    with graph.as_default():
        with session.as_default():
            models = {
                k: serialize.load_reward(k, "dummy", venv, discount)
                for k in reward_types
            }
            constant = rewards.ConstantReward(venv.observation_space,
                                              venv.action_space)
            constant.constant.set_constant(42.0)
            models["big_sparse"] = rewards.LinearCombinationModelWrapper({
                "model": (
                    models["evaluating_rewards/PointMassSparseWithCtrl-v0"],
                    tf.constant(10.0),
                ),
                "shift": (constant, tf.constant(1.0)),
            })

            with datasets.sample_dist_from_space(venv.observation_space) as obs_dist:
                with datasets.sample_dist_from_space(venv.action_space) as act_dist:
                    with datasets.transitions_factory_iid_from_sample_dist(
                            obs_dist, act_dist) as iid_generator:
                        batch = iid_generator(256)

                    # Keep this call inside the open sample distributions (it draws
                    # additional samples from them) and inside the session that the
                    # models were built in.
                    canon_rew = epic_sample.sample_canon_shaping(
                        models,
                        batch,
                        act_dist,
                        obs_dist,
                        n_mean_samples=256,
                        discount=discount,
                    )

    sparse_vs_affine = tabular.direct_distance(
        canon_rew["evaluating_rewards/PointMassSparseWithCtrl-v0"],
        canon_rew["big_sparse"],
        p=1,
    )
    assert sparse_vs_affine < eps
    sparse_vs_dense = tabular.direct_distance(
        canon_rew["evaluating_rewards/PointMassSparseWithCtrl-v0"],
        canon_rew["evaluating_rewards/PointMassDenseWithCtrl-v0"],
        p=1,
    )
    assert sparse_vs_dense < eps
    sparse_vs_gt = tabular.direct_distance(
        canon_rew["evaluating_rewards/PointMassSparseWithCtrl-v0"],
        canon_rew["evaluating_rewards/PointMassGroundTruth-v0"],
        p=1,
    )
    assert sparse_vs_gt > 0.1