Example #1
def model_comparison(
    _seed: int,  # pylint:disable=invalid-name
    # Dataset
    env_name: str,
    discount: float,
    dataset_factory: datasets.TransitionsFactory,
    dataset_factory_kwargs: Dict[str, Any],
    # Source specification
    source_reward_type: str,
    source_reward_path: str,
    # Target specification
    target_reward_type: str,
    target_reward_path: str,
    # Model parameters
    comparison_class: Type[comparisons.RegressModel],
    comparison_kwargs: Dict[str, Any],
    affine_size: int,
    total_timesteps: int,
    batch_size: int,
    fit_kwargs: Dict[str, Any],
    # Logging
    log_dir: str,
) -> Mapping[str, Any]:
    """Entry-point into script to regress source onto target reward model."""
    with dataset_factory(env_name, seed=_seed,
                         **dataset_factory_kwargs) as dataset_generator:

        def make_source(venv):
            return serialize.load_reward(source_reward_type,
                                         source_reward_path, venv, discount)

        def make_trainer(model, model_scope, target):
            del model_scope
            return comparison_class(model, target, **comparison_kwargs)

        def do_training(target, trainer):
            del target
            return trainer.fit(
                dataset_generator,
                total_timesteps=total_timesteps,
                batch_size=batch_size,
                affine_size=affine_size,
                **fit_kwargs,
            )

        return regress_utils.regress(
            seed=_seed,
            env_name=env_name,
            discount=discount,
            make_source=make_source,
            source_init=False,
            make_trainer=make_trainer,
            do_training=do_training,
            target_reward_type=target_reward_type,
            target_reward_path=target_reward_path,
            log_dir=log_dir,
        )
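The injected _seed argument and the flat keyword-only configuration suggest a Sacred-style experiment entry point, but the function can also be called directly. The sketch below assumes the same module imports as the snippet above; the environment id, factory, reward types/paths, and numeric settings are placeholders rather than values from the project.

# Sketch only: every value below is illustrative. my_transitions_factory
# stands in for a real datasets.TransitionsFactory (used as a context manager
# in the entry point above); the reward type/path strings are not real configs.
stats = model_comparison(
    _seed=0,
    env_name="CartPole-v1",
    discount=0.99,
    dataset_factory=my_transitions_factory,
    dataset_factory_kwargs={},
    source_reward_type="<serialized-reward-type>",
    source_reward_path="<path/to/source>",
    target_reward_type="<serialized-reward-type>",
    target_reward_path="<path/to/target>",
    comparison_class=comparisons.RegressModel,
    comparison_kwargs={"learning_rate": 1e-2},
    affine_size=4096,
    total_timesteps=100_000,
    batch_size=4096,
    fit_kwargs={},
    log_dir="output/model_comparison",
)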
Example #2
def train_regress(
    _seed: int,  # pylint:disable=invalid-name
    # Dataset
    env_name: str,
    discount: float,
    dataset_factory: datasets.TransitionsFactory,
    dataset_factory_kwargs: Dict[str, Any],
    # Target specification
    target_reward_type: str,
    target_reward_path: str,
    # Model parameters
    model_reward_type: regress_utils.EnvRewardFactory,
    total_timesteps: int,
    batch_size: int,
    learning_rate: float,
    # Logging
    checkpoint_interval: int,
    log_dir: str,
) -> Mapping[str, Any]:
    """Entry-point into script to regress source onto target reward model."""
    with dataset_factory(env_name, seed=_seed,
                         **dataset_factory_kwargs) as dataset_generator:
        make_source = functools.partial(regress_utils.make_model,
                                        model_reward_type)

        def make_trainer(model, model_scope, target):
            del model_scope
            return comparisons.RegressModel(model,
                                            target,
                                            learning_rate=learning_rate)

        def do_training(target, trainer, callback: Optional[base.Callback]):
            del target
            return trainer.fit(
                dataset_generator,
                total_timesteps=total_timesteps,
                batch_size=batch_size,
                callback=callback,
            )

        return regress_utils.regress(
            seed=_seed,
            env_name=env_name,
            discount=discount,
            make_source=make_source,
            source_init=True,
            make_trainer=make_trainer,
            do_training=do_training,
            target_reward_type=target_reward_type,
            target_reward_path=target_reward_path,
            log_dir=log_dir,
            checkpoint_interval=checkpoint_interval,
        )
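Both entry points hand their work off to regress_utils.regress, whose body is not shown on this page. The skeleton below is only a reconstruction of the hand-off implied by the call sites above, included to make the roles of make_source, make_trainer and do_training explicit; the helpers marked "hypothetical" are invented names and this is not the library's actual implementation.

# Schematic reconstruction, not the real regress_utils.regress.
def regress_sketch(seed, env_name, discount, make_source, source_init,
                   make_trainer, do_training, target_reward_type,
                   target_reward_path, log_dir, checkpoint_interval=0):
    venv = make_env(env_name, seed)                    # hypothetical helper
    target = load_target(target_reward_type,           # hypothetical helper
                         target_reward_path, venv, discount)
    with model_variable_scope() as model_scope:        # hypothetical scope helper
        # Either loads an existing reward (source_init=False, Example #1)
        # or builds a fresh model to be trained (source_init=True, Example #2).
        model = make_source(venv)
    trainer = make_trainer(model, model_scope, target)
    callback = checkpoint_callback(log_dir, checkpoint_interval)  # hypothetical
    stats = do_training(target, trainer, callback)     # Example #1's variant omits callback
    return stats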
Example #3
def train_preferences(
    _seed: int,  # pylint:disable=invalid-name
    # Dataset
    env_name: str,
    discount: float,
    num_vec: int,
    policy_type: str,
    policy_path: str,
    # Target specification
    target_reward_type: str,
    target_reward_path: str,
    # Model parameters
    model_reward_type: regress_utils.EnvRewardFactory,
    trajectory_length: int,
    total_timesteps: int,
    batch_timesteps: int,
    learning_rate: float,
    weight_l2_reg: float,
    reward_l2_reg: float,
    accuracy_threshold: float,
    # Logging
    log_dir: str,
    checkpoint_interval: int,
) -> Mapping[str, Any]:
    """Entry-point into script for synthetic preference comparisons."""
    venv = util.make_vec_env(env_name, n_envs=num_vec, seed=_seed)

    make_source = functools.partial(regress_utils.make_model,
                                    model_reward_type)

    def make_trainer(model, model_scope, target):
        del target
        model_params = model_scope.global_variables()
        batch_size = batch_timesteps // trajectory_length
        kwargs = {"learning_rate": learning_rate}
        return preferences.PreferenceComparisonTrainer(
            model,
            model_params,
            batch_size=batch_size,
            optimizer_kwargs=kwargs,
            weight_l2_reg=weight_l2_reg,
            reward_l2_reg=reward_l2_reg,
            accuracy_threshold=accuracy_threshold,
        )

    with policies_serialize.load_policy(policy_type, policy_path,
                                        venv) as policy:

        def do_training(target, trainer, callback: Optional[base.Callback]):
            # Specify in terms of total_timesteps so longer trajectory_length
            # does not give model more data.
            total_comparisons = total_timesteps // trajectory_length

            return trainer.fit_synthetic(
                venv,
                policy=policy,
                target=target,
                trajectory_length=trajectory_length,
                total_comparisons=total_comparisons,
                callback=callback,
            )

        return regress_utils.regress(
            seed=_seed,
            env_name=env_name,
            discount=discount,
            make_source=make_source,
            source_init=True,
            make_trainer=make_trainer,
            do_training=do_training,
            target_reward_type=target_reward_type,
            target_reward_path=target_reward_path,
            log_dir=log_dir,
            checkpoint_interval=checkpoint_interval,
        )
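The two integer divisions in train_preferences are worth a quick worked example: both the per-batch size and the total comparison budget are expressed in environment timesteps, so increasing trajectory_length yields fewer (but longer) comparisons instead of more training data. With illustrative numbers:

# Illustrative numbers only.
trajectory_length = 5
batch_timesteps = 10_000
total_timesteps = 1_000_000

batch_size = batch_timesteps // trajectory_length         # 2_000
total_comparisons = total_timesteps // trajectory_length  # 200_000

# Doubling trajectory_length to 10 halves both quantities, so the number of
# environment timesteps consumed by training stays constant.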
Example #4
def npec_worker(
    seed: int,
    # Dataset
    env_name: str,
    discount: float,
    visitations_factory,
    visitations_factory_kwargs: Dict[str, Any],
    # Models to compare
    source_reward_cfg: common_config.RewardCfg,
    target_reward_cfg: common_config.RewardCfg,
    # Model parameters
    comparison_class: Type[comparisons.RegressModel],
    comparison_kwargs: Dict[str, Any],
    total_timesteps: int,
    batch_size: int,
    fit_kwargs: Dict[str, Any],
    # Logging
    log_dir: str,
) -> comparisons.FitStats:
    """Performs a single NPEC comparison by fitting a model.

    Args:
        seed: Seed used for visitation factory and model initialization.
        env_name: the name of the environment to compare rewards for.
        discount: discount to use for reward models (mostly for shaping).
        visitations_factory: factory to sample transitions from during training.
        visitations_factory_kwargs: keyword arguments for the visitations factory.
        source_reward_cfg: specifies the serialized source reward.
        target_reward_cfg: specifies the serialized target reward to fit the source onto.
        comparison_class: how to fit the source onto target.
        comparison_kwargs: keyword arguments customizing `comparison_class`.
        total_timesteps: the total number of timesteps to train for.
        batch_size: the number of timesteps in each training batch.
        fit_kwargs: extra arguments to pass to the `fit` method of `comparison_class`.
        log_dir: directory to save data to.

    Returns:
        Statistics for training, including the final loss, i.e. the estimated NPEC distance.
    """
    # Configure logging, since Ray children do not by default inherit logging configs.
    script_utils.configure_logging()

    with visitations_factory(seed=seed, **visitations_factory_kwargs) as dataset_generator:

        def make_source(venv):
            kind, path = source_reward_cfg
            return serialize.load_reward(kind, path, venv, discount)

        def make_trainer(model, model_scope, target):
            del model_scope
            return comparison_class(model, target, **comparison_kwargs)

        def do_training(target, trainer, callback):
            del target
            return trainer.fit(
                dataset_generator,
                total_timesteps=total_timesteps,
                batch_size=batch_size,
                callback=callback,
                **fit_kwargs,
            )

        target_reward_type, target_reward_path = target_reward_cfg
        return regress_utils.regress(
            seed=seed,
            env_name=env_name,
            discount=discount,
            make_source=make_source,
            source_init=False,
            make_trainer=make_trainer,
            do_training=do_training,
            target_reward_type=target_reward_type,
            target_reward_path=target_reward_path,
            log_dir=log_dir,
            checkpoint_interval=0,  # disable checkpoints
        )
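npec_worker configures its own logging because, as its comment notes, it is intended to run as a Ray task. A minimal dispatch sketch follows, assuming Ray is installed and that reward_cfgs and worker_kwargs (the remaining keyword arguments above) are defined elsewhere; only the Ray plumbing is shown.

import itertools

import ray

ray.init()
remote_worker = ray.remote(npec_worker)

# One comparison per ordered (source, target) pair; reward_cfgs and
# worker_kwargs are assumed to be supplied by the caller.
futures = [
    remote_worker.remote(
        seed=0,
        source_reward_cfg=src,
        target_reward_cfg=tgt,
        **worker_kwargs,
    )
    for src, tgt in itertools.product(reward_cfgs, reward_cfgs)
]
results = ray.get(futures)  # list of comparisons.FitStats, one per pair
ray.shutdown()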