Example #1
def train_eval(
    # harness
    # tensorboard files
    root_dir,
    # Params for collect
    num_environment_steps,
    # Params for eval
    num_eval_episodes,
    eval_interval,
    # Params for summaries
    summary_interval,
    # environment
    env_name,
    gym_random_seed,
    # agent
    random_seed,
    num_hidden_layers_agent,
    num_hidden_nodes_agent,
    discount_factor,
    lambda_value,
    max_kl,
    backtrack_coefficient,
    backtrack_iters,
    cg_iters,
    reward_normalizer,
    reward_norm_clipping,
    log_prob_clipping,
    value_train_iters,
    value_optimizer,
    gradient_clipping,
    debug,
    # agent trainer
    steps_per_policy_update,
    # agent specific harness parameters
    replay_buffer_capacity,
    use_tf_function,
):
    """
    This function will train and evaluate a TRPO agent.

    :param root_dir: Root directory where all experiments are stored.
    :param num_environment_steps: The number of environment steps to run the
            experiment for.
    :param num_eval_episodes: Number of episodes at each evaluation point.
    :param eval_interval: Interval for evaluation points.
    :param summary_interval: Interval for summaries.
    :param env_name: Name for the environment to load.
    :param gym_random_seed: Value to use as seed for the environment.
    :param random_seed: Value to use as the global TensorFlow random seed.
    :param num_hidden_layers_agent: A model-free agent parameter, used for constructing neural
            networks for actor and critic. A number of hidden layers in the neural network.
    :param num_hidden_nodes_agent: A model-free agent parameter, used for constructing neural
            networks for actor and critic. A number of nodes in each hidden layer. Parameter is
            shared across all layers.
    :param discount_factor: discount factor in [0, 1]
    :param lambda_value: trace decay used by the GAE critic in [0, 1]
    :param max_kl: maximum KL distance between updated and old policy
    :param backtrack_coefficient: coefficient used in step size search
    :param backtrack_iters: number of iterations to perform in line search
    :param cg_iters: number of conjugate gradient iterations to approximate natural gradient
    :param reward_normalizer: TensorNormalizer applied to rewards
    :param reward_norm_clipping: value to clip rewards
    :param log_prob_clipping: clip value for log probs in policy gradient, None for no clipping
    :param value_train_iters: number of gradient steps to perform on value estimator
            for every policy update
    :param value_optimizer: optimizer used to train value_function (default: Adam)
    :param gradient_clipping: norm value used to clip gradients (None for no clipping)
    :param debug: debug flag to check computations for NaNs
    :param steps_per_policy_update: steps between policy updates
    :param replay_buffer_capacity: Capacity of the buffer collecting real samples.
    :param use_tf_function: If `True`, use a `tf.function` for data collection.
    """
    tf.compat.v1.set_random_seed(random_seed)

    environment = create_real_tf_environment(env_name, gym_random_seed)
    evaluation_environment = create_real_tf_environment(env_name, gym_random_seed)

    network_architecture = (num_hidden_nodes_agent,) * num_hidden_layers_agent
    actor_net = ActorDistributionNetwork(
        environment.observation_spec(),
        environment.action_spec(),
        fc_layer_params=network_architecture,
    )
    value_net = ValueNetwork(
        environment.observation_spec(), fc_layer_params=network_architecture
    )
    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = TRPOAgent(
        environment.time_step_spec(),
        environment.action_spec(),
        actor_net,
        value_net,
        discount_factor,
        lambda_value,
        max_kl,
        backtrack_coefficient,
        backtrack_iters,
        cg_iters,
        reward_normalizer,
        reward_norm_clipping,
        log_prob_clipping,
        value_train_iters,
        value_optimizer,
        gradient_clipping,
        debug,
        train_step_counter=global_step,
    )

    agent_trainer = OnPolicyModelFreeAgentTrainer(steps_per_policy_update)

    experiment_harness = ExperimentHarness(
        root_dir,
        environment,
        evaluation_environment,
        agent,
        agent_trainer,
        replay_buffer_capacity,
        num_environment_steps,
        summary_interval,
        eval_interval,
        num_eval_episodes,
        number_of_initial_random_policy_steps=0,
        use_tf_function=use_tf_function,
    )
    experiment_harness.run()
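A minimal sketch of how this `train_eval` might be invoked, assuming the function above is importable from its module. All hyperparameter values are illustrative placeholders rather than values from the original experiment, and the reward normalizer shown is just one possible choice from TF-Agents.

import tensorflow as tf
from tf_agents.utils.tensor_normalizer import StreamingTensorNormalizer

# Illustrative values only; tune for the chosen environment.
train_eval(
    root_dir="/tmp/trpo_pendulum",  # TensorBoard files are written under this directory
    num_environment_steps=100_000,
    num_eval_episodes=10,
    eval_interval=1_000,
    summary_interval=1_000,
    env_name="Pendulum-v0",
    gym_random_seed=0,
    random_seed=0,
    num_hidden_layers_agent=2,
    num_hidden_nodes_agent=64,
    discount_factor=0.99,
    lambda_value=0.95,
    max_kl=0.01,
    backtrack_coefficient=0.8,
    backtrack_iters=10,
    cg_iters=10,
    reward_normalizer=StreamingTensorNormalizer(tf.TensorSpec(shape=(), dtype=tf.float32)),
    reward_norm_clipping=10.0,
    log_prob_clipping=None,
    value_train_iters=80,
    value_optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    gradient_clipping=None,
    debug=False,
    steps_per_policy_update=200,
    replay_buffer_capacity=1_000,
    use_tf_function=True,
)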
Example #2
def train_eval(
    # harness
    # tensorboard files
    root_dir,
    # Params for collect
    num_environment_steps,
    # Params for eval
    num_eval_episodes,
    eval_interval,
    # Params for summaries
    summary_interval,
    # environment
    env_name,
    gym_random_seed,
    reward_model_class,
    initial_state_distribution_model_class,
    # agent
    random_seed,
    transition_model_type,
    num_hidden_layers_model,
    num_hidden_nodes_model,
    activation_function_model,
    ensemble_size,
    predict_state_difference,
    epochs,
    training_batch_size,
    trajectory_sampler_type,
    horizon,
    population_size,
    model_free_agent_type,
    num_hidden_layers_agent,
    num_hidden_nodes_agent,
    activation_function_agent,
    model_free_training_iterations,
    debug_summaries,
    # agent trainer
    steps_per_transition_model_update,
    steps_per_model_free_agent_update,
    # agent specific harness parameters
    replay_buffer_capacity,
    number_of_initial_random_policy_steps,
    use_tf_function,
):
    """
    This function will train and evaluate an MEPO agent.

    :param root_dir: Root directory where all experiments are stored.
    :param num_environment_steps: The number of environment steps to run the
            experiment for.
    :param num_eval_episodes: Number of episodes at each evaluation point.
    :param eval_interval: Interval for evaluation points.
    :param summary_interval: Interval for summaries.
    :param env_name: Name for the environment to load.
    :param gym_random_seed: Value to use as seed for the environment.
    :param reward_model_class: A component of the environment model that describes the
            rewards. At the moment only pre-specified reward models are allowed, i.e. agent
            assumes reward function is known.
    :param initial_state_distribution_model_class: A component of the environment model that
            describes the initial state distribution (can be both deterministic or
            probabilistic). At the moment only pre-specified initial state distribution models
            are allowed, i.e. agent assumes initial state distribution is known.
    :param random_seed: Value to use as the global TensorFlow random seed.
    :param transition_model_type: An indicator which of the available transition models
            should be used - list can be found in `TransitionModelType`. A component of the
            environment model that describes the transition dynamics.
    :param num_hidden_layers_model: A transition model parameter, used for constructing a neural
            network. A number of hidden layers in the neural network.
    :param num_hidden_nodes_model: A transition model parameter, used for constructing a neural
            network. A number of nodes in each hidden layer. Parameter is shared across all layers.
    :param activation_function_model: A transition model parameter, used for constructing a
            neural network. An activation function of the hidden nodes.
    :param ensemble_size: A transition model parameter, used for constructing a neural
            network. The number of networks in the ensemble.
    :param predict_state_difference: A transition model parameter, used for constructing a
            neural network. A boolean indicating whether transition model will be predicting a
            difference between current and a next state or the next state directly.
    :param epochs: A transition model parameter, used by Keras fit method. A number of epochs
            used for training the neural network.
    :param training_batch_size: A transition model parameter, used by Keras fit method. A
            batch size used for training the neural network.
    :param trajectory_sampler_type: An indicator which of the available trajectory samplers
            should be used - list can be found in `TrajectorySamplerType`. Trajectory sampler
            determines how predictions from an ensemble of neural networks that model the
            transition dynamics are sampled. Works only with ensemble type of transition models.
    :param horizon: A trajectory optimiser parameter. The number of steps taken in the
            environment in each virtual rollout.
    :param population_size: A trajectory optimiser parameter. The number of virtual rollouts
            that are simulated in each iteration during trajectory optimization.
    :param model_free_agent_type: Type of model-free agent, e.g. PPO or TRPO.
    :param num_hidden_layers_agent: A model-free agent parameter, used for constructing neural
            networks for actor and critic. A number of hidden layers in the neural network.
    :param num_hidden_nodes_agent: A model-free agent parameter, used for constructing neural
            networks for actor and critic. A number of nodes in each hidden layer. Parameter is
            shared across all layers.
    :param activation_function_agent: A model-free agent parameter, used for constructing a
            neural network. An activation function of the hidden nodes.
    :param model_free_training_iterations: Number of model-free training iterations per each
            train-call.
    :param debug_summaries: A bool; if true, subclasses should gather debug summaries.
    :param steps_per_transition_model_update: steps between transition model updates.
    :param steps_per_model_free_agent_update: steps between model-free agent updates.
    :param replay_buffer_capacity: Capacity of the buffer collecting real samples.
    :param number_of_initial_random_policy_steps: If > 0, some initial training data is
            gathered by running a random policy on the real environment.
    :param use_tf_function: If `True`, use a `tf.function` for data collection.

    """
    tf.compat.v1.set_random_seed(random_seed)

    environment = create_real_tf_environment(env_name, gym_random_seed)
    evaluation_environment = create_real_tf_environment(env_name, gym_random_seed)

    callbacks = [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)]
    reward_model = reward_model_class(
        environment.observation_spec(), environment.action_spec()
    )
    initial_state_distribution_model = initial_state_distribution_model_class(
        environment.observation_spec()
    )
    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = MepoAgent(
        environment.time_step_spec(),
        environment.action_spec(),
        transition_model_type,
        num_hidden_layers_model,
        num_hidden_nodes_model,
        activation_function_model,
        ensemble_size,
        predict_state_difference,
        epochs,
        training_batch_size,
        callbacks,
        reward_model,
        initial_state_distribution_model,
        trajectory_sampler_type,
        horizon,
        population_size,
        model_free_agent_type,
        num_hidden_layers_agent,
        num_hidden_nodes_agent,
        activation_function_agent,
        model_free_training_iterations,
        debug_summaries=debug_summaries,
        train_step_counter=global_step,
    )

    agent_trainer = BackgroundPlanningAgentTrainer(
        steps_per_transition_model_update, steps_per_model_free_agent_update
    )

    experiment_harness = ExperimentHarness(
        root_dir,
        environment,
        evaluation_environment,
        agent,
        agent_trainer,
        replay_buffer_capacity,
        num_environment_steps,
        summary_interval,
        eval_interval,
        num_eval_episodes,
        number_of_initial_random_policy_steps,
        use_tf_function,
    )
    experiment_harness.run()
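As with the TRPO example, a hypothetical call to this MEPO `train_eval` is sketched below. The reward model class, initial state distribution class, and the enum members of `TransitionModelType`, `TrajectorySamplerType`, and the model-free agent type are placeholders for names defined in the surrounding codebase (their imports are omitted for that reason); all numeric values are illustrative only.

import tensorflow as tf

# `PendulumRewardModel`, `GaussianInitialStateModel` and the enum members below
# are hypothetical stand-ins for codebase-specific names.
train_eval(
    root_dir="/tmp/mepo_pendulum",
    num_environment_steps=50_000,
    num_eval_episodes=10,
    eval_interval=1_000,
    summary_interval=1_000,
    env_name="Pendulum-v0",
    gym_random_seed=0,
    reward_model_class=PendulumRewardModel,
    initial_state_distribution_model_class=GaussianInitialStateModel,
    random_seed=0,
    transition_model_type=TransitionModelType.DETERMINISTIC,  # hypothetical member
    num_hidden_layers_model=3,
    num_hidden_nodes_model=250,
    activation_function_model=tf.nn.relu,
    ensemble_size=5,
    predict_state_difference=True,
    epochs=100,
    training_batch_size=256,
    trajectory_sampler_type=TrajectorySamplerType.TS1,  # hypothetical member
    horizon=25,
    population_size=400,
    model_free_agent_type=ModelFreeAgentType.PPO,  # hypothetical member
    num_hidden_layers_agent=2,
    num_hidden_nodes_agent=64,
    activation_function_agent=tf.nn.tanh,
    model_free_training_iterations=40,
    debug_summaries=False,
    steps_per_transition_model_update=250,
    steps_per_model_free_agent_update=250,
    replay_buffer_capacity=10_000,
    number_of_initial_random_policy_steps=1_000,
    use_tf_function=True,
)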
Example #3
def test_create_real_tf_environment(dummy_environment_name, environment_seed):
    environment = create_real_tf_environment(dummy_environment_name,
                                             environment_seed)
    assert environment.pyenv.envs[0].random_seed == environment_seed
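`dummy_environment_name` and `environment_seed` are pytest fixture arguments supplied by the surrounding test suite. A minimal sketch of what such fixtures might look like follows; the environment name is a placeholder, since the real suite may register its own dummy environment.

import pytest


@pytest.fixture(name="dummy_environment_name")
def _dummy_environment_name_fixture():
    # Placeholder; the real suite may register a purpose-built dummy environment.
    return "MountainCarContinuous-v0"


@pytest.fixture(name="environment_seed")
def _environment_seed_fixture():
    return 0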