Example #1
File: run_ppo.py  Project: adak32/bellman
def train_eval(
    # tensorboard files
    root_dir,
    # environment
    env_name="CartPole-v1",
    random_seed=0,
    # Params for collect
    num_environment_steps=100000,
    replay_buffer_capacity=1001,  # Per-environment
    # Params for eval
    num_eval_episodes=30,
    eval_interval=200,
    # Params for summaries
    summary_interval=50,
):
    tf.compat.v1.set_random_seed(random_seed)

    environment = TFPyEnvironment(suite_gym.load(env_name))
    evaluation_environment = TFPyEnvironment(suite_gym.load(env_name))

    actor_net = ActorDistributionNetwork(environment.observation_spec(),
                                         environment.action_spec(),
                                         fc_layer_params=(200, 100))
    value_net = ValueNetwork(environment.observation_spec(),
                             fc_layer_params=(200, 100))
    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = PPOClipAgent(  # should be closer to the paper than PPOAgent...
        environment.time_step_spec(),
        environment.action_spec(),
        optimizer=tf.compat.v1.train.AdamOptimizer(),  # default None does not work
        actor_net=actor_net,
        value_net=value_net,
        importance_ratio_clipping=0.2,
        normalize_observations=False,
        normalize_rewards=False,
        use_gae=True,
        lambda_value=0.5,
        discount_factor=0.95,
        train_step_counter=global_step,
    )

    agent_trainer = OnPolicyModelFreeAgentTrainer(400)

    experiment_harness = ExperimentHarness(
        root_dir,
        environment,
        evaluation_environment,
        agent,
        agent_trainer,
        replay_buffer_capacity,
        num_environment_steps,
        summary_interval,
        eval_interval,
        num_eval_episodes,
        number_of_initial_random_policy_steps=0,
        use_tf_function=True,
    )
    experiment_harness.run()
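The excerpt above shows only the train_eval function from run_ppo.py; its imports are not part of the listing. A minimal sketch of what they would look like follows. The TF-Agents import paths are standard, but the bellman-side paths for ExperimentHarness and OnPolicyModelFreeAgentTrainer are assumptions, since the excerpt does not show them.

import tensorflow as tf
from tf_agents.agents.ppo.ppo_clip_agent import PPOClipAgent
from tf_agents.environments import suite_gym
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.networks.actor_distribution_network import ActorDistributionNetwork
from tf_agents.networks.value_network import ValueNetwork

# The two module paths below are assumptions; check the adak32/bellman repository for the
# actual locations of these classes.
from bellman.harness.harness import ExperimentHarness
from bellman.training.agent_trainer import OnPolicyModelFreeAgentTrainer

# Hypothetical invocation, writing TensorBoard output to a scratch directory:
# train_eval(root_dir="/tmp/ppo_cartpole")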
Example #2
def _experiment_harness_fixture(tmpdir) -> ExperimentHarness:
    root_dir = str(tmpdir / "root_dir")

    environment = RandomTFEnvironment(TIMESTEP_SPEC, ACTION_SPEC)
    evaluation_environment = RandomTFEnvironment(TIMESTEP_SPEC, ACTION_SPEC)
    agent = MyAgent(time_step_spec=environment.time_step_spec(),
                    action_spec=environment.action_spec())
    agent_trainer = SingleComponentAgentTrainer()

    return ExperimentHarness(
        root_dir=root_dir,
        environment=environment,
        evaluation_environment=evaluation_environment,
        agent=agent,
        agent_trainer=agent_trainer,
        real_replay_buffer_capacity=_REAL_REPLAY_BUFFER_CAPACITY,
        total_number_of_environment_steps=_MAX_STEPS,
        summary_interval=1,
        evaluation_interval=_MAX_STEPS,
        number_of_evaluation_episodes=1,
        number_of_initial_random_policy_steps=0,
    )
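A hypothetical sketch of how this helper might be used from a pytest test; the fixture and test names are invented, and ExperimentHarness, _MAX_STEPS and the other names are assumed to be importable in the same test module.

import pytest

@pytest.fixture(name="experiment_harness")
def experiment_harness_fixture(tmpdir) -> ExperimentHarness:
    # Build a fresh harness per test using the helper above.
    return _experiment_harness_fixture(tmpdir)

def test_harness_run_completes(experiment_harness):
    # Running the harness drives the random environment for _MAX_STEPS steps and writes
    # summaries under root_dir; the call should complete without raising.
    experiment_harness.run()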
Example #3
def train_eval(
    # tensorboard files
    root_dir,
    # environment
    env_name="Pendulum-v0",
    random_seed=0,
    # Params for collect
    num_environment_steps=100000,
    replay_buffer_capacity=1001,  # Per-environment
    # Params for eval
    num_eval_episodes=30,
    eval_interval=200,
    # Params for summaries
    summary_interval=50,
):
    tf.compat.v1.set_random_seed(random_seed)

    environment = TFPyEnvironment(suite_gym.load(env_name))
    evaluation_environment = TFPyEnvironment(suite_gym.load(env_name))

    critic_network = CriticNetwork(
        input_tensor_spec=(environment.observation_spec(),
                           environment.action_spec()),
        observation_fc_layer_params=None,
        action_fc_layer_params=None,
        joint_fc_layer_params=(200, 100),
    )
    actor_network = ActorNetwork(
        input_tensor_spec=environment.observation_spec(),
        output_tensor_spec=environment.action_spec(),
        fc_layer_params=(200, 100),
    )
    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = DdpgAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        critic_network=critic_network,
        actor_network=actor_network,
        actor_optimizer=tf.compat.v1.train.AdamOptimizer(),
        critic_optimizer=tf.compat.v1.train.AdamOptimizer(),
        train_step_counter=global_step,
    )

    agent_trainer = OffPolicyModelFreeAgentTrainer(1, 256)

    experiment_harness = ExperimentHarness(
        root_dir,
        environment,
        evaluation_environment,
        agent,
        agent_trainer,
        replay_buffer_capacity,
        num_environment_steps,
        summary_interval,
        eval_interval,
        num_eval_episodes,
        number_of_initial_random_policy_steps=0,
        use_tf_function=True,
    )
    experiment_harness.run()
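As with Example #1, only the function body is listed. The DDPG-specific classes come from TF-Agents; a sketch of the corresponding imports is below. The bellman-side imports for ExperimentHarness and OffPolicyModelFreeAgentTrainer are again not shown in the excerpt and would follow the same (assumed) pattern as in Example #1.

import tensorflow as tf
from tf_agents.agents.ddpg.actor_network import ActorNetwork
from tf_agents.agents.ddpg.critic_network import CriticNetwork
from tf_agents.agents.ddpg.ddpg_agent import DdpgAgent
from tf_agents.environments import suite_gym
from tf_agents.environments.tf_py_environment import TFPyEnvironment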
Example #4
def train_eval(
    # harness
    # tensorboard files
    root_dir,
    # Params for collect
    num_environment_steps,
    # Params for eval
    num_eval_episodes,
    eval_interval,
    # Params for summaries
    summary_interval,
    # environment
    env_name,
    gym_random_seed,
    reward_model_class,
    initial_state_distribution_model_class,
    # agent
    random_seed,
    transition_model_type,
    num_hidden_layers_model,
    num_hidden_nodes_model,
    activation_function_model,
    ensemble_size,
    predict_state_difference,
    epochs,
    training_batch_size,
    trajectory_sampler_type,
    horizon,
    population_size,
    model_free_agent_type,
    num_hidden_layers_agent,
    num_hidden_nodes_agent,
    activation_function_agent,
    model_free_training_iterations,
    debug_summaries,
    # agent trainer
    steps_per_transition_model_update,
    steps_per_model_free_agent_update,
    # agent specific harness parameters
    replay_buffer_capacity,
    number_of_initial_random_policy_steps,
    use_tf_function,
):
    """
    This function will train and evaluate an MEPO agent.

    :param root_dir: Root directory where all experiments are stored.
    :param num_environment_steps: The number of environment steps to run the
            experiment for.
    :param num_eval_episodes: Number of episodes at each evaluation point.
    :param eval_interval: Interval for evaluation points.
    :param summary_interval: Interval for summaries.
    :param env_name: Name for the environment to load.
    :param gym_random_seed: Value to use as seed for the environment.
    :param reward_model_class: A component of the environment model that describes the
            rewards. At the moment only pre-specified reward models are allowed, i.e. the
            agent assumes the reward function is known.
    :param initial_state_distribution_model_class: A component of the environment model that
            describes the initial state distribution (which can be either deterministic or
            probabilistic). At the moment only pre-specified initial state distribution models
            are allowed, i.e. the agent assumes the initial state distribution is known.
    :param random_seed: Global random seed for the experiment, used to seed TensorFlow via
            `tf.compat.v1.set_random_seed`.
    :param transition_model_type: An indicator of which of the available transition models
            should be used - the list can be found in `TransitionModelType`. A component of the
            environment model that describes the transition dynamics.
    :param num_hidden_layers_model: A transition model parameter, used for constructing a neural
            network. The number of hidden layers in the neural network.
    :param num_hidden_nodes_model: A transition model parameter, used for constructing a neural
            network. The number of nodes in each hidden layer. This parameter is shared across
            all layers.
    :param activation_function_model: A transition model parameter, used for constructing a
            neural network. The activation function of the hidden nodes.
    :param ensemble_size: A transition model parameter, used for constructing a neural
            network. The number of networks in the ensemble.
    :param predict_state_difference: A transition model parameter, used for constructing a
            neural network. A boolean indicating whether the transition model predicts the
            difference between the current state and the next state, or the next state directly.
    :param epochs: A transition model parameter, used by the Keras fit method. The number of
            epochs used for training the neural network.
    :param training_batch_size: A transition model parameter, used by the Keras fit method.
            The batch size used for training the neural network.
    :param trajectory_sampler_type: An indicator of which of the available trajectory samplers
            should be used - the list can be found in `TrajectorySamplerType`. The trajectory
            sampler determines how predictions from an ensemble of neural networks that model
            the transition dynamics are sampled. This works only with ensemble-type transition
            models.
    :param horizon: A trajectory optimiser parameter. The number of steps taken in the
            environment in each virtual rollout.
    :param population_size: A trajectory optimiser parameter. The number of virtual rollouts
            that are simulated in each iteration during trajectory optimization.
    :param model_free_agent_type: Type of model-free agent, e.g. PPO or TRPO.
    :param num_hidden_layers_agent: A model-free agent parameter, used for constructing neural
            networks for the actor and critic. The number of hidden layers in each network.
    :param num_hidden_nodes_agent: A model-free agent parameter, used for constructing neural
            networks for the actor and critic. The number of nodes in each hidden layer. This
            parameter is shared across all layers.
    :param activation_function_agent: A model-free agent parameter, used for constructing a
            neural network. The activation function of the hidden nodes.
    :param model_free_training_iterations: Number of model-free training iterations per
            train call.
    :param debug_summaries: A bool; if true, subclasses should gather debug summaries.
    :param steps_per_transition_model_update: Number of steps between transition model updates.
    :param steps_per_model_free_agent_update: Number of steps between model-free agent updates.
    :param replay_buffer_capacity: Capacity of the buffer collecting real samples.
    :param number_of_initial_random_policy_steps: If > 0, some initial training data is
            gathered by running a random policy on the real environment.
    :param use_tf_function: If `True`, use a `tf.function` for data collection.

    """
    tf.compat.v1.set_random_seed(random_seed)

    environment = create_real_tf_environment(env_name, gym_random_seed)
    evaluation_environment = create_real_tf_environment(env_name, gym_random_seed)

    callbacks = [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)]
    reward_model = reward_model_class(
        environment.observation_spec(), environment.action_spec()
    )
    initial_state_distribution_model = initial_state_distribution_model_class(
        environment.observation_spec()
    )
    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = MepoAgent(
        environment.time_step_spec(),
        environment.action_spec(),
        transition_model_type,
        num_hidden_layers_model,
        num_hidden_nodes_model,
        activation_function_model,
        ensemble_size,
        predict_state_difference,
        epochs,
        training_batch_size,
        callbacks,
        reward_model,
        initial_state_distribution_model,
        trajectory_sampler_type,
        horizon,
        population_size,
        model_free_agent_type,
        num_hidden_layers_agent,
        num_hidden_nodes_agent,
        activation_function_agent,
        model_free_training_iterations,
        debug_summaries=debug_summaries,
        train_step_counter=global_step,
    )

    agent_trainer = BackgroundPlanningAgentTrainer(
        steps_per_transition_model_update, steps_per_model_free_agent_update
    )

    experiment_harness = ExperimentHarness(
        root_dir,
        environment,
        evaluation_environment,
        agent,
        agent_trainer,
        replay_buffer_capacity,
        num_environment_steps,
        summary_interval,
        eval_interval,
        num_eval_episodes,
        number_of_initial_random_policy_steps,
        use_tf_function,
    )
    experiment_harness.run()
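The epochs and training_batch_size parameters documented above are forwarded to a Keras fit call, together with the EarlyStopping callback built in this function. The standalone sketch below (plain Keras on toy data, not bellman code) shows the behaviour: training stops once the monitored loss has failed to improve for patience consecutive epochs.

import numpy as np
import tensorflow as tf

# Toy regression model standing in for the transition model network.
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile(optimizer="adam", loss="mse")
x, y = np.random.rand(256, 4), np.random.rand(256, 1)

history = model.fit(
    x,
    y,
    epochs=100,     # plays the role of the `epochs` parameter above
    batch_size=32,  # plays the role of `training_batch_size`
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
    verbose=0,
)
print(len(history.history["loss"]))  # typically well below 100 thanks to early stopping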
Example #5
def train_eval(
    # harness
    # tensorboard files
    root_dir,
    # Params for collect
    num_environment_steps,
    # Params for eval
    num_eval_episodes,
    eval_interval,
    # Params for summaries
    summary_interval,
    # environment
    env_name,
    gym_random_seed,
    # agent
    random_seed,
    num_hidden_layers_agent,
    num_hidden_nodes_agent,
    discount_factor,
    lambda_value,
    max_kl,
    backtrack_coefficient,
    backtrack_iters,
    cg_iters,
    reward_normalizer,
    reward_norm_clipping,
    log_prob_clipping,
    value_train_iters,
    value_optimizer,
    gradient_clipping,
    debug,
    # agent trainer
    steps_per_policy_update,
    # agent specific harness parameters
    replay_buffer_capacity,
    use_tf_function,
):
    """
    This function will train and evaluate a TRPO agent.

    :param root_dir: Root directory where all experiments are stored.
    :param num_environment_steps: The number of environment steps to run the
            experiment for.
    :param num_eval_episodes: Number of episodes at each evaluation point.
    :param eval_interval: Interval for evaluation points.
    :param summary_interval: Interval for summaries.
    :param env_name: Name for the environment to load.
    :param gym_random_seed: Value to use as seed for the environment.
    :param random_seed: Global random seed for the experiment, used to seed TensorFlow via
            `tf.compat.v1.set_random_seed`.
    :param num_hidden_layers_agent: A model-free agent parameter, used for constructing neural
            networks for the actor and critic. The number of hidden layers in each network.
    :param num_hidden_nodes_agent: A model-free agent parameter, used for constructing neural
            networks for the actor and critic. The number of nodes in each hidden layer. This
            parameter is shared across all layers.
    :param discount_factor: discount factor in [0, 1]
    :param lambda_value: trace decay used by the GAE critic in [0, 1]
    :param max_kl: maximum KL distance between updated and old policy
    :param backtrack_coefficient: coefficient used in step size search
    :param backtrack_iters: number of iterations to perform in the line search
    :param cg_iters: number of conjugate gradient iterations to approximate natural gradient
    :param reward_normalizer: TensorNormalizer applied to rewards
    :param reward_norm_clipping: value to clip rewards
    :param log_prob_clipping: clip value for log probs in the policy gradient; None for no clipping
    :param value_train_iters: number of gradient steps to perform on value estimator
            for every policy update
    :param value_optimizer: optimizer used to train value_function (default: Adam)
    :param gradient_clipping: norm value used to clip gradients (None for no clipping)
    :param debug: debug flag to check computations for NaNs
    :param steps_per_policy_update: steps between policy updates
    :param replay_buffer_capacity: Capacity of the buffer collecting real samples.
    :param use_tf_function: If `True`, use a `tf.function` for data collection.
    """
    tf.compat.v1.set_random_seed(random_seed)

    environment = create_real_tf_environment(env_name, gym_random_seed)
    evaluation_environment = create_real_tf_environment(
        env_name, gym_random_seed)

    network_architecture = (num_hidden_nodes_agent,) * num_hidden_layers_agent
    actor_net = ActorDistributionNetwork(
        environment.observation_spec(),
        environment.action_spec(),
        fc_layer_params=network_architecture,
    )
    value_net = ValueNetwork(environment.observation_spec(),
                             fc_layer_params=network_architecture)
    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = TRPOAgent(
        environment.time_step_spec(),
        environment.action_spec(),
        actor_net,
        value_net,
        discount_factor,
        lambda_value,
        max_kl,
        backtrack_coefficient,
        backtrack_iters,
        cg_iters,
        reward_normalizer,
        reward_norm_clipping,
        log_prob_clipping,
        value_train_iters,
        value_optimizer,
        gradient_clipping,
        debug,
        train_step_counter=global_step,
    )

    agent_trainer = OnPolicyModelFreeAgentTrainer(steps_per_policy_update)

    experiment_harness = ExperimentHarness(
        root_dir,
        environment,
        evaluation_environment,
        agent,
        agent_trainer,
        replay_buffer_capacity,
        num_environment_steps,
        summary_interval,
        eval_interval,
        num_eval_episodes,
        number_of_initial_random_policy_steps=0,
        use_tf_function=use_tf_function,
    )
    experiment_harness.run()
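The discount_factor and lambda_value parameters documented above control Generalized Advantage Estimation. The illustrative NumPy function below (not bellman code) implements the GAE recursion to make the roles of the two parameters concrete: each advantage is an exponentially weighted sum of TD errors with decay discount_factor * lambda_value.

import numpy as np

def gae_advantages(rewards, values, discount_factor=0.95, lambda_value=0.5):
    # GAE for a single finite trajectory, bootstrapping with a terminal value of zero.
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t); A_t = delta_t + gamma * lambda * A_{t+1}.
    values = np.append(values, 0.0)
    advantages = np.zeros(len(rewards))
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + discount_factor * values[t + 1] - values[t]
        gae = delta + discount_factor * lambda_value * gae
        advantages[t] = gae
    return advantages

print(gae_advantages(np.ones(5), np.zeros(5)))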
Example #6
def test_experiment_harness_summaries_and_logs(
    caplog,
    tmpdir,
    summary_interval,
    evaluation_interval,
    number_of_initial_random_policy_steps,
):
    root_dir = str(tmpdir / "root_dir")
    train_interval = _MAX_STEPS
    total_steps = train_interval * 2
    caplog.set_level(logging.INFO)

    # define a simple agent
    environment = RandomTFEnvironment(TIMESTEP_SPEC, ACTION_SPEC)
    evaluation_environment = RandomTFEnvironment(TIMESTEP_SPEC, ACTION_SPEC)
    agent = MyAgent(time_step_spec=environment.time_step_spec(),
                    action_spec=environment.action_spec())
    agent_trainer = SingleComponentAgentTrainer(train_interval)

    # execute the experiment
    harness = ExperimentHarness(
        root_dir=root_dir,
        environment=environment,
        evaluation_environment=evaluation_environment,
        agent=agent,
        agent_trainer=agent_trainer,
        real_replay_buffer_capacity=_REAL_REPLAY_BUFFER_CAPACITY,
        total_number_of_environment_steps=total_steps,
        summary_interval=summary_interval,
        evaluation_interval=evaluation_interval,
        number_of_evaluation_episodes=1,
        number_of_initial_random_policy_steps=number_of_initial_random_policy_steps,
    )
    harness.run()

    # get correct paths
    experiment_id = os.listdir(root_dir)[0]

    # check wall clock time
    wallclock_time = get_metric_values(
        root_dir,
        TRAIN_METRICS_DIR,
        TIME_METRIC,
        [experiment_id],
        True,
    )
    assert experiment_id in wallclock_time and isinstance(
        wallclock_time[experiment_id], float)

    # check train and evaluation summary
    tag_name = "Metrics/AverageReturn"
    train_metric_values = get_metric_values(root_dir, TRAIN_METRICS_DIR,
                                            tag_name, [experiment_id])
    eval_metric_values = get_metric_values(root_dir, EVALUATION_METRICS_DIR,
                                           tag_name, [experiment_id])
    assert [*train_metric_values[experiment_id].keys()] == [
        i for i in range(summary_interval, total_steps +
                         summary_interval, summary_interval)
        if i <= total_steps
    ]
    assert [*eval_metric_values[experiment_id].keys()] == [
        i for i in range(evaluation_interval, total_steps +
                         summary_interval, evaluation_interval)
        if i <= total_steps
    ]

    # check record of training the models or agents
    tag_name = "TrainingLoss/" + SingleComponentAgent.COMPONENT.name
    train_metric_values = get_metric_values(root_dir, TRAIN_METRICS_DIR,
                                            tag_name, [experiment_id])
    assert [*train_metric_values[experiment_id].keys()] == [
        i for i in range(train_interval, total_steps + 1, train_interval)
        if i <= total_steps
    ]

    # check logs for accurate random and regular transition collection intervals
    random_start_steps = []
    regular_start_steps = []
    for record in caplog.records:
        if hasattr(record, "message") and "initial transitions" in record.message:
            random_start_steps.append(
                int("".join(filter(str.isdigit, record.message[0:15]))))
        if hasattr(record, "message") and "regular transitions" in record.message:
            regular_start_steps.append(
                int("".join(filter(str.isdigit, record.message[0:15]))))
    if number_of_initial_random_policy_steps > 0:
        assert random_start_steps == [
            i for i in range(0, number_of_initial_random_policy_steps,
                             harness._max_steps)
            if i <= number_of_initial_random_policy_steps and i <= total_steps
        ]
    if number_of_initial_random_policy_steps < total_steps:
        assert regular_start_steps == [
            i for i in range(number_of_initial_random_policy_steps,
                             total_steps, harness._max_steps)
            if i <= total_steps
        ]
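The assertions above reconstruct the environment steps at which train summaries and evaluations are expected to appear. A quick standalone check of that interval logic with hypothetical values:

summary_interval, evaluation_interval, total_steps = 2, 4, 8

expected_train_steps = [
    i for i in range(summary_interval, total_steps + summary_interval, summary_interval)
    if i <= total_steps
]
expected_eval_steps = [
    i for i in range(evaluation_interval, total_steps + summary_interval, evaluation_interval)
    if i <= total_steps
]

print(expected_train_steps)  # [2, 4, 6, 8]
print(expected_eval_steps)   # [4, 8]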