def train_eval( # tensorboard files root_dir, # environment env_name="CartPole-v1", random_seed=0, # Params for collect num_environment_steps=100000, replay_buffer_capacity=1001, # Per-environment # Params for eval num_eval_episodes=30, eval_interval=200, # Params for summaries summary_interval=50, ): tf.compat.v1.set_random_seed(random_seed) environment = TFPyEnvironment(suite_gym.load(env_name)) evaluation_environment = TFPyEnvironment(suite_gym.load(env_name)) actor_net = ActorDistributionNetwork(environment.observation_spec(), environment.action_spec(), fc_layer_params=(200, 100)) value_net = ValueNetwork(environment.observation_spec(), fc_layer_params=(200, 100)) global_step = tf.compat.v1.train.get_or_create_global_step() agent = PPOClipAgent( # should be closer to the paper than PPOAgent... environment.time_step_spec(), environment.action_spec(), optimizer=tf.compat.v1.train.AdamOptimizer( ), # default None does not work actor_net=actor_net, value_net=value_net, importance_ratio_clipping=0.2, normalize_observations=False, normalize_rewards=False, use_gae=True, lambda_value=0.5, discount_factor=0.95, train_step_counter=global_step, ) agent_trainer = OnPolicyModelFreeAgentTrainer(400) experiment_harness = ExperimentHarness( root_dir, environment, evaluation_environment, agent, agent_trainer, replay_buffer_capacity, num_environment_steps, summary_interval, eval_interval, num_eval_episodes, number_of_initial_random_policy_steps=0, use_tf_function=True, ) experiment_harness.run()
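
# A minimal invocation sketch for the PPO example above, assuming this module is run as a
# standalone script with the TF-Agents and harness dependencies already imported; the log
# directory and the reduced step count are hypothetical values, not defaults of this module.
if __name__ == "__main__":
    train_eval(root_dir="/tmp/ppo_cartpole", num_environment_steps=10000)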
def _experiment_harness_fixture(tmpdir) -> ExperimentHarness:
    root_dir = str(tmpdir / "root_dir")
    environment = RandomTFEnvironment(TIMESTEP_SPEC, ACTION_SPEC)
    evaluation_environment = RandomTFEnvironment(TIMESTEP_SPEC, ACTION_SPEC)
    agent = MyAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
    )
    agent_trainer = SingleComponentAgentTrainer()

    return ExperimentHarness(
        root_dir=root_dir,
        environment=environment,
        evaluation_environment=evaluation_environment,
        agent=agent,
        agent_trainer=agent_trainer,
        real_replay_buffer_capacity=_REAL_REPLAY_BUFFER_CAPACITY,
        total_number_of_environment_steps=_MAX_STEPS,
        summary_interval=1,
        evaluation_interval=_MAX_STEPS,
        number_of_evaluation_episodes=1,
        number_of_initial_random_policy_steps=0,
    )
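
# A minimal sketch of how the fixture above might be consumed, assuming it is registered with
# @pytest.fixture under the same name elsewhere in this module; the test name is illustrative.
def test_experiment_harness_fixture_runs(_experiment_harness_fixture):
    harness = _experiment_harness_fixture
    # Running the harness end-to-end on the random environments should complete without error.
    harness.run()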
def train_eval( # tensorboard files root_dir, # environment env_name="Pendulum-v0", random_seed=0, # Params for collect num_environment_steps=100000, replay_buffer_capacity=1001, # Per-environment # Params for eval num_eval_episodes=30, eval_interval=200, # Params for summaries summary_interval=50, ): tf.compat.v1.set_random_seed(random_seed) environment = TFPyEnvironment(suite_gym.load(env_name)) evaluation_environment = TFPyEnvironment(suite_gym.load(env_name)) critic_network = CriticNetwork( input_tensor_spec=(environment.observation_spec(), environment.action_spec()), observation_fc_layer_params=None, action_fc_layer_params=None, joint_fc_layer_params=(200, 100), ) actor_network = ActorNetwork( input_tensor_spec=environment.observation_spec(), output_tensor_spec=environment.action_spec(), fc_layer_params=(200, 100), ) global_step = tf.compat.v1.train.get_or_create_global_step() agent = DdpgAgent( time_step_spec=environment.time_step_spec(), action_spec=environment.action_spec(), critic_network=critic_network, actor_network=actor_network, actor_optimizer=tf.compat.v1.train.AdamOptimizer(), critic_optimizer=tf.compat.v1.train.AdamOptimizer(), train_step_counter=global_step, ) agent_trainer = OffPolicyModelFreeAgentTrainer(1, 256) experiment_harness = ExperimentHarness( root_dir, environment, evaluation_environment, agent, agent_trainer, replay_buffer_capacity, num_environment_steps, summary_interval, eval_interval, num_eval_episodes, number_of_initial_random_policy_steps=0, use_tf_function=True, ) experiment_harness.run()
def train_eval(
    # harness
    # tensorboard files
    root_dir,
    # Params for collect
    num_environment_steps,
    # Params for eval
    num_eval_episodes,
    eval_interval,
    # Params for summaries
    summary_interval,
    # environment
    env_name,
    gym_random_seed,
    reward_model_class,
    initial_state_distribution_model_class,
    # agent
    random_seed,
    transition_model_type,
    num_hidden_layers_model,
    num_hidden_nodes_model,
    activation_function_model,
    ensemble_size,
    predict_state_difference,
    epochs,
    training_batch_size,
    trajectory_sampler_type,
    horizon,
    population_size,
    model_free_agent_type,
    num_hidden_layers_agent,
    num_hidden_nodes_agent,
    activation_function_agent,
    model_free_training_iterations,
    debug_summaries,
    # agent trainer
    steps_per_transition_model_update,
    steps_per_model_free_agent_update,
    # agent specific harness parameters
    replay_buffer_capacity,
    number_of_initial_random_policy_steps,
    use_tf_function,
):
    """
    Train and evaluate an MEPO agent.

    :param root_dir: Root directory where all experiments are stored.
    :param num_environment_steps: The number of environment steps to run the experiment for.
    :param num_eval_episodes: Number of episodes at each evaluation point.
    :param eval_interval: Interval for evaluation points.
    :param summary_interval: Interval for summaries.
    :param env_name: Name of the environment to load.
    :param gym_random_seed: Value to use as the seed for the environment.
    :param reward_model_class: A component of the environment model that describes the rewards.
        At the moment only pre-specified reward models are allowed, i.e. the agent assumes the
        reward function is known.
    :param initial_state_distribution_model_class: A component of the environment model that
        describes the initial state distribution (can be either deterministic or probabilistic).
        At the moment only pre-specified initial state distribution models are allowed, i.e. the
        agent assumes the initial state distribution is known.
    :param random_seed: Value used to set TensorFlow's global random seed.
    :param transition_model_type: An indicator of which of the available transition models should
        be used - the list can be found in `TransitionModelType`. A component of the environment
        model that describes the transition dynamics.
    :param num_hidden_layers_model: A transition model parameter, used for constructing a neural
        network. The number of hidden layers in the neural network.
    :param num_hidden_nodes_model: A transition model parameter, used for constructing a neural
        network. The number of nodes in each hidden layer. The parameter is shared across all
        layers.
    :param activation_function_model: A transition model parameter, used for constructing a
        neural network. The activation function of the hidden nodes.
    :param ensemble_size: A transition model parameter, used for constructing a neural network.
        The number of networks in the ensemble.
    :param predict_state_difference: A transition model parameter, used for constructing a neural
        network. A boolean indicating whether the transition model predicts the difference
        between the current and the next state, or the next state directly.
    :param epochs: A transition model parameter, used by the Keras fit method. The number of
        epochs used for training the neural network.
    :param training_batch_size: A transition model parameter, used by the Keras fit method. The
        batch size used for training the neural network.
    :param trajectory_sampler_type: An indicator of which of the available trajectory samplers
        should be used - the list can be found in `TrajectorySamplerType`. The trajectory sampler
        determines how predictions from an ensemble of neural networks that model the transition
        dynamics are sampled. Only works with ensemble-type transition models.
    :param horizon: A trajectory optimiser parameter. The number of steps taken in the
        environment in each virtual rollout.
    :param population_size: A trajectory optimiser parameter. The number of virtual rollouts that
        are simulated in each iteration during trajectory optimisation.
    :param model_free_agent_type: Type of model-free agent, e.g. PPO or TRPO.
    :param num_hidden_layers_agent: A model-free agent parameter, used for constructing neural
        networks for the actor and critic. The number of hidden layers in the neural network.
    :param num_hidden_nodes_agent: A model-free agent parameter, used for constructing neural
        networks for the actor and critic. The number of nodes in each hidden layer. The
        parameter is shared across all layers.
    :param activation_function_agent: A model-free agent parameter, used for constructing a
        neural network. The activation function of the hidden nodes.
    :param model_free_training_iterations: Number of model-free training iterations per train
        call.
    :param debug_summaries: A bool; if true, subclasses should gather debug summaries.
    :param steps_per_transition_model_update: Number of steps between transition model updates.
    :param steps_per_model_free_agent_update: Number of steps between model-free agent updates.
    :param replay_buffer_capacity: Capacity of the buffer collecting real samples.
    :param number_of_initial_random_policy_steps: If > 0, some initial training data is gathered
        by running a random policy on the real environment.
    :param use_tf_function: If `True`, use a `tf.function` for data collection.
    """
    tf.compat.v1.set_random_seed(random_seed)

    environment = create_real_tf_environment(env_name, gym_random_seed)
    evaluation_environment = create_real_tf_environment(env_name, gym_random_seed)

    callbacks = [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)]
    reward_model = reward_model_class(
        environment.observation_spec(), environment.action_spec()
    )
    initial_state_distribution_model = initial_state_distribution_model_class(
        environment.observation_spec()
    )

    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = MepoAgent(
        environment.time_step_spec(),
        environment.action_spec(),
        transition_model_type,
        num_hidden_layers_model,
        num_hidden_nodes_model,
        activation_function_model,
        ensemble_size,
        predict_state_difference,
        epochs,
        training_batch_size,
        callbacks,
        reward_model,
        initial_state_distribution_model,
        trajectory_sampler_type,
        horizon,
        population_size,
        model_free_agent_type,
        num_hidden_layers_agent,
        num_hidden_nodes_agent,
        activation_function_agent,
        model_free_training_iterations,
        debug_summaries=debug_summaries,
        train_step_counter=global_step,
    )

    agent_trainer = BackgroundPlanningAgentTrainer(
        steps_per_transition_model_update, steps_per_model_free_agent_update
    )

    experiment_harness = ExperimentHarness(
        root_dir,
        environment,
        evaluation_environment,
        agent,
        agent_trainer,
        replay_buffer_capacity,
        num_environment_steps,
        summary_interval,
        eval_interval,
        num_eval_episodes,
        number_of_initial_random_policy_steps,
        use_tf_function,
    )
    experiment_harness.run()
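
# The transition model above is trained with Keras `fit`, so `epochs`, `training_batch_size`
# and the EarlyStopping callback behave exactly as in plain Keras. The toy model and data in
# this sketch are illustrative only and are not part of the harness.
def _early_stopping_sketch():
    import numpy as np
    import tensorflow as tf

    toy_model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(3,))])
    toy_model.compile(optimizer="adam", loss="mse")
    x, y = np.random.randn(32, 3), np.random.randn(32, 1)
    # Training stops early once the loss has not improved for 3 consecutive epochs.
    toy_model.fit(
        x,
        y,
        epochs=100,
        batch_size=8,
        callbacks=[tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
        verbose=0,
    )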
def train_eval(
    # harness
    # tensorboard files
    root_dir,
    # Params for collect
    num_environment_steps,
    # Params for eval
    num_eval_episodes,
    eval_interval,
    # Params for summaries
    summary_interval,
    # environment
    env_name,
    gym_random_seed,
    # agent
    random_seed,
    num_hidden_layers_agent,
    num_hidden_nodes_agent,
    discount_factor,
    lambda_value,
    max_kl,
    backtrack_coefficient,
    backtrack_iters,
    cg_iters,
    reward_normalizer,
    reward_norm_clipping,
    log_prob_clipping,
    value_train_iters,
    value_optimizer,
    gradient_clipping,
    debug,
    # agent trainer
    steps_per_policy_update,
    # agent specific harness parameters
    replay_buffer_capacity,
    use_tf_function,
):
    """
    Train and evaluate a TRPO agent.

    :param root_dir: Root directory where all experiments are stored.
    :param num_environment_steps: The number of environment steps to run the experiment for.
    :param num_eval_episodes: Number of episodes at each evaluation point.
    :param eval_interval: Interval for evaluation points.
    :param summary_interval: Interval for summaries.
    :param env_name: Name of the environment to load.
    :param gym_random_seed: Value to use as the seed for the environment.
    :param random_seed: Value used to set TensorFlow's global random seed.
    :param num_hidden_layers_agent: A model-free agent parameter, used for constructing neural
        networks for the actor and critic. The number of hidden layers in the neural network.
    :param num_hidden_nodes_agent: A model-free agent parameter, used for constructing neural
        networks for the actor and critic. The number of nodes in each hidden layer. The
        parameter is shared across all layers.
    :param discount_factor: Discount factor in [0, 1].
    :param lambda_value: Trace decay used by the GAE critic, in [0, 1].
    :param max_kl: Maximum KL divergence between the updated and the old policy.
    :param backtrack_coefficient: Coefficient used in the step size line search.
    :param backtrack_iters: Number of iterations to perform in the line search.
    :param cg_iters: Number of conjugate gradient iterations used to approximate the natural
        gradient.
    :param reward_normalizer: TensorNormalizer applied to rewards.
    :param reward_norm_clipping: Value to clip rewards to.
    :param log_prob_clipping: Clip value for log probabilities in the policy gradient (None for
        no clipping).
    :param value_train_iters: Number of gradient steps to perform on the value estimator for
        every policy update.
    :param value_optimizer: Optimizer used to train the value function (default: Adam).
    :param gradient_clipping: Norm value used to clip gradients (None for no clipping).
    :param debug: Debug flag to check computations for NaNs.
    :param steps_per_policy_update: Number of steps between policy updates.
    :param replay_buffer_capacity: Capacity of the buffer collecting real samples.
    :param use_tf_function: If `True`, use a `tf.function` for data collection.
""" tf.compat.v1.set_random_seed(random_seed) environment = create_real_tf_environment(env_name, gym_random_seed) evaluation_environment = create_real_tf_environment( env_name, gym_random_seed) network_architecture = (num_hidden_nodes_agent, ) * num_hidden_layers_agent actor_net = ActorDistributionNetwork( environment.observation_spec(), environment.action_spec(), fc_layer_params=network_architecture, ) value_net = ValueNetwork(environment.observation_spec(), fc_layer_params=network_architecture) global_step = tf.compat.v1.train.get_or_create_global_step() agent = TRPOAgent( environment.time_step_spec(), environment.action_spec(), actor_net, value_net, discount_factor, lambda_value, max_kl, backtrack_coefficient, backtrack_iters, cg_iters, reward_normalizer, reward_norm_clipping, log_prob_clipping, value_train_iters, value_optimizer, gradient_clipping, debug, train_step_counter=global_step, ) agent_trainer = OnPolicyModelFreeAgentTrainer(steps_per_policy_update) experiment_harness = ExperimentHarness( root_dir, environment, evaluation_environment, agent, agent_trainer, replay_buffer_capacity, num_environment_steps, summary_interval, eval_interval, num_eval_episodes, number_of_initial_random_policy_steps=0, use_tf_function=use_tf_function, ) experiment_harness.run()
def test_experiment_harness_summaries_and_logs(
    caplog,
    tmpdir,
    summary_interval,
    evaluation_interval,
    number_of_initial_random_policy_steps,
):
    root_dir = str(tmpdir / "root_dir")
    train_interval = _MAX_STEPS
    total_steps = train_interval * 2
    caplog.set_level(logging.INFO)

    # define a simple agent
    environment = RandomTFEnvironment(TIMESTEP_SPEC, ACTION_SPEC)
    evaluation_environment = RandomTFEnvironment(TIMESTEP_SPEC, ACTION_SPEC)
    agent = MyAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
    )
    agent_trainer = SingleComponentAgentTrainer(train_interval)

    # execute the experiment
    harness = ExperimentHarness(
        root_dir=root_dir,
        environment=environment,
        evaluation_environment=evaluation_environment,
        agent=agent,
        agent_trainer=agent_trainer,
        real_replay_buffer_capacity=_REAL_REPLAY_BUFFER_CAPACITY,
        total_number_of_environment_steps=total_steps,
        summary_interval=summary_interval,
        evaluation_interval=evaluation_interval,
        number_of_evaluation_episodes=1,
        number_of_initial_random_policy_steps=number_of_initial_random_policy_steps,
    )
    harness.run()

    # get correct paths
    experiment_id = os.listdir(root_dir)[0]

    # check wall clock time
    wallclock_time = get_metric_values(
        root_dir,
        TRAIN_METRICS_DIR,
        TIME_METRIC,
        [experiment_id],
        True,
    )
    assert experiment_id in wallclock_time and isinstance(
        wallclock_time[experiment_id], float
    )

    # check train and evaluation summary
    tag_name = "Metrics/AverageReturn"
    train_metric_values = get_metric_values(
        root_dir, TRAIN_METRICS_DIR, tag_name, [experiment_id]
    )
    eval_metric_values = get_metric_values(
        root_dir, EVALUATION_METRICS_DIR, tag_name, [experiment_id]
    )
    assert [*train_metric_values[experiment_id].keys()] == [
        i
        for i in range(summary_interval, total_steps + summary_interval, summary_interval)
        if i <= total_steps
    ]
    assert [*eval_metric_values[experiment_id].keys()] == [
        i
        for i in range(evaluation_interval, total_steps + summary_interval, evaluation_interval)
        if i <= total_steps
    ]

    # check record of training the models or agents
    tag_name = "TrainingLoss/" + SingleComponentAgent.COMPONENT.name
    train_metric_values = get_metric_values(
        root_dir, TRAIN_METRICS_DIR, tag_name, [experiment_id]
    )
    assert [*train_metric_values[experiment_id].keys()] == [
        i for i in range(train_interval, total_steps + 1, train_interval) if i <= total_steps
    ]

    # check logs for accurate random and regular transition collection intervals
    random_start_steps = []
    regular_start_steps = []
    for record in caplog.records:
        if hasattr(record, "message") and "initial transitions" in record.message:
            random_start_steps.append(int("".join(filter(str.isdigit, record.message[0:15]))))
        if hasattr(record, "message") and "regular transitions" in record.message:
            regular_start_steps.append(int("".join(filter(str.isdigit, record.message[0:15]))))

    if number_of_initial_random_policy_steps > 0:
        assert random_start_steps == [
            i
            for i in range(0, number_of_initial_random_policy_steps, harness._max_steps)
            if i <= number_of_initial_random_policy_steps and i <= total_steps
        ]
    if number_of_initial_random_policy_steps < total_steps:
        assert regular_start_steps == [
            i
            for i in range(number_of_initial_random_policy_steps, total_steps, harness._max_steps)
            if i <= total_steps
        ]
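
# The summary assertions above expect one entry at every multiple of the relevant interval up
# to total_steps. A standalone sketch of that expectation, with illustrative values rather
# than the module's _MAX_STEPS:
def _expected_summary_steps(interval, total_steps):
    # e.g. interval=2, total_steps=10 -> [2, 4, 6, 8, 10]
    return [i for i in range(interval, total_steps + interval, interval) if i <= total_steps]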