Example #1
def test_reinforce_agent_learning(env_name):
    """
    Extension of the test for an agent playing in the environment to include training.
    Note: This does not test that training improves the policy. It simply tests that the
    training loop runs without error.
    """
    # Set up environment using default parameters.
    # Environment parameters do not affect the test result here.
    tf_env, _ = rl_env_from_snc_env(
        load_scenario(env_name,
                      job_gen_seed=10,
                      override_env_params={'max_episode_length': 25})[1],
        discount_factor=0.99)

    # Set up a training step counter.
    global_step = tf.compat.v1.train.get_or_create_global_step()
    # Instantiate a REINFORCE agent
    reinforce_agent = create_reinforce_agent(tf_env,
                                             training_step_counter=global_step)

    # Instantiate a replay buffer.
    replay_buffer = TFUniformReplayBuffer(
        data_spec=reinforce_agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=1000)

    # Initialise the action network weights etc.
    reinforce_agent.initialize()

    # Use a driver to handle data collection for the agent. This handles much of the backend
    # TensorFlow setup and resolves earlier issues with episodes of differing lengths.
    collect_driver = DynamicEpisodeDriver(tf_env,
                                          reinforce_agent.collect_policy,
                                          observers=[replay_buffer.add_batch],
                                          num_episodes=2)

    # Get the initial states of the agent and environment before training.
    time_step = tf_env.reset()
    policy_state = reinforce_agent.collect_policy.get_initial_state(
        tf_env.batch_size)

    # Take a copy of the variables so that we can later check that training changes the parameters.
    initial_vars = deepcopy(reinforce_agent.trainable_variables)
    assert len(initial_vars) > 0, "Agent has no trainable variables."

    # Set up a minimal training loop to simply test training mechanics work.
    for _ in range(5):
        # Collect experience.
        time_step, policy_state = collect_driver.run(time_step=time_step,
                                                     policy_state=policy_state)
        # Now the replay buffer should have data in it so we can collect the data and train the
        # agent.
        experience = replay_buffer.gather_all()
        reinforce_agent.train(experience)
        # Clear the replay buffer and return to play.
        replay_buffer.clear()

    # Check that training has had some effect
    for v1, v2 in zip(initial_vars, reinforce_agent.trainable_variables):
        assert not np.allclose(v1.numpy(), v2.numpy())
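The test above leans on project-specific helpers (rl_env_from_snc_env, load_scenario, create_reinforce_agent). For reference, the same collect-train-clear pattern can be sketched with stock tf_agents components only; the sketch below is an illustrative assumption against CartPole-v0, and the network size and learning rate are not values taken from the test suite.

import tensorflow as tf
from tf_agents.agents.reinforce import reinforce_agent
from tf_agents.drivers.dynamic_episode_driver import DynamicEpisodeDriver
from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.networks import actor_distribution_network
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer

# Wrap a standard gym environment for TensorFlow execution.
tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load("CartPole-v0"))

# Build a small actor network and a REINFORCE agent from stock tf_agents parts.
actor_net = actor_distribution_network.ActorDistributionNetwork(
    tf_env.observation_spec(), tf_env.action_spec(), fc_layer_params=(64,))
agent = reinforce_agent.ReinforceAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    actor_network=actor_net,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3))
agent.initialize()

# Buffer for collected episodes and a driver that plays two episodes per run.
replay_buffer = TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec, batch_size=tf_env.batch_size, max_length=1000)
collect_driver = DynamicEpisodeDriver(
    tf_env, agent.collect_policy, observers=[replay_buffer.add_batch], num_episodes=2)

# The same smoke-test loop as above: collect, train, clear.
time_step = tf_env.reset()
policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
for _ in range(5):
    time_step, policy_state = collect_driver.run(time_step=time_step,
                                                 policy_state=policy_state)
    agent.train(replay_buffer.gather_all())
    replay_buffer.clear()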
def get_collection_driver(
        env: TFPyEnvironment, agent: Union[ReinforceAgent, PPOAgent], observers: List[Any],
        policy_observers: Optional[List[tf_metric.TFStepMetric]], num_episodes: int
    ) -> DynamicEpisodeDriver:
    """
    Sets up a driver which will run data collection and in-training metric tracking.
    The driver is defined in tf_agents and handles agent play and monitoring as well as data
    storage for a fixed number of episodes at a time. This driver will be run to collect data once
    per training iteration.

    :param env: The TensorFlow environment object which will be run.
    :param agent: The agent to play in the environment.
    :param observers: A list of operations (including metrics to track) which will be executed in
        play to collect data and perform logging.
    :param policy_observers: A list of metrics to track which are executed in play throughout
        training.
    :param num_episodes: The number of episodes to play out in each driver run.
    :return: A driver to use for data collection (and in-play performance tracking)
    """
    collection_driver = DynamicEpisodeDriver(
        env,
        agent.collect_policy,
        observers=list(observers) + list(policy_observers or []),
        num_episodes=num_episodes
    )
    # Wrap the run function for faster execution.
    collection_driver.run = tf.function(collection_driver.run)
    return collection_driver
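A brief usage sketch for get_collection_driver follows; it assumes tf_env, agent and replay_buffer have already been constructed as in the surrounding examples, and the particular metrics chosen are illustrative.

from tf_agents.metrics import tf_metrics

# Metrics updated on every driver run, alongside storing data in the buffer.
train_metrics = [tf_metrics.NumberOfEpisodes(), tf_metrics.EnvironmentSteps()]
# Metrics on policy performance, passed in via policy_observers.
policy_metrics = [tf_metrics.AverageReturnMetric(batch_size=tf_env.batch_size)]

collection_driver = get_collection_driver(
    tf_env,
    agent,
    observers=[replay_buffer.add_batch] + train_metrics,
    policy_observers=policy_metrics,
    num_episodes=2)

# Each call plays out two episodes, filling the buffer and updating the metrics.
time_step, policy_state = collection_driver.run()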
Example #3
def test_planning_policy_batch_environment_model():
    """
    Ensure that planning policy is operational.
    """

    # number of trajectories for planning and planning horizon
    population_size = 3
    planner_horizon = 5
    number_of_particles = 1

    # setup the environment and a model of it
    py_env = suite_gym.load("MountainCar-v0")
    tf_env = TFPyEnvironment(py_env)
    reward = MountainCarReward(tf_env.observation_spec(), tf_env.action_spec())
    terminates = MountainCarTermination(tf_env.observation_spec())
    network = LinearTransitionNetwork(tf_env.observation_spec())
    transition_model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    initial_state = MountainCarInitialState(tf_env.observation_spec())
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=terminates,
        initial_state_distribution_model=initial_state,
    )

    # setup the trajectory optimiser
    random_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                   tf_env.action_spec())
    trajectory_optimiser = PolicyTrajectoryOptimiser(random_policy,
                                                     planner_horizon,
                                                     population_size,
                                                     number_of_particles)
    planning_policy = PlanningPolicy(environment_model, trajectory_optimiser)

    # test whether it runs
    collect_driver_planning_policy = DynamicEpisodeDriver(tf_env,
                                                          planning_policy,
                                                          num_episodes=1)
    time_step = tf_env.reset()
    collect_driver_planning_policy.run(time_step)
Example #4
  def build_driver(self):
    """Build elements of the data pipeline."""
    observers = [self.replay_buffer.add_batch]
    driver = DynamicEpisodeDriver(
        env=self.tf_env,
        policy=self.agent.collect_policy,
        observers=observers)

    dataset = self.replay_buffer.as_dataset(
        num_parallel_calls=3,
        sample_batch_size=self.batch_size,
        num_steps=self.agent.train_sequence_length
    ).prefetch(3)
    iterator = iter(dataset)
    return driver, iterator
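As a sketch of how the driver/iterator pair returned by build_driver might be consumed, assuming the same class also owns self.agent as above; the train method and its iteration count are hypothetical.

  def train(self, num_iterations: int = 1000):
    """Illustrative loop consuming the driver/iterator pair from build_driver (sketch)."""
    driver, iterator = self.build_driver()
    for _ in range(num_iterations):
      # Play one episode with the collect policy, adding experience to the replay buffer.
      driver.run()
      # Sample a batch of experience and take a single training step.
      experience, _ = next(iterator)
      self.agent.train(experience)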
def train_agent(
        env: TFPyEnvironment,
        agent: Union[ReinforceAgent, PPOAgent],
        data_collection_driver: DynamicEpisodeDriver,
        replay_buffer: TFUniformReplayBuffer,
        num_iters: int,
        global_step=None,
        metrics: Optional[Sequence[tf_metric.TFStepMetric]] = None,
        policy_metrics: Optional[Sequence[tf_metric.TFStepMetric]] = None,
        policy_summary_writers: Optional[Sequence[tf.summary.SummaryWriter]] = None,
        eval_env: Optional[TFPyEnvironment] = None,
        eval_summary_writer: Optional[tf.summary.SummaryWriter] = None,
        num_eval_episodes: int = 1,
        eval_metrics: Optional[List[tf_metric.TFStepMetric]] = None,
        per_step_eval_metrics: Optional[List[Any]] = None,
        eval_freq: int = 10,
        log_freq: int = 5,
        save_freq: int = 5,
        model_save_path: Optional[str] = None,
        tf_log_stream_path: Optional[str] = None) -> None:
    """
    Function for putting the pieces together to train and evaluate an agent.

    :param env: The environment for which the agent will be trained.
    :param agent: The agent to train.
    :param data_collection_driver: The driver used for data collection and metric tracking.
    :param replay_buffer: Replay buffer in which to store experience.
    :param num_iters: The number of training iterations to perform.
    :param global_step: A counter of the number of training iterations.
    :param metrics: A list of the metrics to track during training.
    :param policy_metrics: A list of metrics related to the policy distribution to track during
        training.
    :param policy_summary_writers: A list of summary writers to facilitate overlaying plots of
        policy metrics in TensorBoard.
    :param eval_env: The environment in which to play out evaluations of the policy.
    :param eval_summary_writer: The summary writer used for evaluation metrics.
    :param num_eval_episodes: The number of evaluation episodes to run at each evaluation point.
    :param eval_metrics: The metrics to track when evaluating the policy (with episodic resolution).
    :param per_step_eval_metrics: The metrics to track when evaluating the policy (with time step
        resolution).
    :param eval_freq: The number of training iterations between runs of policy evaluation logging.
    :param log_freq: The frequency with which to log values to TensorBoard.
    :param save_freq: The number of training iterations between model saves.
    :param model_save_path: Directory in which to save model checkpoints (weights etc). If None
        model will not be saved.
    :param tf_log_stream_path: Path to a file to which tf.print calls are written. If None,
        tf.print statements print to sys.stdout.
    """
    # Get the initial states of the agent and environment before training.
    time_step = env.reset()
    policy_state = agent.collect_policy.get_initial_state(env.batch_size)

    # Set up the model saving infrastructure if a path to save to is provided.
    save_model = bool(model_save_path)
    if save_model:
        # Ensure that we save all trackable values (i.e. variables) from the TensorFlow Agent.
        checkpoint = tf.train.Checkpoint(agent=agent)
        # The checkpoint manager enables us to save multiple versions of the check point at
        # different training steps. We save the 20 most recent saves to span a wide section of
        # training.
        checkpoint_manager = tf.train.CheckpointManager(checkpoint, model_save_path, max_to_keep=20)
    else:
        # Warn the user that training will continue but models will not be saved.
        warn("No save directory provided. Model will not be saved.")

    if metrics is None:
        metrics = []
    if per_step_eval_metrics is None:
        per_step_eval_metrics = []
    # Main training loop: collect experience, train on it, then clear the replay buffer.
    for i in range(num_iters):
        with tf.summary.record_if(lambda: tf.math.equal(global_step % log_freq, 0)):
            # Collect experience.
            time_step, policy_state = data_collection_driver.run(
                time_step=time_step,
                policy_state=policy_state
            )
            # Now the replay buffer should have data in it so we can collect the data and train the
            # agent.
            experience = replay_buffer.gather_all()
            agent.train(experience)
            # Clear the replay buffer and return to play.
            replay_buffer.clear()
            for metric in metrics:
                metric.tf_summaries(
                    train_step=global_step,
                    step_metrics=metrics[:2]
                )
            # Run the policy tracking metrics one at a time each on their own summary writer to
            # enable shared axes on TensorBoard.
            for metric, summary_writer in zip(policy_metrics, policy_summary_writers):
                with summary_writer.as_default():
                    tf.summary.scalar(name=metric.name, data=metric.result(), step=global_step)

        if eval_summary_writer and eval_metrics and eval_env:
            if i > 0 and global_step % eval_freq == 0:
                evaluate_policy(
                    eval_metrics,
                    eval_env,
                    agent.policy,
                    per_step_metrics=per_step_eval_metrics,
                    num_episodes=num_eval_episodes,
                    train_step=global_step,
                    summary_writer=eval_summary_writer,
                    summary_prefix="Metrics",
                    logging=True,
                    tf_log_stream_path=tf_log_stream_path
                )
        # Periodically save the model provided that we have the infrastructure in place.
        if save_model and i > 0 and (i + 1) % save_freq == 0:
            checkpoint_manager.save(i + 1)
        # Print progress roughly every 1% of training (guard against num_iters < 100).
        if i % max(num_iters // 100, 1) == 0:
            print(f"\tCompleted: {i / num_iters * 100} %")
    # Save the final model state, provided saving is enabled.
    if save_model:
        checkpoint_manager.save(num_iters)
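A minimal sketch of wiring train_agent together with get_collection_driver from above; it assumes tf_env and agent (a ReinforceAgent or PPOAgent) already exist, and the metric choices, iteration count and log directory are illustrative.

import tensorflow as tf
from tf_agents.metrics import tf_metrics
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer

global_step = tf.compat.v1.train.get_or_create_global_step()

replay_buffer = TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec, batch_size=tf_env.batch_size, max_length=5000)

# Metrics updated during data collection; the first two serve as step metrics.
train_metrics = [tf_metrics.NumberOfEpisodes(), tf_metrics.EnvironmentSteps()]
policy_metrics = [tf_metrics.AverageReturnMetric(batch_size=tf_env.batch_size)]
policy_summary_writers = [tf.summary.create_file_writer("./logs/policy")]

data_collection_driver = get_collection_driver(
    tf_env,
    agent,
    observers=[replay_buffer.add_batch] + train_metrics,
    policy_observers=policy_metrics,
    num_episodes=2)

train_agent(
    tf_env,
    agent,
    data_collection_driver,
    replay_buffer,
    num_iters=200,
    global_step=global_step,
    metrics=train_metrics,
    policy_metrics=policy_metrics,
    policy_summary_writers=policy_summary_writers,
    model_save_path=None)  # pass a directory here to enable checkpointing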
def evaluate_policy(metrics: List[Any],
                    environment: TFPyEnvironment,
                    policy: tf_agents.policies.tf_policy.Base,
                    per_step_metrics: Optional[List[tf.Module]] = None,
                    num_episodes: int = 1,
                    train_step: Optional[Any] = None,
                    summary_writer: Optional[tf.summary.SummaryWriter] = None,
                    summary_prefix: str = "Eval",
                    logging: bool = False,
                    tf_log_stream_path: Optional[str] = None) -> None:
    """
    Track performance (via metrics) using policy in the environment provided.
    Prints a dictionary of results {metric_name: metric_value}.

    *NOTE*: Because placeholders are not compatible with Eager mode this is not compatible with
    python policies.

    This function is adapted from tf_agents.eval.metric_utils.eager_compute to allow for per time
    step logging.

    :param metrics: List of metrics to compute.
    :param environment: tf_environment instance.
    :param policy: tf_policy instance used to step the environment.
    :param per_step_metrics: List of metrics to be passed as observers to run every time step during
        evaluation.
    :param num_episodes: Number of episodes to compute the metrics over.
    :param train_step: An optional step to write summaries against.
    :param summary_writer: An optional writer for generating metric summaries.
    :param summary_prefix: An optional prefix scope for metric summaries.
    :param logging: Option to enable logging to the console of standard metrics.
    :param tf_log_stream_path: Path to a file which tf.print calls are set to write to. If none
        tf.print statements print to sys.stdout.
    """
    # Default to an empty list if no per-step metrics are provided.
    if per_step_metrics is None:
        per_step_metrics = []

    # Reset the state of all metrics (e.g. running totals for averages).
    for metric in metrics + per_step_metrics:
        metric.reset()

    # Attain the initial state of the environment and policy.
    time_step = environment.reset()
    policy_state = policy.get_initial_state(environment.batch_size)

    # Set up a driver to run the evaluation episodes while logging the desired metrics.
    driver = DynamicEpisodeDriver(
        environment,
        policy,
        observers=metrics,
        transition_observers=per_step_metrics,
        num_episodes=num_episodes)

    # Run the driver, which plays out the evaluation episodes and updates the metrics.
    driver.run(time_step, policy_state)

    # If we have the required prerequisites then perform the TensorBoard logging as well as logging
    # results to the console.
    if train_step and summary_writer:
        # Utilise a (possibly) different summary writer to put the evaluation metrics to
        # TensorBoard.
        with summary_writer.as_default():
            for m in metrics:
                # Attain the full name of the metric to record.
                tag = "/".join([summary_prefix, m.name])
                # Simply calculating and forming the scalar summary in the current context with a
                # default summary writer does the logging to TensorBoard for us.
                tf.summary.scalar(name=tag, data=m.result(), step=train_step)
    # If requested, also log the metrics to the console.
    if logging and train_step:
        for m in metrics:
            tf.print(f"Evaluation at step {train_step.numpy()}: {m.name}\t{m.result()}",
                     output_stream=f'file://{tf_log_stream_path}' if tf_log_stream_path else
                     sys.stdout)
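For completeness, a short usage sketch of evaluate_policy; it assumes a trained agent and a separate evaluation environment eval_env are available, and the metrics and log directory are illustrative.

import tensorflow as tf
from tf_agents.metrics import tf_metrics

eval_metrics = [
    tf_metrics.AverageReturnMetric(buffer_size=10),
    tf_metrics.AverageEpisodeLengthMetric(buffer_size=10),
]
eval_summary_writer = tf.summary.create_file_writer("./logs/eval")
train_step = tf.compat.v1.train.get_or_create_global_step()

evaluate_policy(
    eval_metrics,
    eval_env,
    agent.policy,  # evaluate the greedy policy rather than the collect policy
    num_episodes=5,
    train_step=train_step,
    summary_writer=eval_summary_writer,
    logging=True)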
Example #7
    def train_implementation(self, train_context: core.TrainContext):
        """Tf-Agents Reinforce Implementation of the train loop."""

        assert isinstance(train_context, core.EpisodesTrainContext)
        tc: core.EpisodesTrainContext = train_context
        self.log('Creating environment...')
        train_env = self._create_env(discount=tc.reward_discount_gamma)
        observation_spec = train_env.observation_spec()
        action_spec = train_env.action_spec()
        timestep_spec = train_env.time_step_spec()

        # Set up the optimizer, actor network and ReinforceAgent
        self.log_api('AdamOptimizer', 'create')
        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=tc.learning_rate)

        self.log_api('ActorDistributionNetwork', 'create')
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            observation_spec,
            action_spec,
            fc_layer_params=self.model_config.fc_layers)

        self.log_api('ReinforceAgent', 'create')
        tf_agent = reinforce_agent.ReinforceAgent(timestep_spec,
                                                  action_spec,
                                                  actor_network=actor_net,
                                                  optimizer=optimizer)

        self.log_api('tf_agent.initialize()')
        tf_agent.initialize()
        self._trained_policy = tf_agent.policy

        # Set up data collection & buffering
        collect_data_spec = tf_agent.collect_data_spec
        self.log_api('TFUniformReplayBuffer', 'create')
        replay_buffer = TFUniformReplayBuffer(
            collect_data_spec, batch_size=1, max_length=tc.max_steps_in_buffer)
        self.log_api('DynamicEpisodeDriver', 'create')
        collect_driver = DynamicEpisodeDriver(
            train_env,
            tf_agent.collect_policy,
            observers=[replay_buffer.add_batch],
            num_episodes=tc.num_episodes_per_iteration)

        # Train
        collect_driver.run = common.function(collect_driver.run,
                                             autograph=False)
        tf_agent.train = common.function(tf_agent.train, autograph=False)

        self.log('Starting training...')
        while True:
            self.on_train_iteration_begin()
            msg = f'iteration {tc.iterations_done_in_training:4} of {tc.num_iterations:<4}'
            self.log_api('collect_driver.run', msg)
            collect_driver.run()

            self.log_api('replay_buffer.gather_all', msg)
            trajectories = replay_buffer.gather_all()

            self.log_api('tf_agent.train', msg)
            loss_info = tf_agent.train(experience=trajectories)
            total_loss = loss_info.loss.numpy()
            self.log_api('', f'loss={total_loss:<7.1f}')

            self.log_api('replay_buffer.clear', msg)
            replay_buffer.clear()

            self.on_train_iteration_end(loss=total_loss)
            if tc.training_done:
                break
        return
Example #8
    def train_implementation(self, train_context: core.TrainContext):
        """Tf-Agents Ppo Implementation of the train loop."""

        assert isinstance(train_context, core.PpoTrainContext)
        tc: core.PpoTrainContext = train_context
        train_env = self._create_env(discount=tc.reward_discount_gamma)
        observation_spec = train_env.observation_spec()
        action_spec = train_env.action_spec()
        timestep_spec = train_env.time_step_spec()

        # Set up the optimizer, networks and PPOAgent
        self.log_api('AdamOptimizer', '()')
        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=tc.learning_rate)

        self.log_api('ActorDistributionNetwork', '()')
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            observation_spec,
            action_spec,
            fc_layer_params=self.model_config.fc_layers)
        self.log_api('ValueNetwork', '()')
        value_net = value_network.ValueNetwork(
            observation_spec, fc_layer_params=self.model_config.fc_layers)

        self.log_api('PpoAgent', '()')
        tf_agent = ppo_agent.PPOAgent(timestep_spec,
                                      action_spec,
                                      optimizer,
                                      actor_net=actor_net,
                                      value_net=value_net,
                                      num_epochs=tc.num_epochs_per_iteration)
        self.log_api('tf_agent.initialize', '()')
        tf_agent.initialize()
        self._trained_policy = tf_agent.policy

        # Set up data collection & buffering
        collect_data_spec = tf_agent.collect_data_spec
        self.log_api('TFUniformReplayBuffer', '()')
        replay_buffer = TFUniformReplayBuffer(
            collect_data_spec, batch_size=1, max_length=tc.max_steps_in_buffer)

        collect_policy = tf_agent.collect_policy
        self.log_api('DynamicEpisodeDriver', '()')
        collect_driver = DynamicEpisodeDriver(
            train_env,
            collect_policy,
            observers=[replay_buffer.add_batch],
            num_episodes=tc.num_episodes_per_iteration)

        # Train
        collect_driver.run = common.function(collect_driver.run,
                                             autograph=False)
        tf_agent.train = common.function(tf_agent.train, autograph=False)

        while True:
            self.on_train_iteration_begin()
            self.log_api(
                '-----',
                f'iteration {tc.iterations_done_in_training:4} of {tc.num_iterations:<4}      -----'
            )
            self.log_api('collect_driver.run', '()')
            collect_driver.run()

            self.log_api('replay_buffer.gather_all', '()')
            trajectories = replay_buffer.gather_all()

            self.log_api('tf_agent.train', '(experience=...)')
            loss_info = tf_agent.train(experience=trajectories)
            total_loss = loss_info.loss.numpy()
            actor_loss = loss_info.extra.policy_gradient_loss.numpy()
            critic_loss = loss_info.extra.value_estimation_loss.numpy()
            self.log_api(
                '',
                f'loss={total_loss:<7.1f} [actor={actor_loss:<7.1f} critic={critic_loss:<7.1f}]'
            )

            self.log_api('replay_buffer.clear', '()')
            replay_buffer.clear()

            self.on_train_iteration_end(loss=total_loss,
                                        actor_loss=actor_loss,
                                        critic_loss=critic_loss)
            if tc.training_done:
                break
        return
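Stripped of the surrounding framework (the TrainContext, log_api calls and iteration callbacks), the same PPO collect-train-clear cycle can be sketched with stock tf_agents components alone; the environment, network sizes and iteration counts below are illustrative assumptions.

import tensorflow as tf
from tf_agents.agents.ppo import ppo_agent
from tf_agents.drivers.dynamic_episode_driver import DynamicEpisodeDriver
from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.networks import actor_distribution_network, value_network
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer
from tf_agents.utils import common

train_env = tf_py_environment.TFPyEnvironment(suite_gym.load("CartPole-v0"))

actor_net = actor_distribution_network.ActorDistributionNetwork(
    train_env.observation_spec(), train_env.action_spec(), fc_layer_params=(64, 64))
value_net = value_network.ValueNetwork(
    train_env.observation_spec(), fc_layer_params=(64, 64))

tf_agent = ppo_agent.PPOAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
    actor_net=actor_net,
    value_net=value_net,
    num_epochs=5)
tf_agent.initialize()

replay_buffer = TFUniformReplayBuffer(
    tf_agent.collect_data_spec, batch_size=train_env.batch_size, max_length=10000)
collect_driver = DynamicEpisodeDriver(
    train_env, tf_agent.collect_policy,
    observers=[replay_buffer.add_batch], num_episodes=5)

# Wrap the hot paths as TensorFlow functions for faster execution.
collect_driver.run = common.function(collect_driver.run, autograph=False)
tf_agent.train = common.function(tf_agent.train, autograph=False)

for _ in range(20):
    collect_driver.run()  # collect a batch of full episodes
    loss_info = tf_agent.train(experience=replay_buffer.gather_all())
    replay_buffer.clear()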