Example #1
def test_reinforce_agent_learning(env_name):
    """
    Extension of the test for an agent playing in the environment to include training.
    Note: This does not test that training improves the policy. It simply tests that the
    training loop runs without error.
    """
    # Set up environment using default parameters.
    # Environment parameters do not affect the test result here.
    tf_env, _ = rl_env_from_snc_env(
        load_scenario(
            env_name,
            job_gen_seed=10,
            override_env_params={'max_episode_length': 25})[1],
        discount_factor=0.99)

    # Set up a training step counter.
    global_step = tf.compat.v1.train.get_or_create_global_step()
    # Instantiate a REINFORCE agent
    reinforce_agent = create_reinforce_agent(tf_env,
                                             training_step_counter=global_step)

    # Instantiate a replay buffer.
    replay_buffer = TFUniformReplayBuffer(
        data_spec=reinforce_agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=1000)

    # Initialise the action network weights etc.
    reinforce_agent.initialize()

    # Use a driver to handle data collection for the agent. This handles much of the backend
    # TensorFlow setup and avoids earlier errors caused by episodes of differing lengths.
    collect_driver = DynamicEpisodeDriver(tf_env,
                                          reinforce_agent.collect_policy,
                                          observers=[replay_buffer.add_batch],
                                          num_episodes=2)

    # Get the initial states of the agent and environment before training.
    time_step = tf_env.reset()
    policy_state = reinforce_agent.collect_policy.get_initial_state(
        tf_env.batch_size)

    # Take a copy of the variables so that we can later verify that training changes the parameters.
    initial_vars = deepcopy(reinforce_agent.trainable_variables)
    assert len(initial_vars) > 0, "Agent has no trainable variables."

    # Set up a minimal training loop simply to check that the training mechanics work.
    for _ in range(5):
        # Collect experience.
        time_step, policy_state = collect_driver.run(time_step=time_step,
                                                     policy_state=policy_state)
        # Now the replay buffer should have data in it so we can collect the data and train the
        # agent.
        experience = replay_buffer.gather_all()
        reinforce_agent.train(experience)
        # Clear the replay buffer before the next round of data collection.
        replay_buffer.clear()

    # Check that training has had some effect
    for v1, v2 in zip(initial_vars, reinforce_agent.trainable_variables):
        assert not np.allclose(v1.numpy(), v2.numpy())
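The test above leans on project-specific helpers (load_scenario, rl_env_from_snc_env, create_reinforce_agent). As a rough, hedged sketch of the same collect/train/clear mechanics using only stock TF-Agents components, with CartPole standing in for the custom environment and illustrative hyperparameters:

# Minimal sketch of the same training mechanics with stock TF-Agents pieces.
# CartPole replaces the project-specific environment; hyperparameters are illustrative.
import tensorflow as tf
from tf_agents.agents.reinforce import reinforce_agent
from tf_agents.drivers.dynamic_episode_driver import DynamicEpisodeDriver
from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.networks import actor_distribution_network
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer

tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0'))

actor_net = actor_distribution_network.ActorDistributionNetwork(
    tf_env.observation_spec(), tf_env.action_spec(), fc_layer_params=(64,))
agent = reinforce_agent.ReinforceAgent(
    tf_env.time_step_spec(), tf_env.action_spec(),
    actor_network=actor_net,
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3))
agent.initialize()

replay_buffer = TFUniformReplayBuffer(
    agent.collect_data_spec, batch_size=tf_env.batch_size, max_length=1000)
driver = DynamicEpisodeDriver(
    tf_env, agent.collect_policy,
    observers=[replay_buffer.add_batch], num_episodes=2)

for _ in range(5):
    driver.run()                              # collect two full episodes
    agent.train(replay_buffer.gather_all())   # train on everything collected
    replay_buffer.clear()                     # REINFORCE is on-policy: discard old data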
Example #2
class SyncUniformExperienceReplayer(ExperienceReplayer):
    """
    For synchronous off-policy training.

    Example algorithms: DDPG, SAC
    """
    def __init__(self, experience_spec, batch_size):
        self._buffer = TFUniformReplayBuffer(experience_spec, batch_size)
        self._data_iter = None

    def observe(self, exp, env_ids=None):
        """
        For the sync driver, `exp` has the shape (`env_batch_size`, ...)
        with `num_envs`==1 and `unroll_length`==1. This function always ignores
        `env_ids`.
        """
        self._buffer.add_batch(exp)

    def replay(self, sample_batch_size, mini_batch_length):
        if self._data_iter is None:
            dataset = self._buffer.as_dataset(
                num_parallel_calls=3,
                sample_batch_size=sample_batch_size,
                num_steps=mini_batch_length).prefetch(3)
            self._data_iter = iter(dataset)
        return next(self._data_iter)

    def replay_all(self):
        return self._buffer.gather_all()

    def clear(self):
        self._buffer.clear()

    @property
    def batch_size(self):
        return self._buffer._batch_size
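A hedged usage sketch for the replayer above, assuming eager TF2 and that the surrounding framework supplies the real experience spec; the spec and tensors here are illustrative placeholders:

# Illustrative spec: one time step of experience for a batch of 8 parallel envs.
import tensorflow as tf

experience_spec = {
    'observation': tf.TensorSpec((4,), tf.float32),
    'action': tf.TensorSpec((), tf.int64),
    'reward': tf.TensorSpec((), tf.float32),
}
replayer = SyncUniformExperienceReplayer(experience_spec, batch_size=8)

# Each observe() call adds one time step for every environment in the batch.
for _ in range(4):
    replayer.observe({
        'observation': tf.zeros((8, 4), tf.float32),
        'action': tf.zeros((8,), tf.int64),
        'reward': tf.zeros((8,), tf.float32),
    })

# Sample 32 sub-sequences of length 2, returned batch-major as (B=32, T=2, ...).
experience, buffer_info = replayer.replay(sample_batch_size=32,
                                          mini_batch_length=2)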
Example #3
class SyncUniformExperienceReplayer(ExperienceReplayer):
    """
    For synchronous off-policy training.

    Example algorithms: DDPG, SAC
    """

    def __init__(self, experience_spec, batch_size):
        # TFUniformReplayBuffer does not support list in spec, we have to do
        # some conversion.
        self._experience_spec = experience_spec
        self._exp_has_list = nest_utils.nest_contains_list(experience_spec)
        tuple_experience_spec = nest_utils.nest_list_to_tuple(experience_spec)
        self._buffer = TFUniformReplayBuffer(tuple_experience_spec, batch_size)
        self._data_iter = None

    def _list_to_tuple(self, exp):
        if self._exp_has_list:
            return nest_utils.nest_list_to_tuple(exp)
        else:
            return exp

    def _tuple_to_list(self, exp):
        if self._exp_has_list:
            return nest_utils.nest_tuple_to_list(exp, self._experience_spec)
        else:
            return exp

    def observe(self, exp, env_ids=None):
        """
        For the sync driver, `exp` has the shape (`env_batch_size`, ...)
        with `num_envs`==1 and `unroll_length`==1. This function always ignores
        `env_ids`.
        """
        self._buffer.add_batch(self._list_to_tuple(exp))

    def replay(self, sample_batch_size, mini_batch_length):
        """Get a random batch.

        Args:
            sample_batch_size (int): number of sequences
            mini_batch_length (int): the length of each sequence
        Returns:
            Experience: experience batch in batch major (B, T, ...)
            tf_uniform_replay_buffer.BufferInfo: information about the batch
        """
        if self._data_iter is None:
            dataset = self._buffer.as_dataset(
                num_parallel_calls=3,
                sample_batch_size=sample_batch_size,
                num_steps=mini_batch_length).prefetch(3)
            self._data_iter = iter(dataset)
        exp, info = next(self._data_iter)
        return self._tuple_to_list(exp), info

    def replay_all(self):
        return self._tuple_to_list(self._buffer.gather_all())

    def clear(self):
        self._buffer.clear()

    @property
    def batch_size(self):
        return self._buffer._batch_size
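This variant exists because, as the constructor comment notes, TFUniformReplayBuffer cannot store Python lists inside its data spec. A hedged sketch with an illustrative spec that nests a list; the tuple conversion stays an internal storage detail and callers keep working with lists:

import tensorflow as tf

experience_spec = {
    # A list of per-sensor observations: exactly what the plain buffer rejects.
    'observation': [tf.TensorSpec((4,), tf.float32),
                    tf.TensorSpec((2,), tf.float32)],
    'reward': tf.TensorSpec((), tf.float32),
}
replayer = SyncUniformExperienceReplayer(experience_spec, batch_size=8)

# The caller passes (and later receives) lists; tuples are only used for storage.
replayer.observe({
    'observation': [tf.zeros((8, 4), tf.float32), tf.zeros((8, 2), tf.float32)],
    'reward': tf.zeros((8,), tf.float32),
})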
Example #4
def train_agent(
        env: TFPyEnvironment,
        agent: Union[ReinforceAgent, PPOAgent],
        data_collection_driver: DynamicEpisodeDriver,
        replay_buffer: TFUniformReplayBuffer,
        num_iters: int,
        global_step=None,
        metrics: Optional[Sequence[tf_metric.TFStepMetric]] = None,
        policy_metrics: Optional[Sequence[tf_metric.TFStepMetric]] = None,
        policy_summary_writers: Optional[Sequence[tf.summary.SummaryWriter]] = None,
        eval_env: Optional[TFPyEnvironment] = None,
        eval_summary_writer: Optional[tf.summary.SummaryWriter] = None,
        num_eval_episodes: int = 1,
        eval_metrics: Optional[List[tf_metric.TFStepMetric]] = None,
        per_step_eval_metrics: Optional[List[Any]] = None,
        eval_freq: int = 10,
        log_freq: int = 5,
        save_freq: int = 5,
        model_save_path: Optional[str] = None,
        tf_log_stream_path: Optional[str] = None) -> None:
    """
    Function for putting the pieces together to train and evaluate an agent.

    :param env: The environment for which the agent will be trained.
    :param agent: The agent to train.
    :param data_collection_driver: The driver used for data collection and metric tracking.
    :param replay_buffer: Replay buffer in which to store experience.
    :param num_iters: The number of training iterations to perform.
    :param global_step: A counter of the number of training iterations.
    :param metrics: A list of the metrics to track during training.
    :param policy_metrics: A list of metrics related to the policy distribution to track during
        training.
    :param policy_summary_writers: A list of summary writers to facilitate overlaying plots of
        policy metrics in TensorBoard.
    :param eval_env: The environment in which to play out evaluations of the policy.
    :param eval_summary_writer: The summary writer used for evaluation metrics.
    :param num_eval_episodes: The number of evaluation episodes to run at each evaluation point.
    :param eval_metrics: The metrics to track when evaluating the policy (with episodic resolution).
    :param per_step_eval_metrics: The metrics to track when evaluating the policy (with time step
        resolution).
    :param eval_freq: The number of training iterations between runs of policy evaluation logging.
    :param log_freq: The frequency with which to log values to TensorBoard.
    :param save_freq: The number of training iterations between model saves.
    :param model_save_path: Directory in which to save model checkpoints (weights etc). If None
        model will not be saved.
    :param tf_log_stream_path: Optional path passed through to `evaluate_policy` for streaming
        TensorFlow log output during evaluation.
    """
    # Get the initial states of the agent and environment before training.
    time_step = env.reset()
    policy_state = agent.collect_policy.get_initial_state(env.batch_size)

    # Set up the model saving infrastructure if a path to save to is provided.
    save_model = bool(model_save_path)
    if save_model:
        # Ensure that we save all trackable values (i.e. variables) from the TensorFlow Agent.
        checkpoint = tf.train.Checkpoint(agent=agent)
        # The checkpoint manager enables us to save multiple versions of the checkpoint at
        # different training steps. We keep the 20 most recent checkpoints to span a wide
        # section of training.
        checkpoint_manager = tf.train.CheckpointManager(checkpoint, model_save_path, max_to_keep=20)
    else:
        # Warn the user that training will continue but models will not be saved.
        warn("No save directory provided. Model will not be saved.")

    if metrics is None:
        metrics = []
    if policy_metrics is None:
        policy_metrics = []
    if policy_summary_writers is None:
        policy_summary_writers = []
    if per_step_eval_metrics is None:
        per_step_eval_metrics = []
    # Main training loop: collect experience, train the agent, log metrics and periodically
    # evaluate the policy and save the model.
    for i in range(num_iters):
        with tf.summary.record_if(lambda: tf.math.equal(global_step % log_freq, 0)):
            # Collect experience.
            time_step, policy_state = data_collection_driver.run(
                time_step=time_step,
                policy_state=policy_state
            )
            # Now the replay buffer should have data in it so we can collect the data and train the
            # agent.
            experience = replay_buffer.gather_all()
            agent.train(experience)
            # Clear the replay buffer before the next round of data collection.
            replay_buffer.clear()
            for metric in metrics:
                metric.tf_summaries(
                    train_step=global_step,
                    step_metrics=metrics[:2]
                )
            # Run the policy tracking metrics one at a time each on their own summary writer to
            # enable shared axes on TensorBoard.
            for metric, summary_writer in zip(policy_metrics, policy_summary_writers):
                with summary_writer.as_default():
                    tf.summary.scalar(name=metric.name, data=metric.result(), step=global_step)

        if eval_summary_writer and eval_metrics and eval_env:
            if i > 0 and global_step % eval_freq == 0:
                evaluate_policy(
                    eval_metrics,
                    eval_env,
                    agent.policy,
                    per_step_metrics=per_step_eval_metrics,
                    num_episodes=num_eval_episodes,
                    train_step=global_step,
                    summary_writer=eval_summary_writer,
                    summary_prefix="Metrics",
                    logging=True,
                    tf_log_stream_path=tf_log_stream_path
                )
        # Periodically save the model provided that we have the infrastructure in place.
        if save_model and i > 0 and (i + 1) % save_freq == 0:
            checkpoint_manager.save(i + 1)
        if i % max(1, num_iters // 100) == 0:
            print(f"\tCompleted: {i / num_iters * 100:.0f} %")
    if save_model:
        checkpoint_manager.save(num_iters)
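A hedged sketch of how train_agent might be invoked, reusing the objects built as in Example #1 (tf_env, reinforce_agent, collect_driver, replay_buffer, global_step); the iteration count and checkpoint directory are illustrative. Note that the agent should have been created with the same global_step as its training step counter so the logging and evaluation conditions advance.

train_agent(
    env=tf_env,
    agent=reinforce_agent,
    data_collection_driver=collect_driver,
    replay_buffer=replay_buffer,
    num_iters=200,
    global_step=global_step,
    model_save_path='/tmp/reinforce_checkpoints')  # illustrative path

# Restoring the most recent checkpoint later uses the same tf.train.Checkpoint
# machinery that train_agent itself relies on (agent rebuilt with the same specs).
import tensorflow as tf
checkpoint = tf.train.Checkpoint(agent=reinforce_agent)
manager = tf.train.CheckpointManager(checkpoint, '/tmp/reinforce_checkpoints', max_to_keep=20)
checkpoint.restore(manager.latest_checkpoint).expect_partial()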
Example #5
    def train_implementation(self, train_context: core.TrainContext):
        """Tf-Agents Reinforce Implementation of the train loop."""

        assert isinstance(train_context, core.EpisodesTrainContext)
        tc: core.EpisodesTrainContext = train_context
        self.log('Creating environment...')
        train_env = self._create_env(discount=tc.reward_discount_gamma)
        observation_spec = train_env.observation_spec()
        action_spec = train_env.action_spec()
        timestep_spec = train_env.time_step_spec()

        # Set up the optimizer, networks and ReinforceAgent
        self.log_api('AdamOptimizer', 'create')
        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=tc.learning_rate)

        self.log_api('ActorDistributionNetwork', 'create')
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            observation_spec,
            action_spec,
            fc_layer_params=self.model_config.fc_layers)

        self.log_api('ReinforceAgent', 'create')
        tf_agent = reinforce_agent.ReinforceAgent(timestep_spec,
                                                  action_spec,
                                                  actor_network=actor_net,
                                                  optimizer=optimizer)

        self.log_api('tf_agent.initialize()')
        tf_agent.initialize()
        self._trained_policy = tf_agent.policy

        # Set up data collection & buffering
        collect_data_spec = tf_agent.collect_data_spec
        self.log_api('TFUniformReplayBuffer', 'create')
        replay_buffer = TFUniformReplayBuffer(
            collect_data_spec, batch_size=1, max_length=tc.max_steps_in_buffer)
        self.log_api('DynamicEpisodeDriver', 'create')
        collect_driver = DynamicEpisodeDriver(
            train_env,
            tf_agent.collect_policy,
            observers=[replay_buffer.add_batch],
            num_episodes=tc.num_episodes_per_iteration)

        # Train
        collect_driver.run = common.function(collect_driver.run,
                                             autograph=False)
        tf_agent.train = common.function(tf_agent.train, autograph=False)

        self.log('Starting training...')
        while True:
            self.on_train_iteration_begin()
            msg = f'iteration {tc.iterations_done_in_training:4} of {tc.num_iterations:<4}'
            self.log_api('collect_driver.run', msg)
            collect_driver.run()

            self.log_api('replay_buffer.gather_all', msg)
            trajectories = replay_buffer.gather_all()

            self.log_api('tf_agent.train', msg)
            loss_info = tf_agent.train(experience=trajectories)
            total_loss = loss_info.loss.numpy()
            self.log_api('', f'loss={total_loss:<7.1f}')

            self.log_api('replay_buffer.clear', msg)
            replay_buffer.clear()

            self.on_train_iteration_end(loss=total_loss)
            if tc.training_done:
                break
        return
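Once the loop above finishes, the trained policy can be evaluated with stock TF-Agents metrics. A hedged sketch, reusing the train_env and tf_agent objects built inside train_implementation (a separate evaluation environment would normally be preferable):

from tf_agents.drivers.dynamic_episode_driver import DynamicEpisodeDriver
from tf_agents.metrics import tf_metrics

avg_return = tf_metrics.AverageReturnMetric()
eval_driver = DynamicEpisodeDriver(
    train_env, tf_agent.policy, observers=[avg_return], num_episodes=10)
eval_driver.run()
print('average return over 10 episodes:', float(avg_return.result()))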
Example #6
    def train_implementation(self, train_context: core.TrainContext):
        """Tf-Agents Ppo Implementation of the train loop."""

        assert isinstance(train_context, core.PpoTrainContext)
        tc: core.PpoTrainContext = train_context
        train_env = self._create_env(discount=tc.reward_discount_gamma)
        observation_spec = train_env.observation_spec()
        action_spec = train_env.action_spec()
        timestep_spec = train_env.time_step_spec()

        # Set up the optimizer, networks and PPOAgent
        self.log_api('AdamOptimizer', '()')
        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=tc.learning_rate)

        self.log_api('ActorDistributionNetwork', '()')
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            observation_spec,
            action_spec,
            fc_layer_params=self.model_config.fc_layers)
        self.log_api('ValueNetwork', '()')
        value_net = value_network.ValueNetwork(
            observation_spec, fc_layer_params=self.model_config.fc_layers)

        self.log_api('PpoAgent', '()')
        tf_agent = ppo_agent.PPOAgent(timestep_spec,
                                      action_spec,
                                      optimizer,
                                      actor_net=actor_net,
                                      value_net=value_net,
                                      num_epochs=tc.num_epochs_per_iteration)
        self.log_api('tf_agent.initialize', '()')
        tf_agent.initialize()
        self._trained_policy = tf_agent.policy

        # Set up data collection & buffering
        collect_data_spec = tf_agent.collect_data_spec
        self.log_api('TFUniformReplayBuffer', '()')
        replay_buffer = TFUniformReplayBuffer(
            collect_data_spec, batch_size=1, max_length=tc.max_steps_in_buffer)

        collect_policy = tf_agent.collect_policy
        self.log_api('DynamicEpisodeDriver', '()')
        collect_driver = DynamicEpisodeDriver(
            train_env,
            collect_policy,
            observers=[replay_buffer.add_batch],
            num_episodes=tc.num_episodes_per_iteration)

        # Train
        collect_driver.run = common.function(collect_driver.run,
                                             autograph=False)
        tf_agent.train = common.function(tf_agent.train, autograph=False)

        while True:
            self.on_train_iteration_begin()
            self.log_api(
                '-----',
                f'iteration {tc.iterations_done_in_training:4} of {tc.num_iterations:<4}      -----'
            )
            self.log_api('collect_driver.run', '()')
            collect_driver.run()

            self.log_api('replay_buffer.gather_all', '()')
            trajectories = replay_buffer.gather_all()

            self.log_api('tf_agent.train', '(experience=...)')
            loss_info = tf_agent.train(experience=trajectories)
            total_loss = loss_info.loss.numpy()
            actor_loss = loss_info.extra.policy_gradient_loss.numpy()
            critic_loss = loss_info.extra.value_estimation_loss.numpy()
            self.log_api(
                '',
                f'loss={total_loss:<7.1f} [actor={actor_loss:<7.1f} critic={critic_loss:<7.1f}]'
            )

            self.log_api('replay_buffer.clear', '()')
            replay_buffer.clear()

            self.on_train_iteration_end(loss=total_loss,
                                        actor_loss=actor_loss,
                                        critic_loss=critic_loss)
            if tc.training_done:
                break
        return
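The PPO loop above surfaces the actor and critic losses separately. A hedged sketch of sending these per-iteration values to TensorBoard, in the spirit of the summary writing done in train_agent earlier; the log directory and the `iteration` counter are illustrative:

import tensorflow as tf

# `iteration` stands in for the framework's counter (e.g. tc.iterations_done_in_training).
summary_writer = tf.summary.create_file_writer('/tmp/ppo_logs')
with summary_writer.as_default():
    tf.summary.scalar('loss/total', total_loss, step=iteration)
    tf.summary.scalar('loss/actor', actor_loss, step=iteration)
    tf.summary.scalar('loss/critic', critic_loss, step=iteration)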
Example #7
class TFAgentsPPOAgent(RLAgent):
    def __init__(self,
                 name=None,
                 actor_net=None,
                 value_net=None,
                 predictor=None,
                 keep_models_fixed=False,
                 featurizer=None):
        super().__init__(name, predictor, keep_models_fixed, featurizer)

        action_spec = BoundedTensorSpec((1, ), tf.int64, 0,
                                        ACTION_DIMENSIONS - 1)

        # We store both the mask and the actual observation in the observation
        # dict given to the agent so that the two stay associated; see also
        # https://github.com/tensorflow/agents/issues/125#issuecomment-496583325
        observation_spec = {
            'state': TensorSpec((self.featurizer.state_dimension(), ),
                                tf.float32),
            'mask': TensorSpec((ACTION_DIMENSIONS, ), tf.float32)
        }

        layers = equal_spacing_fc(5, self.featurizer.state_dimension())

        if actor_net is None:
            self.actor_net = MaskedActorNetwork(observation_spec, action_spec,
                                                layers)
        else:
            self.actor_net = actor_net

        if value_net is None:
            self.value_net = DummyMaskedValueNetwork(observation_spec,
                                                     fc_layer_params=layers)
        else:
            self.value_net = value_net

        self.agent = tf_agents.agents.ppo.ppo_agent.PPOAgent(
            time_step_spec=ts.time_step_spec(observation_spec),
            action_spec=action_spec,
            actor_net=self.actor_net,
            value_net=self.value_net,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-5),
            discount_factor=1,
            use_gae=True,
            use_td_lambda_return=True,
            lambda_value=0.85,
            num_epochs=30,

            # the observations are dicts { 'state': ..., 'mask': ... }
            # normalization does not make any sense for the mask
            normalize_observations=False,
        )

        if actor_net is not None or value_net is not None:
            self.agent.initialize()
        else:
            self._create_train_checkpointer()

            # All the variables are in fact restored successfully, but only
            # lazily, once the relevant shapes are known. If some shapes never
            # become known, those variables are never restored. That is fine in
            # self-play, where all shapes are known after the first training
            # step, but it is a problem when playing against old versions,
            # because often some of the old versions aren't used (and their
            # value nets are never used because the old versions aren't
            # trained). This isn't an error, but TensorFlow emits warnings at
            # the end that are confusing if one doesn't know the cause, so we
            # silence them with .expect_partial(). For more information see
            # https://github.com/tensorflow/tensorflow/issues/27937#issuecomment-484683443
            # https://github.com/tensorflow/tensorflow/issues/27937#issuecomment-488356053

            self.train_checkpointer.initialize_or_restore().expect_partial()

        # tf_agents also exposes agent.policy, intended for evaluation and
        # deployment; agent.collect_policy is the policy meant for collecting
        # training data, which is what we need here.
        self.policy = self.agent.collect_policy

        # Because tf_agents wants the data as trajectories of the form
        # (prev_time_step, action, new_time_step), we have to hold on to
        # prev_time_step until new_time_step arrives and the trajectory can be
        # built; new_time_step then becomes the next prev_time_step.
        # This variable keeps track of prev_time_step.
        self.last_time_step = None

        # Even though PPO is on-policy, buffering the experience for a short
        # while is fine, and the examples in the tf_agents repo also use a
        # replay buffer.
        self.replay_buffer = TFUniformReplayBuffer(
            self.agent.collect_data_spec,
            batch_size=1,
            max_length=REPLAY_BUFFER_SIZE)
        self.replay_buffer_position = 0

        self.clone_counter = 0

    def _create_train_checkpointer(self):
        self.train_checkpointer = tf_agents.utils.common.Checkpointer(
            ckpt_dir=os.path.join(MODELS_PATH, self.name, 'Agent'),
            agent=self.agent)

    def _add_trajectory(self, prev_time_step, action, new_time_step):
        """Add a trajectory (prev_time_step, action, new_time_step) to the replay buffer

        Also train the agent on the whole buffer if it is full.
        """

        traj = tf_agents.trajectories.trajectory.from_transition(
            prev_time_step, action, new_time_step)

        self.replay_buffer.add_batch(traj)
        self.replay_buffer_position += 1

        if self.replay_buffer_position == REPLAY_BUFFER_SIZE + 1:
            if not self.keep_models_fixed:
                self.agent.train(self.replay_buffer.gather_all())
            self.replay_buffer_position = 0
            self.replay_buffer.clear()

    def act(self, observation, valid_action_mask):
        observation = {
            'state': np.array(observation, dtype=np.float32),
            'mask': valid_action_mask
        }

        if self.last_time_step is None:
            # a new episode started
            self.last_time_step = _to_tf_timestep(ts.restart(observation))
            self.last_action_step = self.policy.action(self.last_time_step)
            return self.last_action_step.action.numpy()[0, 0]

        new_time_step = _to_tf_timestep(
            ts.transition(observation, self.prev_reward))
        self._add_trajectory(self.last_time_step, self.last_action_step,
                             new_time_step)

        self.last_time_step = new_time_step
        self.last_action_step = self.policy.action(new_time_step)
        self.prev_reward = None

        return self.last_action_step.action.numpy()[0, 0]

    def observe(self, reward, terminal):
        if not terminal:
            self.prev_reward = reward
            return

        # Even when the episode ends, tf_agents expects an observation in
        # addition to the reward. Because no meaningful observation exists at
        # that point, we just give it an all-zeros observation.
        new_time_step = _to_tf_timestep(
            ts.termination(
                {
                    'state': np.zeros(self.featurizer.state_dimension()),
                    'mask': np.zeros(ACTION_DIMENSIONS)
                }, reward))

        self._add_trajectory(self.last_time_step, self.last_action_step,
                             new_time_step)

        self.last_time_step = None
        self.last_action_step = None
        self.prev_reward = None

    def clone(self, name=None):
        """Return a clone of this agent with networks & predictor shared"""

        if name is None:
            self.clone_counter += 1
            name = self.name + 'Clone' + str(self.clone_counter)

        return TFAgentsPPOAgent(name=name,
                                actor_net=self.actor_net,
                                value_net=self.value_net,
                                predictor=self.predictor,
                                keep_models_fixed=self.keep_models_fixed,
                                featurizer=self.featurizer)

    def save_models(self):
        """Save actor, critic and predictor

        Args:
            global_step: the current game number, is appended to
                the filenames of the saved models
        """

        if self.keep_models_fixed:
            return

        super().save_models(os.path.join(MODELS_PATH, self.name))
        if not hasattr(self, 'train_checkpointer'):
            self._create_train_checkpointer()
        self.train_checkpointer.save(0)
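A hedged sketch of how this agent might be driven by a game loop; `env`, `my_featurizer` and the reset/step interface are hypothetical placeholders for whatever supplies observations, action masks, rewards and the terminal flag:

agent = TFAgentsPPOAgent(name='ppo', featurizer=my_featurizer)  # my_featurizer is assumed

observation, mask = env.reset()           # hypothetical environment interface
done = False
while not done:
    action = agent.act(observation, mask)
    observation, mask, reward, done = env.step(action)
    agent.observe(reward, terminal=done)  # stores the reward, or closes the episode

agent.save_models()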