Example #1
def zeros_from_spec(nested_spec, batch_size):
    """Create nested zero Tensors or Distributions.

    A zero Tensor with shape[0]=`batch_size` is created for each TensorSpec, and
    a Distribution with all of its parameters as zero Tensors is created for each
    DistributionSpec.

    Args:
        nested_spec (nested TensorSpec or DistributionSpec): the nested spec
            for which zeros are created.
        batch_size (int): batch size added as the first dimension to the
            shapes in the TensorSpecs. If None, no batch dimension is added.
    Returns:
        nested Tensor or Distribution
    """

    def _zero_tensor(spec):
        if batch_size is None:
            shape = spec.shape
        else:
            spec_shape = tf.convert_to_tensor(value=spec.shape, dtype=tf.int32)
            shape = tf.concat(([batch_size], spec_shape), axis=0)
        dtype = spec.dtype
        return tf.zeros(shape, dtype)

    param_spec = nest_utils.to_distribution_param_spec(nested_spec)
    params = tf.nest.map_structure(_zero_tensor, param_spec)
    return nest_utils.params_to_distributions(params, nested_spec)
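A minimal usage sketch for the function above. It assumes that for a nest of
plain TensorSpecs (no DistributionSpec), nest_utils.to_distribution_param_spec
and nest_utils.params_to_distributions pass the specs and tensors through
unchanged; the spec below is purely illustrative.

import tensorflow as tf

# Hypothetical nested spec containing only TensorSpecs.
spec = {
    'observation': tf.TensorSpec(shape=(3,), dtype=tf.float32),
    'reward': tf.TensorSpec(shape=(), dtype=tf.float32),
}
batched = zeros_from_spec(spec, batch_size=4)
# batched['observation'] has shape (4, 3); batched['reward'] has shape (4,).
unbatched = zeros_from_spec(spec, batch_size=None)
# With batch_size=None the spec shapes are used as-is: (3,) and ().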
Example #2
    def rollout(self, max_num_steps, time_step, policy_state):
        counter = tf.zeros((), tf.int32)
        batch_size = self._env.batch_size
        maximum_iterations = math.ceil(max_num_steps / batch_size)

        def create_ta(s):
            return tf.TensorArray(dtype=s.dtype,
                                  size=maximum_iterations,
                                  element_shape=tf.TensorShape(
                                      [batch_size]).concatenate(s.shape))

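        # rollout_info may contain DistributionSpecs; a TensorArray can only
        # hold Tensors, so those specs are first replaced by the TensorSpecs
        # of the distributions' parameter tensors.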
        training_info_ta = tf.nest.map_structure(
            create_ta,
            self._training_info_spec._replace(
                rollout_info=nest_utils.to_distribution_param_spec(
                    self._training_info_spec.rollout_info)))

        [counter, time_step, policy_state, training_info_ta] = tf.while_loop(
            cond=lambda *_: True,
            body=self._rollout_loop_body,
            loop_vars=[counter, time_step, policy_state, training_info_ta],
            maximum_iterations=maximum_iterations,
            back_prop=False,
            name="rollout_loop")

        training_info = tf.nest.map_structure(lambda ta: ta.stack(),
                                              training_info_ta)

        training_info = nest_utils.params_to_distributions(
            training_info, self._training_info_spec)

        self._algorithm.summarize_rollout(training_info)
        self._algorithm.summarize_metrics()

        return time_step, policy_state
Example #3
    def _iter(self, time_step, policy_state):
        """One training iteration."""
        counter = tf.zeros((), tf.int32)
        batch_size = self._env.batch_size

        def create_ta(s):
            return tf.TensorArray(dtype=s.dtype,
                                  size=self._train_interval,
                                  element_shape=tf.TensorShape(
                                      [batch_size]).concatenate(s.shape))

        training_info_ta = tf.nest.map_structure(
            create_ta,
            self._training_info_spec._replace(
                info=nest_utils.to_distribution_param_spec(
                    self._training_info_spec.info)))

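        # Unroll self._train_interval steps inside a persistent GradientTape
        # with explicitly watched variables; train_complete() later uses this
        # tape to compute gradients through the whole unrolled segment.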
        with tf.GradientTape(watch_accessed_variables=False,
                             persistent=True) as tape:
            tape.watch(self._trainable_variables)
            [counter, next_time_step, next_state, training_info_ta
             ] = tf.while_loop(cond=lambda *_: True,
                               body=self._train_loop_body,
                               loop_vars=[
                                   counter, time_step, policy_state,
                                   training_info_ta
                               ],
                               back_prop=True,
                               parallel_iterations=1,
                               maximum_iterations=self._train_interval,
                               name='iter_loop')

            training_info = tf.nest.map_structure(lambda ta: ta.stack(),
                                                  training_info_ta)

            training_info = nest_utils.params_to_distributions(
                training_info, self._training_info_spec)

        loss_info, grads_and_vars = self._algorithm.train_complete(
            tape, training_info)

        del tape

        self._algorithm.summarize_train(training_info, loss_info,
                                        grads_and_vars)
        self._algorithm.summarize_metrics()

        common.get_global_counter().assign_add(1)

        return [next_time_step, next_state]
Example #4
    def set_exp_replayer(self, exp_replayer: str, num_envs):
        """Set experience replayer.

        Args:
            exp_replayer (str): type of experience replayer. One of ("one_time",
                "uniform")
            num_envs (int): the total number of environments from all batched
                environments.
        """
        if exp_replayer == "one_time":
            self._exp_replayer = OnetimeExperienceReplayer()
        elif exp_replayer == "uniform":
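            # The replay buffer stores Tensors, not Distribution objects, so
            # Distributions in the experience spec are represented by the
            # specs of their parameter tensors.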
            exp_spec = nest_utils.to_distribution_param_spec(
                self.experience_spec)
            self._exp_replayer = SyncUniformExperienceReplayer(
                exp_spec, num_envs)
        else:
            raise ValueError("invalid experience replayer name")
        self.add_experience_observer(self._exp_replayer.observe)
Example #5
    def set_exp_replayer(self,
                         exp_replayer: str,
                         num_envs,
                         num_actors=0,
                         unroll_length=0,
                         learn_queue_cap=0):
        """Set experience replayer.

        Args:
            exp_replayer (str): type of experience replayer. One of ("one_time",
                "uniform", "cycle_one_time")
            num_envs (int): the total number of environments from all batched
                environments/actors, which is num_actors * batch_size.
            num_actors (int): number of async actors; must be positive for the
                cycle_one_time replayer.
            unroll_length (int): number of env steps to unroll. Used by the
                cycle_one_time replayer.
            learn_queue_cap (int): number of actors to use for each mini-batch.
        """
        if exp_replayer == "one_time":
            self._exp_replayer = OnetimeExperienceReplayer()
        else:
            exp_spec = nest_utils.to_distribution_param_spec(
                self.experience_spec)
            if exp_replayer == "uniform":
                self._exp_replayer = SyncUniformExperienceReplayer(
                    exp_spec, num_envs)
            elif exp_replayer == "cycle_one_time":
                assert num_actors > 0
                assert unroll_length > 0
                self._exp_replayer = CyclicOneTimeExperienceReplayer(
                    exp_spec, num_envs, num_actors, unroll_length,
                    learn_queue_cap)
            else:
                raise ValueError("invalid experience replayer name")
        self.add_experience_observer(self._exp_replayer.observe)
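A hypothetical call for the cyclic replayer; the algorithm instance and the
numbers are illustrative (8 async actors, each driving a batched environment
of 4 envs, with each mini-batch drawn from 2 actors):

algorithm.set_exp_replayer(
    "cycle_one_time",
    num_envs=8 * 4,       # num_actors * per-actor batch size
    num_actors=8,
    unroll_length=100,
    learn_queue_cap=2)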
Example #6
    def _update(self, experience, weight):
        batch_size = tf.shape(experience.step_type)[1]
        counter = tf.zeros((), tf.int32)
        initial_train_state = common.get_initial_policy_state(
            batch_size, self.train_state_spec)
        if self._use_rollout_state:
            first_train_state = tf.nest.map_structure(
                lambda state: state[0, ...], experience.state)
        else:
            first_train_state = initial_train_state
        num_steps = tf.shape(experience.step_type)[0]

        def create_ta(s):
            # TensorArray cannot use Tensor (batch_size) as element_shape
            ta_batch_size = experience.step_type.shape[1]
            return tf.TensorArray(dtype=s.dtype,
                                  size=num_steps,
                                  element_shape=tf.TensorShape(
                                      [ta_batch_size]).concatenate(s.shape))

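        # Build one TensorArray per leaf of the (param-spec converted)
        # experience spec and unstack the experience along the time dimension
        # so the loop body below can read it back one step at a time.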
        experience_ta = tf.nest.map_structure(
            create_ta,
            nest_utils.to_distribution_param_spec(
                self.processed_experience_spec))
        experience_ta = tf.nest.map_structure(
            lambda elem, ta: ta.unstack(elem), experience, experience_ta)
        info_ta = tf.nest.map_structure(
            create_ta,
            nest_utils.to_distribution_param_spec(self.train_step_info_spec))

        scope = get_current_scope()

        def _train_loop_body(counter, policy_state, info_ta):
            exp = tf.nest.map_structure(lambda ta: ta.read(counter),
                                        experience_ta)
            exp = nest_utils.params_to_distributions(
                exp, self.processed_experience_spec)
            policy_state = common.reset_state_if_necessary(
                policy_state, initial_train_state,
                tf.equal(exp.step_type, StepType.FIRST))

            with tf.name_scope(scope):
                policy_step = self.train_step(exp, policy_state)

            info_ta = tf.nest.map_structure(
                lambda ta, x: ta.write(counter, x), info_ta,
                nest_utils.distributions_to_params(policy_step.info))

            counter += 1

            return [counter, policy_step.state, info_ta]

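        # Replay the stored experience through train_step() inside a
        # persistent GradientTape; train_complete() uses the tape afterwards
        # to compute gradients.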
        with tf.GradientTape(persistent=True,
                             watch_accessed_variables=False) as tape:
            tape.watch(self.trainable_variables)
            [_, _, info_ta] = tf.while_loop(
                cond=lambda counter, *_: tf.less(counter, num_steps),
                body=_train_loop_body,
                loop_vars=[counter, first_train_state, info_ta],
                back_prop=True,
                name="train_loop")
            info = tf.nest.map_structure(lambda ta: ta.stack(), info_ta)
            info = nest_utils.params_to_distributions(
                info, self.train_step_info_spec)
            experience = nest_utils.params_to_distributions(
                experience, self.processed_experience_spec)
            training_info = TrainingInfo(action=experience.action,
                                         reward=experience.reward,
                                         discount=experience.discount,
                                         step_type=experience.step_type,
                                         rollout_info=experience.rollout_info,
                                         info=info,
                                         env_id=experience.env_id)

        loss_info, grads_and_vars = self.train_complete(
            tape=tape, training_info=training_info, weight=weight)

        del tape

        return training_info, loss_info, grads_and_vars
Example #7
    def __init__(self,
                 num_envs,
                 env_batch_size,
                 learn_queue_cap,
                 actor_queue_cap,
                 time_step_spec,
                 policy_step_spec,
                 unroll_length,
                 num_actor_queues=1):
        """
        Create five kinds of queues:
        1. one learner queue
            stores batches of training trajectories
            all agent threads should enqueue unrolled trajectories into it
        2. `num_actor_queues` actor queues
            each queue stores batches of observations from some envs to act upon
            all agent threads should enqueue current observations into one of
            the actor queues to get predicted actions
        3. `num_envs` action-returning queues
            each env holds one such queue for receiving the returned action
            predicted by the actor
        4. one log queue
            the logging thread retrieves trajectory data from this queue
        5. `num_envs` env-unroll queues
            there is a one-to-one mapping from a queue to an env. Each queue
            accumulates `unroll_length` time steps before they are used for
            training.

        These queues are used for communication between the learner & actor
        threads and between the actor & logging threads. We manage them in a
        centralized way to facilitate closing them.

        Args:
            num_envs (int): number of tf_agents batched environments running in
                parallel. Each environment could be a batch of environments!
            env_batch_size (int): number of envs contained by each batched env
            learn_queue_cap (int): the capacity of the learner queue
            actor_queue_cap (int): the capacity of an actor queue
            time_step_spec (tf.nest): see OffPolicyAsyncDriver._prepare_specs();
                used for creating queues
            policy_step_spec (tf.nest): see OffPolicyAsyncDriver._prepare_specs();
                used for creating queues
            unroll_length (int): how many time steps each environment proceeds
                before training
            num_actor_queues (int): number of actor queues running in parallel
        """
        batch_time_step_spec = repeat_shape_n(time_step_spec, env_batch_size)
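        # The queues below carry Tensors between threads, so any
        # DistributionSpec inside policy_step_spec is replaced by the
        # TensorSpecs of its parameter tensors.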
        batch_policy_step_spec = repeat_shape_n(
            nest_utils.to_distribution_param_spec(policy_step_spec),
            env_batch_size)
        unrolled_time_step_spec = repeat_shape_n(batch_time_step_spec,
                                                 unroll_length)
        unrolled_policy_step_spec = repeat_shape_n(batch_policy_step_spec,
                                                   unroll_length)

        self._batch_state_spec = batch_policy_step_spec.state

        self.learn_queue = NestFIFOQueue(
            capacity=learn_queue_cap,
            sample_element=LearningBatch(
                time_step=unrolled_time_step_spec,
                state=unrolled_policy_step_spec.state,
                policy_step=unrolled_policy_step_spec,
                next_time_step=unrolled_time_step_spec))

        self.log_queue = NestFIFOQueue(capacity=num_envs,
                                       sample_element=[
                                           unrolled_time_step_spec,
                                           unrolled_policy_step_spec,
                                           unrolled_time_step_spec,
                                           tf.ones((), dtype=tf.int32)
                                       ])

        tf.debugging.assert_greater_equal(num_envs,
                                          num_actor_queues * actor_queue_cap,
                                          message="not enough environments!")

        self.actor_queues = [
            NestFIFOQueue(capacity=actor_queue_cap,
                          sample_element=[
                              batch_time_step_spec,
                              batch_policy_step_spec.state,
                              tf.ones((), dtype=tf.int32)
                          ]) for i in range(num_actor_queues)
        ]

        self.action_return_queues = [
            NestFIFOQueue(capacity=1, sample_element=batch_policy_step_spec)
            for i in range(num_envs)
        ]

        self.env_unroll_queues = [
            NestFIFOQueue(capacity=unroll_length,
                          sample_element=LearningBatch(
                              time_step=batch_time_step_spec,
                              state=batch_policy_step_spec.state,
                              policy_step=batch_policy_step_spec,
                              next_time_step=batch_time_step_spec))
            for i in range(num_envs)
        ]