Code example #1
    def update(
        self,
        policy_state: types.NestedTensor,
        trajectories: Trajectory,
        number_of_particles: int,
    ) -> types.NestedTensor:
        """
        Update the policy state at the end of each iteration.

        Note that each of the trajectories in the batch should be of the same length.
        Trajectories cannot terminate and restart.

        :param policy_state: A nest of tensors with details about policy.
        :param trajectories: A time-stacked trajectory object.
        :param number_of_particles: Number of Monte Carlo rollouts of each action trajectory.
        """
        assert (self._num_elites <= trajectories.discount.shape[0]
                ), "num_elites needs to be smaller than population size"
        assert tf.equal(
            tf.reduce_all(trajectories.is_boundary()[:, :-1]), False
        ), "No trajectories in the batch should contain a terminal state before the final step."
        assert tf.equal(
            tf.reduce_all(trajectories.is_boundary()[:, -1]), True
        ), "All trajectories in the batch must end in a terminal state."

        returns = averaged_particle_returns(trajectories.reward,
                                            trajectories.discount,
                                            number_of_particles)

        sorted_idx = tf.argsort(returns, direction="DESCENDING")
        elite_idx = sorted_idx[:self._num_elites]
        elites = tf.gather(
            trajectories.action, elite_idx
        )  # shape = (number of elites, horizon) + action_spec.shape

        elites_mean = tf.reduce_mean(
            elites, axis=0)  # shape = (horizon,) + action_spec.shape
        elites_var = tf.reduce_mean(
            tf.math.square(elites - elites_mean),
            axis=0)  # shape = (horizon,) + action_spec.shape

        old_mean, old_var, low, high, _, step_index = policy_state

        new_mean = (
            1.0 - self._lr
        ) * old_mean + self._lr * elites_mean  # shape = (horizon,) + action_spec.shape
        new_var = (
            1.0 - self._lr
        ) * old_var + self._lr * elites_var  # shape = (horizon,) + action_spec.shape

        new_actions = sample_action_batch(new_mean, new_var, low, high,
                                          returns.shape[0])

        return tf.nest.pack_sequence_as(
            policy_state,
            [
                new_mean, new_var, low, high, new_actions,
                tf.zeros_like(step_index)
            ],
        )
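The update above is a cross-entropy-method (CEM) style refit of the action-sampling distribution: the top `num_elites` candidates by return define an elite mean and variance, which are blended into the previous distribution with learning rate `self._lr`. Below is a minimal NumPy sketch of that elite-update step with hypothetical shapes and values; it is an illustration, not part of the original class.

import numpy as np

population, horizon, action_dim = 32, 10, 2
lr, num_elites = 0.25, 8

returns = np.random.randn(population)                       # one averaged return per candidate
actions = np.random.randn(population, horizon, action_dim)  # candidate action trajectories

elite_idx = np.argsort(returns)[::-1][:num_elites]          # indices of the best returns
elites = actions[elite_idx]                                 # (num_elites, horizon, action_dim)

elites_mean = elites.mean(axis=0)                           # (horizon, action_dim)
elites_var = elites.var(axis=0)

old_mean = np.zeros((horizon, action_dim))
old_var = np.ones((horizon, action_dim))

new_mean = (1.0 - lr) * old_mean + lr * elites_mean         # smoothed update, as in the code above
new_var = (1.0 - lr) * old_var + lr * elites_var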
Code example #2
    def call(self, trajectory: Trajectory) -> Trajectory:
        time_step = TimeStep(trajectory.step_type, trajectory.reward, trajectory.discount,
                             trajectory.observation)
        action_dist = self._policy.distribution(time_step).action

        # If the action distribution is in fact a tuple of distributions (one for each resource
        # set), we need to index into it to reach the underlying distribution from which the
        # action probabilities can be read. This is only the case when there are multiple
        # resource sets.
        for i in self._action_indices[:-1]:
            action_dist = action_dist[i]

        action_probs = action_dist.probs_parameter()
        # Zero out batch indices where a new episode is starting.
        self._probability_accumulator.assign(
            tf.where(trajectory.is_first(), tf.zeros_like(self._probability_accumulator),
                     self._probability_accumulator))
        self._count_accumulator.assign(
            tf.where(trajectory.is_first(), tf.zeros_like(self._count_accumulator),
                     self._count_accumulator))
        # Update accumulators with probability and count increments.
        self._probability_accumulator.assign_add(action_probs[..., 0, self._action_indices[-1]])
        self._count_accumulator.assign_add(tf.ones_like(self._count_accumulator))

        # Add final cumulants to buffer at the end of episodes.
        last_episode_indices = tf.squeeze(tf.where(trajectory.is_last()), axis=-1)
        for idx in last_episode_indices:
            self._buffer.add(self._probability_accumulator[idx] / self._count_accumulator[idx])

        return trajectory
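For readers unfamiliar with the accumulator pattern used here, the standalone sketch below reproduces the reset-then-accumulate bookkeeping with toy tensors (a hypothetical batch of three environments, where the second one starts a new episode on this step); it is not part of the original metric class.

import tensorflow as tf

prob_acc = tf.Variable([0.4, 0.7, 0.1])
count_acc = tf.Variable([2.0, 3.0, 1.0])
is_first = tf.constant([False, True, False])   # which batch entries start a new episode
action_probs = tf.constant([0.5, 0.2, 0.9])    # probability of the tracked action this step

# Zero out accumulators where a new episode begins, then accumulate.
prob_acc.assign(tf.where(is_first, tf.zeros_like(prob_acc), prob_acc))
count_acc.assign(tf.where(is_first, tf.zeros_like(count_acc), count_acc))
prob_acc.assign_add(action_probs)              # -> [0.9, 0.2, 1.0]
count_acc.assign_add(tf.ones_like(count_acc))  # -> [3.0, 1.0, 2.0]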
Code example #3
    def __call__(self, trajectory: trajectory_lib.Trajectory) -> None:
        """Cache the single step trajectory to be written into Reverb.

    Allows trajectory to be a flattened trajectory. No batch dimension allowed.

    Args:
      trajectory: The trajectory to be written, which could be a (possibly nested)
        trajectory object or a flattened version of a trajectory. It assumes
        there is *no* batch dimension.

    Raises:
      ValueError: If `bypass_partial_episodes` == False and episode length
        is > `max_sequence_length`.
    """
        # TODO(b/176494855): Raise an error if an invalid trajectory is passed in.
        # Currently, invalid `traj` value (mid->first, last->last) is not specially
        # handled and is treated as a normal mid->mid step.
        if (self._cached_steps >= self._max_sequence_length
                and not self._overflow_episode):
            self._overflow_episode = True
            if self._bypass_partial_episodes:
                logging.error(
                    "The number of trajectories within the same episode exceeds "
                    "`max_sequence_length`. This episode is bypassed and will NOT "
                    "be written into the replay buffer. Consider increasing the "
                    "`max_sequence_length`.")
            else:
                raise ValueError(
                    "The number of trajectories within the same episode "
                    "exceeds `max_sequence_length`. Consider increasing the "
                    "`max_sequence_length` or set `bypass_partial_episodes` to true "
                    "to bypass the episodes with length more than "
                    "`max_sequence_length`.")

        # At the end of the overflowing episode, drop the cached incomplete episode
        # and reset the writer.
        if self._overflow_episode and trajectory.is_boundary():
            self.reset(write_cached_steps=False)
            return

        if not self._overflow_episode:
            self._writer.append(trajectory)
            self._writer_has_data = True
            self._cached_steps += 1

            # At the end of an episode, write the item to Reverb and clear the cache.
            if trajectory.is_boundary():
                self.reset(write_cached_steps=True)
Code example #4
    def call(self, trajectory: Trajectory) -> Trajectory:
        """
        Process the experience passed in to update the metric value (or the components required to
        calculate the final value).

        :param trajectory: Experience from the agent rolling out in the environment.
        :return: The unchanged input trajectory (as per the standard use of TensorFlow Metrics).
        """
        start_of_episode_indices = tf.squeeze(tf.where(trajectory.is_first()),
                                              axis=-1)
        mask = tf.ones(shape=(self._batch_size, ), dtype=self._dtype)

        for idx in start_of_episode_indices:
            mask -= tf.eye(self._batch_size)[idx]

        # Reset the accumulators at the end of each episode.
        self._num_valid_timesteps.assign(self._num_valid_timesteps * mask)
        self._activity_accumulator.assign(self._activity_accumulator * mask)

        # Find the number of time steps satisfying the filter condition.
        # The reshape is to ensure compatibility with the variable below in the case of no batch
        # dimension.
        valid_timesteps = tf.reshape(
            tf.reduce_sum(tf.cast(self.filter_condition(trajectory),
                                  self._dtype),
                          axis=-1), self._num_valid_timesteps.shape)

        # Track the number of time steps which meet the qualifying condition.
        self._num_valid_timesteps.assign_add(valid_timesteps,
                                             name="increment_valid_timesteps")

        # Update accumulator with activity counts where both the filtering and activity condition
        # are satisfied. Again the reshape is to ensure compatibility with the accumulator
        # variable in the case where there is no batch dimension.
        bool_values = tf.logical_and(self.filter_condition(trajectory),
                                     self.activity_condition(trajectory))
        to_add = tf.reshape(
            tf.reduce_sum(tf.cast(bool_values, self._dtype), axis=-1),
            self._activity_accumulator.shape)

        self._activity_accumulator.assign_add(to_add)

        # Add values to the buffers at the end of each episode by first finding where the
        # trajectories end and then using the resulting indices to update the correct buffer
        # locations.
        end_of_episode_indices = tf.squeeze(
            tf.where(trajectory.step_type == 2), axis=-1)  # step_type 2 is StepType.LAST

        for idx in end_of_episode_indices:
            self._activity_buffer.add(self._activity_accumulator[idx])
            self._qualifying_timesteps_buffer.add(
                self._num_valid_timesteps[idx])

        # Return the original trajectory data as is standard for TFStepMetrics.
        return trajectory
Code example #5
def filter_episodes(traj):
    """Map TFRecord windows (of adjacent TimeSteps) to single episodes.

  Outputs the last episode within a sample window. It does this by using
  the step_type tensor to break up sequences into single episode sections.
  For example, if step_type is: [FIRST, MID, LAST, FIRST, MID, MID], we
  will return a sample whose tensor indices are gathered as
  [3, 3, 3, 3, 4, 5], so that the frame at index 3 is replicated
  across the beginning of the tensor.

  Args:
    traj: Trajectory.

  Returns:
    Trajectory containing filtered sample with only one episode.
  """
    step_types = traj.step_type
    seq_len = tf.cast(tf.shape(step_types)[0], tf.int32)

    # Find the last start frame in the window. e.g. if we have step types
    # [FIRST, MID, LAST, FIRST, MID, MID], we want index 3.
    first_frames = tf.where(step_types == StepType.FIRST)

    if tf.shape(first_frames)[0] == 0:
        # No first frame, return sequence as is.
        inds = tf.range(0, seq_len)
    else:
        ind_start = tf.cast(first_frames[-1, 0], tf.int32)
        if ind_start == 0:
            # Last episode starts on the first frame, return as is.
            inds = tf.range(0, seq_len)
        else:
            # Otherwise, resample so that the last episode's first frame is
            # replicated to the beginning of the sample. In the example above we want:
            # [3, 3, 3, 3, 4, 5].
            inds_start = tf.tile(ind_start[None], ind_start[None])
            inds_end = tf.range(ind_start, seq_len)
            inds = tf.concat([inds_start, inds_end], axis=0)

    def _resample(arr):
        if isinstance(arr, tf.Tensor):
            return tf.gather(arr, inds)
        else:
            return arr  # empty or None

    observation = tf.nest.map_structure(_resample, traj.observation)

    return Trajectory(step_type=_resample(traj.step_type),
                      action=_resample(traj.action),
                      policy_info=_resample(traj.policy_info),
                      next_step_type=_resample(traj.next_step_type),
                      reward=_resample(traj.reward),
                      discount=_resample(traj.discount),
                      observation=observation)
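The index construction can be checked in isolation. The sketch below assumes the usual TF-Agents values FIRST=0, MID=1, LAST=2 and reproduces the docstring example; it is a standalone illustration rather than part of the original function.

import tensorflow as tf

step_types = tf.constant([0, 1, 2, 0, 1, 1])        # FIRST, MID, LAST, FIRST, MID, MID
seq_len = tf.shape(step_types)[0]

first_frames = tf.where(step_types == 0)
ind_start = tf.cast(first_frames[-1, 0], tf.int32)  # index of the last FIRST frame -> 3

inds_start = tf.tile(ind_start[None], ind_start[None])  # [3, 3, 3]
inds_end = tf.range(ind_start, seq_len)                 # [3, 4, 5]
inds = tf.concat([inds_start, inds_end], axis=0)        # [3, 3, 3, 3, 4, 5]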
Code example #6
File: main.py  Project: kang-sw-archive/2048-RL
    def __call__(self, trajectory: Trajectory):
        if not trajectory.is_boundary():
            self.step_counter += 1
        else:
            self.episode_counter += 1

        if self.step_counter % self.log_period == 0:
            print(
                f"...Step {self.step_counter:12} of Episode {self.episode_couinter+1:8}",
                end="\r",
            )
Code example #7
File: simple_buffer.py  Project: tomasruizt/dads
def make_trajectory_from(transitions: Sequence[Transition]) -> Trajectory:
    s, a, s_next = zip(*transitions)
    two_cols = np.ones((len(s), 2))
    return Trajectory(
        step_type=StepType.MID * two_cols,
        observation=np.stack((s, s_next), axis=1),
        action=np.stack((a, np.NaN * np.ones_like(a)), axis=1),
        policy_info=(),
        next_step_type=StepType.MID * two_cols,
        reward=np.NaN * two_cols,
        discount=0.99 * two_cols
    )
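A usage sketch, assuming `Transition` is an (observation, action, next_observation) tuple as the unpacking above implies; the printed shapes are illustrative only.

import numpy as np

transitions = [
    (np.zeros(3), np.array([0.1]), np.ones(3)),
    (np.ones(3), np.array([0.2]), 2.0 * np.ones(3)),
]
traj = make_trajectory_from(transitions)
print(traj.observation.shape)  # (2, 2, 3): (transitions, [s, s_next], obs dim)
print(traj.reward.shape)       # (2, 2), filled with NaN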
Code example #8
File: transition_model.py  Project: adak32/bellman
    def train(self, trajectories: Trajectory, training_spec: TS) -> T:
        """
        Train the transition model using data from the `Trajectory`, according to the specification
        `training_spec`.

        :param trajectories: The training data.
        :param training_spec: The training specification.
        :return: A summary of the training process, for example the model loss over the training
            data.
        """
        latent_observation = self.observation_transformation.forward_observation(
            trajectories.observation
        )
        latent_trajectories = trajectories.replace(observation=latent_observation)
        return self._train(latent_trajectories, training_spec)
Code example #9
File: utils.py  Project: adak32/bellman
def extract_transitions_from_trajectories(
    trajectory: Trajectory,
    observation_spec: TensorSpec,
    action_spec: TensorSpec,
    predict_state_difference: bool,
) -> Transition:
    """
    TF-Agents returns a batch of trajectories from a buffer as a `Trajectory` object. This function
    transforms the data in the batch into a `Transition` tuple which can be used for training
    the model.

    :param trajectory: The TF-Agents trajectory object
    :param observation_spec: The `TensorSpec` object which defines the observation tensors
    :param action_spec: The `TensorSpec` object which defines the action tensors
    :param predict_state_difference: Boolean to specify whether the transition model should
        return the next (latent) state or the difference between the current (latent) state and
        the next (latent) state

    :return: A `Transition` tuple which contains the observations and actions which can be used to
            train the model.
    """
    mask = ~trajectory.is_boundary()[:, :-1]  # to filter out boundary elements

    trajectory_observation = trajectory.observation
    # [batch_size, time_dim, features...]
    tf.ensure_shape(trajectory_observation, [None, None] + observation_spec.shape)
    next_observation = tf.boolean_mask(trajectory_observation[:, 1:, ...], mask)
    observation = tf.boolean_mask(trajectory_observation[:, :-1, ...], mask)

    trajectory_action = trajectory.action
    # [batch_size, time_dim, features...]
    tf.ensure_shape(trajectory_action, [None, None] + action_spec.shape)
    action = tf.boolean_mask(trajectory_action[:, :-1, ...], mask)

    trajectory_reward = trajectory.reward
    # [batch_size, time_dim]
    tf.ensure_shape(trajectory_reward, [None, None])
    reward = tf.boolean_mask(trajectory_reward[:, :-1], mask)

    if predict_state_difference:
        next_observation -= observation

    return Transition(
        observation=observation,
        action=action,
        reward=reward,
        next_observation=next_observation,
    )
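The boundary masking is easiest to see on toy tensors. The following sketch (assumed shapes: a batch of one trajectory with four time steps, the last of which is a boundary) mirrors the slicing and `tf.boolean_mask` calls above and is not part of the original utility.

import tensorflow as tf

is_boundary = tf.constant([[False, False, False, True]])
mask = ~is_boundary[:, :-1]                                  # keep transitions that do not start at a boundary

obs = tf.reshape(tf.range(8, dtype=tf.float32), (1, 4, 2))   # [batch, time, features]
observation = tf.boolean_mask(obs[:, :-1, ...], mask)        # s_t for each valid transition
next_observation = tf.boolean_mask(obs[:, 1:, ...], mask)    # s_{t+1} for each valid transition
# observation.shape == next_observation.shape == (3, 2)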
Code example #10
    def generate_experience_data(self, steps, save_dir):

        time_step = self._tf_env.reset()
        observations = []
        actions = []
        labels = []

        for _ in tqdm(range(steps), 'Generating experience data'):
            action = self._agent.policy.action(time_step).action
            time_step = self._tf_env.step(action=action)

            label = {}
            if isinstance(self.env._env, DoomEnvironment):
                state = self._tf_env.envs[0]._game.get_state()
                self._tf_env.envs[0]._game.advance_action()
                if state is not None:
                    demons = [
                        lbl for lbl in state.labels
                        if lbl.object_name == 'Demon'
                    ]
                    if len(demons) > 0:
                        label['object_angle'] = int(demons[0].object_angle)
                        label['distance_from_wall'] = abs(
                            demons[0].object_position_x)

            observations.append(time_step.observation)
            actions.append(action.numpy()[0])
            labels.append(label)

        observations = np.array([ob.numpy()[0] for ob in observations])
        actions = np.array(actions)
        labels = np.array(labels)

        exp_data = Trajectory(observation=observations,
                              action=actions,
                              policy_info={'satisfaction': labels},
                              step_type=(),
                              next_step_type=(),
                              reward=(),
                              discount=())

        file_path = os.path.join(save_dir, f'exp_data_{steps}.pkl')
        with file_io.FileIO(file_path, mode='wb') as f:
            pickle.dump([exp_data], f, protocol=4)
Code example #11
File: trpo_agent.py  Project: adak32/bellman
def dummy_trajectory_batch(batch_size=2, n_steps=5, obs_dim=2):
    observations = tf.reshape(
        tf.constant(np.arange(batch_size * n_steps * obs_dim),
                    dtype=tf.float32),
        (batch_size, n_steps, obs_dim),
    )

    time_steps = TimeStep(
        step_type=tf.constant([[1] * (n_steps - 2) + [2] * 2] * batch_size,
                              dtype=tf.int32),
        reward=tf.constant([[1] * n_steps] * batch_size, dtype=tf.float32),
        discount=tf.constant([[1.0] * n_steps] * batch_size, dtype=tf.float32),
        observation=observations,
    )
    actions = tf.ones((batch_size, n_steps, 1), dtype=tf.float32)

    action_distribution_parameters = {
        "dist_params": {
            "loc":
            tf.constant([[[10.0]] * n_steps] * batch_size, dtype=tf.float32),
            "scale":
            tf.constant([[[10.0]] * n_steps] * batch_size, dtype=tf.float32),
        },
        "value_prediction":
        tf.constant([[0.0] * n_steps] * batch_size, dtype=tf.float32),
    }

    policy_info = action_distribution_parameters

    return Trajectory(
        time_steps.step_type,
        observations,
        actions,
        policy_info,
        time_steps.step_type,
        time_steps.reward,
        time_steps.discount,
    )
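A quick shape check of the builder above (usage sketch; the expected shapes follow directly from the constants used in the function).

traj = dummy_trajectory_batch(batch_size=2, n_steps=5, obs_dim=2)
print(traj.observation.shape)                       # (2, 5, 2)
print(traj.action.shape)                            # (2, 5, 1)
print(traj.policy_info["value_prediction"].shape)   # (2, 5)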
Code example #12
    def __call__(self, trajectory: trajectory_lib.Trajectory) -> None:
        """Writes the trajectory into the underlying replay buffer.

    Allows trajectory to be a flattened trajectory. No batch dimension allowed.

    Args:
      trajectory: The trajectory to be written, which could be a (possibly nested)
        trajectory object or a flattened version of a trajectory. It assumes
        there is *no* batch dimension.
    """
        self._last_trajectory = trajectory
        self._writer.append(trajectory)
        self._cached_steps += 1

        # If the fixed sequence length is reached, write the sequence.
        self._write_cached_steps()

        # If it happens to be the end of the episode, clear the cache. Pad first and
        # write the items into Reverb if required.
        if trajectory.is_boundary():
            if self._pad_end_of_episodes:
                self.reset(write_cached_steps=True)
            else:
                self.reset(write_cached_steps=False)
Code example #13
def make_trajectory_mask(batched_traj: trajectory.Trajectory) -> types.Tensor:
  """Mask boundary trajectories and those with invalid returns and advantages.

  Args:
    batched_traj: Trajectory, doubly-batched [batch_dim, time_dim,...]. It must
      be preprocessed already.

  Returns:
    A mask of type tf.float32 that is 0.0 for all between-episode steps
      (where batched_traj.step_type is LAST) and 0.0 where the return value is
      unavailable.
  """
  # 1.0 for all valid trajectories. 0.0 where between episodes.
  not_between_episodes = ~batched_traj.is_boundary()

  # 1.0 for trajectories with valid return values. 0.0 where return and
  # advantage are both 0. This happens to the last item when the experience gets
  # preprocessed, as insufficient information was available for calculating
  # advantages.
  valid_return_value = ~(
      tf.equal(batched_traj.policy_info['return'], 0)
      & tf.equal(batched_traj.policy_info['normalized_advantage'], 0))

  return tf.cast(not_between_episodes & valid_return_value, tf.float32)
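A minimal sketch of how the two mask components combine, using toy tensors rather than a real `Trajectory` (the values are illustrative only).

import tensorflow as tf

is_boundary = tf.constant([[False, False, True]])
returns = tf.constant([[1.3, 0.0, 0.0]])
normalized_advantage = tf.constant([[0.2, 0.0, 0.0]])

not_between_episodes = ~is_boundary
valid_return_value = ~(tf.equal(returns, 0) & tf.equal(normalized_advantage, 0))
mask = tf.cast(not_between_episodes & valid_return_value, tf.float32)  # [[1., 0., 0.]]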
Code example #14
    def call(self, trajectory: traj.Trajectory):
        if trajectory.step_type.ndim == 0:
            trajectory = nest_utils.batch_nested_array(trajectory)

        completed_episodes = np.sum(trajectory.is_last().astype(np.int64))
        self._np_state.number_episodes += completed_episodes
Code example #15
    def call(self, trajectory: traj.Trajectory):
        if trajectory.step_type.ndim == 0:
            trajectory = nest_utils.batch_nested_array(trajectory)

        new_steps = np.sum((~trajectory.is_boundary()).astype(np.int64))
        self._np_state.environment_steps += new_steps
Code example #16
def collect_step(environment, policies, replay_buffers):

    aggregate_time_step = environment.current_time_step()
    is_first_step = aggregate_time_step.reward.shape == (
        1, )  # on the first step the env reward is [0]; afterwards it is [[r1,r2,r3,r4]]
    aggregate_action_step = {}  # action in form of [[e1,e2,e3,e4]]

    if is_first_step:
        for i, name in enumerate(AGENT_NAMES):
            # extract the observation and construct a time step for each agent separately
            # for the first step, the env outputs a step type of shape [num]
            observation = tf.convert_to_tensor(
                aggregate_time_step.observation[name].numpy(), dtype='float32')
            time_step = TimeStep(aggregate_time_step.step_type[0],
                                 aggregate_time_step.reward[0],
                                 aggregate_time_step.discount[0], observation)

            # the agent policy receives a time_step and outputs a single action
            action_step = policies[i].action(time_step)

            # add single action to joint action
            action = tf.convert_to_tensor(action_step.action, dtype='float32')
            action_step = policy_step.PolicyStep(action, action_step.state,
                                                 action_step.info)
            aggregate_action_step[name] = action_step
    else:
        for i, name in enumerate(AGENT_NAMES):
            # extract the observation and construct a time step for each agent separately
            # for steps other than the first, the env outputs a step type of shape [[num,num,num,num]]
            observation = tf.convert_to_tensor(
                aggregate_time_step.observation[name].numpy(), dtype='float32')
            time_step = TimeStep(aggregate_time_step.step_type[0][0],
                                 aggregate_time_step.reward[0][i],
                                 aggregate_time_step.discount[0], observation)

            # the agent policy receives a time_step and outputs a single action
            action_step = policies[i].action(time_step)

            # add single action to joint action
            action = tf.convert_to_tensor(action_step.action, dtype='float32')
            action_step = policy_step.PolicyStep(action, action_step.state,
                                                 action_step.info)
            aggregate_action_step[name] = action_step

    # let the environment take one step forward.
    aggregate_next_time_step = environment.step(aggregate_action_step)

    if is_first_step:
        for i, name in enumerate(AGENT_NAMES):
            action_step = aggregate_action_step[name]
            observation = tf.convert_to_tensor(
                aggregate_time_step.observation[name].numpy(), dtype='float32')
            tra = Trajectory(aggregate_time_step.step_type[0], observation,
                             action_step.action, action_step.info,
                             aggregate_next_time_step.step_type[0][0],
                             aggregate_time_step.reward[0],
                             aggregate_time_step.discount[0])
            replay_buffers[i].add_batch(tra)

    else:
        for i, name in enumerate(AGENT_NAMES):
            action_step = aggregate_action_step[name]
            observation = tf.convert_to_tensor(
                aggregate_time_step.observation[name].numpy(), dtype='float32')
            if aggregate_next_time_step.step_type.shape == (1, ):
                tra = Trajectory(aggregate_time_step.step_type[0][0],
                                 observation, action_step.action,
                                 action_step.info,
                                 aggregate_next_time_step.step_type[0],
                                 aggregate_time_step.reward[0][i],
                                 aggregate_time_step.discount[0])
            else:
                tra = Trajectory(aggregate_time_step.step_type[0][0],
                                 observation, action_step.action,
                                 action_step.info,
                                 aggregate_next_time_step.step_type[0][0],
                                 aggregate_time_step.reward[0][i],
                                 aggregate_time_step.discount[0])
            replay_buffers[i].add_batch(tra)
Code example #17
def random_collect_step(environment, policies, replay_buffers):

    aggregate_time_step = environment.current_time_step()
    is_first_step = aggregate_time_step.reward.shape == (
        1, )  # on the first step the env reward is [0]; afterwards it is [[r1,r2,r3,r4]]
    aggregate_action_step = {}  # action in form of [[e1,e2,e3,e4]]
    squeezed_action_step = {}  # action in form of [e1,e2,e3,e4]
    if is_first_step:
        for i, name in enumerate(AGENT_NAMES):
            # create a time_step that satisfies the spec of a single random policy
            time_step = TimeStep(aggregate_time_step.step_type[0],
                                 aggregate_time_step.reward[0],
                                 aggregate_time_step.discount[0],
                                 aggregate_time_step.observation[name][0])

            # the random policy receives an observation (time_step) and returns an action (action_step)
            action_step = policies[i].action(time_step)
            squeezed_action_step[name] = action_step

            # create an action_step (policy_step) that satisfies the spec of the aggregate environment
            action = tf.convert_to_tensor([action_step.action],
                                          dtype='float32')
            action_step = policy_step.PolicyStep(action, action_step.state,
                                                 action_step.info)
            aggregate_action_step[name] = action_step
    else:
        for i, name in enumerate(AGENT_NAMES):
            # create a time_step that satisfies the spec of a single random policy
            time_step = TimeStep(aggregate_time_step.step_type[0][0],
                                 aggregate_time_step.reward[0][i],
                                 aggregate_time_step.discount[0],
                                 aggregate_time_step.observation[name][0])

            # the random policy receives an observation (time_step) and returns an action (action_step)
            action_step = policies[i].action(time_step)
            squeezed_action_step[name] = action_step

            # create an action_step (policy_step) that satisfies the spec of the aggregate environment
            action = tf.convert_to_tensor([action_step.action],
                                          dtype='float32')
            action_step = policy_step.PolicyStep(action, action_step.state,
                                                 action_step.info)
            aggregate_action_step[name] = action_step

    # let the environment take one step forward.
    aggregate_next_time_step = environment.step(aggregate_action_step)

    if is_first_step:
        for i, name in enumerate(AGENT_NAMES):
            action_step = aggregate_action_step[name]
            observation = tf.convert_to_tensor(
                aggregate_time_step.observation[name].numpy(), dtype='float32')
            tra = Trajectory(aggregate_time_step.step_type[0], observation,
                             action_step.action, action_step.info,
                             aggregate_next_time_step.step_type[0][0],
                             aggregate_time_step.reward[0],
                             aggregate_time_step.discount[0])
            replay_buffers[i].add_batch(tra)

    else:
        for i, name in enumerate(AGENT_NAMES):
            action_step = aggregate_action_step[name]
            observation = tf.convert_to_tensor(
                aggregate_time_step.observation[name].numpy(), dtype='float32')
            if aggregate_next_time_step.step_type.shape == (1, ):
                tra = Trajectory(aggregate_time_step.step_type[0][0],
                                 observation, action_step.action,
                                 action_step.info,
                                 aggregate_next_time_step.step_type[0],
                                 aggregate_time_step.reward[0][i],
                                 aggregate_time_step.discount[0])
            else:
                tra = Trajectory(aggregate_time_step.step_type[0][0],
                                 observation, action_step.action,
                                 action_step.info,
                                 aggregate_next_time_step.step_type[0][0],
                                 aggregate_time_step.reward[0][i],
                                 aggregate_time_step.discount[0])
            replay_buffers[i].add_batch(tra)
Code example #18
    def total_loss(self,
                   experience: traj.Trajectory,
                   returns: types.Tensor,
                   weights: types.Tensor,
                   training: bool = False) -> tf_agent.LossInfo:
        # Ensure we see at least one full episode.
        time_steps = ts.TimeStep(experience.step_type,
                                 tf.zeros_like(experience.reward),
                                 tf.zeros_like(experience.discount),
                                 experience.observation)
        is_last = experience.is_last()
        num_episodes = tf.reduce_sum(tf.cast(is_last, tf.float32))
        tf.debugging.assert_greater(
            num_episodes,
            0.0,
            message=
            'No complete episode found. REINFORCE requires full episodes '
            'to compute losses.')

        # Mask out partial episodes at the end of each batch of time_steps.
        # NOTE: We use is_last rather than is_boundary because the last transition
        # is the transition with the last valid reward. In other words, the
        # rewards on boundary transitions are not valid. Since REINFORCE computes
        # a loss w.r.t. the returns (and does not bootstrap), keeping the boundary
        # transitions is irrelevant.
        valid_mask = tf.cast(experience.is_last(), dtype=tf.float32)
        valid_mask = tf.math.cumsum(valid_mask, axis=1, reverse=True)
        valid_mask = tf.cast(valid_mask > 0, dtype=tf.float32)
        if weights is not None:
            weights *= valid_mask
        else:
            weights = valid_mask

        advantages = returns
        value_preds = None

        if self._baseline:
            value_preds, _ = self._value_network(time_steps.observation,
                                                 time_steps.step_type,
                                                 training=True)
            if self._debug_summaries:
                tf.compat.v2.summary.histogram(name='value_preds',
                                               data=value_preds,
                                               step=self.train_step_counter)

        advantages = self._advantage_fn(returns, value_preds)
        if self._debug_summaries:
            tf.compat.v2.summary.histogram(name='advantages',
                                           data=advantages,
                                           step=self.train_step_counter)

        # TODO(b/126592060): replace with tensor normalizer.
        if self._normalize_returns:
            advantages = _standard_normalize(advantages, axes=(0, 1))
            if self._debug_summaries:
                tf.compat.v2.summary.histogram(
                    name='normalized_%s' %
                    ('advantages' if self._baseline else 'returns'),
                    data=advantages,
                    step=self.train_step_counter)

        nest_utils.assert_same_structure(time_steps, self.time_step_spec)
        policy_state = _get_initial_policy_state(self.collect_policy,
                                                 time_steps)
        actions_distribution = self.collect_policy.distribution(
            time_steps, policy_state=policy_state).action

        policy_gradient_loss = self.policy_gradient_loss(
            actions_distribution,
            experience.action,
            experience.is_boundary(),
            advantages,
            num_episodes,
            weights,
        )

        entropy_regularization_loss = self.entropy_regularization_loss(
            actions_distribution, weights)

        network_regularization_loss = tf.nn.scale_regularization_loss(
            self._actor_network.losses)

        total_loss = (policy_gradient_loss + network_regularization_loss +
                      entropy_regularization_loss)

        losses_dict = {
            'policy_gradient_loss': policy_gradient_loss,
            'policy_network_regularization_loss': network_regularization_loss,
            'entropy_regularization_loss': entropy_regularization_loss,
            'value_estimation_loss': 0.0,
            'value_network_regularization_loss': 0.0,
        }

        value_estimation_loss = None
        if self._baseline:
            value_estimation_loss = self.value_estimation_loss(
                value_preds, returns, num_episodes, weights)
            value_network_regularization_loss = tf.nn.scale_regularization_loss(
                self._value_network.losses)
            total_loss += value_estimation_loss + value_network_regularization_loss
            losses_dict['value_estimation_loss'] = value_estimation_loss
            losses_dict['value_network_regularization_loss'] = (
                value_network_regularization_loss)

        loss_info_extra = ReinforceAgentLossInfo(**losses_dict)

        losses_dict[
            'total_loss'] = total_loss  # Total loss not in loss_info_extra.

        common.summarize_scalar_dict(losses_dict,
                                     self.train_step_counter,
                                     name_scope='Losses/')

        return tf_agent.LossInfo(total_loss, loss_info_extra)
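The partial-episode mask relies on a reverse cumulative sum of `is_last`: every step at or before the final completed episode boundary gets weight 1, and everything after it gets 0. A toy sketch with one batch row whose episode ends at t=2 (not taken from the agent code):

import tensorflow as tf

is_last = tf.constant([[0.0, 0.0, 1.0, 0.0, 0.0]])    # episode ends at t=2; t=3,4 are a partial episode
valid_mask = tf.math.cumsum(is_last, axis=1, reverse=True)
valid_mask = tf.cast(valid_mask > 0, tf.float32)      # [[1., 1., 1., 0., 0.]]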