def update_step(self,
                  replay_buffer_iter,
                  train_target='both'):
    """Performs a single training step for critic and embedding.

    Args:
      replay_buffer_iter: An iterator yielding replay buffer transitions.
      train_target: String specifying which components to update (unused in
        this implementation).

    Returns:
      Dictionary with losses to track.
    """
    del train_target
    transition = next(replay_buffer_iter)
    numpy_dataset = isinstance(replay_buffer_iter, np.ndarray)
    # observation: n_batch x n_timesteps x 1 x H*W*3*n_frames x 1 ->
    # n_batch x H x W x 3*n_frames
    if not numpy_dataset:
      states = transition.observation[:, 0]
      next_states = transition.observation[:, 1]
      actions = transition.action[:, 0]
      rewards = transition.reward[:, 0]
      discounts = transition.discount[:, 0]

      if transition.observation.dtype == tf.uint8:
        states = tf.cast(states, tf.float32) / 255.
        next_states = tf.cast(next_states, tf.float32) / 255.
    else:
      states, actions, rewards, next_states, discounts = transition

    if self.num_augmentations > 0:
      states, next_states = tf_utils.image_aug(
          states,
          next_states,
          img_pad=4,
          num_augmentations=self.num_augmentations)

    if not self.discrete_actions:
      actor_dict = self.fit_actor(
          states[0] if self.num_augmentations > 0 else states, actions)

    next_actions = self.select_actions(
        next_states[0] if self.num_augmentations > 0 else next_states)

    if self.discrete_actions:
      actions = tf.cast(tf.one_hot(actions, depth=self.action_dim), tf.float32)
      next_actions = tf.cast(
          tf.one_hot(next_actions, depth=self.action_dim), tf.float32)

    critic_dict = self.critic_learner.fit_critic(
        states[0] if self.num_augmentations > 0 else states, actions,
        next_states[0] if self.num_augmentations > 0 else next_states,
        next_actions, rewards, discounts)

    if self.discrete_actions:
      return critic_dict
    else:
      return {**actor_dict, **critic_dict}
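
# --- Usage sketch (not part of the original source): illustrates the transition
# layout that update_step above expects from `replay_buffer_iter`. The
# Transition namedtuple, batch size, and image shape are placeholder
# assumptions; the real buffer may store observations flattened, as the shape
# comment inside update_step suggests.
import collections

import tensorflow as tf

Transition = collections.namedtuple(
    'Transition', ['observation', 'action', 'reward', 'discount'])

batch_size, height, width, channels = 4, 64, 64, 3
dummy = Transition(
    # n_batch x n_timesteps x H x W x C, stored as uint8 pixels.
    observation=tf.zeros([batch_size, 2, height, width, channels], tf.uint8),
    action=tf.zeros([batch_size, 2], tf.int32),
    reward=tf.zeros([batch_size, 2], tf.float32),
    discount=tf.ones([batch_size, 2], tf.float32))

# Same slicing as in update_step: index 0 is the current step, index 1 the next.
states = tf.cast(dummy.observation[:, 0], tf.float32) / 255.
next_states = tf.cast(dummy.observation[:, 1], tf.float32) / 255.
actions = dummy.action[:, 0]
rewards = dummy.reward[:, 0]
discounts = dummy.discount[:, 0]
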
  def update_step(self,
                  replay_buffer_iter,
                  train_target='both'):
    """Performs a single training step for critic and embedding.

    Args:
      replay_buffer_iter: An iterator yielding replay buffer transitions.
      train_target: String specifying whether to update the RL objective
        ('rl'), the representation ('encoder'), or both ('both').

    Returns:
      Dictionary with losses to track.
    """
    transition = next(replay_buffer_iter)
    numpy_dataset = isinstance(replay_buffer_iter, np.ndarray)
    # observation: n_batch x n_timesteps x 1 x H*W*3*n_frames x 1 ->
    # n_batch x H x W x 3*n_frames
    if not numpy_dataset:
      states = transition.observation[:, 0]
      next_states = transition.observation[:, 1]
      actions = transition.action[:, 0]
      rewards = transition.reward[:, 0]
      discounts = transition.discount[:, 0]

      if transition.observation.dtype == tf.uint8:
        states = tf.cast(states, tf.float32) / 255.
        next_states = tf.cast(next_states, tf.float32) / 255.
    else:
      states, actions, rewards, next_states, discounts = transition

    if self.num_augmentations > 0:
      states, next_states = tf_utils.image_aug(
          states,
          next_states,
          img_pad=4,
          num_augmentations=self.num_augmentations,
          obs_dim=64,
          channels=3,
          cropped_shape=[self.batch_size, 68, 68, 3])

    next_actions = self.act(next_states, data_aug=True)

    if train_target == 'both':
      ssl_dict = self.fit_embedding(states, actions, next_states, next_actions,
                                    rewards, discounts)
      critic_dict = self.fit_critic(states, actions, next_states, next_actions,
                                    rewards, discounts)
    elif train_target == 'encoder':
      ssl_dict = self.fit_embedding(states, actions, next_states, next_actions,
                                    rewards, discounts)
      critic_dict = {}
    elif train_target == 'rl':
      ssl_dict = {}
      critic_dict = self.fit_critic(states, actions, next_states, next_actions,
                                    rewards, discounts)

    return {**ssl_dict, **critic_dict}
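
# --- Sketch (not the actual tf_utils.image_aug): a common pad-and-random-crop
# ("random shift") augmentation in the spirit of the call above. The real helper
# takes extra arguments (obs_dim, channels, cropped_shape) and may behave
# differently; everything below is a minimal illustration under that assumption.
import tensorflow as tf


def random_shift(images, pad=4):
  """Pads each image by `pad` pixels and randomly crops back to its size."""
  _, height, width, channels = images.shape
  padded = tf.pad(
      images, [[0, 0], [pad, pad], [pad, pad], [0, 0]], mode='SYMMETRIC')
  # Crop each image independently so every sample gets its own random shift.
  return tf.map_fn(
      lambda img: tf.image.random_crop(img, [height, width, channels]), padded)


augmented = random_shift(tf.random.uniform([8, 64, 64, 3]))
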
    def update_step(self, replay_buffer_iter, numpy_dataset):
        """Performs a single training step for critic and actor.

    Args:
      replay_buffer_iter: An iterator yielding replay buffer transitions.
      numpy_dataset: Whether the dataset is a NumPy array.

    Returns:
      Dictionary with losses to track.
    """

        transition = next(replay_buffer_iter)

        # observation: n_batch x n_timesteps x 1 x H*W*3*n_frames x 1 ->
        # n_batch x H x W x 3*n_frames
        if not numpy_dataset:
            states = transition.observation[:, 0]
            next_states = transition.observation[:, 1]
            actions = transition.action[:, 0]
            rewards = transition.reward[:, 0]
            discounts = transition.discount[:, 0]

            if transition.observation.dtype == tf.uint8:
                states = tf.cast(states, tf.float32) / 255.
                next_states = tf.cast(next_states, tf.float32) / 255.
        else:
            states, actions, rewards, next_states, discounts = transition

        if self.num_augmentations > 0:
            states, next_states = tf_utils.image_aug(
                states,
                next_states,
                img_pad=4,
                num_augmentations=self.num_augmentations,
                obs_dim=64,
                channels=3,
                cropped_shape=[self.batch_size, 68, 68, 3])

        # states, actions, rewards, discounts, next_states = next(replay_buffer_iter
        rewards = rewards + self.reward_bonus

        if self.discrete_actions:
            actions = tf.cast(tf.one_hot(actions, depth=self.action_dim),
                              tf.float32)

        critic_dict = self.fit_critic(states, actions, next_states, rewards,
                                      discounts)

        actor_dict = self.fit_actor(
            states[0] if self.num_augmentations > 0 else states)

        return {**actor_dict, **critic_dict}
    def update_step(self,
                    replay_buffer_iter,
                    numpy_dataset,
                    train_target='both'):
        """Performs a single training step for critic and embedding.

    Args:
      replay_buffer_iter: An iterator yielding replay buffer transitions.
      numpy_dataset: Whether the dataset is a NumPy array.
      train_target: String specifying which components to update (unused in
        this implementation).

    Returns:
      Dictionary with losses to track.
    """
        del train_target
        transition = next(replay_buffer_iter)

        # observation: n_batch x n_timesteps x 1 x H*W*3*n_frames x 1 ->
        # n_batch x H x W x 3*n_frames
        if not numpy_dataset:
            states = transition.observation[:, 0]
            next_states = transition.observation[:, 1]
            actions = transition.action[:, 0]
            rewards = transition.reward[:, 0]
            discounts = transition.discount[:, 0]

            if transition.observation.dtype == tf.uint8:
                states = tf.cast(states, tf.float32) / 255.
                next_states = tf.cast(next_states, tf.float32) / 255.
        else:
            states, actions, rewards, next_states, discounts = transition

        if self.num_augmentations > 0:
            states, next_states = tf_utils.image_aug(
                states,
                next_states,
                img_pad=4,
                num_augmentations=self.num_augmentations,
                obs_dim=64,
                channels=3,
                cropped_shape=[self.batch_size, 68, 68, 3])

        next_actions = self.act(next_states, data_aug=True)

        # entropy_rewards = self.discount * discounts * self.alpha * next_log_probs
        # rewards -= entropy_rewards
        critic_dict = self.fit_critic(states, actions, next_states,
                                      next_actions, rewards, discounts)

        return critic_dict
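
# --- Sketch (not from the original source): the soft actor-critic style reward
# adjustment hinted at by the commented-out `entropy_rewards` lines above.
# `discount`, `alpha`, and the dummy tensors are placeholder assumptions.
import tensorflow as tf

discount = 0.99  # agent's discount factor
alpha = 0.1      # entropy temperature
rewards = tf.random.uniform([8])
discounts = tf.ones([8])
next_log_probs = tf.random.normal([8])  # log pi(a'|s') of sampled next actions

# Subtracting the discounted entropy term folds the soft-value bonus into the
# reward before the usual TD target is formed.
entropy_rewards = discount * discounts * alpha * next_log_probs
soft_rewards = rewards - entropy_rewards
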
    def update_step(self, replay_buffer_iter, train_target='both'):
        """Performs a single training step for critic and embedding.

    Args:
      replay_buffer_iter: An iterator yielding replay buffer transitions.
      train_target: String specifying whether to update the RL objective
        ('rl'), the representation ('encoder'), or both ('both').

    Returns:
      Dictionary with losses to track.
    """
        transition = next(replay_buffer_iter)
        numpy_dataset = isinstance(replay_buffer_iter, np.ndarray)
        if not numpy_dataset:
            states = transition.observation[:, 0]
            next_states = transition.observation[:, 1]
            actions = transition.action[:, 0]
            rewards = transition.reward[:, 0]
            discounts = transition.discount[:, 0]

            if transition.observation.dtype == tf.uint8:
                states = tf.cast(states, tf.float32) / 255.
                next_states = tf.cast(next_states, tf.float32) / 255.
        else:
            states, actions, rewards, next_states, discounts = transition

        if self.num_augmentations > 0:
            states, next_states = tf_utils.image_aug(
                states,
                next_states,
                img_pad=4,
                num_augmentations=self.num_augmentations,
                obs_dim=64,
                channels=3,
                cropped_shape=[self.batch_size, 68, 68, 3])

        next_actions = self.act(next_states, data_aug=True)

        if train_target == 'both':
            ssl_dict = self.fit_embedding(states, actions, next_states,
                                          next_actions, rewards, discounts)
            critic_dict = self.fit_critic(states, actions, next_states,
                                          next_actions, rewards, discounts)
        elif train_target == 'encoder':
            ssl_dict = self.fit_embedding(states, actions, next_states,
                                          next_actions, rewards, discounts)
            critic_dict = {}
        elif train_target == 'rl':
            ssl_dict = {}
            critic_dict = self.fit_critic(states, actions, next_states,
                                          next_actions, rewards, discounts)

        return {**ssl_dict, **critic_dict}
    def update_step(self, replay_buffer_iter, train_target='both'):
        """Performs a single training step for critic and embedding.

    Args:
      replay_buffer_iter: An iterator yielding replay buffer transitions.
      train_target: String specifying whether to update the RL objective
        ('rl'), the representation ('encoder'), or both ('both').

    Returns:
      Dictionary with losses to track.
    """

        transition = next(replay_buffer_iter)
        numpy_dataset = isinstance(replay_buffer_iter, np.ndarray)
        # observation: n_batch x n_timesteps x 1 x H*W*3*n_frames x 1
        # -> n_batch x H x W x 3*n_frames
        if not numpy_dataset:
            states = transition.observation[:, 0]
            next_states = transition.observation[:, 1]
            actions = transition.action[:, 0]
            rewards = transition.reward
            level_ids = transition.policy_info[:, 0]
            if tf.shape(transition.reward)[1] > 2:
                rewards = tf.einsum(
                    'ij,j->i', rewards,
                    self.discount**tf.range(0,
                                            tf.shape(transition.reward)[1],
                                            dtype=tf.float32))
                self.n_step_rewards = tf.shape(transition.reward)[1]
            else:
                rewards = transition.reward[:, 0]
                self.n_step_rewards = 1
            discounts = transition.discount[:, 0]

            if transition.observation.dtype == tf.uint8:
                states = tf.cast(states, tf.float32) / 255.
                next_states = tf.cast(next_states, tf.float32) / 255.
        else:
            states, actions, rewards, next_states, discounts = transition

        self.reward_normalizer.update_normalization_statistics(rewards)

        if self.num_augmentations > 0:
            states, next_states = tf_utils.image_aug(
                states,
                next_states,
                img_pad=4,
                num_augmentations=self.num_augmentations,
                obs_dim=64,
                channels=3,
                cropped_shape=[self.batch_size, 68, 68, 3])

        next_actions_pi = self.act(next_states, data_aug=True)
        next_actions_mu = transition.action[:, 1]
        next_actions_pi_per_level = next_actions_mu

        states_b1 = states
        next_states_b1 = next_states
        actions_b1 = actions
        next_actions_b1 = next_actions_pi
        rewards_b1 = rewards
        discounts_b1 = discounts
        level_ids_b1 = level_ids

        states_b2 = states
        next_states_b2 = next_states
        actions_b2 = actions
        next_actions_b2 = next_actions_pi
        rewards_b2 = rewards
        discounts_b2 = discounts

        if train_target == 'both':
            critic_dict = self.fit_critic(states_b2, actions_b2,
                                          next_states_b2, next_actions_b2,
                                          rewards_b2, discounts_b2)
            print('Updating per-task critics')
            ssl_dict = {}
            critic_distillation_dict = self.fit_task_critics(
                states_b1, actions_b1, next_states_b1,
                next_actions_pi_per_level, rewards_b1, discounts_b1,
                level_ids_b1)
            print('Done updating per-task critics')
            return {**ssl_dict, **critic_dict, **critic_distillation_dict}
        elif train_target == 'encoder':
            print('Updating per-task critics')
            critic_distillation_dict = self.fit_task_critics(
                states_b1, actions_b1, next_states_b1,
                next_actions_pi_per_level, rewards_b1, discounts_b1,
                level_ids_b1)
            print('Done updating per-task critics')
            ssl_dict = {}
            critic_dict = {}
            return {**ssl_dict, **critic_distillation_dict}

        elif train_target == 'rl':
            critic_distillation_dict = {}
            critic_dict = self.fit_critic(states_b2, actions_b2,
                                          next_states_b2, next_actions_b2,
                                          rewards_b2, discounts_b2)
            ssl_dict = self.fit_embedding(states_b1, actions_b1,
                                          next_states_b1, next_actions_b1,
                                          rewards_b1, discounts_b1, level_ids)

        return {**ssl_dict, **critic_dict, **critic_distillation_dict}
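
# --- Worked example (not from the original source): the n-step return computed
# with tf.einsum above. With per-step rewards r_0..r_{n-1} and discount gamma,
# 'ij,j->i' forms sum_k gamma**k * r_k for every batch element. The values below
# are placeholders.
import tensorflow as tf

gamma = 0.99
rewards = tf.constant([[1.0, 2.0, 3.0],
                       [0.5, 0.0, 1.0]])  # n_batch x n_steps
powers = gamma**tf.range(0, tf.shape(rewards)[1], dtype=tf.float32)
n_step_returns = tf.einsum('ij,j->i', rewards, powers)
# n_step_returns[0] == 1.0 + 0.99 * 2.0 + 0.99**2 * 3.0
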
  def update_step(self, replay_buffer_iter):
    """Performs a single training step for critic and embedding.

    Args:
      replay_buffer_iter: An iterator yielding replay buffer transitions.

    Returns:
      Dictionary with losses to track.
    """
    transition = next(replay_buffer_iter)
    numpy_dataset = isinstance(replay_buffer_iter, np.ndarray)
    # observation: n_batch x n_timesteps x 1 x H*W*3*n_frames x 1 ->
    # n_batch x H x W x 3*n_frames
    if not numpy_dataset:
      states = transition.observation[:, 0]
      next_states = transition.observation[:, 1]
      actions = transition.action[:, 0]

      if transition.observation.dtype == tf.uint8:
        states = tf.cast(states, tf.float32) / 255.
        next_states = tf.cast(next_states, tf.float32) / 255.
    else:
      states, actions, _, next_states, _ = transition

    if self.num_augmentations > 0:
      states, next_states = tf_utils.image_aug(
          states,
          next_states,
          img_pad=4,
          num_augmentations=self.num_augmentations,
          obs_dim=64,
          channels=3,
          cropped_shape=[self.batch_size, 68, 68, 3])
      states = states[0]
      next_states = next_states[0]

    # actions = tf.gather(self.PROCGEN_ACTION_MAP.astype(np.int32).argmax(1),
    #                     actions,axis=0)

    variables = self.policy.trainable_variables

    with tf.GradientTape(watch_accessed_variables=False) as tape:
      tape.watch(variables)

      log_probs, entropy = self.policy.log_probs(
          states, actions, with_entropy=True)

      loss = -tf.reduce_mean(log_probs)  # self.alpha * entropy +

    grads = tape.gradient(loss, variables)

    self.optimizer.apply_gradients(zip(grads, variables))

#     with tf.GradientTape(watch_accessed_variables=False) as tape:
#       tape.watch([self.log_alpha])
#       alpha_loss = tf.reduce_mean(self.alpha * (entropy - self.target_entropy)

#     alpha_grads = tape.gradient(alpha_loss, [self.log_alpha])
#     self.alpha_optimizer.apply_gradients(zip(alpha_grads, [self.log_alpha]))
    alpha_loss = tf.constant(0.)

    return {
        'bc_actor_loss': loss,
        'bc_alpha': self.alpha,
        'bc_alpha_loss': alpha_loss,
        'bc_log_probs': tf.reduce_mean(log_probs),
        'bc_entropy': tf.reduce_mean(entropy)
    }
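
# --- Sketch (not the original policy class): the same behavioral-cloning update
# pattern as above, using a plain Keras model. For a discrete policy,
# -log pi(a|s) equals the sparse softmax cross-entropy on the logits. Network
# sizes, batch size, and optimizer settings are placeholder assumptions.
import tensorflow as tf

num_actions = 15
policy = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_actions)  # action logits
])
optimizer = tf.keras.optimizers.Adam(1e-4)

states = tf.random.uniform([32, 50])  # batch of (encoded) states
actions = tf.random.uniform([32], maxval=num_actions, dtype=tf.int32)

with tf.GradientTape() as tape:
  logits = policy(states)
  # Negative log-likelihood of the demonstrated actions.
  loss = tf.reduce_mean(
      tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=actions, logits=logits))

grads = tape.gradient(loss, policy.trainable_variables)
optimizer.apply_gradients(zip(grads, policy.trainable_variables))
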