Code Example #1
    def set_device(self,
                   device,
                   memory_backend_params=None,
                   mode='numpy') -> None:
        """
        An optional function that allows the filter to get the device if it is required to use tensorflow ops
        :param device: the device to use
        :return: None
        """

        if mode == 'tf':
            from rl_coach.architectures.tensorflow_components.shared_variables import TFSharedRunningStats
            self.running_rewards_stats = TFSharedRunningStats(
                device,
                name='rewards_stats',
                create_ops=False,
                pubsub_params=memory_backend_params)
        elif mode == 'numpy':
            self.running_rewards_stats = NumpySharedRunningStats(
                name='rewards_stats', pubsub_params=memory_backend_params)
Code Example #2
    def set_device(self,
                   device,
                   memory_backend_params=None,
                   mode='numpy') -> None:
        """
        An optional function that allows the filter to get the device if it is required to use tensorflow ops
        :param device: the device to use
        :param memory_backend_params: if not None, holds params for a memory backend for sharing data (e.g. Redis)
        :param mode: the arithmetic module to use {'tf' | 'numpy'}
        :return: None
        """
        if mode == 'tf':
            from rl_coach.architectures.tensorflow_components.shared_variables import TFSharedRunningStats
            self.running_observation_stats = TFSharedRunningStats(
                device,
                name=self.name,
                create_ops=False,
                pubsub_params=memory_backend_params)
        elif mode == 'numpy':
            self.running_observation_stats = NumpySharedRunningStats(
                name=self.name, pubsub_params=memory_backend_params)
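Both variants above follow the same pattern: the filter stays backend-agnostic until set_device picks either the TensorFlow or the NumPy statistics implementation. A minimal call sequence might look like the following (an illustrative sketch only; it assumes ObservationNormalizationFilter from Code Example #3 below has been imported and that no distributed memory backend is used):

obs_filter = ObservationNormalizationFilter(name='observation_stats')
# A single-process run can stay on the NumPy backend; `device` is ignored there.
obs_filter.set_device(device=None, mode='numpy')
# With mode='tf', a TensorFlow device and a session are also needed, e.g.:
# obs_filter.set_device(device='/cpu:0', memory_backend_params=None, mode='tf')
# obs_filter.set_session(sess)   # `sess` would be the agent's tf.Session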
Code Example #3
class ObservationNormalizationFilter(ObservationFilter):
    """
    Normalizes the observation values with a running mean and standard deviation of
    all the observations seen so far. The normalization is performed element-wise. Additionally, when working with
    multiple workers, the statistics used for the normalization operation are accumulated over all the workers.
    """
    def __init__(self,
                 clip_min: float = -5.0,
                 clip_max: float = 5.0,
                 name='observation_stats'):
        """
        :param clip_min: The minimum value to allow after normalizing the observation
        :param clip_max: The maximum value to allow after normalizing the observation
        """
        super().__init__()
        self.clip_min = clip_min
        self.clip_max = clip_max
        self.running_observation_stats = None
        self.name = name
        self.supports_batching = True
        self.observation_space = None

    def set_device(self,
                   device,
                   memory_backend_params=None,
                   mode='numpy') -> None:
        """
        An optional function that allows the filter to get the device if it is required to use tensorflow ops
        :param device: the device to use
        :param memory_backend_params: if not None, holds params for a memory backend for sharing data (e.g. Redis)
        :param mode: the arithmetic module to use {'tf' | 'numpy'}
        :return: None
        """
        if mode == 'tf':
            from rl_coach.architectures.tensorflow_components.shared_variables import TFSharedRunningStats
            self.running_observation_stats = TFSharedRunningStats(
                device,
                name=self.name,
                create_ops=False,
                pubsub_params=memory_backend_params)
        elif mode == 'numpy':
            self.running_observation_stats = NumpySharedRunningStats(
                name=self.name, pubsub_params=memory_backend_params)

    def set_session(self, sess) -> None:
        """
        An optional function that allows the filter to get the session if it is required to use tensorflow ops
        :param sess: the session
        :return: None
        """
        self.running_observation_stats.set_session(sess)

    def filter(self,
               observations: List[ObservationType],
               update_internal_state: bool = True) -> ObservationType:
        observations = np.array(observations)
        if update_internal_state:
            self.running_observation_stats.push(observations)
            self.last_mean = self.running_observation_stats.mean
            self.last_stdev = self.running_observation_stats.std

        return self.running_observation_stats.normalize(observations)

    def get_filtered_observation_space(
            self,
            input_observation_space: ObservationSpace) -> ObservationSpace:
        self.running_observation_stats.set_params(
            shape=input_observation_space.shape,
            clip_values=(self.clip_min, self.clip_max))
        return input_observation_space

    def save_state_to_checkpoint(self, checkpoint_dir: str,
                                 checkpoint_prefix: str):
        self.running_observation_stats.save_state_to_checkpoint(
            checkpoint_dir, checkpoint_prefix)

    def restore_state_from_checkpoint(self, checkpoint_dir: str,
                                      checkpoint_prefix: str):
        self.running_observation_stats.restore_state_from_checkpoint(
            checkpoint_dir, checkpoint_prefix)
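To make the arithmetic concrete, the following standalone NumPy sketch reproduces the element-wise running normalization that the filter delegates to its statistics object. It is an illustration of the computation only, not the NumpySharedRunningStats implementation:

import numpy as np

class RunningNormalizer:
    """Illustrative element-wise running mean/std with clipping (Welford's algorithm)."""
    def __init__(self, shape, clip_min=-5.0, clip_max=5.0, epsilon=1e-8):
        self.n = 0
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)            # running sum of squared deviations
        self.clip_min, self.clip_max, self.epsilon = clip_min, clip_max, epsilon

    def push(self, observations):            # observations: (batch_size, *shape)
        for x in np.asarray(observations, dtype=np.float64):
            self.n += 1
            delta = x - self.mean
            self.mean += delta / self.n
            self.m2 += delta * (x - self.mean)

    @property
    def std(self):
        return np.sqrt(self.m2 / max(self.n - 1, 1))

    def normalize(self, observations):
        normalized = (observations - self.mean) / (self.std + self.epsilon)
        return np.clip(normalized, self.clip_min, self.clip_max)

normalizer = RunningNormalizer(shape=(4,))
normalizer.push(np.random.randn(32, 4))      # accumulate statistics, as filter() does
print(normalizer.normalize(np.random.randn(2, 4)))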
Code Example #4
 def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
     super().__init__(agent_parameters, parent)
     self.rnd_stats = NumpySharedRunningStats(name='RND_normalization', epsilon=1e-8)
     self.rnd_stats.set_params()
     self.rnd_obs_stats = NumpySharedRunningStats(name='RND_observation_normalization', epsilon=1e-8)
     self.intrinsic_returns_estimate = None
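Note that rnd_stats.set_params() is called immediately (the intrinsic-return statistics are scalar), while rnd_obs_stats is given its shape lazily, once the first environment observation arrives. Code Example #5 performs that step roughly as follows (agent is a placeholder for the agent instance):

image = np.array(transition.state[agent.ap.algorithm.env_obs_key])   # first observation seen
if agent.rnd_obs_stats.n < 1:                                         # stats not yet initialized
    agent.rnd_obs_stats.set_params(shape=image.shape, clip_values=[-5, 5])
agent.rnd_obs_stats.push_val(np.expand_dims(image, 0))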
Code Example #5
class TD3ExplorationAgent(TD3Agent):
    def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
        super().__init__(agent_parameters, parent)
        self.rnd_stats = NumpySharedRunningStats(name='RND_normalization', epsilon=1e-8)
        self.rnd_stats.set_params()
        self.rnd_obs_stats = NumpySharedRunningStats(name='RND_observation_normalization', epsilon=1e-8)
        self.intrinsic_returns_estimate = None

    def update_intrinsic_returns_estimate(self, rewards):
        returns = np.zeros_like(rewards)
        for i, r in enumerate(rewards):
            if self.intrinsic_returns_estimate is None:
                self.intrinsic_returns_estimate = r
            else:
                self.intrinsic_returns_estimate = \
                    self.intrinsic_returns_estimate * self.ap.algorithm.discount + r
            returns[i] = self.intrinsic_returns_estimate
        return returns

    def prepare_rnd_inputs(self, batch):
        env_obs_key = self.ap.algorithm.env_obs_key
        next_states = batch.next_states([env_obs_key])
        inputs = {env_obs_key: self.rnd_obs_stats.normalize(next_states[env_obs_key])}
        return inputs

    def handle_self_supervised_reward(self, batch):
        """
        Allows agents to update the batch for self supervised learning

        :param batch: original training batch
        :return: updated traing batch
        """
        return batch

    def update_transition_before_adding_to_replay_buffer(self, transition: Transition) -> Transition:
        """
        Allows agents to update the transition just before adding it to the replay buffer.
        Can be useful for agents that want to tweak the reward, termination signal, etc.

        :param transition: the transition to update
        :return: the updated transition
        """
        transition = super().update_transition_before_adding_to_replay_buffer(transition)
        image = np.array(transition.state[self.ap.algorithm.env_obs_key])
        if self.rnd_obs_stats.n < 1:
            self.rnd_obs_stats.set_params(shape=image.shape, clip_values=[-5, 5])
        self.rnd_obs_stats.push_val(np.expand_dims(image, 0))
        return transition

    def train_rnd(self):
        if self.memory.num_transitions() == 0:
            return

        transitions = self.memory.transitions[-self.ap.algorithm.rnd_sample_size:]
        dataset = Batch(transitions)
        dataset_order = list(range(dataset.size))
        batch_size = self.ap.algorithm.rnd_batch_size
        for epoch in range(self.ap.algorithm.rnd_optimization_epochs):
            shuffle(dataset_order)
            total_loss = 0
            total_grads = 0
            for i in range(int(dataset.size / batch_size)):
                start = i * batch_size
                end = (i + 1) * batch_size

                batch = Batch(list(np.array(dataset.transitions)[dataset_order[start:end]]))
                inputs = self.prepare_rnd_inputs(batch)

                const_embedding = self.networks['constant'].online_network.predict(inputs)

                res = self.networks['predictor'].train_and_sync_networks(inputs, [const_embedding])

                total_loss += res[0]
                total_grads += res[2]

            screen.log_dict(
                OrderedDict([
                    ("training epoch", epoch),
                    ("dataset size", dataset.size),
                    ("mean loss", total_loss / dataset.size),
                    ("mean gradients", total_grads / dataset.size)
                ]),
                prefix="RND Training"
            )

    def learn_from_batch(self, batch):
        batch = self.handle_self_supervised_reward(batch)
        return super().learn_from_batch(batch)

    def train(self):
        self.ap.algorithm.num_consecutive_training_steps = \
            int(self.current_episode_steps_counter * self.ap.algorithm.td3_training_ratio)
        return Agent.train(self)

    def calculate_novelty(self, batch):
        inputs = self.prepare_rnd_inputs(batch)
        embedding = self.networks['constant'].online_network.predict(inputs)
        prediction = self.networks['predictor'].online_network.predict(inputs)
        prediction_error = np.mean((embedding - prediction) ** 2, axis=1)
        return prediction_error

    def save_replay_buffer(self, dir_path=None):
        if dir_path is None:
            dir_path = os.path.join(self.parent_level_manager.parent_graph_manager.task_parameters.experiment_path,
                                    'replay_buffer')
        if not os.path.exists(dir_path):
            os.mkdir(dir_path)

        path = os.path.join(dir_path, 'RB_{}.joblib.bz2'.format(type(self).__name__))
        joblib.dump(self.memory.get_all_complete_episodes(), path, compress=('bz2', 1))

        screen.log('Saved replay buffer to: \"{}\" - Number of transitions: {}'.format(path,
                                                                                       self.memory.num_transitions()))

    def handle_episode_ended(self) -> None:
        super().handle_episode_ended()

        if self.total_steps_counter % self.ap.algorithm.rnd_sample_size == 0:
            self.train_rnd()

        if self.total_steps_counter % self.ap.algorithm.replay_buffer_save_steps == 0:
            self.save_replay_buffer(self.ap.algorithm.replay_buffer_save_path)
            self.save_rnd_images(self.ap.algorithm.replay_buffer_save_path)

    def save_rnd_images(self, dir_path=None):
        if dir_path is None:
            dir_path = os.path.join(self.parent_level_manager.parent_graph_manager.task_parameters.experiment_path,
                                    'rnd_images')
        else:
            dir_path = os.path.join(dir_path, 'rnd_images')
        if not os.path.exists(dir_path):
            os.mkdir(dir_path)
        transitions = self.memory.transitions
        dataset = Batch(transitions)
        batch_size = self.ap.algorithm.rnd_batch_size
        novelties = []
        for i in range(int(dataset.size / batch_size)):
            start = i * batch_size
            end = (i + 1) * batch_size

            batch = Batch(dataset[start:end])
            novelty = self.calculate_novelty(batch)
            novelties.append(novelty)
        novelties = np.concatenate(novelties)
        sorted_indices = np.argsort(novelties)
        sample_indices = sorted_indices[np.round(np.linspace(0, len(sorted_indices) - 1, 100)).astype(np.uint32)]
        images = []
        for si in sample_indices:
            images.append(np.flip(transitions[si].next_state[self.ap.algorithm.env_obs_key], 0))
        rows = []
        for i in range(10):
            rows.append(np.hstack(images[(i * 10):((i + 1) * 10)]))
        image = np.vstack(rows)
        image = Image.fromarray(image)
        image.save('{}/{}_{}.jpeg'.format(dir_path, 'rnd_samples', len(transitions)))
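The excerpt above leaves handle_self_supervised_reward as a pass-through. A subclass could convert the RND novelty into a normalized intrinsic reward roughly along these lines; this is a hypothetical sketch built only from the methods shown above, not the library's own subclass:

class RNDIntrinsicRewardAgent(TD3ExplorationAgent):          # hypothetical subclass
    def handle_self_supervised_reward(self, batch):
        # Per-transition RND prediction error on the normalized next observations.
        novelty = self.calculate_novelty(batch)
        # Track the discounted intrinsic returns and use their running std
        # to keep the intrinsic reward magnitude roughly stationary.
        returns = self.update_intrinsic_returns_estimate(novelty)
        self.rnd_stats.push_val(np.expand_dims(returns, -1))
        intrinsic_rewards = novelty / (self.rnd_stats.std + 1e-8)
        for transition, reward in zip(batch.transitions, intrinsic_rewards):
            transition.reward = reward
        return batch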
Code Example #6
class RewardNormalizationFilter(RewardFilter):
    """
    Normalizes the reward values with a running mean and standard deviation of
    all the rewards seen so far. When working with multiple workers, the statistics used for the normalization operation
    are accumulated over all the workers.
    """
    def __init__(self, clip_min: float = -5.0, clip_max: float = 5.0):
        """
        :param clip_min: The minimum value to allow after normalizing the reward
        :param clip_max: The maximum value to allow after normalizing the reward
        """
        super().__init__()
        self.clip_min = clip_min
        self.clip_max = clip_max
        self.running_rewards_stats = None

    def set_device(self,
                   device,
                   memory_backend_params=None,
                   mode='numpy') -> None:
        """
        An optional function that allows the filter to get the device if it is required to use tensorflow ops
        :param device: the device to use
        :return: None
        """

        if mode == 'tf':
            from rl_coach.architectures.tensorflow_components.shared_variables import TFSharedRunningStats
            self.running_rewards_stats = TFSharedRunningStats(
                device,
                name='rewards_stats',
                create_ops=False,
                pubsub_params=memory_backend_params)
        elif mode == 'numpy':
            self.running_rewards_stats = NumpySharedRunningStats(
                name='rewards_stats', pubsub_params=memory_backend_params)

    def set_session(self, sess) -> None:
        """
        An optional function that allows the filter to get the session if it is required to use tensorflow ops
        :param sess: the session
        :return: None
        """
        self.running_rewards_stats.set_session(sess)

    def filter(self,
               reward: RewardType,
               update_internal_state: bool = True) -> RewardType:
        if update_internal_state:
            self.running_rewards_stats.push(reward)

        reward = (reward - self.running_rewards_stats.mean) / \
                      (self.running_rewards_stats.std + 1e-15)
        reward = np.clip(reward, self.clip_min, self.clip_max)

        return reward

    def get_filtered_reward_space(
            self, input_reward_space: RewardSpace) -> RewardSpace:
        return input_reward_space

    def save_state_to_checkpoint(self, checkpoint_dir: str,
                                 checkpoint_prefix: str):
        self.running_rewards_stats.save_state_to_checkpoint(
            checkpoint_dir, checkpoint_prefix)

    def restore_state_from_checkpoint(self, checkpoint_dir: str,
                                      checkpoint_prefix: str):
        self.running_rewards_stats.restore_state_from_checkpoint(
            checkpoint_dir, checkpoint_prefix)
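In a preset, both normalization filters are typically attached to the agent's input filter. A sketch of that wiring follows, assuming the usual rl_coach layout (InputFilter in rl_coach.filters.filter and its add_observation_filter / add_reward_filter helpers); treat the module path as an assumption:

from rl_coach.filters.filter import InputFilter   # path assumed from the rl_coach package layout

input_filter = InputFilter()
input_filter.add_observation_filter('observation', 'normalize_observation',
                                    ObservationNormalizationFilter())
input_filter.add_reward_filter('normalize_reward', RewardNormalizationFilter())

# agent_params.input_filter = input_filter   # attach to the agent's parameters in a preset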