class ObservationNormalizationFilter(ObservationFilter):
    """
    Normalizes the observation values with a running mean and standard deviation of all the observations seen
    so far. The normalization is performed element-wise. Additionally, when working with multiple workers, the
    statistics used for the normalization operation are accumulated over all the workers.
    """
    def __init__(self, clip_min: float = -5.0, clip_max: float = 5.0, name='observation_stats'):
        """
        :param clip_min: The minimum value to allow after normalizing the observation
        :param clip_max: The maximum value to allow after normalizing the observation
        """
        super().__init__()
        self.clip_min = clip_min
        self.clip_max = clip_max
        self.running_observation_stats = None
        self.name = name
        self.supports_batching = True
        self.observation_space = None

    def set_device(self, device, memory_backend_params=None, mode='numpy') -> None:
        """
        An optional function that allows the filter to get the device if it is required to use tensorflow ops
        :param device: the device to use
        :param memory_backend_params: if not None, holds params for a memory backend for sharing data (e.g. Redis)
        :param mode: the arithmetic module to use {'tf' | 'numpy'}
        :return: None
        """
        if mode == 'tf':
            from rl_coach.architectures.tensorflow_components.shared_variables import TFSharedRunningStats
            self.running_observation_stats = TFSharedRunningStats(device, name=self.name, create_ops=False,
                                                                  pubsub_params=memory_backend_params)
        elif mode == 'numpy':
            self.running_observation_stats = NumpySharedRunningStats(name=self.name,
                                                                     pubsub_params=memory_backend_params)

    def set_session(self, sess) -> None:
        """
        An optional function that allows the filter to get the session if it is required to use tensorflow ops
        :param sess: the session
        :return: None
        """
        self.running_observation_stats.set_session(sess)

    def filter(self, observations: List[ObservationType], update_internal_state: bool = True) -> ObservationType:
        observations = np.array(observations)
        if update_internal_state:
            self.running_observation_stats.push(observations)
            self.last_mean = self.running_observation_stats.mean
            self.last_stdev = self.running_observation_stats.std

        return self.running_observation_stats.normalize(observations)

    def get_filtered_observation_space(self, input_observation_space: ObservationSpace) -> ObservationSpace:
        self.running_observation_stats.set_params(shape=input_observation_space.shape,
                                                  clip_values=(self.clip_min, self.clip_max))
        return input_observation_space

    def save_state_to_checkpoint(self, checkpoint_dir: str, checkpoint_prefix: str):
        self.running_observation_stats.save_state_to_checkpoint(checkpoint_dir, checkpoint_prefix)

    def restore_state_from_checkpoint(self, checkpoint_dir: str, checkpoint_prefix: str):
        self.running_observation_stats.restore_state_from_checkpoint(checkpoint_dir, checkpoint_prefix)
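# --- Illustrative sketch (not part of the original source) ---
# A standalone NumPy mock-up of what ObservationNormalizationFilter computes: an element-wise
# running mean/std accumulated over all observations seen so far, used to standardize new
# observations and clip them to [clip_min, clip_max]. It does not use rl_coach's shared-stats
# classes and ignores multi-worker accumulation; class and attribute names are hypothetical.

import numpy as np


class _RunningObservationNormalizerSketch:
    def __init__(self, shape, clip_min=-5.0, clip_max=5.0, epsilon=1e-8):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.count = epsilon
        self.clip_min, self.clip_max = clip_min, clip_max

    def push(self, batch):
        # batch has shape (n, *shape); combine batch statistics into the running mean/var
        # using the Chan et al. parallel update (a batched form of Welford's algorithm)
        batch = np.asarray(batch, dtype=np.float64)
        batch_mean, batch_var, n = batch.mean(axis=0), batch.var(axis=0), batch.shape[0]
        delta = batch_mean - self.mean
        total = self.count + n
        self.mean = self.mean + delta * n / total
        m_a = self.var * self.count
        m_b = batch_var * n
        self.var = (m_a + m_b + delta ** 2 * self.count * n / total) / total
        self.count = total

    def normalize(self, batch):
        # standardize element-wise and clip, mirroring filter() above
        std = np.sqrt(self.var) + 1e-8
        return np.clip((np.asarray(batch) - self.mean) / std, self.clip_min, self.clip_max)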
class TD3ExplorationAgent(TD3Agent):
    def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
        super().__init__(agent_parameters, parent)
        self.rnd_stats = NumpySharedRunningStats(name='RND_normalization', epsilon=1e-8)
        self.rnd_stats.set_params()
        self.rnd_obs_stats = NumpySharedRunningStats(name='RND_observation_normalization', epsilon=1e-8)
        self.intrinsic_returns_estimate = None

    def update_intrinsic_returns_estimate(self, rewards):
        returns = np.zeros_like(rewards)
        for i, r in enumerate(rewards):
            if self.intrinsic_returns_estimate is None:
                self.intrinsic_returns_estimate = r
            else:
                self.intrinsic_returns_estimate = \
                    self.intrinsic_returns_estimate * self.ap.algorithm.discount + r
            returns[i] = self.intrinsic_returns_estimate
        return returns

    def prepare_rnd_inputs(self, batch):
        env_obs_key = self.ap.algorithm.env_obs_key
        next_states = batch.next_states([env_obs_key])
        inputs = {env_obs_key: self.rnd_obs_stats.normalize(next_states[env_obs_key])}
        return inputs

    def handle_self_supervised_reward(self, batch):
        """
        Allows agents to update the batch for self-supervised learning

        :param batch: original training batch
        :return: updated training batch
        """
        return batch

    def update_transition_before_adding_to_replay_buffer(self, transition: Transition) -> Transition:
        """
        Allows agents to update the transition just before adding it to the replay buffer.
        Can be useful for agents that want to tweak the reward, termination signal, etc.

        :param transition: the transition to update
        :return: the updated transition
        """
        transition = super().update_transition_before_adding_to_replay_buffer(transition)

        image = np.array(transition.state[self.ap.algorithm.env_obs_key])
        if self.rnd_obs_stats.n < 1:
            self.rnd_obs_stats.set_params(shape=image.shape, clip_values=[-5, 5])
        self.rnd_obs_stats.push_val(np.expand_dims(image, 0))

        return transition

    def train_rnd(self):
        if self.memory.num_transitions() == 0:
            return

        transitions = self.memory.transitions[-self.ap.algorithm.rnd_sample_size:]
        dataset = Batch(transitions)
        dataset_order = list(range(dataset.size))
        batch_size = self.ap.algorithm.rnd_batch_size
        for epoch in range(self.ap.algorithm.rnd_optimization_epochs):
            shuffle(dataset_order)
            total_loss = 0
            total_grads = 0
            for i in range(int(dataset.size / batch_size)):
                start = i * batch_size
                end = (i + 1) * batch_size

                batch = Batch(list(np.array(dataset.transitions)[dataset_order[start:end]]))
                inputs = self.prepare_rnd_inputs(batch)
                const_embedding = self.networks['constant'].online_network.predict(inputs)

                res = self.networks['predictor'].train_and_sync_networks(inputs, [const_embedding])

                total_loss += res[0]
                total_grads += res[2]

            screen.log_dict(
                OrderedDict([
                    ("training epoch", epoch),
                    ("dataset size", dataset.size),
                    ("mean loss", total_loss / dataset.size),
                    ("mean gradients", total_grads / dataset.size)
                ]),
                prefix="RND Training"
            )

    def learn_from_batch(self, batch):
        batch = self.handle_self_supervised_reward(batch)
        return super().learn_from_batch(batch)

    def train(self):
        self.ap.algorithm.num_consecutive_training_steps = \
            int(self.current_episode_steps_counter * self.ap.algorithm.td3_training_ratio)
        return Agent.train(self)

    def calculate_novelty(self, batch):
        inputs = self.prepare_rnd_inputs(batch)
        embedding = self.networks['constant'].online_network.predict(inputs)
        prediction = self.networks['predictor'].online_network.predict(inputs)
        prediction_error = np.mean((embedding - prediction) ** 2, axis=1)
        return prediction_error

    def save_replay_buffer(self, dir_path=None):
        if dir_path is None:
            dir_path = os.path.join(self.parent_level_manager.parent_graph_manager.task_parameters.experiment_path,
                                    'replay_buffer')
        if not os.path.exists(dir_path):
            os.mkdir(dir_path)

        path = os.path.join(dir_path, 'RB_{}.joblib.bz2'.format(type(self).__name__))
        joblib.dump(self.memory.get_all_complete_episodes(), path, compress=('bz2', 1))

        screen.log('Saved replay buffer to: "{}" - Number of transitions: {}'.format(
            path, self.memory.num_transitions()))

    def handle_episode_ended(self) -> None:
        super().handle_episode_ended()

        if self.total_steps_counter % self.ap.algorithm.rnd_sample_size == 0:
            self.train_rnd()

        if self.total_steps_counter % self.ap.algorithm.replay_buffer_save_steps == 0:
            self.save_replay_buffer(self.ap.algorithm.replay_buffer_save_path)
            self.save_rnd_images(self.ap.algorithm.replay_buffer_save_path)

    def save_rnd_images(self, dir_path=None):
        if dir_path is None:
            dir_path = os.path.join(self.parent_level_manager.parent_graph_manager.task_parameters.experiment_path,
                                    'rnd_images')
        else:
            dir_path = os.path.join(dir_path, 'rnd_images')
        if not os.path.exists(dir_path):
            os.mkdir(dir_path)

        transitions = self.memory.transitions
        dataset = Batch(transitions)
        batch_size = self.ap.algorithm.rnd_batch_size
        novelties = []
        for i in range(int(dataset.size / batch_size)):
            start = i * batch_size
            end = (i + 1) * batch_size
            batch = Batch(dataset[start:end])
            novelty = self.calculate_novelty(batch)
            novelties.append(novelty)
        novelties = np.concatenate(novelties)
        sorted_indices = np.argsort(novelties)
        sample_indices = sorted_indices[np.round(np.linspace(0, len(sorted_indices) - 1, 100)).astype(np.uint32)]
        images = []
        for si in sample_indices:
            images.append(np.flip(transitions[si].next_state[self.ap.algorithm.env_obs_key], 0))
        rows = []
        for i in range(10):
            rows.append(np.hstack(images[(i * 10):((i + 1) * 10)]))
        image = np.vstack(rows)
        image = Image.fromarray(image)
        image.save('{}/{}_{}.jpeg'.format(dir_path, 'rnd_samples', len(transitions)))
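# --- Illustrative sketch (not part of the original source) ---
# A toy NumPy version of the RND (Random Network Distillation) novelty signal used by
# TD3ExplorationAgent above: a fixed, randomly initialized "target" embedding and a trainable
# predictor fitted by gradient descent on the MSE between the two. Observations the predictor
# has not fit well yet produce a large prediction error, which serves as the novelty score
# (cf. calculate_novelty()). Both networks here are plain linear maps for brevity; the agent
# above uses the 'constant' and 'predictor' rl_coach networks instead. The function name and
# hyperparameters are hypothetical.

import numpy as np


def _rnd_novelty_sketch(seen_obs, query_obs, embedding_dim=16, train_steps=200, lr=1e-2, seed=0):
    rng = np.random.RandomState(seed)
    seen = np.asarray(seen_obs, dtype=np.float64)      # (n_seen, obs_dim), ideally pre-normalized
    query = np.asarray(query_obs, dtype=np.float64)    # (n_query, obs_dim)
    target_w = rng.randn(seen.shape[1], embedding_dim)       # fixed random target network
    predictor_w = np.zeros((seen.shape[1], embedding_dim))   # trainable predictor

    # fit the predictor to the target's embeddings of previously seen observations
    target = seen @ target_w
    for _ in range(train_steps):
        grad = 2.0 * seen.T @ (seen @ predictor_w - target) / seen.shape[0]  # d(MSE)/d(predictor_w)
        predictor_w -= lr * grad

    # novelty of each query observation = mean squared prediction error
    return np.mean((query @ predictor_w - query @ target_w) ** 2, axis=1)

# Usage idea: observations far from the training distribution typically score higher, e.g.
#   seen = np.random.randn(256, 8); shifted = np.random.randn(32, 8) + 3.0
#   _rnd_novelty_sketch(seen, shifted).mean() is expected to exceed _rnd_novelty_sketch(seen, seen).mean()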
class RewardNormalizationFilter(RewardFilter):
    """
    Normalizes the reward values with a running mean and standard deviation of all the rewards seen so far.
    When working with multiple workers, the statistics used for the normalization operation are accumulated
    over all the workers.
    """
    def __init__(self, clip_min: float = -5.0, clip_max: float = 5.0):
        """
        :param clip_min: The minimum value to allow after normalizing the reward
        :param clip_max: The maximum value to allow after normalizing the reward
        """
        super().__init__()
        self.clip_min = clip_min
        self.clip_max = clip_max
        self.running_rewards_stats = None

    def set_device(self, device, memory_backend_params=None, mode='numpy') -> None:
        """
        An optional function that allows the filter to get the device if it is required to use tensorflow ops
        :param device: the device to use
        :param memory_backend_params: if not None, holds params for a memory backend for sharing data (e.g. Redis)
        :param mode: the arithmetic module to use {'tf' | 'numpy'}
        :return: None
        """
        if mode == 'tf':
            from rl_coach.architectures.tensorflow_components.shared_variables import TFSharedRunningStats
            self.running_rewards_stats = TFSharedRunningStats(device, name='rewards_stats', create_ops=False,
                                                              pubsub_params=memory_backend_params)
        elif mode == 'numpy':
            self.running_rewards_stats = NumpySharedRunningStats(name='rewards_stats',
                                                                 pubsub_params=memory_backend_params)

    def set_session(self, sess) -> None:
        """
        An optional function that allows the filter to get the session if it is required to use tensorflow ops
        :param sess: the session
        :return: None
        """
        self.running_rewards_stats.set_session(sess)

    def filter(self, reward: RewardType, update_internal_state: bool = True) -> RewardType:
        if update_internal_state:
            self.running_rewards_stats.push(reward)

        reward = (reward - self.running_rewards_stats.mean) / \
                 (self.running_rewards_stats.std + 1e-15)
        reward = np.clip(reward, self.clip_min, self.clip_max)

        return reward

    def get_filtered_reward_space(self, input_reward_space: RewardSpace) -> RewardSpace:
        return input_reward_space

    def save_state_to_checkpoint(self, checkpoint_dir: str, checkpoint_prefix: str):
        self.running_rewards_stats.save_state_to_checkpoint(checkpoint_dir, checkpoint_prefix)

    def restore_state_from_checkpoint(self, checkpoint_dir: str, checkpoint_prefix: str):
        self.running_rewards_stats.restore_state_from_checkpoint(checkpoint_dir, checkpoint_prefix)
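# --- Illustrative sketch (not part of the original source) ---
# A scalar mock-up of what RewardNormalizationFilter.filter() does: maintain a running mean/std
# over all rewards seen so far (Welford's online update), then standardize each incoming reward
# and clip it to [clip_min, clip_max]. The shared multi-worker backends are omitted and the class
# name is hypothetical.

import numpy as np


class _RunningRewardNormalizerSketch:
    def __init__(self, clip_min=-5.0, clip_max=5.0):
        self.count, self.mean, self.m2 = 0, 0.0, 0.0   # m2 = running sum of squared deviations
        self.clip_min, self.clip_max = clip_min, clip_max

    def filter(self, reward, update_internal_state=True):
        if update_internal_state:
            # Welford's online update of the running mean and variance
            self.count += 1
            delta = reward - self.mean
            self.mean += delta / self.count
            self.m2 += delta * (reward - self.mean)
        std = np.sqrt(self.m2 / self.count) if self.count > 0 else 0.0
        return float(np.clip((reward - self.mean) / (std + 1e-15), self.clip_min, self.clip_max))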