def update_step(self, replay_buffer_iter, train_target='both'):
  """Performs a single training step for critic and actor.

  Args:
    replay_buffer_iter: An iterator over transitions (a TensorFlow dataset
      iterator or a NumPy array).
    train_target: Unused; kept for interface compatibility with agents that
      update the RL objective and the representation separately.

  Returns:
    Dictionary with losses to track.
  """
  del train_target
  transition = next(replay_buffer_iter)
  numpy_dataset = isinstance(replay_buffer_iter, np.ndarray)
  # observation: n_batch x n_timesteps x 1 x H*W*3*n_frames x 1 ->
  # n_batch x H x W x 3*n_frames
  if not numpy_dataset:
    states = transition.observation[:, 0]
    next_states = transition.observation[:, 1]
    actions = transition.action[:, 0]
    rewards = transition.reward[:, 0]
    discounts = transition.discount[:, 0]

    if transition.observation.dtype == tf.uint8:
      states = tf.cast(states, tf.float32) / 255.
      next_states = tf.cast(next_states, tf.float32) / 255.
  else:
    states, actions, rewards, next_states, discounts = transition

  if self.num_augmentations > 0:
    states, next_states = tf_utils.image_aug(
        states,
        next_states,
        img_pad=4,
        num_augmentations=self.num_augmentations)

  if not self.discrete_actions:
    actor_dict = self.fit_actor(
        states[0] if self.num_augmentations > 0 else states, actions)

  next_actions = self.select_actions(
      next_states[0] if self.num_augmentations > 0 else next_states)

  if self.discrete_actions:
    actions = tf.cast(tf.one_hot(actions, depth=self.action_dim), tf.float32)
    next_actions = tf.cast(
        tf.one_hot(next_actions, depth=self.action_dim), tf.float32)

  critic_dict = self.critic_learner.fit_critic(
      states[0] if self.num_augmentations > 0 else states, actions,
      next_states[0] if self.num_augmentations > 0 else next_states,
      next_actions, rewards, discounts)

  if self.discrete_actions:
    return critic_dict
  else:
    return {**actor_dict, **critic_dict}
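# --- Illustrative example (not part of the original file) ---
# A minimal runnable sketch of the uint8 normalization used above: byte-valued
# pixel observations are rescaled to [0, 1] floats before entering the
# networks. Toy values only.
import tensorflow as tf

obs = tf.constant([[0, 128, 255]], dtype=tf.uint8)
scaled = tf.cast(obs, tf.float32) / 255.
tf.debugging.assert_near(scaled, [[0., 128. / 255., 1.]])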
def update_step(self, replay_buffer_iter, train_target='both'):
  """Performs a single training step for critic and embedding.

  Args:
    replay_buffer_iter: An iterator over transitions (a TensorFlow dataset
      iterator or a NumPy array).
    train_target: One of 'both', 'encoder' or 'rl'; selects whether to update
      the representation, the RL objective, or both.

  Returns:
    Dictionary with losses to track.
  """
  transition = next(replay_buffer_iter)
  numpy_dataset = isinstance(replay_buffer_iter, np.ndarray)
  # observation: n_batch x n_timesteps x 1 x H*W*3*n_frames x 1 ->
  # n_batch x H x W x 3*n_frames
  if not numpy_dataset:
    states = transition.observation[:, 0]
    next_states = transition.observation[:, 1]
    actions = transition.action[:, 0]
    rewards = transition.reward[:, 0]
    discounts = transition.discount[:, 0]

    if transition.observation.dtype == tf.uint8:
      states = tf.cast(states, tf.float32) / 255.
      next_states = tf.cast(next_states, tf.float32) / 255.
  else:
    states, actions, rewards, next_states, discounts = transition

  if self.num_augmentations > 0:
    states, next_states = tf_utils.image_aug(
        states,
        next_states,
        img_pad=4,
        num_augmentations=self.num_augmentations,
        obs_dim=64,
        channels=3,
        cropped_shape=[self.batch_size, 68, 68, 3])

  next_actions = self.act(next_states, data_aug=True)

  if train_target == 'both':
    ssl_dict = self.fit_embedding(states, actions, next_states, next_actions,
                                  rewards, discounts)
    critic_dict = self.fit_critic(states, actions, next_states, next_actions,
                                  rewards, discounts)
  elif train_target == 'encoder':
    ssl_dict = self.fit_embedding(states, actions, next_states, next_actions,
                                  rewards, discounts)
    critic_dict = {}
  elif train_target == 'rl':
    ssl_dict = {}
    critic_dict = self.fit_critic(states, actions, next_states, next_actions,
                                  rewards, discounts)
  else:
    raise ValueError('Unknown train_target: %s' % train_target)

  return {**ssl_dict, **critic_dict}
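# --- Illustrative example (not part of the original file) ---
# A hedged sketch of the pad-and-random-crop augmentation that
# `tf_utils.image_aug(img_pad=4, ...)` appears to perform. This is a minimal
# stand-in for illustration only: the real helper also handles the paired
# states/next_states, multiple augmentations, and the `cropped_shape`
# argument, so its exact behavior may differ.
import tensorflow as tf

def random_crop_aug(images, img_pad=4):
  """Pads each image by `img_pad` pixels, then randomly crops back."""
  batch, height, width, channels = images.shape
  padded = tf.pad(
      images, [[0, 0], [img_pad, img_pad], [img_pad, img_pad], [0, 0]],
      mode='SYMMETRIC')
  return tf.image.random_crop(padded, size=[batch, height, width, channels])

states = tf.random.uniform([8, 64, 64, 3])  # toy 64x64 RGB batch
assert random_crop_aug(states).shape == states.shape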
def update_step(self, replay_buffer_iter, numpy_dataset):
  """Performs a single training step for critic and actor.

  Args:
    replay_buffer_iter: An iterator over transitions (a TensorFlow dataset
      iterator or a NumPy array).
    numpy_dataset: Whether the dataset is a NumPy array.

  Returns:
    Dictionary with losses to track.
  """
  transition = next(replay_buffer_iter)
  # observation: n_batch x n_timesteps x 1 x H*W*3*n_frames x 1 ->
  # n_batch x H x W x 3*n_frames
  if not numpy_dataset:
    states = transition.observation[:, 0]
    next_states = transition.observation[:, 1]
    actions = transition.action[:, 0]
    rewards = transition.reward[:, 0]
    discounts = transition.discount[:, 0]

    if transition.observation.dtype == tf.uint8:
      states = tf.cast(states, tf.float32) / 255.
      next_states = tf.cast(next_states, tf.float32) / 255.
  else:
    states, actions, rewards, next_states, discounts = transition

  if self.num_augmentations > 0:
    states, next_states = tf_utils.image_aug(
        states,
        next_states,
        img_pad=4,
        num_augmentations=self.num_augmentations,
        obs_dim=64,
        channels=3,
        cropped_shape=[self.batch_size, 68, 68, 3])

  rewards = rewards + self.reward_bonus

  if self.discrete_actions:
    actions = tf.cast(tf.one_hot(actions, depth=self.action_dim), tf.float32)

  critic_dict = self.fit_critic(states, actions, next_states, rewards,
                                discounts)

  actor_dict = self.fit_actor(
      states[0] if self.num_augmentations > 0 else states)

  return {**actor_dict, **critic_dict}
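# --- Illustrative example (not part of the original file) ---
# A runnable sketch of the discrete-action handling above: integer action ids
# are converted to one-hot float vectors so the critic consumes them in the
# same form as continuous actions.
import tensorflow as tf

action_dim = 4
actions = tf.constant([0, 3, 1])  # integer action ids
one_hot = tf.cast(tf.one_hot(actions, depth=action_dim), tf.float32)
# e.g. action id 3 -> [0., 0., 0., 1.]
tf.debugging.assert_near(one_hot[1], [0., 0., 0., 1.])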
def update_step(self, replay_buffer_iter, numpy_dataset, train_target='both'):
  """Performs a single training step for the critic.

  Args:
    replay_buffer_iter: An iterator over transitions (a TensorFlow dataset
      iterator or a NumPy array).
    numpy_dataset: Whether the dataset is a NumPy array.
    train_target: Unused; kept for interface compatibility with agents that
      also train a representation.

  Returns:
    Dictionary with losses to track.
  """
  del train_target
  transition = next(replay_buffer_iter)
  # observation: n_batch x n_timesteps x 1 x H*W*3*n_frames x 1 ->
  # n_batch x H x W x 3*n_frames
  if not numpy_dataset:
    states = transition.observation[:, 0]
    next_states = transition.observation[:, 1]
    actions = transition.action[:, 0]
    rewards = transition.reward[:, 0]
    discounts = transition.discount[:, 0]

    if transition.observation.dtype == tf.uint8:
      states = tf.cast(states, tf.float32) / 255.
      next_states = tf.cast(next_states, tf.float32) / 255.
  else:
    states, actions, rewards, next_states, discounts = transition

  if self.num_augmentations > 0:
    states, next_states = tf_utils.image_aug(
        states,
        next_states,
        img_pad=4,
        num_augmentations=self.num_augmentations,
        obs_dim=64,
        channels=3,
        cropped_shape=[self.batch_size, 68, 68, 3])

  next_actions = self.act(next_states, data_aug=True)
  # Entropy correction of the bootstrap reward is currently disabled:
  # entropy_rewards = self.discount * discounts * self.alpha * next_log_probs
  # rewards -= entropy_rewards

  critic_dict = self.fit_critic(states, actions, next_states, next_actions,
                                rewards, discounts)

  return critic_dict
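# --- Illustrative example (not part of the original file) ---
# A hedged sketch of the SAC-style entropy correction that the commented-out
# lines above would apply: the bootstrap reward is adjusted by the discounted
# entropy term alpha * log pi(a'|s'). All values below are toy stand-ins.
import tensorflow as tf

discount = 0.99
alpha = 0.1
rewards = tf.constant([1.0, 0.5])
discounts = tf.constant([1.0, 0.0])          # 0 at terminal transitions
next_log_probs = tf.constant([-1.2, -0.7])   # log pi(a'|s') from the policy

entropy_rewards = discount * discounts * alpha * next_log_probs
adjusted_rewards = rewards - entropy_rewards
# Log-probs are negative, so subtracting them adds an entropy bonus.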
def update_step(self, replay_buffer_iter, train_target='both'):
  """Performs a single training step for critic and embedding.

  Args:
    replay_buffer_iter: An iterator over transitions (a TensorFlow dataset
      iterator or a NumPy array).
    train_target: One of 'both', 'encoder' or 'rl'; selects whether to update
      the representation, the RL objective, or both.

  Returns:
    Dictionary with losses to track.
  """
  transition = next(replay_buffer_iter)
  numpy_dataset = isinstance(replay_buffer_iter, np.ndarray)

  if not numpy_dataset:
    states = transition.observation[:, 0]
    next_states = transition.observation[:, 1]
    actions = transition.action[:, 0]
    rewards = transition.reward[:, 0]
    discounts = transition.discount[:, 0]

    if transition.observation.dtype == tf.uint8:
      states = tf.cast(states, tf.float32) / 255.
      next_states = tf.cast(next_states, tf.float32) / 255.
  else:
    states, actions, rewards, next_states, discounts = transition

  if self.num_augmentations > 0:
    states, next_states = tf_utils.image_aug(
        states,
        next_states,
        img_pad=4,
        num_augmentations=self.num_augmentations,
        obs_dim=64,
        channels=3,
        cropped_shape=[self.batch_size, 68, 68, 3])

  next_actions = self.act(next_states, data_aug=True)

  if train_target == 'both':
    ssl_dict = self.fit_embedding(states, actions, next_states, next_actions,
                                  rewards, discounts)
    critic_dict = self.fit_critic(states, actions, next_states, next_actions,
                                  rewards, discounts)
  elif train_target == 'encoder':
    ssl_dict = self.fit_embedding(states, actions, next_states, next_actions,
                                  rewards, discounts)
    critic_dict = {}
  elif train_target == 'rl':
    ssl_dict = {}
    critic_dict = self.fit_critic(states, actions, next_states, next_actions,
                                  rewards, discounts)
  else:
    raise ValueError('Unknown train_target: %s' % train_target)

  return {**ssl_dict, **critic_dict}
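# --- Illustrative example (not part of the original file) ---
# One plausible way to drive the `train_target` flag above: pretrain the
# encoder with representation-only updates, then switch to RL-only updates.
# `agent`, `dataset_iter`, and the step counts are hypothetical names, not
# part of the original API.
def pretrain_then_rl(agent, dataset_iter, encoder_steps=10_000,
                     rl_steps=100_000):
  """Runs encoder-only updates first, then RL-only updates."""
  for _ in range(encoder_steps):
    agent.update_step(dataset_iter, train_target='encoder')
  for _ in range(rl_steps):
    agent.update_step(dataset_iter, train_target='rl')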
def update_step(self, replay_buffer_iter, train_target='both'):
  """Performs a single training step for critic and embedding.

  Args:
    replay_buffer_iter: An iterator over transitions (a TensorFlow dataset
      iterator or a NumPy array).
    train_target: One of 'both', 'encoder' or 'rl'; selects which set of
      objectives to update.

  Returns:
    Dictionary with losses to track.
  """
  transition = next(replay_buffer_iter)
  numpy_dataset = isinstance(replay_buffer_iter, np.ndarray)
  # observation: n_batch x n_timesteps x 1 x H*W*3*n_frames x 1
  # -> n_batch x H x W x 3*n_frames
  if not numpy_dataset:
    states = transition.observation[:, 0]
    next_states = transition.observation[:, 1]
    actions = transition.action[:, 0]
    rewards = transition.reward
    level_ids = transition.policy_info[:, 0]

    if tf.shape(transition.reward)[1] > 2:
      # Collapse the n-step reward sequence into a single discounted return:
      # R = sum_k discount^k * r_k.
      rewards = tf.einsum(
          'ij,j->i', rewards,
          self.discount**tf.range(
              0, tf.shape(transition.reward)[1], dtype=tf.float32))
      self.n_step_rewards = tf.shape(transition.reward)[1]
    else:
      rewards = transition.reward[:, 0]
      self.n_step_rewards = 1

    discounts = transition.discount[:, 0]

    if transition.observation.dtype == tf.uint8:
      states = tf.cast(states, tf.float32) / 255.
      next_states = tf.cast(next_states, tf.float32) / 255.
  else:
    states, actions, rewards, next_states, discounts = transition

  self.reward_normalizer.update_normalization_statistics(rewards)

  if self.num_augmentations > 0:
    states, next_states = tf_utils.image_aug(
        states,
        next_states,
        img_pad=4,
        num_augmentations=self.num_augmentations,
        obs_dim=64,
        channels=3,
        cropped_shape=[self.batch_size, 68, 68, 3])

  next_actions_pi = self.act(next_states, data_aug=True)
  next_actions_mu = transition.action[:, 1]
  next_actions_pi_per_level = next_actions_mu

  # Both objectives currently see the same batch; the *_b1/*_b2 aliases keep
  # the option open to feed them separate batches.
  states_b1 = states
  next_states_b1 = next_states
  actions_b1 = actions
  next_actions_b1 = next_actions_pi
  rewards_b1 = rewards
  discounts_b1 = discounts
  level_ids_b1 = level_ids

  states_b2 = states
  next_states_b2 = next_states
  actions_b2 = actions
  next_actions_b2 = next_actions_pi
  rewards_b2 = rewards
  discounts_b2 = discounts

  if train_target == 'both':
    critic_dict = self.fit_critic(states_b2, actions_b2, next_states_b2,
                                  next_actions_b2, rewards_b2, discounts_b2)
    print('Updating per-task critics')
    ssl_dict = {}
    critic_distillation_dict = self.fit_task_critics(
        states_b1, actions_b1, next_states_b1, next_actions_pi_per_level,
        rewards_b1, discounts_b1, level_ids_b1)
    print('Done updating per-task critics')
    return {**ssl_dict, **critic_dict, **critic_distillation_dict}
  elif train_target == 'encoder':
    print('Updating per-task critics')
    critic_distillation_dict = self.fit_task_critics(
        states_b1, actions_b1, next_states_b1, next_actions_pi_per_level,
        rewards_b1, discounts_b1, level_ids_b1)
    print('Done updating per-task critics')
    ssl_dict = {}
    critic_dict = {}
    return {**ssl_dict, **critic_distillation_dict}
  elif train_target == 'rl':
    critic_distillation_dict = {}
    critic_dict = self.fit_critic(states_b2, actions_b2, next_states_b2,
                                  next_actions_b2, rewards_b2, discounts_b2)
    ssl_dict = self.fit_embedding(states_b1, actions_b1, next_states_b1,
                                  next_actions_b1, rewards_b1, discounts_b1,
                                  level_ids)
    return {**ssl_dict, **critic_dict, **critic_distillation_dict}
  else:
    raise ValueError('Unknown train_target: %s' % train_target)
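# --- Illustrative example (not part of the original file) ---
# A runnable worked example of the n-step return computed with `tf.einsum`
# above: each row of `rewards` holds n consecutive rewards, and contracting it
# with the vector of discount powers yields R = sum_k discount^k * r_k.
import tensorflow as tf

discount = 0.99
rewards = tf.constant([[1.0, 0.0, 2.0]])              # one 3-step reward row
powers = discount**tf.range(0, 3, dtype=tf.float32)   # [1., 0.99, 0.9801]
n_step_return = tf.einsum('ij,j->i', rewards, powers)
manual = 1.0 + 0.0 * discount + 2.0 * discount**2     # 2.9602
tf.debugging.assert_near(n_step_return, [manual])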
def update_step(self, replay_buffer_iter):
  """Performs a single behavioral cloning training step for the policy.

  Args:
    replay_buffer_iter: An iterator over transitions (a TensorFlow dataset
      iterator or a NumPy array).

  Returns:
    Dictionary with losses to track.
  """
  transition = next(replay_buffer_iter)
  numpy_dataset = isinstance(replay_buffer_iter, np.ndarray)
  # observation: n_batch x n_timesteps x 1 x H*W*3*n_frames x 1 ->
  # n_batch x H x W x 3*n_frames
  if not numpy_dataset:
    states = transition.observation[:, 0]
    next_states = transition.observation[:, 1]
    actions = transition.action[:, 0]

    if transition.observation.dtype == tf.uint8:
      states = tf.cast(states, tf.float32) / 255.
      next_states = tf.cast(next_states, tf.float32) / 255.
  else:
    states, actions, _, next_states, _ = transition

  if self.num_augmentations > 0:
    states, next_states = tf_utils.image_aug(
        states,
        next_states,
        img_pad=4,
        num_augmentations=self.num_augmentations,
        obs_dim=64,
        channels=3,
        cropped_shape=[self.batch_size, 68, 68, 3])
    # image_aug returns one batch per augmentation; train on the first view.
    states = states[0]
    next_states = next_states[0]

  variables = self.policy.trainable_variables

  with tf.GradientTape(watch_accessed_variables=False) as tape:
    tape.watch(variables)
    log_probs, entropy = self.policy.log_probs(
        states, actions, with_entropy=True)
    # Maximize the log-likelihood of the dataset actions; the entropy bonus
    # (self.alpha * entropy) is currently disabled.
    loss = -tf.reduce_mean(log_probs)

  grads = tape.gradient(loss, variables)
  self.optimizer.apply_gradients(zip(grads, variables))

  # Automatic alpha tuning against a target entropy is disabled as well, so
  # report a zero alpha loss.
  alpha_loss = tf.constant(0.)

  return {
      'bc_actor_loss': loss,
      'bc_alpha': self.alpha,
      'bc_alpha_loss': alpha_loss,
      'bc_log_probs': tf.reduce_mean(log_probs),
      'bc_entropy': tf.reduce_mean(entropy)
  }
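# --- Illustrative example (not part of the original file) ---
# A self-contained sketch of the behavioral cloning objective above: minimize
# -E[log pi(a|s)] over dataset actions. The tiny categorical policy and all
# shapes here are toy assumptions, not the original `self.policy` class.
import tensorflow as tf

policy_net = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(4),                 # logits over 4 discrete actions
])
optimizer = tf.keras.optimizers.Adam(1e-3)

states = tf.random.uniform([16, 8])           # toy state batch
actions = tf.random.uniform([16], maxval=4, dtype=tf.int32)

with tf.GradientTape() as tape:
  log_probs_all = tf.nn.log_softmax(policy_net(states))
  log_probs = tf.gather(log_probs_all, actions, batch_dims=1)
  bc_loss = -tf.reduce_mean(log_probs)        # behavioral cloning loss

grads = tape.gradient(bc_loss, policy_net.trainable_variables)
optimizer.apply_gradients(zip(grads, policy_net.trainable_variables))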