def _train_value_function(self, paths):
    """Train the value function.

    Args:
        paths (list[dict]): A list of collected paths.

    Returns:
        torch.Tensor: Calculated mean scalar value of value function
            loss (float).

    """
    # MAML resets a value function to its initial state before training.
    self._value_function.load_state_dict(self._initial_vf_state)

    obs = np.concatenate([path['observations'] for path in paths], axis=0)
    returns = np.concatenate([path['returns'] for path in paths])

    obs = np_to_torch(obs)
    returns = np_to_torch(returns.astype(np.float32))

    vf_loss = self._value_function.compute_loss(obs, returns)
    # pylint: disable=protected-access
    zero_optim_grads(self._inner_algo._vf_optimizer._optimizer)
    vf_loss.backward()
    # pylint: disable=protected-access
    self._inner_algo._vf_optimizer.step()

    return vf_loss
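
# --- Illustrative sketch (not garage API) ----------------------------------
# A minimal, standalone approximation of the regression step performed by
# _train_value_function above: reset the value network to a saved initial
# state, then take one gradient step on a loss computed from
# (observation, return) pairs. The plain nn.Sequential MLP, MSE loss, and
# Adam optimizer below are assumptions standing in for garage's value
# function and wrapped optimizer; the toy `paths` list only mimics the
# fields the method actually reads.
import copy

import numpy as np
import torch
from torch import nn

value_net = nn.Sequential(nn.Linear(4, 32), nn.Tanh(), nn.Linear(32, 1))
initial_state = copy.deepcopy(value_net.state_dict())  # analogue of _initial_vf_state
optimizer = torch.optim.Adam(value_net.parameters(), lr=1e-3)

# Fake paths with the fields the method reads: 'observations' and 'returns'.
paths = [{'observations': np.random.randn(10, 4).astype(np.float32),
          'returns': np.random.randn(10).astype(np.float32)}
         for _ in range(3)]

value_net.load_state_dict(initial_state)  # reset before (re)training
obs = torch.as_tensor(np.concatenate([p['observations'] for p in paths]))
returns = torch.as_tensor(np.concatenate([p['returns'] for p in paths]))

vf_loss = nn.functional.mse_loss(value_net(obs).squeeze(-1), returns)
optimizer.zero_grad()
vf_loss.backward()
optimizer.step()
# ---------------------------------------------------------------------------
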
def adapt_policy(self, exploration_policy, exploration_episodes):
    """Produce a policy adapted for a task.

    Args:
        exploration_policy (Policy): A policy which was returned from
            get_exploration_policy(), and which generated
            exploration_episodes by interacting with an environment.
            The caller may not use this object after passing it into
            this method.
        exploration_episodes (EpisodeBatch): Episodes to which to adapt,
            generated by exploration_policy exploring the environment.

    Returns:
        Policy: A policy adapted to the task represented by the
            exploration_episodes.

    """
    total_steps = sum(exploration_episodes.lengths)
    o = exploration_episodes.observations
    a = exploration_episodes.actions
    r = exploration_episodes.rewards.reshape(total_steps, 1)
    ctxt = np.hstack((o, a, r)).reshape(1, total_steps, -1)
    context = np_to_torch(ctxt)
    self._policy.infer_posterior(context)

    return self._policy
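
# --- Illustrative sketch (not garage API) ----------------------------------
# How adapt_policy packs exploration data into a single context tensor of
# shape (1, total_steps, obs_dim + act_dim + 1) before posterior inference.
# The random arrays below are toy stand-ins for EpisodeBatch.observations,
# .actions, and .rewards.
import numpy as np

total_steps = 5
obs = np.random.randn(total_steps, 3)    # (N, obs_dim)
acts = np.random.randn(total_steps, 2)   # (N, act_dim)
rews = np.random.randn(total_steps)      # (N,)

ctxt = np.hstack((obs, acts, rews.reshape(total_steps, 1)))
ctxt = ctxt.reshape(1, total_steps, -1)  # leading axis: a single task
assert ctxt.shape == (1, total_steps, 3 + 2 + 1)
# ---------------------------------------------------------------------------
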
def get_action(self, observation):
    r"""Get a single action given an observation.

    Args:
        observation (np.ndarray): Observation from the environment.
            Shape is :math:`env_spec.observation_space`.

    Returns:
        tuple:
            * np.ndarray: Predicted action. Shape is
                :math:`env_spec.action_space`.
            * dict:
                * np.ndarray[float]: Mean of the distribution
                * np.ndarray[float]: Standard deviation of logarithmic
                    values of the distribution.

    """
    if not isinstance(observation, np.ndarray) and not isinstance(
            observation, torch.Tensor):
        observation = self._env_spec.observation_space.flatten(observation)
    elif isinstance(observation,
                    np.ndarray) and len(observation.shape) > 1:
        observation = self._env_spec.observation_space.flatten(observation)
    elif isinstance(observation,
                    torch.Tensor) and len(observation.shape) > 1:
        observation = torch.flatten(observation)
    with torch.no_grad():
        if isinstance(observation, np.ndarray):
            observation = np_to_torch(observation)
        if not isinstance(observation, torch.Tensor):
            observation = list_to_tensor(observation)
        observation = observation.unsqueeze(0)
        action, agent_infos = self.get_actions(observation)
        return action[0], {k: v[0] for k, v in agent_infos.items()}
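
# --- Illustrative sketch (not garage API) ----------------------------------
# The batch-of-one pattern get_action relies on: wrap a single flat
# observation with unsqueeze(0), call the batched API, then take element 0
# of the action array and of every entry in the info dict.
# `fake_get_actions` is a hypothetical stand-in for the policy's batched
# get_actions call.
import torch

def fake_get_actions(observations):
    batch = observations.shape[0]
    actions = torch.zeros(batch, 2).numpy()
    infos = {'mean': torch.zeros(batch, 2).numpy(),
             'log_std': torch.zeros(batch, 2).numpy()}
    return actions, infos

observation = torch.randn(4)        # single, already-flat observation
batched = observation.unsqueeze(0)  # shape (1, 4)
actions, infos = fake_get_actions(batched)
action, info = actions[0], {k: v[0] for k, v in infos.items()}
# ---------------------------------------------------------------------------
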
def _sample_data(self, indices):
    """Sample batch of training data from a list of tasks.

    Args:
        indices (list): List of task indices to sample from.

    Returns:
        torch.Tensor: Observations, with shape :math:`(X, N, O^*)` where
            X is the number of tasks. N is batch size.
        torch.Tensor: Actions, with shape :math:`(X, N, A^*)`.
        torch.Tensor: Rewards, with shape :math:`(X, N, 1)`.
        torch.Tensor: Next observations, with shape :math:`(X, N, O^*)`.
        torch.Tensor: Dones, with shape :math:`(X, N, 1)`.

    """
    # transitions sampled randomly from replay buffer
    initialized = False
    for idx in indices:
        batch = self._replay_buffers[idx].sample_transitions(
            self._batch_size)
        if not initialized:
            o = batch['observations'][np.newaxis]
            a = batch['actions'][np.newaxis]
            r = batch['rewards'][np.newaxis]
            no = batch['next_observations'][np.newaxis]
            d = batch['dones'][np.newaxis]
            initialized = True
        else:
            o = np.vstack((o, batch['observations'][np.newaxis]))
            a = np.vstack((a, batch['actions'][np.newaxis]))
            r = np.vstack((r, batch['rewards'][np.newaxis]))
            no = np.vstack((no, batch['next_observations'][np.newaxis]))
            d = np.vstack((d, batch['dones'][np.newaxis]))

    o = np_to_torch(o)
    a = np_to_torch(a)
    r = np_to_torch(r)
    no = np_to_torch(no)
    d = np_to_torch(d)

    return o, a, r, no, d
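
# --- Illustrative sketch (not garage API) ----------------------------------
# How _sample_data builds per-task arrays of shape (X, N, D): each task's
# sampled batch gets a new leading axis via [np.newaxis], and successive
# tasks are stacked with np.vstack. The random arrays stand in for
# replay-buffer samples; X = number of tasks, N = batch size, D = feature
# size.
import numpy as np

num_tasks, batch_size, obs_dim = 3, 8, 5
stacked = None
for _ in range(num_tasks):
    batch_obs = np.random.randn(batch_size, obs_dim)  # one task's sample
    if stacked is None:
        stacked = batch_obs[np.newaxis]               # (1, N, D)
    else:
        stacked = np.vstack((stacked, batch_obs[np.newaxis]))
assert stacked.shape == (num_tasks, batch_size, obs_dim)
# An equivalent, simpler alternative is np.stack over a list of batches.
# ---------------------------------------------------------------------------
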
def get_actions(self, observations):
    r"""Get actions given observations.

    Args:
        observations (np.ndarray): Observations from the environment.
            Shape is :math:`batch_dim \bullet env_spec.observation_space`.

    Returns:
        tuple:
            * np.ndarray: Predicted actions.
                :math:`batch_dim \bullet env_spec.action_space`.
            * dict:
                * np.ndarray[float]: Mean of the distribution.
                * np.ndarray[float]: Standard deviation of logarithmic
                    values of the distribution.

    """
    if not isinstance(observations[0], np.ndarray) and not isinstance(
            observations[0], torch.Tensor):
        observations = self._env_spec.observation_space.flatten_n(
            observations)

    # frequently users like to pass lists of torch tensors or lists of
    # numpy arrays. This handles those conversions.
    if isinstance(observations, list):
        if isinstance(observations[0], np.ndarray):
            observations = np.stack(observations)
        elif isinstance(observations[0], torch.Tensor):
            observations = torch.stack(observations)

    if isinstance(observations[0],
                  np.ndarray) and len(observations[0].shape) > 1:
        observations = self._env_spec.observation_space.flatten_n(
            observations)
    elif isinstance(observations[0],
                    torch.Tensor) and len(observations[0].shape) > 1:
        observations = torch.flatten(observations, start_dim=1)
    with torch.no_grad():
        if isinstance(observations, np.ndarray):
            observations = np_to_torch(observations)
        if not isinstance(observations, torch.Tensor):
            observations = list_to_tensor(observations)

        if isinstance(self._env_spec.observation_space, akro.Image):
            observations /= 255.0  # scale image
        dist, info = self.forward(observations)

        return dist.sample().cpu().numpy(), {
            k: v.detach().cpu().numpy()
            for (k, v) in info.items()
        }
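
# --- Illustrative sketch (not garage API) ----------------------------------
# The input normalization get_actions performs before inference, shown in
# isolation: a list of per-step numpy arrays is stacked into one batch,
# multi-dimensional observations are flattened per step, and image pixels
# are rescaled from [0, 255] to [0, 1]. The `is_image` flag is a stand-in
# for the akro.Image check, and plain torch ops replace the space's
# flatten_n helper.
import numpy as np
import torch

observations = [np.random.rand(3, 4).astype(np.float32) for _ in range(6)]
is_image = False

if isinstance(observations, list) and isinstance(observations[0], np.ndarray):
    observations = np.stack(observations)                    # (batch, ...)
observations = torch.as_tensor(observations)
if observations.dim() > 2:
    observations = torch.flatten(observations, start_dim=1)  # flatten per step
if is_image:
    observations = observations / 255.0                      # scale pixels
assert observations.shape == (6, 12)
# ---------------------------------------------------------------------------
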
def _sample_context(self, indices):
    """Sample batch of context from a list of tasks.

    Args:
        indices (list): List of task indices to sample from.

    Returns:
        torch.Tensor: Context data, with shape :math:`(X, N, C)`. X is
            the number of tasks. N is batch size. C is the combined
            size of observation, action, reward, and next observation
            if next observation is used in context. Otherwise, C is
            the combined size of observation, action, and reward.

    """
    # make method work given a single task index
    if not hasattr(indices, '__iter__'):
        indices = [indices]

    initialized = False
    for idx in indices:
        batch = self._context_replay_buffers[idx].sample_transitions(
            self._embedding_batch_size)
        o = batch['observations']
        a = batch['actions']
        r = batch['rewards']
        context = np.hstack((np.hstack((o, a)), r))
        if self._use_next_obs_in_context:
            context = np.hstack((context, batch['next_observations']))

        if not initialized:
            final_context = context[np.newaxis]
            initialized = True
        else:
            final_context = np.vstack((final_context, context[np.newaxis]))

    final_context = np_to_torch(final_context)
    if len(indices) == 1:
        final_context = final_context.unsqueeze(0)

    return final_context
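
# --- Illustrative sketch (not garage API) ----------------------------------
# The per-task context row layout used by _sample_context: C equals
# obs_dim + act_dim + 1, plus obs_dim again when next observations are part
# of the context. The boolean below mirrors _use_next_obs_in_context and the
# random arrays stand in for a sampled transition batch.
import numpy as np

n, obs_dim, act_dim = 4, 3, 2
o, a = np.random.randn(n, obs_dim), np.random.randn(n, act_dim)
r, next_o = np.random.randn(n, 1), np.random.randn(n, obs_dim)
use_next_obs_in_context = True

context = np.hstack((o, a, r))
if use_next_obs_in_context:
    context = np.hstack((context, next_o))
expected_c = obs_dim + act_dim + 1 + (obs_dim if use_next_obs_in_context else 0)
assert context.shape == (n, expected_c)
# ---------------------------------------------------------------------------
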
def _train_once(self, itr, eps):
    """Train the algorithm once.

    Args:
        itr (int): Iteration number.
        eps (EpisodeBatch): A batch of collected paths.

    Returns:
        numpy.float64: Calculated mean value of undiscounted returns.

    """
    obs = np_to_torch(eps.padded_observations)
    rewards = np_to_torch(eps.padded_rewards)
    returns = np_to_torch(
        np.stack([
            discount_cumsum(reward, self.discount)
            for reward in eps.padded_rewards
        ]))
    valids = eps.lengths
    with torch.no_grad():
        baselines = self._value_function(obs)

    if self._maximum_entropy:
        policy_entropies = self._compute_policy_entropy(obs)
        rewards += self._policy_ent_coeff * policy_entropies

    obs_flat = np_to_torch(eps.observations)
    actions_flat = np_to_torch(eps.actions)
    rewards_flat = np_to_torch(eps.rewards)
    returns_flat = torch.cat(filter_valids(returns, valids))
    advs_flat = self._compute_advantage(rewards, valids, baselines)

    with torch.no_grad():
        policy_loss_before = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_before = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_before = self._compute_kl_constraint(obs)

    self._train(obs_flat, actions_flat, rewards_flat, returns_flat,
                advs_flat)

    with torch.no_grad():
        policy_loss_after = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_after = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_after = self._compute_kl_constraint(obs)
        policy_entropy = self._compute_policy_entropy(obs)

    with tabular.prefix(self.policy.name):
        tabular.record('/LossBefore', policy_loss_before.item())
        tabular.record('/LossAfter', policy_loss_after.item())
        tabular.record('/dLoss',
                       (policy_loss_before - policy_loss_after).item())
        tabular.record('/KLBefore', kl_before.item())
        tabular.record('/KL', kl_after.item())
        tabular.record('/Entropy', policy_entropy.mean().item())

    with tabular.prefix(self._value_function.name):
        tabular.record('/LossBefore', vf_loss_before.item())
        tabular.record('/LossAfter', vf_loss_after.item())
        tabular.record('/dLoss',
                       vf_loss_before.item() - vf_loss_after.item())

    self._old_policy.load_state_dict(self.policy.state_dict())

    undiscounted_returns = log_performance(itr,
                                           eps,
                                           discount=self._discount)
    return np.mean(undiscounted_returns)
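
# --- Illustrative sketch (not garage API) ----------------------------------
# What the returns computed in _train_once represent: at each step t, the
# discounted sum of rewards from t to the end of the episode,
# G_t = r_t + gamma * G_{t+1}. This plain-numpy version is an assumption
# about discount_cumsum's behavior, written for illustration only; the
# helper name below is hypothetical.
import numpy as np

def discounted_cumulative_sum(rewards, gamma):
    returns = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    # Walk the episode backwards so each step reuses the tail's return.
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

rewards = np.array([1.0, 1.0, 1.0])
np.testing.assert_allclose(discounted_cumulative_sum(rewards, 0.9),
                           [1.0 + 0.9 * 1.9, 1.9, 1.0])
# ---------------------------------------------------------------------------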