def _train_once(self, trainer, epoch):
    """Obtain samples and train for one epoch.

    Args:
        trainer (Trainer): Experiment trainer, which may be used to
            obtain samples.
        epoch (int): The current epoch.

    Returns:
        List[float]: Losses.

    """
    batch = self._obtain_samples(trainer, epoch)
    # Shuffle the batch, then split it into roughly equal minibatches.
    indices = np.random.permutation(len(batch.actions))
    minibatches = np.array_split(indices, self._minibatches_per_epoch)
    losses = []
    for minibatch in minibatches:
        observations = as_torch(batch.observations[minibatch])
        actions = as_torch(batch.actions[minibatch])
        self._optimizer.zero_grad()
        loss = self._compute_loss(observations, actions)
        loss.backward()
        losses.append(loss.item())
        self._optimizer.step()
    return losses
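# A standalone sketch (not part of this repo) of the shuffle-then-split
# minibatch pattern used in `_train_once` above. The array size and split
# count are illustrative assumptions.
import numpy as np

data = np.arange(10)          # stand-in for batch.actions
minibatches_per_epoch = 3

indices = np.random.permutation(len(data))
# Unlike np.split, np.array_split tolerates uneven splits, so the last
# minibatches may be one element smaller.
for minibatch in np.array_split(indices, minibatches_per_epoch):
    print(data[minibatch])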
def _optimize_qf(self, timesteps):
    """Optimize the Q-function for one gradient step.

    Args:
        timesteps (TimeStepBatch): Processed batch data.

    Returns:
        torch.Tensor: Loss of the Q-values predicted by the Q-network.
        torch.Tensor: y targets used to fit the Q-network.
        torch.Tensor: Q-values selected for the actions taken.

    """
    observations = as_torch(timesteps.observations)
    rewards = as_torch(timesteps.rewards).reshape(-1, 1)
    rewards *= self._reward_scale
    actions = as_torch(timesteps.actions)
    next_observations = as_torch(timesteps.next_observations)
    terminals = as_torch(timesteps.terminals).reshape(-1, 1)

    next_inputs = next_observations
    inputs = observations
    with torch.no_grad():
        if self._double_q:
            # Use the online qf to pick the optimal actions.
            selected_actions = torch.argmax(self._qf(next_inputs), dim=1)
            # Use the target qf to get Q-values for those actions.
            selected_actions = selected_actions.long().unsqueeze(1)
            best_qvals = torch.gather(self._target_qf(next_inputs),
                                      dim=1,
                                      index=selected_actions)
        else:
            target_qvals = self._target_qf(next_inputs)
            best_qvals, _ = torch.max(target_qvals, 1)
            best_qvals = best_qvals.unsqueeze(1)

    rewards_clipped = rewards
    if self._clip_reward is not None:
        rewards_clipped = torch.clamp(rewards, -1 * self._clip_reward,
                                      self._clip_reward)
    y_target = (rewards_clipped +
                (1.0 - terminals) * self._discount * best_qvals)
    y_target = y_target.squeeze(1)

    # optimize qf
    qvals = self._qf(inputs)
    # Actions are one-hot, so this selects the Q-value of the action taken.
    selected_qs = torch.sum(qvals * actions, dim=1)
    qval_loss = F.smooth_l1_loss(selected_qs, y_target)

    self._qf_optimizer.zero_grad()
    qval_loss.backward()

    # Optionally clip the gradients of the Q-network being optimized.
    if self._clip_grad is not None:
        torch.nn.utils.clip_grad_norm_(self._qf.parameters(),
                                       self._clip_grad)
    self._qf_optimizer.step()

    return (qval_loss.detach(), y_target, selected_qs.detach())
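# A self-contained toy illustration of the double-Q target selection done in
# `_optimize_qf` above: the online network picks the action, the target
# network scores it. The tensors and discount are made-up values, not from
# this repo.
import torch

q_online = torch.tensor([[1.0, 3.0], [2.0, 0.5]])   # online qf(next_obs)
q_target = torch.tensor([[0.9, 2.5], [1.8, 0.4]])   # target qf(next_obs)
rewards = torch.tensor([[1.0], [0.0]])
terminals = torch.tensor([[0.0], [1.0]])
discount = 0.99

best_actions = torch.argmax(q_online, dim=1).unsqueeze(1)
best_qvals = torch.gather(q_target, dim=1, index=best_actions)
y_target = rewards + (1.0 - terminals) * discount * best_qvals
print(y_target.squeeze(1))  # tensor([3.4750, 0.0000])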
def get_action(self, observation):
    r"""Get a single action given an observation.

    Args:
        observation (np.ndarray): Observation from the environment.
            Shape is :math:`env_spec.observation_space`.

    Returns:
        tuple:
            * np.ndarray: Predicted action. Shape is
                :math:`env_spec.action_space`.
            * dict:
                * np.ndarray[float]: Mean of the distribution.
                * np.ndarray[float]: Log of the standard deviation of
                    the distribution.

    """
    # Flatten unstructured or multi-dimensional observations into a
    # single flat vector before batching.
    if not isinstance(observation, np.ndarray) and not isinstance(
            observation, torch.Tensor):
        observation = self._env_spec.observation_space.flatten(observation)
    elif isinstance(observation,
                    np.ndarray) and len(observation.shape) > 1:
        observation = self._env_spec.observation_space.flatten(observation)
    elif isinstance(observation,
                    torch.Tensor) and len(observation.shape) > 1:
        observation = torch.flatten(observation)
    with torch.no_grad():
        if not isinstance(observation, torch.Tensor):
            observation = as_torch(observation)
        # Add a batch dimension, delegate to the batched API, then strip
        # the batch dimension back off.
        observation = observation.unsqueeze(0)
        action, agent_infos = self.get_actions(observation)
        return action[0], {k: v[0] for k, v in agent_infos.items()}
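# A minimal sketch of the batch-of-one pattern `get_action` relies on: a
# single observation is promoted to a batch, run through the batched API,
# then the batch dimension is stripped again. Shapes are illustrative.
import torch

obs = torch.zeros(4)               # a single flat observation
batch = obs.unsqueeze(0)           # shape (1, 4): batch of one
actions = torch.zeros(1, 2)        # stand-in for get_actions(batch) output
action = actions[0]                # strip the batch dimension back off
print(batch.shape, action.shape)   # torch.Size([1, 4]) torch.Size([2])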
def test_double_dqn_loss(setup):
    algo, env, buff, _, batch_size = setup

    algo._double_q = True

    trainer = Trainer(snapshot_config)
    trainer.setup(algo, env)

    paths = trainer.obtain_episodes(0, batch_size=batch_size)
    buff.add_episode_batch(paths)
    timesteps = buff.sample_timesteps(algo._buffer_batch_size)
    timesteps_copy = copy.deepcopy(timesteps)

    observations = as_torch(timesteps.observations)
    rewards = as_torch(timesteps.rewards).reshape(-1, 1)
    actions = as_torch(timesteps.actions)
    next_observations = as_torch(timesteps.next_observations)
    terminals = as_torch(timesteps.terminals).reshape(-1, 1)

    next_inputs = next_observations
    inputs = observations
    with torch.no_grad():
        # Double-Q loss: the online qf picks the actions...
        selected_actions = torch.argmax(algo._qf(next_inputs), dim=1)
        # ...and the target qf provides the Q-values for those actions.
        selected_actions = selected_actions.long().unsqueeze(1)
        best_qvals = torch.gather(algo._target_qf(next_inputs),
                                  dim=1,
                                  index=selected_actions)

    rewards_clipped = rewards
    y_target = (rewards_clipped +
                (1.0 - terminals) * algo._discount * best_qvals)
    y_target = y_target.squeeze(1)

    # Recompute the qf loss and check it matches the algorithm's.
    qvals = algo._qf(inputs)
    selected_qs = torch.sum(qvals * actions, dim=1)
    qval_loss = F.smooth_l1_loss(selected_qs, y_target)

    algo_loss, algo_targets, algo_selected_qs = algo._optimize_qf(
        timesteps_copy)
    env.close()

    assert (qval_loss.detach() == algo_loss).all()
    assert (y_target == algo_targets).all()
    assert (selected_qs == algo_selected_qs).all()
def get_actions(self, observations):
    """Get actions given observations.

    Args:
        observations (np.ndarray): Batch of observations, which should
            have shape :math:`(N, O)`.

    Returns:
        np.ndarray: Predicted actions. Has shape :math:`(N, A)`.
        dict: Empty, since this policy does not produce a distribution.

    """
    with torch.no_grad():
        return self(as_torch(observations)).cpu().numpy(), dict()
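# A minimal usage sketch with a hypothetical MLP stand-in (not a class from
# this repo) showing the batched call contract above: (N, O) observations
# in, (N, A) numpy actions plus an empty info dict out.
import numpy as np
import torch
from torch import nn

class _ToyPolicy(nn.Module):
    def __init__(self, obs_dim=4, action_dim=2):
        super().__init__()
        self._net = nn.Linear(obs_dim, action_dim)

    def forward(self, observations):
        return self._net(observations)

    def get_actions(self, observations):
        with torch.no_grad():
            obs = torch.as_tensor(observations, dtype=torch.float32)
            return self(obs).cpu().numpy(), dict()

policy = _ToyPolicy()
actions, info = policy.get_actions(np.zeros((3, 4), dtype=np.float32))
print(actions.shape, info)  # (3, 2) {}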
def get_actions(self, observations):
    r"""Get actions given observations.

    Args:
        observations (np.ndarray): Observations from the environment.
            Shape is :math:`batch_dim \bullet env_spec.observation_space`.

    Returns:
        tuple:
            * np.ndarray: Predicted actions.
                :math:`batch_dim \bullet env_spec.action_space`.
            * dict:
                * np.ndarray[float]: Mean of the distribution.
                * np.ndarray[float]: Log of the standard deviation of
                    the distribution.

    """
    if not isinstance(observations[0], np.ndarray) and not isinstance(
            observations[0], torch.Tensor):
        observations = self._env_spec.observation_space.flatten_n(
            observations)

    # Users frequently pass lists of torch tensors or lists of numpy
    # arrays; stack those into a single batch here.
    if isinstance(observations, list):
        if isinstance(observations[0], np.ndarray):
            observations = np.stack(observations)
        elif isinstance(observations[0], torch.Tensor):
            observations = torch.stack(observations)

    if isinstance(observations[0],
                  np.ndarray) and len(observations[0].shape) > 1:
        observations = self._env_spec.observation_space.flatten_n(
            observations)
    elif isinstance(observations[0],
                    torch.Tensor) and len(observations[0].shape) > 1:
        observations = torch.flatten(observations, start_dim=1)
    with torch.no_grad():
        if not isinstance(observations, torch.Tensor):
            observations = as_torch(observations)
        dist, info = self.forward(observations)
        return dist.sample().cpu().numpy(), {
            k: v.detach().cpu().numpy()
            for (k, v) in info.items()
        }
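# A self-contained sketch of the sample-and-detach pattern at the end of
# `get_actions` above: draw actions from the policy distribution and move
# the distribution parameters back to numpy. The Normal parameters are
# made-up values, not from this repo.
import torch

dist = torch.distributions.Normal(loc=torch.zeros(3, 2),
                                  scale=torch.ones(3, 2))
info = {'mean': dist.mean, 'log_std': dist.stddev.log()}

actions = dist.sample().cpu().numpy()
info_np = {k: v.detach().cpu().numpy() for k, v in info.items()}
print(actions.shape, sorted(info_np))  # (3, 2) ['log_std', 'mean']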