# Module-level imports assumed by the method snippets below (dowel
# provides the logger/tabular sinks; the exact import path of the
# torch/numpy conversion helpers varies by codebase version, so it is
# noted rather than pinned here):
from dowel import logger, tabular
import numpy as np
import torch


def _train_once(self, itr):
    """Perform one iteration of training.

    Args:
        itr (int): Iteration number.

    """
    for grad_step_timer in range(self._grad_steps_per_env_step):
        if (self._replay_buffer.n_transitions_stored >=
                self._min_buffer_size):
            # Sample a minibatch from the replay buffer
            samples = self._replay_buffer.sample_transitions(
                self._buffer_batch_size)
            samples = dict_np_to_torch(samples)
            # Take one optimization step and record the diagnostics
            qf_loss, y, q, policy_loss = torch_to_np(
                self._optimize_policy(samples, grad_step_timer))

            self._episode_policy_losses.append(policy_loss)
            self._episode_qf_losses.append(qf_loss)
            self._epoch_ys.append(y)
            self._epoch_qs.append(q)

    if itr % self._steps_per_epoch == 0:
        logger.log('Training finished')
        epoch = itr // self._steps_per_epoch

        if (self._replay_buffer.n_transitions_stored >=
                self._min_buffer_size):
            tabular.record('Epoch', epoch)
            self._log_statistics()
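# _train_once relies on a dict_np_to_torch helper to move the sampled
# numpy batch onto torch tensors before optimization. A minimal sketch
# of such a helper, assuming it does nothing more than convert each
# array in the dict to a float32 tensor (the real utility may also
# handle device placement):
def dict_np_to_torch(array_dict):
    """Convert every numpy array in a dict to a float32 torch tensor."""
    return {
        key: torch.as_tensor(value, dtype=torch.float32)
        for key, value in array_dict.items()
    }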
def train_once(self, itr, episodes):
    """Perform one iteration of training.

    Args:
        itr (int): Iteration number.
        episodes (EpisodeBatch): Batch of episodes.

    """
    self.replay_buffer.add_episode_batch(episodes)
    # Integer epoch index; itr is a multiple of _steps_per_epoch
    # whenever this value is actually recorded below.
    epoch = itr // self._steps_per_epoch

    for _ in range(self._n_train_steps):
        if (self.replay_buffer.n_transitions_stored >=
                self._min_buffer_size):
            samples = self.replay_buffer.sample_transitions(
                self._buffer_batch_size)
            samples['rewards'] *= self._reward_scale
            qf_loss, y, q, policy_loss = torch_to_np(
                self.optimize_policy(samples))

            self._episode_policy_losses.append(policy_loss)
            self._episode_qf_losses.append(qf_loss)
            self._epoch_ys.append(y)
            self._epoch_qs.append(q)

    if itr % self._steps_per_epoch == 0:
        logger.log('Training finished')

        if (self.replay_buffer.n_transitions_stored >=
                self._min_buffer_size):
            tabular.record('Epoch', epoch)
            tabular.record('Policy/AveragePolicyLoss',
                           np.mean(self._episode_policy_losses))
            tabular.record('QFunction/AverageQFunctionLoss',
                           np.mean(self._episode_qf_losses))
            tabular.record('QFunction/AverageQ', np.mean(self._epoch_qs))
            tabular.record('QFunction/MaxQ', np.max(self._epoch_qs))
            tabular.record('QFunction/AverageAbsQ',
                           np.mean(np.abs(self._epoch_qs)))
            tabular.record('QFunction/AverageY', np.mean(self._epoch_ys))
            tabular.record('QFunction/MaxY', np.max(self._epoch_ys))
            tabular.record('QFunction/AverageAbsY',
                           np.mean(np.abs(self._epoch_ys)))
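# A sketch of the outer loop that would drive train_once: one call per
# iteration with freshly collected episodes. 'sampler' and 'algo' stand
# in for whatever collector and algorithm objects the surrounding
# codebase provides, and obtain_episodes is a hypothetical method name,
# not a confirmed API:
def run_training(algo, sampler, n_itrs):
    """Drive train_once for n_itrs iterations (illustrative only)."""
    for itr in range(n_itrs):
        episodes = sampler.obtain_episodes(itr)  # hypothetical collector
        algo.train_once(itr, episodes)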
def test_torch_to_np():
    """Test whether tuples of tensors can be converted to np arrays."""
    tup = (torch.zeros(1), torch.zeros(1))
    np_out_1, np_out_2 = torch_to_np(tup)
    assert isinstance(np_out_1, np.ndarray)
    assert isinstance(np_out_2, np.ndarray)
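# The test above pins down the contract of torch_to_np: a tuple of
# tensors in, a tuple of numpy arrays out. A minimal sketch that would
# satisfy it, assuming the real utility simply detaches and converts
# each element:
def torch_to_np(tensors):
    """Convert a tuple of torch tensors to a tuple of numpy arrays."""
    return tuple(t.detach().cpu().numpy() for t in tensors)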
def train_once(self, itr, trajectories):
    """Perform one iteration of training.

    Args:
        itr (int): Iteration number.
        trajectories (TrajectoryBatch): Batch of trajectories.

    Returns:
        float: Average return.

    """
    self.replay_buffer.add_trajectory_batch(trajectories)
    epoch = itr // self._steps_per_epoch

    self._episode_rewards.extend(
        [traj.rewards.sum() for traj in trajectories.split()])
    last_average_return = np.nan
    if self._episode_rewards:
        last_average_return = np.mean(self._episode_rewards)

    for _ in range(self._n_train_steps):
        if (self.replay_buffer.n_transitions_stored >=
                self._min_buffer_size):
            samples = self.replay_buffer.sample_transitions(
                self._buffer_batch_size)
            samples['rewards'] *= self._reward_scale
            qf_loss, y, q, policy_loss = torch_to_np(
                self.optimize_policy(samples))

            self._episode_policy_losses.append(policy_loss)
            self._episode_qf_losses.append(qf_loss)
            self._epoch_ys.append(y)
            self._epoch_qs.append(q)

    if itr % self._steps_per_epoch == 0:
        logger.log('Training finished')

        if (self.replay_buffer.n_transitions_stored >=
                self._min_buffer_size):
            tabular.record('Epoch', epoch)
            tabular.record('Policy/AveragePolicyLoss',
                           np.mean(self._episode_policy_losses))
            tabular.record('QFunction/AverageQFunctionLoss',
                           np.mean(self._episode_qf_losses))
            tabular.record('QFunction/AverageQ', np.mean(self._epoch_qs))
            tabular.record('QFunction/MaxQ', np.max(self._epoch_qs))
            tabular.record('QFunction/AverageAbsQ',
                           np.mean(np.abs(self._epoch_qs)))
            tabular.record('QFunction/AverageY', np.mean(self._epoch_ys))
            tabular.record('QFunction/MaxY', np.max(self._epoch_ys))
            tabular.record('QFunction/AverageAbsY',
                           np.mean(np.abs(self._epoch_ys)))

        # Without return smoothing, statistics are reset every epoch so
        # each logged mean covers only the most recent epoch.
        if not self._smooth_return:
            self._episode_rewards = []
            self._episode_policy_losses = []
            self._episode_qf_losses = []
            self._epoch_ys = []
            self._epoch_qs = []

    return last_average_return
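# The train_once variants above append to statistics containers that
# must exist before the first iteration. A sketch of the assumed
# initialization (attribute names are taken from the methods above;
# plain lists are assumed, though a bounded deque would also fit):
def _init_statistics(self):
    """Set up the running-statistics containers train_once appends to."""
    self._episode_rewards = []        # per-episode undiscounted returns
    self._episode_policy_losses = []  # actor loss per gradient step
    self._episode_qf_losses = []      # critic loss per gradient step
    self._epoch_ys = []               # TD targets seen per gradient step
    self._epoch_qs = []               # Q predictions per gradient step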
def train_once(self, itr, paths):
    """Perform one iteration of training.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        float: Average return.

    """
    paths = self.process_samples(itr, paths)
    epoch = itr // self.steps_per_epoch

    self._episode_rewards.extend([
        path for path, complete in zip(paths['undiscounted_returns'],
                                       paths['complete']) if complete
    ])
    self._success_history.extend([
        path for path, complete in zip(paths['success_history'],
                                       paths['complete']) if complete
    ])

    # Avoid calculating the mean of an empty list in cases where
    # all paths were non-terminal.
    last_average_return = np.nan
    avg_success_rate = 0

    if self._episode_rewards:
        last_average_return = np.mean(self._episode_rewards)

    if self._success_history:
        if itr % self.steps_per_epoch == 0 and self._buffer_prefilled:
            avg_success_rate = np.mean(self._success_history)

    for _ in range(self.n_train_steps):
        if self._buffer_prefilled:
            samples = self.replay_buffer.sample_transitions(
                self.buffer_batch_size)
            qf_loss, y, q, policy_loss = torch_to_np(
                self.optimize_policy(samples))

            self._episode_policy_losses.append(policy_loss)
            self._episode_qf_losses.append(qf_loss)
            self._epoch_ys.append(y)
            self._epoch_qs.append(q)

    if itr % self.steps_per_epoch == 0:
        logger.log('Training finished')

        if self._buffer_prefilled:
            tabular.record('Epoch', epoch)
            tabular.record('Policy/AveragePolicyLoss',
                           np.mean(self._episode_policy_losses))
            tabular.record('QFunction/AverageQFunctionLoss',
                           np.mean(self._episode_qf_losses))
            tabular.record('QFunction/AverageQ', np.mean(self._epoch_qs))
            tabular.record('QFunction/MaxQ', np.max(self._epoch_qs))
            tabular.record('QFunction/AverageAbsQ',
                           np.mean(np.abs(self._epoch_qs)))
            tabular.record('QFunction/AverageY', np.mean(self._epoch_ys))
            tabular.record('QFunction/MaxY', np.max(self._epoch_ys))
            tabular.record('QFunction/AverageAbsY',
                           np.mean(np.abs(self._epoch_ys)))
            tabular.record('AverageSuccessRate', avg_success_rate)

        if not self.smooth_return:
            self._episode_rewards = []
            self._episode_policy_losses = []
            self._episode_qf_losses = []
            self._epoch_ys = []
            self._epoch_qs = []

        self._success_history.clear()

    return last_average_return
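# This variant gates gradient steps and logging on _buffer_prefilled
# instead of repeating the buffer-size comparison inline. A plausible
# definition, consistent with the explicit checks in the other variants
# above (a sketch on the class, not confirmed source):
@property
def _buffer_prefilled(self):
    """Whether the replay buffer holds enough transitions to train."""
    return (self.replay_buffer.n_transitions_stored >=
            self._min_buffer_size)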