def log_diagnostics(self):
    """Record per-agent path-return statistics and global sample counters.

    For every agent index below ``self.agent_num`` the max, mean and last
    path return are recorded under keys suffixed with the agent index.
    The episode count and total sample count are recorded once afterwards.
    """
    # Key template paired with the per-agent sequence it reads from; the
    # tuple order fixes the order in which keys are recorded for each agent.
    per_agent_stats = (
        ('max-path-return_agent_{}', self._max_path_return),
        ('mean-path-return_agent_{}', self._mean_path_return),
        ('last-path-return_agent_{}', self._last_path_return),
    )
    for agent_idx in range(self.agent_num):
        for key_template, returns_by_agent in per_agent_stats:
            logger.record_tabular(
                key_template.format(agent_idx), returns_by_agent[agent_idx])

    logger.record_tabular('episodes', self._n_episodes)
    logger.record_tabular('total-samples', self._total_samples)
def log_diagnostics(self, batch):
    """Record Q-function diagnostics for this agent.

    Evaluates the Q-value and Bellman-residual tensors on the feed dict
    built from ``batch`` and records the mean and standard deviation of
    the Q-values together with the Bellman residual (mean squared Bellman
    error). Every key is suffixed with ``self._agent_id``.

    Args:
        batch: Sample batch handed to ``self._get_feed_dict``.
    """
    feed_dict = self._get_feed_dict(batch)
    fetches = [self._q_values, self._bellman_residual]
    q_values, residual = self._sess.run(fetches, feed_dict)

    logger.record_tabular(
        'qf-avg-agent-{}'.format(self._agent_id), np.mean(q_values))
    logger.record_tabular(
        'qf-std-agent-{}'.format(self._agent_id), np.std(q_values))
    logger.record_tabular(
        'mean-sq-bellman-error-agent-{}'.format(self._agent_id), residual)
def log_diagnostics(self, iteration, batch):
    """Record summary statistics of the GMM distribution to the logger.

    Evaluates the distribution's means, log sigmas, log component weights
    and log probabilities on the batch observations. Mean, min, max and
    standard deviation are recorded for the first three; mean, max and min
    are recorded for the log probabilities.

    Args:
        iteration: Not used by this method.
        batch: Mapping that provides the ``'observations'`` fed to the
            observation placeholder.
    """
    feed_dict = {self._observations_ph: batch['observations']}
    session = tf_utils.get_default_session()
    fetches = (
        self.distribution.mus_t,
        self.distribution.log_sigs_t,
        self.distribution.log_ws_t,
        self.distribution.log_p_t,
    )
    mus, log_sigs, log_ws, log_pis = session.run(fetches, feed_dict)

    # (key, reduction, values) rows; each statistic is computed right before
    # it is recorded, in the order listed here.
    rows = (
        ('gmm-mus-mean', np.mean, mus),
        ('gmm-mus-min', np.min, mus),
        ('gmm-mus-max', np.max, mus),
        ('gmm-mus-std', np.std, mus),
        ('gmm-log-w-mean', np.mean, log_ws),
        ('gmm-log-w-min', np.min, log_ws),
        ('gmm-log-w-max', np.max, log_ws),
        ('gmm-log-w-std', np.std, log_ws),
        ('gmm-log-sigs-mean', np.mean, log_sigs),
        ('gmm-log-sigs-min', np.min, log_sigs),
        ('gmm-log-sigs-max', np.max, log_sigs),
        ('gmm-log-sigs-std', np.std, log_sigs),
        ('log_pi_mean', np.mean, log_pis),
        ('log_pi_max', np.max, log_pis),
        ('log_pi_min', np.min, log_pis),
    )
    for key, reduce_fn, values in rows:
        logger.record_tabular(key, reduce_fn(values))
def _train(self, env, policy, initial_exploration_policy, pool):
    """Run the RL training loop.

    Each epoch draws ``self._epoch_length`` samples, trains
    ``self._n_train_repeat`` times per sample once the sampler reports a
    batch is ready, then evaluates, saves a snapshot and dumps the
    tabular log.

    Args:
        env (`rllab.Env`): Environment used for training.
        policy (`Policy`): Policy used for training.
        initial_exploration_policy (`Policy`): Policy used for the initial
            exploration phase. If None, all exploration is done using
            ``policy``.
        pool (`PoolBase`): Sample pool to add samples to.
    """
    self._init_training(env, policy, pool)

    # Without a dedicated exploration policy the sampler starts directly on
    # the training policy and no switch is needed later.
    initial_exploration_done = initial_exploration_policy is None
    if initial_exploration_done:
        starting_policy = policy
    else:
        starting_policy = initial_exploration_policy
    self.sampler.initialize(env, starting_policy, pool)

    with self._sess.as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs + 1),
                                  save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            for step in range(self._epoch_length):
                # TODO.code consolidation: Add control interval to sampler
                if (not initial_exploration_done
                        and self._epoch_length * epoch
                        >= self._n_initial_exploration_steps):
                    self.sampler.set_policy(policy)
                    initial_exploration_done = True

                self.sampler.sample()
                if self.sampler.batch_ready():
                    gt.stamp('sample')

                    for _ in range(self._n_train_repeat):
                        self._do_training(
                            iteration=step + epoch * self._epoch_length,
                            batch=self.sampler.random_batch())
                    gt.stamp('train')

            self._evaluate(epoch)

            snapshot = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, snapshot)

            times_itrs = gt.get_times().stamps.itrs
            # The 'eval' stamp is taken at the very end of an epoch, so the
            # first epochs report 0 instead of reading it.
            if epoch > 1:
                eval_time = times_itrs['eval'][-1]
            else:
                eval_time = 0
            total_time = gt.get_times().total

            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)

            self.sampler.log_diagnostics()

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

            gt.stamp('eval')

        self.sampler.terminate()
def _evaluate(self, epoch):
    """Evaluate the current policy and log the results.

    Returns immediately when fewer than one evaluation episode is
    configured. Otherwise rolls out the policy on the evaluation
    environment, records return and episode-length statistics, lets the
    environment log (and optionally render) the paths, and finally logs
    algorithm diagnostics on a random batch from the sampler.

    :param epoch: The epoch number.
    :return: None
    """
    if self._eval_n_episodes < 1:
        return

    with self._policy.deterministic(self._eval_deterministic):
        paths = rollouts(
            self._eval_env,
            self._policy,
            self.sampler._max_path_length,
            self._eval_n_episodes,
        )

    path_rewards = [path['rewards'] for path in paths]
    total_returns = [rewards.sum() for rewards in path_rewards]
    episode_lengths = [len(rewards) for rewards in path_rewards]

    # (key, reduction, values) rows, recorded in the order listed.
    rows = (
        ('return-average', np.mean, total_returns),
        ('return-min', np.min, total_returns),
        ('return-max', np.max, total_returns),
        ('return-std', np.std, total_returns),
        ('episode-length-avg', np.mean, episode_lengths),
        ('episode-length-min', np.min, episode_lengths),
        ('episode-length-max', np.max, episode_lengths),
        ('episode-length-std', np.std, episode_lengths),
    )
    for key, reduce_fn, values in rows:
        logger.record_tabular(key, reduce_fn(values))

    self._eval_env.log_diagnostics(paths)
    if self._eval_render:
        self._eval_env.render(paths)

    # The iteration number passed on is the first step index of this epoch.
    self.log_diagnostics(
        epoch * self._epoch_length, self.sampler.random_batch())
def log_diagnostics(self, iteration, batch):
    """Record diagnostic information to the logger.

    Records the mean and standard deviation of both Q-functions and of the
    state value function, the mean absolute difference between the two
    Q-function outputs, and the TD-loss (mean squared Bellman error) of
    each Q-function for the sample batch. Afterwards forwards to the
    policy's ``log_diagnostics`` and calls the plotter's ``draw`` method
    if a plotter is set.

    Args:
        iteration: Iteration number, passed to ``self._get_feed_dict`` and
            to the policy's ``log_diagnostics``.
        batch: Sample batch, passed to ``self._get_feed_dict`` and to the
            policy's ``log_diagnostics``.
    """
    feed_dict = self._get_feed_dict(iteration, batch)
    qf1, qf2, vf, td_loss1, td_loss2 = self._sess.run(
        (self._qf1_t, self._qf2_t, self._vf_t,
         self._td_loss1_t, self._td_loss2_t),
        feed_dict)

    logger.record_tabular('qf1-avg', np.mean(qf1))
    logger.record_tabular('qf1-std', np.std(qf1))
    # The qf2 statistics must be computed from qf2; they were previously
    # computed from qf1, so the second Q-function was never reported.
    logger.record_tabular('qf2-avg', np.mean(qf2))
    logger.record_tabular('qf2-std', np.std(qf2))
    logger.record_tabular('mean-qf-diff', np.mean(np.abs(qf1 - qf2)))
    logger.record_tabular('vf-avg', np.mean(vf))
    logger.record_tabular('vf-std', np.std(vf))
    logger.record_tabular('mean-sq-bellman-error1', td_loss1)
    logger.record_tabular('mean-sq-bellman-error2', td_loss2)

    self._policy.log_diagnostics(iteration, batch)
    if self._plotter:
        self._plotter.draw()
def log_diagnostics(self, iteration, batch):
    """Record policy diagnostics to the logger.

    Evaluates the raw actions, actions and log-probabilities on the batch
    observations using the default TensorFlow session. Records the
    negated mean log-probability as the policy entropy estimate, the
    min/max of the log-probabilities, and the mean/min/max of both the
    actions and the raw actions.

    Args:
        iteration: Not used by this method.
        batch: Mapping that provides the ``'observations'`` fed to the
            observation placeholder.
    """
    feed_dict = {self._observations_ph: batch['observations']}
    session = tf.get_default_session()
    fetches = (self._raw_actions, self._actions, self._log_pis)
    raw_actions, actions, log_pis = session.run(fetches, feed_dict)

    logger.record_tabular('policy-entropy-mean', -np.mean(log_pis))

    # (key, reduction, values) rows, recorded in the order listed.
    rows = (
        ('log-pi-min', np.min, log_pis),
        ('log-pi-max', np.max, log_pis),
        ('actions-mean', np.mean, actions),
        ('actions-min', np.min, actions),
        ('actions-max', np.max, actions),
        ('raw-actions-mean', np.mean, raw_actions),
        ('raw-actions-min', np.min, raw_actions),
        ('raw-actions-max', np.max, raw_actions),
    )
    for key, reduce_fn, values in rows:
        logger.record_tabular(key, reduce_fn(values))
def log_diagnostics(self):
    """Record sampler statistics after the parent class's diagnostics.

    Calls the parent ``log_diagnostics`` first, then records the max and
    last path return, the episode count and the total sample count.
    """
    super(SimpleSampler, self).log_diagnostics()

    # (key, attribute name) pairs; each attribute is read right before it
    # is recorded, in the order listed.
    entries = (
        ('max-path-return', '_max_path_return'),
        ('last-path-return', '_last_path_return'),
        ('episodes', '_n_episodes'),
        ('total-samples', '_total_samples'),
    )
    for key, attr_name in entries:
        logger.record_tabular(key, getattr(self, attr_name))
def log_diagnostics(self):
    """Record the current size of ``self.pool`` under ``'pool-size'``."""
    pool_size = self.pool.size
    logger.record_tabular('pool-size', pool_size)