def _evaluate(self, policies, evaluation_env):
    """Perform evaluation for the current policy."""
    if self._eval_n_episodes < 1:
        return

    # TODO: max_path_length should be a property of environment.
    paths = rollouts(evaluation_env, policies['seek'],
                     self.sampler._max_path_length, self._eval_n_episodes)

    total_returns = [path['rewards'].sum() for path in paths]
    episode_lengths = [len(p['rewards']) for p in paths]

    logger.record_tabular('return-average', np.mean(total_returns))
    logger.record_tabular('return-min', np.min(total_returns))
    logger.record_tabular('return-max', np.max(total_returns))
    logger.record_tabular('return-std', np.std(total_returns))
    logger.record_tabular('episode-length-avg', np.mean(episode_lengths))
    logger.record_tabular('episode-length-min', np.min(episode_lengths))
    logger.record_tabular('episode-length-max', np.max(episode_lengths))
    logger.record_tabular('episode-length-std', np.std(episode_lengths))

    evaluation_env.log_diagnostics(paths)
    if self._eval_render:
        evaluation_env.render(paths)

    if self.sampler.batch_ready():
        batch = self.sampler.random_batch()
        self.log_diagnostics(batch)

    return paths
def _evaluate(self, policy, evaluation_env):
    """Perform evaluation for the current policy."""
    if self._eval_n_episodes < 1:
        return

    # TODO: max_path_length should be a property of environment.
    paths = rollouts(evaluation_env, policy,
                     self.sampler._max_path_length, self._eval_n_episodes)

    total_returns = [path['rewards'].sum() for path in paths]
    episode_lengths = [len(p['rewards']) for p in paths]

    logger.record_tabular('return-average', np.mean(total_returns))
    logger.record_tabular('return-min', np.min(total_returns))
    logger.record_tabular('return-max', np.max(total_returns))
    logger.record_tabular('return-std', np.std(total_returns))
    logger.record_tabular('episode-length-avg', np.mean(episode_lengths))
    logger.record_tabular('episode-length-min', np.min(episode_lengths))
    logger.record_tabular('episode-length-max', np.max(episode_lengths))
    logger.record_tabular('episode-length-std', np.std(episode_lengths))

    evaluation_env.log_diagnostics(paths)
    if self._eval_render:
        evaluation_env.render(paths)

    if self.sampler.batch_ready():
        batch = self.sampler.random_batch()
        self.log_diagnostics(batch)
def _evaluate(self, epoch):
    """Perform evaluation for the current policy.

    :param epoch: The epoch number.
    :return: None
    """
    if self._eval_n_episodes < 1:
        return

    paths = rollouts(self._eval_env, self.policy,
                     self._max_path_length, self._eval_n_episodes)

    total_returns = [path['rewards'].sum() for path in paths]
    episode_lengths = [len(p['rewards']) for p in paths]

    logger.record_tabular('return-average', np.mean(total_returns))
    logger.record_tabular('return-min', np.min(total_returns))
    logger.record_tabular('return-max', np.max(total_returns))
    logger.record_tabular('return-std', np.std(total_returns))
    logger.record_tabular('episode-length-avg', np.mean(episode_lengths))
    logger.record_tabular('episode-length-min', np.min(episode_lengths))
    logger.record_tabular('episode-length-max', np.max(episode_lengths))
    logger.record_tabular('episode-length-std', np.std(episode_lengths))
    logger.record_tabular('epoch', epoch)

    self._eval_env.log_diagnostics(paths)
    if self._eval_render:
        self._eval_env.render(paths)

    batch = self.pool.random_batch(self._batch_size)
    self.log_diagnostics(batch)
def _evaluate(self, epoch):
    """Perform evaluation for the current policy and plot diagnostics."""
    logger.log("Collecting samples for evaluation")
    snapshot_dir = logger.get_snapshot_dir()

    paths = rollouts(self._env, self._eval_policy,
                     self._max_path_length, self._n_eval_episodes)

    average_discounted_return = np.mean([
        special.discount_return(path["rewards"], self._discount)
        for path in paths
    ])
    returns = np.asarray([sum(path["rewards"]) for path in paths])

    statistics = OrderedDict([
        ('Epoch', epoch),
        ('AverageDiscountedReturn', average_discounted_return),
        ('Alpha', self._alpha),
        ('returns', returns),
    ])

    for key, value in statistics.items():
        logger.record_tabular(key, value)

    self._env.log_diagnostics(paths)

    # Plot test paths.
    if (hasattr(self._env, 'plot_paths')
            and self._env_plot_settings is not None):
        img_file = os.path.join(snapshot_dir, 'env_itr_%05d.png' % epoch)

        # Remove previous paths.
        if self._env_lines is not None:
            for line in self._env_lines:
                line.remove()
        self._env_lines = self._env.plot_paths(paths, self._ax_env)

        plt.pause(0.001)
        plt.draw()
        self._fig_env.savefig(img_file, dpi=100)

    # Plot the Q-function level curves and action samples.
    if (hasattr(self._qf_eval, 'plot_level_curves')
            and self._q_plot_settings is not None):
        img_file = os.path.join(snapshot_dir, 'q_itr_%05d.png' % epoch)

        for ax in self._ax_q_lst:
            ax.clear()

        self._qf_eval.plot_level_curves(
            ax_lst=self._ax_q_lst,
            observations=self._q_plot_settings['obs_lst'],
            action_dims=self._q_plot_settings['action_dims'],
            xlim=self._q_plot_settings['xlim'],
            ylim=self._q_plot_settings['ylim'],
        )
        self._visualization_policy.plot_samples(
            self._ax_q_lst, self._q_plot_settings['obs_lst'])

        for ax in self._ax_q_lst:
            ax.set_xlim(self._q_plot_settings['xlim'])
            ax.set_ylim(self._q_plot_settings['ylim'])

        plt.pause(0.001)
        plt.draw()
        self._fig_q.savefig(img_file, dpi=100)

    gc.collect()
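# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original source): a standalone helper that
# reproduces the return / episode-length statistics the `_evaluate` variants
# above pass to `logger.record_tabular`. It assumes `paths` is a list of
# dicts carrying a 'rewards' numpy array, as produced by `rollouts`; the
# function name `evaluation_statistics` is hypothetical.
# ---------------------------------------------------------------------------
import numpy as np


def evaluation_statistics(paths):
    """Summarize rollout returns and episode lengths as a flat dict."""
    total_returns = [path['rewards'].sum() for path in paths]
    episode_lengths = [len(path['rewards']) for path in paths]
    return {
        'return-average': np.mean(total_returns),
        'return-min': np.min(total_returns),
        'return-max': np.max(total_returns),
        'return-std': np.std(total_returns),
        'episode-length-avg': np.mean(episode_lengths),
        'episode-length-min': np.min(episode_lengths),
        'episode-length-max': np.max(episode_lengths),
        'episode-length-std': np.std(episode_lengths),
    }


if __name__ == '__main__':
    # Two fake rollouts with random rewards, just to exercise the helper.
    fake_paths = [{'rewards': np.random.randn(n)} for n in (100, 250)]
    print(evaluation_statistics(fake_paths))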