def evaluate(self, epoch, pool):
    logger.log("Collecting samples for evaluation")
    # Roll out evaluation trajectories with the current policy parameters.
    paths = parallel_sampler.sample_paths(
        policy_params=self.policy.get_param_values(),
        max_samples=self.eval_samples,
        max_path_length=self.max_path_length,
    )

    average_discounted_return = np.mean(
        [special.discount_return(path["rewards"], self.discount)
         for path in paths]
    )
    returns = [sum(path["rewards"]) for path in paths]

    # Aggregate the Q-values and Bellman targets recorded during training.
    all_qs = np.concatenate(self.q_averages)
    all_ys = np.concatenate(self.y_averages)

    average_q_loss = np.mean(self.qf_loss_averages)
    average_policy_surr = np.mean(self.policy_surr_averages)
    average_action = np.mean(np.square(np.concatenate(
        [path["actions"] for path in paths]
    )))

    policy_reg_param_norm = np.linalg.norm(
        self.policy.get_param_values(regularizable=True)
    )
    qfun_reg_param_norm = np.linalg.norm(
        self.qf.get_param_values(regularizable=True)
    )

    logger.record_tabular('Epoch', epoch)
    logger.record_tabular('AverageReturn', np.mean(returns))
    logger.record_tabular('StdReturn', np.std(returns))
    logger.record_tabular('MaxReturn', np.max(returns))
    logger.record_tabular('MinReturn', np.min(returns))
    if len(self.es_path_returns) > 0:
        logger.record_tabular('AverageEsReturn', np.mean(self.es_path_returns))
        logger.record_tabular('StdEsReturn', np.std(self.es_path_returns))
        logger.record_tabular('MaxEsReturn', np.max(self.es_path_returns))
        logger.record_tabular('MinEsReturn', np.min(self.es_path_returns))
    logger.record_tabular('AverageDiscountedReturn', average_discounted_return)
    logger.record_tabular('AverageQLoss', average_q_loss)
    logger.record_tabular('AveragePolicySurr', average_policy_surr)
    logger.record_tabular('AverageQ', np.mean(all_qs))
    logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs)))
    logger.record_tabular('AverageY', np.mean(all_ys))
    logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys)))
    logger.record_tabular('AverageAbsQYDiff', np.mean(np.abs(all_qs - all_ys)))
    logger.record_tabular('AverageAction', average_action)
    logger.record_tabular('PolicyRegParamNorm', policy_reg_param_norm)
    logger.record_tabular('QFunRegParamNorm', qfun_reg_param_norm)

    self.env.log_diagnostics(paths)
    self.policy.log_diagnostics(paths)

    # Reset the per-epoch running statistics.
    self.qf_loss_averages = []
    self.policy_surr_averages = []
    self.q_averages = []
    self.y_averages = []
    self.es_path_returns = []
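# For reference, a minimal sketch of the discounted-return helper used above.
# This assumes `special.discount_return(x, discount)` computes
# sum_t discount**t * x[t] over a path, as in rllab's `rllab.misc.special`;
# the name `_discount_return_sketch` is hypothetical.
import numpy as np

def _discount_return_sketch(rewards, discount):
    """Sum of discount**t * rewards[t] along one path (hedged sketch)."""
    rewards = np.asarray(rewards, dtype=np.float64)
    return np.sum(rewards * discount ** np.arange(len(rewards)))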
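# The variant below collects paths in-process via `rollout` rather than the
# parallel sampler. A minimal sketch of such a helper, assuming the
# rllab-style signature, an `agent.get_action(obs) -> (action, info)`
# interface, and a classic 4-tuple `env.step`; the real
# rllab.sampler.utils.rollout also records agent/env infos and handles the
# `animated`/`speedup` rendering options, which this sketch ignores.
import numpy as np

def _rollout_sketch(env, agent, max_path_length=np.inf, animated=False,
                    speedup=1, always_return_paths=False):
    """Run one episode and return it as a dict of arrays (hedged sketch)."""
    observations, actions, rewards = [], [], []
    o = env.reset()
    agent.reset()
    t = 0
    while t < max_path_length:
        a, _ = agent.get_action(o)
        next_o, r, done, _ = env.step(a)
        observations.append(o)
        actions.append(a)
        rewards.append(r)
        t += 1
        if done:
            break
        o = next_o
    return dict(
        observations=np.array(observations),
        actions=np.array(actions),
        rewards=np.array(rewards),
    )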
def evaluate(self, epoch, es_path_returns):
    logger.log("Collecting samples for evaluation")
    # We will replace this with our version of running the simulator and
    # generating paths:
    # paths = self._sample_paths(epoch)
    paths = []
    for _ in range(10):
        path = rollout(
            env=self.training_env,
            agent=self.policy,
            animated=self.render,
            speedup=5,
            max_path_length=200,
            always_return_paths=True,
        )
        paths.append(path)
    self.log_diagnostics(paths)

    rewards, terminals, obs, actions, next_obs = split_paths(paths)
    feed_dict = self._update_feed_dict(rewards, terminals, obs, actions,
                                       next_obs)

    # Compute statistics on the freshly collected batch.
    (
        policy_loss,
        qf_loss,
        policy_output,
        target_policy_output,
        qf_output,
        target_qf_output,
        ys,
    ) = self.sess.run(
        [
            self.policy_surrogate_loss,
            self.qf_loss,
            self.policy.output,
            self.target_policy.output,
            self.qf.output,
            self.target_qf.output,
            self.ys,
        ],
        feed_dict=feed_dict,
    )
    discounted_returns = [
        special.discount_return(path["rewards"], self.discount)
        for path in paths
    ]
    returns = [sum(path["rewards"]) for path in paths]
    rewards = np.hstack([path["rewards"] for path in paths])

    # Log statistics
    last_statistics = OrderedDict([
        ('Epoch', epoch),
        ('AverageReturn', np.mean(returns)),
        ('PolicySurrogateLoss', policy_loss),
        ('QfLoss', qf_loss),
    ])
    last_statistics.update(create_stats_ordered_dict('Ys', ys))
    last_statistics.update(create_stats_ordered_dict('PolicyOutput',
                                                     policy_output))
    last_statistics.update(create_stats_ordered_dict('TargetPolicyOutput',
                                                     target_policy_output))
    last_statistics.update(create_stats_ordered_dict('QfOutput', qf_output))
    last_statistics.update(create_stats_ordered_dict('TargetQfOutput',
                                                     target_qf_output))
    last_statistics.update(create_stats_ordered_dict('Rewards', rewards))
    last_statistics.update(create_stats_ordered_dict('Returns', returns))
    last_statistics.update(create_stats_ordered_dict('DiscountedReturns',
                                                     discounted_returns))
    if len(es_path_returns) > 0:
        last_statistics.update(create_stats_ordered_dict('TrainingReturns',
                                                         es_path_returns))

    for key, value in last_statistics.items():
        logger.record_tabular(key, value)

    return last_statistics
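# A hedged sketch of the statistics helper assumed above. The real
# `create_stats_ordered_dict` in rlkit/railrl has more options (e.g. flags to
# exclude max/min and handling for lists of arrays); this captures the core
# behavior the logging code relies on: one OrderedDict of summary statistics
# keyed by a name prefix.
from collections import OrderedDict
import numpy as np

def _create_stats_ordered_dict_sketch(name, data):
    """Return Mean/Std/Max/Min of `data`, keyed as '<name> <Stat>'."""
    data = np.asarray(data)
    return OrderedDict([
        (name + ' Mean', np.mean(data)),
        (name + ' Std', np.std(data)),
        (name + ' Max', np.max(data)),
        (name + ' Min', np.min(data)),
    ])

# Example: _create_stats_ordered_dict_sketch('Returns', returns) yields keys
# 'Returns Mean', 'Returns Std', 'Returns Max', 'Returns Min', which is why
# each update() call above contributes a consistent group of tabular columns.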