def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    """Collect samples for the given iteration number.

    Args:
        itr (int): Number of iteration.
        batch_size (int): Number of environment steps in one batch.
        whole_paths (bool): Whether to use whole path or truncated.

    Returns:
        list[dict]: A list of paths.

    """
    if not batch_size:
        batch_size = self.algo.max_path_length * self.n_envs

    cur_policy_params = self.algo.policy.get_param_values()
    paths = parallel_sampler.sample_paths(
        policy_params=cur_policy_params,
        max_samples=batch_size,
        max_path_length=self.algo.max_path_length,
        scope=self.algo.scope,
    )
    if whole_paths:
        return paths
    else:
        paths_truncated = truncate_paths(paths, batch_size)
        return paths_truncated
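When `whole_paths` is disabled, the sampler trims the collected batch down to at most `batch_size` environment steps. The snippet below is a minimal sketch of what such a truncation helper could look like; it is not the library's `truncate_paths` implementation, and it assumes the usual flat path layout where per-step data such as 'observations', 'actions', and 'rewards' are stored as NumPy arrays (nested entries like agent/env infos are ignored for brevity).

import numpy as np

def truncate_paths_sketch(paths, max_samples):
    """Illustrative truncation: keep whole paths until the step budget is hit,
    then slice the last path so the total step count equals max_samples."""
    truncated = []
    total = 0
    for path in paths:
        path_len = len(path['rewards'])
        if total + path_len <= max_samples:
            truncated.append(path)
            total += path_len
        else:
            remaining = max_samples - total
            if remaining > 0:
                # Slice every flat per-step array so the path stays consistent.
                truncated.append({
                    key: value[:remaining]
                    for key, value in path.items()
                    if isinstance(value, np.ndarray)
                })
            break
    return truncated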
def obtain_samples(self, itr):
    """Collect one batch of samples using the algorithm's batch size."""
    cur_policy_params = self.algo.policy.get_param_values()
    paths = parallel_sampler.sample_paths(
        policy_params=cur_policy_params,
        max_samples=self.algo.batch_size,
        max_path_length=self.algo.max_path_length,
        scope=self.algo.scope,
    )
    if self.algo.whole_paths:
        return paths
    else:
        paths_truncated = truncate_paths(paths, self.algo.batch_size)
        return paths_truncated
def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    """Obtain samples."""
    if not batch_size:
        batch_size = self.algo.max_path_length

    cur_params = self.algo.policy.get_param_values()
    paths = parallel_sampler.sample_paths(
        policy_params=cur_params,
        max_samples=batch_size,
        max_path_length=self.algo.max_path_length,
        scope=self.algo.scope,
    )
    return paths if whole_paths else truncate_paths(paths, batch_size)
def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    """Collect samples for the given iteration number."""
    if not batch_size:
        batch_size = self.algo.max_path_length * self.n_envs

    cur_policy_params = self.algo.policy.get_param_values()
    paths = parallel_sampler.sample_paths(
        policy_params=cur_policy_params,
        max_samples=batch_size,
        max_path_length=self.algo.max_path_length,
        scope=self.algo.scope,
    )
    if whole_paths:
        return paths
    else:
        paths_truncated = truncate_paths(paths, batch_size)
        return paths_truncated
def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    """Sample the policy for new trajectories.

    Args:
        itr (int): Number of iteration.
        batch_size (int): Number of environment steps in one batch.
        whole_paths (bool): Whether to use whole path or truncated.

    Returns:
        list[dict]: A list of paths.

    """
    if not batch_size:
        batch_size = self.algo.max_path_length

    cur_params = self.algo.policy.get_param_values()
    paths = parallel_sampler.sample_paths(
        policy_params=cur_params,
        max_samples=batch_size,
        max_path_length=self.algo.max_path_length,
        scope=self.algo.scope,
    )
    return paths if whole_paths else truncate_paths(paths, batch_size)
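These `obtain_samples` variants are driven by an outer training loop that requests a fresh batch each iteration. The sketch below shows that call pattern only; `sampler` and the concrete batch size are illustrative assumptions, not a specific garage API, and the per-path 'rewards' key matches the path dictionaries produced above.

# Illustrative driver loop; `sampler` is assumed to be a configured sampler instance.
n_itr = 100
for itr in range(n_itr):
    # Request roughly 4000 environment steps, truncating the last path if needed.
    paths = sampler.obtain_samples(itr, batch_size=4000, whole_paths=False)
    total_steps = sum(len(p['rewards']) for p in paths)
    print('itr {}: {} paths, {} steps'.format(itr, len(paths), total_steps))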
def evaluate(self, epoch, pool):
    """Collect evaluation rollouts and log per-epoch diagnostics."""
    logger.log("Collecting samples for evaluation")
    paths = parallel_sampler.sample_paths(
        policy_params=self.policy.get_param_values(),
        max_samples=self.eval_samples,
        max_path_length=self.max_path_length,
    )

    average_discounted_return = np.mean([
        special.discount_return(path["rewards"], self.discount)
        for path in paths
    ])

    returns = [sum(path["rewards"]) for path in paths]

    all_qs = np.concatenate(self.q_averages)
    all_ys = np.concatenate(self.y_averages)

    average_q_loss = np.mean(self.qf_loss_averages)
    average_policy_surr = np.mean(self.policy_surr_averages)

    average_action = np.mean(
        np.square(np.concatenate([path["actions"] for path in paths])))

    policy_reg_param_norm = np.linalg.norm(
        self.policy.get_param_values(regularizable=True))
    qfun_reg_param_norm = np.linalg.norm(
        self.qf.get_param_values(regularizable=True))

    logger.record_tabular('Epoch', epoch)
    logger.record_tabular('AverageReturn', np.mean(returns))
    logger.record_tabular('StdReturn', np.std(returns))
    logger.record_tabular('MaxReturn', np.max(returns))
    logger.record_tabular('MinReturn', np.min(returns))
    if self.es_path_returns:
        logger.record_tabular('AverageEsReturn',
                              np.mean(self.es_path_returns))
        logger.record_tabular('StdEsReturn', np.std(self.es_path_returns))
        logger.record_tabular('MaxEsReturn', np.max(self.es_path_returns))
        logger.record_tabular('MinEsReturn', np.min(self.es_path_returns))
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageQLoss', average_q_loss)
    logger.record_tabular('AveragePolicySurr', average_policy_surr)
    logger.record_tabular('AverageQ', np.mean(all_qs))
    logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs)))
    logger.record_tabular('AverageY', np.mean(all_ys))
    logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys)))
    logger.record_tabular('AverageAbsQYDiff',
                          np.mean(np.abs(all_qs - all_ys)))
    logger.record_tabular('AverageAction', average_action)

    logger.record_tabular('PolicyRegParamNorm', policy_reg_param_norm)
    logger.record_tabular('QFunRegParamNorm', qfun_reg_param_norm)

    self.policy.log_diagnostics(paths)

    # Reset the running statistics accumulated during the epoch.
    self.qf_loss_averages = []
    self.policy_surr_averages = []
    self.q_averages = []
    self.y_averages = []
    self.es_path_returns = []
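The AverageDiscountedReturn statistic above is produced by `special.discount_return`, which reduces a reward sequence to a single discounted sum. The standalone sketch below shows that computation; the helper name and module come from the snippet above, while this particular implementation is an assumption for illustration.

import numpy as np

def discount_return_sketch(rewards, discount):
    """Discounted return: sum_t discount**t * r_t (illustrative re-implementation)."""
    rewards = np.asarray(rewards, dtype=np.float64)
    return float(np.sum(rewards * (discount ** np.arange(len(rewards)))))

# Example: three steps of reward 1.0 with discount 0.99.
print(discount_return_sketch([1.0, 1.0, 1.0], 0.99))  # 1 + 0.99 + 0.9801 = 2.9701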
from garage.envs import normalize
from garage.envs.box2d import CartpoleEnv
from garage.theano.envs import TheanoEnv
from garage.theano.policies import GaussianMLPPolicy
from garage.sampler import parallel_sampler

# normalize() makes sure that the actions for the environment lie within the
# range [-1, 1] (only works for environments with continuous actions)
env = TheanoEnv(normalize(CartpoleEnv()))
# Initialize a neural network policy with a single hidden layer of 8 hidden
# units
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8, ))

parallel_sampler.populate_task(env, policy)
parallel_sampler.initialize(10)

paths = parallel_sampler.sample_paths(policy.get_param_values(), 100)

# We will collect 100 trajectories per iteration
N = 100
# Each trajectory will have at most 100 time steps
T = 100
# Number of iterations
n_itr = 100
# Set the discount factor for the problem
discount = 0.99
# Learning rate for the gradient update
learning_rate = 0.01

# Construct the computation graph

# Create a Theano variable for storing the observations. We could have simply
# written `observations_var = TT.matrix('observations')` instead for this
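The `paths` returned by the `parallel_sampler.sample_paths` call earlier in this script are a list of per-trajectory dictionaries. The snippet below is a quick, illustrative way to inspect what came back; it only assumes the 'observations', 'actions', and 'rewards' entries shown in the sampler code above, and the exact set of additional keys depends on the environment and policy.

import numpy as np

# Inspect the sampled trajectories; assumes `paths` was produced as above.
print('collected {} paths'.format(len(paths)))
first = paths[0]
print('keys:', sorted(first.keys()))
print('observations shape:', np.asarray(first['observations']).shape)
print('actions shape:', np.asarray(first['actions']).shape)
print('undiscounted return of first path:', float(np.sum(first['rewards'])))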