Code example #1
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Collect samples for the given iteration number.

        Args:
            itr (int): Iteration number.
            batch_size (int): Number of environment steps in one batch.
            whole_paths (bool): Whether to return whole paths or truncate
                them to the batch size.

        Returns:
            list[dict]: A list of paths.

        """
        if not batch_size:
            batch_size = self.algo.max_path_length * self.n_envs

        cur_policy_params = self.algo.policy.get_param_values()
        paths = parallel_sampler.sample_paths(
            policy_params=cur_policy_params,
            max_samples=batch_size,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
        )
        if whole_paths:
            return paths
        else:
            paths_truncated = truncate_paths(paths, batch_size)
            return paths_truncated
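These batch samplers all delegate truncation to `truncate_paths` from garage's sampler utilities. A minimal sketch of the behavior the examples rely on (illustrative, not garage's actual implementation), assuming each path is a dict of per-step NumPy arrays keyed by names such as 'observations', 'actions' and 'rewards':

import numpy as np


def truncate_paths(paths, max_samples):
    # Keep whole paths while they fit within the sample budget; cut the
    # first path that overflows short, and drop everything after it.
    truncated = []
    remaining = max_samples
    for path in paths:
        path_len = len(path['rewards'])
        if path_len <= remaining:
            truncated.append(path)
            remaining -= path_len
        else:
            truncated.append({
                key: np.asarray(values)[:remaining]
                for key, values in path.items()
            })
            break
    return truncated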
Code example #2
File: batch_sampler.py Project: Kelvinson/garage
    def obtain_samples(self, itr):
        cur_policy_params = self.algo.policy.get_param_values()
        paths = parallel_sampler.sample_paths(
            policy_params=cur_policy_params,
            max_samples=self.algo.batch_size,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
        )
        if self.algo.whole_paths:
            return paths
        else:
            paths_truncated = truncate_paths(paths, self.algo.batch_size)
            return paths_truncated
Code example #3
File: batch_sampler.py Project: JoleProject/Jole
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Obtain samples."""
        if not batch_size:
            batch_size = self.algo.max_path_length

        cur_params = self.algo.policy.get_param_values()
        paths = parallel_sampler.sample_paths(
            policy_params=cur_params,
            max_samples=batch_size,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
        )

        return paths if whole_paths else truncate_paths(paths, batch_size)
Code example #4
File: batch_sampler.py Project: splendor-kill/garage
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        if not batch_size:
            batch_size = self.algo.max_path_length * self.n_envs

        cur_policy_params = self.algo.policy.get_param_values()
        paths = parallel_sampler.sample_paths(
            policy_params=cur_policy_params,
            max_samples=batch_size,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
        )
        if whole_paths:
            return paths
        else:
            paths_truncated = truncate_paths(paths, batch_size)
            return paths_truncated
Code example #5
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Sample the policy for new trajectories.

        Args:
            itr (int): Iteration number.
            batch_size (int): Number of environment steps in one batch.
            whole_paths (bool): Whether to return whole paths or truncate
                them to the batch size.

        Returns:
            list[dict]: A list of paths.

        """
        if not batch_size:
            batch_size = self.algo.max_path_length

        cur_params = self.algo.policy.get_param_values()
        paths = parallel_sampler.sample_paths(
            policy_params=cur_params,
            max_samples=batch_size,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
        )

        return paths if whole_paths else truncate_paths(paths, batch_size)
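For orientation, a hypothetical call site for `obtain_samples`; `sampler` and `n_itr` are illustrative names, not taken from the projects above:

# `sampler` is assumed to be an instance of one of the BatchSampler
# variants shown above, wired to an algorithm with a policy and
# max_path_length already configured.
n_itr = 10
for itr in range(n_itr):
    # Collect one batch of on-policy trajectories; omitting batch_size
    # falls back to the sampler's default computed above.
    paths = sampler.obtain_samples(itr)
    total_steps = sum(len(path['rewards']) for path in paths)
    print('itr {}: {} paths, {} steps'.format(itr, len(paths), total_steps))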
Code example #6
File: ddpg.py Project: ScapeQin/garage
    def evaluate(self, epoch, pool):
        logger.log("Collecting samples for evaluation")
        paths = parallel_sampler.sample_paths(
            policy_params=self.policy.get_param_values(),
            max_samples=self.eval_samples,
            max_path_length=self.max_path_length,
        )

        average_discounted_return = np.mean([
            special.discount_return(path["rewards"], self.discount)
            for path in paths
        ])

        returns = [sum(path["rewards"]) for path in paths]

        all_qs = np.concatenate(self.q_averages)
        all_ys = np.concatenate(self.y_averages)

        average_q_loss = np.mean(self.qf_loss_averages)
        average_policy_surr = np.mean(self.policy_surr_averages)
        average_action = np.mean(
            np.square(np.concatenate([path["actions"] for path in paths])))

        policy_reg_param_norm = np.linalg.norm(
            self.policy.get_param_values(regularizable=True))
        qfun_reg_param_norm = np.linalg.norm(
            self.qf.get_param_values(regularizable=True))

        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('AverageReturn', np.mean(returns))
        logger.record_tabular('StdReturn', np.std(returns))
        logger.record_tabular('MaxReturn', np.max(returns))
        logger.record_tabular('MinReturn', np.min(returns))
        if self.es_path_returns:
            logger.record_tabular('AverageEsReturn',
                                  np.mean(self.es_path_returns))
            logger.record_tabular('StdEsReturn', np.std(self.es_path_returns))
            logger.record_tabular('MaxEsReturn', np.max(self.es_path_returns))
            logger.record_tabular('MinEsReturn', np.min(self.es_path_returns))
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageQLoss', average_q_loss)
        logger.record_tabular('AveragePolicySurr', average_policy_surr)
        logger.record_tabular('AverageQ', np.mean(all_qs))
        logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs)))
        logger.record_tabular('AverageY', np.mean(all_ys))
        logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys)))
        logger.record_tabular('AverageAbsQYDiff',
                              np.mean(np.abs(all_qs - all_ys)))
        logger.record_tabular('AverageAction', average_action)

        logger.record_tabular('PolicyRegParamNorm', policy_reg_param_norm)
        logger.record_tabular('QFunRegParamNorm', qfun_reg_param_norm)

        self.policy.log_diagnostics(paths)

        self.qf_loss_averages = []
        self.policy_surr_averages = []

        self.q_averages = []
        self.y_averages = []
        self.es_path_returns = []
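`special.discount_return` above computes the discounted return of a reward sequence. A minimal NumPy equivalent (an illustrative sketch, not garage's implementation):

import numpy as np


def discount_return(rewards, discount):
    # Discounted return: sum over t of discount**t * rewards[t]
    rewards = np.asarray(rewards)
    return np.sum(rewards * discount ** np.arange(len(rewards)))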
Code example #7
from garage.envs import normalize
from garage.envs.box2d import CartpoleEnv
from garage.theano.envs import TheanoEnv
from garage.theano.policies import GaussianMLPPolicy
from garage.sampler import parallel_sampler

# normalize() makes sure that the actions for the environment lie within the
# range [-1, 1] (only works for environments with continuous actions)
env = TheanoEnv(normalize(CartpoleEnv()))
# Initialize a neural network policy with a single hidden layer of 8 hidden
# units
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8, ))
# Start the sampler's worker pool before populating it with the task;
# populate_task then sends the environment and policy to each worker
parallel_sampler.initialize(10)
parallel_sampler.populate_task(env, policy)
# Sample up to 100 environment steps with the current policy parameters
paths = parallel_sampler.sample_paths(policy.get_param_values(), 100)
# We will collect 100 trajectories per iteration
N = 100
# Each trajectory will have at most 100 time steps
T = 100
# Number of iterations
n_itr = 100
# Set the discount factor for the problem
discount = 0.99
# Learning rate for the gradient update
learning_rate = 0.01

# Construct the computation graph

# Create a Theano variable for storing the observations. We could have simply
# written `observations_var = TT.matrix('observations')` instead for this