Example #1
def test_parallel_sampling_deterministic_wo_min_steps(
    env: SimEnv,
    policy: Policy,
    min_rollouts: Optional[int],
    init_states: Optional[int],
    domain_params: Optional[List[dict]],
):
    env.max_steps = 20

    if init_states is not None:
        init_states = [
            env.spec.state_space.sample_uniform() for _ in range(init_states)
        ]

    nums_workers = (1, 2, 4)

    all_rollouts = []
    for num_workers in nums_workers:
        # Use an exploration strategy to test that it works too (it should, since the policy gets pickled and
        # distributed anyway).
        all_rollouts.append(
            ParallelRolloutSampler(
                env,
                NormalActNoiseExplStrat(policy, std_init=1.0),
                num_workers=num_workers,
                min_rollouts=min_rollouts,
                seed=0,
            ).sample(init_states=init_states, domain_params=domain_params))

    # Test that the rollouts are actually different, i.e., that the same seed is not used for all rollouts.
    for ros in all_rollouts:
        for ro_a, ro_b in [(a, b) for a in ros for b in ros if a is not b]:
            # The idle policy is deterministic and always outputs the zero action. Hence, do not check that the actions
            # are different when using the idle policy.
            if isinstance(policy, IdlePolicy):
                # The Quanser Ball Balancer is a deterministic environment (conditioned on the initial state). As the
                # idle policy is a deterministic policy, this results in the rollouts being equivalent for each
                # initial state, so do not check for differences if the initial states were set.
                if init_states is None:
                    assert ro_a.rewards != pytest.approx(ro_b.rewards)
                    assert ro_a.observations != pytest.approx(
                        ro_b.observations)
            else:
                assert ro_a.rewards != pytest.approx(ro_b.rewards)
                assert ro_a.observations != pytest.approx(ro_b.observations)
                assert ro_a.actions != pytest.approx(ro_b.actions)

    # Test that the rollouts are equal for all numbers of workers.
    for ros_a, ros_b in [(a, b) for a in all_rollouts for b in all_rollouts]:
        assert len(ros_a) == len(ros_b)
        for ro_a, ro_b in zip(ros_a, ros_b):
            assert ro_a.rewards == pytest.approx(ro_b.rewards)
            assert ro_a.observations == pytest.approx(ro_b.observations)
            assert ro_a.actions == pytest.approx(ro_b.actions)
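Note on the pairwise check above: the comprehension [(a, b) for a in ros for b in ros if a is not b] visits every ordered pair of distinct rollouts, so each pair is compared twice. Since the inequality assertions are symmetric, the same check can be written with itertools.combinations. A minimal, self-contained sketch (the list entries are hypothetical stand-ins for the StepSequence rollouts):

import itertools

# Hypothetical stand-ins for the rollouts held in `ros` above
ros = ["rollout_0", "rollout_1", "rollout_2"]

# Each unordered pair of distinct rollouts, visited exactly once
for ro_a, ro_b in itertools.combinations(ros, 2):
    assert ro_a is not ro_b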
Example #2
def test_action_statistics(env: SimEnv, policy: Policy):
    sigma = 1.0  # with lower values like 0.1 we can observe violations of the tolerances

    # Create an action-based exploration strategy
    explstrat = NormalActNoiseExplStrat(policy, std_init=sigma)

    # Sample a deterministic rollout
    ro_policy = rollout(env,
                        policy,
                        eval=True,
                        max_steps=1000,
                        stop_on_done=False,
                        seed=0)
    ro_policy.torch(to.get_default_dtype())

    # Run the exploration strategy on the previously sampled rollout
    if policy.is_recurrent:
        if isinstance(policy, TwoHeadedPolicy):
            act_expl, _, _ = explstrat(ro_policy.observations)
        else:
            act_expl, _ = explstrat(ro_policy.observations)
        # Get the hidden states from the deterministic rollout
        hidden_states = ro_policy.hidden_states
    else:
        if isinstance(policy, TwoHeadedPolicy):
            act_expl, _ = explstrat(ro_policy.observations)
        else:
            act_expl = explstrat(ro_policy.observations)
        hidden_states = [0.0] * ro_policy.length  # just something that does not violate the format

    ro_expl = StepSequence(
        actions=act_expl[:-1],  # drop the last action since there is one more observation than actions
        observations=ro_policy.observations,
        rewards=ro_policy.rewards,  # don't care but necessary
        hidden_states=hidden_states,
    )
    ro_expl.torch()

    # Compute action statistics and the ground truth
    actstats = compute_action_statistics(ro_expl, explstrat)
    gt_logprobs = Normal(loc=ro_policy.actions,
                         scale=sigma).log_prob(ro_expl.actions)
    gt_entropy = Normal(loc=ro_policy.actions, scale=sigma).entropy()

    to.testing.assert_allclose(actstats.log_probs,
                               gt_logprobs,
                               rtol=1e-4,
                               atol=1e-5)
    to.testing.assert_allclose(actstats.entropy,
                               gt_entropy,
                               rtol=1e-4,
                               atol=1e-5)
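The ground truth above follows from the additive-noise model: if the exploration strategy perturbs the deterministic policy action with zero-mean Gaussian noise of standard deviation sigma, the exploratory action is distributed as a Normal centered at the clean action. A minimal sketch with hypothetical tensors (not taken from the test suite):

import torch as to
from torch.distributions import Normal

sigma = 1.0
act_clean = to.zeros(5, 2)                      # stand-in for the deterministic policy actions
act_noisy = act_clean + sigma * to.randn(5, 2)  # stand-in for the exploratory actions

# Normal(loc=clean action, scale=sigma) is exactly the ground-truth distribution
# used for the log-probabilities and the entropy in the test above
dist = Normal(loc=act_clean, scale=sigma)
logp = dist.log_prob(act_noisy)  # element-wise log-densities
ent = dist.entropy()             # 0.5 * log(2 * pi * e * sigma^2) per element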
Example #3
def test_parallel_sampling_deterministic_w_min_steps(
    env: SimEnv,
    policy: Policy,
    min_rollouts: Optional[int],
    min_steps: int,
    domain_params: Optional[List[dict]],
):
    env.max_steps = 20

    nums_workers = (1, 2, 4)

    all_rollouts = []
    for num_workers in nums_workers:
        # Use an exploration strategy to test that it works too (it should, since the policy gets pickled and
        # distributed anyway).
        all_rollouts.append(
            ParallelRolloutSampler(
                env,
                NormalActNoiseExplStrat(policy, std_init=1.0),
                num_workers=num_workers,
                min_rollouts=min_rollouts,
                min_steps=min_steps * env.max_steps,
                seed=0,
            ).sample(domain_params=domain_params))

    # Test that the rollouts are actually different, i.e., that the same seed is not used for all rollouts.
    for ros in all_rollouts:
        for ro_a, ro_b in [(a, b) for a in ros for b in ros if a is not b]:
            # The idle policy is deterministic and always outputs the zero action. Hence, do not check that the actions
            # are different when using the idle policy.
            if not isinstance(policy, IdlePolicy):
                assert ro_a.rewards != pytest.approx(ro_b.rewards)
                assert ro_a.observations != pytest.approx(ro_b.observations)
                assert ro_a.actions != pytest.approx(ro_b.actions)

    # Test that the rollouts are equal for all numbers of workers.
    for ros_a, ros_b in [(a, b) for a in all_rollouts for b in all_rollouts]:
        assert sum([len(ro) for ro in ros_a]) == sum([len(ro) for ro in ros_b])
        assert sum([len(ro) for ro in ros_a]) >= min_steps * env.max_steps
        assert sum([len(ro) for ro in ros_b]) >= min_steps * env.max_steps
        assert len(ros_a) == len(ros_b)
        if min_rollouts is not None:
            assert len(ros_a) >= min_rollouts
            assert len(ros_b) >= min_rollouts
        for ro_a, ro_b in zip(ros_a, ros_b):
            assert ro_a.rewards == pytest.approx(ro_b.rewards)
            assert ro_a.observations == pytest.approx(ro_b.observations)
            assert ro_a.actions == pytest.approx(ro_b.actions)
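The sum assertions above reflect the sampler's stopping rule as exercised by the test: sampling continues until at least min_steps * env.max_steps transitions and, if given, at least min_rollouts rollouts have been collected. A rough sketch of the implied lower bound on the number of rollouts (an assumption about the stopping rule, not code from pyrado):

import math

def min_num_rollouts(min_steps: int, max_steps: int, min_rollouts=None) -> int:
    # Each rollout contributes at most max_steps transitions
    lower = math.ceil(min_steps / max_steps)
    if min_rollouts is not None:
        lower = max(lower, min_rollouts)
    return lower

print(min_num_rollouts(min_steps=3 * 20, max_steps=20, min_rollouts=5))  # 5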
Example #4
def test_noise_on_act(env, policy):
    for _ in range(100):
        # Init the exploration strategy
        act_noise_strat = NormalActNoiseExplStrat(policy,
                                                  std_init=0.5,
                                                  train_mean=True)

        # Set new parameters for the exploration noise
        std = to.ones(env.act_space.flat_dim) * to.rand(1)
        mean = to.rand(env.act_space.shape)
        act_noise_strat.noise.adapt(mean, std)
        assert (mean == act_noise_strat.noise.mean).all()

        # Sample a random observation from the environment
        obs = to.from_numpy(env.obs_space.sample_uniform())

        # Get a clean and a noisy action
        act = policy(obs)  # policy expects Tensors
        act_noisy = act_noise_strat(obs)  # exploration strategy expects ndarrays
        assert isinstance(act, to.Tensor)
        assert not to.equal(act, act_noisy)
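The last assertion holds because the exploration strategy draws a continuous random action, which almost surely differs from the clean policy action. A minimal sketch of the assumed behavior (hypothetical tensors, not the NormalActNoiseExplStrat internals):

import torch as to
from torch.distributions import Normal

policy_act = to.tensor([0.2, -0.1])  # stand-in for the clean policy action
noise_mean = to.zeros(2)             # stand-in for the learnable noise shift (train_mean=True)
noise_std = 0.5 * to.ones(2)

# Sample around the (possibly mean-shifted) policy action; with std > 0 the result
# equals the clean action only with probability zero
act_noisy = Normal(loc=policy_act + noise_mean, scale=noise_std).sample()
assert not to.equal(policy_act, act_noisy)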
Example #5
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 critic: GAE,
                 max_iter: int,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 num_epoch: int = 3,
                 eps_clip: float = 0.1,
                 batch_size: int = 64,
                 std_init: float = 1.0,
                 num_sampler_envs: int = 4,
                 max_grad_norm: float = None,
                 lr: float = 5e-4,
                 lr_scheduler=None,
                 lr_scheduler_hparam: [dict, None] = None,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots, i.e., the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param critic: advantage estimation function $A(s,a) = Q(s,a) - V(s)$
        :param max_iter: number of iterations (policy updates)
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_epoch: number of iterations over all gathered samples during one policy update
        :param eps_clip: max/min probability ratio, see [1]
        :param batch_size: number of samples per policy update batch
        :param std_init: initial standard deviation on the actions for the exploration noise
        :param num_sampler_envs: number of environments for parallel sampling
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param lr: (initial) learning rate for the optimizer, which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created

        .. note::
            The Adam optimizer computes individual learning rates for all parameters. Thus, the learning rate scheduler
            schedules the maximum learning rate.
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not isinstance(policy, Policy):
            raise pyrado.TypeErr(given=policy, expected_type=Policy)

        # Call ActorCritic's constructor
        super().__init__(env, policy, critic, save_dir, max_iter, logger)

        # Store the inputs
        self.num_epoch = num_epoch
        self.eps_clip = eps_clip
        self.batch_size = batch_size
        self.max_grad_norm = max_grad_norm

        # Initialize
        self.log_loss = True
        self._expl_strat = NormalActNoiseExplStrat(self._policy,
                                                   std_init=std_init)
        self.sampler = ParallelSampler(env,
                                       self._expl_strat,
                                       num_envs=num_sampler_envs,
                                       min_steps=min_steps,
                                       min_rollouts=min_rollouts)
        self.optim = to.optim.Adam(
            [{
                'params': self._expl_strat.policy.parameters()
            }, {
                'params': self._expl_strat.noise.parameters()
            }],
            lr=lr,
            eps=1e-5)
        self._lr_scheduler = lr_scheduler
        self._lr_scheduler_hparam = lr_scheduler_hparam
        if lr_scheduler is not None:
            self._lr_scheduler = lr_scheduler(self.optim,
                                              **lr_scheduler_hparam)
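For reference, eps_clip bounds the probability ratio in the clipped surrogate objective from [1]. A short sketch of that generic loss (an illustration, not necessarily pyrado's exact PPO implementation):

import torch as to

def ppo_clip_loss(logp_new: to.Tensor, logp_old: to.Tensor, adv: to.Tensor, eps_clip: float = 0.1) -> to.Tensor:
    ratio = to.exp(logp_new - logp_old)                       # probability ratio pi_new / pi_old
    clipped = to.clamp(ratio, 1.0 - eps_clip, 1.0 + eps_clip)
    return -to.min(ratio * adv, clipped * adv).mean()         # maximize the surrogate, hence the minus sign

loss = ppo_clip_loss(to.randn(64), to.randn(64), to.randn(64))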
Example #6
    def __init__(
        self,
        save_dir: str,
        env: Env,
        policy: Policy,
        lr: float = 5e-4,
        std_init: float = 0.15,
        min_steps: int = 1500,
        num_epochs: int = 10,
        max_iter: int = 500,
        num_teachers: int = 8,
        teacher_extra: Optional[dict] = None,
        teacher_policy: Optional[Policy] = None,
        teacher_algo: Optional[callable] = None,
        teacher_algo_hparam: Optional[dict] = None,
        randomizer: Optional[DomainRandomizer] = None,
        logger: Optional[StepLogger] = None,
        num_workers: int = 4,
    ):
        """
        Constructor

        :param save_dir: directory to save the snapshots, i.e., the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param lr: (initial) learning rate for the optimizer, which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param std_init: initial standard deviation on the actions for the exploration noise
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_epochs: number of epochs (how often we iterate over the same batch)
        :param max_iter: number of iterations (policy updates)
        :param num_teachers: number of teachers that are used for distillation
        :param teacher_extra: extra dict from PDDRTeachers algo. If provided, teachers are loaded from there
        :param teacher_policy: policy to be updated (is duplicated for each teacher)
        :param teacher_algo: algorithm class to be used for training the teachers
        :param teacher_algo_hparam: hyper-params to be used for teacher_algo
        :param randomizer: randomizer for sampling the teacher domain parameters; if `None`, the environment's default
                           one is used
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        :param num_workers: number of environments for parallel sampling
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not isinstance(policy, Policy):
            raise pyrado.TypeErr(given=policy, expected_type=Policy)

        # Call Algorithm's constructor.
        super().__init__(
            num_checkpoints=1, init_checkpoint=-1, save_dir=save_dir, max_iter=max_iter, policy=policy, logger=logger
        )

        # Store the inputs
        self.env_real = env
        self.min_steps = min_steps
        self.num_epochs = num_epochs
        self.num_teachers = num_teachers
        self.num_workers = num_workers

        self.teacher_policies = []
        self.teacher_envs = []
        self.teacher_expl_strats = []
        self.teacher_critics = []
        self.teacher_ex_dirs = []

        # Teachers
        if teacher_policy is not None and teacher_algo is not None and teacher_algo_hparam is not None:
            if not isinstance(teacher_policy, Policy):
                raise pyrado.TypeErr(given=teacher_policy, expected_type=Policy)
            if not issubclass(teacher_algo, Algorithm):
                raise pyrado.TypeErr(given=teacher_algo, expected_type=Algorithm)

            if randomizer is None:
                self.randomizer = create_default_randomizer(env)
            else:
                assert isinstance(randomizer, DomainRandomizer)
                self.randomizer = randomizer

            self.set_random_envs()

            # Prepare folders
            self.teacher_ex_dirs = [os.path.join(self.save_dir, f"teachers_{idx}") for idx in range(self.num_teachers)]
            for idx in range(self.num_teachers):
                os.makedirs(self.teacher_ex_dirs[idx], exist_ok=True)

            # Create teacher algos
            self.algos = [
                teacher_algo(
                    save_dir=self.teacher_ex_dirs[idx],
                    env=self.teacher_envs[idx],
                    policy=deepcopy(teacher_policy),
                    logger=None,
                    **deepcopy(teacher_algo_hparam),
                )
                for idx in range(self.num_teachers)
            ]
        elif teacher_extra is not None:
            self.unpack_teachers(teacher_extra)
            assert self.num_teachers == len(self.teacher_policies)
            self.reset_checkpoint()
        else:
            self.load_teachers()
            if self.num_teachers < len(self.teacher_policies):
                print(
                    f"You have loaded {len(self.teacher_policies)} teachers. Only the first {self.num_teachers} will be used!"
                )
                self.prune_teachers()
            assert self.num_teachers == len(self.teacher_policies)
            self.reset_checkpoint()

        # Student
        self._expl_strat = NormalActNoiseExplStrat(self._policy, std_init=std_init)
        self.optimizer = to.optim.Adam([{"params": self.policy.parameters()}], lr=lr)

        # Environments
        self.samplers = [
            ParallelRolloutSampler(
                self.teacher_envs[t],
                deepcopy(self._expl_strat),
                num_workers=self.num_workers,
                min_steps=self.min_steps,
            )
            for t in range(self.num_teachers)
        ]

        self.teacher_weights = np.ones(self.num_teachers)

        # Distillation loss criterion
        self.criterion = to.nn.KLDivLoss(log_target=True, reduction="batchmean")
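For reference, with log_target=True both arguments of KLDivLoss are expected to be log-probabilities, and reduction="batchmean" divides the summed divergence by the batch size. A minimal usage sketch with hypothetical stand-ins for the student and teacher outputs:

import torch as to

criterion = to.nn.KLDivLoss(log_target=True, reduction="batchmean")

student_logp = to.log_softmax(to.randn(32, 4), dim=-1)  # hypothetical student log-probabilities
teacher_logp = to.log_softmax(to.randn(32, 4), dim=-1)  # hypothetical teacher log-probabilities

# Computes KL(teacher || student), averaged over the batch
loss = criterion(student_logp, teacher_logp)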
Example #7
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 particle_hparam: dict,
                 max_iter: int,
                 num_particles: int,
                 temperature: float,
                 lr: float,
                 horizon: int,
                 std_init: float = 1.0,
                 min_rollouts: int = None,
                 min_steps: int = 10000,
                 num_sampler_envs: int = 4,
                 serial: bool = True,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots, i.e., the results, in
        :param env: the environment in which the policy operates
        :param particle_hparam: hyper-parameters for particle template construction
        :param max_iter: number of iterations
        :param num_particles: number of distinct particles
        :param temperature: temperature of the SVGD update; it determines how jointly the particles are trained
        :param lr: the learning rate for the update of the particles
        :param horizon: horizon for each particle
        :param std_init: initial standard deviation for the exploration
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_sampler_envs: number of environments for parallel sampling
        :param serial: serial mode can be switched off, which allows partly controlling the flow of SVPG from outside
        :param logger: logger for every step of the algorithm
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not isinstance(particle_hparam, dict):
            raise pyrado.TypeErr(given=particle_hparam, expected_type=dict)
        if not all([
                key in particle_hparam
                for key in ['actor', 'value_fcn', 'critic']
        ]):
            raise AttributeError

        # Call Algorithm's constructor
        super().__init__(save_dir, max_iter, policy=None, logger=logger)

        # Store the inputs
        self._env = env
        self.num_particles = num_particles
        self.horizon = horizon  # TODO @Robin: where is the horizon used?!
        self.lr = lr
        self.temperature = temperature
        self.serial = serial

        # Prepare placeholders for particles
        self.particles = [None] * num_particles
        self.expl_strats = [None] * num_particles
        self.optimizers = [None] * num_particles
        self.fixed_particles = [None] * num_particles
        self.fixed_expl_strats = [None] * num_particles
        self.samplers = [None] * num_particles
        self.count = 0
        self.updatecount = 0

        # Particle factory
        actor = FNNPolicy(spec=env.spec, **particle_hparam['actor'])
        value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                              **particle_hparam['value_fcn'])
        critic = GAE(value_fcn, **particle_hparam['critic'])
        particle = SVPGParticle(env.spec, actor, critic)

        for i in range(self.num_particles):
            self.particles[i] = deepcopy(particle)
            self.particles[i].init_param()
            self.expl_strats[i] = NormalActNoiseExplStrat(
                self.particles[i].actor, std_init)
            self.optimizers[i] = to.optim.Adam(
                self.expl_strats[i].parameters(), lr=self.lr)
            self.fixed_particles[i] = deepcopy(self.particles[i])
            self.fixed_expl_strats[i] = deepcopy(self.expl_strats[i])

            if self.serial:
                self.samplers[i] = ParallelSampler(env,
                                                   self.expl_strats[i],
                                                   num_sampler_envs,
                                                   min_rollouts=min_rollouts,
                                                   min_steps=min_steps)
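For reference, the temperature enters the Stein Variational Gradient Descent update that couples the particles: it downweights each particle's own policy gradient relative to the repulsive kernel term, so higher temperatures make the particles train more jointly and diversely. A sketch of the generic SVGD/SVPG update with an RBF kernel (an illustration, not pyrado's implementation):

import torch as to

def svgd_update(params: to.Tensor, policy_grads: to.Tensor, temperature: float, bandwidth: float = 1.0) -> to.Tensor:
    # params and policy_grads: n_particles x dim (flattened particle parameters and their gradients)
    diff = params.unsqueeze(1) - params.unsqueeze(0)                 # pairwise differences, n x n x dim
    kernel = to.exp(-diff.pow(2).sum(-1) / (2.0 * bandwidth**2))     # RBF kernel matrix, n x n
    driving = kernel @ policy_grads / temperature                    # kernel-weighted gradients, scaled by 1/T
    repulsion = (kernel.unsqueeze(-1) * diff).sum(1) / bandwidth**2  # keeps the particles apart
    return (driving + repulsion) / params.shape[0]

phi = svgd_update(to.randn(4, 10), to.randn(4, 10), temperature=1.0)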
Example #8
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 critic: GAE,
                 max_iter: int,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 vfcn_coeff: float = 0.5,
                 entropy_coeff: float = 1e-3,
                 batch_size: int = 32,
                 std_init: float = 1.0,
                 max_grad_norm: float = None,
                 num_workers: int = 4,
                 lr: float = 5e-4,
                 lr_scheduler=None,
                 lr_scheduler_hparam: [dict, None] = None,
                 logger: StepLogger = None):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots, i.e., the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param critic: advantage estimation function $A(s,a) = Q(s,a) - V(s)$
        :param max_iter: number of iterations (policy updates)
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param vfcn_coeff: weighting factor of the value function term in the combined loss, specific to PPO2
        :param entropy_coeff: weighting factor of the entropy term in the combined loss, specific to PPO2
        :param batch_size: number of samples per policy update batch
        :param std_init: initial standard deviation on the actions for the exploration noise
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param num_workers: number of environments for parallel sampling
        :param lr: (initial) learning rate for the optimizer, which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        # Call ActorCritic's constructor
        super().__init__(env, policy, critic, save_dir, max_iter, logger)

        # Store the inputs
        self.min_rollouts = min_rollouts
        self.min_steps = min_steps
        self.vfcn_coeff = vfcn_coeff
        self.entropy_coeff = entropy_coeff
        self.batch_size = batch_size
        self.max_grad_norm = max_grad_norm

        # Initialize
        self._expl_strat = NormalActNoiseExplStrat(self._policy, std_init=std_init)
        self.sampler = ParallelRolloutSampler(
            env, self.expl_strat,
            num_workers=num_workers,
            min_steps=min_steps,
            min_rollouts=min_rollouts
        )
        self.optim = to.optim.RMSprop(
            [{'params': self._policy.parameters()},
             {'params': self.expl_strat.noise.parameters()},
             {'params': self._critic.vfcn.parameters()}],
            lr=lr, eps=1e-5
        )
        self._lr_scheduler = lr_scheduler
        self._lr_scheduler_hparam = lr_scheduler_hparam
        if lr_scheduler is not None:
            self._lr_scheduler = lr_scheduler(self.optim, **lr_scheduler_hparam)
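For reference, vfcn_coeff and entropy_coeff weight the terms of the combined loss that is minimized per batch. A sketch of the generic actor-critic formulation (an illustration, not necessarily pyrado's exact PPO2 code):

import torch as to

def combined_loss(policy_loss: to.Tensor, vfcn_loss: to.Tensor, entropy: to.Tensor,
                  vfcn_coeff: float = 0.5, entropy_coeff: float = 1e-3) -> to.Tensor:
    # The value-function term is added, the entropy bonus is subtracted to encourage exploration
    return policy_loss + vfcn_coeff * vfcn_loss - entropy_coeff * entropy

loss = combined_loss(to.tensor(0.3), to.tensor(1.2), to.tensor(0.8))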