Code example #1
def test_param_expl_sampler(
    env: SimEnv,
    policy: Policy,
    num_init_states_per_domain: int,
    fixed_init_state: bool,
    num_domains: int,
    num_workers: int,
):
    pyrado.set_seed(0)

    # Add randomizer
    pert = create_default_randomizer(env)
    env = DomainRandWrapperLive(env, pert)

    # Create the sampler
    sampler = ParameterExplorationSampler(env,
                                          policy,
                                          num_init_states_per_domain,
                                          num_domains,
                                          num_workers=num_workers)

    # Use some random parameters
    num_ps = 7
    params = to.rand(num_ps, policy.num_param)

    if fixed_init_state:
        # Sample a custom init state
        init_states = [env.init_space.sample_uniform()] * num_init_states_per_domain
    else:
        # Let the sampler forward to the env to randomly sample an init state
        init_states = None

    # Do the sampling
    samples = sampler.sample(param_sets=params, init_states=init_states)

    # Check if the correct number of rollouts has been sampled
    assert num_ps == len(samples)
    num_rollouts_per_param = num_init_states_per_domain * num_domains
    assert num_ps * num_rollouts_per_param == samples.num_rollouts
    for ps in samples:
        assert len(ps.rollouts) == num_rollouts_per_param

    # Compare rollouts that should be matching
    for idx in range(num_rollouts_per_param):
        # Use the first parameter set as pivot
        piter = iter(samples)
        pivot = next(piter).rollouts[idx]

        # Iterate through others
        for ops in piter:
            other_ro = ops.rollouts[idx]
            # Compare domain params
            assert pivot.rollout_info["domain_param"] == other_ro.rollout_info["domain_param"]
            # Compare first observation a.k.a. init state
            assert pivot[0].observation == pytest.approx(
                other_ro[0].observation)
Code example #2
def test_cuda_sampling_w_dr(env: SimEnv, policy: Policy, num_workers: int):
    randomizer = create_default_randomizer(env)
    env = DomainRandWrapperLive(env, randomizer)

    sampler = ParallelRolloutSampler(env,
                                     policy,
                                     num_workers=num_workers,
                                     min_rollouts=4)
    samples = sampler.sample()

    assert samples is not None
Code example #3
def test_dr_wrapper_live_bob(env):
    param_init = env.domain_param
    randomizer = create_default_randomizer(env)
    wrapper = DomainRandWrapperLive(env, randomizer)
    # So far no randomization happened, thus the parameter should be equal
    assert env.domain_param == param_init

    # Randomize 10 times, sampling one new parameter set on every reset
    for _ in range(10):
        param_old = wrapper.domain_param
        wrapper.reset()
        assert param_old != wrapper.domain_param
Code example #4
def test_combination():
    env = QCartPoleSwingUpSim(dt=1/50., max_steps=20)

    randomizer = create_default_randomizer(env)
    env_r = DomainRandWrapperBuffer(env, randomizer)
    env_r.fill_buffer(num_domains=3)

    dp_before = []
    dp_after = []
    for i in range(4):
        dp_before.append(env_r.domain_param)
        rollout(env_r, DummyPolicy(env_r.spec), eval=True, seed=0, render_mode=RenderMode())
        dp_after.append(env_r.domain_param)
        assert dp_after[i] != dp_before[i]
    assert dp_after[0] == dp_after[3]

    env_rn = ActNormWrapper(env)
    elb = {'x_dot': -213., 'theta_dot': -42.}
    eub = {'x_dot': 213., 'theta_dot': 42., 'x': 0.123}
    env_rn = ObsNormWrapper(env_rn, explicit_lb=elb, explicit_ub=eub)
    alb, aub = env_rn.act_space.bounds
    assert all(alb == -1)
    assert all(aub == 1)
    olb, oub = env_rn.obs_space.bounds
    assert all(olb == -1)
    assert all(oub == 1)

    ro_r = rollout(env_r, DummyPolicy(env_r.spec), eval=True, seed=0, render_mode=RenderMode())
    ro_rn = rollout(env_rn, DummyPolicy(env_rn.spec), eval=True, seed=0, render_mode=RenderMode())
    assert np.allclose(env_rn._process_obs(ro_r.observations), ro_rn.observations)

    env_rnp = ObsPartialWrapper(env_rn, idcs=['x_dot', 'cos_theta'])
    ro_rnp = rollout(env_rnp, DummyPolicy(env_rnp.spec), eval=True, seed=0, render_mode=RenderMode())

    env_rnpa = GaussianActNoiseWrapper(env_rnp,
                                       noise_mean=0.5*np.ones(env_rnp.act_space.shape),
                                       noise_std=0.1*np.ones(env_rnp.act_space.shape))
    ro_rnpa = rollout(env_rnpa, DummyPolicy(env_rnpa.spec), eval=True, seed=0, render_mode=RenderMode())
    assert np.allclose(ro_rnp.actions, ro_rnpa.actions)
    assert not np.allclose(ro_rnp.observations, ro_rnpa.observations)

    env_rnpd = ActDelayWrapper(env_rnp, delay=3)
    ro_rnpd = rollout(env_rnpd, DummyPolicy(env_rnpd.spec), eval=True, seed=0, render_mode=RenderMode())
    assert np.allclose(ro_rnp.actions, ro_rnpd.actions)
    assert not np.allclose(ro_rnp.observations, ro_rnpd.observations)

    assert isinstance(inner_env(env_rnpd), QCartPoleSwingUpSim)
    assert typed_env(env_rnpd, ObsPartialWrapper) is not None
    assert isinstance(env_rnpd, ActDelayWrapper)
    env_rnpdr = remove_env(env_rnpd, ActDelayWrapper)
    assert not isinstance(env_rnpdr, ActDelayWrapper)
Code example #5
def test_pddr(ex_dir, env: SimEnv, policy, algo_hparam):
    pyrado.set_seed(0)

    # Create algorithm and train
    teacher_policy = deepcopy(policy)
    critic = GAE(
        vfcn=FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                       hidden_sizes=[16, 16],
                       hidden_nonlin=to.tanh))
    teacher_algo_hparam = dict(critic=critic, min_steps=1500, max_iter=2)
    teacher_algo = PPO

    # Wrapper
    randomizer = create_default_randomizer(env)
    env = DomainRandWrapperLive(env, randomizer)

    # Subroutine
    algo_hparam = dict(
        max_iter=2,
        min_steps=env.max_steps,
        std_init=0.15,
        num_epochs=10,
        num_teachers=2,
        teacher_policy=teacher_policy,
        teacher_algo=teacher_algo,
        teacher_algo_hparam=teacher_algo_hparam,
        num_workers=1,
    )

    algo = PDDR(ex_dir, env, policy, **algo_hparam)

    algo.train()

    assert algo.curr_iter == algo.max_iter

    # Save and load
    algo.save_snapshot(meta_info=None)
    algo_loaded = Algorithm.load_snapshot(load_dir=ex_dir)
    assert isinstance(algo_loaded, Algorithm)
    policy_loaded = algo_loaded.policy

    # Check
    assert all(algo.policy.param_values == policy_loaded.param_values)

    # Load the experiment. Since we did not save any hyper-parameters, we ignore the errors when loading.
    env, policy, extra = load_experiment(ex_dir)
    assert isinstance(env, Env)
    assert isinstance(policy, Policy)
    assert isinstance(extra, dict)
Code example #6
def test_dr_wrapper_live_bob(env):
    param_init = env.domain_param
    randomizer = create_default_randomizer(env)
    wrapper = DomainRandWrapperBuffer(env, randomizer)
    # So far no randomization happened, thus the parameter should be equal
    assert env.domain_param == param_init
    assert wrapper._buffer is None

    # Fill the buffer 5 times with 10 new parameter sets each
    for _ in range(5):
        wrapper.fill_buffer(10)
        for i in range(10):
            param_old = wrapper.domain_param
            assert wrapper._ring_idx == i
            wrapper.reset()
            assert param_old != wrapper.domain_param
Code example #7
def test_spota_ppo(ex_dir, env: SimEnv, spota_hparam: dict):
    pyrado.set_seed(0)

    # Environment and domain randomization
    randomizer = create_default_randomizer(env)
    env = DomainRandWrapperBuffer(env, randomizer)

    # Policy and subroutines
    policy = FNNPolicy(env.spec, [16, 16], hidden_nonlin=to.tanh)
    vfcn = FNN(input_size=env.obs_space.flat_dim,
               output_size=1,
               hidden_sizes=[16, 16],
               hidden_nonlin=to.tanh)
    critic_hparam = dict(gamma=0.998,
                         lamda=0.95,
                         num_epoch=3,
                         batch_size=64,
                         lr=1e-3)
    critic_cand = GAE(vfcn, **critic_hparam)
    critic_refs = GAE(deepcopy(vfcn), **critic_hparam)

    subrtn_hparam_common = dict(
        # min_rollouts=0,  # will be overwritten by SPOTA
        min_steps=0,  # will be overwritten by SPOTA
        max_iter=2,
        num_epoch=3,
        eps_clip=0.1,
        batch_size=64,
        num_workers=1,
        std_init=0.5,
        lr=1e-2,
    )

    sr_cand = PPO(ex_dir, env, policy, critic_cand, **subrtn_hparam_common)
    sr_refs = PPO(ex_dir, env, deepcopy(policy), critic_refs,
                  **subrtn_hparam_common)

    # Create algorithm and train
    algo = SPOTA(ex_dir, env, sr_cand, sr_refs, **spota_hparam)
    algo.train()

    assert algo.curr_iter == algo.max_iter or algo.stopping_criterion_met()
Code example #8
def test_dr_wrapper_buffer_bob(env: SimEnv, selection: str):
    param_init = env.domain_param
    randomizer = create_default_randomizer(env)
    wrapper = DomainRandWrapperBuffer(env, randomizer, selection)
    # So far no randomization happened, thus the parameter should be equal
    assert env.domain_param == param_init
    assert wrapper._buffer is None

    # Fill the buffer 5 times with 10 new parameter sets each
    for _ in range(5):
        wrapper.fill_buffer(10)
        for i in range(10):
            param_old = wrapper.domain_param
            if selection == "cyclic":
                assert wrapper._ring_idx == i
            else:
                assert 0 <= wrapper._ring_idx < len(wrapper.buffer)
            wrapper.reset()
            if selection == "cyclic":
                assert param_old != wrapper.domain_param
Code example #9
def test_param_expl_sampler(env: SimEnv, policy: Policy):
    # Add randomizer
    pert = create_default_randomizer(env)
    env = DomainRandWrapperLive(env, pert)

    # Create the sampler
    num_rollouts_per_param = 12
    sampler = ParameterExplorationSampler(
        env,
        policy,
        num_workers=1,
        num_rollouts_per_param=num_rollouts_per_param)

    # Use some random parameters
    num_ps = 12
    params = to.rand(num_ps, policy.num_param)

    # Do the sampling
    samples = sampler.sample(params)

    assert num_ps == len(samples)
    for ps in samples:
        assert len(ps.rollouts) == num_rollouts_per_param

    # Compare rollouts that should be matching
    for ri in range(num_rollouts_per_param):
        # Use the first paramset as pivot
        piter = iter(samples)
        pivot = next(piter).rollouts[ri]
        # Iterate through others
        for ops in piter:
            ro = ops.rollouts[ri]

            # Compare domain params
            assert pivot.rollout_info['domain_param'] == ro.rollout_info['domain_param']
            # Compare first observation a.k.a. init state
            assert pivot[0].observation == pytest.approx(ro[0].observation)
Code example #10
    env = QBallBalancerSim(**env_hparams)
    env = GaussianObsNoiseWrapper(
        env,
        noise_std=[
            1 / 180 * pi,   # [rad]
            1 / 180 * pi,   # [rad]
            0.005,          # [m]
            0.005,          # [m]
            10 / 180 * pi,  # [rad/s]
            10 / 180 * pi,  # [rad/s]
            0.05,           # [m/s]
            0.05,           # [m/s]
        ])
    # env = ObsPartialWrapper(env, mask=[0, 0, 0, 0, 1, 1, 0, 0])
    env = ActDelayWrapper(env)
    randomizer = create_default_randomizer(env)
    randomizer.add_domain_params(
        UniformDomainParam(name='act_delay',
                           mean=5,
                           halfspan=5,
                           clip_lo=0,
                           roundint=True))
    env = DomainRandWrapperBuffer(env, randomizer)

    # Policy
    policy_hparam = dict(feats=FeatureStack([identity_feat]))
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Initialize with Quanser's PD gains
    init_policy_param_values = to.tensor([
        -14., 0, -14 * 3.45, 0, 0, 0, -14 * 2.11, 0, 0, -14., 0, -14 * 3.45, 0,
Code example #11
File: pddr.py  Project: fdamken/SimuRLacra
    def __init__(
        self,
        save_dir: str,
        env: Env,
        policy: Policy,
        lr: float = 5e-4,
        std_init: float = 0.15,
        min_steps: int = 1500,
        num_epochs: int = 10,
        max_iter: int = 500,
        num_teachers: int = 8,
        teacher_extra: Optional[dict] = None,
        teacher_policy: Optional[Policy] = None,
        teacher_algo: Optional[callable] = None,
        teacher_algo_hparam: Optional[dict] = None,
        randomizer: Optional[DomainRandomizer] = None,
        logger: Optional[StepLogger] = None,
        num_workers: int = 4,
    ):
        """
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param lr: (initial) learning rate for the optimizer, which can be modified by the scheduler;
                   by default, the learning rate is constant
        :param std_init: initial standard deviation on the actions for the exploration noise
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_epochs: number of epochs (how often we iterate over the same batch)
        :param max_iter: number of iterations (policy updates)
        :param num_teachers: number of teachers that are used for distillation
        :param teacher_extra: extra dict from PDDRTeachers algo. If provided, teachers are loaded from there
        :param teacher_policy: policy that is trained for the teachers (a copy is made per teacher)
        :param teacher_algo: algorithm class to be used for training the teachers
        :param teacher_algo_hparam: hyper-params to be used for teacher_algo
        :param randomizer: randomizer for sampling the teacher domain parameters; if `None`, the environment's default
                           one is used
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        :param num_workers: number of environments for parallel sampling
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not isinstance(policy, Policy):
            raise pyrado.TypeErr(given=policy, expected_type=Policy)

        # Call Algorithm's constructor.
        super().__init__(
            num_checkpoints=1, init_checkpoint=-1, save_dir=save_dir, max_iter=max_iter, policy=policy, logger=logger
        )

        # Store the inputs
        self.env_real = env
        self.min_steps = min_steps
        self.num_epochs = num_epochs
        self.num_teachers = num_teachers
        self.num_workers = num_workers

        self.teacher_policies = []
        self.teacher_envs = []
        self.teacher_expl_strats = []
        self.teacher_critics = []
        self.teacher_ex_dirs = []

        # Teachers
        if teacher_policy is not None and teacher_algo is not None and teacher_algo_hparam is not None:
            if not isinstance(teacher_policy, Policy):
                raise pyrado.TypeErr(given=teacher_policy, expected_type=Policy)
            if not issubclass(teacher_algo, Algorithm):
                raise pyrado.TypeErr(given=teacher_algo, expected_type=Algorithm)

            if randomizer is None:
                self.randomizer = create_default_randomizer(env)
            else:
                assert isinstance(randomizer, DomainRandomizer)
                self.randomizer = randomizer

            self.set_random_envs()

            # Prepare folders
            self.teacher_ex_dirs = [os.path.join(self.save_dir, f"teachers_{idx}") for idx in range(self.num_teachers)]
            for idx in range(self.num_teachers):
                os.makedirs(self.teacher_ex_dirs[idx], exist_ok=True)

            # Create teacher algos
            self.algos = [
                teacher_algo(
                    save_dir=self.teacher_ex_dirs[idx],
                    env=self.teacher_envs[idx],
                    policy=deepcopy(teacher_policy),
                    logger=None,
                    **deepcopy(teacher_algo_hparam),
                )
                for idx in range(self.num_teachers)
            ]
        elif teacher_extra is not None:
            self.unpack_teachers(teacher_extra)
            assert self.num_teachers == len(self.teacher_policies)
            self.reset_checkpoint()
        else:
            self.load_teachers()
            if self.num_teachers < len(self.teacher_policies):
                print(
                    f"You have loaded {len(self.teacher_policies)} teachers. Only the first {self.num_teachers} will be used!"
                )
                self.prune_teachers()
            assert self.num_teachers == len(self.teacher_policies)
            self.reset_checkpoint()

        # Student
        self._expl_strat = NormalActNoiseExplStrat(self._policy, std_init=std_init)
        self.optimizer = to.optim.Adam([{"params": self.policy.parameters()}], lr=lr)

        # Environments
        self.samplers = [
            ParallelRolloutSampler(
                self.teacher_envs[t],
                deepcopy(self._expl_strat),
                num_workers=self.num_workers,
                min_steps=self.min_steps,
            )
            for t in range(self.num_teachers)
        ]

        self.teacher_weights = np.ones(self.num_teachers)

        # Distillation loss criterion
        self.criterion = to.nn.KLDivLoss(log_target=True, reduction="batchmean")
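
The constructor above needs either pre-trained teachers (via teacher_extra or snapshots found in save_dir) or everything required to train them from scratch (teacher_policy, teacher_algo, teacher_algo_hparam). Below is a minimal sketch of the from-scratch path, mirroring the call in code example #5 (test_pddr); the import paths, the choice of BallOnBeamSim, and the hyper-parameter values are assumptions for illustration, not a verbatim excerpt from the project.

from copy import deepcopy

import torch as to

# NOTE: the module paths below are assumptions about the SimuRLacra/Pyrado layout
from pyrado.algorithms.meta.pddr import PDDR
from pyrado.algorithms.step_based.gae import GAE
from pyrado.algorithms.step_based.ppo import PPO
from pyrado.environments.pysim.ball_on_beam import BallOnBeamSim
from pyrado.policies.feed_back.fnn import FNNPolicy
from pyrado.spaces import ValueFunctionSpace
from pyrado.utils.data_types import EnvSpec

# Environment and student policy (illustrative values)
env = BallOnBeamSim(dt=1 / 100.0, max_steps=500)
policy = FNNPolicy(env.spec, hidden_sizes=[16, 16], hidden_nonlin=to.tanh)

# Hyper-parameters for training the teachers, as in code example #5
vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
teacher_algo_hparam = dict(critic=GAE(vfcn=vfcn), min_steps=1500, max_iter=2)

algo = PDDR(
    save_dir="experiments/pddr_demo",  # hypothetical output directory
    env=env,
    policy=policy,
    max_iter=2,
    num_teachers=2,
    teacher_policy=deepcopy(policy),
    teacher_algo=PPO,
    teacher_algo_hparam=teacher_algo_hparam,
    num_workers=1,
)
algo.train()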
Code example #12
def test_combination(env: SimEnv):
    pyrado.set_seed(0)
    env.max_steps = 20

    randomizer = create_default_randomizer(env)
    env_r = DomainRandWrapperBuffer(env, randomizer)
    env_r.fill_buffer(num_domains=3)

    dp_before = []
    dp_after = []
    for i in range(4):
        dp_before.append(env_r.domain_param)
        rollout(env_r,
                DummyPolicy(env_r.spec),
                eval=True,
                seed=0,
                render_mode=RenderMode())
        dp_after.append(env_r.domain_param)
        assert dp_after[i] != dp_before[i]
    assert dp_after[0] == dp_after[3]

    env_rn = ActNormWrapper(env)
    elb = {"x_dot": -213.0, "theta_dot": -42.0}
    eub = {"x_dot": 213.0, "theta_dot": 42.0, "x": 0.123}
    env_rn = ObsNormWrapper(env_rn, explicit_lb=elb, explicit_ub=eub)
    alb, aub = env_rn.act_space.bounds
    assert all(alb == -1)
    assert all(aub == 1)
    olb, oub = env_rn.obs_space.bounds
    assert all(olb == -1)
    assert all(oub == 1)

    ro_r = rollout(env_r,
                   DummyPolicy(env_r.spec),
                   eval=True,
                   seed=0,
                   render_mode=RenderMode())
    ro_rn = rollout(env_rn,
                    DummyPolicy(env_rn.spec),
                    eval=True,
                    seed=0,
                    render_mode=RenderMode())
    assert np.allclose(env_rn._process_obs(ro_r.observations),
                       ro_rn.observations)

    env_rnp = ObsPartialWrapper(
        env_rn, idcs=[env.obs_space.labels[2], env.obs_space.labels[3]])
    ro_rnp = rollout(env_rnp,
                     DummyPolicy(env_rnp.spec),
                     eval=True,
                     seed=0,
                     render_mode=RenderMode())

    env_rnpa = GaussianActNoiseWrapper(
        env_rnp,
        noise_mean=0.5 * np.ones(env_rnp.act_space.shape),
        noise_std=0.1 * np.ones(env_rnp.act_space.shape))
    ro_rnpa = rollout(env_rnpa,
                      DummyPolicy(env_rnpa.spec),
                      eval=True,
                      seed=0,
                      render_mode=RenderMode())
    assert not np.allclose(ro_rnp.observations, ro_rnpa.observations)  # the action noise changed the rollout

    env_rnpd = ActDelayWrapper(env_rnp, delay=3)
    ro_rnpd = rollout(env_rnpd,
                      DummyPolicy(env_rnpd.spec),
                      eval=True,
                      seed=0,
                      render_mode=RenderMode())
    assert np.allclose(ro_rnp.actions, ro_rnpd.actions)
    assert not np.allclose(ro_rnp.observations, ro_rnpd.observations)

    assert type(inner_env(env_rnpd)) == type(env)
    assert typed_env(env_rnpd, ObsPartialWrapper) is not None
    assert isinstance(env_rnpd, ActDelayWrapper)
    env_rnpdr = remove_env(env_rnpd, ActDelayWrapper)
    assert not isinstance(env_rnpdr, ActDelayWrapper)