Example no. 1
0
def test_domain_param_transforms(env: SimEnv, trafo_class: Type):
    """Check that a domain parameter transformation wrapper round-trips a randomly chosen parameter value."""
    pyrado.set_seed(0)

    # Randomly pick one domain parameter to change and a neighboring one to keep fixed
    dp_names = list(env.supported_domain_param)
    num_dp = len(dp_names)
    offset = 1
    idx = random.randint(0, num_dp - 1)
    sel_dp_change = dp_names[idx]
    sel_dp_fix = dp_names[(idx + offset) % num_dp]
    # Re-draw at least once (offset starts at 1) and keep drawing until neither
    # selection matches a Vortex-only domain parameter
    while (
        offset == 1
        or any(item in sel_dp_change for item in VORTEX_ONLY_DOMAIN_PARAM_LIST)
        or any(item in sel_dp_fix for item in VORTEX_ONLY_DOMAIN_PARAM_LIST)
    ):
        idx = random.randint(0, num_dp - 1)
        sel_dp_change = dp_names[idx]
        sel_dp_fix = dp_names[(idx + offset) % num_dp]
        offset += 1

    mask = (sel_dp_change, )
    wenv = trafo_class(env, mask)
    assert isinstance(wenv, DomainParamTransform)

    # Check 5 random values
    for _ in range(5):
        # Change the selected domain parameter
        nominal_val = env.get_nominal_domain_param()[sel_dp_change]
        new_dp_val = random.random() * nominal_val
        new_dp_val = abs(new_dp_val) + 1e-6  # due to the domain of the new params
        transformed_new_dp_val = wenv.forward(new_dp_val)
        wenv.domain_param = {sel_dp_change: transformed_new_dp_val}  # calls inverse transform
        if not isinstance(inner_env(wenv), SimPyEnv):
            # The RcsPySim and MujocoSim classes need to be reset to apply the new domain param
            wenv.reset()

        # Test the actual domain param and the getters
        assert inner_env(wenv)._domain_param[sel_dp_change] == pytest.approx(new_dp_val, abs=1e-5)
        assert wenv.domain_param[sel_dp_change] == pytest.approx(new_dp_val, abs=1e-5)
        assert wenv.domain_param[sel_dp_fix] != pytest.approx(new_dp_val)
Example no. 2
0
def test_npdr_and_bayessim(
    ex_dir,
    algo_name: str,
    env: SimEnv,
    num_segments: int,
    len_segments: int,
    num_real_rollouts: int,
    num_sbi_rounds: int,
    use_rec_act: bool,
):
    """Run one iteration of NPDR or BayesSim on a shortened environment and check the iteration counter."""
    pyrado.set_seed(0)

    # Create a fake ground truth target domain by perturbing two nominal parameters
    env_real = deepcopy(env)
    nom = env.get_nominal_domain_param()
    env_real.domain_param = {
        "mass_pend_pole": nom["mass_pend_pole"] * 1.2,
        "length_pend_pole": nom["length_pend_pole"] * 0.8,
    }

    # Reduce the number of steps to make this test run faster
    env.max_steps = 40
    env_real.max_steps = 40

    # Policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Define a mapping: index - domain parameter
    dp_mapping = {1: "mass_pend_pole", 2: "length_pend_pole"}

    # Uniform prior spanning +/- 50% around the nominal values
    prior = sbiutils.BoxUniform(
        low=to.tensor([nom["mass_pend_pole"] * 0.5, nom["length_pend_pole"] * 0.5]),
        high=to.tensor([nom["mass_pend_pole"] * 1.5, nom["length_pend_pole"] * 1.5]),
    )

    # Time series embedding
    embedding = BayesSimEmbedding(
        env.spec,
        RolloutSamplerForSBI.get_dim_data(env.spec),
        downsampling_factor=3,
    )

    # Posterior (normalizing flow)
    posterior_hparam = {
        "model": "maf",
        "embedding_net": nn.Identity(),
        "hidden_features": 20,
        "num_transforms": 3,
    }

    # Policy optimization subroutine
    subrtn_policy = HCNormal(
        ex_dir,
        env,
        policy,
        max_iter=1,
        pop_size=2,
        num_init_states_per_domain=1,
        num_domains=2,
        expl_std_init=0.1,
        expl_factor=1.1,
        num_workers=1,
    )

    # Hyper-parameters shared by both algorithms
    algo_hparam = {
        "max_iter": 1,
        "num_sim_per_round": 20,
        "num_real_rollouts": num_real_rollouts,
        "num_sbi_rounds": num_sbi_rounds,
        "simulation_batch_size": 1,
        "normalize_posterior": False,
        "num_eval_samples": 2,
        "num_segments": num_segments,
        "len_segments": len_segments,
        "use_rec_act": use_rec_act,
        "stop_on_done": True,
        "subrtn_sbi_training_hparam": {"max_num_epochs": 1},  # only train for 1 iteration
        "num_workers": 1,
    }

    skip = False
    if algo_name == NPDR.name:
        algo = NPDR(
            save_dir=ex_dir,
            env_sim=env,
            env_real=env_real,
            policy=policy,
            dp_mapping=dp_mapping,
            prior=prior,
            embedding=embedding,
            subrtn_sbi_class=SNPE_C,
            posterior_hparam=posterior_hparam,
            subrtn_policy=subrtn_policy,
            **algo_hparam,
        )
    elif algo_name == BayesSim.name:
        # We are not checking multi-round SNPE-A since it has known issues
        skip = algo_hparam["num_sbi_rounds"] > 1
        algo = BayesSim(
            save_dir=ex_dir,
            env_sim=env,
            env_real=env_real,
            policy=policy,
            dp_mapping=dp_mapping,
            embedding=embedding,
            prior=prior,
            subrtn_policy=subrtn_policy,
            **algo_hparam,
        )
    else:
        raise NotImplementedError

    if skip:
        return

    algo.train()
    # Just checking the interface here
    assert algo.curr_iter == algo.max_iter