Example 1: builds a perturbed "real" QQubeSwingUpSim, a simulation environment with live domain randomization behind a MetaDomainRandWrapper, a deterministic behavioral policy, and a DomainDistrParamPolicy over the randomizer's distribution parameters.
def create_qqsu_setup():
    # Environments
    env_hparams = dict(dt=1 / 100.0, max_steps=600)
    env_real = QQubeSwingUpSim(**env_hparams)
    env_real.domain_param = dict(
        mass_rot_pole=0.095 * 0.9,  # 0.095*0.9 = 0.0855
        mass_pend_pole=0.024 * 1.1,  # 0.024*1.1 = 0.0264
        length_rot_pole=0.085 * 0.9,  # 0.085*0.9 = 0.0765
        length_pend_pole=0.129 * 1.1,  # 0.129*1.1 = 0.1419
    )

    env_sim = QQubeSwingUpSim(**env_hparams)
    randomizer = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole",
                          mean=0.0,
                          std=1e-9,
                          clip_lo=1e-3),
        NormalDomainParam(name="mass_pend_pole",
                          mean=0.0,
                          std=1e-9,
                          clip_lo=1e-3),
        NormalDomainParam(name="length_rot_pole",
                          mean=0.0,
                          std=1e-9,
                          clip_lo=1e-3),
        NormalDomainParam(name="length_pend_pole",
                          mean=0.0,
                          std=1e-9,
                          clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ("mass_rot_pole", "mean"),
        1: ("mass_rot_pole", "std"),
        2: ("mass_pend_pole", "mean"),
        3: ("mass_pend_pole", "std"),
        4: ("length_rot_pole", "mean"),
        5: ("length_rot_pole", "std"),
        6: ("length_pend_pole", "mean"),
        7: ("length_pend_pole", "std"),
    }
    # trafo_mask = [False, True, False, True, False, True, False, True]
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Policies (the behavioral policy needs to be deterministic)
    behavior_policy = QQubeSwingUpAndBalanceCtrl(env_sim.spec)
    prior = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole", mean=0.095, std=0.095 / 10),
        NormalDomainParam(name="mass_pend_pole", mean=0.024, std=0.024 / 10),
        NormalDomainParam(name="length_rot_pole", mean=0.085, std=0.085 / 10),
        NormalDomainParam(name="length_pend_pole", mean=0.129, std=0.129 / 10),
    )
    ddp_policy = DomainDistrParamPolicy(mapping=dp_map,
                                        trafo_mask=trafo_mask,
                                        prior=prior,
                                        scale_params=False)

    return env_sim, env_real, env_hparams, dp_map, behavior_policy, ddp_policy
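A minimal usage sketch (not part of the original example): unpack the returned tuple and print how each latent dimension of the DomainDistrParamPolicy maps to a distribution parameter of the randomizer.

env_sim, env_real, env_hparams, dp_map, behavior_policy, ddp_policy = create_qqsu_setup()
for idx, (dp_name, distr_param) in dp_map.items():
    # e.g. dim 0 -> mass_rot_pole.mean, dim 1 -> mass_rot_pole.std, ...
    print(f"dim {idx} -> {dp_name}.{distr_param}")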
Example 2: rolls out the swing-up controller at 500 Hz, at 100 Hz, and at 500 Hz through a DownsamplingWrapper, then plots the actions over time in seconds.
def create_qq_setup(factor, dt, max_steps, render_mode):
    # Set up environment
    init_state = np.array([0.1, 0.0, 0.0, 0.0])
    env = QQubeSwingUpSim(dt=dt, max_steps=max_steps)
    env = ActNormWrapper(env)

    # Set up policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Simulate
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=render_mode,
        max_steps=max_steps,
    )
    act_500Hz = ro.actions

    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt * factor),
                          init_state=init_state),
        render_mode=render_mode,
        max_steps=int(max_steps / factor),
    )
    act_100Hz = ro.actions

    env = DownsamplingWrapper(env, factor)
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=render_mode,
        max_steps=max_steps,
    )
    act_500Hz_w = ro.actions

    # Time in seconds (the 100 Hz rollout advances by dt * factor per step)
    time_500Hz = np.linspace(0, len(act_500Hz) * dt, len(act_500Hz))
    time_100Hz = np.linspace(0, len(act_100Hz) * dt * factor, len(act_100Hz))
    time_500Hz_w = np.linspace(0, len(act_500Hz_w) * dt, len(act_500Hz_w))

    # Plot
    _, ax = plt.subplots(nrows=1)
    ax.plot(time_500Hz, act_500Hz, label="500 Hz (original)")
    ax.plot(time_100Hz, act_100Hz, label="100 Hz", ls="--")
    ax.plot(time_500Hz_w, act_500Hz_w, label="500 Hz (wrapped)", ls="--")
    ax.legend()
    ax.set_ylabel(env.act_space.labels)
    ax.set_xlabel("time [s]")
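A possible invocation (hypothetical; the 500 Hz / 100 Hz legend labels suggest dt = 1/500 s and factor = 5, while the step count below is purely illustrative):

import matplotlib.pyplot as plt

from pyrado.utils.data_types import RenderMode

create_qq_setup(factor=5, dt=1 / 500.0, max_steps=2500, render_mode=RenderMode(video=False))
plt.show()  # the function builds the figure but does not display it itself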
Example 3: test that runs BayRn with a PoWER subroutine on the Quanser Qube swing-up task, with the GP search space defined over the pendulum and rotary pole masses.
def test_bayrn_power(ex_dir, env: SimEnv, bayrn_hparam: dict):
    pyrado.set_seed(0)

    # Environments and domain randomization
    env_real = deepcopy(env)
    env_sim = DomainRandWrapperLive(env, create_zero_var_randomizer(env))
    dp_map = create_default_domain_param_map_qq()
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)
    env_real.domain_param = dict(mass_pend_pole=0.024 * 1.1,
                                 mass_rot_pole=0.095 * 1.1)
    env_real = wrap_like_other_env(env_real, env_sim)

    # Policy and subroutine
    policy_hparam = dict(energy_gain=0.587, ref_energy=0.827)
    policy = QQubeSwingUpAndBalanceCtrl(env_sim.spec, **policy_hparam)
    subrtn_hparam = dict(
        max_iter=1,
        pop_size=8,
        num_init_states_per_domain=1,
        num_is_samples=4,
        expl_std_init=0.1,
        num_workers=1,
    )
    subrtn = PoWER(ex_dir, env_sim, policy, **subrtn_hparam)

    # Set the boundaries for the GP
    dp_nom = inner_env(env_sim).get_nominal_domain_param()
    ddp_space = BoxSpace(
        bound_lo=np.array([
            0.8 * dp_nom["mass_pend_pole"], 1e-8,
            0.8 * dp_nom["mass_rot_pole"], 1e-8
        ]),
        bound_up=np.array([
            1.2 * dp_nom["mass_pend_pole"], 1e-7,
            1.2 * dp_nom["mass_rot_pole"], 1e-7
        ]),
    )

    # Create algorithm and train
    algo = BayRn(ex_dir,
                 env_sim,
                 env_real,
                 subrtn,
                 ddp_space,
                 **bayrn_hparam,
                 num_workers=1)
    algo.train()

    assert algo.curr_iter == algo.max_iter or algo.stopping_criterion_met()
Example 4: same comparison as Example 2, but the 100 Hz actions are stretched by zero-order hold and the plot is over time steps instead of seconds.
def create_qq_setup(factor, dt, max_steps):
    # Set up environment
    init_state = np.array([0.1, 0.0, 0.0, 0.0])
    env = QQubeSwingUpSim(dt=dt, max_steps=max_steps)
    env = ActNormWrapper(env)

    # Set up policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Simulate
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=max_steps,
    )
    act_500Hz = ro.actions

    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt * factor),
                          init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=int(max_steps / factor),
    )
    act_100Hz = ro.actions
    act_100Hz_zoh = np.repeat(act_100Hz, factor, axis=0)  # zero-order hold onto the original time grid

    env = DownsamplingWrapper(env, factor)
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=max_steps,
    )
    act_500Hz_wrapped = ro.actions

    # Plot
    _, ax = plt.subplots(nrows=1)
    ax.plot(act_500Hz, label="500 Hz (original)")
    ax.plot(act_100Hz_zoh, label="100 Hz (zoh)")
    ax.plot(act_500Hz_wrapped, label="500 Hz (wrapped)")
    ax.legend()
    ax.set_ylabel(env.act_space.labels)
    ax.set_xlabel("time steps")
    plt.show()
Example 5: test that selects the environment-specific controller (Quanser ball balancer, cart-pole, or Qube) and checks that it returns torch tensors for uniformly sampled observations.
def test_env_specific(env: Env):
    pyrado.set_seed(0)

    if "qbb" in env.name:
        policy = QBallBalancerPDCtrl(env.spec)
        policy.reset()
    elif "qcp" in env.name:
        policy = QCartPoleSwingUpAndBalanceCtrl(env.spec)
        policy.reset()
    elif "qq" in env.name:
        policy = QQubeSwingUpAndBalanceCtrl(env.spec)
        policy.reset()
    else:
        raise NotImplementedError

    # Sample an observation and do an action 10 times
    for _ in range(10):
        obs = env.obs_space.sample_uniform()
        obs = to.from_numpy(obs).to(dtype=to.get_default_dtype())
        act = policy(obs)
        assert isinstance(act, to.Tensor)
Example 6: script excerpt that adds Gaussian observation noise on theta_dot, rolls out the swing-up controller, and smooths the angular velocities with 1-D Gaussian filters.
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d

from pyrado.environment_wrappers.observation_noise import GaussianObsNoiseWrapper  # import path assumed
from pyrado.environments.pysim.quanser_qube import QQubeSwingUpSim
from pyrado.policies.special.environment_specific import QQubeSwingUpAndBalanceCtrl
from pyrado.sampling.rollout import rollout
from pyrado.utils.data_types import RenderMode

if __name__ == '__main__':
    plt.rc('text', usetex=True)

    # Set up environment
    env = QQubeSwingUpSim(dt=1 / 500., max_steps=3500)
    env = GaussianObsNoiseWrapper(
        env,
        noise_std=[0.0, 0.0, 0.0, 0.0, 2.0, 0.0],  # only noise on theta_dot [rad/s]
    )

    # Set up policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Simulate
    ro = rollout(env,
                 policy,
                 render_mode=RenderMode(text=False, video=False),
                 eval=True)

    # Filter the observations of the last rollout
    theta_dot = ro.observations[:, 4]
    alpha_dot = ro.observations[:, 5]
    theta_dot_filt_3 = gaussian_filter1d(theta_dot, 3)
    theta_dot_filt_5 = gaussian_filter1d(theta_dot, 5)
    alpha_dot_filt_3 = gaussian_filter1d(alpha_dot, 3)
    alpha_dot_filt_5 = gaussian_filter1d(alpha_dot, 5)
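The excerpt stops after filtering; a hypothetical continuation that plots the raw and smoothed velocities (variable names from above, labels assumed):

_, axs = plt.subplots(nrows=2, figsize=(8, 6))
axs[0].plot(theta_dot, label=r"$\dot{\theta}$ raw")
axs[0].plot(theta_dot_filt_3, label=r"$\dot{\theta}$ filtered ($\sigma=3$)")
axs[0].plot(theta_dot_filt_5, label=r"$\dot{\theta}$ filtered ($\sigma=5$)")
axs[1].plot(alpha_dot, label=r"$\dot{\alpha}$ raw")
axs[1].plot(alpha_dot_filt_3, label=r"$\dot{\alpha}$ filtered ($\sigma=3$)")
axs[1].plot(alpha_dot_filt_5, label=r"$\dot{\alpha}$ filtered ($\sigma=5$)")
for ax in axs:
    ax.set_xlabel("time steps")
    ax.set_ylabel("angular velocity [rad/s]")
    ax.legend()
plt.show()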
Example 7: fragment of a system-identification script that perturbs the nominal domain parameters of the "real" environment and defines the index-to-domain-parameter mapping.
        length_pend_pole=dp_nom["length_pend_pole"] * 0.8,
        length_rot_pole=dp_nom["length_rot_pole"] * 0.9,
        gravity_const=dp_nom["gravity_const"] * 1.0,
    )
    # randomizer = DomainRandomizer(
    #     NormalDomainParam(name="damping_rot_pole", mean=dp_nom["damping_rot_pole"] * 2.0, std=dp_nom["motor_back_emf"] / 10, clip_lo=0.0),
    #     NormalDomainParam(name="damping_pend_pole", mean=dp_nom["damping_pend_pole"] * 2.0, std=dp_nom["motor_back_emf"] / 10, clip_lo=0.0),
    #     NormalDomainParam(name="motor_resistance", mean=dp_nom["motor_resistance"] * 1.1, std=dp_nom["motor_back_emf"] / 50, clip_lo=0.0),
    #     NormalDomainParam(name="Km", mean=dp_nom["motor_back_emf"] * 0.9, std=dp_nom["motor_back_emf"] / 50, clip_lo=0.0),
    # )
    # env_real = DomainRandWrapperBuffer(env_real, randomizer)
    # env_real.fill_buffer(num_real_rollouts)

    # Behavioral policy
    policy_hparam = dict(energy_gain=0.587, ref_energy=0.827)
    policy = QQubeSwingUpAndBalanceCtrl(env_sim.spec, **policy_hparam)

    # Define a mapping: index - domain parameter
    # dp_mapping = {0: "act_delay"}
    # dp_mapping = {0: "mass_rot_pole", 1: "mass_pend_pole", 2: "length_rot_pole", 3: "length_pend_pole"}
    dp_mapping = {
        0: "damping_rot_pole",
        1: "damping_pend_pole",
        2: "motor_resistance",
        3: "motor_back_emf",
        4: "mass_rot_pole",
        5: "mass_pend_pole",
        6: "length_rot_pole",
        7: "length_pend_pole",
        8: "gravity_const",
    }
Example 8: test for NPDR and BayesSim with a fake ground-truth target domain, a BoxUniform prior from sbi, a BayesSimEmbedding, and an HCNormal policy-optimization subroutine.
def test_npdr_and_bayessim(
    ex_dir,
    algo_name: str,
    env: SimEnv,
    num_segments: int,
    len_segments: int,
    num_real_rollouts: int,
    num_sbi_rounds: int,
    use_rec_act: bool,
):
    pyrado.set_seed(0)

    # Create a fake ground truth target domain
    env_real = deepcopy(env)
    dp_nom = env.get_nominal_domain_param()
    env_real.domain_param = dict(
        mass_pend_pole=dp_nom["mass_pend_pole"] * 1.2,
        length_pend_pole=dp_nom["length_pend_pole"] * 0.8,
    )

    # Reduce the number of steps to make this test run faster
    env.max_steps = 40
    env_real.max_steps = 40

    # Policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Define a mapping: index - domain parameter
    dp_mapping = {1: "mass_pend_pole", 2: "length_pend_pole"}

    # Prior
    prior_hparam = dict(
        low=to.tensor(
            [dp_nom["mass_pend_pole"] * 0.5,
             dp_nom["length_pend_pole"] * 0.5]),
        high=to.tensor(
            [dp_nom["mass_pend_pole"] * 1.5,
             dp_nom["length_pend_pole"] * 1.5]),
    )
    prior = sbiutils.BoxUniform(**prior_hparam)

    # Time series embedding
    embedding = BayesSimEmbedding(
        env.spec,
        RolloutSamplerForSBI.get_dim_data(env.spec),
        downsampling_factor=3,
    )

    # Posterior (normalizing flow)
    posterior_hparam = dict(model="maf",
                            embedding_net=nn.Identity(),
                            hidden_features=20,
                            num_transforms=3)

    # Policy optimization subroutine
    subrtn_policy_hparam = dict(
        max_iter=1,
        pop_size=2,
        num_init_states_per_domain=1,
        num_domains=2,
        expl_std_init=0.1,
        expl_factor=1.1,
        num_workers=1,
    )
    subrtn_policy = HCNormal(ex_dir, env, policy, **subrtn_policy_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=1,
        num_sim_per_round=20,
        num_real_rollouts=num_real_rollouts,
        num_sbi_rounds=num_sbi_rounds,
        simulation_batch_size=1,
        normalize_posterior=False,
        num_eval_samples=2,
        num_segments=num_segments,
        len_segments=len_segments,
        use_rec_act=use_rec_act,
        stop_on_done=True,
        subrtn_sbi_training_hparam=dict(max_num_epochs=1),  # stop the sbi subroutine after one training epoch
        # subrtn_sbi_sampling_hparam=dict(sample_with_mcmc=True, mcmc_parameters=dict(warmup_steps=20)),
        num_workers=1,
    )
    skip = False
    if algo_name == NPDR.name:
        algo = NPDR(
            save_dir=ex_dir,
            env_sim=env,
            env_real=env_real,
            policy=policy,
            dp_mapping=dp_mapping,
            prior=prior,
            embedding=embedding,
            subrtn_sbi_class=SNPE_C,
            posterior_hparam=posterior_hparam,
            subrtn_policy=subrtn_policy,
            **algo_hparam,
        )
    elif algo_name == BayesSim.name:
        # We are not checking multi-round SNPE-A since it has known issues
        if algo_hparam["num_sbi_rounds"] > 1:
            skip = True
        algo = BayesSim(
            save_dir=ex_dir,
            env_sim=env,
            env_real=env_real,
            policy=policy,
            dp_mapping=dp_mapping,
            embedding=embedding,
            prior=prior,
            subrtn_policy=subrtn_policy,
            **algo_hparam,
        )
    else:
        raise NotImplementedError

    if not skip:
        algo.train()
        # Just checking the interface here
        assert algo.curr_iter == algo.max_iter
Example 9: test that constructs the available time-series embeddings and checks the output shape produced by SimRolloutSamplerForSBI.
def test_sbi_embedding(
    ex_dir,
    env: SimEnv,
    embedding_name: str,
    num_segments: int,
    len_segments: int,
    stop_on_done: bool,
    state_mask_labels: Union[None, List[str]],
    act_mask_labels: Union[None, List[str]],
):
    pyrado.set_seed(0)

    # Reduce the number of steps to make this test run faster
    env.max_steps = 80

    # Policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Define a mapping: index - domain parameter
    dp_mapping = {1: "mass_pend_pole", 2: "length_pend_pole"}

    # Time series embedding
    if embedding_name == LastStepEmbedding.name:
        embedding = LastStepEmbedding(
            env.spec,
            RolloutSamplerForSBI.get_dim_data(env.spec),
            state_mask_labels=state_mask_labels,
            act_mask_labels=act_mask_labels,
        )
    elif embedding_name == AllStepsEmbedding.name:
        embedding = AllStepsEmbedding(
            env.spec,
            RolloutSamplerForSBI.get_dim_data(env.spec),
            env.max_steps,
            downsampling_factor=3,
            state_mask_labels=state_mask_labels,
            act_mask_labels=act_mask_labels,
        )
    elif embedding_name == DeltaStepsEmbedding.name:
        embedding = DeltaStepsEmbedding(
            env.spec,
            RolloutSamplerForSBI.get_dim_data(env.spec),
            env.max_steps,
            downsampling_factor=3,
            state_mask_labels=state_mask_labels,
            act_mask_labels=act_mask_labels,
        )
    elif embedding_name == BayesSimEmbedding.name:
        embedding = BayesSimEmbedding(
            env.spec,
            RolloutSamplerForSBI.get_dim_data(env.spec),
            downsampling_factor=3,
            state_mask_labels=state_mask_labels,
            act_mask_labels=act_mask_labels,
        )
    elif embedding_name == DynamicTimeWarpingEmbedding.name:
        embedding = DynamicTimeWarpingEmbedding(
            env.spec,
            RolloutSamplerForSBI.get_dim_data(env.spec),
            downsampling_factor=3,
            state_mask_labels=state_mask_labels,
            act_mask_labels=act_mask_labels,
        )
    elif embedding_name == RNNEmbedding.name:
        embedding = RNNEmbedding(
            env.spec,
            RolloutSamplerForSBI.get_dim_data(env.spec),
            hidden_size=10,
            num_recurrent_layers=1,
            output_size=1,
            len_rollouts=env.max_steps,
            downsampling_factor=1,
            state_mask_labels=state_mask_labels,
            act_mask_labels=act_mask_labels,
        )
    else:
        raise NotImplementedError

    sampler = SimRolloutSamplerForSBI(
        env,
        policy,
        dp_mapping,
        embedding,
        num_segments,
        len_segments,
        stop_on_done,
        rollouts_real=None,
        use_rec_act=False,
    )

    # Test with 7 domain parameter sets
    data_sim = sampler(to.abs(to.randn(7, 2)))
    assert data_sim.shape == (7, embedding.dim_output)
Example 10: fragment of a script that pairs the QQubeSwingUpSim with pre-recorded real-world rollouts and defines the domain-parameter mapping for sbi-based inference.
        t_end = 10  # s

    # Set seed if desired
    pyrado.set_seed(args.seed, verbose=True)

    # Environments
    env_sim_hparams = dict(dt=1 / 250.0, max_steps=int(t_end * 250))
    env_sim = QQubeSwingUpSim(**env_sim_hparams)
    # env_sim = ActDelayWrapper(env_sim)

    # Create the ground truth target domain and the behavioral policy
    if ectl:
        env_real = osp.join(pyrado.EVAL_DIR,
                            f"qq-su_ectrl_250Hz_{t_end}s")  # 5s long
        policy = QQubeSwingUpAndBalanceCtrl(
            env_sim.spec
        )  # replaced by the recorded actions if use_rec_act=True
    else:
        env_real = osp.join(pyrado.EVAL_DIR,
                            f"qq_chrip_10to0Hz_+1.5V_250Hz_{t_end}s")
        assert use_rec_act
        policy = DummyPolicy(env_sim.spec)  # replaced by recorded real actions

    # Define a mapping: index - domain parameter
    dp_mapping = {
        0: "damping_rot_pole",
        1: "damping_pend_pole",
        2: "motor_resistance",
        3: "motor_back_emf",
        4: "mass_rot_pole",
        5: "mass_pend_pole",