Example #1
def test_wrap_like_other_env(env: SimEnv):
    wenv_like = deepcopy(env)
    wenv_like.dt /= 3

    wenv = DownsamplingWrapper(env, factor=3)
    assert type(wenv_like) != type(wenv)
    wenv_like = wrap_like_other_env(wenv_like, wenv, use_downsampling=True)
    assert type(wenv_like) == type(wenv)

    wenv = ActNormWrapper(wenv)
    assert type(wenv_like) != type(wenv)
    wenv_like = wrap_like_other_env(wenv_like, wenv)
    assert type(wenv_like) == type(wenv)

    wenv = ObsNormWrapper(wenv)
    assert type(wenv_like) != type(wenv)
    wenv_like = wrap_like_other_env(wenv_like, wenv)
    assert type(wenv_like) == type(wenv)
    assert type(wenv_like.wrapped_env) == type(wenv.wrapped_env)

    wenv = ObsRunningNormWrapper(wenv)
    wenv_like = wrap_like_other_env(wenv_like, wenv)
    assert type(wenv_like) == type(wenv)
    assert type(wenv_like.wrapped_env) == type(wenv.wrapped_env)

    wenv = ObsPartialWrapper(wenv, idcs=["x"])
    wenv_like = wrap_like_other_env(wenv_like, wenv)
    assert type(wenv_like) == type(wenv)
    assert type(wenv_like.wrapped_env) == type(wenv.wrapped_env)
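
The test above relies on `wrap_like_other_env` to reproduce a reference environment's wrapper chain on a second environment. The following self-contained toy sketch illustrates that pattern under the assumption that each wrapper exposes its inner environment via `wrapped_env`; all names here (`ToyEnv`, `ToyWrapper`, `toy_wrap_like_other_env`, ...) are hypothetical and do not reflect Pyrado's actual implementation.

# Hypothetical toy illustration of re-applying a wrapper chain; not Pyrado code
class ToyEnv:
    """Stand-in for a bare (inner) environment."""


class ToyWrapper:
    """Stand-in for an environment wrapper that keeps a reference to the wrapped env."""

    def __init__(self, wrapped_env):
        self.wrapped_env = wrapped_env


class ToyNormWrapper(ToyWrapper):
    pass


class ToyPartialWrapper(ToyWrapper):
    pass


def toy_wrap_like_other_env(env_targ, env_ref):
    """Re-apply the wrapper types of env_ref onto env_targ, preserving their order."""
    # Collect the wrapper chain of the reference env from outermost to innermost
    chain = []
    env = env_ref
    while isinstance(env, ToyWrapper):
        chain.append(type(env))
        env = env.wrapped_env
    # Apply the chain to the target env from innermost to outermost
    for wrapper_cls in reversed(chain):
        env_targ = wrapper_cls(env_targ)
    return env_targ


env_ref = ToyPartialWrapper(ToyNormWrapper(ToyEnv()))
env_like = toy_wrap_like_other_env(ToyEnv(), env_ref)
assert type(env_like) == type(env_ref)
assert type(env_like.wrapped_env) == type(env_ref.wrapped_env)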
Example #2
def test_bayrn_power(ex_dir, env: SimEnv, bayrn_hparam: dict):
    pyrado.set_seed(0)

    # Environments and domain randomization
    env_real = deepcopy(env)
    env_sim = DomainRandWrapperLive(env, create_zero_var_randomizer(env))
    dp_map = create_default_domain_param_map_qq()
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)
    env_real.domain_param = dict(mass_pend_pole=0.024 * 1.1,
                                 mass_rot_pole=0.095 * 1.1)
    env_real = wrap_like_other_env(env_real, env_sim)

    # Policy and subroutine
    policy_hparam = dict(energy_gain=0.587, ref_energy=0.827)
    policy = QQubeSwingUpAndBalanceCtrl(env_sim.spec, **policy_hparam)
    subrtn_hparam = dict(
        max_iter=1,
        pop_size=8,
        num_init_states_per_domain=1,
        num_is_samples=4,
        expl_std_init=0.1,
        num_workers=1,
    )
    subrtn = PoWER(ex_dir, env_sim, policy, **subrtn_hparam)

    # Set the boundaries for the GP
    dp_nom = inner_env(env_sim).get_nominal_domain_param()
    ddp_space = BoxSpace(
        bound_lo=np.array([
            0.8 * dp_nom["mass_pend_pole"], 1e-8,
            0.8 * dp_nom["mass_rot_pole"], 1e-8
        ]),
        bound_up=np.array([
            1.2 * dp_nom["mass_pend_pole"], 1e-7,
            1.2 * dp_nom["mass_rot_pole"], 1e-7
        ]),
    )

    # Create algorithm and train
    algo = BayRn(ex_dir,
                 env_sim,
                 env_real,
                 subrtn,
                 ddp_space,
                 **bayrn_hparam,
                 num_workers=1)
    algo.train()

    assert algo.curr_iter == algo.max_iter or algo.stopping_criterion_met()
Example #3
def experiment_wo_distruber(env_real: QuanserReal, env_sim: SimEnv):
    # Wrap the environment in the same way as during training
    env_real = wrap_like_other_env(env_real, env_sim)

    # Run learned policy on the device
    print_cbt("Running the evaluation policy ...", "c", bright=True)
    return rollout(
        env_real,
        policy,
        eval=True,
        max_steps=args.max_steps,
        render_mode=RenderMode(text=True),
        no_reset=True,
        no_close=True,
    )
Example #4
    # Set seed if desired
    pyrado.set_seed(args.seed, verbose=True)

    # Environments
    env_sim_hparams = dict(dt=1 / 100.0, max_steps=600)
    env_sim = QQubeSwingUpSim(**env_sim_hparams)
    env_sim = ActNormWrapper(env_sim)
    env_sim = DomainRandWrapperLive(env_sim,
                                    create_zero_var_randomizer(env_sim))
    dp_map = create_default_domain_param_map_qq()
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    env_real_hparams = dict(dt=1 / 500.0, max_steps=3000)
    env_real = QQubeSwingUpReal(**env_real_hparams)
    env_real = wrap_like_other_env(env_real, env_sim)

    # Policy
    policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    policy = FNNPolicy(spec=env_sim.spec, **policy_hparam)

    # Critic
    vfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.tanh)
    vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace),
                     **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9844224855479998,
        lamda=0.9700148505302241,
        num_epoch=5,
        batch_size=500,
        standardize_adv=False,
Example #5
def experiment_w_distruber(env_real: QuanserReal, env_sim: SimEnv):
    # Wrap the environment in the same way as during training
    env_real = wrap_like_other_env(env_real, env_sim)

    # Run learned policy on the device
    print_cbt("Running the evaluation policy ...", "c", bright=True)
    ro1 = rollout(
        env_real,
        policy,
        eval=True,
        max_steps=args.max_steps // 3,
        render_mode=RenderMode(),
        no_reset=True,
        no_close=True,
    )

    # Run disturber
    env_real = inner_env(env_real)  # since we are reusing it
    print_cbt("Running the 1st disturber ...", "c", bright=True)
    rollout(
        env_real,
        disturber_pos,
        eval=True,
        max_steps=steps_disturb,
        render_mode=RenderMode(),
        no_reset=True,
        no_close=True,
    )

    # Wrap the environment in the same way as during training
    env_real = wrap_like_other_env(env_real, env_sim)

    # Run learned policy on the device
    print_cbt("Running the evaluation policy ...", "c", bright=True)
    ro2 = rollout(
        env_real,
        policy,
        eval=True,
        max_steps=args.max_steps // 3,
        render_mode=RenderMode(),
        no_reset=True,
        no_close=True,
    )

    # Run disturber
    env_real = inner_env(env_real)  # since we are reusing it
    print_cbt("Running the 2nd disturber ...", "c", bright=True)
    rollout(
        env_real,
        disturber_neg,
        eval=True,
        max_steps=steps_disturb,
        render_mode=RenderMode(),
        no_reset=True,
        no_close=True,
    )

    # Wrap the environment in the same way as during training
    env_real = wrap_like_other_env(env_real, env_sim)

    # Run learned policy on the device
    print_cbt("Running the evaluation policy ...", "c", bright=True)
    ro3 = rollout(
        env_real,
        policy,
        eval=True,
        max_steps=args.max_steps // 3,
        render_mode=RenderMode(),
        no_reset=True,
        no_close=True,
    )

    return StepSequence.concat([ro1, ro2, ro3])
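
The three evaluation segments in `experiment_w_distruber` repeat the same cycle: re-apply the training wrappers, roll out the policy for a third of the steps, then strip the wrappers and run a disturber on the bare device. Below is a hedged refactoring sketch of that cycle; the helper `_eval_segment` and its signature are hypothetical, it reuses the names already available in the script above (`rollout`, `inner_env`, `wrap_like_other_env`, `RenderMode`, `print_cbt`, `policy`, `args`, `steps_disturb`, `disturber_pos`, `disturber_neg`, `StepSequence`), and it assumes that `wrap_like_other_env` builds fresh wrappers around the passed environment, as Example #1 suggests.

def _eval_segment(env_real, env_sim, disturber=None):
    # Hypothetical helper, not part of the original script
    # Wrap the environment in the same way as during training
    wrapped = wrap_like_other_env(env_real, env_sim)

    # Run the learned policy on the device
    print_cbt("Running the evaluation policy ...", "c", bright=True)
    ro = rollout(
        wrapped,
        policy,
        eval=True,
        max_steps=args.max_steps // 3,
        render_mode=RenderMode(),
        no_reset=True,
        no_close=True,
    )

    if disturber is not None:
        # Strip the wrappers, since the bare device environment is reused for the disturber
        print_cbt("Running the disturber ...", "c", bright=True)
        rollout(
            inner_env(wrapped),
            disturber,
            eval=True,
            max_steps=steps_disturb,
            render_mode=RenderMode(),
            no_reset=True,
            no_close=True,
        )

    return ro


def experiment_w_distruber_sketch(env_real: QuanserReal, env_sim: SimEnv):
    # Same segment sequence as above, expressed via the hypothetical helper
    ro1 = _eval_segment(env_real, env_sim, disturber_pos)
    ro2 = _eval_segment(env_real, env_sim, disturber_neg)
    ro3 = _eval_segment(env_real, env_sim)
    return StepSequence.concat([ro1, ro2, ro3])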
Example #6
    # Evaluate all policies
    for i, (env_sim, policy) in enumerate(zip(env_sim_list, policy_list)):
        # Create a new sampler pool for every policy to synchronize the random seeds, i.e., the initial states
        pool = SamplerPool(args.num_workers)

        # Seed the sampler
        if args.seed is not None:
            pool.set_seed(args.seed)
            print_cbt(
                f"Set the random number generators' seed to {args.seed}.", "w")
        else:
            print_cbt("No seed was set", "y")

        # Add the same wrappers as during training
        env = wrap_like_other_env(env, env_sim)

        # Sample rollouts
        ros = eval_nominal_domain(pool, env, policy, init_state_list)

        # Compute the result metrics
        rets = [ro.undiscounted_return() for ro in ros]
        lengths = [float(ro.length) for ro in ros]  # int values are not numeric in pandas
        df = df.append(
            pd.DataFrame(dict(policy=ex_labels[i], ret=rets, len=lengths)),
            ignore_index=True,
        )

    metrics = dict(
        avg_len=df.groupby("policy").mean()["len"].to_dict(),
        avg_ret=df.groupby("policy").mean()["ret"].to_dict(),