Example #1
def create_qqsu_setup():
    # Environments
    env_hparams = dict(dt=1 / 100.0, max_steps=600)
    env_real = QQubeSwingUpSim(**env_hparams)
    # Create the pseudo-real target domain by perturbing the nominal parameters by +/-10%
    env_real.domain_param = dict(
        mass_rot_pole=0.095 * 0.9,  # 0.095*0.9 = 0.0855
        mass_pend_pole=0.024 * 1.1,  # 0.024*1.1 = 0.0264
        length_rot_pole=0.085 * 0.9,  # 0.085*0.9 = 0.0765
        length_pend_pole=0.129 * 1.1,  # 0.129*1.1 = 0.1419
    )

    env_sim = QQubeSwingUpSim(**env_hparams)
    # Dummy distribution parameters; they are overwritten through the MetaDomainRandWrapper via dp_map
    randomizer = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole",
                          mean=0.0,
                          std=1e-9,
                          clip_lo=1e-3),
        NormalDomainParam(name="mass_pend_pole",
                          mean=0.0,
                          std=1e-9,
                          clip_lo=1e-3),
        NormalDomainParam(name="length_rot_pole",
                          mean=0.0,
                          std=1e-9,
                          clip_lo=1e-3),
        NormalDomainParam(name="length_pend_pole",
                          mean=0.0,
                          std=1e-9,
                          clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ("mass_rot_pole", "mean"),
        1: ("mass_rot_pole", "std"),
        2: ("mass_pend_pole", "mean"),
        3: ("mass_pend_pole", "std"),
        4: ("length_rot_pole", "mean"),
        5: ("length_rot_pole", "std"),
        6: ("length_pend_pole", "mean"),
        7: ("length_pend_pole", "std"),
    }
    # Mask selecting which of the 8 distribution parameters are optimized in a transformed space
    # (here all of them); the commented-out alternative would transform only the standard deviations
    # trafo_mask = [False, True, False, True, False, True, False, True]
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Policies (the behavioral policy needs to be deterministic)
    behavior_policy = QQubeSwingUpAndBalanceCtrl(env_sim.spec)
    prior = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole", mean=0.095, std=0.095 / 10),
        NormalDomainParam(name="mass_pend_pole", mean=0.024, std=0.024 / 10),
        NormalDomainParam(name="length_rot_pole", mean=0.085, std=0.085 / 10),
        NormalDomainParam(name="length_pend_pole", mean=0.129, std=0.129 / 10),
    )
    ddp_policy = DomainDistrParamPolicy(mapping=dp_map,
                                        trafo_mask=trafo_mask,
                                        prior=prior,
                                        scale_params=False)

    return env_sim, env_real, env_hparams, dp_map, behavior_policy, ddp_policy
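

# Usage sketch (an assumption, not part of the original script): unpack the helper's return
# values; the perturbed env_real then serves as the pseudo-real target domain.
env_sim, env_real, env_hparams, dp_map, behavior_policy, ddp_policy = create_qqsu_setup()
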
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.
    
    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments (see the usage sketch after this function).

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environments
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env_real = QQubeSwingUpSim(**env_hparams)
    env_real.domain_param = dict(
        Mr=0.095 * 0.9,  # 0.095*0.9 = 0.0855
        Mp=0.024 * 1.1,  # 0.024*1.1 = 0.0264
        Lr=0.085 * 0.9,  # 0.085*0.9 = 0.0765
        Lp=0.129 * 1.1,  # 0.129*1.1 = 0.1419
    )

    env_sim = QQubeSwingUpSim(**env_hparams)
    # Dummy distribution parameters; they are overwritten through the MetaDomainRandWrapper via dp_map
    randomizer = DomainRandomizer(
        NormalDomainParam(name='Mr', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Mp', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Lr', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Lp', mean=0., std=1e6, clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ('Mr', 'mean'),
        1: ('Mr', 'std'),
        2: ('Mp', 'mean'),
        3: ('Mp', 'std'),
        4: ('Lr', 'mean'),
        5: ('Lr', 'std'),
        6: ('Lp', 'mean'),
        7: ('Lp', 'std')
    }
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Subroutine for policy improvement
    behav_policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    behav_policy = FNNPolicy(spec=env_sim.spec, **behav_policy_hparam)
    vfcn_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace),
                     **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9885,
        lamda=0.9648,
        num_epoch=2,
        batch_size=500,
        standardize_adv=False,
        lr=5.792e-4,
        max_grad_norm=1.,
    )
    critic = GAE(vfcn, **critic_hparam)
    subrtn_policy_hparam = dict(
        max_iter=200,
        min_steps=3 * 23 * env_sim.max_steps,
        num_epoch=7,
        eps_clip=0.0744,
        batch_size=500,
        std_init=0.9074,
        lr=3.446e-04,
        max_grad_norm=1.,
        num_workers=1,
    )
    subrtn_policy = PPO(study_dir, env_sim, behav_policy, critic,
                        **subrtn_policy_hparam)

    # Subroutine for system identification
    prior_std_denom = trial.suggest_uniform('prior_std_denom', 5, 20)
    prior = DomainRandomizer(
        NormalDomainParam(name='Mr', mean=0.095, std=0.095 / prior_std_denom),
        NormalDomainParam(name='Mp', mean=0.024, std=0.024 / prior_std_denom),
        NormalDomainParam(name='Lr', mean=0.085, std=0.085 / prior_std_denom),
        NormalDomainParam(name='Lp', mean=0.129, std=0.129 / prior_std_denom),
    )
    ddp_policy = DomainDistrParamPolicy(
        mapping=dp_map,
        trafo_mask=trafo_mask,
        prior=prior,
        scale_params=trial.suggest_categorical('ddp_policy_scale_params',
                                               [True, False]),
    )
    subsubrtn_distr_hparam = dict(
        max_iter=trial.suggest_categorical('subsubrtn_distr_max_iter', [20]),
        pop_size=trial.suggest_int('pop_size', 50, 500),
        num_rollouts=1,
        num_is_samples=trial.suggest_int('num_is_samples', 5, 20),
        expl_std_init=trial.suggest_loguniform('expl_std_init', 1e-3, 1e-1),
        expl_std_min=trial.suggest_categorical('expl_std_min', [1e-4]),
        extra_expl_std_init=trial.suggest_loguniform('extra_expl_std_init',
                                                     1e-3, 1e-1),
        extra_expl_decay_iter=trial.suggest_int('extra_expl_decay_iter', 0,
                                                10),
        num_workers=1,
    )
    csv_logger = create_csv_step_logger(
        osp.join(study_dir, f'trial_{trial.number}'))
    subsubrtn_distr = CEM(study_dir,
                          env_sim,
                          ddp_policy,
                          **subsubrtn_distr_hparam,
                          logger=csv_logger)
    obs_vel_weight = trial.suggest_loguniform('obs_vel_weight', 1, 100)
    subrtn_distr_hparam = dict(
        metric=None,
        obs_dim_weight=[1, 1, 1, 1, obs_vel_weight, obs_vel_weight],
        num_rollouts_per_distr=trial.suggest_int('num_rollouts_per_distr', 20,
                                                 100),
        num_workers=1,
    )
    subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr, behav_policy,
                                      **subrtn_distr_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=trial.suggest_categorical('algo_max_iter', [10]),
        num_eval_rollouts=trial.suggest_categorical('algo_num_eval_rollouts',
                                                    [5]),
        warmstart=trial.suggest_categorical('algo_warmstart', [True]),
        thold_succ_subrtn=trial.suggest_categorical('algo_thold_succ_subrtn',
                                                    [50]),
        subrtn_snapshot_mode='latest',
    )
    algo = SimOpt(study_dir,
                  env_sim,
                  env_real,
                  subrtn_policy,
                  subrtn_distr,
                  **algo_hparam,
                  logger=csv_logger)

    # Jeeeha
    algo.train(seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env_real, algo.policy, num_workers=1,
        min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
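

# Sketch of how train_and_eval might be handed to Optuna (the study name, directory, and trial
# budget below are assumptions), using functools.partial as noted in the docstring, since Optuna
# by itself only passes the trial argument.
import functools

study_dir = 'temp/qqsu_simopt_study'  # hypothetical location
study = optuna.create_study(study_name='qqsu_simopt', direction='maximize')
study.optimize(functools.partial(train_and_eval, study_dir=study_dir, seed=0), n_trials=10)
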
if __name__ == '__main__':
    # Parse command line arguments
    args = get_argparser().parse_args()

    # Experiment (set seed before creating the modules)
    ex_dir = setup_experiment(QQubeSwingUpSim.name,
                              f'{BayRn.name}-{PoWER.name}_{QQubeSwingUpAndBalanceCtrl.name}_sim2sim',
                              f'rand-Mp-Mr_seed-{args.seed}')

    # Set seed if desired
    pyrado.set_seed(args.seed, verbose=True)

    # Environments
    env_sim_hparams = dict(dt=1/100., max_steps=600)
    env_sim = QQubeSwingUpSim(**env_sim_hparams)
    env_sim = DomainRandWrapperLive(env_sim, create_zero_var_randomizer(env_sim))
    dp_map = get_default_domain_param_map_qq()
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    env_real = QQubeSwingUpSim(**env_sim_hparams)
    env_real.domain_param = dict(
        Mp=0.024*1.1,
        Mr=0.095*1.1,
    )
    env_real_hparams = env_sim_hparams
    env_real = wrap_like_other_env(env_real, env_sim)

    # PoWER + energy-based controller setup
    policy_hparam = dict(energy_gain=0.587, ref_energy=0.827, acc_max=10.)
    policy = QQubeSwingUpAndBalanceCtrl(env_sim.spec, **policy_hparam)
    subrtn_hparam = dict(
        max_iter=10,
        pop_size=50,
        num_rollouts=8,
        num_is_samples=5,
        expl_std_init=2.0,
        expl_std_min=0.02,
        symm_sampling=False,
    )
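
    # Sketch of the presumed next step (not shown in this excerpt): the energy-based policy and the
    # hyper-parameters above are passed to the PoWER subroutine, which BayRn later runs in simulation
    subrtn = PoWER(ex_dir, env_sim, policy, **subrtn_hparam)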

    # Set seed if desired
    pyrado.set_seed(args.seed, verbose=True)

    # Environments
    env_sim_hparams = dict(dt=1 / 100.0, max_steps=600)
    env_sim = QQubeSwingUpSim(**env_sim_hparams)
    env_sim = DomainRandWrapperLive(env_sim,
                                    create_zero_var_randomizer(env_sim))
    dp_map = create_default_domain_param_map_qq()
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    env_real = QQubeSwingUpSim(**env_sim_hparams)
    env_real.domain_param = dict(
        mass_pend_pole=0.024 * 1.1,
        mass_rot_pole=0.095 * 1.1,
    )
    env_real_hparams = env_sim_hparams
    env_real = wrap_like_other_env(env_real, env_sim)

    # PoWER and energy-based controller setup
    policy_hparam = dict(energy_gain=0.587, ref_energy=0.827, acc_max=10.0)
    policy = QQubeSwingUpAndBalanceCtrl(env_sim.spec, **policy_hparam)
    subrtn_hparam = dict(
        max_iter=5,
        pop_size=50,
        num_init_states_per_domain=4,
        num_domains=10,
        num_is_samples=5,
        expl_std_init=2.0,
        expl_std_min=0.02,