Example 1

def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1 / 250., max_steps=1500)
    env = QQubeSwingUpSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(feats=FeatureStack([
        identity_feat, sign_feat, abs_feat, squared_feat, cubic_feat,
        ATan2Feat(1, 2),
        MultFeat([4, 5])
    ]))
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # use a single worker here; parallelization is done via Optuna's n_jobs
        max_iter=50,
        pop_size=trial.suggest_int('pop_size', 50, 200),
        num_rollouts=trial.suggest_int('num_rollouts', 4, 10),
        num_is_samples=trial.suggest_int('num_is_samples', 5, 40),
        expl_std_init=trial.suggest_uniform('expl_std_init', 0.1, 0.5),
        symm_sampling=trial.suggest_categorical('symm_sampling', [True, False]),
    )
    trial_dir = osp.join(study_dir, f'trial_{trial.number}')
    csv_logger = create_csv_step_logger(trial_dir)
    algo = PoWER(trial_dir, env, policy, **algo_hparam, logger=csv_logger)

    # Train; only the latest snapshot is kept in the trial's directory
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env, policy, num_workers=1, min_rollouts=min_rollouts
    )  # a single worker here, too, since parallelization is handled by Optuna's n_jobs
    ros = sampler.sample()
    mean_ret = sum(r.undiscounted_return() for r in ros) / min_rollouts

    return mean_ret
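# The docstring above notes that Optuna passes only `trial` to the objective, so the remaining
# arguments have to be bound beforehand. Below is a minimal sketch of how this objective could be
# handed to a study via `functools.partial`; the directory, seed, trial count, and job count are
# illustrative assumptions, not values taken from this snippet.
import functools
import optuna

study_dir = 'data/hparam/qq-su_power'  # hypothetical parent directory for all trials of this study
study = optuna.create_study(direction='maximize')
study.optimize(functools.partial(train_and_eval, study_dir=study_dir, seed=1001),
               n_trials=100, n_jobs=8)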

Example 2

    env = DomainRandWrapperLive(env, randomizer)

    # Policy
    bounds = ([0.0, 0.25, 0.5], [1.0, 1.5, 2.5])
    policy_hparam = dict(rbf_hparam=dict(num_feat_per_dim=9, bounds=bounds, scale=None), dim_mask=2)
    policy = DualRBFLinearPolicy(env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=15,
        pop_size=100,
        num_is_samples=10,
        num_init_states_per_domain=2,
        num_domains=10,
        expl_std_init=np.pi / 12,
        expl_std_min=0.02,
        num_workers=8,
    )
    algo = PoWER(ex_dir, env, policy, **algo_hparam)

    # Save the hyper-parameters
    save_dicts_to_yaml(
        dict(env=env_hparams, seed=args.seed),
        dict(policy=policy_hparam),
        dict(algo=algo_hparam, algo_name=algo.name),
        save_dir=ex_dir,
    )

    # Jeeeha
    algo.train(seed=args.seed, snapshot_mode="best")
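# The snippet above is truncated at the top: `env`, `env_hparams`, `randomizer`, `ex_dir`, and
# `args` are created earlier in the original script. As a minimal sketch, a randomizer like the one
# wrapped via `DomainRandWrapperLive` above is typically assembled from domain parameters; the
# parameter name and values are illustrative, and the import paths are assumed from pyrado's
# module layout.
from pyrado.domain_randomization.domain_parameter import UniformDomainParam
from pyrado.domain_randomization.domain_randomizer import DomainRandomizer

randomizer = DomainRandomizer(
    UniformDomainParam(name="joint_2_damping", mean=9.4057e-03, halfspan=5.0000e-04, clip_lo=1e-6),
)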

Example 3

    env = DomainRandWrapperLive(env, randomizer)

    # Policy
    policy_hparam = dict(rbf_hparam=dict(num_feat_per_dim=10,
                                         bounds=(0., 1.),
                                         scale=None),
                         dim_mask=2)
    policy = DualRBFLinearPolicy(env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=50,
        pop_size=200,
        num_is_samples=10,
        num_rollouts=8,
        expl_std_init=np.pi / 12,
        expl_std_min=0.02,
        num_workers=8,
    )
    algo = PoWER(ex_dir, env, policy, **algo_hparam)

    # Save the hyper-parameters
    save_list_of_dicts_to_yaml([
        dict(env=env_hparams, seed=args.seed),
        dict(policy=policy_hparam),
        dict(algo=algo_hparam, algo_name=algo.name)
    ], ex_dir)

    # Jeeeha
    algo.train(seed=args.seed, snapshot_mode='best')
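# After training, the policy held in memory (i.e. the one from the final iteration) can be
# evaluated the same way as in the first example, by averaging the undiscounted return over a
# batch of rollouts; the worker and rollout counts below are illustrative assumptions.
eval_sampler = ParallelRolloutSampler(env, policy, num_workers=8, min_rollouts=100)
ros = eval_sampler.sample()
mean_ret = sum(ro.undiscounted_return() for ro in ros) / len(ros)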

Example 4

        UniformDomainParam(name="joint_2_damping",
                           mean=9.4057e-03,
                           halfspan=5.0000e-04,
                           clip_lo=1e-6),
    )
    env = DomainRandWrapperLive(env, randomizer)

    # Policy
    policy_hparam = hparams["policy"]
    policy_hparam["rbf_hparam"].update({"scale": None})
    policy = DualRBFLinearPolicy(env.spec, **policy_hparam)
    policy.param_values = to.tensor(hparams["algo"]["policy_param_init"])

    # Algorithm
    algo_hparam = hparams["subroutine"]
    algo_hparam.update({"num_workers": 8})  # should be equivalent to the number of cores per job
    algo = PoWER(ex_dir, env, policy, **algo_hparam)

    # Save the hyper-parameters
    save_dicts_to_yaml(
        dict(env=env_hparams, seed=ex_dir.seed),
        dict(policy=policy_hparam),
        dict(algo=algo_hparam, algo_name=algo.name),
        save_dir=ex_dir,
    )

    # Jeeeha
    algo.train(seed=ex_dir.seed, snapshot_mode="latest")
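# In the example above, `hparams` is loaded from a previous experiment (not shown due to the
# truncation). Judging from the keys accessed in the snippet, its structure is roughly the
# following; every value here is a placeholder for illustration, not data from a real run.
import numpy as np

hparams = dict(
    policy=dict(rbf_hparam=dict(num_feat_per_dim=9, bounds=(0.0, 1.0), scale=1.0), dim_mask=2),
    algo=dict(policy_param_init=[0.0] * 18),  # flat list/array, length = policy parameter count
    subroutine=dict(max_iter=50, pop_size=200, num_is_samples=10, expl_std_init=np.pi / 12),
)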

Example 5

                           clip_lo=0),
        UniformDomainParam(name='joint_damping',
                           mean=9.4057e-03,
                           halfspan=5.0000e-04,
                           clip_lo=1e-6),
    )
    env = DomainRandWrapperLive(env, randomizer)

    # Policy
    policy_hparam = hparams['policy']
    policy_hparam['rbf_hparam'].update({'scale': None})
    policy = DualRBFLinearPolicy(env.spec, **policy_hparam)
    policy.param_values = to.tensor(hparams['algo']['policy_param_init'])

    # Algorithm
    algo_hparam = hparams['subroutine']
    algo_hparam.update({'num_workers': 8})  # should be equivalent to the number of cores per job
    algo = PoWER(ex_dir, env, policy, **algo_hparam)

    # Save the hyper-parameters
    save_list_of_dicts_to_yaml([
        dict(env=env_hparams, seed=ex_dir.seed),
        dict(policy=policy_hparam),
        dict(algo=algo_hparam, algo_name=algo.name)
    ], ex_dir)

    # Jeeeha
    algo.train(seed=ex_dir.seed, snapshot_mode='latest')