コード例 #1
0
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1 / 250., max_steps=1500)
    env = QQubeSwingUpSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(feats=FeatureStack([
        identity_feat, sign_feat, abs_feat, squared_feat, cubic_feat,
        ATan2Feat(1, 2),
        MultFeat([4, 5])
    ]))
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=50,
        pop_size=trial.suggest_int('pop_size', 50, 200),
        num_rollouts=trial.suggest_int('num_rollouts', 4, 10),
        num_is_samples=trial.suggest_int('num_is_samples', 5, 40),
        expl_std_init=trial.suggest_uniform('expl_std_init', 0.1, 0.5),
        symm_sampling=trial.suggest_categorical('symm_sampling',
                                                [True, False]),
    )
    csv_logger = create_csv_step_logger(
        osp.join(study_dir, f'trial_{trial.number}'))
    algo = PoWER(osp.join(study_dir, f'trial_{trial.number}'),
                 env,
                 policy,
                 **algo_hparam,
                 logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env, policy, num_workers=1,
        min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
コード例 #2
0
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: [int, None]):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env = QQubeSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        shared_hidden_sizes=trial.suggest_categorical(
            'shared_hidden_sizes_policy',
            [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]),
        shared_hidden_nonlin=fcn_from_str(
            trial.suggest_categorical('shared_hidden_nonlin_policy',
                                      ['to_tanh', 'to_relu'])),
    )
    policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam)

    # Critic
    q_fcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical(
            'hidden_sizes_critic',
            [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical('hidden_nonlin_critic',
                                      ['to_tanh', 'to_relu'])),
    )
    obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
    q_fcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace),
                        **q_fcn_hparam)
    q_fcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace),
                        **q_fcn_hparam)

    # Algorithm
    algo_hparam = dict(
        num_sampler_envs=1,  # parallelize via optuna n_jobs
        max_iter=100 * env.max_steps,
        min_steps=trial.suggest_categorical(
            'min_steps_algo', [1]),  # , 10, env.max_steps, 10*env.max_steps
        memory_size=trial.suggest_loguniform('memory_size_algo',
                                             1e2 * env.max_steps,
                                             1e4 * env.max_steps),
        tau=trial.suggest_uniform('tau_algo', 0.99, 1.),
        alpha_init=trial.suggest_uniform('alpha_init_algo', 0.1, 0.9),
        learn_alpha=trial.suggest_categorical('learn_alpha_algo',
                                              [True, False]),
        standardize_rew=trial.suggest_categorical('standardize_rew_algo',
                                                  [False]),
        gamma=trial.suggest_uniform('gamma_algo', 0.99, 1.),
        target_update_intvl=trial.suggest_categorical(
            'target_update_intvl_algo', [1, 5]),
        num_batch_updates=trial.suggest_categorical('num_batch_updates_algo',
                                                    [1, 5]),
        batch_size=trial.suggest_categorical('batch_size_algo',
                                             [128, 256, 512]),
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
    )
    csv_logger = create_csv_step_logger(
        osp.join(ex_dir, f'trial_{trial.number}'))
    algo = SAC(ex_dir,
               env,
               policy,
               q_fcn_1,
               q_fcn_2,
               **algo_hparam,
               logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelSampler(
        env, policy, num_envs=1,
        min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
コード例 #3
0
ファイル: hopt_tspred_adn.py プロジェクト: fdamken/SimuRLacra
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Load the data
    data_set_name = "oscillation_50Hz_initpos-0.5"
    data = pd.read_csv(osp.join(pyrado.PERMA_DIR, "misc", f"{data_set_name}.csv"))
    if data_set_name == "daily_min_temperatures":
        data = to.tensor(data["Temp"].values, dtype=to.get_default_dtype()).view(-1, 1)
    elif data_set_name == "monthly_sunspots":
        data = to.tensor(data["Sunspots"].values, dtype=to.get_default_dtype()).view(-1, 1)
    elif "oscillation" in data_set_name:
        data = to.tensor(data["Positions"].values, dtype=to.get_default_dtype()).view(-1, 1)
    else:
        raise pyrado.ValueErr(
            given=data_set_name,
            eq_constraint="'daily_min_temperatures', 'monthly_sunspots', "
            "'oscillation_50Hz_initpos-0.5', or 'oscillation_100Hz_initpos-0.4",
        )

    # Dataset
    data_set_hparam = dict(
        name=data_set_name,
        ratio_train=0.7,
        window_size=trial.suggest_int("dataset_window_size", 1, 100),
        standardize_data=False,
        scale_min_max_data=True,
    )
    dataset = TimeSeriesDataSet(data, **data_set_hparam)

    # Policy
    policy_hparam = dict(
        dt=0.02 if "oscillation" in data_set_name else 1.0,
        obs_layer=None,
        activation_nonlin=to.tanh,
        potentials_dyn_fcn=fcn_from_str(
            trial.suggest_categorical("policy_potentials_dyn_fcn", ["pd_linear", "pd_cubic"])
        ),
        tau_init=trial.suggest_loguniform("policy_tau_init", 1e-2, 1e3),
        tau_learnable=True,
        kappa_init=trial.suggest_categorical("policy_kappa_init", [0, 1e-4, 1e-2]),
        kappa_learnable=True,
        capacity_learnable=True,
        potential_init_learnable=trial.suggest_categorical("policy_potential_init_learnable", [True, False]),
        init_param_kwargs=trial.suggest_categorical("policy_init_param_kwargs", [None]),
        use_cuda=False,
    )
    policy = ADNPolicy(spec=EnvSpec(act_space=InfBoxSpace(shape=1), obs_space=InfBoxSpace(shape=1)), **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        windowed=trial.suggest_categorical("algo_windowed", [True, False]),
        max_iter=1000,
        optim_class=optim.Adam,
        optim_hparam=dict(
            lr=trial.suggest_uniform("optim_lr", 5e-4, 5e-2),
            eps=trial.suggest_uniform("optim_eps", 1e-8, 1e-5),
            weight_decay=trial.suggest_uniform("optim_weight_decay", 5e-5, 5e-3),
        ),
        loss_fcn=nn.MSELoss(),
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f"trial_{trial.number}"))
    algo = TSPred(study_dir, dataset, policy, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode="latest", seed=seed)

    # Evaluate
    num_init_samples = dataset.window_size
    _, loss_trn = TSPred.evaluate(
        policy,
        dataset.data_trn_inp,
        dataset.data_trn_targ,
        windowed=algo.windowed,
        num_init_samples=num_init_samples,
        cascaded=False,
    )
    _, loss_tst = TSPred.evaluate(
        policy,
        dataset.data_tst_inp,
        dataset.data_tst_targ,
        windowed=algo.windowed,
        num_init_samples=num_init_samples,
        cascaded=False,
    )

    return loss_trn
コード例 #4
0
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(physicsEngine="Bullet", dt=1 / 100.0, max_steps=500)
    env = BallOnPlate2DSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        shared_hidden_sizes=trial.suggest_categorical(
            "shared_hidden_sizes_policy", [(16, 16), (32, 32), (64, 64),
                                           (16, 16, 16), (32, 32, 32)]),
        shared_hidden_nonlin=fcn_from_str(
            trial.suggest_categorical("shared_hidden_nonlin_policy",
                                      ["to_tanh", "to_relu"])),
    )
    policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam)

    # Critic
    qfcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical("hidden_sizes_critic",
                                               [(16, 16), (32, 32), (64, 64),
                                                (16, 16, 16), (32, 32, 32)]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical("hidden_nonlin_critic",
                                      ["to_tanh", "to_relu"])),
    )
    obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
    qfcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace),
                       **qfcn_hparam)
    qfcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace),
                       **qfcn_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=100 * env.max_steps,
        min_steps=trial.suggest_categorical(
            "min_steps_algo", [1]),  # 10, env.max_steps, 10*env.max_steps
        memory_size=trial.suggest_loguniform("memory_size_algo",
                                             1e2 * env.max_steps,
                                             1e4 * env.max_steps),
        tau=trial.suggest_uniform("tau_algo", 0.99, 1.0),
        ent_coeff_init=trial.suggest_uniform("ent_coeff_init_algo", 0.1, 0.9),
        learn_ent_coeff=trial.suggest_categorical("learn_ent_coeff_algo",
                                                  [True, False]),
        standardize_rew=trial.suggest_categorical("standardize_rew_algo",
                                                  [False]),
        gamma=trial.suggest_uniform("gamma_algo", 0.99, 1.0),
        target_update_intvl=trial.suggest_categorical(
            "target_update_intvl_algo", [1, 5]),
        num_updates_per_step=trial.suggest_categorical(
            "num_batch_updates_algo", [1, 5]),
        batch_size=trial.suggest_categorical("batch_size_algo",
                                             [128, 256, 512]),
        lr=trial.suggest_loguniform("lr_algo", 1e-5, 1e-3),
    )
    csv_logger = create_csv_step_logger(
        osp.join(study_dir, f"trial_{trial.number}"))
    algo = SAC(study_dir,
               env,
               policy,
               qfcn_1,
               qfcn_2,
               **algo_hparam,
               logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode="latest", seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env, policy, num_workers=1,
        min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
コード例 #5
0
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.
    
    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Load the data
    data_set_name = 'oscillation_50Hz_initpos-0.5'
    data = pd.read_csv(osp.join(pyrado.PERMA_DIR, 'time_series', f'{data_set_name}.csv'))
    if data_set_name == 'daily_min_temperatures':
        data = to.tensor(data['Temp'].values, dtype=to.get_default_dtype()).view(-1, 1)
    elif data_set_name == 'monthly_sunspots':
        data = to.tensor(data['Sunspots'].values, dtype=to.get_default_dtype()).view(-1, 1)
    elif 'oscillation' in data_set_name:
        data = to.tensor(data['Positions'].values, dtype=to.get_default_dtype()).view(-1, 1)
    else:
        raise pyrado.ValueErr(
            given=data_set_name, eq_constraint="'daily_min_temperatures', 'monthly_sunspots', "
                                               "'oscillation_50Hz_initpos-0.5', or 'oscillation_100Hz_initpos-0.4")

    # Dataset
    data_set_hparam = dict(
        name=data_set_name,
        ratio_train=0.7,
        window_size=trial.suggest_int('dataset_window_size', 1, 100),
        standardize_data=False,
        scale_min_max_data=True
    )
    dataset = TimeSeriesDataSet(data, **data_set_hparam)

    # Policy
    policy_hparam = dict(
        dt=0.02 if 'oscillation' in data_set_name else 1.,
        hidden_size=trial.suggest_int('policy_hidden_size', 2, 51),
        obs_layer=None,
        activation_nonlin=fcn_from_str(
            trial.suggest_categorical('policy_activation_nonlin', ['to_tanh', 'to_sigmoid'])),
        mirrored_conv_weights=trial.suggest_categorical('policy_mirrored_conv_weights', [True, False]),
        conv_out_channels=1,
        conv_kernel_size=None,
        conv_padding_mode=trial.suggest_categorical('policy_conv_padding_mode', ['zeros', 'circular']),
        tau_init=trial.suggest_loguniform('policy_tau_init', 1e-2, 1e3),
        tau_learnable=True,
        kappa_init=trial.suggest_categorical('policy_kappa_init', [0, 1e-4, 1e-2]),
        kappa_learnable=True,
        potential_init_learnable=trial.suggest_categorical('policy_potential_init_learnable', [True, False]),
        init_param_kwargs=trial.suggest_categorical('policy_init_param_kwargs', [None, dict(bell=True)]),
        use_cuda=False
    )
    policy = NFPolicy(spec=EnvSpec(act_space=InfBoxSpace(shape=1), obs_space=InfBoxSpace(shape=1)), **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        windowed=trial.suggest_categorical('algo_windowed', [True, False]),
        max_iter=1000,
        optim_class=optim.Adam,
        optim_hparam=dict(
            lr=trial.suggest_uniform('optim_lr', 5e-4, 5e-2),
            eps=trial.suggest_uniform('optim_eps', 1e-8, 1e-5),
            weight_decay=trial.suggest_uniform('optim_weight_decay', 5e-5, 5e-3)
        ),
        loss_fcn=nn.MSELoss(),
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f'trial_{trial.number}'))
    algo = TSPred(study_dir, dataset, policy, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    num_init_samples = dataset.window_size
    _, loss_trn = TSPred.evaluate(policy, dataset.data_trn_inp, dataset.data_trn_targ, windowed=algo.windowed,
                                  num_init_samples=num_init_samples, cascaded=False)
    _, loss_tst = TSPred.evaluate(policy, dataset.data_tst_inp, dataset.data_tst_targ, windowed=algo.windowed,
                                  num_init_samples=num_init_samples, cascaded=False)

    return loss_trn
コード例 #6
0
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.
    
    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environments
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env_real = QQubeSwingUpSim(**env_hparams)
    env_real.domain_param = dict(
        Mr=0.095 * 0.9,  # 0.095*0.9 = 0.0855
        Mp=0.024 * 1.1,  # 0.024*1.1 = 0.0264
        Lr=0.085 * 0.9,  # 0.085*0.9 = 0.0765
        Lp=0.129 * 1.1,  # 0.129*1.1 = 0.1419
    )

    env_sim = QQubeSwingUpSim(**env_hparams)
    randomizer = DomainRandomizer(
        NormalDomainParam(name='Mr', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Mp', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Lr', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Lp', mean=0., std=1e6, clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ('Mr', 'mean'),
        1: ('Mr', 'std'),
        2: ('Mp', 'mean'),
        3: ('Mp', 'std'),
        4: ('Lr', 'mean'),
        5: ('Lr', 'std'),
        6: ('Lp', 'mean'),
        7: ('Lp', 'std')
    }
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Subroutine for policy improvement
    behav_policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    behav_policy = FNNPolicy(spec=env_sim.spec, **behav_policy_hparam)
    vfcn_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace),
                     **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9885,
        lamda=0.9648,
        num_epoch=2,
        batch_size=500,
        standardize_adv=False,
        lr=5.792e-4,
        max_grad_norm=1.,
    )
    critic = GAE(vfcn, **critic_hparam)
    subrtn_policy_hparam = dict(
        max_iter=200,
        min_steps=3 * 23 * env_sim.max_steps,
        num_epoch=7,
        eps_clip=0.0744,
        batch_size=500,
        std_init=0.9074,
        lr=3.446e-04,
        max_grad_norm=1.,
        num_workers=1,
    )
    subrtn_policy = PPO(study_dir, env_sim, behav_policy, critic,
                        **subrtn_policy_hparam)

    # Subroutine for system identification
    prior_std_denom = trial.suggest_uniform('prior_std_denom', 5, 20)
    prior = DomainRandomizer(
        NormalDomainParam(name='Mr', mean=0.095, std=0.095 / prior_std_denom),
        NormalDomainParam(name='Mp', mean=0.024, std=0.024 / prior_std_denom),
        NormalDomainParam(name='Lr', mean=0.085, std=0.085 / prior_std_denom),
        NormalDomainParam(name='Lp', mean=0.129, std=0.129 / prior_std_denom),
    )
    ddp_policy = DomainDistrParamPolicy(
        mapping=dp_map,
        trafo_mask=trafo_mask,
        prior=prior,
        scale_params=trial.suggest_categorical('ddp_policy_scale_params',
                                               [True, False]),
    )
    subsubrtn_distr_hparam = dict(
        max_iter=trial.suggest_categorical('subsubrtn_distr_max_iter', [20]),
        pop_size=trial.suggest_int('pop_size', 50, 500),
        num_rollouts=1,
        num_is_samples=trial.suggest_int('num_is_samples', 5, 20),
        expl_std_init=trial.suggest_loguniform('expl_std_init', 1e-3, 1e-1),
        expl_std_min=trial.suggest_categorical('expl_std_min', [1e-4]),
        extra_expl_std_init=trial.suggest_loguniform('expl_std_init', 1e-3,
                                                     1e-1),
        extra_expl_decay_iter=trial.suggest_int('extra_expl_decay_iter', 0,
                                                10),
        num_workers=1,
    )
    csv_logger = create_csv_step_logger(
        osp.join(study_dir, f'trial_{trial.number}'))
    subsubrtn_distr = CEM(study_dir,
                          env_sim,
                          ddp_policy,
                          **subsubrtn_distr_hparam,
                          logger=csv_logger)
    obs_vel_weight = trial.suggest_loguniform('obs_vel_weight', 1, 100)
    subrtn_distr_hparam = dict(
        metric=None,
        obs_dim_weight=[1, 1, 1, 1, obs_vel_weight, obs_vel_weight],
        num_rollouts_per_distr=trial.suggest_int('num_rollouts_per_distr', 20,
                                                 100),
        num_workers=1,
    )
    subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr, behav_policy,
                                      **subrtn_distr_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=trial.suggest_categorical('algo_max_iter', [10]),
        num_eval_rollouts=trial.suggest_categorical('algo_num_eval_rollouts',
                                                    [5]),
        warmstart=trial.suggest_categorical('algo_warmstart', [True]),
        thold_succ_subrtn=trial.suggest_categorical('algo_thold_succ_subrtn',
                                                    [50]),
        subrtn_snapshot_mode='latest',
    )
    algo = SimOpt(study_dir,
                  env_sim,
                  env_real,
                  subrtn_policy,
                  subrtn_distr,
                  **algo_hparam,
                  logger=csv_logger)

    # Jeeeha
    algo.train(seed=args.seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env_real, algo.policy, num_workers=1,
        min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
コード例 #7
0
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.
    
    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1/100., max_steps=600)
    env = QQubeSwingUpSim(**env_hparams)
    env = ActNormWrapper(env)

    # Learning rate scheduler
    lrs_gamma = trial.suggest_categorical('exp_lr_scheduler_gamma', [None, 0.995, 0.999])
    if lrs_gamma is not None:
        lr_sched = lr_scheduler.ExponentialLR
        lr_sched_hparam = dict(gamma=lrs_gamma)
    else:
        lr_sched, lr_sched_hparam = None, dict()

    # Policy
    policy_hparam = dict(
        hidden_sizes=trial.suggest_categorical('hidden_sizes_policy', [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_policy', ['to_tanh', 'to_relu'])),
    )  # FNN
    # policy_hparam = dict(
    #     hidden_size=trial.suggest_categorical('hidden_size_policy', [16, 32, 64]),
    #     num_recurrent_layers=trial.suggest_categorical('num_recurrent_layers_policy', [1, 2]),
    # )  # LSTM & GRU
    policy = FNNPolicy(spec=env.spec, **policy_hparam)
    # policy = GRUPolicy(spec=env.spec, **policy_hparam)

    # Critic
    vfcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical('hidden_sizes_critic', [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])),
    )
    # vfcn_hparam = dict(
    #     hidden_size=trial.suggest_categorical('hidden_size_critic', [16, 32, 64]),
    #     num_recurrent_layers=trial.suggest_categorical('num_recurrent_layers_critic', [1, 2]),
    # )  # LSTM & GRU
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    # vfcn = GRUPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        batch_size=500,
        gamma=trial.suggest_uniform('gamma_critic', 0.98, 1.),
        lamda=trial.suggest_uniform('lamda_critic', 0.95, 1.),
        num_epoch=trial.suggest_int('num_epoch_critic', 1, 10),
        lr=trial.suggest_loguniform('lr_critic', 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical('standardize_adv_critic', [False]),
        max_grad_norm=trial.suggest_categorical('max_grad_norm_critic', [None, 1., 5.]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam
    )
    critic = GAE(vfcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=250,
        batch_size=500,
        min_steps=trial.suggest_int('num_rollouts_algo', 10, 30)*env.max_steps,
        num_epoch=trial.suggest_int('num_epoch_algo', 1, 10),
        eps_clip=trial.suggest_uniform('eps_clip_algo', 0.05, 0.2),
        std_init=trial.suggest_uniform('std_init_algo', 0.5, 1.0),
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
        max_grad_norm=trial.suggest_categorical('max_grad_norm_algo', [None, 1., 5.]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f'trial_{trial.number}'))
    algo = PPO(osp.join(study_dir, f'trial_{trial.number}'), env, policy, critic, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env, policy, num_workers=1,
                                     min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros])/min_rollouts

    return mean_ret