def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1 / 250., max_steps=1500)
    env = QQubeSwingUpSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(feats=FeatureStack([
        identity_feat, sign_feat, abs_feat, squared_feat, cubic_feat,
        ATan2Feat(1, 2), MultFeat([4, 5])
    ]))
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=50,
        pop_size=trial.suggest_int('pop_size', 50, 200),
        num_rollouts=trial.suggest_int('num_rollouts', 4, 10),
        num_is_samples=trial.suggest_int('num_is_samples', 5, 40),
        expl_std_init=trial.suggest_uniform('expl_std_init', 0.1, 0.5),
        symm_sampling=trial.suggest_categorical('symm_sampling', [True, False]),
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f'trial_{trial.number}'))
    algo = PoWER(osp.join(study_dir, f'trial_{trial.number}'), env, policy, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env, policy, num_workers=1, min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
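# A minimal sketch (not part of the original script) of how `train_and_eval` could be wired
# into an Optuna study, as hinted at in the docstring note above: `functools.partial` fixes
# the custom arguments so Optuna only sees `trial`. The names `study_dir`, `seed`, `n_trials`,
# and `n_jobs` are assumptions here and would come from the surrounding script.
import functools
import optuna

study = optuna.create_study(direction='maximize')
study.optimize(functools.partial(train_and_eval, study_dir=study_dir, seed=seed),
               n_trials=100, n_jobs=8)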
env = DomainRandWrapperLive(env, randomizer)

# Policy
bounds = ([0.0, 0.25, 0.5], [1.0, 1.5, 2.5])
policy_hparam = dict(rbf_hparam=dict(num_feat_per_dim=9, bounds=bounds, scale=None), dim_mask=2)
policy = DualRBFLinearPolicy(env.spec, **policy_hparam)

# Algorithm
algo_hparam = dict(
    max_iter=15,
    pop_size=100,
    num_is_samples=10,
    num_init_states_per_domain=2,
    num_domains=10,
    expl_std_init=np.pi / 12,
    expl_std_min=0.02,
    num_workers=8,
)
algo = PoWER(ex_dir, env, policy, **algo_hparam)

# Save the hyper-parameters
save_dicts_to_yaml(
    dict(env=env_hparams, seed=args.seed),
    dict(policy=policy_hparam),
    dict(algo=algo_hparam, algo_name=algo.name),
    save_dir=ex_dir,
)

# Jeeeha
algo.train(seed=args.seed, snapshot_mode="best")
env = DomainRandWrapperLive(env, randomizer)

# Policy
policy_hparam = dict(rbf_hparam=dict(num_feat_per_dim=10, bounds=(0., 1.), scale=None), dim_mask=2)
policy = DualRBFLinearPolicy(env.spec, **policy_hparam)

# Algorithm
algo_hparam = dict(
    max_iter=50,
    pop_size=200,
    num_is_samples=10,
    num_rollouts=8,
    expl_std_init=np.pi / 12,
    expl_std_min=0.02,
    num_workers=8,
)
algo = PoWER(ex_dir, env, policy, **algo_hparam)

# Save the hyper-parameters
save_list_of_dicts_to_yaml([
    dict(env=env_hparams, seed=args.seed),
    dict(policy=policy_hparam),
    dict(algo=algo_hparam, algo_name=algo.name)
], ex_dir)

# Jeeeha
algo.train(seed=args.seed, snapshot_mode='best')
UniformDomainParam(name="joint_2_damping", mean=9.4057e-03, halfspan=5.0000e-04, clip_lo=1e-6), ) env = DomainRandWrapperLive(env, randomizer) # Policy policy_hparam = hparams["policy"] policy_hparam["rbf_hparam"].update({"scale": None}) policy = DualRBFLinearPolicy(env.spec, **policy_hparam) policy.param_values = to.tensor(hparams["algo"]["policy_param_init"]) # Algorithm algo_hparam = hparams["subroutine"] algo_hparam.update( {"num_workers": 8}) # should be equivalent to the number of cores per job algo = PoWER(ex_dir, env, policy, **algo_hparam) # Save the hyper-parameters save_dicts_to_yaml( dict(env=env_hparams, seed=ex_dir.seed), dict(policy=policy_hparam), dict(algo=algo_hparam, algo_name=algo.name), save_dir=ex_dir, ) # Jeeeha algo.train(seed=ex_dir.seed, snapshot_mode="latest")
        clip_lo=0),
    UniformDomainParam(name='joint_damping', mean=9.4057e-03, halfspan=5.0000e-04, clip_lo=1e-6),
)
env = DomainRandWrapperLive(env, randomizer)

# Policy
policy_hparam = hparams['policy']
policy_hparam['rbf_hparam'].update({'scale': None})
policy = DualRBFLinearPolicy(env.spec, **policy_hparam)
policy.param_values = to.tensor(hparams['algo']['policy_param_init'])

# Algorithm
algo_hparam = hparams['subroutine']
algo_hparam.update({'num_workers': 8})  # should be equivalent to the number of cores per job
algo = PoWER(ex_dir, env, policy, **algo_hparam)

# Save the hyper-parameters
save_list_of_dicts_to_yaml([
    dict(env=env_hparams, seed=ex_dir.seed),
    dict(policy=policy_hparam),
    dict(algo=algo_hparam, algo_name=algo.name)
], ex_dir)

# Jeeeha
algo.train(seed=ex_dir.seed, snapshot_mode='latest')