# Algorithm
algo_hparam = dict(
    max_iter=1000 * env.max_steps,
    memory_size=100 * env.max_steps,
    gamma=0.995,
    num_batch_updates=1,
    tau=0.995,
    ent_coeff_init=0.2,
    learn_ent_coeff=True,
    target_update_intvl=5,
    standardize_rew=False,
    min_steps=1,
    batch_size=256,
    num_workers=4,
    lr=3e-4,
)
algo = SAC(ex_dir, env, policy, qfcn_1, qfcn_2, **algo_hparam)

# Save the hyper-parameters
save_list_of_dicts_to_yaml(
    [
        dict(env=env_hparams, seed=args.seed),
        dict(policy=policy_hparam),
        dict(qfcn=qfcn_hparam),
        dict(algo=algo_hparam, algo_name=algo.name),
    ],
    ex_dir,
)

# Train
algo.train(seed=args.seed)
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(physicsEngine="Bullet", dt=1 / 100.0, max_steps=500)
    env = BallOnPlate2DSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        shared_hidden_sizes=trial.suggest_categorical(
            "shared_hidden_sizes_policy", [(16, 16), (32, 32), (64, 64), (16, 16, 16), (32, 32, 32)]
        ),
        shared_hidden_nonlin=fcn_from_str(
            trial.suggest_categorical("shared_hidden_nonlin_policy", ["to_tanh", "to_relu"])
        ),
    )
    policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam)

    # Critic
    qfcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical(
            "hidden_sizes_critic", [(16, 16), (32, 32), (64, 64), (16, 16, 16), (32, 32, 32)]
        ),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical("hidden_nonlin_critic", ["to_tanh", "to_relu"])
        ),
    )
    obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
    qfcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **qfcn_hparam)
    qfcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **qfcn_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=100 * env.max_steps,
        min_steps=trial.suggest_categorical("min_steps_algo", [1]),  # 10, env.max_steps, 10*env.max_steps
        memory_size=trial.suggest_loguniform("memory_size_algo", 1e2 * env.max_steps, 1e4 * env.max_steps),
        tau=trial.suggest_uniform("tau_algo", 0.99, 1.0),
        ent_coeff_init=trial.suggest_uniform("ent_coeff_init_algo", 0.1, 0.9),
        learn_ent_coeff=trial.suggest_categorical("learn_ent_coeff_algo", [True, False]),
        standardize_rew=trial.suggest_categorical("standardize_rew_algo", [False]),
        gamma=trial.suggest_uniform("gamma_algo", 0.99, 1.0),
        target_update_intvl=trial.suggest_categorical("target_update_intvl_algo", [1, 5]),
        num_updates_per_step=trial.suggest_categorical("num_batch_updates_algo", [1, 5]),
        batch_size=trial.suggest_categorical("batch_size_algo", [128, 256, 512]),
        lr=trial.suggest_loguniform("lr_algo", 1e-5, 1e-3),
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f"trial_{trial.number}"))
    algo = SAC(study_dir, env, policy, qfcn_1, qfcn_2, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode="latest", seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env, policy, num_workers=1, min_rollouts=min_rollouts
    )  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
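
# The docstring above notes that Optuna passes only the `trial` argument, so the remaining
# arguments are bound with `functools.partial`. Below is a minimal sketch of how this objective
# could be wired into an Optuna study. The study name, output directory, trial budget, and seed
# are illustrative assumptions, not taken from the original script; imports are repeated so the
# sketch is self-contained.
import functools
import os.path as osp

import optuna

if __name__ == "__main__":
    study_dir = osp.join("experiments", "hopt_sac_bop2d")  # hypothetical output directory
    seed = 0  # hypothetical seed

    # Maximize the mean undiscounted return computed by train_and_eval
    study = optuna.create_study(study_name="sac_bop2d_hopt", direction="maximize")

    # Bind the custom arguments so Optuna sees a single-argument objective
    study.optimize(
        functools.partial(train_and_eval, study_dir=study_dir, seed=seed),
        n_trials=100,  # hypothetical trial budget
        n_jobs=4,  # parallelize across trials, matching num_workers=1 inside the objective
    )

    print(f"Best value {study.best_value} with hyper-parameters {study.best_params}")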