def test_sprl(ex_dir, env: SimEnv, optimize_mean: bool):
    pyrado.set_seed(0)

    env = ActNormWrapper(env)
    env_sprl_params = [
        dict(
            name="gravity_const",
            target_mean=to.tensor([9.81]),
            target_cov_chol_flat=to.tensor([1.0]),
            init_mean=to.tensor([9.81]),
            init_cov_chol_flat=to.tensor([0.05]),
        )
    ]
    randomizer = DomainRandomizer(*[SelfPacedDomainParam(**p) for p in env_sprl_params])
    env = DomainRandWrapperLive(env, randomizer=randomizer)

    policy = FNNPolicy(env.spec, hidden_sizes=[64, 64], hidden_nonlin=to.tanh)

    vfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.relu)
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9844534412010116,
        lamda=0.9710614403461155,
        num_epoch=10,
        batch_size=150,
        standardize_adv=False,
        lr=0.00016985313083236645,
    )
    critic = GAE(vfcn, **critic_hparam)

    subrtn_hparam = dict(
        max_iter=1,
        eps_clip=0.12648736789309026,
        min_steps=10 * env.max_steps,
        num_epoch=3,
        batch_size=150,
        std_init=0.7573286998997557,
        lr=6.999956625305722e-04,
        max_grad_norm=1.0,
        num_workers=1,
    )

    algo_hparam = dict(
        kl_constraints_ub=8000,
        performance_lower_bound=500,
        std_lower_bound=0.4,
        kl_threshold=200,
        max_iter=1,
        optimize_mean=optimize_mean,
    )

    algo = SPRL(env, PPO(ex_dir, env, policy, critic, **subrtn_hparam), **algo_hparam)
    algo.train(snapshot_mode="latest")

    assert algo.curr_iter == algo.max_iter
def test_arpl(ex_dir, env: SimEnv):
    pyrado.set_seed(0)

    env = ActNormWrapper(env)
    env = StateAugmentationWrapper(env, domain_param=None)

    policy = FNNPolicy(env.spec, hidden_sizes=[16, 16], hidden_nonlin=to.tanh)

    vfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.tanh)
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9844534412010116,
        lamda=0.9710614403461155,
        num_epoch=10,
        batch_size=150,
        standardize_adv=False,
        lr=0.00016985313083236645,
    )
    critic = GAE(vfcn, **critic_hparam)

    algo_hparam = dict(
        max_iter=2,
        min_steps=23 * env.max_steps,
        min_rollouts=None,
        num_epoch=5,
        eps_clip=0.085,
        batch_size=150,
        std_init=0.995,
        lr=2e-4,
        num_workers=1,
    )
    arpl_hparam = dict(
        max_iter=2,
        steps_num=23 * env.max_steps,
        halfspan=0.05,
        dyn_eps=0.07,
        dyn_phi=0.25,
        obs_phi=0.1,
        obs_eps=0.05,
        proc_phi=0.1,
        proc_eps=0.03,
        torch_observation=True,
    )
    ppo = PPO(ex_dir, env, policy, critic, **algo_hparam)
    algo = ARPL(ex_dir, env, ppo, policy, ppo.expl_strat, **arpl_hparam)

    algo.train(snapshot_mode="best")
def test_adr_reward_generator(env):
    reference_env = env
    random_env = deepcopy(env)
    reward_generator = RewardGenerator(
        env_spec=random_env.spec,
        batch_size=256,
        reward_multiplier=1,
        lr=5e-3,
    )
    policy = FNNPolicy(reference_env.spec, hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
    dr = create_default_randomizer_omo()
    dr.randomize(num_samples=1)
    random_env.domain_param = dr.get_params(fmt="dict", dtype="numpy")
    reference_sampler = ParallelRolloutSampler(reference_env, policy, num_workers=1, min_steps=1000)
    random_sampler = ParallelRolloutSampler(random_env, policy, num_workers=1, min_steps=1000)

    losses = []
    for _ in range(200):
        reference_traj = StepSequence.concat(reference_sampler.sample())
        random_traj = StepSequence.concat(random_sampler.sample())
        losses.append(reward_generator.train(reference_traj, random_traj, 10))

    # The discriminator loss should have decreased over the course of training
    assert losses[-1] < losses[0]
def test_actor_critic(ex_dir, env: SimEnv, policy: Policy, algo, algo_hparam, vfcn_type, use_cuda):
    pyrado.set_seed(0)

    if use_cuda:
        policy._device = "cuda"
        policy = policy.to(device="cuda")

    # Create value function
    if vfcn_type == "fnn-plain":
        vfcn = FNN(
            input_size=env.obs_space.flat_dim,
            output_size=1,
            hidden_sizes=[16, 16],
            hidden_nonlin=to.tanh,
            use_cuda=use_cuda,
        )
    elif vfcn_type == FNNPolicy.name:
        vf_spec = EnvSpec(env.obs_space, ValueFunctionSpace)
        vfcn = FNNPolicy(vf_spec, hidden_sizes=[16, 16], hidden_nonlin=to.tanh, use_cuda=use_cuda)
    elif vfcn_type == RNNPolicy.name:
        vf_spec = EnvSpec(env.obs_space, ValueFunctionSpace)
        vfcn = RNNPolicy(vf_spec, hidden_size=16, num_recurrent_layers=1, use_cuda=use_cuda)
    else:
        raise NotImplementedError

    # Create critic
    critic_hparam = dict(
        gamma=0.98,
        lamda=0.95,
        batch_size=32,
        lr=1e-3,
        standardize_adv=False,
    )
    critic = GAE(vfcn, **critic_hparam)

    # Common hyper-parameters
    common_hparam = dict(max_iter=2, min_rollouts=3, num_workers=1)
    # Add specific hyper-parameters if any
    common_hparam.update(algo_hparam)

    # Create algorithm and train
    algo = algo(ex_dir, env, policy, critic, **common_hparam)
    algo.train()
    assert algo.curr_iter == algo.max_iter
def test_pddr(ex_dir, env: SimEnv, policy, algo_hparam):
    pyrado.set_seed(0)

    # Create algorithm and train
    teacher_policy = deepcopy(policy)
    critic = GAE(
        vfcn=FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
    )
    teacher_algo_hparam = dict(critic=critic, min_steps=1500, max_iter=2)
    teacher_algo = PPO

    # Wrapper
    randomizer = create_default_randomizer(env)
    env = DomainRandWrapperLive(env, randomizer)

    # Subroutine
    algo_hparam = dict(
        max_iter=2,
        min_steps=env.max_steps,
        std_init=0.15,
        num_epochs=10,
        num_teachers=2,
        teacher_policy=teacher_policy,
        teacher_algo=teacher_algo,
        teacher_algo_hparam=teacher_algo_hparam,
        num_workers=1,
    )
    algo = PDDR(ex_dir, env, policy, **algo_hparam)

    algo.train()

    assert algo.curr_iter == algo.max_iter

    # Save and load
    algo.save_snapshot(meta_info=None)
    algo_loaded = Algorithm.load_snapshot(load_dir=ex_dir)
    assert isinstance(algo_loaded, Algorithm)
    policy_loaded = algo_loaded.policy

    # Check
    assert all(algo.policy.param_values == policy_loaded.param_values)

    # Load the experiment. Since we did not save any hyper-parameters, we ignore the errors when loading.
    env, policy, extra = load_experiment(ex_dir)
    assert isinstance(env, Env)
    assert isinstance(policy, Policy)
    assert isinstance(extra, dict)
def test_spota_ppo(ex_dir, env: SimEnv, spota_hparam: dict):
    pyrado.set_seed(0)

    # Environment and domain randomization
    randomizer = create_default_randomizer(env)
    env = DomainRandWrapperBuffer(env, randomizer)

    # Policy and subroutines
    policy = FNNPolicy(env.spec, [16, 16], hidden_nonlin=to.tanh)
    vfcn = FNN(input_size=env.obs_space.flat_dim, output_size=1, hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
    critic_hparam = dict(gamma=0.998, lamda=0.95, num_epoch=3, batch_size=64, lr=1e-3)
    critic_cand = GAE(vfcn, **critic_hparam)
    critic_refs = GAE(deepcopy(vfcn), **critic_hparam)

    subrtn_hparam_common = dict(
        # min_rollouts=0,  # will be overwritten by SPOTA
        min_steps=0,  # will be overwritten by SPOTA
        max_iter=2,
        num_epoch=3,
        eps_clip=0.1,
        batch_size=64,
        num_workers=1,
        std_init=0.5,
        lr=1e-2,
    )

    sr_cand = PPO(ex_dir, env, policy, critic_cand, **subrtn_hparam_common)
    sr_refs = PPO(ex_dir, env, deepcopy(policy), critic_refs, **subrtn_hparam_common)

    # Create algorithm and train
    algo = SPOTA(ex_dir, env, sr_cand, sr_refs, **spota_hparam)
    algo.train()

    assert algo.curr_iter == algo.max_iter or algo.stopping_criterion_met()
# Environment
env_hparams = dict(physicsEngine="Bullet", dt=1 / 100.0, max_steps=500)
env = BallOnPlate2DSim(**env_hparams)
env = ActNormWrapper(env)

# Policy
policy_hparam = dict(
    shared_hidden_sizes=[32, 32],
    shared_hidden_nonlin=to.relu,
)
policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam)

# Critic
qfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.relu)
obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
qfcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **qfcn_hparam)
qfcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **qfcn_hparam)

# Algorithm
algo_hparam = dict(
    max_iter=1000 * env.max_steps,
    memory_size=1000 * env.max_steps,
    gamma=0.995,
    num_updates_per_step=1,
    tau=0.99,
    ent_coeff_init=0.2,
    learn_ent_coeff=False,
    target_update_intvl=1,
    standardize_rew=False,
    min_steps=1,
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(physicsEngine="Bullet", dt=1 / 100.0, max_steps=500)
    env = BallOnPlate2DSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        shared_hidden_sizes=trial.suggest_categorical(
            "shared_hidden_sizes_policy", [(16, 16), (32, 32), (64, 64), (16, 16, 16), (32, 32, 32)]
        ),
        shared_hidden_nonlin=fcn_from_str(
            trial.suggest_categorical("shared_hidden_nonlin_policy", ["to_tanh", "to_relu"])
        ),
    )
    policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam)

    # Critic
    qfcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical(
            "hidden_sizes_critic", [(16, 16), (32, 32), (64, 64), (16, 16, 16), (32, 32, 32)]
        ),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical("hidden_nonlin_critic", ["to_tanh", "to_relu"])),
    )
    obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
    qfcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **qfcn_hparam)
    qfcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **qfcn_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=100 * env.max_steps,
        min_steps=trial.suggest_categorical("min_steps_algo", [1]),  # 10, env.max_steps, 10*env.max_steps
        memory_size=trial.suggest_loguniform("memory_size_algo", 1e2 * env.max_steps, 1e4 * env.max_steps),
        tau=trial.suggest_uniform("tau_algo", 0.99, 1.0),
        ent_coeff_init=trial.suggest_uniform("ent_coeff_init_algo", 0.1, 0.9),
        learn_ent_coeff=trial.suggest_categorical("learn_ent_coeff_algo", [True, False]),
        standardize_rew=trial.suggest_categorical("standardize_rew_algo", [False]),
        gamma=trial.suggest_uniform("gamma_algo", 0.99, 1.0),
        target_update_intvl=trial.suggest_categorical("target_update_intvl_algo", [1, 5]),
        num_updates_per_step=trial.suggest_categorical("num_batch_updates_algo", [1, 5]),
        batch_size=trial.suggest_categorical("batch_size_algo", [128, 256, 512]),
        lr=trial.suggest_loguniform("lr_algo", 1e-5, 1e-3),
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f"trial_{trial.number}"))
    algo = SAC(study_dir, env, policy, qfcn_1, qfcn_2, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode="latest", seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env, policy, num_workers=1, min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
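# The docstring above notes that Optuna calls the objective with only the `trial` argument, so the extra
# arguments are bound via `functools.partial`. Below is a minimal launcher sketch, assuming a `study_dir`
# variable and a parsed `args` object with a `seed` attribute exist in the calling script (both hypothetical
# placeholders, as is the study name); it is not the exact launcher from the original script.
import functools

import optuna

study = optuna.create_study(
    study_name="sac_ball_on_plate_hparam_search",  # hypothetical name
    direction="maximize",
)
study.optimize(functools.partial(train_and_eval, study_dir=study_dir, seed=args.seed), n_trials=100, n_jobs=16)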
policy_hparam = dict(
    # feats=FeatureStack(
    #     [
    #         RFFeat(
    #             env.obs_space.flat_dim, num_feat_per_dim=500, bandwidth=1/env.obs_space.bound_up, use_cuda=True
    #         )
    #     ]
    # )
    # feats=FeatureStack(RBFFeat(num_feat_per_dim=20, bounds=env.obs_space.bounds, scale=None, use_cuda=True))
    feats=FeatureStack(identity_feat, sin_feat)
)
policy = LinearPolicy(spec=env.spec, **policy_hparam, use_cuda=True)

# Critic
vfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.tanh)
vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam, use_cuda=True)
critic_hparam = dict(
    gamma=0.99,
    lamda=0.95,
    batch_size=100,
    standardize_adv=True,
    lr_scheduler=lr_scheduler.ExponentialLR,
    lr_scheduler_hparam=dict(gamma=0.99),
)
critic = GAE(vfcn, **critic_hparam)

# Algorithm
algo_hparam = dict(
    max_iter=500,
    min_steps=env.max_steps * 10,
def fnn_policy(env: Env):
    return FNNPolicy(env.spec, hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
class RewardGenerator:
    """
    Class for generating the discriminator rewards in ADR. Generates a reward using a trained discriminator network.
    """

    def __init__(
        self,
        env_spec: EnvSpec,
        batch_size: int,
        reward_multiplier: float,
        lr: float = 3e-3,
        logger: StepLogger = None,
        device: str = "cuda" if to.cuda.is_available() else "cpu",
    ):
        """
        Constructor

        :param env_spec: environment specification
        :param batch_size: batch size for each update step
        :param reward_multiplier: factor for the predicted probability
        :param lr: learning rate
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        self.device = device
        self.batch_size = batch_size
        self.reward_multiplier = reward_multiplier
        self.lr = lr
        spec = EnvSpec(
            obs_space=BoxSpace.cat([env_spec.obs_space, env_spec.obs_space, env_spec.act_space]),
            act_space=BoxSpace(bound_lo=[0], bound_up=[1]),
        )
        self.discriminator = FNNPolicy(spec=spec, hidden_nonlin=to.tanh, hidden_sizes=[62], output_nonlin=to.sigmoid)
        self.loss_fcn = nn.BCELoss()
        self.optimizer = to.optim.Adam(self.discriminator.parameters(), lr=lr, eps=1e-5)
        self.logger = logger

    def get_reward(self, traj: StepSequence):
        traj = preprocess_rollout(traj)
        with to.no_grad():
            reward = self.discriminator.forward(traj).cpu()
            return to.log(reward.mean()) * self.reward_multiplier

    def train(
        self, reference_trajectory: StepSequence, randomized_trajectory: StepSequence, num_epoch: int
    ) -> to.Tensor:
        reference_batch = reference_trajectory.split_shuffled_batches(self.batch_size)
        random_batch = randomized_trajectory.split_shuffled_batches(self.batch_size)
        loss = None
        for _ in tqdm(range(num_epoch), "Discriminator Epoch", num_epoch):
            try:
                reference_batch_now = preprocess_rollout(next(reference_batch))
                random_batch_now = preprocess_rollout(next(random_batch))
            except StopIteration:
                break
            if reference_batch_now.shape[0] < self.batch_size - 1 or random_batch_now.shape[0] < self.batch_size - 1:
                break
            random_results = self.discriminator(random_batch_now)
            reference_results = self.discriminator(reference_batch_now)
            self.optimizer.zero_grad()
            # Randomized samples are labeled 1, reference samples are labeled 0
            loss = self.loss_fcn(random_results, to.ones(self.batch_size - 1, 1)) + self.loss_fcn(
                reference_results, to.zeros(self.batch_size - 1, 1)
            )
            loss.backward()
            self.optimizer.step()

            # Logging
            if self.logger is not None:
                self.logger.add_value("discriminator_loss", loss)

        return loss
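# A minimal usage sketch of the class above. Assumptions: `env` is a simulation environment providing `env.spec`,
# and `reference_rollout` / `randomized_rollout` are hypothetical StepSequence rollouts recorded in the nominal
# and the randomized environment, respectively. The shaping of a bonus via `get_reward` mirrors how ADR consumes
# the discriminator output; the surrounding loop is not shown here.
reward_generator = RewardGenerator(env_spec=env.spec, batch_size=256, reward_multiplier=1.0, lr=5e-3)
discr_loss = reward_generator.train(
    reference_trajectory=reference_rollout, randomized_trajectory=randomized_rollout, num_epoch=10
)
shaped_bonus = reward_generator.get_reward(randomized_rollout)  # log of the mean predicted probability, scaled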
def test_simopt_cem_ppo(ex_dir, env: SimEnv):
    pyrado.set_seed(0)

    # Environments
    env_real = deepcopy(env)
    env_real = ActNormWrapper(env_real)
    env_sim = ActNormWrapper(env)
    randomizer = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole", mean=0.0, std=1e6, clip_lo=1e-3),
        NormalDomainParam(name="mass_pend_pole", mean=0.0, std=1e6, clip_lo=1e-3),
        NormalDomainParam(name="length_rot_pole", mean=0.0, std=1e6, clip_lo=1e-3),
        NormalDomainParam(name="length_pend_pole", mean=0.0, std=1e6, clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ("mass_rot_pole", "mean"),
        1: ("mass_rot_pole", "std"),
        2: ("mass_pend_pole", "mean"),
        3: ("mass_pend_pole", "std"),
        4: ("length_rot_pole", "mean"),
        5: ("length_rot_pole", "std"),
        6: ("length_pend_pole", "mean"),
        7: ("length_pend_pole", "std"),
    }
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Subroutine for policy improvement
    behav_policy_hparam = dict(hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
    behav_policy = FNNPolicy(spec=env_sim.spec, **behav_policy_hparam)
    vfcn_hparam = dict(hidden_sizes=[16, 16], hidden_nonlin=to.relu)
    vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.99,
        lamda=0.98,
        num_epoch=2,
        batch_size=128,
        standardize_adv=True,
        lr=8e-4,
        max_grad_norm=5.0,
    )
    critic = GAE(vfcn, **critic_hparam)
    subrtn_policy_hparam = dict(
        max_iter=2,
        eps_clip=0.13,
        min_steps=4 * env_sim.max_steps,
        num_epoch=3,
        batch_size=128,
        std_init=0.75,
        lr=3e-04,
        max_grad_norm=1.0,
        num_workers=1,
    )
    subrtn_policy = PPO(ex_dir, env_sim, behav_policy, critic, **subrtn_policy_hparam)

    # Subroutine for system identification
    prior = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole", mean=0.095, std=0.095 / 10),
        NormalDomainParam(name="mass_pend_pole", mean=0.024, std=0.024 / 10),
        NormalDomainParam(name="length_rot_pole", mean=0.085, std=0.085 / 10),
        NormalDomainParam(name="length_pend_pole", mean=0.129, std=0.129 / 10),
    )
    ddp_policy_hparam = dict(mapping=dp_map, trafo_mask=trafo_mask, scale_params=True)
    ddp_policy = DomainDistrParamPolicy(prior=prior, **ddp_policy_hparam)
    subsubrtn_distr_hparam = dict(
        max_iter=2,
        pop_size=10,
        num_init_states_per_domain=1,
        num_is_samples=8,
        expl_std_init=1e-2,
        expl_std_min=1e-5,
        extra_expl_std_init=1e-2,
        extra_expl_decay_iter=5,
        num_workers=1,
    )
    subsubrtn_distr = CEM(ex_dir, env_sim, ddp_policy, **subsubrtn_distr_hparam)
    subrtn_distr_hparam = dict(
        metric=None,
        obs_dim_weight=[1, 1, 1, 1, 10, 10],
        num_rollouts_per_distr=3,
        num_workers=1,
    )
    subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr, behavior_policy=behav_policy, **subrtn_distr_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=1,
        num_eval_rollouts=5,
        warmstart=True,
    )
    algo = SimOpt(ex_dir, env_sim, env_real, subrtn_policy, subrtn_distr, **algo_hparam)
    algo.train()

    assert algo.curr_iter == algo.max_iter
def test_basic_meta(ex_dir, policy, env: SimEnv, algo, algo_hparam: dict):
    pyrado.set_seed(0)

    # Policy and subroutine
    env = GaussianObsNoiseWrapper(
        env,
        noise_std=[
            1 / 180 * np.pi,
            1 / 180 * np.pi,
            0.0025,
            0.0025,
            2 / 180 * np.pi,
            2 / 180 * np.pi,
            0.05,
            0.05,
        ],
    )
    env = ActNormWrapper(env)
    env = ActDelayWrapper(env)
    randomizer = create_default_randomizer_qbb()
    randomizer.add_domain_params(UniformDomainParam(name="act_delay", mean=15, halfspan=15, clip_lo=0, roundint=True))
    env = DomainRandWrapperLive(env, randomizer)

    # Policy
    policy_hparam = dict(hidden_sizes=[16, 16], hidden_nonlin=to.tanh)  # FNN
    policy = FNNPolicy(spec=env.spec, **policy_hparam)

    # Critic
    vfcn_hparam = dict(hidden_sizes=[16, 16], hidden_nonlin=to.tanh)  # FNN
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9995,
        lamda=0.98,
        num_epoch=2,
        batch_size=64,
        lr=5e-4,
        standardize_adv=False,
    )
    critic = GAE(vfcn, **critic_hparam)

    subrtn_hparam = dict(
        max_iter=3,
        min_rollouts=5,
        num_epoch=2,
        eps_clip=0.1,
        batch_size=64,
        std_init=0.8,
        lr=2e-4,
        num_workers=1,
    )
    subrtn = PPO(ex_dir, env, policy, critic, **subrtn_hparam)
    algo = algo(env, subrtn, **algo_hparam)

    algo.train()
    assert algo.curr_iter == algo.max_iter
def fnn_policy_cuda(env: Env):
    return FNNPolicy(env.spec, hidden_sizes=[16, 16], hidden_nonlin=to.tanh, use_cuda=True)
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env = QBallBalancerSim(dt=1 / 250.0, max_steps=1500)
    env = ActNormWrapper(env)

    # Learning rate scheduler
    lrs_gamma = trial.suggest_categorical("exp_lr_scheduler_gamma", [None, 0.99, 0.995, 0.999])
    if lrs_gamma is not None:
        lr_sched = lr_scheduler.ExponentialLR
        lr_sched_hparam = dict(gamma=lrs_gamma)
    else:
        lr_sched, lr_sched_hparam = None, dict()

    # Policy
    policy = FNNPolicy(
        spec=env.spec,
        hidden_sizes=trial.suggest_categorical("hidden_sizes_policy", [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical("hidden_nonlin_policy", ["to_tanh", "to_relu"])),
    )

    # Critic
    vfcn = FNN(
        input_size=env.obs_space.flat_dim,
        output_size=1,
        hidden_sizes=trial.suggest_categorical("hidden_sizes_critic", [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical("hidden_nonlin_critic", ["to_tanh", "to_relu"])),
    )
    critic_hparam = dict(
        batch_size=250,
        gamma=trial.suggest_uniform("gamma_critic", 0.99, 1.0),
        lamda=trial.suggest_uniform("lamda_critic", 0.95, 1.0),
        num_epoch=trial.suggest_int("num_epoch_critic", 1, 10),
        lr=trial.suggest_loguniform("lr_critic", 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical("standardize_adv_critic", [True, False]),
        max_grad_norm=trial.suggest_categorical("max_grad_norm_critic", [None, 1.0, 5.0]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    critic = GAE(vfcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=300,
        batch_size=250,
        min_steps=trial.suggest_int("num_rollouts_algo", 10, 30) * env.max_steps,
        num_epoch=trial.suggest_int("num_epoch_algo", 1, 10),
        eps_clip=trial.suggest_uniform("eps_clip_algo", 0.05, 0.2),
        std_init=trial.suggest_uniform("std_init_algo", 0.5, 1.0),
        lr=trial.suggest_loguniform("lr_algo", 1e-5, 1e-3),
        max_grad_norm=trial.suggest_categorical("max_grad_norm_algo", [None, 1.0, 5.0]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    algo = PPO(osp.join(study_dir, f"trial_{trial.number}"), env, policy, critic, **algo_hparam)

    # Train without saving the results
    algo.train(snapshot_mode="latest", seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env, policy, num_workers=1, min_rollouts=min_rollouts)
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
dp_map = {
    0: ("mass_rot_pole", "mean"),
    1: ("mass_rot_pole", "std"),
    2: ("mass_pend_pole", "mean"),
    3: ("mass_pend_pole", "std"),
    4: ("length_rot_pole", "mean"),
    5: ("length_rot_pole", "std"),
    6: ("length_pend_pole", "mean"),
    7: ("length_pend_pole", "std"),
}
trafo_mask = [False, True, False, True, False, True, False, True]
env_sim = MetaDomainRandWrapper(env_sim, dp_map)

# Subroutine for policy improvement
behav_policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
behav_policy = FNNPolicy(spec=env_sim.spec, **behav_policy_hparam)
vfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.relu)
vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace), **vfcn_hparam)
critic_hparam = dict(
    gamma=0.9844224855479998,
    lamda=0.9700148505302241,
    num_epoch=5,
    batch_size=500,
    standardize_adv=False,
    lr=7.058326426522811e-4,
    max_grad_norm=6.0,
    lr_scheduler=lr_scheduler.ExponentialLR,
    lr_scheduler_hparam=dict(gamma=0.999),
)
critic = GAE(vfcn, **critic_hparam)
def __init__(
    self,
    save_dir: pyrado.PathLike,
    env: Env,
    particle_hparam: dict,
    max_iter: int,
    num_particles: int,
    temperature: float,
    lr: float,
    horizon: int,
    std_init: float = 1.0,
    min_rollouts: int = None,
    min_steps: int = 10000,
    num_workers: int = 4,
    serial: bool = True,
    logger: StepLogger = None,
):
    """
    Constructor

    :param save_dir: directory to save the snapshots i.e. the results in
    :param env: the environment which the policy operates
    :param particle_hparam: hyper-parameters for particle template construction
    :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
    :param num_particles: number of distinct particles
    :param temperature: the temperature of the SVGD determines how jointly the training takes place
    :param lr: the learning rate for the update of the particles
    :param horizon: horizon for each particle
    :param std_init: initial standard deviation for the exploration
    :param min_rollouts: minimum number of rollouts sampled per policy update batch
    :param min_steps: minimum number of state transitions sampled per policy update batch
    :param num_workers: number of environments for parallel sampling
    :param serial: serial mode can be switched off which can be used to partly control the flow of SVPG from outside
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    """
    if not isinstance(env, Env):
        raise pyrado.TypeErr(given=env, expected_type=Env)
    if not isinstance(particle_hparam, dict):
        raise pyrado.TypeErr(given=particle_hparam, expected_type=dict)
    if not all([key in particle_hparam for key in ["actor", "vfcn", "critic"]]):
        raise AttributeError

    # Call Algorithm's constructor
    super().__init__(save_dir, max_iter, policy=None, logger=logger)

    # Store the inputs
    self._env = env
    self.num_particles = num_particles
    self.horizon = horizon
    self.lr = lr
    self.temperature = temperature
    self.serial = serial

    # Prepare placeholders for particles
    self.particles = [None] * num_particles
    self.particleSteps = [None] * num_particles
    self.expl_strats = [None] * num_particles
    self.optimizers = [None] * num_particles
    self.fixed_particles = [None] * num_particles
    self.fixed_expl_strats = [None] * num_particles
    self.samplers = [None] * num_particles
    self.count = 0
    self.update_count = 0

    # Particle factory
    actor = FNNPolicy(spec=env.spec, **particle_hparam["actor"])
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **particle_hparam["vfcn"])
    critic = GAE(vfcn, **particle_hparam["critic"])
    self.register_as_logger_parent(critic)
    particle = SVPGParticle(env.spec, actor, critic)

    for i in range(self.num_particles):
        self.particles[i] = deepcopy(particle)
        self.particles[i].init_param()
        self.expl_strats[i] = NormalActNoiseExplStrat(self.particles[i].actor, std_init)
        self.optimizers[i] = to.optim.Adam(self.expl_strats[i].parameters(), lr=self.lr)
        self.fixed_particles[i] = deepcopy(self.particles[i])
        self.fixed_expl_strats[i] = deepcopy(self.expl_strats[i])
        self.particleSteps[i] = 0
        if self.serial:
            self.samplers[i] = ParallelRolloutSampler(
                env, self.expl_strats[i], num_workers, min_rollouts=min_rollouts, min_steps=min_steps
            )
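# The constructor above requires `particle_hparam` to provide the sub-dicts "actor", "vfcn", and "critic",
# which are unpacked into FNNPolicy and GAE. A minimal instantiation sketch, assuming this is the constructor
# of pyrado's SVPG class and that `ex_dir` and `env` are available; the concrete hyper-parameter values are
# hypothetical placeholders, not the values used in the original experiments.
particle_hparam = dict(
    actor=dict(hidden_sizes=[16, 16], hidden_nonlin=to.tanh),
    vfcn=dict(hidden_sizes=[16, 16], hidden_nonlin=to.tanh),
    critic=dict(gamma=0.99, lamda=0.95),
)
svpg = SVPG(
    save_dir=ex_dir,
    env=env,
    particle_hparam=particle_hparam,
    max_iter=5,
    num_particles=3,
    temperature=1.0,
    lr=1e-3,
    horizon=50,
    num_workers=1,
)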
def test_snapshots_notmeta(ex_dir, env: SimEnv, policy, algo_class, algo_hparam):
    # Collect hyper-parameters, create algorithm, and train
    common_hparam = dict(max_iter=1, num_workers=1)
    common_hparam.update(algo_hparam)

    if issubclass(algo_class, ActorCritic):
        common_hparam.update(
            min_rollouts=3,
            critic=GAE(
                vfcn=FNNPolicy(
                    spec=EnvSpec(env.obs_space, ValueFunctionSpace), hidden_sizes=[16, 16], hidden_nonlin=to.tanh
                )
            ),
        )
    elif issubclass(algo_class, ParameterExploring):
        common_hparam.update(num_init_states_per_domain=1)
    elif issubclass(algo_class, (DQL, SAC)):
        common_hparam.update(memory_size=1000, num_updates_per_step=2, gamma=0.99, min_rollouts=1)
        fnn_hparam = dict(hidden_sizes=[8, 8], hidden_nonlin=to.tanh)
        if issubclass(algo_class, DQL):
            # Override the setting
            env = BallOnBeamDiscSim(env.dt, env.max_steps)
            net = FNN(
                input_size=DiscreteActQValPolicy.get_qfcn_input_size(env.spec),
                output_size=DiscreteActQValPolicy.get_qfcn_output_size(),
                **fnn_hparam,
            )
            policy = DiscreteActQValPolicy(spec=env.spec, net=net)
        else:
            # Override the setting
            env = ActNormWrapper(env)
            policy = TwoHeadedGRUPolicy(env.spec, shared_hidden_size=8, shared_num_recurrent_layers=1)
            obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
            common_hparam.update(qfcn_1=FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **fnn_hparam))
            common_hparam.update(qfcn_2=FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **fnn_hparam))
    else:
        raise NotImplementedError

    # Simulate training
    algo = algo_class(ex_dir, env, policy, **common_hparam)
    algo.policy.param_values += to.tensor([42.0])
    if isinstance(algo, ActorCritic):
        algo.critic.vfcn.param_values += to.tensor([42.0])

    # Save and load
    algo.save_snapshot(meta_info=None)
    algo_loaded = Algorithm.load_snapshot(load_dir=ex_dir)
    assert isinstance(algo_loaded, Algorithm)
    policy_loaded = algo_loaded.policy
    if isinstance(algo, ActorCritic):
        critic_loaded = algo_loaded.critic

    # Check
    assert all(algo.policy.param_values == policy_loaded.param_values)
    if isinstance(algo, ActorCritic):
        assert all(algo.critic.vfcn.param_values == critic_loaded.vfcn.param_values)

    # Load the experiment. Since we did not save any hyper-parameters, we ignore the errors when loading.
    env, policy, extra = load_experiment(ex_dir)
    assert isinstance(env, Env)
    assert isinstance(policy, Policy)
    assert isinstance(extra, dict)
f"{PPO.name}_{FNNPolicy.name}", f"{args.frequency}Hz_{args.max_steps}ROLen_{args.ppo_iterations}PPOIter_{args.sprl_iterations}SPRLIter_cov_only{args.cov_only}_seed_{args.seed}", ) # Set seed if desired pyrado.set_seed(args.seed, verbose=True) # Environment env_hparams = dict(dt=1 / float(args.frequency), max_steps=args.max_steps) env = QQubeSwingUpSim(**env_hparams) env = ActNormWrapper(env) # Policy policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh) # FNN # policy_hparam = dict(hidden_size=32, num_recurrent_layers=1) # LSTM & GRU policy = FNNPolicy(spec=env.spec, **policy_hparam) # policy = GRUPolicy(spec=env.spec, **policy_hparam) # Critic vfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.relu) # FNN # vfcn_hparam = dict(hidden_size=32, num_recurrent_layers=1) # LSTM & GRU vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam) # vfcn = GRUPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam) critic_hparam = dict( gamma=0.9844224855479998, lamda=0.9700148505302241, num_epoch=5, batch_size=500, standardize_adv=False, lr=7.058326426522811e-4,
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environments
    env_hparams = dict(dt=1 / 100.0, max_steps=600)
    env_real = QQubeSwingUpSim(**env_hparams)
    env_real.domain_param = dict(
        mass_rot_pole=0.095 * 0.9,  # 0.095*0.9 = 0.0855
        mass_pend_pole=0.024 * 1.1,  # 0.024*1.1 = 0.0264
        length_rot_pole=0.085 * 0.9,  # 0.085*0.9 = 0.0765
        length_pend_pole=0.129 * 1.1,  # 0.129*1.1 = 0.1419
    )

    env_sim = QQubeSwingUpSim(**env_hparams)
    randomizer = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole", mean=0.0, std=1e6, clip_lo=1e-3),
        NormalDomainParam(name="mass_pend_pole", mean=0.0, std=1e6, clip_lo=1e-3),
        NormalDomainParam(name="length_rot_pole", mean=0.0, std=1e6, clip_lo=1e-3),
        NormalDomainParam(name="length_pend_pole", mean=0.0, std=1e6, clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ("mass_rot_pole", "mean"),
        1: ("mass_rot_pole", "std"),
        2: ("mass_pend_pole", "mean"),
        3: ("mass_pend_pole", "std"),
        4: ("length_rot_pole", "mean"),
        5: ("length_rot_pole", "std"),
        6: ("length_pend_pole", "mean"),
        7: ("length_pend_pole", "std"),
    }
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Subroutine for policy improvement
    behav_policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    behav_policy = FNNPolicy(spec=env_sim.spec, **behav_policy_hparam)
    vfcn_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9885,
        lamda=0.9648,
        num_epoch=2,
        batch_size=500,
        standardize_adv=False,
        lr=5.792e-4,
        max_grad_norm=1.0,
    )
    critic = GAE(vfcn, **critic_hparam)
    subrtn_policy_hparam = dict(
        max_iter=200,
        min_steps=3 * 23 * env_sim.max_steps,
        num_epoch=7,
        eps_clip=0.0744,
        batch_size=500,
        std_init=0.9074,
        lr=3.446e-04,
        max_grad_norm=1.0,
        num_workers=1,
    )
    subrtn_policy = PPO(study_dir, env_sim, behav_policy, critic, **subrtn_policy_hparam)

    # Subroutine for system identification
    prior_std_denom = trial.suggest_uniform("prior_std_denom", 5, 20)
    prior = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole", mean=0.095, std=0.095 / prior_std_denom),
        NormalDomainParam(name="mass_pend_pole", mean=0.024, std=0.024 / prior_std_denom),
        NormalDomainParam(name="length_rot_pole", mean=0.085, std=0.085 / prior_std_denom),
        NormalDomainParam(name="length_pend_pole", mean=0.129, std=0.129 / prior_std_denom),
    )
    ddp_policy = DomainDistrParamPolicy(
        mapping=dp_map,
        trafo_mask=trafo_mask,
        prior=prior,
        scale_params=trial.suggest_categorical("ddp_policy_scale_params", [True, False]),
    )
    subsubrtn_distr_hparam = dict(
        max_iter=trial.suggest_categorical("subsubrtn_distr_max_iter", [20]),
        pop_size=trial.suggest_int("pop_size", 50, 500),
        num_init_states_per_domain=1,
        num_is_samples=trial.suggest_int("num_is_samples", 5, 20),
        expl_std_init=trial.suggest_loguniform("expl_std_init", 1e-3, 1e-1),
        expl_std_min=trial.suggest_categorical("expl_std_min", [1e-4]),
        extra_expl_std_init=trial.suggest_loguniform("expl_std_init", 1e-3, 1e-1),
        extra_expl_decay_iter=trial.suggest_int("extra_expl_decay_iter", 0, 10),
        num_workers=1,
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f"trial_{trial.number}"))
    subsubrtn_distr = CEM(study_dir, env_sim, ddp_policy, **subsubrtn_distr_hparam, logger=csv_logger)
    obs_vel_weight = trial.suggest_loguniform("obs_vel_weight", 1, 100)
    subrtn_distr_hparam = dict(
        metric=None,
        obs_dim_weight=[1, 1, 1, 1, obs_vel_weight, obs_vel_weight],
        num_rollouts_per_distr=trial.suggest_int("num_rollouts_per_distr", 20, 100),
        num_workers=1,
    )
    subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr, behav_policy, **subrtn_distr_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=trial.suggest_categorical("algo_max_iter", [10]),
        num_eval_rollouts=trial.suggest_categorical("algo_num_eval_rollouts", [5]),
        warmstart=trial.suggest_categorical("algo_warmstart", [True]),
        thold_succ_subrtn=trial.suggest_categorical("algo_thold_succ_subrtn", [50]),
        subrtn_snapshot_mode="latest",
    )
    algo = SimOpt(study_dir, env_sim, env_real, subrtn_policy, subrtn_distr, **algo_hparam, logger=csv_logger)

    # Jeeeha
    algo.train(seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env_real, algo.policy, num_workers=1, min_rollouts=min_rollouts
    )  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret