def get_uniform_masses_lengths_randomizer_qq(frac_halfspan: float):
    """
    Get a uniform randomizer that applies to all masses and lengths of the Quanser Qube, where the half-span of each
    distribution is derived from the respective nominal parameter value.

    :param frac_halfspan: divisor applied to the nominal parameter value to obtain the half-span of the uniform
                          distribution, i.e. larger values yield narrower distributions
    :return: `DomainRandomizer` with uniformly distributed masses and lengths
    """
    from pyrado.environments.pysim.quanser_qube import QQubeSim

    dp_nom = QQubeSim.get_nominal_domain_param()
    return DomainRandomizer(
        UniformDomainParam(name='Mp', mean=dp_nom['Mp'], halfspan=dp_nom['Mp'] / frac_halfspan, clip_lo=1e-3),
        UniformDomainParam(name='Mr', mean=dp_nom['Mr'], halfspan=dp_nom['Mr'] / frac_halfspan, clip_lo=1e-3),
        UniformDomainParam(name='Lr', mean=dp_nom['Lr'], halfspan=dp_nom['Lr'] / frac_halfspan, clip_lo=1e-2),
        UniformDomainParam(name='Lp', mean=dp_nom['Lp'], halfspan=dp_nom['Lp'] / frac_halfspan, clip_lo=1e-2),
    )
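# Usage sketch (not part of the original file): wrap a simulation environment so that the masses and lengths are
# re-drawn from the randomizer above at every reset. The import path of `DomainRandWrapperLive` is an assumption
# inferred from its use in the scripts further below; the constructor call mirrors those scripts.
from pyrado.environment_wrappers.domain_randomization import DomainRandWrapperLive  # assumed import path
from pyrado.environments.pysim.quanser_qube import QQubeSim

env = QQubeSim(dt=1 / 100., max_steps=600)
randomizer = get_uniform_masses_lengths_randomizer_qq(frac_halfspan=10.)
env = DomainRandWrapperLive(env, randomizer)  # masses and lengths are perturbed on every env.reset()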
def create_default_randomizer_qq() -> DomainRandomizer:
    """
    Create the default randomizer for the `QQubeSim`.

    :return: randomizer based on the nominal domain parameter values
    """
    from pyrado.environments.pysim.quanser_qube import QQubeSim

    dp_nom = QQubeSim.get_nominal_domain_param()
    return DomainRandomizer(
        NormalDomainParam(name="gravity_const", mean=dp_nom["gravity_const"], std=dp_nom["gravity_const"] / 10, clip_lo=1e-3),
        NormalDomainParam(name="motor_resistance", mean=dp_nom["motor_resistance"], std=dp_nom["motor_resistance"] / 5, clip_lo=1e-3),
        NormalDomainParam(name="motor_back_emf", mean=dp_nom["motor_back_emf"], std=dp_nom["motor_back_emf"] / 5, clip_lo=1e-4),
        NormalDomainParam(name="mass_rot_pole", mean=dp_nom["mass_rot_pole"], std=dp_nom["mass_rot_pole"] / 5, clip_lo=1e-4),
        NormalDomainParam(name="length_rot_pole", mean=dp_nom["length_rot_pole"], std=dp_nom["length_rot_pole"] / 5, clip_lo=1e-4),
        NormalDomainParam(name="damping_rot_pole", mean=dp_nom["damping_rot_pole"], std=dp_nom["damping_rot_pole"] / 4, clip_lo=1e-9),
        NormalDomainParam(name="mass_pend_pole", mean=dp_nom["mass_pend_pole"], std=dp_nom["mass_pend_pole"] / 5, clip_lo=1e-4),
        NormalDomainParam(name="length_pend_pole", mean=dp_nom["length_pend_pole"], std=dp_nom["length_pend_pole"] / 5, clip_lo=1e-4),
        NormalDomainParam(name="damping_pend_pole", mean=dp_nom["damping_pend_pole"], std=dp_nom["damping_pend_pole"] / 4, clip_lo=1e-9),
    )
def get_default_randomizer_qq() -> DomainRandomizer:
    """
    Get the default randomizer for the `QQubeSim`.

    :return: randomizer based on the nominal domain parameter values
    """
    from pyrado.environments.pysim.quanser_qube import QQubeSim

    dp_nom = QQubeSim.get_nominal_domain_param()
    return DomainRandomizer(
        NormalDomainParam(name='g', mean=dp_nom['g'], std=dp_nom['g'] / 5, clip_lo=1e-3),
        NormalDomainParam(name='Rm', mean=dp_nom['Rm'], std=dp_nom['Rm'] / 5, clip_lo=1e-3),
        NormalDomainParam(name='km', mean=dp_nom['km'], std=dp_nom['km'] / 5, clip_lo=1e-4),
        NormalDomainParam(name='Mr', mean=dp_nom['Mr'], std=dp_nom['Mr'] / 5, clip_lo=1e-4),
        NormalDomainParam(name='Lr', mean=dp_nom['Lr'], std=dp_nom['Lr'] / 5, clip_lo=1e-4),
        NormalDomainParam(name='Dr', mean=dp_nom['Dr'], std=dp_nom['Dr'] / 5, clip_lo=1e-9),
        NormalDomainParam(name='Mp', mean=dp_nom['Mp'], std=dp_nom['Mp'] / 5, clip_lo=1e-4),
        NormalDomainParam(name='Lp', mean=dp_nom['Lp'], std=dp_nom['Lp'] / 5, clip_lo=1e-4),
        NormalDomainParam(name='Dp', mean=dp_nom['Dp'], std=dp_nom['Dp'] / 5, clip_lo=1e-9),
    )
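# Reading aid (added, not part of the library): the two default randomizers above encode the same physical quantities
# under two naming conventions. The correspondence below is inferred from comparing the two functions and is a sketch,
# not an official mapping provided by pyrado.
ABBREV_TO_DESCRIPTIVE_PARAM_NAMES = dict(
    g='gravity_const',       # gravitational acceleration
    Rm='motor_resistance',   # motor armature resistance
    km='motor_back_emf',     # motor back-EMF constant
    Mr='mass_rot_pole',      # mass of the rotary pole (arm)
    Lr='length_rot_pole',    # length of the rotary pole (arm)
    Dr='damping_rot_pole',   # viscous damping of the rotary pole
    Mp='mass_pend_pole',     # mass of the pendulum pole
    Lp='length_pend_pole',   # length of the pendulum pole
    Dp='damping_pend_pole',  # viscous damping of the pendulum pole
)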
""" Test predefined energy-based controller to make the Quanser Qube swing up. """ import torch as to from pyrado.environments.pysim.quanser_qube import QQubeSim from pyrado.domain_randomization.utils import print_domain_params from pyrado.policies.environment_specific import QQubeSwingUpAndBalanceCtrl from pyrado.sampling.rollout import rollout, after_rollout_query from pyrado.utils.data_types import RenderMode from pyrado.utils.input_output import print_cbt if __name__ == '__main__': # Set up environment env = QQubeSim(dt=1/500., max_steps=4000) # Set up policy policy = QQubeSwingUpAndBalanceCtrl(env.spec) # Simulate done, param, state = False, None, None while not done: ro = rollout(env, policy, render_mode=RenderMode(text=False, video=True), eval=True, reset_kwargs=dict(domain_param=param, init_state=state)) print_domain_params(env.domain_param) print_cbt(f'Return: {ro.undiscounted_return()}', 'g', bright=True) done, state, param = after_rollout_query(env, policy, ro)
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: [int, None]):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env = QQubeSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        shared_hidden_sizes=trial.suggest_categorical(
            'shared_hidden_sizes_policy', [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]),
        shared_hidden_nonlin=fcn_from_str(
            trial.suggest_categorical('shared_hidden_nonlin_policy', ['to_tanh', 'to_relu'])),
    )
    policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam)

    # Critic
    q_fcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical(
            'hidden_sizes_critic', [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])),
    )
    obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
    q_fcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **q_fcn_hparam)
    q_fcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **q_fcn_hparam)

    # Algorithm
    algo_hparam = dict(
        num_sampler_envs=1,  # parallelize via optuna n_jobs
        max_iter=100 * env.max_steps,
        min_steps=trial.suggest_categorical('min_steps_algo', [1]),  # , 10, env.max_steps, 10*env.max_steps
        memory_size=trial.suggest_loguniform('memory_size_algo', 1e2 * env.max_steps, 1e4 * env.max_steps),
        tau=trial.suggest_uniform('tau_algo', 0.99, 1.),
        alpha_init=trial.suggest_uniform('alpha_init_algo', 0.1, 0.9),
        learn_alpha=trial.suggest_categorical('learn_alpha_algo', [True, False]),
        standardize_rew=trial.suggest_categorical('standardize_rew_algo', [False]),
        gamma=trial.suggest_uniform('gamma_algo', 0.99, 1.),
        target_update_intvl=trial.suggest_categorical('target_update_intvl_algo', [1, 5]),
        num_batch_updates=trial.suggest_categorical('num_batch_updates_algo', [1, 5]),
        batch_size=trial.suggest_categorical('batch_size_algo', [128, 256, 512]),
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
    )
    csv_logger = create_csv_step_logger(osp.join(ex_dir, f'trial_{trial.number}'))
    algo = SAC(ex_dir, env, policy, q_fcn_1, q_fcn_2, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelSampler(env, policy, num_envs=1, min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
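# How the objective above is typically handed to Optuna (a sketch based on the docstring's note about
# `functools.partial`; the directory, seed, and trial budget below are placeholders, not values from the original
# script).
import functools
import optuna

ex_dir = 'path/to/experiment_dir'  # placeholder
study = optuna.create_study(direction='maximize')
study.optimize(functools.partial(train_and_eval, ex_dir=ex_dir, seed=1001), n_trials=100, n_jobs=16)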
from pyrado.policies.features import FeatureStack, identity_feat, sign_feat, abs_feat, squared_feat, qubic_feat, \
    bell_feat, RandFourierFeat, MultFeat
from pyrado.policies.linear import LinearPolicy
import torch as to


if __name__ == '__main__':
    # Experiment (set seed before creating the modules)
    # ex_dir = setup_experiment(QQubeSim.name, PoWER.name, f'{LinearPolicy}_actnorm', seed=1)
    ex_dir = setup_experiment(QQubeSim.name, PoWER.name, QQubeSwingUpAndBalanceCtrl.name, seed=1)

    # Environment
    env_hparams = dict(dt=1 / 500., max_steps=5000)
    env = QQubeSim(**env_hparams)
    # env = ActNormWrapper(env)

    # Policy
    # policy_hparam = dict(
    #     # feats=FeatureStack([RandFourierFeat(env.obs_space.flat_dim, num_feat=20, bandwidth=env.obs_space.bound_up)])
    #     feats=FeatureStack([identity_feat, sign_feat, abs_feat, squared_feat,
    #                         MultFeat([2, 5]), MultFeat([3, 5]), MultFeat([4, 5])])
    # )
    # policy = LinearPolicy(spec=env.spec, **policy_hparam)
    # policy_hparam = dict(energy_gain=0.587, ref_energy=0.827)
    policy_hparam = dict(
        ref_energy=0.02,
        energy_gain=50.,
        energy_th_gain=0.3,  # This parameter is fixed. (requires_grad = False)
        acc_max=5.,
def default_qq():
    return QQubeSim(dt=0.004, max_steps=4000)
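# If `default_qq` is registered as a pytest fixture in the original test module (the decorator is not shown above),
# a smoke test could consume it as sketched here. This test is illustrative and not part of the original code.
def test_default_qq_smoke(default_qq):
    # dt=0.004 s and max_steps=4000 correspond to a 250 Hz simulation over a 16 s horizon
    assert default_qq.max_steps == 4000
    default_qq.reset()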
    bell_feat, MultFeat
from pyrado.policies.linear import LinearPolicy
from pyrado.utils.experiments import wrap_like_other_env


if __name__ == '__main__':
    # Experiment (set seed before creating the modules)
    # ex_dir = setup_experiment(QQubeSim.name, f'{BayRn.name}_{PoWER.name}-sim2sim', '100Hz_lin_dr-Mp+', seed=111)
    ex_dir = setup_experiment(QQubeSim.name, f'{BayRn.name}_{PoWER.name}-sim2sim',
                              f'{QQubeSwingUpAndBalanceCtrl.name}_100Hz_dr-Mp+Mr+', seed=1111)

    # Environments
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env_sim = QQubeSim(**env_hparams)
    env_sim = DomainRandWrapperLive(env_sim, get_zero_var_randomizer(env_sim))
    dp_map = get_default_domain_param_map_qq()
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    env_real = QQubeSim(**env_hparams)
    env_real.domain_param = dict(Mp=0.026, Mr=0.097)
    # env_real = QQubeReal(**env_hparams)
    env_real = wrap_like_other_env(env_real, env_sim)

    # Policy
    # policy_hparam = dict(
    #     feats=FeatureStack([identity_feat, sign_feat, abs_feat, squared_feat, qubic_feat,
    #                         MultFeat([2, 5]), MultFeat([3, 5]), MultFeat([4, 5])])
    # )
    # policy = LinearPolicy(spec=env_sim.spec, **policy_hparam)
from pyrado.logger.experiment import ask_for_experiment
from pyrado.policies.environment_specific import QQubeSwingUpAndBalanceCtrl
from pyrado.sampling.rollout import rollout, after_rollout_query
from pyrado.utils.argparser import get_argparser
from pyrado.utils.experiments import load_experiment
from pyrado.utils.input_output import print_cbt
from pyrado.utils.data_types import RenderMode
import torch as to
import numpy as np


if __name__ == '__main__':
    # Parse command line arguments
    args = get_argparser().parse_args()

    # Load the environment and the policy
    env = QQubeSim(args.dt, args.max_steps)  # runs infinitely by default
    # policy = QQubeSwingUpAndBalanceCtrl(env.spec)
    # policy = QQubeSwingUpAndBalanceCtrl(
    #     env.spec,
    #     ref_energy=0.04,  # Quanser's value: 0.02
    #     energy_gain=30.,  # Quanser's value: 50
    #     energy_th_gain=0.4,  # former: 0.4
    #     acc_max=5.,  # Quanser's value: 6
    #     alpha_max_pd_enable=10.,  # Quanser's value: 20
    #     pd_gains=to.tensor([-0.42, 18.45, -0.53, 1.53]))  # QUANSER
    # policy = QQubeSwingUpAndBalanceCtrl(
    #     env.spec,
    # param_spec['m_pole'] = np.linspace(0.127*0.7, 0.127*1.3, num=11, endpoint=True)
    # param_spec['l_pole'] = np.linspace(0.641/2*0.7, 0.641/2*1.3, num=11, endpoint=True)

    # Get the experiments' directories to load from
    prefixes = [
        osp.join(pyrado.EXP_DIR, 'FILL_IN', 'FILL_IN'),
    ]
    exp_names = [
        '',
    ]
    exp_labels = [
        '',
    ]

elif args.env_name == QQubeSim.name:
    env = QQubeSim(dt=args.dt, max_steps=args.max_steps)
    # param_spec['g'] = np.linspace(9.81*0.7, 9.81*1.3, num=11, endpoint=True)
    # param_spec['Rm'] = np.linspace(8.4*0.7, 8.4*1.3, num=11, endpoint=True)
    # param_spec['km'] = np.linspace(0.042*0.7, 0.042*1.3, num=11, endpoint=True)
    # param_spec['Mr'] = np.linspace(0.095*0.7, 0.095*1.3, num=11, endpoint=True)
    # param_spec['Lr'] = np.linspace(0.085*0.7, 0.085*1.3, num=11, endpoint=True)
    # param_spec['Dr'] = np.linspace(5e-6*0.2, 5e-6*5, num=11, endpoint=True)  # 5e-6
    # param_spec['Mp'] = np.linspace(0.024*0.7, 0.024*1.3, num=11, endpoint=True)
    # param_spec['Lp'] = np.linspace(0.129*0.7, 0.129*1.3, num=11, endpoint=True)
    # param_spec['Dp'] = np.linspace(1e-6*0.2, 1e-6*5, num=11, endpoint=True)  # 1e-6

    # Get the experiments' directories to load from
    prefixes = [
        osp.join(pyrado.EXP_DIR, 'FILL_IN', 'FILL_IN'),
    ]
print_cbt(f'Set maximum number of time steps to {args.max_steps}', 'y')
# ex_dir = input('Enter a root directory that contains one or more experiment directories:\n')

# Get the experiment's directory to load from
ex_dir = ask_for_experiment()
dirs = [x[0] for x in os.walk(ex_dir)][1:]
num_policies = len(dirs)
print(f'Found {num_policies} policies.')

# Specify domain parameters
param_names = ['Dp', 'Dr', 'Mp', 'Mr', 'Lp', 'Lr']
num_param = len(param_names)
num_samples = 10

# Create one-dim evaluation grid for multiple parameters
nom_params = QQubeSim.get_nominal_domain_param()
param_values = dict(
    Dp=np.logspace(-8, -4, num_samples),
    Dr=np.logspace(-8, -4, num_samples),
    Mp=np.linspace(0.6 * nom_params['Mp'], 1.5 * nom_params['Mp'], num_samples),
    Mr=np.linspace(0.6 * nom_params['Mr'], 1.5 * nom_params['Mr'], num_samples),
    Lp=np.linspace(0.6 * nom_params['Lp'], 1.5 * nom_params['Lp'], num_samples),
    Lr=np.linspace(0.6 * nom_params['Lr'], 1.5 * nom_params['Lr'], num_samples),
)

# Set up the environment
env = ActNormWrapper(QQubeSim(dt=1 / 100., max_steps=args.max_steps))
from pyrado.algorithms.bayrn import BayRn
from pyrado.logger.experiment import setup_experiment, save_list_of_dicts_to_yaml
from pyrado.policies.fnn import FNNPolicy
from pyrado.policies.rnn import LSTMPolicy, GRUPolicy
from pyrado.utils.data_types import EnvSpec
from pyrado.utils.experiments import wrap_like_other_env


if __name__ == '__main__':
    # Experiment (set seed before creating the modules)
    ex_dir = setup_experiment(QQubeSim.name, f'{BayRn.name}_{PPO.name}',
                              f'{FNNPolicy.name}_actnorm_dr-Mp-Mr-Lp-Lr', seed=111)

    # Environments
    env_hparams = dict(dt=1/100., max_steps=600)
    env_sim = QQubeSim(**env_hparams)
    env_sim = ActNormWrapper(env_sim)
    env_sim = DomainRandWrapperLive(env_sim, get_zero_var_randomizer(env_sim))
    dp_map = get_default_domain_param_map_qq()
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    env_real = QQubeReal(**env_hparams)
    env_real = wrap_like_other_env(env_real, env_sim)

    # Policy
    policy_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.tanh)  # FNN
    # policy_hparam = dict(hidden_size=32, num_recurrent_layers=1)  # LSTM & GRU
    policy = FNNPolicy(spec=env_sim.spec, **policy_hparam)
    # policy = RNNPolicy(spec=env_sim.spec, **policy_hparam)
    # policy = LSTMPolicy(spec=env_sim.spec, **policy_hparam)
    # policy = GRUPolicy(spec=env_sim.spec, **policy_hparam)
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: [int, None]):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1/100., max_steps=600)
    env = QQubeSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        hidden_sizes=trial.suggest_categorical('hidden_sizes_policy', [[16, 16], [32, 32], [64, 64]]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_policy', ['to_tanh', 'to_relu'])),
    )  # FNN
    # policy_hparam = dict(
    #     hidden_size=trial.suggest_categorical('hidden_size_policy', [16, 32, 64]),
    #     num_recurrent_layers=trial.suggest_categorical('num_recurrent_layers_policy', [1, 2]),
    # )  # LSTM & GRU
    policy = FNNPolicy(spec=env.spec, **policy_hparam)
    # policy = GRUPolicy(spec=env.spec, **policy_hparam)

    # Critic
    value_fcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical('hidden_sizes_critic', [[16, 16], [32, 32], [64, 64]]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])),
    )
    # value_fcn_hparam = dict(
    #     hidden_size=trial.suggest_categorical('hidden_size_critic', [16, 32, 64]),
    #     num_recurrent_layers=trial.suggest_categorical('num_recurrent_layers_critic', [1, 2]),
    # )  # LSTM & GRU
    value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **value_fcn_hparam)
    # value_fcn = GRUPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **value_fcn_hparam)
    critic_hparam = dict(
        gamma=trial.suggest_uniform('gamma_critic', 0.98, 1.),
        lamda=trial.suggest_uniform('lamda_critic', 0.95, 1.),
        num_epoch=trial.suggest_int('num_epoch_critic', 1, 10),
        batch_size=150,
        lr=trial.suggest_loguniform('lr_critic', 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical('standardize_adv_critic', [False]),
        max_grad_norm=trial.suggest_categorical('max_grad_norm_critic', [None, 1., 5.]),
        # lr_scheduler=scheduler.StepLR,
        # lr_scheduler_hparam=dict(step_size=10, gamma=0.9)
        # lr_scheduler=scheduler.ExponentialLR,
        # lr_scheduler_hparam=dict(gamma=0.99)
    )
    critic = GAE(value_fcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_sampler_envs=1,  # parallelize via optuna n_jobs
        max_iter=300,
        min_steps=trial.suggest_int('num_rollouts_algo', 10, 30)*env.max_steps,
        num_epoch=trial.suggest_int('num_epoch_algo', 1, 10),
        eps_clip=trial.suggest_uniform('eps_clip_algo', 0.05, 0.2),
        batch_size=150,
        std_init=trial.suggest_uniform('std_init_algo', 0.6, 1.0),
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
        max_grad_norm=trial.suggest_categorical('max_grad_norm_algo', [None, 1., 5.]),
        # lr_scheduler=scheduler.StepLR,
        # lr_scheduler_hparam=dict(step_size=10, gamma=0.9)
        # lr_scheduler=scheduler.ExponentialLR,
        # lr_scheduler_hparam=dict(gamma=0.99)
    )
    csv_logger = create_csv_step_logger(osp.join(ex_dir, f'trial_{trial.number}'))
    algo = PPO(osp.join(ex_dir, f'trial_{trial.number}'), env, policy, critic, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelSampler(env, policy, num_envs=1, min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros])/min_rollouts

    return mean_ret