def create_qqsu_setup(): # Environments env_hparams = dict(dt=1 / 100.0, max_steps=600) env_real = QQubeSwingUpSim(**env_hparams) env_real.domain_param = dict( mass_rot_pole=0.095 * 0.9, # 0.095*0.9 = 0.0855 mass_pend_pole=0.024 * 1.1, # 0.024*1.1 = 0.0264 length_rot_pole=0.085 * 0.9, # 0.085*0.9 = 0.0765 length_pend_pole=0.129 * 1.1, # 0.129*1.1 = 0.1419 ) env_sim = QQubeSwingUpSim(**env_hparams) randomizer = DomainRandomizer( NormalDomainParam(name="mass_rot_pole", mean=0.0, std=1e-9, clip_lo=1e-3), NormalDomainParam(name="mass_pend_pole", mean=0.0, std=1e-9, clip_lo=1e-3), NormalDomainParam(name="length_rot_pole", mean=0.0, std=1e-9, clip_lo=1e-3), NormalDomainParam(name="length_pend_pole", mean=0.0, std=1e-9, clip_lo=1e-3), ) env_sim = DomainRandWrapperLive(env_sim, randomizer) dp_map = { 0: ("mass_rot_pole", "mean"), 1: ("mass_rot_pole", "std"), 2: ("mass_pend_pole", "mean"), 3: ("mass_pend_pole", "std"), 4: ("length_rot_pole", "mean"), 5: ("length_rot_pole", "std"), 6: ("length_pend_pole", "mean"), 7: ("length_pend_pole", "std"), } # trafo_mask = [False, True, False, True, False, True, False, True] trafo_mask = [True] * 8 env_sim = MetaDomainRandWrapper(env_sim, dp_map) # Policies (the behavioral policy needs to be deterministic) behavior_policy = QQubeSwingUpAndBalanceCtrl(env_sim.spec) prior = DomainRandomizer( NormalDomainParam(name="mass_rot_pole", mean=0.095, std=0.095 / 10), NormalDomainParam(name="mass_pend_pole", mean=0.024, std=0.024 / 10), NormalDomainParam(name="length_rot_pole", mean=0.085, std=0.085 / 10), NormalDomainParam(name="length_pend_pole", mean=0.129, std=0.129 / 10), ) ddp_policy = DomainDistrParamPolicy(mapping=dp_map, trafo_mask=trafo_mask, prior=prior, scale_params=False) return env_sim, env_real, env_hparams, dp_map, behavior_policy, ddp_policy
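# The dp_map above couples each entry of the flat 8-dim distribution-parameter vector to one
# (domain parameter, statistic) pair. A minimal, purely illustrative sketch of that
# correspondence (the candidate values below are made up, not produced by pyrado):
dp_map = {
    0: ("mass_rot_pole", "mean"), 1: ("mass_rot_pole", "std"),
    2: ("mass_pend_pole", "mean"), 3: ("mass_pend_pole", "std"),
    4: ("length_rot_pole", "mean"), 5: ("length_rot_pole", "std"),
    6: ("length_pend_pole", "mean"), 7: ("length_pend_pole", "std"),
}
candidate = [0.0855, 1e-4, 0.0264, 1e-4, 0.0765, 1e-4, 0.1419, 1e-4]  # hypothetical values
for idx, (dp_name, stat) in dp_map.items():
    print(f"parameter {idx} sets {dp_name}.{stat} = {candidate[idx]}")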
def test_velocity_filter(plot: bool):
    # Set up environment
    env_gt = QQubeSwingUpSim(dt=1 / 500.0, max_steps=350)
    env_gt.init_space = SingularStateSpace(np.array([0.1, np.pi / 2, 3.0, 0]))
    env_filt = ObsVelFiltWrapper(env_gt, idcs_pos=["theta", "alpha"], idcs_vel=["theta_dot", "alpha_dot"])

    # Set up policy
    policy = IdlePolicy(env_gt.spec)

    # Simulate
    ro_gt = rollout(env_gt, policy)
    ro_filt = rollout(env_filt, policy)

    # Extract the velocity observations from both rollouts
    theta_dot_gt = ro_gt.observations[:, 4]
    alpha_dot_gt = ro_gt.observations[:, 5]
    theta_dot_filt = ro_filt.observations[:, 4]
    alpha_dot_filt = ro_filt.observations[:, 5]

    # The initial filtered velocity can not be equal to the ground truth since we set an init vel of 3 rad/s
    assert theta_dot_filt[0] != pytest.approx(theta_dot_gt[0])
    assert alpha_dot_filt[0] == pytest.approx(alpha_dot_gt[0], abs=1e-4)

    # Compute the error
    rmse_theta = rmse(theta_dot_gt, theta_dot_filt)
    rmse_alpha = rmse(alpha_dot_gt, alpha_dot_filt)

    if plot:
        from matplotlib import pyplot as plt

        # Plot the filtered signals versus the original observations
        plt.rc("text", usetex=True)
        fig, axs = plt.subplots(2, figsize=(16, 9))
        axs[0].plot(theta_dot_gt, label=r"$\dot{\theta}_{true}$")
        axs[0].plot(theta_dot_filt, label=r"$\dot{\theta}_{filt}$")
        axs[1].plot(alpha_dot_gt, label=r"$\dot{\alpha}_{true}$")
        axs[1].plot(alpha_dot_filt, label=r"$\dot{\alpha}_{filt}$")
        axs[0].set_title(rf"RMSE($\theta$): {rmse_theta}")
        axs[0].set_ylabel(r"$\dot{\theta}$ [rad/s]")
        axs[0].legend()
        axs[1].set_title(rf"RMSE($\alpha$): {rmse_alpha}")
        axs[1].set_xlabel("time steps")
        axs[1].set_ylabel(r"$\dot{\alpha}$ [rad/s]")
        axs[1].legend()
        plt.show()
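# `rmse` above comes from pyrado's utilities; a minimal NumPy stand-in with the same
# semantics (root-mean-square error between two equally long 1D signals) could look like:
import numpy as np


def rmse(x, y):
    """Root-mean-square error between two equally shaped arrays."""
    x, y = np.asarray(x), np.asarray(y)
    return np.sqrt(np.mean((x - y) ** 2))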
def __init__(self, env_spec: EnvSpec, ref_energy: float, energy_gain: float, th_gain: float, acc_max: float, use_cuda: bool = False): """ Constructor :param env_spec: environment specification :param ref_energy: reference energy level [J] :param energy_gain: P-gain on the energy [m/s/J] :param th_gain: P-gain on angle theta :param acc_max: maximum linear acceleration of the pendulum pivot [m/s**2] :param use_cuda: `True` to move the policy to the GPU, `False` (default) to use the CPU """ super().__init__(env_spec, use_cuda) # Initialize parameters self._log_E_ref = nn.Parameter(to.log(to.tensor(ref_energy)), requires_grad=True) self._log_E_gain = nn.Parameter(to.log(to.tensor(energy_gain)), requires_grad=True) self._th_gain = nn.Parameter(to.tensor(th_gain), requires_grad=True) self.acc_max = to.tensor(acc_max) self.dp_nom = QQubeSwingUpSim.get_nominal_domain_param()
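# Storing log(E_ref) and log(E_gain) keeps both quantities strictly positive under
# unconstrained gradient updates; the forward pass only needs an exp(). A small
# self-contained sketch of that recovery (the numeric value is a made-up example):
import torch as to

log_E_ref = to.nn.Parameter(to.log(to.tensor(0.025)))  # hypothetical reference energy [J]
E_ref = to.exp(log_E_ref)  # always > 0, no matter what the optimizer does to log_E_ref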
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int): """ Objective function for the Optuna `Study` to maximize. .. note:: Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments. :param trial: Optuna Trial object for hyper-parameter optimization :param study_dir: the parent directory for all trials in this study :param seed: seed value for the random number generators, pass `None` for no seeding :return: objective function value """ # Synchronize seeds between Optuna trials pyrado.set_seed(seed) # Environment env_hparams = dict(dt=1 / 250., max_steps=1500) env = QQubeSwingUpSim(**env_hparams) env = ActNormWrapper(env) # Policy policy_hparam = dict(feats=FeatureStack([ identity_feat, sign_feat, abs_feat, squared_feat, cubic_feat, ATan2Feat(1, 2), MultFeat([4, 5]) ])) policy = LinearPolicy(spec=env.spec, **policy_hparam) # Algorithm algo_hparam = dict( num_workers=1, # parallelize via optuna n_jobs max_iter=50, pop_size=trial.suggest_int('pop_size', 50, 200), num_rollouts=trial.suggest_int('num_rollouts', 4, 10), num_is_samples=trial.suggest_int('num_is_samples', 5, 40), expl_std_init=trial.suggest_uniform('expl_std_init', 0.1, 0.5), symm_sampling=trial.suggest_categorical('symm_sampling', [True, False]), ) csv_logger = create_csv_step_logger( osp.join(study_dir, f'trial_{trial.number}')) algo = PoWER(osp.join(study_dir, f'trial_{trial.number}'), env, policy, **algo_hparam, logger=csv_logger) # Train without saving the results algo.train(snapshot_mode='latest', seed=seed) # Evaluate min_rollouts = 1000 sampler = ParallelRolloutSampler( env, policy, num_workers=1, min_rollouts=min_rollouts) # parallelize via optuna n_jobs ros = sampler.sample() mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts return mean_ret
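# As the docstring notes, Optuna only passes `trial`; a plausible way to bind the extra
# arguments when launching the study (the directory, seed, and budgets are placeholders):
import functools

import optuna

study = optuna.create_study(direction="maximize")
objective = functools.partial(train_and_eval, study_dir="/tmp/my_study", seed=42)
study.optimize(objective, n_trials=100, n_jobs=8)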
def create_qq_setup(factor, dt, max_steps, render_mode):
    # Set up environment
    init_state = np.array([0.1, 0.0, 0.0, 0.0])
    env = QQubeSwingUpSim(dt=dt, max_steps=max_steps)
    env = ActNormWrapper(env)

    # Set up policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Simulate at the original frequency
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=render_mode,
        max_steps=max_steps,
    )
    act_500Hz = ro.actions

    # Simulate with an increased step size, i.e. at the lower frequency
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt * factor), init_state=init_state),
        render_mode=render_mode,
        max_steps=int(max_steps / factor),
    )
    act_100Hz = ro.actions

    # Simulate at the original frequency, but only update the action every `factor` steps
    env = DownsamplingWrapper(env, factor)
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=render_mode,
        max_steps=max_steps,
    )
    act_500Hz_w = ro.actions

    # Time in seconds; the low-rate rollout uses a step size of dt*factor, so all three
    # sequences span the same wall-clock duration
    time_500Hz = np.linspace(0, len(act_500Hz) * dt, len(act_500Hz))
    time_100Hz = np.linspace(0, len(act_100Hz) * dt * factor, len(act_100Hz))
    time_500Hz_w = np.linspace(0, len(act_500Hz_w) * dt, len(act_500Hz_w))

    # Plot
    _, ax = plt.subplots(nrows=1)
    ax.plot(time_500Hz, act_500Hz, label="500 Hz (original)")
    ax.plot(time_100Hz, act_100Hz, label="100 Hz", ls="--")
    ax.plot(time_500Hz_w, act_500Hz_w, label="500 Hz (wrapped)", ls="--")
    ax.legend()
    ax.set_ylabel(env.act_space.labels)
    ax.set_xlabel("time [s]")
    plt.show()
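# Hypothetical invocation of the helper above: simulate at 500 Hz and compare against a
# control rate reduced by a factor of 5 (the values are assumptions, not from the original):
from pyrado.utils.data_types import RenderMode

create_qq_setup(factor=5, dt=1 / 500.0, max_steps=2500, render_mode=RenderMode())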
def create_qq_setup(factor, dt, max_steps):
    # Set up environment
    init_state = np.array([0.1, 0.0, 0.0, 0.0])
    env = QQubeSwingUpSim(dt=dt, max_steps=max_steps)
    env = ActNormWrapper(env)

    # Set up policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Simulate at the original frequency
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=max_steps,
    )
    act_500Hz = ro.actions

    # Simulate with an increased step size, i.e. at the lower frequency
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt * factor), init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=int(max_steps / factor),
    )
    act_100Hz = ro.actions
    # Zero-order hold: repeat every low-rate action for `factor` fine-grained steps
    act_100Hz_zoh = np.repeat(act_100Hz, factor, axis=0)

    # Simulate at the original frequency, but only update the action every `factor` steps
    env = DownsamplingWrapper(env, factor)
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=max_steps,
    )
    act_500Hz_wrapped = ro.actions

    # Plot
    _, ax = plt.subplots(nrows=1)
    ax.plot(act_500Hz, label="500 Hz (original)")
    ax.plot(act_100Hz_zoh, label="100 Hz (zoh)")
    ax.plot(act_500Hz_wrapped, label="500 Hz (wrapped)")
    ax.legend()
    ax.set_ylabel(env.act_space.labels)
    ax.set_xlabel("time steps")
    plt.show()
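# Why np.repeat models a zero-order hold: each action computed at the lower rate is held
# constant for `factor` fine-grained steps. A tiny self-contained illustration:
import numpy as np

act_low_rate = np.array([[0.1], [0.2], [0.3]])  # three actions at the low rate
act_zoh = np.repeat(act_low_rate, 3, axis=0)  # each action held for 3 steps
print(act_zoh.ravel())  # [0.1 0.1 0.1 0.2 0.2 0.2 0.3 0.3 0.3]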
def __init__(
    self,
    env_spec: EnvSpec,
    ref_energy: float,
    energy_gain: float,
    th_gain: float,
    acc_max: float,
    reset_domain_param: bool = True,
    use_cuda: bool = False,
):
    """
    Constructor

    :param env_spec: environment specification
    :param ref_energy: reference energy level [J]
    :param energy_gain: P-gain on the energy [m/s/J]
    :param th_gain: P-gain on angle theta
    :param acc_max: maximum linear acceleration of the pendulum pivot [m/s**2]
    :param reset_domain_param: if `True`, the domain parameters are reset if they are present as an entry
                               in the kwargs passed to `reset()`. If `False`, they are ignored.
    :param use_cuda: `True` to move the policy to the GPU, `False` (default) to use the CPU
    """
    super().__init__(env_spec, use_cuda)

    # Initial parameters
    self._log_E_ref_init = to.log(to.tensor(ref_energy))
    self._log_E_gain_init = to.log(to.tensor(energy_gain))
    self._th_gain_init = to.tensor(th_gain)

    # Define parameters
    self._log_E_ref = nn.Parameter(to.empty_like(self._log_E_ref_init), requires_grad=True)
    self._log_E_gain = nn.Parameter(to.empty_like(self._log_E_gain_init), requires_grad=True)
    self._th_gain = nn.Parameter(to.empty_like(self._th_gain_init), requires_grad=True)
    self.acc_max = to.tensor(acc_max)
    self._domain_param = QQubeSwingUpSim.get_nominal_domain_param()
    self._reset_domain_param = reset_domain_param

    # Default initialization
    self.init_param(None)
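# `init_param(None)` above restores the constructor's values. One plausible shape for such
# a method, assuming pyrado's convention of an optional tensor of initial parameter values
# (a sketch, not the library's actual implementation):
def init_param(self, init_values: to.Tensor = None, **kwargs):
    if init_values is None:
        # Fall back to the values passed to the constructor
        self._log_E_ref.data.copy_(self._log_E_ref_init)
        self._log_E_gain.data.copy_(self._log_E_gain_init)
        self._th_gain.data.copy_(self._th_gain_init)
    else:
        # Set the parameter vector directly
        self.param_values = init_values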
def default_qqsu(): return QQubeSwingUpSim(dt=0.004, max_steps=4000)
import pyrado
from pyrado.environments.mujoco.quanser_qube import QQubeStabMjSim, QQubeSwingUpMjSim
from pyrado.environments.pysim.quanser_qube import QQubeStabSim, QQubeSwingUpSim
from pyrado.policies.special.environment_specific import QQubeSwingUpAndBalanceCtrl
from pyrado.sampling.rollout import after_rollout_query, rollout
from pyrado.utils.argparser import get_argparser
from pyrado.utils.data_types import RenderMode
from pyrado.utils.input_output import print_cbt


if __name__ == "__main__":
    # Parse command line arguments
    args = get_argparser().parse_args()

    dt = 1 / 500.0
    max_steps = 3500
    if args.env_name == "qq-su":
        env = QQubeSwingUpSim(dt=dt, max_steps=max_steps)
    elif args.env_name == "qq-mj-su":
        env = QQubeSwingUpMjSim(dt=dt, max_steps=max_steps)
    elif args.env_name == "qq-st":
        env = QQubeStabSim(dt=dt, max_steps=max_steps)
    elif args.env_name == "qq-mj-st":
        env = QQubeStabMjSim(dt=dt, max_steps=max_steps)
    else:
        raise pyrado.ValueErr(
            given_name="--env_name",
            given=args.env_name,
            eq_constraint="'qq-su', 'qq-mj-su', 'qq-st', or 'qq-mj-st'",
        )
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Simulate
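    # The script breaks off at the simulation step; a typical pyrado rollout loop built
    # from the helpers imported above might look as follows (a sketch; the argparser
    # flags `args.verbose` and `args.animation` are assumptions):
    done, state, param = False, None, None
    while not done:
        ro = rollout(
            env,
            policy,
            render_mode=RenderMode(text=args.verbose, video=args.animation),
            eval=True,
            reset_kwargs=dict(domain_param=param, init_state=state),
        )
        print_cbt(f"Return: {ro.undiscounted_return()}", "g", bright=True)
        # Ask on the command line whether to repeat, possibly with a new state or domain
        done, state, param = after_rollout_query(env, policy, ro)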
parser.add_argument("--train_teachers", action="store_true", default=False) parser.add_argument("--num_teachers", type=int, default=2) parser.add_argument("--max_iter", type=int, default=500) parser.add_argument("--num_epochs", type=int, default=10) # Parse command line arguments args = parser.parse_args() # Set seed if desired pyrado.set_seed(args.seed, verbose=True) use_cuda = args.device == "cuda" descr = f"_{args.max_steps}st_{args.freq}Hz" # Environment env_hparams = dict(dt=1 / args.freq, max_steps=args.max_steps) env_real = QQubeSwingUpSim(**env_hparams) ex_dir = setup_experiment( QQubeSwingUpSim.name, f"{PDDR.name}_{QQubeSwingUpAndBalanceCtrl.name}{descr}") if args.train_teachers: # Teacher policy teacher_policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.relu, output_nonlin=to.tanh, use_cuda=use_cuda) teacher_policy = FNNPolicy(spec=env_real.spec, **teacher_policy_hparam) # Reduce weights of last layer, recommended by paper for p in teacher_policy.net.output_layer.parameters(): with to.no_grad():
# Parse command line arguments args = get_argparser().parse_args() # Experiment (set seed before creating the modules) ex_dir = setup_experiment( QQubeSwingUpSim.name, f"{BayRn.name}-{PoWER.name}_{QQubeSwingUpAndBalanceCtrl.name}", f"sim2sim_rand-mass_pend_pole-mass_rot_pole_seed-{args.seed}", ) # Set seed if desired pyrado.set_seed(args.seed, verbose=True) # Environments env_sim_hparams = dict(dt=1 / 100.0, max_steps=600) env_sim = QQubeSwingUpSim(**env_sim_hparams) env_sim = DomainRandWrapperLive(env_sim, create_zero_var_randomizer(env_sim)) dp_map = create_default_domain_param_map_qq() env_sim = MetaDomainRandWrapper(env_sim, dp_map) env_real = QQubeSwingUpSim(**env_sim_hparams) env_real.domain_param = dict( mass_pend_pole=0.024 * 1.1, mass_rot_pole=0.095 * 1.1, ) env_real_hparams = env_sim_hparams env_real = wrap_like_other_env(env_real, env_sim) # PoWER and energy-based controller setup policy_hparam = dict(energy_gain=0.587, ref_energy=0.827, acc_max=10.0)
Test predefined energy-based swing-up controller on the Quanser Qube with observation noise. """ from scipy.ndimage import gaussian_filter1d from matplotlib import pyplot as plt from pyrado.environment_wrappers.observation_noise import GaussianObsNoiseWrapper from pyrado.environments.pysim.quanser_qube import QQubeSwingUpSim from pyrado.policies.special.environment_specific import QQubeSwingUpAndBalanceCtrl from pyrado.sampling.rollout import rollout from pyrado.utils.data_types import RenderMode if __name__ == '__main__': plt.rc('text', usetex=True) # Set up environment env = QQubeSwingUpSim(dt=1 / 500., max_steps=3500) env = GaussianObsNoiseWrapper( env, noise_std=[0., 0., 0., 0., 2., 0]) # only noise on theta_dot [rad/s] # Set up policy policy = QQubeSwingUpAndBalanceCtrl(env.spec) # Simulate ro = rollout(env, policy, render_mode=RenderMode(text=False, video=False), eval=True) # Filter the observations of the last rollout theta_dot = ro.observations[:, 4]
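    # A plausible continuation, smoothing the noisy velocity with the Gaussian filter
    # imported above (the sigma value is an assumption, not taken from the original script):
    theta_dot_filt = gaussian_filter1d(theta_dot, sigma=5)

    plt.plot(theta_dot, label=r"$\dot{\theta}$ (noisy)")
    plt.plot(theta_dot_filt, label=r"$\dot{\theta}$ (filtered)")
    plt.legend()
    plt.show()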
from pyrado.utils.data_types import RenderMode from pyrado.utils.input_output import print_cbt if __name__ == "__main__": # Parse command line arguments args = get_argparser().parse_args() dt = args.dt if args.dt is not None else 0.01 if args.env_name == QCartPoleSwingUpSim.name: env = QCartPoleSwingUpSim(dt=dt, max_steps=int(5 / dt), wild_init=False) state = np.array([0, 87 / 180 * np.pi, 0, 0]) elif args.env_name == QQubeSwingUpSim.name: env = QQubeSwingUpSim(dt=dt, max_steps=int(5 / dt)) state = np.array([5 / 180 * np.pi, 87 / 180 * np.pi, 0, 0]) elif args.env_name == QBallBalancerSim.name: env = QBallBalancerSim(dt=dt, max_steps=int(5 / dt)) state = np.array( [2 / 180 * np.pi, 2 / 180 * np.pi, 0.1, -0.08, 0, 0, 0, 0]) elif args.env_name == OneMassOscillatorSim.name: env = OneMassOscillatorSim(dt=dt, max_steps=int(5 / dt)) state = np.array([-0.7, 0]) elif args.env_name == PendulumSim.name: env = PendulumSim(dt=dt, max_steps=int(5 / dt)) state = np.array([87 / 180 * np.pi, 0])
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env = QQubeSwingUpSim(**env_hparams)
    env = ActNormWrapper(env)

    # Learning rate scheduler
    lrs_gamma = trial.suggest_categorical('exp_lr_scheduler_gamma', [None, 0.995, 0.999])
    if lrs_gamma is not None:
        lr_sched = lr_scheduler.ExponentialLR
        lr_sched_hparam = dict(gamma=lrs_gamma)
    else:
        lr_sched, lr_sched_hparam = None, dict()

    # Policy
    policy_hparam = dict(
        hidden_sizes=trial.suggest_categorical('hidden_sizes_policy', [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_policy', ['to_tanh', 'to_relu'])),
    )  # FNN
    # policy_hparam = dict(
    #     hidden_size=trial.suggest_categorical('hidden_size_policy', [16, 32, 64]),
    #     num_recurrent_layers=trial.suggest_categorical('num_recurrent_layers_policy', [1, 2]),
    # )  # LSTM & GRU
    policy = FNNPolicy(spec=env.spec, **policy_hparam)
    # policy = GRUPolicy(spec=env.spec, **policy_hparam)

    # Critic
    vfcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical('hidden_sizes_critic', [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])),
    )
    # vfcn_hparam = dict(
    #     hidden_size=trial.suggest_categorical('hidden_size_critic', [16, 32, 64]),
    #     num_recurrent_layers=trial.suggest_categorical('num_recurrent_layers_critic', [1, 2]),
    # )  # LSTM & GRU
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    # vfcn = GRUPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        batch_size=500,
        gamma=trial.suggest_uniform('gamma_critic', 0.98, 1.),
        lamda=trial.suggest_uniform('lamda_critic', 0.95, 1.),
        num_epoch=trial.suggest_int('num_epoch_critic', 1, 10),
        lr=trial.suggest_loguniform('lr_critic', 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical('standardize_adv_critic', [False]),
        max_grad_norm=trial.suggest_categorical('max_grad_norm_critic', [None, 1., 5.]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    critic = GAE(vfcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=250,
        batch_size=500,
        min_steps=trial.suggest_int('num_rollouts_algo', 10, 30) * env.max_steps,
        num_epoch=trial.suggest_int('num_epoch_algo', 1, 10),
        eps_clip=trial.suggest_uniform('eps_clip_algo', 0.05, 0.2),
        std_init=trial.suggest_uniform('std_init_algo', 0.5, 1.0),
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
        max_grad_norm=trial.suggest_categorical('max_grad_norm_algo', [None, 1., 5.]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f'trial_{trial.number}'))
    algo = PPO(osp.join(study_dir, f'trial_{trial.number}'), env, policy, critic, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env, policy, num_workers=1, min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
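# Note: newer Optuna releases deprecate `suggest_uniform`/`suggest_loguniform` in favor of
# `suggest_float`; against a current Optuna, the calls above would map roughly like this
# (a sketch, not part of the original script):
#   lr = trial.suggest_float('lr_algo', 1e-5, 1e-3, log=True)  # was suggest_loguniform
#   eps = trial.suggest_float('eps_clip_algo', 0.05, 0.2)      # was suggest_uniform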
args.max_steps = 600 print_cbt(f'Set maximum number of time steps to {args.max_steps}', 'y') # Get the experiment's directory to load from ex_dir = ask_for_experiment() if args.ex_dir is None else args.ex_dir dirs = [tmp[0] for tmp in os.walk(ex_dir)][1:] num_policies = len(dirs) print(f'Found {num_policies} policies.') # Specify domain parameters param_names = ['Dp', 'Dr', 'Mp', 'Mr', 'Lp', 'Lr'] num_param = len(param_names) num_samples = 10 # Create one-dim evaluation grid for multiple parameters nom_params = QQubeSwingUpSim.get_nominal_domain_param() param_values = dict( Dp=np.logspace(-8, -4, num_samples), Dr=np.logspace(-8, -4, num_samples), Mp=np.linspace(0.6 * nom_params['Mp'], 1.5 * nom_params['Mp'], num_samples), Mr=np.linspace(0.6 * nom_params['Mr'], 1.5 * nom_params['Mr'], num_samples), Lp=np.linspace(0.6 * nom_params['Lp'], 1.5 * nom_params['Lp'], num_samples), Lr=np.linspace(0.6 * nom_params['Lr'], 1.5 * nom_params['Lr'], num_samples), ) # Set up the environment env = ActNormWrapper(QQubeSwingUpSim(dt=1 / 100.,
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environments
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env_real = QQubeSwingUpSim(**env_hparams)
    env_real.domain_param = dict(
        Mr=0.095 * 0.9,  # 0.095*0.9 = 0.0855
        Mp=0.024 * 1.1,  # 0.024*1.1 = 0.0264
        Lr=0.085 * 0.9,  # 0.085*0.9 = 0.0765
        Lp=0.129 * 1.1,  # 0.129*1.1 = 0.1419
    )

    env_sim = QQubeSwingUpSim(**env_hparams)
    randomizer = DomainRandomizer(
        NormalDomainParam(name='Mr', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Mp', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Lr', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Lp', mean=0., std=1e6, clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ('Mr', 'mean'), 1: ('Mr', 'std'),
        2: ('Mp', 'mean'), 3: ('Mp', 'std'),
        4: ('Lr', 'mean'), 5: ('Lr', 'std'),
        6: ('Lp', 'mean'), 7: ('Lp', 'std'),
    }
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Subroutine for policy improvement
    behav_policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    behav_policy = FNNPolicy(spec=env_sim.spec, **behav_policy_hparam)
    vfcn_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9885,
        lamda=0.9648,
        num_epoch=2,
        batch_size=500,
        standardize_adv=False,
        lr=5.792e-4,
        max_grad_norm=1.,
    )
    critic = GAE(vfcn, **critic_hparam)
    subrtn_policy_hparam = dict(
        max_iter=200,
        min_steps=3 * 23 * env_sim.max_steps,
        num_epoch=7,
        eps_clip=0.0744,
        batch_size=500,
        std_init=0.9074,
        lr=3.446e-04,
        max_grad_norm=1.,
        num_workers=1,
    )
    subrtn_policy = PPO(study_dir, env_sim, behav_policy, critic, **subrtn_policy_hparam)

    # Subroutine for system identification
    prior_std_denom = trial.suggest_uniform('prior_std_denom', 5, 20)
    prior = DomainRandomizer(
        NormalDomainParam(name='Mr', mean=0.095, std=0.095 / prior_std_denom),
        NormalDomainParam(name='Mp', mean=0.024, std=0.024 / prior_std_denom),
        NormalDomainParam(name='Lr', mean=0.085, std=0.085 / prior_std_denom),
        NormalDomainParam(name='Lp', mean=0.129, std=0.129 / prior_std_denom),
    )
    ddp_policy = DomainDistrParamPolicy(
        mapping=dp_map,
        trafo_mask=trafo_mask,
        prior=prior,
        scale_params=trial.suggest_categorical('ddp_policy_scale_params', [True, False]),
    )
    subsubrtn_distr_hparam = dict(
        max_iter=trial.suggest_categorical('subsubrtn_distr_max_iter', [20]),
        pop_size=trial.suggest_int('pop_size', 50, 500),
        num_rollouts=1,
        num_is_samples=trial.suggest_int('num_is_samples', 5, 20),
        expl_std_init=trial.suggest_loguniform('expl_std_init', 1e-3, 1e-1),
        expl_std_min=trial.suggest_categorical('expl_std_min', [1e-4]),
        extra_expl_std_init=trial.suggest_loguniform('extra_expl_std_init', 1e-3, 1e-1),
        extra_expl_decay_iter=trial.suggest_int('extra_expl_decay_iter', 0, 10),
        num_workers=1,
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f'trial_{trial.number}'))
    subsubrtn_distr = CEM(study_dir, env_sim, ddp_policy, **subsubrtn_distr_hparam, logger=csv_logger)
    obs_vel_weight = trial.suggest_loguniform('obs_vel_weight', 1, 100)
    subrtn_distr_hparam = dict(
        metric=None,
        obs_dim_weight=[1, 1, 1, 1, obs_vel_weight, obs_vel_weight],
        num_rollouts_per_distr=trial.suggest_int('num_rollouts_per_distr', 20, 100),
        num_workers=1,
    )
    subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr, behav_policy, **subrtn_distr_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=trial.suggest_categorical('algo_max_iter', [10]),
        num_eval_rollouts=trial.suggest_categorical('algo_num_eval_rollouts', [5]),
        warmstart=trial.suggest_categorical('algo_warmstart', [True]),
        thold_succ_subrtn=trial.suggest_categorical('algo_thold_succ_subrtn', [50]),
        subrtn_snapshot_mode='latest',
    )
    algo = SimOpt(study_dir, env_sim, env_real, subrtn_policy, subrtn_distr, **algo_hparam, logger=csv_logger)

    # Jeeeha
    algo.train(seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env_real, algo.policy, num_workers=1, min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
    dt = args.dt
else:
    raise pyrado.ValueErr(
        msg="There was no time field in the loaded rollout to infer the time step size from, nor has it "
        "been specified explicitly! Please provide the time step size using --dt."
    )

if env_name == QBallBalancerSim.name:
    env = QBallBalancerSim(dt=dt)
elif env_name == QCartPoleSwingUpSim.name:
    env = QCartPoleSwingUpSim(dt=dt)
elif env_name == QQubeSwingUpSim.name:
    env = QQubeSwingUpSim(dt=dt)
elif env_name == "wam-bic":
    # Import lazily to avoid loading mujoco
    from pyrado.environments.mujoco.wam_bic import WAMBallInCupSim

    env = WAMBallInCupSim(num_dof=4)
    env.init_space = BoxSpace(-pyrado.inf, pyrado.inf, shape=env.init_space.shape)
elif env_name == "wam-jsc":
    # Import lazily to avoid loading mujoco
    from pyrado.environments.mujoco.wam_jsc import WAMJointSpaceCtrlSim

    env = WAMJointSpaceCtrlSim(num_dof=7)
    env.init_space = BoxSpace(-pyrado.inf, pyrado.inf,
if __name__ == '__main__': # Parse command line arguments args = get_argparser().parse_args() # Experiment (set seed before creating the modules) ex_dir = setup_experiment(QQubeSwingUpSim.name, f'{BayRn.name}-{PoWER.name}_{QQubeSwingUpAndBalanceCtrl.name}_sim2sim', f'rand-Mp-Mr_seed-{args.seed}') # Set seed if desired pyrado.set_seed(args.seed, verbose=True) # Environments env_sim_hparams = dict(dt=1/100., max_steps=600) env_sim = QQubeSwingUpSim(**env_sim_hparams) env_sim = DomainRandWrapperLive(env_sim, create_zero_var_randomizer(env_sim)) dp_map = get_default_domain_param_map_qq() env_sim = MetaDomainRandWrapper(env_sim, dp_map) env_real = QQubeSwingUpSim(**env_sim_hparams) env_real.domain_param = dict( Mp=0.024*1.1, Mr=0.095*1.1, ) env_real_hparams = env_sim_hparams env_real = wrap_like_other_env(env_real, env_sim) # PoWER + energy-based controller setup policy_hparam = dict(energy_gain=0.587, ref_energy=0.827, acc_max=10.) policy = QQubeSwingUpAndBalanceCtrl(env_sim.spec, **policy_hparam)
    # param_spec['m_pole'] = np.linspace(0.127*0.7, 0.127*1.3, num=11, endpoint=True)
    # param_spec['l_pole'] = np.linspace(0.641/2*0.7, 0.641/2*1.3, num=11, endpoint=True)

    # Get the experiments' directories to load from
    prefixes = [
        osp.join(pyrado.EXP_DIR, 'ENV_NAME', 'ALGO_NAME'),
    ]
    ex_names = [
        '',
    ]
    ex_labels = [
        '',
    ]

elif args.env_name == QQubeSwingUpSim.name:
    env = QQubeSwingUpSim(dt=args.dt, max_steps=args.max_steps)
    # param_spec['g'] = np.linspace(9.81*0.7, 9.81*1.3, num=11, endpoint=True)
    # param_spec['Rm'] = np.linspace(8.4*0.7, 8.4*1.3, num=11, endpoint=True)
    # param_spec['km'] = np.linspace(0.042*0.7, 0.042*1.3, num=11, endpoint=True)
    # param_spec['Mr'] = np.linspace(0.095*0.7, 0.095*1.3, num=11, endpoint=True)
    # param_spec['Lr'] = np.linspace(0.085*0.7, 0.085*1.3, num=11, endpoint=True)
    # param_spec['Dr'] = np.linspace(5e-6*0.2, 5e-6*5, num=11, endpoint=True)  # 5e-6
    # param_spec['Mp'] = np.linspace(0.024*0.7, 0.024*1.3, num=11, endpoint=True)
    # param_spec['Lp'] = np.linspace(0.129*0.7, 0.129*1.3, num=11, endpoint=True)
    # param_spec['Dp'] = np.linspace(1e-6*0.2, 1e-6*5, num=11, endpoint=True)  # 1e-6

    # Get the experiments' directories to load from
    prefixes = [
        osp.join(pyrado.EXP_DIR, 'ENV_NAME', 'ALGO_NAME'),
    ]
# Parse command line arguments args = get_argparser().parse_args() # Experiment (set seed before creating the modules) ex_dir = setup_experiment( QQubeSwingUpSim.name, f"{BayRn.name}-{PPO.name}_{FNNPolicy.name}", "rand-mass_pend_pole-mass_rot_pole-length_pend_pole-length_rot_pole_lower-std", ) # Set seed if desired pyrado.set_seed(args.seed, verbose=True) # Environments env_sim_hparams = dict(dt=1 / 100.0, max_steps=600) env_sim = QQubeSwingUpSim(**env_sim_hparams) env_sim = ActNormWrapper(env_sim) env_sim = DomainRandWrapperLive(env_sim, create_zero_var_randomizer(env_sim)) dp_map = create_default_domain_param_map_qq() env_sim = MetaDomainRandWrapper(env_sim, dp_map) env_real_hparams = dict(dt=1 / 500.0, max_steps=3000) env_real = QQubeSwingUpReal(**env_real_hparams) env_real = wrap_like_other_env(env_real, env_sim) # Policy policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh) policy = FNNPolicy(spec=env_sim.spec, **policy_hparam) # Critic
) t_end = 5 # s else: ex_dir = setup_experiment( QQubeSwingUpSim.name, f"{NPDR.name}_{TimePolicy.name}", num_segs_str + len_seg_str + seed_str, ) t_end = 10 # s # Set seed if desired pyrado.set_seed(args.seed, verbose=True) # Environments env_sim_hparams = dict(dt=1 / 250.0, max_steps=int(t_end * 250)) env_sim = QQubeSwingUpSim(**env_sim_hparams) # env_sim = ActDelayWrapper(env_sim) # Create the ground truth target domain and the behavioral policy if ectl: env_real = osp.join(pyrado.EVAL_DIR, f"qq-su_ectrl_250Hz_{t_end}s") # 5s long policy = QQubeSwingUpAndBalanceCtrl( env_sim.spec ) # replaced by the recorded actions if use_rec_act=True else: env_real = osp.join(pyrado.EVAL_DIR, f"qq_chrip_10to0Hz_+1.5V_250Hz_{t_end}s") assert use_rec_act policy = DummyPolicy(env_sim.spec) # replaced by recorded real actions
parser.add_argument("--cov_only", action="store_true") args = parser.parse_args() # Experiment (set seed before creating the modules) ex_dir = setup_experiment( QQubeSwingUpSim.name, f"{PPO.name}_{FNNPolicy.name}", f"{args.frequency}Hz_{args.max_steps}ROLen_{args.ppo_iterations}PPOIter_{args.sprl_iterations}SPRLIter_cov_only{args.cov_only}_seed_{args.seed}", ) # Set seed if desired pyrado.set_seed(args.seed, verbose=True) # Environment env_hparams = dict(dt=1 / float(args.frequency), max_steps=args.max_steps) env = QQubeSwingUpSim(**env_hparams) env = ActNormWrapper(env) # Policy policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh) # FNN # policy_hparam = dict(hidden_size=32, num_recurrent_layers=1) # LSTM & GRU policy = FNNPolicy(spec=env.spec, **policy_hparam) # policy = GRUPolicy(spec=env.spec, **policy_hparam) # Critic vfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.relu) # FNN # vfcn_hparam = dict(hidden_size=32, num_recurrent_layers=1) # LSTM & GRU vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam) # vfcn = GRUPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam) critic_hparam = dict(
from pyrado.utils.functions import skyline
from pyrado.utils.input_output import print_cbt


if __name__ == "__main__":
    # Parse command line arguments
    args = get_argparser().parse_args()

    dt = args.dt or 1 / 500.0
    t_end = 5.5  # s
    max_steps = int(t_end / dt)  # simulate for t_end seconds
    check_in_sim = False
    # max_amp = 5.0 / 180 * np.pi  # max. amplitude [rad]
    max_amp = -3.5  # max. amplitude [V]

    # Create the simulated and real environments
    if args.env_name == QQubeSwingUpReal.name:
        env_sim = QQubeSwingUpSim(dt, max_steps)
        env_real = QQubeSwingUpReal(dt, max_steps)
    elif args.env_name == QCartPoleSwingUpReal.name:
        env_sim = QCartPoleSwingUpSim(dt, max_steps)
        env_real = QCartPoleSwingUpReal(dt, max_steps)
    elif args.env_name == WAMReal.name:
        env_sim = WAMJointSpaceCtrlSim(frame_skip=4, num_dof=7, max_steps=max_steps)
        env_real = WAMJointSpaceCtrlRealStepBased(num_dof=7, max_steps=max_steps)
    else:
        raise pyrado.ValueErr(
            given=args.env_name,
            eq_constraint=f"{QQubeSwingUpReal.name}, {QCartPoleSwingUpReal.name} or {WAMReal.name}"