def test_training_parameter_exploring(ex_dir, env: SimEnv, algo, algo_hparam):
    # Environment and policy
    env = ActNormWrapper(env)
    policy_hparam = dict(feats=FeatureStack([const_feat, identity_feat]))
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Get initial return for comparison
    rets_before = np.zeros(5)
    for i in range(rets_before.size):
        rets_before[i] = rollout(env, policy, eval=True, seed=i).undiscounted_return()

    # Create the algorithm and train
    algo_hparam['num_workers'] = 1
    algo = algo(ex_dir, env, policy, **algo_hparam)
    algo.train()
    policy.param_values = algo.best_policy_param  # mimic saving and loading

    # Compare returns before and after training for max_iter iterations
    rets_after = np.zeros_like(rets_before)
    for i in range(rets_before.size):
        rets_after[i] = rollout(env, policy, eval=True, seed=i).undiscounted_return()

    assert all(rets_after > rets_before)
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna `Trial` object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1/250., max_steps=1500)
    env = QQubeSwingUpSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(feats=FeatureStack([
        identity_feat, sign_feat, abs_feat, squared_feat, cubic_feat,
        ATan2Feat(1, 2), MultFeat([4, 5])
    ]))
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=50,
        pop_size=trial.suggest_int('pop_size', 50, 200),
        num_rollouts=trial.suggest_int('num_rollouts', 4, 10),
        num_is_samples=trial.suggest_int('num_is_samples', 5, 40),
        expl_std_init=trial.suggest_uniform('expl_std_init', 0.1, 0.5),
        symm_sampling=trial.suggest_categorical('symm_sampling', [True, False]),
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f'trial_{trial.number}'))
    algo = PoWER(osp.join(study_dir, f'trial_{trial.number}'), env, policy, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env, policy, num_workers=1, min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros])/min_rollouts

    return mean_ret
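# A minimal sketch of how this objective function could be handed to Optuna (an assumption,
# not part of the original script). The study name, study directory, trial count, worker
# count, and seed below are hypothetical values; `functools.partial` supplies the extra
# arguments mentioned in the docstring's note.
if __name__ == '__main__':
    import functools

    study_dir = 'hparam_opt/qq-su_power'  # hypothetical directory
    study = optuna.create_study(study_name='qq-su_power_hparam', direction='maximize')
    study.optimize(functools.partial(train_and_eval, study_dir=study_dir, seed=1001),
                   n_trials=100, n_jobs=8)
    print(study.best_params)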
def test_rfb_policy_serial(env, num_feat_per_dim):
    rbf = RBFFeat(num_feat_per_dim=num_feat_per_dim, bounds=env.obs_space.bounds)
    fs = FeatureStack([rbf])
    policy = LinearPolicy(env.spec, fs)
    for _ in range(10):
        obs = env.obs_space.sample_uniform()
        act = policy(to.from_numpy(obs))
        assert act.shape == (env.act_space.flat_dim,)
def test_rff_policy_serial(env, num_feat_per_dim):
    rff = RandFourierFeat(inp_dim=env.obs_space.flat_dim, num_feat_per_dim=num_feat_per_dim,
                          bandwidth=env.obs_space.bound_up)
    policy = LinearPolicy(env.spec, FeatureStack([rff]))
    for _ in range(10):
        obs = env.obs_space.sample_uniform()
        act = policy(to.from_numpy(obs))
        assert act.shape == (env.act_space.flat_dim,)
def test_rfb_policy_batch(env, batch_size, num_feat_per_dim):
    rbf = RBFFeat(num_feat_per_dim=num_feat_per_dim, bounds=env.obs_space.bounds)
    fs = FeatureStack([rbf])
    policy = LinearPolicy(env.spec, fs)
    for _ in range(10):
        obs = env.obs_space.sample_uniform()
        obs = to.from_numpy(obs).repeat(batch_size, 1)
        act = policy(obs)
        assert act.shape == (batch_size, env.act_space.flat_dim)
def create_bob_setup():
    # Environments
    env_hparams = dict(dt=1/100., max_steps=500)
    env_real = BallOnBeamSim(**env_hparams)
    env_real.domain_param = dict(
        # l_beam=1.95,
        # ang_offset=-0.03,
        g=10.81)

    env_sim = BallOnBeamSim(**env_hparams)
    randomizer = DomainRandomizer(
        # NormalDomainParam(name='l_beam', mean=0, std=1e-12, clip_lo=1.5, clip_up=3.5),
        # UniformDomainParam(name='ang_offset', mean=0, halfspan=1e-12),
        NormalDomainParam(name='g', mean=0, std=1e-12),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        # 0: ('l_beam', 'mean'), 1: ('l_beam', 'std'),
        # 2: ('ang_offset', 'mean'), 3: ('ang_offset', 'halfspan')
        0: ('g', 'mean'), 1: ('g', 'std')
    }
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Policies (the behavioral policy needs to be deterministic)
    behavior_policy = LinearPolicy(env_sim.spec, feats=FeatureStack([identity_feat, sin_feat]))
    behavior_policy.param_values = to.tensor(
        [3.8090, -3.8036, -1.0786, -2.4510, -0.9875, -1.3252, 3.1503, 1.4443])
    prior = DomainRandomizer(
        # NormalDomainParam(name='l_beam', mean=2.05, std=2.05/10),
        # UniformDomainParam(name='ang_offset', mean=0.03, halfspan=0.03/10),
        NormalDomainParam(name='g', mean=8.81, std=8.81/10),
    )
    # trafo_mask = [False, True, False, True]
    trafo_mask = [True, True]
    ddp_policy = DomainDistrParamPolicy(mapping=dp_map, trafo_mask=trafo_mask, prior=prior, scale_params=True)

    return env_sim, env_real, env_hparams, dp_map, behavior_policy, ddp_policy
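# A minimal sketch of how the returned setup might be exercised (an assumption, not part of
# the original snippet): roll out the fixed behavioral policy once on the "real" environment
# and report the undiscounted return.
if __name__ == '__main__':
    env_sim, env_real, env_hparams, dp_map, behavior_policy, ddp_policy = create_bob_setup()
    ro = rollout(env_real, behavior_policy, eval=True)
    print(f'Undiscounted return on env_real: {ro.undiscounted_return()}')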
    0.05])  # ... rad/s, rad/s, m/s, m/s]
# env = ObsPartialWrapper(env, mask=[0, 0, 0, 0, 1, 1, 0, 0])
env = ActDelayWrapper(env)
randomizer = create_default_randomizer(env)
randomizer.add_domain_params(
    UniformDomainParam(name='act_delay', mean=5, halfspan=5, clip_lo=0, roundint=True))
env = DomainRandWrapperBuffer(env, randomizer)

# Policy
policy_hparam = dict(feats=FeatureStack([identity_feat]))
policy = LinearPolicy(spec=env.spec, **policy_hparam)

# Initialize with Quanser's PD gains
init_policy_param_values = to.tensor([
    -14., 0, -14*3.45, 0, 0, 0, -14*2.11, 0,
    0, -14., 0, -14*3.45, 0, 0, 0, -14*2.11
])

# Algorithm
subrtn_hparam_cand = dict(
    max_iter=100,
    num_rollouts=1,  # will be overwritten by SPOTA
    pop_size=50,
    expl_factor=1.1,
    expl_std_init=0.5,
    num_workers=8)
from tabulate import tabulate

from pyrado.environment_wrappers.action_normalization import ActNormWrapper
from pyrado.environments.pysim.ball_on_beam import BallOnBeamSim
from pyrado.policies.features import FeatureStack, identity_feat, squared_feat
from pyrado.policies.feed_forward.linear import LinearPolicy
from pyrado.sampling.parallel_rollout_sampler import ParallelRolloutSampler


if __name__ == '__main__':
    # Set up environment
    env = BallOnBeamSim(dt=0.02, max_steps=500)
    env = ActNormWrapper(env)

    # Set up policy
    feats = FeatureStack([identity_feat, squared_feat])
    policy = LinearPolicy(env.spec, feats)

    # Set up sampler
    sampler = ParallelRolloutSampler(env, policy, num_workers=2, min_rollouts=2000)

    # Sample and print
    ros = sampler.sample()
    print(tabulate({
        'StepSequence count': len(ros),
        'Step count': sum(map(len, ros)),
    }.items()))
def get_lin_ctrl(env: SimEnv, ctrl_type: str, ball_z_dim_mismatch: bool = True) -> LinearPolicy:
    """
    Construct a linear controller specified by its controller gains.
    Parameters for BallOnPlate5DSim by Markus Lamprecht (gains with magnitude < 1e-5 clipped to 0).

    :param env: environment
    :param ctrl_type: type of the controller: 'pd', 'lqr', or 'h2'
    :param ball_z_dim_mismatch: only useful for BallOnPlate5DSim; set to `True` if the given controller does not
                                have the z component (relative position) of the ball in the state vector,
                                i.e. the state is 14-dim instead of 16-dim
    :return: controller compatible with Pyrado Policy
    """
    from pyrado.environments.rcspysim.ball_on_plate import BallOnPlate5DSim

    if isinstance(inner_env(env), BallOnPlate5DSim):
        # Get the controller gains (K-matrix)
        if ctrl_type.lower() == 'lqr':
            ctrl_gains = to.tensor([
                [0.1401, 0, 0, 0, -0.09819, -0.1359, 0, 0.545, 0, 0, 0, -0.01417, -0.04427, 0],
                [0, 0.1381, 0, 0.2518, 0, 0, -0.2142, 0, 0.5371, 0, 0.03336, 0, 0, -0.1262],
                [0, 0, 0.1414, 0.0002534, 0, 0, -0.0002152, 0, 0, 0.5318, 0, 0, 0, -0.0001269],
                [0, -0.479, -0.0004812, 39.24, 0, 0, -15.44, 0, -1.988, -0.001934, 9.466, 0, 0, -13.14],
                [0.3039, 0, 0, 0, 25.13, 15.66, 0, 1.284, 0, 0, 0, 7.609, 6.296, 0]
            ])
        elif ctrl_type.lower() == 'h2':
            ctrl_gains = to.tensor([
                [-73.88, -2.318, 39.49, -4.270, 12.25, 0.9779, 0.2564, 35.11, 5.756, 0.8661, -0.9898, 1.421, 3.132, -0.01899],
                [-24.45, 0.7202, -10.58, 2.445, -0.6957, 2.1619, -0.3966, -61.66, -3.254, 5.356, 0.1908, 12.88, 6.142, -0.3812],
                [-101.8, -9.011, 64.345, -5.091, 17.83, -2.636, 0.9506, -44.28, 3.206, 37.59, 2.965, -32.65, -21.68, -0.1133],
                [-59.56, 1.56, -0.5794, 26.54, -2.503, 3.827, -7.534, 9.999, 1.143, -16.96, 8.450, -5.302, 4.620, -10.32],
                [-107.1, 0.4359, 19.03, -9.601, 20.33, 10.36, 0.2285, -74.98, -2.136, 7.084, -1.240, 62.62, 33.66, 1.790]
            ])
        else:
            raise pyrado.ValueErr(given=ctrl_type, eq_constraint="'lqr' or 'h2'")

        # Compensate for the mismatching state definition
        if ball_z_dim_mismatch:
            ctrl_gains = insert_tensor_col(ctrl_gains, 7, to.zeros((5, 1)))  # ball z position
            ctrl_gains = insert_tensor_col(ctrl_gains, -1, to.zeros((5, 1)))  # ball z velocity

    elif isinstance(inner_env(env), QBallBalancerSim):
        # Get the controller gains (K-matrix)
        if ctrl_type.lower() == 'pd':
            # Quanser gains (the original Quanser controller includes action clipping)
            ctrl_gains = -to.tensor([[-14., 0, -14*3.45, 0, 0, 0, -14*2.11, 0],
                                     [0, -14., 0, -14*3.45, 0, 0, 0, -14*2.11]])

        elif ctrl_type.lower() == 'lqr':
            # Since the control module can be tricky to install (using Anaconda is recommended),
            # we only import it if needed
            import control

            # System modeling
            A = np.zeros((env.obs_space.flat_dim, env.obs_space.flat_dim))
            A[:env.obs_space.flat_dim//2, env.obs_space.flat_dim//2:] = np.eye(env.obs_space.flat_dim//2)
            A[4, 4] = -env.B_eq_v/env.J_eq
            A[5, 5] = -env.B_eq_v/env.J_eq
            A[6, 0] = env.c_kin*env.m_ball*env.g*env.r_ball**2/env.zeta
            A[6, 6] = -env.c_kin*env.r_ball**2/env.zeta
            A[7, 1] = env.c_kin*env.m_ball*env.g*env.r_ball**2/env.zeta
            A[7, 7] = -env.c_kin*env.r_ball**2/env.zeta
            B = np.zeros((env.obs_space.flat_dim, env.act_space.flat_dim))
            B[4, 0] = env.A_m/env.J_eq
            B[5, 1] = env.A_m/env.J_eq
            # C = np.zeros((env.obs_space.flat_dim//2, env.obs_space.flat_dim))
            # C[:env.obs_space.flat_dim//2, :env.obs_space.flat_dim//2] = np.eye(env.obs_space.flat_dim//2)
            # D = np.zeros((env.obs_space.flat_dim//2, env.act_space.flat_dim))

            # Get the weighting matrices from the environment
            Q = env.task.rew_fcn.Q
            R = env.task.rew_fcn.R
            # Q = np.diag([1e2, 1e2, 5e2, 5e2, 1e-2, 1e-2, 1e+1, 1e+1])

            # Solve the continuous-time Riccati equation (use control.dlqr for a discrete-time system)
            K, _, _ = control.lqr(A, B, Q, R)
            ctrl_gains = to.from_numpy(K).to(to.get_default_dtype())

        else:
            raise pyrado.ValueErr(given=ctrl_type, eq_constraint="'pd', 'lqr'")

    else:
        raise pyrado.TypeErr(given=inner_env(env), expected_type=[BallOnPlate5DSim, QBallBalancerSim])

    # Reconstruct the controller
    feats = FeatureStack([identity_feat])
    ctrl = LinearPolicy(env.spec, feats)
    ctrl.init_param(-1*ctrl_gains)  # in classical control it is u = -K*x; here a = psi(s)*s
    return ctrl
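# A minimal usage sketch (an assumption, not part of the original script): build the nominal
# LQR controller for a simulated Quanser Ball Balancer and roll it out once. The time step
# and horizon below are hypothetical values.
if __name__ == '__main__':
    env = QBallBalancerSim(dt=1/500., max_steps=2500)
    ctrl = get_lin_ctrl(env, 'lqr')
    ro = rollout(env, ctrl, eval=True)
    print(f'Undiscounted return: {ro.undiscounted_return()}')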
def create_nonrecurrent_policy():
    return LinearPolicy(
        EnvSpec(
            BoxSpace(-1, 1, 4),
            BoxSpace(-1, 1, 3),
        ),
        FeatureStack([const_feat, identity_feat, squared_feat])
    )
from pyrado.utils.data_types import EnvSpec


if __name__ == '__main__':
    # Define some arbitrary EnvSpec
    obs_space = BoxSpace(bound_lo=np.array([-5., -12.]), bound_up=np.array([10., 6.]))
    act_space = BoxSpace(bound_lo=np.array([-1.]), bound_up=np.array([1.]))
    spec = EnvSpec(obs_space, act_space)

    num_fpd = 5
    num_eval_points = 500

    policy_hparam = dict(
        feats=FeatureStack([RBFFeat(num_feat_per_dim=num_fpd, bounds=obs_space.bounds, scale=None)])
    )
    policy = LinearPolicy(spec, **policy_hparam)

    eval_grid_0 = to.linspace(-5., 10, num_eval_points)
    eval_grid_1 = to.linspace(-12., 6, num_eval_points)
    eval_grid = to.stack([eval_grid_0, eval_grid_1], dim=1)

    feat_vals = to.empty(num_eval_points, num_fpd*obs_space.flat_dim)
    # Feed evaluation samples one by one
    for i, obs in enumerate(eval_grid):
        feat_vals[i, :] = policy.eval_feats(obs)

    # Evaluate all samples in one batch and check consistency with the serial evaluation
    feat_vals_batch = policy.eval_feats(eval_grid)
    if (feat_vals == feat_vals_batch).all():
        feat_vals = feat_vals_batch
    else:
        raise ValueError("The serial and the batched feature evaluations do not match!")  # assumed handling for the truncated else-branch
def linear_policy(env):
    return LinearPolicy(env.spec, DefaultPolicies.default_fs())
def test_sysidasrl(ex_dir, env: SimEnv, num_eval_rollouts):
    def eval_ddp_policy(rollouts_real):
        init_states_real = np.array([ro.rollout_info['init_state'] for ro in rollouts_real])
        rollouts_sim = []
        for i in range(num_eval_rollouts):
            rollouts_sim.append(
                rollout(env_sim, behavior_policy, eval=True,
                        reset_kwargs=dict(init_state=init_states_real[i, :])))

        # Clip the rollouts, yielding two lists of pairwise equally long rollouts
        ros_real_tr, ros_sim_tr = algo.truncate_rollouts(rollouts_real, rollouts_sim, replicate=False)
        assert len(ros_real_tr) == len(ros_sim_tr)
        assert all([
            np.allclose(r.rollout_info['init_state'], s.rollout_info['init_state'])
            for r, s in zip(ros_real_tr, ros_sim_tr)
        ])

        # Return the average loss
        losses = [algo.loss_fcn(ro_r, ro_s) for ro_r, ro_s in zip(ros_real_tr, ros_sim_tr)]
        return float(np.mean(np.asarray(losses)))

    # Environments
    env_real = deepcopy(env)
    env_real.domain_param = dict(ang_offset=-2*np.pi/180)

    env_sim = deepcopy(env)
    randomizer = DomainRandomizer(
        UniformDomainParam(name='ang_offset', mean=0, halfspan=1e-12),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {0: ('ang_offset', 'mean'), 1: ('ang_offset', 'halfspan')}
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    assert env_real is not env_sim

    # Policies (the behavioral policy needs to be deterministic)
    behavior_policy = LinearPolicy(env_sim.spec, feats=FeatureStack([identity_feat]))
    prior = DomainRandomizer(
        UniformDomainParam(name='ang_offset', mean=1*np.pi/180, halfspan=1*np.pi/180),
    )
    ddp_policy = DomainDistrParamPolicy(mapping=dp_map, trafo_mask=[False, True], prior=prior)

    # Subroutine
    subrtn_hparam = dict(
        max_iter=5,
        pop_size=40,
        num_rollouts=1,
        num_is_samples=4,
        expl_std_init=1*np.pi/180,
        expl_std_min=0.001,
        extra_expl_std_init=0.,
        extra_expl_decay_iter=5,
        num_workers=1,
    )
    subrtn = CEM(ex_dir, env_sim, ddp_policy, **subrtn_hparam)

    algo_hparam = dict(
        metric=None,
        obs_dim_weight=np.ones(env_sim.obs_space.shape),
        num_rollouts_per_distr=10,
        num_workers=1)
    algo = SysIdViaEpisodicRL(subrtn, behavior_policy, **algo_hparam)

    rollouts_real_tst = []
    for _ in range(num_eval_rollouts):
        rollouts_real_tst.append(rollout(env_real, behavior_policy, eval=True))
    loss_pre = eval_ddp_policy(rollouts_real_tst)

    # Mimic training
    while algo.curr_iter < algo.max_iter and not algo.stopping_criterion_met():
        algo.logger.add_value(algo.iteration_key, algo.curr_iter)

        # Create fake real-world data
        rollouts_real = []
        for _ in range(num_eval_rollouts):
            rollouts_real.append(rollout(env_real, behavior_policy, eval=True))

        algo.step(snapshot_mode='latest', meta_info=dict(rollouts_real=rollouts_real))
        algo.logger.record_step()
        algo._curr_iter += 1

    loss_post = eval_ddp_policy(rollouts_real_tst)
    assert loss_post <= loss_pre  # does not have to improve at every step