def test_training_parameter_exploring(ex_dir, env: SimEnv, algo, algo_hparam):
    # Environment and policy
    env = ActNormWrapper(env)
    policy_hparam = dict(feats=FeatureStack(const_feat, identity_feat))
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Get initial return for comparison
    rets_before = np.zeros(5)
    for i in range(rets_before.size):
        rets_before[i] = rollout(env, policy, eval=True, seed=i).undiscounted_return()

    # Create the algorithm and train
    algo_hparam["num_workers"] = 1
    algo = algo(ex_dir, env, policy, **algo_hparam)
    algo.train()
    policy.param_values = algo.best_policy_param  # mimic saving and loading

    # Compare returns before and after training for max_iter iterations
    rets_after = np.zeros_like(rets_before)
    for i in range(rets_before.size):
        rets_after[i] = rollout(env, policy, eval=True, seed=i).undiscounted_return()

    assert all(rets_after > rets_before)
def create_nonrecurrent_policy():
    return LinearPolicy(
        EnvSpec(
            BoxSpace(-1, 1, 4),
            BoxSpace(-1, 1, 3),
        ),
        FeatureStack(const_feat, identity_feat, squared_feat),
    )
def create_lin_setup(physicsEngine: str, dt: float, max_steps: int, checkJointLimits: bool):
    # Set up environment
    env = MiniGolfIKSim(
        usePhysicsNode=True,
        physicsEngine=physicsEngine,
        dt=dt,
        max_steps=max_steps,
        checkJointLimits=checkJointLimits,
        fixedInitState=True,
    )

    # Set up policy
    policy = LinearPolicy(env.spec, FeatureStack(const_feat))
    policy.param_values = to.tensor([0.6, 0.0, 0.03])  # X (abs), Y (rel), Z (abs), C (abs)

    return env, policy
def test_rbf_policy_serial(env: Env, num_feat_per_dim: int):
    rbf = RBFFeat(num_feat_per_dim=num_feat_per_dim, bounds=env.obs_space.bounds)
    fs = FeatureStack(rbf)
    policy = LinearPolicy(env.spec, fs)
    for _ in range(10):
        obs = env.obs_space.sample_uniform()
        obs = to.from_numpy(obs).to(dtype=to.get_default_dtype())
        act = policy(obs)
        assert act.shape == (env.act_space.flat_dim,)
def test_rff_policy_serial(env: Env, num_feat_per_dim: int):
    rff = RFFeat(inp_dim=env.obs_space.flat_dim, num_feat_per_dim=num_feat_per_dim, bandwidth=env.obs_space.bound_up)
    policy = LinearPolicy(env.spec, FeatureStack(rff))
    for _ in range(10):
        obs = env.obs_space.sample_uniform()
        obs = to.from_numpy(obs).to(dtype=to.get_default_dtype())
        act = policy(obs)
        assert act.shape == (env.act_space.flat_dim,)
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1 / 250.0, max_steps=1500)
    env = QQubeSwingUpSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        feats=FeatureStack(
            identity_feat, sign_feat, abs_feat, squared_feat, cubic_feat, ATan2Feat(1, 2), MultFeat((4, 5))
        )
    )
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=50,
        pop_size=trial.suggest_int("pop_size", 50, 200),
        num_init_states_per_domain=trial.suggest_int("num_init_states_per_domain", 4, 10),
        num_is_samples=trial.suggest_int("num_is_samples", 5, 40),
        expl_std_init=trial.suggest_uniform("expl_std_init", 0.1, 0.5),
        symm_sampling=trial.suggest_categorical("symm_sampling", [True, False]),
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f"trial_{trial.number}"))
    algo = PoWER(osp.join(study_dir, f"trial_{trial.number}"), env, policy, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode="latest", seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env, policy, num_workers=1, min_rollouts=min_rollouts
    )  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
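
# Hedged usage sketch (not part of the original script): as noted in the docstring above, Optuna passes only
# the `trial` argument to the objective, so `study_dir` and `seed` are bound via `functools.partial`.
# The study name, output directory, trial count, and job count below are illustrative assumptions.
if __name__ == "__main__":
    import functools

    study_dir = osp.join("temp", "hpo_qq-su_power")  # hypothetical output directory
    study = optuna.create_study(study_name="qq-su_power", direction="maximize")
    study.optimize(functools.partial(train_and_eval, study_dir=study_dir, seed=0), n_trials=100, n_jobs=1)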
def create_bob_setup():
    # Environments
    env_hparams = dict(dt=1 / 100.0, max_steps=500)
    env_real = BallOnBeamSim(**env_hparams)
    env_real.domain_param = dict(
        # l_beam=1.95,
        # ang_offset=-0.03,
        gravity_const=10.81
    )

    env_sim = BallOnBeamSim(**env_hparams)
    randomizer = DomainRandomizer(
        # NormalDomainParam(name="beam_length", mean=0, std=1e-6, clip_lo=1.5, clip_up=3.5),
        # UniformDomainParam(name="ang_offset", mean=0, halfspan=1e-6),
        NormalDomainParam(name="gravity_const", mean=0, std=1e-6),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        # 0: ("beam_length", "mean"), 1: ("beam_length", "std"),
        # 2: ("ang_offset", "mean"), 3: ("ang_offset", "halfspan")
        0: ("gravity_const", "mean"),
        1: ("gravity_const", "std"),
    }
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Policies (the behavioral policy needs to be deterministic)
    behavior_policy = LinearPolicy(env_sim.spec, feats=FeatureStack(identity_feat, sin_feat))
    behavior_policy.param_values = to.tensor([3.8090, -3.8036, -1.0786, -2.4510, -0.9875, -1.3252, 3.1503, 1.4443])
    prior = DomainRandomizer(
        # NormalDomainParam(name="beam_length", mean=2.05, std=2.05/10),
        # UniformDomainParam(name="ang_offset", mean=0.03, halfspan=0.03/10),
        NormalDomainParam(name="gravity_const", mean=8.81, std=8.81 / 10),
    )
    # trafo_mask = [False, True, False, True]
    trafo_mask = [True, True]
    ddp_policy = DomainDistrParamPolicy(mapping=dp_map, trafo_mask=trafo_mask, prior=prior, scale_params=True)

    return env_sim, env_real, env_hparams, dp_map, behavior_policy, ddp_policy
def test_rff_regression(ex_dir, num_feat_per_dim: int, loss_fcn: Callable, algo_hparam: dict):
    # Generate some data
    inputs = to.linspace(-4.0, 4.0, 8001).view(-1, 1)
    targets = noisy_nonlin_fcn(inputs, f=3.0, noise_std=0).view(-1, 1)

    # Create the policy
    rff = RFFeat(inp_dim=1, num_feat_per_dim=num_feat_per_dim, bandwidth=1 / 20)
    policy = LinearPolicy(EnvSpec(InfBoxSpace(shape=(1,)), InfBoxSpace(shape=(1,))), FeatureStack(rff))

    # Create the algorithm and train
    loss_before = loss_fcn(policy(inputs), targets)
    algo = NonlinRegression(ex_dir, inputs, targets, policy, **algo_hparam)
    algo.train()
    loss_after = loss_fcn(policy(inputs), targets)
    assert loss_after < loss_before
    assert algo.curr_iter >= algo_hparam["max_iter_no_improvement"]
f"{REPS.name}_{LinearPolicy.name}") # Set seed if desired pyrado.set_seed(args.seed, verbose=True) # Environment env_hparams = dict(dt=1 / 100.0, max_steps=500) env = BallOnBeamSim(**env_hparams) env = ActNormWrapper(env) # Policy policy_hparam = dict( # feats=FeatureStack(RFFeat(env.obs_space.flat_dim, num_feat=1000, bandwidth=1/env.obs_space.bound_up)) # feats=FeatureStack(RBFFeat(num_feat_per_dim=20, bounds=env.obs_space.bounds, scale=0.8)), feats=FeatureStack(identity_feat, sin_feat)) policy = LinearPolicy(spec=env.spec, **policy_hparam) # Algorithm algo_hparam = dict( max_iter=500, eps=0.2, pop_size=10 * policy.num_param, num_init_states_per_domain=10, expl_std_init=0.2, expl_std_min=0.02, num_epoch_dual=1000, optim_mode="scipy", lr_dual=1e-3, use_map=True, num_workers=8, )
env_hparams = dict(dt=1 / 100.0, max_steps=500)
env = BallOnBeamSim(**env_hparams)
env = ActNormWrapper(env)

# Policy
policy_hparam = dict(
    # feats=FeatureStack(
    #     [
    #         RFFeat(
    #             env.obs_space.flat_dim, num_feat_per_dim=500, bandwidth=1/env.obs_space.bound_up, use_cuda=True
    #         )
    #     ]
    # )
    # feats=FeatureStack(RBFFeat(num_feat_per_dim=20, bounds=env.obs_space.bounds, scale=None, use_cuda=True))
    feats=FeatureStack(identity_feat, sin_feat)
)
policy = LinearPolicy(spec=env.spec, **policy_hparam, use_cuda=True)

# Critic
vfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.tanh)
vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam, use_cuda=True)
critic_hparam = dict(
    gamma=0.99,
    lamda=0.95,
    batch_size=100,
    standardize_adv=True,
    lr_scheduler=lr_scheduler.ExponentialLR,
    lr_scheduler_hparam=dict(gamma=0.99),
)
critic = GAE(vfcn, **critic_hparam)
def test_sysidasrl_reps(ex_dir, env: SimEnv, num_eval_rollouts: int):
    pyrado.set_seed(0)

    def eval_ddp_policy(rollouts_real):
        init_states_real = np.array([ro.states[0, :] for ro in rollouts_real])
        rollouts_sim = []
        for i in range(num_eval_rollouts):
            rollouts_sim.append(
                rollout(env_sim, behavior_policy, eval=True, reset_kwargs=dict(init_state=init_states_real[i, :]))
            )

        # Truncate the rollouts, yielding two lists of pairwise equally long rollouts
        ros_real_tr, ros_sim_tr = algo.truncate_rollouts(rollouts_real, rollouts_sim, replicate=False)
        assert len(ros_real_tr) == len(ros_sim_tr)
        assert all([np.allclose(r.states[0, :], s.states[0, :]) for r, s in zip(ros_real_tr, ros_sim_tr)])

        # Return the average loss
        losses = [algo.loss_fcn(ro_r, ro_s) for ro_r, ro_s in zip(ros_real_tr, ros_sim_tr)]
        return float(np.mean(np.asarray(losses)))

    # Environments
    env_real = deepcopy(env)
    env_real.domain_param = dict(ang_offset=-2 * np.pi / 180)

    env_sim = deepcopy(env)
    randomizer = DomainRandomizer(
        UniformDomainParam(name="ang_offset", mean=0, halfspan=1e-6),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {0: ("ang_offset", "mean"), 1: ("ang_offset", "halfspan")}
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)
    assert env_real is not env_sim

    # Policies (the behavioral policy needs to be deterministic)
    behavior_policy = LinearPolicy(env_sim.spec, feats=FeatureStack(identity_feat))
    prior = DomainRandomizer(
        UniformDomainParam(name="ang_offset", mean=1 * np.pi / 180, halfspan=1 * np.pi / 180),
    )
    ddp_policy = DomainDistrParamPolicy(mapping=dp_map, trafo_mask=[False, True], prior=prior)

    # Subroutine
    subrtn_hparam = dict(
        max_iter=2,
        eps=1.0,
        pop_size=100,
        num_init_states_per_domain=1,
        expl_std_init=5e-2,
        expl_std_min=1e-4,
        num_workers=1,
    )
    subrtn = REPS(ex_dir, env_sim, ddp_policy, **subrtn_hparam)

    algo_hparam = dict(
        metric=None, obs_dim_weight=np.ones(env_sim.obs_space.shape), num_rollouts_per_distr=5, num_workers=1
    )
    algo = SysIdViaEpisodicRL(subrtn, behavior_policy, **algo_hparam)

    rollouts_real_tst = []
    for _ in range(num_eval_rollouts):
        rollouts_real_tst.append(rollout(env_real, behavior_policy, eval=True))
    loss_pre = eval_ddp_policy(rollouts_real_tst)

    # Mimic training
    while algo.curr_iter < algo.max_iter and not algo.stopping_criterion_met():
        algo.logger.add_value(algo.iteration_key, algo.curr_iter)

        # Create fake real-world data
        rollouts_real = []
        for _ in range(num_eval_rollouts):
            rollouts_real.append(rollout(env_real, behavior_policy, eval=True))

        algo.step(snapshot_mode="latest", meta_info=dict(rollouts_real=rollouts_real))
        algo.logger.record_step()
        algo._curr_iter += 1

    loss_post = eval_ddp_policy(rollouts_real_tst)
    assert loss_post <= loss_pre  # don't have to be better every step
def linear_policy_cuda(env: Env):
    return LinearPolicy(env.spec, DefaultPolicies.default_fs(), use_cuda=True)
def linear_policy(env: Env):
    return LinearPolicy(env.spec, DefaultPolicies.default_fs())
from tabulate import tabulate

from pyrado.environment_wrappers.action_normalization import ActNormWrapper
from pyrado.environments.pysim.ball_on_beam import BallOnBeamSim
from pyrado.policies.features import FeatureStack, identity_feat, squared_feat
from pyrado.policies.feed_back.linear import LinearPolicy
from pyrado.sampling.parallel_rollout_sampler import ParallelRolloutSampler


if __name__ == "__main__":
    # Set up environment
    env = BallOnBeamSim(dt=0.02, max_steps=500)
    env = ActNormWrapper(env)

    # Set up policy
    feats = FeatureStack(identity_feat, squared_feat)
    policy = LinearPolicy(env.spec, feats)

    # Set up sampler
    sampler = ParallelRolloutSampler(env, policy, num_workers=2, min_rollouts=2000)

    # Sample and print
    ros = sampler.sample()
    print(
        tabulate(
            {
                "StepSequence count": len(ros),
                "Step count": sum(map(len, ros)),
            }.items()
        )
    )