def create_bob_setup():
    """
    Assemble the ball-on-beam setup consisting of a target environment, a randomized simulation, and two policies.

    Only the domain parameter `g` is overridden in the target environment and randomized in the simulation.

    :return: simulation wrapped for (meta) domain randomization, target environment, environment hyper-parameters,
             mapping from indices to (domain parameter name, distribution parameter name) pairs,
             behavioral policy, and domain distribution parameter policy
    """
    # Both environments are created from the same hyper-parameters
    env_hparams = dict(dt=1 / 100., max_steps=500)

    # Target environment with an overridden `g`
    # (alternatives that were tried earlier: l_beam=1.95, ang_offset=-0.03)
    env_real = BallOnBeamSim(**env_hparams)
    env_real.domain_param = dict(g=10.81)

    # Simulation environment whose randomizer starts from a (numerically) degenerate distribution over `g`
    # (alternatives that were tried earlier: a normal distribution over `l_beam` with mean=0, std=1e-12,
    # clip_lo=1.5, clip_up=3.5, and a uniform distribution over `ang_offset` with mean=0, halfspan=1e-12)
    sim_base = BallOnBeamSim(**env_hparams)
    g_randomizer = DomainRandomizer(NormalDomainParam(name='g', mean=0, std=1e-12))

    # Index -> (domain parameter name, distribution parameter name)
    # (the earlier 4-entry variant mapped 0/1 to l_beam mean/std and 2/3 to ang_offset mean/halfspan)
    dp_map = {0: ('g', 'mean'), 1: ('g', 'std')}
    env_sim = MetaDomainRandWrapper(DomainRandWrapperLive(sim_base, g_randomizer), dp_map)

    # Policies (the behavioral policy needs to be deterministic)
    sim_spec = env_sim.spec
    feats = FeatureStack([identity_feat, sin_feat])
    behavior_policy = LinearPolicy(sim_spec, feats=feats)
    behavior_policy.param_values = to.tensor(
        [3.8090, -3.8036, -1.0786, -2.4510, -0.9875, -1.3252, 3.1503, 1.4443]
    )

    # Prior over `g`, with the standard deviation set to a tenth of the mean
    # (earlier variant: l_beam with mean=2.05, std=2.05/10 and ang_offset with mean=0.03, halfspan=0.03/10)
    prior = DomainRandomizer(NormalDomainParam(name='g', mean=8.81, std=8.81 / 10))

    # One mask entry per entry of dp_map (the earlier 4-entry variant used [False, True, False, True])
    ddp_policy = DomainDistrParamPolicy(
        mapping=dp_map,
        trafo_mask=[True, True],
        prior=prior,
        scale_params=True,
    )

    return env_sim, env_real, env_hparams, dp_map, behavior_policy, ddp_policy
def __init__(self):
    """
    Run one evaluation rollout and afterwards populate the scene with a ball and a beam model.

    The rollout uses the names `env`, `policy`, `args`, and `max_steps`, which are not defined in this method and
    must be available in the enclosing scope. The single-character / numbered prints are progress markers.
    """
    ShowBase.__init__(self)

    # Nothing is known before the first rollout
    self.done, self.state, self.param = False, None, None

    print("a")
    render_mode = RenderMode(text=args.verbose, video=args.animation)
    reset_kwargs = dict(domain_param=self.param, init_state=self.state)
    self.ro = rollout(
        env,
        policy,
        render_mode=render_mode,
        eval=True,
        max_steps=max_steps,
        stop_on_done=not args.relentless,
        reset_kwargs=reset_kwargs,
    )
    print("hoi")

    # Report the domain parameters and the undiscounted return of the rollout
    print_domain_params(env.domain_param)
    print_cbt(f"Return: {self.ro.undiscounted_return()}", "g", bright=True)
    query_result = after_rollout_query(env, policy, self.ro)
    self.done, self.state, self.param = query_result

    print("1")
    # NOTE(review): only one positional argument (2) is passed here; confirm which constructor parameter it binds to
    self.bob = BallOnBeamSim(2)
    print("2")
    anim_setup = self.bob._init_anim()
    self.pos, self.r_ball, self.a, self.l_beam, self.d_beam = anim_setup
    print("3")

    # Ball model, placed at the position returned by `_init_anim`
    self.ball = ball_model = self.loader.loadModel("my_models/ball")
    ball_model.reparentTo(self.render)
    ball_model.setPos(self.pos)

    # Box model for the beam, placed at the origin and scaled with the beam dimensions
    self.box = beam_model = self.loader.loadModel("my_models/box")
    beam_model.reparentTo(self.render)
    beam_model.setPos(0, 0, 0)
    beam_model.setScale(self.l_beam, self.d_beam, 2 * self.d_beam)

    self.camera.setPos(0, -10, 0)
def create_default_randomizer_bob() -> DomainRandomizer:
    """
    Build the default domain randomizer for the `BallOnBeamSim`.

    :return: randomizer based on the nominal domain parameter values
    """
    from pyrado.environments.pysim.ball_on_beam import BallOnBeamSim

    nominal = BallOnBeamSim.get_nominal_domain_param()

    # (parameter name, divisor of the nominal value that yields the std, lower clipping bound)
    normal_specs = (
        ('g', 10, 1e-4),
        ('m_ball', 5, 1e-4),
        ('r_ball', 5, 1e-4),
        ('m_beam', 5, 1e-3),
        ('l_beam', 5, 1e-3),
        ('d_beam', 5, 1e-3),
    )
    normal_params = [
        NormalDomainParam(name=name, mean=nominal[name], std=nominal[name]/divisor, clip_lo=lower)
        for name, divisor, lower in normal_specs
    ]

    # The friction coefficient spans one nominal value to each side of the nominal value, clipped at zero
    frict_nom = nominal['c_frict']
    uniform_params = [
        UniformDomainParam(name='c_frict', mean=frict_nom, halfspan=frict_nom, clip_lo=0),
        # The angular offset is specified in degrees and converted to radians
        UniformDomainParam(name='ang_offset', mean=0./180*np.pi, halfspan=0.1/180*np.pi),
    ]

    return DomainRandomizer(*normal_params, *uniform_params)
def create_default_randomizer_bob() -> DomainRandomizer:
    """
    Build the default domain randomizer for the `BallOnBeamSim`.

    :return: randomizer based on the nominal domain parameter values
    """
    from pyrado.environments.pysim.ball_on_beam import BallOnBeamSim

    nominal = BallOnBeamSim.get_nominal_domain_param()

    # (parameter name, divisor of the nominal value that yields the std, lower clipping bound)
    normal_specs = (
        ("gravity_const", 10, 1e-4),
        ("ball_mass", 5, 1e-4),
        ("ball_radius", 5, 1e-4),
        ("beam_mass", 5, 1e-3),
        ("beam_length", 5, 1e-3),
        ("beam_thickness", 5, 1e-3),
    )
    normal_params = [
        NormalDomainParam(name=name, mean=nominal[name], std=nominal[name] / divisor, clip_lo=lower)
        for name, divisor, lower in normal_specs
    ]

    # The friction coefficient spans one nominal value to each side of the nominal value, clipped at zero
    frict_nom = nominal["friction_coeff"]
    uniform_params = [
        UniformDomainParam(name="friction_coeff", mean=frict_nom, halfspan=frict_nom, clip_lo=0),
        # The angular offset is specified in degrees and converted to radians
        UniformDomainParam(name="ang_offset", mean=0.0 / 180 * np.pi, halfspan=0.1 / 180 * np.pi),
    ]

    return DomainRandomizer(*normal_params, *uniform_params)
'ident-sin', seed=1001) """ Set up the environment a.k.a. domain to train in. After creating the environment, you can apply various wrappers which are modular. Note that the order of wrappers might be of importance. For example, wrapping an environment with an `ObsNormWrapper` and then with a `GaussianObsNoiseWrapper` applies the noise on the normalized observations, and yields different results than the reverse order of wrapping. Environments in Pyrado can be of different types: (i) written in Python only (like the Quanser simulations or simple OpenAI Gym environments), (ii) wrapped as well as self-designed MuJoCo-based simulations, or (iii) self-designed robotic environments powered by Rcs using either the Bullet or Vortex physics engine. None of the simulations includes any computer vision aspects. It is all about dynamics-based interaction and (continuous) control. The degree of randomization for the environments varies strongly, since it is a lot of work to randomize them properly (including testing) and I have to graduate after all ;) """ env_hparams = dict(dt=1 / 50., max_steps=300) env = BallOnBeamSim(**env_hparams) env = ActNormWrapper(env) """ Set up the policy after the environment since it needs to know the dimensions of the policy's observation and action space. There are many different policy architectures available under `Pyrado/pyrado/policies`, which significantly vary in terms of required hyper-parameters. You can find some examples at `Pyrado/scripts/training`. Note that all policies must inherit from `Policy` which inherits from `torch.nn.Module`. Moreover, all `Policy` instances are deterministic. The exploration is handled separately (see `Pyrado/pyrado/exploration`). """ policy_hparam = dict(feats=FeatureStack([identity_feat, sin_feat])) policy = LinearPolicy(spec=env.spec, **policy_hparam) """ Specify the algorithm you want to use for learning the policy parameters. For deterministic sampling, you need to set `num_sampler_envs=1`. 
If `num_sampler_envs>1`, PyTorch's multiprocessing library will be used to parallelize sampling from the environment on the CPU. The resulting behavior is non-deterministic, i.e. even for the same random seed, you will get different results.
import pytest from pyrado.environments.pysim.ball_on_beam import BallOnBeamSim from pyrado.environments.pysim.quanser_ball_balancer import QBallBalancerSim from pyrado.environments.sim_base import SimEnv from pyrado.exploration.stochastic_action import NormalActNoiseExplStrat from pyrado.exploration.stochastic_params import NormalParamNoise from pyrado.policies.base import Policy from pyrado.policies.features import * @pytest.mark.parametrize( "env", [ BallOnBeamSim(dt=0.02, max_steps=1), QBallBalancerSim(dt=0.02, max_steps=1), ], ids=["bob", "qbb"], ) @pytest.mark.parametrize("policy", ["linear_policy", "fnn_policy"], ids=["lin", "fnn"], indirect=True) def test_noise_on_act(env: SimEnv, policy: Policy): for _ in range(100): # Init the exploration strategy act_noise_strat = NormalActNoiseExplStrat(policy, std_init=0.5, train_mean=True) # Set new parameters for the exploration noise
rowvar = not data_along_rows cov_np = np.cov(x, rowvar=rowvar) cov_pyrado = cov(to.from_numpy(x), data_along_rows=data_along_rows).numpy() assert cov_pyrado.shape[0] == cov_pyrado.shape[1] if data_along_rows: assert cov_np.shape[0] == x.shape[1] assert cov_pyrado.shape[0] == x.shape[1] else: assert cov_np.shape[0] == x.shape[0] assert cov_pyrado.shape[0] == x.shape[0] assert np.allclose(cov_np, cov_pyrado) @pytest.mark.parametrize('env, expl_strat', [ (BallOnBeamSim(dt=0.02, max_steps=100), DummyPolicy(BallOnBeamSim(dt=0.02, max_steps=100).spec)), ], ids=['bob_dummy']) def test_concat_rollouts(env, expl_strat): ro1 = rollout(env, expl_strat) ro2 = rollout(env, expl_strat) ro_cat = StepSequence.concat([ro1, ro2]) assert isinstance(ro_cat, StepSequence) assert ro_cat.length == ro1.length + ro2.length @pytest.mark.parametrize('x, y', [ (to.tensor([1., 2., 3.]), to.tensor([1., 2., 3.])), (to.tensor([1., 0., 1.]), to.tensor([1., 1e12, 1.])), (to.tensor([0., 0., 0.]), to.tensor([1., 2, 3.])),
def default_omo():
    """
    Create a `BallOnBeamSim` with a step size of 0.02 and a horizon of 300 steps.

    NOTE(review): the function name suggests a different environment than the `BallOnBeamSim` constructed here;
    confirm against the callers whether this is intended.

    :return: the freshly constructed environment
    """
    sim_kwargs = dict(dt=0.02, max_steps=300)
    return BallOnBeamSim(**sim_kwargs)
def default_bob():
    """
    Create a `BallOnBeamSim` with a step size of 0.01 and a horizon of 500 steps.

    :return: the freshly constructed environment
    """
    sim_kwargs = dict(dt=0.01, max_steps=500)
    return BallOnBeamSim(**sim_kwargs)
elif args.env_name == QBallBalancerSim.name: env = QBallBalancerSim(dt=dt, max_steps=int(5 / dt)) state = np.array( [2 / 180 * np.pi, 2 / 180 * np.pi, 0.1, -0.08, 0, 0, 0, 0]) elif args.env_name == OneMassOscillatorSim.name: env = OneMassOscillatorSim(dt=dt, max_steps=int(5 / dt)) state = np.array([-0.7, 0]) elif args.env_name == PendulumSim.name: env = PendulumSim(dt=dt, max_steps=int(5 / dt)) state = np.array([87 / 180 * np.pi, 0]) elif args.env_name == BallOnBeamSim.name: env = BallOnBeamSim(dt=dt, max_steps=int(5 / dt)) state = np.array([-0.25, 0, 0, +20 / 180 * np.pi]) else: raise pyrado.ValueErr( given=args.env_name, eq_constraint= f"{QCartPoleSwingUpSim.name}, {QQubeSwingUpSim.name}, {QBallBalancerSim.name}, " f"{OneMassOscillatorSim.name}, {PendulumSim.name}, or {BallOnBeamSim.name}", ) policy = IdlePolicy(env.spec) # Simulate done, param = False, None while not done:
policy_infos=policy_infos, hidden=hidden, data_format=data_format) # Pickle/unpickle ro2 = pickle.loads(pickle.dumps(ro, pickle.HIGHEST_PROTOCOL)) for step, step_pi in zip(ro, ro2): assert step.reward == step_pi.reward assert (step.observation == step_pi.observation).all() assert (step.action == step_pi.action).all() assert step.done == step_pi.done @pytest.mark.parametrize('env', [ BallOnBeamSim(dt=0.01, max_steps=200), ], ids=['bob_linpol']) def test_advantage_calculation(env, linear_policy): ro = rollout(env, linear_policy) gamma = 0.99 lamb = 0.95 # Add dummy values values = np.ones_like(ro.rewards) if not ro.done[-1]: values = to.cat([values, 0]) ro.add_data('values', values) gae1 = gae_returns(ro, gamma, lamb)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. """ Script to sample some rollouts using the ParallelRolloutSampler """ from tabulate import tabulate from pyrado.environment_wrappers.action_normalization import ActNormWrapper from pyrado.environments.pysim.ball_on_beam import BallOnBeamSim from pyrado.policies.features import FeatureStack, identity_feat, squared_feat from pyrado.policies.feed_back.linear import LinearPolicy from pyrado.sampling.parallel_rollout_sampler import ParallelRolloutSampler if __name__ == "__main__": # Set up environment env = BallOnBeamSim(dt=0.02, max_steps=500) env = ActNormWrapper(env) # Set up policy feats = FeatureStack(identity_feat, squared_feat) policy = LinearPolicy(env.spec, feats) # Set up sampler sampler = ParallelRolloutSampler(env, policy, num_workers=2, min_rollouts=2000) # Sample and print ros = sampler.sample() print(
cov_np = np.cov(x, rowvar=rowvar) cov_pyrado = cov(to.from_numpy(x), data_along_rows=data_along_rows).numpy() assert cov_pyrado.shape[0] == cov_pyrado.shape[1] if data_along_rows: assert cov_np.shape[0] == x.shape[1] assert cov_pyrado.shape[0] == x.shape[1] else: assert cov_np.shape[0] == x.shape[0] assert cov_pyrado.shape[0] == x.shape[0] assert np.allclose(cov_np, cov_pyrado) @pytest.mark.parametrize( 'env, expl_strat', [ (BallOnBeamSim(dt=0.02, max_steps=100), DummyPolicy(BallOnBeamSim(dt=0.02, max_steps=100).spec)), ], ids=['bob_dummy'] ) def test_concat_rollouts(env, expl_strat): ro1 = rollout(env, expl_strat) ro2 = rollout(env, expl_strat) ro_cat = StepSequence.concat([ro1, ro2]) assert isinstance(ro_cat, StepSequence) assert ro_cat.length == ro1.length + ro2.length @pytest.mark.parametrize( 'x, y', [ (to.tensor([1., 2., 3.]), to.tensor([1., 2., 3.])), (to.tensor([1., 0., 1.]), to.tensor([1., 1e12, 1.])),
env, noise_mean=0.1 * np.ones(env.act_space.shape), noise_std=0.2 * np.ones(env.act_space.shape)) for _ in range(3): # Sample some values rand_act = env.act_space.sample_uniform() wrapped_env.reset() obs_nom, _, _, _ = env.step(rand_act) obs_wrapped, _, _, _ = wrapped_env.step(rand_act) # Different actions can not lead to the same observation assert not np.all(obs_nom == obs_wrapped) @pytest.mark.wrappers @pytest.mark.parametrize('env', [ BallOnBeamSim(dt=0.05, max_steps=1), ], ids=['bob']) def test_order_act_noise_act_norm(env): # First noise wrapper then normalization wrapper wrapped_env_noise = GaussianActNoiseWrapper( env, noise_mean=0.2 * np.ones(env.act_space.shape), noise_std=0.1 * np.ones(env.act_space.shape)) wrapped_env_noise_norm = ActNormWrapper(wrapped_env_noise) # First normalization wrapper then noise wrapper wrapped_env_norm = ActNormWrapper(env) wrapped_env_norm_noise = GaussianActNoiseWrapper( wrapped_env_norm, noise_mean=0.2 * np.ones(env.act_space.shape),
'num_dim, method', [ (1, 'uniform'), (1, 'uniform'), (3, 'uniform'), (3, 'normal'), (3, 'Marsaglia'), (4, 'uniform'), (4, 'normal'), (4, 'Marsaglia'), (15, 'uniform'), (15, 'normal') ] ) def test_sample_from_unit_sphere_surface(num_dim, method): s = sample_from_hyper_sphere_surface(num_dim, method) assert 0.95 <= to.norm(s, p=2) <= 1.05 @pytest.mark.sampling @pytest.mark.parametrize( 'env, policy', [ (BallOnBeamSim(dt=0.02, max_steps=100), LinearPolicy(BallOnBeamSim(dt=0.02, max_steps=100).spec, FeatureStack([const_feat, identity_feat, squared_feat]))), (QBallBalancerSim(dt=0.02, max_steps=100), LinearPolicy(QBallBalancerSim(dt=0.02, max_steps=100).spec, FeatureStack([const_feat, identity_feat, squared_feat]))) ], ids=['bob_linpol', 'qbb_linpol'] ) def test_rollout_wo_exploration(env, policy): ro = rollout(env, policy, render_mode=RenderMode()) assert isinstance(ro, StepSequence) assert len(ro) <= env.max_steps @pytest.mark.parametrize( 'mean, cov', [
cov_pyrado = cov(to.from_numpy(x), data_along_rows=data_along_rows).numpy() assert cov_pyrado.shape[0] == cov_pyrado.shape[1] if data_along_rows: assert cov_np.shape[0] == x.shape[1] assert cov_pyrado.shape[0] == x.shape[1] else: assert cov_np.shape[0] == x.shape[0] assert cov_pyrado.shape[0] == x.shape[0] assert np.allclose(cov_np, cov_pyrado) @pytest.mark.parametrize( "env, expl_strat", [ (BallOnBeamSim(dt=0.02, max_steps=100), DummyPolicy(BallOnBeamSim(dt=0.02, max_steps=100).spec)), ], ids=["bob_dummy"], ) def test_concat_rollouts(env, expl_strat): ro1 = rollout(env, expl_strat) ro2 = rollout(env, expl_strat) ro_cat = StepSequence.concat([ro1, ro2]) assert isinstance(ro_cat, StepSequence) assert ro_cat.length == ro1.length + ro2.length @pytest.mark.parametrize( "x, y", [ (to.tensor([1.0, 2.0, 3.0]), to.tensor([1.0, 2.0, 3.0])),