    raise pyrado.ValueErr(given=setup_type, eq_constraint="'idle', 'pos', 'vel'")

if randomize:
    dp_nom = env.get_nominal_domain_param()
    randomizer = DomainRandomizer(
        UniformDomainParam(name='box_mass', mean=dp_nom['box_mass'], halfspan=dp_nom['box_mass'] / 5),
        UniformDomainParam(name='box_width', mean=dp_nom['box_width'], halfspan=dp_nom['box_length'] / 5),
        UniformDomainParam(name='basket_friction_coefficient', mean=dp_nom['basket_friction_coefficient'],
                           halfspan=dp_nom['basket_friction_coefficient'] / 5),
    )
    env = DomainRandWrapperLive(env, randomizer)

# Simulate and plot
print('observations:\n', env.obs_space.labels)
done, param, state = False, None, None
while not done:
    ro = rollout(
        env,
        policy,
        render_mode=RenderMode(text=False, video=True),
        eval=True,
        max_steps=max_steps,
        reset_kwargs=dict(domain_param=param, init_state=state),
    )
    print_cbt(f'Return: {ro.undiscounted_return()}', 'g', bright=True)
    done, state, param = after_rollout_query(env, policy, ro)
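# Under the assumption that DomainRandWrapperLive draws a fresh set of domain parameters at every reset,
# a quick sanity check of the randomization is to reset a few times and print the sampled parameters.
# Minimal sketch only; print_domain_params is the helper used in the other scripts in this section.
for _ in range(3):
    env.reset()
    print_domain_params(env.domain_param)  # box_mass, box_width, and basket_friction_coefficient should differ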
# Use the recorded initial state from the real system
init_state = env.init_space.sample_uniform()
if real_data_exists:
    if input('Use the recorded initial state from the real system? [y] / n ').lower() in ('', 'y'):
        init_state[:env.num_dof] = qpos_real[0, :]

# Define indices of actuated joints
act_idcs = [1, 3, 5] if env.num_dof == 7 else [1, 3]

# Do rollout in simulation
ro = rollout(env, policy, eval=True, render_mode=RenderMode(video=False), reset_kwargs=dict(init_state=init_state))
t = ro.env_infos['t']
qpos_sim, qvel_sim = ro.env_infos['qpos'], ro.env_infos['qvel']
qpos_des, qvel_des = ro.env_infos['qpos_des'], ro.env_infos['qvel_des']

plot_des = False
if input('Plot the desired joint states and velocities (i.e. the policy features)? [y] / n ').lower() in ('', 'y'):
    plot_des = True

# Plot trajectories of the directly controlled joints and their corresponding desired trajectories
fig, ax = plt.subplots(nrows=len(act_idcs), ncols=2, figsize=(12, 8),
def render(self, mode: RenderMode = RenderMode(), render_step: int = 1):
    if mode.video:
        return self._gym_env.render()
def test_gru_policy(default_bob, gru_policy):
    ro = rollout(default_bob, gru_policy, render_mode=RenderMode(text=True))
    assert isinstance(ro, StepSequence)
env = BallOnPlate5DSim(
    physicsEngine='Vortex',  # Bullet or Vortex
    dt=dt,
    max_steps=2000,
)
env = ActNormWrapper(env)
print_domain_params(env.domain_param)

# Set up policy
policy = LSTMPolicy(env.spec, 20, 1)
policy.init_param()

# Simulate
ro = rollout(env, policy, render_mode=RenderMode(video=True), stop_on_done=True)

# Plot
fig, axs = plt.subplots(2, 1, figsize=(6, 8), sharex='all', tight_layout=True)
axs[0].plot(ro.observations[:, 1], label='plate y pos')
axs[1].plot(ro.observations[:, 2], label='plate z pos')
axs[0].legend()
axs[1].legend()
plt.show()

ro.torch()
print_cbt("Set up controller for the QQubeSwingUpReal environment.", "c") else: raise pyrado.ValueErr( given=args.env_name, eq_constraint= f"{QBallBalancerReal.name}, {QCartPoleSwingUpReal.name}, " f"{QCartPoleStabReal.name}, or {QQubeSwingUpReal.name}", ) # Run on device done = False print_cbt("Running predefined controller ...", "c", bright=True) while not done: ro = rollout(env, policy, eval=True, render_mode=RenderMode(text=args.verbose)) if args.save: pyrado.save( ro, "rollout_real.pkl", pyrado.TEMP_DIR, suffix=datetime.now().strftime(pyrado.timestamp_format), ) print_cbt(f"Saved rollout to {pyrado.TEMP_DIR}", "g") done, _, _ = after_rollout_query(env, policy, ro)
            -4.1818547e+00]))
    print_cbt('Set up controller for the QuanserCartpole environment.', 'c')

# Override the time step size if specified
if args.dt is not None:
    env.dt = args.dt

if args.remove_dr_wrappers:
    env = remove_all_dr_wrappers(env, verbose=True)

# Use the environment's number of steps in case of the default argument (inf)
max_steps = env.max_steps if args.max_steps == pyrado.inf else args.max_steps

# Simulate
done, state, param = False, None, None
while not done:
    ro = rollout(
        env,
        policy,
        render_mode=RenderMode(text=args.verbose, video=args.animation),
        eval=True,
        max_steps=max_steps,
        stop_on_done=not args.relentless,
        reset_kwargs=dict(domain_param=param, init_state=state),
    )
    print_domain_params(env.domain_param)
    print_cbt(f'Return: {ro.undiscounted_return()}', 'g', bright=True)
    done, state, param = after_rollout_query(env, policy, ro)

pyrado.close_vpython()
d_gains = np.array([7, 15, 5, 2.5, 0.3, 0.3, 0.05])

n = 1500  # number of steps
init_pos = np.array([0., -1.986, 0., 3.146, 0., 0., 0.])
zero_vel = np.zeros_like(init_pos)
goal_pos = np.array([0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9])

# Constants of the cubic polynomial connecting init_pos and goal_pos with zero velocity at both ends
diff = goal_pos - init_pos
c_1 = -2 * diff / n**3
c_2 = 3 * diff / n**2

# Environment
env = WAMSim()
env.reset(init_state=np.concatenate((init_pos, zero_vel)).ravel())
env.render(mode=RenderMode(video=True))
env.viewer._run_speed = 0.5

for i in range(1, n + 1000):
    if i < n:
        des_pos = c_1 * i**3 + c_2 * i**2 + init_pos
        des_vel = (3 * c_1 * i**2 + 2 * c_2 * i) / env.dt
    else:
        des_pos = goal_pos
        des_vel = zero_vel
    act = p_gains * (des_pos - env.state[:7]) + d_gains * (des_vel - env.state[7:])
    env.step(act)
    env.render(mode=RenderMode(video=True))

print('Desired Pos:', goal_pos)
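# The constants c_1 and c_2 above are chosen such that the cubic c_1 * i**3 + c_2 * i**2 + init_pos
# starts at init_pos with zero slope and reaches goal_pos at step n, again with zero slope.
# A short, self-contained check of these boundary conditions (same constants as above, nothing new assumed):
import numpy as np

n = 1500
init_pos = np.array([0., -1.986, 0., 3.146, 0., 0., 0.])
goal_pos = np.full(7, 0.9)
diff = goal_pos - init_pos
c_1 = -2 * diff / n**3
c_2 = 3 * diff / n**2

def cubic_pos(i):
    return c_1 * i**3 + c_2 * i**2 + init_pos  # desired position at step i

def cubic_vel(i):
    return 3 * c_1 * i**2 + 2 * c_2 * i  # derivative w.r.t. the step index (divided by dt in the script above)

assert np.allclose(cubic_pos(0), init_pos) and np.allclose(cubic_vel(0), 0.)
assert np.allclose(cubic_pos(n), goal_pos) and np.allclose(cubic_vel(n), 0.)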
def test_combination(env: SimEnv):
    pyrado.set_seed(0)
    env.max_steps = 20

    randomizer = create_default_randomizer(env)
    env_r = DomainRandWrapperBuffer(env, randomizer)
    env_r.fill_buffer(num_domains=3)

    dp_before = []
    dp_after = []
    for i in range(4):
        dp_before.append(env_r.domain_param)
        rollout(env_r, DummyPolicy(env_r.spec), eval=True, seed=0, render_mode=RenderMode())
        dp_after.append(env_r.domain_param)
        assert dp_after[i] != dp_before[i]
    assert dp_after[0] == dp_after[3]  # with a buffer of 3 domains, the 1st and 4th rollout see the same parameters

    env_rn = ActNormWrapper(env)
    elb = {"x_dot": -213.0, "theta_dot": -42.0}
    eub = {"x_dot": 213.0, "theta_dot": 42.0, "x": 0.123}
    env_rn = ObsNormWrapper(env_rn, explicit_lb=elb, explicit_ub=eub)
    alb, aub = env_rn.act_space.bounds
    assert all(alb == -1)
    assert all(aub == 1)
    olb, oub = env_rn.obs_space.bounds
    assert all(olb == -1)
    assert all(oub == 1)

    ro_r = rollout(env_r, DummyPolicy(env_r.spec), eval=True, seed=0, render_mode=RenderMode())
    ro_rn = rollout(env_rn, DummyPolicy(env_rn.spec), eval=True, seed=0, render_mode=RenderMode())
    assert np.allclose(env_rn._process_obs(ro_r.observations), ro_rn.observations)

    env_rnp = ObsPartialWrapper(env_rn, idcs=[env.obs_space.labels[2], env.obs_space.labels[3]])
    ro_rnp = rollout(env_rnp, DummyPolicy(env_rnp.spec), eval=True, seed=0, render_mode=RenderMode())

    env_rnpa = GaussianActNoiseWrapper(
        env_rnp,
        noise_mean=0.5 * np.ones(env_rnp.act_space.shape),
        noise_std=0.1 * np.ones(env_rnp.act_space.shape),
    )
    ro_rnpa = rollout(env_rnpa, DummyPolicy(env_rnpa.spec), eval=True, seed=0, render_mode=RenderMode())
    assert not np.allclose(ro_rnp.observations, ro_rnpa.observations)  # the action noise changed the rollout

    env_rnpd = ActDelayWrapper(env_rnp, delay=3)
    ro_rnpd = rollout(env_rnpd, DummyPolicy(env_rnpd.spec), eval=True, seed=0, render_mode=RenderMode())
    assert np.allclose(ro_rnp.actions, ro_rnpd.actions)
    assert not np.allclose(ro_rnp.observations, ro_rnpd.observations)

    assert type(inner_env(env_rnpd)) == type(env)
    assert typed_env(env_rnpd, ObsPartialWrapper) is not None
    assert isinstance(env_rnpd, ActDelayWrapper)
    env_rnpdr = remove_env(env_rnpd, ActDelayWrapper)
    assert not isinstance(env_rnpdr, ActDelayWrapper)
def rollout(env: Env,
            policy: [nn.Module, Policy],
            eval: bool = False,
            max_steps: int = None,
            reset_kwargs: dict = None,
            render_mode: RenderMode = RenderMode(),
            render_step: int = 1,
            bernoulli_reset: float = None,
            no_reset: bool = False,
            no_close: bool = False,
            record_dts: bool = False,
            stop_on_done: bool = True) -> StepSequence:
    """
    Perform a rollout (i.e. sample a trajectory) in the given environment using the given policy.

    :param env: environment to use (`SimEnv` or `RealEnv`)
    :param policy: policy to determine the next action given the current observation.
                   This policy may be wrapped by an exploration strategy.
    :param eval: flag if the rollout is executed during training (`False`) or during evaluation (`True`)
    :param max_steps: maximum number of time steps, if `None` the environment's property is used
    :param reset_kwargs: keyword arguments passed to the environment's reset function
    :param render_mode: determines if the user sees an animation, console prints, or nothing
    :param render_step: rendering interval, renders every step if set to 1
    :param bernoulli_reset: probability for resetting after the current time step
    :param no_reset: do not reset the environment before running the rollout
    :param no_close: do not close (and disconnect) the environment after running the rollout
    :param record_dts: flag if the time intervals of different parts of one step should be recorded (for debugging)
    :param stop_on_done: set to `False` to ignore the environment's done flag (for debugging)
    :return: paths of the observations, actions, rewards, and information about the environment as well as the policy
    """
    # Check the input
    if not isinstance(env, Env):
        raise pyrado.TypeErr(given=env, expected_type=Env)
    # Don't restrain the policy type, it can be any callable
    if not isinstance(eval, bool):
        raise pyrado.TypeErr(given=eval, expected_type=bool)
    # The max_steps argument is checked by the environment's setter
    if not (isinstance(reset_kwargs, dict) or reset_kwargs is None):
        raise pyrado.TypeErr(given=reset_kwargs, expected_type=dict)
    if not isinstance(render_mode, RenderMode):
        raise pyrado.TypeErr(given=render_mode, expected_type=RenderMode)

    # Initialize the paths
    obs_hist = []
    act_hist = []
    rew_hist = []
    env_info_hist = []
    if policy.is_recurrent:
        hidden_hist = []
    # If an ExplStrat is passed use the policy property, if a Policy is passed use it directly
    if isinstance(getattr(policy, 'policy', policy), (ADNPolicy, NFPolicy)):
        pot_hist = []
        stim_ext_hist = []
        stim_int_hist = []
    elif isinstance(getattr(policy, 'policy', policy), TwoHeadedPolicy):
        head_2_hist = []
    if record_dts:
        dt_policy_hist = []
        dt_step_hist = []
        dt_remainder_hist = []

    # Override the number of steps to execute
    if max_steps is not None:
        env.max_steps = max_steps

    # Reset the environment and pass the kwargs
    if reset_kwargs is None:
        reset_kwargs = {}
    if not no_reset:
        obs = env.reset(**reset_kwargs)
    else:
        obs = np.zeros(env.obs_space.shape)

    if isinstance(policy, Policy):
        # Reset the policy / the exploration strategy
        policy.reset()

        # Set dropout and batch normalization layers to the right mode
        if eval:
            policy.eval()
        else:
            policy.train()

    # Check for recurrent policy, which requires special handling
    if policy.is_recurrent:
        # Initialize hidden state var
        hidden = policy.init_hidden()

    # Set up rollout information
    rollout_info = dict(env_spec=env.spec)
    if isinstance(inner_env(env), SimEnv):
        rollout_info['domain_param'] = env.domain_param

    # Initialize animation
    env.render(render_mode, render_step=1)

    # Initialize the main loop variables
    done = False
    if record_dts:
        t_post_step = time.time()  # first sample of remainder is useless
    # ----------
    # Begin loop
    # ----------

    # Terminate if the environment signals done; the loop also keeps track of the time
    while not (done and stop_on_done) and env.curr_step < env.max_steps:
        # Record step start time
        if record_dts or render_mode.video:
            t_start = time.time()  # dual purpose
        if record_dts:
            dt_remainder = t_start - t_post_step

        # Check observations
        if np.isnan(obs).any():
            env.render(render_mode, render_step=1)
            raise pyrado.ValueErr(
                msg='At least one observation value is NaN!' +
                    tabulate([list(env.obs_space.labels),
                              [*color_validity(obs, np.invert(np.isnan(obs)))]], headers='firstrow')
            )

        # Get the agent's action
        obs_to = to.from_numpy(obs).type(to.get_default_dtype())  # policy operates on PyTorch tensors
        with to.no_grad():
            if policy.is_recurrent:
                if isinstance(getattr(policy, 'policy', policy), TwoHeadedPolicy):
                    act_to, head_2_to, hidden_next = policy(obs_to, hidden)
                else:
                    act_to, hidden_next = policy(obs_to, hidden)
            else:
                if isinstance(getattr(policy, 'policy', policy), TwoHeadedPolicy):
                    act_to, head_2_to = policy(obs_to)
                else:
                    act_to = policy(obs_to)

        # act_to = (to.tensor([-3.6915228, 31.47042, -6.827999, 11.602707]) @ obs_to).view(-1)
        # act_to = (to.tensor([-0.42, 18.45, -0.53, 1.53]) @ obs_to).view(-1)
        # act_to = (to.tensor([-0.2551887, 9.8527975, -4.421094, 10.82632]) @ obs_to).view(-1)
        # act_to = (to.tensor([0.18273291, 3.829101, -1.4158, 5.5001416]) @ obs_to).view(-1)
        # act_to = to.tensor([1.0078554, 4.221323, 0.032006, 4.909644, -2.201612]) @ obs_to
        # act_to = to.tensor([1.89549804, 4.74797034, -0.09684278, 5.51203606, -2.80852473]) @ obs_to
        # act_to = to.tensor([1.3555347, 3.8478632, -0.04043245, 7.40247, -3.580207]) @ obs_to + \
        #     0.1 * np.random.randn()
        # print(act_to)

        act = act_to.detach().cpu().numpy()  # environment operates on numpy arrays

        # Check actions
        if np.isnan(act).any():
            env.render(render_mode, render_step=1)
            raise pyrado.ValueErr(
                msg='At least one action value is NaN!' +
                    tabulate([list(env.act_space.labels),
                              [*color_validity(act, np.invert(np.isnan(act)))]], headers='firstrow')
            )

        # Record time after the action was calculated
        if record_dts:
            t_post_policy = time.time()

        # Ask the environment to perform the simulation step
        obs_next, rew, done, env_info = env.step(act)

        # Record time after the step, i.e. after the send and receive is completed
        if record_dts:
            t_post_step = time.time()
            dt_policy = t_post_policy - t_start
            dt_step = t_post_step - t_post_policy

        # Record data
        obs_hist.append(obs)
        act_hist.append(act)
        rew_hist.append(rew)
        env_info_hist.append(env_info)
        if record_dts:
            dt_policy_hist.append(dt_policy)
            dt_step_hist.append(dt_step)
            dt_remainder_hist.append(dt_remainder)
        if policy.is_recurrent:
            hidden_hist.append(hidden)
            hidden = hidden_next
        # If an ExplStrat is passed use the policy property, if a Policy is passed use it directly
        if isinstance(getattr(policy, 'policy', policy), (ADNPolicy, NFPolicy)):
            pot_hist.append(getattr(policy, 'policy', policy).potentials.detach().numpy())
            stim_ext_hist.append(getattr(policy, 'policy', policy).stimuli_external.detach().numpy())
            stim_int_hist.append(getattr(policy, 'policy', policy).stimuli_internal.detach().numpy())
        elif isinstance(getattr(policy, 'policy', policy), TwoHeadedPolicy):
            head_2_hist.append(head_2_to)

        # Store the observation for the next step (if done, this is the final observation)
        obs = obs_next

        # Render if wanted (actually renders the next state)
        env.render(render_mode, render_step)

        if render_mode.video:
            do_sleep = True
            if pyrado.mujoco_available:
                from pyrado.environments.mujoco.base import MujocoSimEnv
                if isinstance(env, MujocoSimEnv):
                    # MuJoCo environments seem to crash on time.sleep()
                    do_sleep = False
            if do_sleep:
                # Measure time spent and sleep if needed
                t_end = time.time()
                t_sleep = env.dt + t_start - t_end
                if t_sleep > 0:
                    time.sleep(t_sleep)

        # Stochastic reset to make the MDP ergodic (e.g. used for REPS)
        if bernoulli_reset is not None:
            assert 0. <= bernoulli_reset <= 1.
            # Stop the rollout with probability bernoulli_reset (most common choice is 1 - gamma)
            if binomial(1, bernoulli_reset):
                # The complete=True in the returned StepSequence sets the last done element to True
                break

    # --------
    # End loop
    # --------

    if not no_close:
        # Disconnect from the EnvReal instance (does nothing for EnvSim instances)
        env.close()

    # Add the final observation to the observations list
    obs_hist.append(obs)

    # Return result object
    res = StepSequence(
        observations=obs_hist,
        actions=act_hist,
        rewards=rew_hist,
        rollout_info=rollout_info,
        env_infos=env_info_hist,
        complete=True  # the rollout function always returns complete paths
    )

    # Add special entries to the resulting rollout
    if policy.is_recurrent:
        res.add_data('hidden_states', hidden_hist)
    if isinstance(getattr(policy, 'policy', policy), (ADNPolicy, NFPolicy)):
        res.add_data('potentials', pot_hist)
        res.add_data('stimuli_external', stim_ext_hist)
        res.add_data('stimuli_internal', stim_int_hist)
    elif isinstance(getattr(policy, 'policy', policy), TwoHeadedPolicy):
        res.add_data('head_2', head_2_hist)
    if record_dts:
        res.add_data('dts_policy', dt_policy_hist)
        res.add_data('dts_step', dt_step_hist)
        res.add_data('dts_remainder', dt_remainder_hist)

    return res
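# For reference, a minimal usage sketch of rollout(). It reuses the environment and policy construction
# from the BallOnPlate5DSim example earlier in this section; imports are omitted, and the dt value and the
# choice of physics engine are illustrative assumptions, not prescribed by the function above.
env = BallOnPlate5DSim(physicsEngine='Bullet', dt=0.01, max_steps=500)
policy = LSTMPolicy(env.spec, 20, 1)
policy.init_param()

# Evaluate the (untrained) policy for a single episode with the default RenderMode
ro = rollout(env, policy, eval=True, render_mode=RenderMode(), stop_on_done=True)
print(len(ro), ro.undiscounted_return())  # number of recorded steps and the sum of rewards
ro.torch()  # convert the recorded data to PyTorch tensors, as in the plotting example above

# Passing record_dts=True would additionally attach 'dts_policy', 'dts_step', and 'dts_remainder'
# entries to the returned StepSequence via add_data().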
def test_quanser_real_wo_connecting(env: RealEnv):
    assert env is not None
    env.render(RenderMode(text=True))
def test_rollout_wo_exploration(env, policy):
    ro = rollout(env, policy, render_mode=RenderMode())
    assert isinstance(ro, StepSequence)
    assert len(ro) <= env.max_steps