Example no. 1
        raise pyrado.ValueErr(given=setup_type,
                              eq_constraint="'idle', 'pos', 'vel'")

    if randomize:
        dp_nom = env.get_nominal_domain_param()
        randomizer = DomainRandomizer(
            UniformDomainParam(name='box_mass',
                               mean=dp_nom['box_mass'],
                               halfspan=dp_nom['box_mass'] / 5),
            UniformDomainParam(name='box_width',
                               mean=dp_nom['box_width'],
                               halfspan=dp_nom['box_width'] / 5),
            UniformDomainParam(name='basket_friction_coefficient',
                               mean=dp_nom['basket_friction_coefficient'],
                               halfspan=dp_nom['basket_friction_coefficient'] /
                               5))
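        # DomainRandWrapperLive draws a new set of domain parameters at every env.reset()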
        env = DomainRandWrapperLive(env, randomizer)

    # Simulate and plot
    print('observations:\n', env.obs_space.labels)
    done, state, param = False, None, None
    while not done:
        ro = rollout(env,
                     policy,
                     render_mode=RenderMode(text=False, video=True),
                     eval=True,
                     max_steps=max_steps,
                     reset_kwargs=dict(domain_param=param, init_state=state))
        print_cbt(f'Return: {ro.undiscounted_return()}', 'g', bright=True)
        done, state, param = after_rollout_query(env, policy, ro)
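        # after_rollout_query() interactively asks whether to continue and may
        # return a new initial state and domain parameters for the next rollout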
Example no. 2
    # Use the recorded initial state from the real system
    init_state = env.init_space.sample_uniform()
    if real_data_exists:
        if input(
                'Use the recorded initial state from the real system? [y] / n '
        ).lower() in ('', 'y'):
            init_state[:env.num_dof] = qpos_real[0, :]

    # Define indices of actuated joints
    act_idcs = [1, 3, 5] if env.num_dof == 7 else [1, 3]

    # Do rollout in simulation
    ro = rollout(env,
                 policy,
                 eval=True,
                 render_mode=RenderMode(video=False),
                 reset_kwargs=dict(init_state=init_state))
    t = ro.env_infos['t']
    qpos_sim, qvel_sim = ro.env_infos['qpos'], ro.env_infos['qvel']
    qpos_des, qvel_des = ro.env_infos['qpos_des'], ro.env_infos['qvel_des']

    plot_des = False
    if input(
            'Plot the desired joint states and velocities (i.e. the policy features)? [y] / n '
    ).lower() in ('', 'y'):
        plot_des = True

    # Plot trajectories of the directly controlled joints and their corresponding desired trajectories
    fig, ax = plt.subplots(nrows=len(act_idcs),
                           ncols=2,
                           figsize=(12, 8),
Example no. 3
def render(self, mode: RenderMode = RenderMode(), render_step: int = 1):
    if mode.video:
        return self._gym_env.render()
Example no. 4
def test_gru_policy(default_bob, gru_policy):
    ro = rollout(default_bob, gru_policy, render_mode=RenderMode(text=True))
    assert isinstance(ro, StepSequence)
Example no. 5
    env = BallOnPlate5DSim(
        physicsEngine='Vortex',  # Bullet or Vortex
        dt=dt,
        max_steps=2000,
    )
    env = ActNormWrapper(env)
    print_domain_params(env.domain_param)

    # Set up policy
    policy = LSTMPolicy(env.spec, 20, 1)
    policy.init_param()

    # Simulate
    ro = rollout(env,
                 policy,
                 render_mode=RenderMode(video=True),
                 stop_on_done=True)

    # Plot
    fig, axs = plt.subplots(2,
                            1,
                            figsize=(6, 8),
                            sharex='all',
                            tight_layout=True)
    axs[0].plot(ro.observations[:, 1], label='plate y pos')
    axs[1].plot(ro.observations[:, 2], label='plate z pos')
    axs[0].legend()
    axs[1].legend()
    plt.show()

    ro.torch()
Example no. 6
        print_cbt("Set up controller for the QQubeSwingUpReal environment.",
                  "c")

    else:
        raise pyrado.ValueErr(
            given=args.env_name,
            eq_constraint=
            f"{QBallBalancerReal.name}, {QCartPoleSwingUpReal.name}, "
            f"{QCartPoleStabReal.name}, or {QQubeSwingUpReal.name}",
        )

    # Run on device
    done = False
    print_cbt("Running predefined controller ...", "c", bright=True)
    while not done:
        ro = rollout(env,
                     policy,
                     eval=True,
                     render_mode=RenderMode(text=args.verbose))

        if args.save:
            pyrado.save(
                ro,
                "rollout_real.pkl",
                pyrado.TEMP_DIR,
                suffix=datetime.now().strftime(pyrado.timestamp_format),
            )
            print_cbt(f"Saved rollout to {pyrado.TEMP_DIR}", "g")

        done, _, _ = after_rollout_query(env, policy, ro)
Example no. 7
                                          -4.1818547e+00
                                      ]))

    print_cbt('Set up controller for the QuanserCartpole environment.', 'c')

    # Override the time step size if specified
    if args.dt is not None:
        env.dt = args.dt

    if args.remove_dr_wrappers:
        env = remove_all_dr_wrappers(env, verbose=True)

    # Use the environment's number of steps in case of the default argument (inf)
    max_steps = env.max_steps if args.max_steps == pyrado.inf else args.max_steps

    # Simulate
    done, state, param = False, None, None
    while not done:
        ro = rollout(env,
                     policy,
                     render_mode=RenderMode(text=args.verbose,
                                            video=args.animation),
                     eval=True,
                     max_steps=max_steps,
                     stop_on_done=not args.relentless,
                     reset_kwargs=dict(domain_param=param, init_state=state))
        print_domain_params(env.domain_param)
        print_cbt(f'Return: {ro.undiscounted_return()}', 'g', bright=True)
        done, state, param = after_rollout_query(env, policy, ro)
    pyrado.close_vpython()
Example no. 8
    d_gains = np.array([7, 15, 5, 2.5, 0.3, 0.3, 0.05])

    n = 1500  # Number of steps
    init_pos = np.array([0., -1.986, 0., 3.146, 0., 0., 0.])
    zero_vel = np.zeros_like(init_pos)
    goal_pos = np.array([0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9])

    # Coefficients of a cubic polynomial connecting init_pos and goal_pos in n steps
    diff = goal_pos - init_pos
    c_1 = -2 * diff / n**3
    c_2 = 3 * diff / n**2
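    # Boundary check: des_pos(n) = -2*diff + 3*diff + init_pos = goal_pos, and
    # the slope 3*c_1*n**2 + 2*c_2*n = -6*diff/n + 6*diff/n = 0 at i = n, i.e.
    # the trajectory reaches the goal with zero velocity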

    # Environment
    env = WAMSim()
    env.reset(init_state=np.concatenate((init_pos, zero_vel)).ravel())
    env.render(mode=RenderMode(video=True))
    env.viewer._run_speed = 0.5

    for i in range(1, n + 1000):
        if i < n:
            des_pos = c_1 * i**3 + c_2 * i**2 + init_pos
            des_vel = (3 * c_1 * i**2 + 2 * c_2 * i) / env.dt
        else:
            des_pos = goal_pos
            des_vel = zero_vel
        act = p_gains * (des_pos - env.state[:7]) + d_gains * (des_vel -
                                                               env.state[7:])
        env.step(act)
        env.render(mode=RenderMode(video=True))

    print('Desired Pos:', goal_pos)
Example no. 9
def test_combination(env: SimEnv):
    pyrado.set_seed(0)
    env.max_steps = 20

    randomizer = create_default_randomizer(env)
    env_r = DomainRandWrapperBuffer(env, randomizer)
    env_r.fill_buffer(num_domains=3)
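    # Each reset cycles to the next of the 3 buffered parameter sets, hence the
    # assertions below: parameters change per rollout and repeat with period 3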

    dp_before = []
    dp_after = []
    for i in range(4):
        dp_before.append(env_r.domain_param)
        rollout(env_r,
                DummyPolicy(env_r.spec),
                eval=True,
                seed=0,
                render_mode=RenderMode())
        dp_after.append(env_r.domain_param)
        assert dp_after[i] != dp_before[i]
    assert dp_after[0] == dp_after[3]

    env_rn = ActNormWrapper(env)
    elb = {"x_dot": -213.0, "theta_dot": -42.0}
    eub = {"x_dot": 213.0, "theta_dot": 42.0, "x": 0.123}
    env_rn = ObsNormWrapper(env_rn, explicit_lb=elb, explicit_ub=eub)
    alb, aub = env_rn.act_space.bounds
    assert all(alb == -1)
    assert all(aub == 1)
    olb, oub = env_rn.obs_space.bounds
    assert all(olb == -1)
    assert all(oub == 1)

    ro_r = rollout(env_r,
                   DummyPolicy(env_r.spec),
                   eval=True,
                   seed=0,
                   render_mode=RenderMode())
    ro_rn = rollout(env_rn,
                    DummyPolicy(env_rn.spec),
                    eval=True,
                    seed=0,
                    render_mode=RenderMode())
    assert np.allclose(env_rn._process_obs(ro_r.observations),
                       ro_rn.observations)

    env_rnp = ObsPartialWrapper(
        env_rn, idcs=[env.obs_space.labels[2], env.obs_space.labels[3]])
    ro_rnp = rollout(env_rnp,
                     DummyPolicy(env_rnp.spec),
                     eval=True,
                     seed=0,
                     render_mode=RenderMode())

    env_rnpa = GaussianActNoiseWrapper(
        env_rnp,
        noise_mean=0.5 * np.ones(env_rnp.act_space.shape),
        noise_std=0.1 * np.ones(env_rnp.act_space.shape))
    ro_rnpa = rollout(env_rnpa,
                      DummyPolicy(env_rnpa.spec),
                      eval=True,
                      seed=0,
                      render_mode=RenderMode())
    assert not np.allclose(
        ro_rnp.observations,
        ro_rnpa.observations)  # the action noise changed the rollout

    env_rnpd = ActDelayWrapper(env_rnp, delay=3)
    ro_rnpd = rollout(env_rnpd,
                      DummyPolicy(env_rnpd.spec),
                      eval=True,
                      seed=0,
                      render_mode=RenderMode())
    assert np.allclose(ro_rnp.actions, ro_rnpd.actions)
    assert not np.allclose(ro_rnp.observations, ro_rnpd.observations)
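    # Same seed, hence the same action sequence, but delaying the actions by
    # three steps changes the resulting observations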

    assert type(inner_env(env_rnpd)) == type(env)
    assert typed_env(env_rnpd, ObsPartialWrapper) is not None
    assert isinstance(env_rnpd, ActDelayWrapper)
    env_rnpdr = remove_env(env_rnpd, ActDelayWrapper)
    assert not isinstance(env_rnpdr, ActDelayWrapper)
Example no. 10
def rollout(env: Env,
            policy: Union[nn.Module, Policy],
            eval: bool = False,
            max_steps: int = None,
            reset_kwargs: dict = None,
            render_mode: RenderMode = RenderMode(),
            render_step: int = 1,
            bernoulli_reset: float = None,
            no_reset: bool = False,
            no_close: bool = False,
            record_dts: bool = False,
            stop_on_done: bool = True) -> StepSequence:
    """
    Perform a rollout (i.e. sample a trajectory) in the given environment using the given policy.

    :param env: environment to use (`SimEnv` or `RealEnv`)
    :param policy: policy to determine the next action given the current observation.
                   This policy may be wrapped by an exploration strategy.
    :param eval: flag if the rollout is executed during training (`False`) or during evaluation (`True`)
    :param max_steps: maximum number of time steps, if `None` the environment's property is used
    :param reset_kwargs: keyword arguments passed to environment's reset function
    :param render_mode: determines if the user sees an animation, console prints, or nothing
    :param render_step: rendering interval, renders every step if set to 1
    :param bernoulli_reset: probability for resetting after the current time step
    :param no_reset: do not reset the environment before running the rollout
    :param no_close: do not close (and disconnect) the environment after running the rollout
    :param record_dts: flag if the time intervals of different parts of one step should be recorded (for debugging)
    :param stop_on_done: set to False to ignore the environment's done flag (for debugging)
    :return: paths of the observations, actions, rewards, and information about the environment as well as the policy
    """
    # Check the input
    if not isinstance(env, Env):
        raise pyrado.TypeErr(given=env, expected_type=Env)
    # Don't restrain policy type, can be any callable
    if not isinstance(eval, bool):
        raise pyrado.TypeErr(given=eval, expected_type=bool)
    # The max_steps argument is checked by the environment's setter
    if not (isinstance(reset_kwargs, dict) or reset_kwargs is None):
        raise pyrado.TypeErr(given=reset_kwargs, expected_type=dict)
    if not isinstance(render_mode, RenderMode):
        raise pyrado.TypeErr(given=render_mode, expected_type=RenderMode)

    # Initialize the paths
    obs_hist = []
    act_hist = []
    rew_hist = []
    env_info_hist = []
    if policy.is_recurrent:
        hidden_hist = []
    # If an ExplStrat is passed use the policy property, if a Policy is passed use it directly
    if isinstance(getattr(policy, 'policy', policy), (ADNPolicy, NFPolicy)):
        pot_hist = []
        stim_ext_hist = []
        stim_int_hist = []
    elif isinstance(getattr(policy, 'policy', policy), TwoHeadedPolicy):
        head_2_hist = []
    if record_dts:
        dt_policy_hist = []
        dt_step_hist = []
        dt_remainder_hist = []

    # Override the number of steps to execute
    if max_steps is not None:
        env.max_steps = max_steps

    # Reset the environment and pass the kwargs
    if reset_kwargs is None:
        reset_kwargs = {}
    if not no_reset:
        obs = env.reset(**reset_kwargs)
    else:
        obs = np.zeros(env.obs_space.shape)

    if isinstance(policy, Policy):
        # Reset the policy / the exploration strategy
        policy.reset()

        # Set dropout and batch normalization layers to the right mode
        if eval:
            policy.eval()
        else:
            policy.train()

    # Check for recurrent policy, which requires special handling
    if policy.is_recurrent:
        # Initialize hidden state var
        hidden = policy.init_hidden()

    # Setup rollout information
    rollout_info = dict(env_spec=env.spec)
    if isinstance(inner_env(env), SimEnv):
        rollout_info['domain_param'] = env.domain_param

    # Initialize animation
    env.render(render_mode, render_step=1)

    # Initialize the main loop variables
    done = False
    if record_dts:
        t_post_step = time.time()  # first sample of remainder is useless

    # ----------
    # Begin loop
    # ----------

    # Run until the environment signals done (unless stop_on_done is disabled) or the step limit is reached
    while not (done and stop_on_done) and env.curr_step < env.max_steps:
        # Record step start time
        if record_dts or render_mode.video:
            t_start = time.time()  # used for the dt recording and for the render sleep below
        if record_dts:
            dt_remainder = t_start - t_post_step

        # Check observations
        if np.isnan(obs).any():
            env.render(render_mode, render_step=1)
            raise pyrado.ValueErr(
                msg='At least one observation value is NaN!' +
                    tabulate([list(env.obs_space.labels),
                              [*color_validity(obs, np.invert(np.isnan(obs)))]], headers='firstrow')
            )

        # Get the agent's action
        obs_to = to.from_numpy(obs).type(to.get_default_dtype())  # policy operates on PyTorch tensors
        with to.no_grad():
            if policy.is_recurrent:
                if isinstance(getattr(policy, 'policy', policy), TwoHeadedPolicy):
                    act_to, head_2_to, hidden_next = policy(obs_to, hidden)
                else:
                    act_to, hidden_next = policy(obs_to, hidden)
            else:
                if isinstance(getattr(policy, 'policy', policy), TwoHeadedPolicy):
                    act_to, head_2_to = policy(obs_to)
                else:
                    act_to = policy(obs_to)

        act = act_to.detach().cpu().numpy()  # environment operates on numpy arrays

        # Check actions
        if np.isnan(act).any():
            env.render(render_mode, render_step=1)
            raise pyrado.ValueErr(
                msg='At least one action value is NaN!' +
                    tabulate([list(env.act_space.labels),
                              [*color_validity(act, np.invert(np.isnan(act)))]], headers='firstrow')
            )

        # Record time after the action was calculated
        if record_dts:
            t_post_policy = time.time()

        # Ask the environment to perform the simulation step
        obs_next, rew, done, env_info = env.step(act)

        # Record time after the step i.e. the send and receive is completed
        if record_dts:
            t_post_step = time.time()
            dt_policy = t_post_policy - t_start
            dt_step = t_post_step - t_post_policy

        # Record data
        obs_hist.append(obs)
        act_hist.append(act)
        rew_hist.append(rew)
        env_info_hist.append(env_info)
        if record_dts:
            dt_policy_hist.append(dt_policy)
            dt_step_hist.append(dt_step)
            dt_remainder_hist.append(dt_remainder)
        if policy.is_recurrent:
            hidden_hist.append(hidden)
            hidden = hidden_next
        # If an ExplStrat is passed use the policy property, if a Policy is passed use it directly
        if isinstance(getattr(policy, 'policy', policy), (ADNPolicy, NFPolicy)):
            pot_hist.append(getattr(policy, 'policy', policy).potentials.detach().numpy())
            stim_ext_hist.append(getattr(policy, 'policy', policy).stimuli_external.detach().numpy())
            stim_int_hist.append(getattr(policy, 'policy', policy).stimuli_internal.detach().numpy())
        elif isinstance(getattr(policy, 'policy', policy), TwoHeadedPolicy):
            head_2_hist.append(head_2_to)

        # Store the observation for next step (if done, this is the final observation)
        obs = obs_next

        # Render if wanted (actually renders the next state)
        env.render(render_mode, render_step)

        if render_mode.video:
            do_sleep = True
            if pyrado.mujoco_available:
                from pyrado.environments.mujoco.base import MujocoSimEnv
                if isinstance(env, MujocoSimEnv):
                    # MuJoCo environments seem to crash on time.sleep()
                    do_sleep = False
            if do_sleep:
                # Measure time spent and sleep if needed
                t_end = time.time()
                t_sleep = env.dt + t_start - t_end
                if t_sleep > 0:
                    time.sleep(t_sleep)

        # Stochastic reset to make the MDP ergodic (e.g. used for REPS)
        if bernoulli_reset is not None:
            assert 0. <= bernoulli_reset <= 1.
            # Stop the rollout with probability bernoulli_reset (most common choice is 1 - gamma)
            if binomial(1, bernoulli_reset):
                # The complete=True in the returned StepSequence sets the last done element to True
                break

    # --------
    # End loop
    # --------

    if not no_close:
        # Disconnect from EnvReal instance (does nothing for EnvSim instances)
        env.close()

    # Add final observation to observations list
    obs_hist.append(obs)

    # Return result object
    res = StepSequence(
        observations=obs_hist,
        actions=act_hist,
        rewards=rew_hist,
        rollout_info=rollout_info,
        env_infos=env_info_hist,
        complete=True  # the rollout function always returns complete paths
    )

    # Add special entries to the resulting rollout
    if policy.is_recurrent:
        res.add_data('hidden_states', hidden_hist)
    if isinstance(getattr(policy, 'policy', policy), (ADNPolicy, NFPolicy)):
        res.add_data('potentials', pot_hist)
        res.add_data('stimuli_external', stim_ext_hist)
        res.add_data('stimuli_internal', stim_int_hist)
    elif isinstance(getattr(policy, 'policy', policy), TwoHeadedPolicy):
        res.add_data('head_2', head_2_hist)
    if record_dts:
        res.add_data('dts_policy', dt_policy_hist)
        res.add_data('dts_step', dt_step_hist)
        res.add_data('dts_remainder', dt_remainder_hist)

    return res
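
A minimal usage sketch of the rollout() function above (an illustrative addition: `env` and `policy` stand in for any environment and policy constructed as in the earlier examples):

# Sketch, assuming `env` and `policy` exist as in the examples above
ro = rollout(env,
             policy,
             eval=True,  # put dropout / batch-norm layers into evaluation mode
             max_steps=500,  # temporarily overrides env.max_steps
             render_mode=RenderMode(text=False, video=False),
             reset_kwargs=dict(init_state=None),  # forwarded to env.reset()
             record_dts=True)  # additionally record policy / step timings
print(f'Return: {ro.undiscounted_return()}, steps: {len(ro)}')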
Example no. 11
def test_quanser_real_wo_connecting(env: RealEnv):
    assert env is not None
    env.render(RenderMode(text=True))
Example no. 12
def test_rollout_wo_exploration(env, policy):
    ro = rollout(env, policy, render_mode=RenderMode())
    assert isinstance(ro, StepSequence)
    assert len(ro) <= env.max_steps