Example No. 1
def test_truncate_rollouts(env, num_real_ros, num_sim_ros, max_real_steps,
                           max_sim_steps):
    policy = DummyPolicy(env.spec)
    ros_real = []
    ros_sim = []

    # Create the rollout data
    for _ in range(num_real_ros):
        ros_real.append(
            rollout(env,
                    policy,
                    eval=True,
                    max_steps=max_real_steps,
                    stop_on_done=True))
    for _ in range(num_sim_ros):
        ros_sim.append(
            rollout(env,
                    policy,
                    eval=True,
                    max_steps=max_sim_steps,
                    stop_on_done=True))

    # Truncate them
    ros_real_tr, ros_sim_tr = SysIdViaEpisodicRL.truncate_rollouts(
        ros_real, ros_sim)

    # Check that we obtained the right number of rollouts
    assert len(ros_real_tr) == len(ros_sim_tr)

    for ro_r, ro_s in zip(ros_real_tr, ros_sim_tr):
        # All individual truncated rollouts have the correct length
        assert ro_r.length == ro_s.length
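For intuition, the pairwise truncation checked above cuts each real/simulated rollout pair down to the length of the shorter one. A minimal stand-in using plain lists of lengths; this only illustrates the idea and is not the pyrado implementation:

# Illustrative stand-in for pairwise truncation (not the pyrado implementation)
lengths_real = [7, 5, 9]  # made-up rollout lengths
lengths_sim = [6, 8, 4]
truncated = [min(r, s) for r, s in zip(lengths_real, lengths_sim)]
assert truncated == [6, 5, 4]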
Example No. 2
def test_training_parameter_exploring(ex_dir, env: SimEnv, algo, algo_hparam):
    # Environment and policy
    env = ActNormWrapper(env)
    policy_hparam = dict(feats=FeatureStack([const_feat, identity_feat]))
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Get initial return for comparison
    rets_before = np.zeros(5)
    for i in range(rets_before.size):
        rets_before[i] = rollout(env, policy, eval=True,
                                 seed=i).undiscounted_return()

    # Create the algorithm and train
    algo_hparam['num_workers'] = 1
    algo = algo(ex_dir, env, policy, **algo_hparam)
    algo.train()
    policy.param_values = algo.best_policy_param  # mimic saving and loading

    # Compare returns before and after training for max_iter iterations
    rets_after = np.zeros_like(rets_before)
    for i in range(rets_before.size):
        rets_after[i] = rollout(env, policy, eval=True,
                                seed=i).undiscounted_return()

    assert all(rets_after > rets_before)
Example No. 3
    def _eval_cand_and_ref_one_domain(self, i: int) -> tuple:
        """
        Evaluate the candidate and the k-th reference solution (see outer loop) in the i-th domain using nJ rollouts.

        :param i: index of the domain to evaluate in
        :return: average return values for the candidate and the k-th reference in the i-th domain
        """
        cand_ret_avg = 0.0
        refs_ret_avg = 0.0

        # Do nJ rollouts for each set of physics params
        for r in range(self.nJ):
            # Set the circular index for the particular realization
            self.env_dr.ring_idx = i
            # Do the rollout and collect the return
            ro_cand = rollout(self.env_dr,
                              self._subrtn_cand.policy,
                              eval=True,
                              seed=self.base_seed + i * self.nJ + r)
            cand_ret_avg += ro_cand.undiscounted_return()

            # Set the circular index for the particular realization
            self.env_dr.ring_idx = i
            # Do the rollout and collect the return
            ro_ref = rollout(self.env_dr,
                             self._subrtn_refs.policy,
                             eval=True,
                             seed=self.base_seed + i * self.nJ + r)
            refs_ret_avg += ro_ref.undiscounted_return()

        return cand_ret_avg / self.nJ, refs_ret_avg / self.nJ  # average over nJ seeds
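The seed expression `self.base_seed + i * self.nJ + r` assigns one distinct seed per (domain, repetition) pair, so the candidate and the reference rollouts face identical random realizations. A tiny sanity check of that indexing, with assumed values for the base seed, nJ, and the number of domains:

# Check that base_seed + i * nJ + r enumerates unique seeds (values are assumed)
base_seed, nJ, num_domains = 1000, 3, 4
seeds = [base_seed + i * nJ + r for i in range(num_domains) for r in range(nJ)]
assert len(seeds) == len(set(seeds)) == num_domains * nJ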
Example No. 4
def create_qbb_setup(factor, dt, max_steps):
    # Set up environment
    init_state = np.array([0, 0, 0.1, 0.1, 0, 0, 0, 0])
    env = QBallBalancerSim(dt=dt, max_steps=max_steps)
    env = ActNormWrapper(env)

    # Set up policy
    policy = QBallBalancerPDCtrl(env.spec)

    # Simulate
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=max_steps,
    )
    act_500Hz = ro.actions

    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt * factor),
                          init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=int(max_steps / factor),
    )
    act_100Hz = ro.actions

    env = DownsamplingWrapper(env, factor)
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=max_steps,
    )
    act_500Hz_w = ro.actions

    # Time in seconds
    time_500Hz = np.linspace(0, int(len(act_500Hz) * dt), int(len(act_500Hz)))
    time_100Hz = np.linspace(0, int(len(act_100Hz) * dt), int(len(act_100Hz)))
    time_500Hz_w = np.linspace(0, int(len(act_500Hz_w) * dt),
                               int(len(act_500Hz_w)))

    # Plot
    _, axs = plt.subplots(nrows=2)
    for i in range(2):
        axs[i].plot(time_500Hz, act_500Hz[:, i], label="500 Hz (original)")
        axs[i].plot(time_100Hz, act_100Hz[:, i], label="100 Hz", ls="--")
        axs[i].plot(time_500Hz_w,
                    act_500Hz_w[:, i],
                    label="500 Hz (wrapped)",
                    ls="--")
        axs[i].legend()
        axs[i].set_ylabel(env.act_space.labels[i])
    axs[1].set_xlabel("time [s]")
Example No. 5
def create_qq_setup(factor, dt, max_steps, render_mode):
    # Set up environment
    init_state = np.array([0.1, 0.0, 0.0, 0.0])
    env = QQubeSwingUpSim(dt=dt, max_steps=max_steps)
    env = ActNormWrapper(env)

    # Set up policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Simulate
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=render_mode,
        max_steps=max_steps,
    )
    act_500Hz = ro.actions

    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt * factor),
                          init_state=init_state),
        render_mode=render_mode,
        max_steps=int(max_steps / factor),
    )
    act_100Hz = ro.actions

    env = DownsamplingWrapper(env, factor)
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=render_mode,
        max_steps=max_steps,
    )
    act_500Hz_w = ro.actions

    # Time in seconds
    time_500Hz = np.linspace(0, int(len(act_500Hz) * dt), int(len(act_500Hz)))
    time_100Hz = np.linspace(0, int(len(act_100Hz) * dt), int(len(act_100Hz)))
    time_500Hz_w = np.linspace(0, int(len(act_500Hz_w) * dt),
                               int(len(act_500Hz_w)))

    # Plot
    _, ax = plt.subplots(nrows=1)
    ax.plot(time_500Hz, act_500Hz, label="500 Hz (original)")
    ax.plot(time_100Hz, act_100Hz, label="100 Hz", ls="--")
    ax.plot(time_500Hz_w, act_500Hz_w, label="500 Hz (wrapped)", ls="--")
    ax.legend()
    ax.set_ylabel(env.act_space.labels)
    ax.set_xlabel("time [s]")
Example No. 6
def create_qq_setup(factor, dt, max_steps):
    # Set up environment
    init_state = np.array([0.1, 0.0, 0.0, 0.0])
    env = QQubeSwingUpSim(dt=dt, max_steps=max_steps)
    env = ActNormWrapper(env)

    # Set up policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Simulate
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=max_steps,
    )
    act_500Hz = ro.actions

    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt * factor),
                          init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=int(max_steps / factor),
    )
    act_100Hz = ro.actions
    act_100Hz_zoh = np.repeat(act_100Hz, factor, axis=0)  # zero-order hold

    env = DownsamplingWrapper(env, factor)
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=max_steps,
    )
    act_500Hz_wrapped = ro.actions

    # Plot
    _, ax = plt.subplots(nrows=1)
    ax.plot(act_500Hz, label="500 Hz (original)")
    ax.plot(act_100Hz_zoh, label="100 Hz (zoh)")
    ax.plot(act_500Hz_wrapped, label="500 Hz (wrapped)")
    ax.legend()
    ax.set_ylabel(env.act_space.labels)
    ax.set_xlabel("time steps")
    plt.show()
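The `np.repeat` call above is a zero-order hold: each 100 Hz action is repeated `factor` times so the low-rate trajectory can be overlaid on the 500 Hz one in the step-indexed plot. A minimal sketch with made-up values, assuming a downsampling factor of 5:

# Zero-order-hold upsampling sketch (signal values and factor are assumed)
import numpy as np

act_100Hz = np.array([0.0, 0.2, -0.1])
factor = 5  # 500 Hz / 100 Hz
act_100Hz_zoh = np.repeat(act_100Hz, factor, axis=0)
assert act_100Hz_zoh.shape[0] == factor * act_100Hz.shape[0]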
Example No. 7
def test_velocity_filter(plot: bool):
    # Set up environment
    env_gt = QQubeSwingUpSim(dt=1 / 500.0, max_steps=350)
    env_gt.init_space = SingularStateSpace(np.array([0.1, np.pi / 2, 3.0, 0]))
    env_filt = ObsVelFiltWrapper(env_gt,
                                 idcs_pos=["theta", "alpha"],
                                 idcs_vel=["theta_dot", "alpha_dot"])

    # Set up policy
    policy = IdlePolicy(env_gt.spec)

    # Simulate
    ro_gt = rollout(env_gt, policy)
    ro_filt = rollout(env_filt, policy)

    # Extract the velocity observations from both rollouts
    theta_dot_gt = ro_gt.observations[:, 4]
    alpha_dot_gt = ro_gt.observations[:, 5]
    theta_dot_filt = ro_filt.observations[:, 4]
    alpha_dot_filt = ro_filt.observations[:, 5]

    assert theta_dot_filt[0] != pytest.approx(
        theta_dot_gt[0])  # can't be equal since we set an init vel of 3 rad/s
    assert alpha_dot_filt[0] == pytest.approx(alpha_dot_gt[0], abs=1e-4)

    # Compute the error
    rmse_theta = rmse(theta_dot_gt, theta_dot_filt)
    rmse_alpha = rmse(alpha_dot_gt, alpha_dot_filt)

    if plot:
        from matplotlib import pyplot as plt

        # Plot the filtered signals versus the original observations
        plt.rc("text", usetex=True)
        fig, axs = plt.subplots(2, figsize=(16, 9))
        axs[0].plot(theta_dot_gt, label=r"$\dot{\theta}_{true}$")
        axs[0].plot(theta_dot_filt, label=r"$\dot{\theta}_{filt}$")
        axs[1].plot(alpha_dot_gt, label=r"$\dot{\alpha}_{true}$")
        axs[1].plot(alpha_dot_filt, label=r"$\dot{\alpha}_{filt}$")

        axs[0].set_title(rf"RMSE($\theta$): {rmse_theta}")
        axs[0].set_ylabel(r"$\dot{\theta}$ [rad/s]")
        axs[0].legend()
        axs[1].set_title(rf"RMSE($\alpha$): {rmse_alpha}")
        axs[1].set_xlabel("time steps")
        axs[1].set_ylabel(r"$\dot{\alpha}$ [rad/s]")
        axs[1].legend()
        plt.show()
Example No. 8
def create_manual_activation_setup(dt, max_steps, max_dist_force,
                                   physics_engine):
    # Set up environment
    env = Planar3LinkTASim(physicsEngine=physics_engine,
                           dt=dt,
                           max_steps=max_steps,
                           max_dist_force=max_dist_force,
                           positionTasks=True,
                           observeTaskSpaceDiscrepancy=True)
    print_domain_params(env.domain_param)

    # Set up policy
    def policy_fcn(t: float):
        pot = np.fromstring(input("Enter potentials for next step: "),
                            dtype=np.double,
                            count=3,
                            sep=' ')
        return 1 / (1 + np.exp(-pot))

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate
    return rollout(env,
                   policy,
                   render_mode=RenderMode(video=True),
                   stop_on_done=True)
Example No. 9
def rollout_dummy_rbf_policy_4dof():
    # Environment
    env = WAMBallInCupSim(
        num_dof=4,
        max_steps=3000,
        # Note, when tuning the task args: the `R` matrices are now 4x4 for the 4 dof WAM
        task_args=dict(R=np.zeros((4, 4)),
                       R_dev=np.diag([0.2, 0.2, 1e-2, 1e-2])),
    )

    # Stabilize ball and print out the stable state
    env.reset()
    act = np.zeros(env.spec.act_space.flat_dim)
    for i in range(1500):
        env.step(act)
        env.render(mode=RenderMode(video=True))

    # Print out the actual positions for the 4-dof WAM (only needed to set up the hard-coded values in the class)
    print('Ball pos:', env.sim.data.get_body_xpos('ball'))
    print('Cup goal:', env.sim.data.get_site_xpos('cup_goal'))
    print('Joint pos (incl. first rope angle):', env.sim.data.qpos[:5])

    # Apply DualRBFLinearPolicy and plot the joint states over the desired ones
    rbf_hparam = dict(num_feat_per_dim=7,
                      bounds=(np.array([0.]), np.array([1.])))
    policy = DualRBFLinearPolicy(env.spec, rbf_hparam, dim_mask=2)
    done, param = False, None
    while not done:
        ro = rollout(env,
                     policy,
                     render_mode=RenderMode(video=True),
                     eval=True,
                     reset_kwargs=dict(domain_param=param))
        print_cbt(f'Return: {ro.undiscounted_return()}', 'g', bright=True)
        done, _, param = after_rollout_query(env, policy, ro)
Example No. 10
def eval_damping():
    """ Plot joint trajectories for different joint damping parameters """
    # Load experiment and remove possible randomization wrappers
    ex_dir = ask_for_experiment()
    env, policy, _ = load_experiment(ex_dir)
    env = inner_env(env)
    env.domain_param = WAMBallInCupSim.get_nominal_domain_param()

    data = []
    t = []
    dampings = [0., 1e-2, 1e-1, 1e0]
    print_cbt(f'Run policy for damping coefficients: {dampings}')
    for d in dampings:
        env.reset(domain_param=dict(joint_damping=d))
        ro = rollout(env,
                     policy,
                     render_mode=RenderMode(video=False),
                     eval=True)
        t.append(ro.env_infos['t'])
        data.append(ro.env_infos['qpos'])

    fig, ax = plt.subplots(3, sharex='all')
    ls = ['k-', 'b--', 'g-.', 'r:']  # line style setting for better visibility
    for i, idx in enumerate([1, 3, 5]):
        for j in range(len(dampings)):
            ax[i].plot(t[j],
                       data[j][:, idx],
                       ls[j],
                       label=f'damping: {dampings[j]}')
            if i == 0:
                ax[i].legend()
        ax[i].set_ylabel(f'joint {idx} pos [rad]')
    ax[2].set_xlabel('time [s]')
    plt.suptitle('Evaluation of joint damping coefficient')
    plt.show()
Example No. 11
def create_joint_control_setup(dt, max_steps, max_dist_force, physics_engine):
    # Set up environment
    env = Planar3LinkJointCtrlSim(
        physicsEngine=physics_engine,
        dt=dt,
        max_steps=max_steps,
        max_dist_force=max_dist_force,
        taskCombinationMethod="sum",
        checkJointLimits=True,
    )
    print_domain_params(env.domain_param)

    # Set up policy
    def policy_fcn(t: float):
        return [
            10 / 180 * math.pi,
            10 / 180 * math.pi,  # same as init config
            10 / 180 * math.pi +
            45.0 / 180.0 * math.pi * math.sin(2.0 * math.pi * 0.2 * t),
        ]  # oscillation in last link

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate
    return rollout(env,
                   policy,
                   render_mode=RenderMode(video=True),
                   stop_on_done=True)
Example No. 12
def _ps_sample_one(G):
    """
    Sample one rollout and return it together with its length in steps.
    This function is used when a minimum number of steps was given.
    """
    ro = rollout(G.env, G.policy, bernoulli_reset=G.bernoulli_reset)
    return ro, len(ro)
Example No. 13
    def _handle_neg_samples(self, cand_rets: np.ndarray, refs_rets: np.ndarray,
                            k: int, i: int) -> np.ndarray:
        """
        Process negative optimality gap samples by looking at the other reference solutions.

        :param cand_rets: array of the candidate's return values
        :param refs_rets: array of the references' return values
        :param k: index of the reference solution
        :param i: index of the domain
        :return refs_rets: the references' return values; if a better reference has been found, the associated value is overwritten
        """
        if refs_rets[k, i] < cand_rets[k, i]:
            print_cbt(
                f'\nReference {k + 1} is worse than the candidate on domain realization {i + 1}.\n'  # 1-based index
                'Trying to replace this reference at this realization with a different one',
                'y')
            for other_k in range(self.nG):
                if other_k == k:
                    # Do nothing for the bad solution that brought us here
                    continue
                else:
                    # Load a reference solution different from the k-th
                    other_ref = pyrado.load(
                        self._subrtn_refs._policy, 'policy', 'pt',
                        self.save_dir,
                        dict(prefix=f'iter_{self._curr_iter}',
                             suffix=f'ref_{other_k}'))
                    other_ref_ret = 0
                    for r in range(self.nJ):
                        # Set the same random seed
                        pyrado.set_seed(self.base_seed + i * self.nJ + r)
                        # Set the circular index for the particular realization
                        self.env_dr.ring_idx = i
                        # Do the rollout and collect the return
                        ro_other_ref = rollout(self.env_dr,
                                               other_ref,
                                               eval=True)
                        other_ref_ret += ro_other_ref.undiscounted_return(
                        ) / self.nJ  # average over nJ seeds
                    # Store the value if value is better
                    if other_ref_ret > refs_rets[k, i]:
                        refs_rets[k, i] = other_ref_ret
                        # If a better one was found, do not iterate over the remaining reference solutions
                        break

            if refs_rets[k, i] > cand_rets[k, i]:
                # Found a different reference that achieves a higher return than the candidate
                print_cbt('Successfully handled a negative OG sample', 'g')
            else:
                refs_rets[k, i] = cand_rets[
                    k, i]  # forces optimality gap sample to be 0
                print_cbt(
                    'Unsuccessfully handled a negative OG sample: Set the value to 0',
                    'r')

        else:
            # Everything is as it should be
            pass

        return refs_rets
Example No. 14
def create_ik_activation_setup(dt, max_steps, max_dist_force, physics_engine):
    # Set up environment
    env = Planar3LinkIKActivationSim(
        physicsEngine=physics_engine,
        dt=dt,
        max_steps=max_steps,
        max_dist_force=max_dist_force,
        taskCombinationMethod='product',
        positionTasks=True,
        checkJointLimits=False,
        collisionAvoidanceIK=True,
        observeTaskSpaceDiscrepancy=True,
    )
    print_domain_params(env.domain_param)

    # Set up policy
    def policy_fcn(t: float):
        return [0.3 + 0.2 * math.sin(2. * math.pi * 0.2 * t), 0, 1]

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate
    return rollout(env,
                   policy,
                   render_mode=RenderMode(video=True),
                   stop_on_done=True)
Example No. 15
def joint_control_variant(dt, max_steps, max_dist_force, physics_engine):
    # Set up environment
    env = Planar3LinkJointCtrlSim(
        physicsEngine=physics_engine,
        dt=dt,
        max_steps=max_steps,
        max_dist_force=max_dist_force,
        checkJointLimits=True,
    )
    print_domain_params(env.domain_param)

    # Set up policy
    def policy_fcn(t: float):
        return [
            0.1,
            0.1,  # same as init config
            0.1 + 45. / 180. * math.pi * math.sin(2. * math.pi * 0.2 * t)
        ]  # oscillation in last link

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate
    return rollout(env,
                   policy,
                   render_mode=RenderMode(video=True),
                   stop_on_done=True)
Example No. 16
def generate_oscillation_data(dt, t_end, excitation):
    """
    Use the OneMassOscillatorSim environment to generate a 1-dim damped oscillation signal.

    :param dt: time step size [s]
    :param t_end: Time duration [s]
    :param excitation: type of excitation, either (initial) 'position' or 'force' (function of time)
    :return: 1-dim oscillation trajectory
    """
    env = OneMassOscillatorSim(dt, int(np.ceil(t_end / dt)))
    env.domain_param = dict(m=1., k=10., d=2.0)
    if excitation == 'force':
        policy = TimePolicy(
            env.spec,
            functools.partial(_dirac_impulse, env_spec=env.spec, amp=0.5), dt)
        reset_kwargs = dict(init_state=np.array([0, 0]))
    elif excitation == 'position':
        policy = IdlePolicy(env.spec)
        reset_kwargs = dict(init_state=np.array([0.5, 0]))
    else:
        raise pyrado.ValueErr(given=excitation,
                              eq_constraint="'force' or 'position'")

    # Generate the data
    ro = rollout(env, policy, reset_kwargs=reset_kwargs, record_dts=False)
    return ro.observations[:, 0]
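A hypothetical call to `generate_oscillation_data` for both excitation modes; the step size and duration below are assumed for illustration and are not taken from the original script:

# Hypothetical usage (dt and t_end are assumed values)
dt, t_end = 0.01, 5.0
traj_pos = generate_oscillation_data(dt, t_end, excitation='position')
traj_force = generate_oscillation_data(dt, t_end, excitation='force')
print(traj_pos.shape, traj_force.shape)  # each is a 1-dim array of positions over time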
Example No. 17
def task_activation_variant(dt, max_steps, max_dist_force, physics_engine,
                            graph_file_name):
    # Set up environment
    env = PlanarInsertTASim(
        physicsEngine=physics_engine,
        graphFileName=graph_file_name,
        dt=dt,
        max_steps=max_steps,
        max_dist_force=max_dist_force,
        taskCombinationMethod='sum',  # 'sum', 'mean',  'product', or 'softmax'
        checkJointLimits=False,
        collisionAvoidanceIK=True,
        observeForceTorque=True,
        observePredictedCollisionCost=False,
        observeManipulabilityIndex=False,
        observeCurrentManipulability=True,
        observeGoalDistance=False,
        observeDynamicalSystemDiscrepancy=False,
        observeTaskSpaceDiscrepancy=True,
    )
    env.reset(domain_param=dict(effector_friction=1.))

    # Set up policy
    def policy_fcn(t: float):
        return [0.7, 1, 0, 0.1, 0.5, 0.5]

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate and plot potentials
    print(env.obs_space.labels)
    return rollout(env,
                   policy,
                   render_mode=RenderMode(video=True),
                   stop_on_done=False)
Example No. 18
def _ps_sample_one(G, eval: bool):
    """
    Sample one rollout and return it together with its length in steps.
    This function is used when a minimum number of steps was given.
    """
    ro = rollout(G.env, G.policy, eval=eval)
    return ro, len(ro)
Example No. 19
def task_activation_variant(dt, max_steps, max_dist_force, physics_engine):
    # Set up environment
    env = Planar3LinkTASim(
        physicsEngine=physics_engine,
        dt=dt,
        position_mps=True,
        max_steps=max_steps,
        max_dist_force=max_dist_force,
        checkJointLimits=False,
        collisionAvoidanceIK=True,
        observeCollisionCost=True,
        observeDynamicalSystemDiscrepancy=False,
    )
    print(env.obs_space.labels)

    # Set up policy
    def policy_fcn(t: float):
        if t < 3:
            return [0, 1, 0]
        elif t < 7:
            return [1, 0, 0]
        elif t < 10:
            return [.5, 0.5, 0]
        else:
            return [0, 0, 1]

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate
    return rollout(env,
                   policy,
                   render_mode=RenderMode(video=True),
                   stop_on_done=False)
Example No. 20
    def eval_ddp_policy(rollouts_real):
        init_states_real = np.array([ro.states[0, :] for ro in rollouts_real])
        rollouts_sim = []
        for i in range(num_eval_rollouts):
            rollouts_sim.append(
                rollout(env_sim,
                        behavior_policy,
                        eval=True,
                        reset_kwargs=dict(init_state=init_states_real[i, :])))

        # Clip the rollouts, yielding two lists of pairwise equally long rollouts
        ros_real_tr, ros_sim_tr = algo.truncate_rollouts(rollouts_real,
                                                         rollouts_sim,
                                                         replicate=False)
        assert len(ros_real_tr) == len(ros_sim_tr)
        assert all([
            np.allclose(r.states[0, :], s.states[0, :])
            for r, s in zip(ros_real_tr, ros_sim_tr)
        ])

        # Return the average loss
        losses = [
            algo.loss_fcn(ro_r, ro_s)
            for ro_r, ro_s in zip(ros_real_tr, ros_sim_tr)
        ]
        return float(np.mean(np.asarray(losses)))
Example No. 21
    def eval_policy(save_dir: [str, None],
                    env: [RealEnv, SimEnv, MetaDomainRandWrapper],
                    policy: Policy,
                    mc_estimator: bool,
                    prefix: str,
                    num_rollouts: int,
                    num_parallel_envs: int = 1) -> to.Tensor:
        """
        Evaluate a policy on the target system (real-world platform).
        This method is static to facilitate evaluation of specific policies in hindsight.

        :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
        :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance
        :param policy: policy to evaluate
        :param mc_estimator: estimate the return with a sample average (`True`) or a lower confidence
                             bound (`False`) obtained from bootstrapping
        :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
        :param num_rollouts: number of rollouts to collect on the target system
        :param num_parallel_envs: number of environments for the parallel sampler (only used for SimEnv)
        :return: estimated return in the target domain
        """
        if save_dir is not None:
            print_cbt(f'Executing {prefix}_policy ...', 'c', bright=True)

        rets_real = to.zeros(num_rollouts)
        if isinstance(inner_env(env), RealEnv):
            # Evaluate sequentially when conducting a sim-to-real experiment
            for i in range(num_rollouts):
                rets_real[i] = rollout(env, policy, eval=True).undiscounted_return()
                # If a return of -1 is given, skip the remaining evaluations and set all returns to zero
                if rets_real[i] == -1:
                    print_cbt('Set all returns for this policy to zero.', color='c')
                    rets_real = to.zeros(num_rollouts)
                    break
        elif isinstance(inner_env(env), SimEnv):
            # Create a parallel sampler when conducting a sim-to-sim experiment
            sampler = ParallelRolloutSampler(env, policy, num_workers=num_parallel_envs, min_rollouts=num_rollouts)
            ros = sampler.sample()
            for i in range(num_rollouts):
                rets_real[i] = ros[i].undiscounted_return()
        else:
            raise pyrado.TypeErr(given=inner_env(env), expected_type=[RealEnv, SimEnv])

        if save_dir is not None:
            # Save the evaluation results
            to.save(rets_real, osp.join(save_dir, f'{prefix}_returns_real.pt'))

            print_cbt('Target domain performance', bright=True)
            print(tabulate([['mean return', to.mean(rets_real).item()],
                            ['std return', to.std(rets_real)],
                            ['min return', to.min(rets_real)],
                            ['max return', to.max(rets_real)]]))

        if mc_estimator:
            return to.mean(rets_real)
        else:
            return to.from_numpy(bootstrap_ci(rets_real.numpy(), np.mean,
                                              num_reps=1000, alpha=0.05, ci_sides=1, studentized=False)[1])
Example No. 22
def test_rollout_wo_policy(env: SimEnv):
    def policy(obs):
        # Callable must receive and return tensors
        return to.from_numpy(env.spec.act_space.sample_uniform())

    ro = rollout(env, policy, render_mode=RenderMode())
    assert isinstance(ro, StepSequence)
    assert len(ro) <= env.max_steps
Example No. 23
def test_dualrbf_policy(env, dim_mask):
    # Hyper-parameters for the RBF features are not important here
    rbf_hparam = dict(num_feat_per_dim=7, bounds=(np.array([0.]), np.array([1.])), scale=None)
    policy = DualRBFLinearPolicy(env.spec, rbf_hparam, dim_mask)
    assert policy.num_param == policy.num_active_feat*env.act_space.flat_dim//2

    ro = rollout(env, policy, eval=True)
    assert ro is not None
Example No. 24
    def eval_policy(
        save_dir: Optional[pyrado.PathLike],
        env: Env,
        policy: Policy,
        prefix: str,
        num_rollouts: int,
        num_workers: int = 1,
    ) -> to.Tensor:
        """
        Evaluate a policy either in the source or in the target domain.
        This method is static to facilitate evaluation of specific policies in hindsight.

        :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
        :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance
        :param policy: policy to evaluate
        :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
        :param num_rollouts: number of rollouts to collect on the target system
        :param num_workers: number of environments for the parallel sampler (only used for SimEnv)
        :return: estimated return in the target domain
        """
        if save_dir is not None:
            print_cbt(f"Executing {prefix}_policy ...", "c", bright=True)

        if isinstance(inner_env(env), RealEnv):
            # Evaluate sequentially when evaluating on a real-world device
            rets_real = []
            for i in range(num_rollouts):
                rets_real.append(
                    rollout(env, policy, eval=True).undiscounted_return())

        elif isinstance(inner_env(env), SimEnv):
            # Create a parallel sampler when evaluating in a simulation
            sampler = ParallelRolloutSampler(env,
                                             policy,
                                             num_workers=num_workers,
                                             min_rollouts=num_rollouts)
            ros = sampler.sample(eval=True)
            rets_real = [ro.undiscounted_return() for ro in ros]
        else:
            raise pyrado.TypeErr(given=inner_env(env),
                                 expected_type=[RealEnv, SimEnv])

        rets_real = to.as_tensor(rets_real, dtype=to.get_default_dtype())

        if save_dir is not None:
            # Save and print the evaluation results
            pyrado.save(rets_real, "returns_real.pt", save_dir, prefix=prefix)
            print_cbt("Target domain performance", bright=True)
            print(
                tabulate([
                    ["mean return", to.mean(rets_real).item()],
                    ["std return", to.std(rets_real)],
                    ["min return", to.min(rets_real)],
                    ["max return", to.max(rets_real)],
                ]))

        return to.mean(rets_real)
Example No. 25
def _run_rollout_dp(G, domain_param, init_state=None):
    return rollout(
        G.env,
        G.policy,
        eval=True,  # render_mode=RenderMode(video=True),
        reset_kwargs={
            'domain_param': domain_param,
            'init_state': init_state
        })
Example No. 26
def _simulator(dp: to.Tensor) -> to.Tensor:
    """The simplest interface of a simulation to sbi, using `env` and `policy` from the outer scope"""
    ro = rollout(
        env,
        policy,
        eval=True,
        reset_kwargs=dict(domain_param=dict(m=dp[0], k=dp[1], d=dp[2])))
    observation_sim = to.from_numpy(ro.observations[-1]).to(dtype=to.float32)
    return to.atleast_2d(observation_sim)
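Such a simulator is usually evaluated on a batch of candidate domain parameters before the results are handed to the inference library. A minimal sketch of that loop in plain PyTorch; the candidate values are made up and `_simulator` refers to the function above:

# Evaluate the simulator on a batch of made-up domain parameter candidates [m, k, d]
import torch as to

candidates = to.tensor([[1.0, 10.0, 2.0],
                        [1.2, 9.0, 1.5]])
observations = to.cat([_simulator(dp) for dp in candidates], dim=0)
print(observations.shape)  # (num_candidates, obs_dim)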
Example No. 27
def _ps_run_one(G, num: int, eval: bool, seed: int, sub_seed: int):
    """
    Sample one rollout without specifying the initial state or the domain parameters.
    This function is used when a minimum number of rollouts was given.
    """
    return rollout(G.env,
                   G.policy,
                   eval=eval,
                   seed=seed,
                   sub_seed=sub_seed,
                   sub_sub_seed=num)
Example No. 28
def _pes_sample_one(G, param):
    """ Sample one rollout with the current setting. """
    pol_param, dom_param, init_state = param
    vector_to_parameters(pol_param, G.policy.parameters())

    return rollout(G.env,
                   G.policy,
                   reset_kwargs={
                       'init_state': init_state,
                       'domain_param': dom_param,
                   })
Example No. 29
def test_action_statistics(env: SimEnv, policy: Policy):
    sigma = 1.0  # with lower values like 0.1 we can observe violations of the tolerances

    # Create an action-based exploration strategy
    explstrat = NormalActNoiseExplStrat(policy, std_init=sigma)

    # Sample a deterministic rollout
    ro_policy = rollout(env,
                        policy,
                        eval=True,
                        max_steps=1000,
                        stop_on_done=False,
                        seed=0)
    ro_policy.torch(to.get_default_dtype())

    # Run the exploration strategy on the previously sampled rollout
    if policy.is_recurrent:
        if isinstance(policy, TwoHeadedPolicy):
            act_expl, _, _ = explstrat(ro_policy.observations)
        else:
            act_expl, _ = explstrat(ro_policy.observations)
        # Get the hidden states from the deterministic rollout
        hidden_states = ro_policy.hidden_states
    else:
        if isinstance(policy, TwoHeadedPolicy):
            act_expl, _ = explstrat(ro_policy.observations)
        else:
            act_expl = explstrat(ro_policy.observations)
        hidden_states = [
            0.0
        ] * ro_policy.length  # just something that does not violate the format

    ro_expl = StepSequence(
        actions=act_expl[:-1],  # truncate act due to last obs
        observations=ro_policy.observations,
        rewards=ro_policy.rewards,  # don't care but necessary
        hidden_states=hidden_states,
    )
    ro_expl.torch()

    # Compute action statistics and the ground truth
    actstats = compute_action_statistics(ro_expl, explstrat)
    gt_logprobs = Normal(loc=ro_policy.actions,
                         scale=sigma).log_prob(ro_expl.actions)
    gt_entropy = Normal(loc=ro_policy.actions, scale=sigma).entropy()

    to.testing.assert_allclose(actstats.log_probs,
                               gt_logprobs,
                               rtol=1e-4,
                               atol=1e-5)
    to.testing.assert_allclose(actstats.entropy,
                               gt_entropy,
                               rtol=1e-4,
                               atol=1e-5)
Example No. 30
def simulator(mu):
    # In the end, the output of this could be a distance measure over trajectories instead of just the final state
    ro = rollout(
        env,
        policy,
        eval=True,
        reset_kwargs=dict(
            # domain_param=dict(k=mu[0], d=mu[1]), init_state=np.array([-0.7, 0.])  # no variance over the init state
            domain_param=dict(k=mu[0],
                              d=mu[1])  # no variance over the parameters
        ))
    return to.from_numpy(ro.observations[-1]).to(dtype=to.float32)