Example 1
import numpy as np

from pyrado.domain_randomization.domain_parameter import NormalDomainParam, UniformDomainParam
from pyrado.domain_randomization.domain_randomizer import DomainRandomizer


def create_default_randomizer_qbb() -> DomainRandomizer:
    """
    Create the default randomizer for the `QBallBalancerSim`.

    :return: randomizer based on the nominal domain parameter values
    """
    from pyrado.environments.pysim.quanser_ball_balancer import QBallBalancerSim
    dp_nom = QBallBalancerSim.get_nominal_domain_param()
    return DomainRandomizer(
        NormalDomainParam(name='g', mean=dp_nom['g'], std=dp_nom['g']/10, clip_lo=1e-4),
        NormalDomainParam(name='m_ball', mean=dp_nom['m_ball'], std=dp_nom['m_ball']/5, clip_lo=1e-4),
        NormalDomainParam(name='r_ball', mean=dp_nom['r_ball'], std=dp_nom['r_ball']/5, clip_lo=1e-3),
        NormalDomainParam(name='l_plate', mean=dp_nom['l_plate'], std=dp_nom['l_plate']/5, clip_lo=5e-2),
        NormalDomainParam(name='r_arm', mean=dp_nom['r_arm'], std=dp_nom['r_arm']/5, clip_lo=1e-4),
        NormalDomainParam(name='K_g', mean=dp_nom['K_g'], std=dp_nom['K_g']/4, clip_lo=1e-2),
        NormalDomainParam(name='J_l', mean=dp_nom['J_l'], std=dp_nom['J_l']/4, clip_lo=1e-6),
        NormalDomainParam(name='J_m', mean=dp_nom['J_m'], std=dp_nom['J_m']/4, clip_lo=1e-9),
        NormalDomainParam(name='k_m', mean=dp_nom['k_m'], std=dp_nom['k_m']/4, clip_lo=1e-4),
        NormalDomainParam(name='R_m', mean=dp_nom['R_m'], std=dp_nom['R_m']/4, clip_lo=1e-4),
        UniformDomainParam(name='eta_g', mean=dp_nom['eta_g'], halfspan=dp_nom['eta_g']/4, clip_lo=1e-4, clip_up=1),
        UniformDomainParam(name='eta_m', mean=dp_nom['eta_m'], halfspan=dp_nom['eta_m']/4, clip_lo=1e-4, clip_up=1),
        UniformDomainParam(name='B_eq', mean=dp_nom['B_eq'], halfspan=dp_nom['B_eq']/4, clip_lo=1e-4),
        UniformDomainParam(name='c_frict', mean=dp_nom['c_frict'], halfspan=dp_nom['c_frict']/4, clip_lo=1e-4),
        UniformDomainParam(name='V_thold_x_pos', mean=dp_nom['V_thold_x_pos'], halfspan=dp_nom['V_thold_x_pos']/3),
        UniformDomainParam(name='V_thold_x_neg', mean=dp_nom['V_thold_x_neg'], halfspan=abs(dp_nom['V_thold_x_neg'])/3),
        UniformDomainParam(name='V_thold_y_pos', mean=dp_nom['V_thold_y_pos'], halfspan=dp_nom['V_thold_y_pos']/3),
        UniformDomainParam(name='V_thold_y_neg', mean=dp_nom['V_thold_y_neg'], halfspan=abs(dp_nom['V_thold_y_neg'])/3),
        UniformDomainParam(name='offset_th_x', mean=dp_nom['offset_th_x'], halfspan=6./180*np.pi),
        UniformDomainParam(name='offset_th_y', mean=dp_nom['offset_th_y'], halfspan=6./180*np.pi)
    )
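For context, a randomizer like this is typically attached to the simulation through one of Pyrado's domain-randomization wrappers (Example 17 below uses `DomainRandWrapperBuffer`). A minimal sketch, assuming the `DomainRandWrapperLive` variant from the same module, which re-samples the parameters on every reset:

from pyrado.environment_wrappers.domain_randomization import DomainRandWrapperLive
from pyrado.environments.pysim.quanser_ball_balancer import QBallBalancerSim

# Wrap the nominal simulation; each reset then draws a fresh set of
# domain parameters from the randomizer defined above
env = QBallBalancerSim(dt=1 / 100.0, max_steps=500)
env = DomainRandWrapperLive(env, create_default_randomizer_qbb())
env.reset()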
Example 2
def test_qbb_kin(servo_ang):
    env = QBallBalancerSim(dt=0.02, max_steps=100)
    kin = QBallBalancerKin(env, num_opt_iter=50, render_mode=RenderMode(video=False))

    servo_ang = to.tensor(servo_ang, dtype=to.get_default_dtype())
    for th in servo_ang:
        plate_ang = kin(th)
        assert plate_ang is not None
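The `servo_ang` argument is supplied by pytest's parametrization. A sketch of how such a test might be driven; the angle values are illustrative placeholders, not the ones from Pyrado's own test suite:

import numpy as np
import pytest

# Placeholder servo angles in rad; only the parametrization mechanism matters here
@pytest.mark.parametrize("servo_ang", [np.linspace(-np.pi / 4.0, np.pi / 4.0, 10)])
def test_qbb_kin(servo_ang):
    ...  # body as above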
Example 3
def create_qbb_setup(factor, dt, max_steps):
    # Set up environment
    init_state = np.array([0, 0, 0.1, 0.1, 0, 0, 0, 0])
    env = QBallBalancerSim(dt=dt, max_steps=max_steps)
    env = ActNormWrapper(env)

    # Set up policy
    policy = QBallBalancerPDCtrl(env.spec)

    # Simulate
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=max_steps,
    )
    act_500Hz = ro.actions

    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt * factor),
                          init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=int(max_steps / factor),
    )
    act_100Hz = ro.actions

    env = DownsamplingWrapper(env, factor)
    ro = rollout(
        env,
        policy,
        reset_kwargs=dict(domain_param=dict(dt=dt), init_state=init_state),
        render_mode=RenderMode(video=True),
        max_steps=max_steps,
    )
    act_500Hz_w = ro.actions

    # Time in seconds; each recorded action i occurs at i * (step size), and the
    # 100 Hz rollout was stepped with dt * factor
    time_500Hz = np.arange(len(act_500Hz)) * dt
    time_100Hz = np.arange(len(act_100Hz)) * dt * factor
    time_500Hz_w = np.arange(len(act_500Hz_w)) * dt

    # Plot
    _, axs = plt.subplots(nrows=2)
    for i in range(2):
        axs[i].plot(time_500Hz, act_500Hz[:, i], label="500 Hz (original)")
        axs[i].plot(time_100Hz, act_100Hz[:, i], label="100 Hz", ls="--")
        axs[i].plot(time_500Hz_w,
                    act_500Hz_w[:, i],
                    label="500 Hz (wrapped)",
                    ls="--")
        axs[i].legend()
        axs[i].set_ylabel(env.act_space.labels[i])
    axs[1].set_xlabel("time [s]")
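Example 9 below suggests how this helper is meant to be invoked, with constants whose comments there mark them as fixed. A usage sketch under that assumption:

if __name__ == "__main__":
    from matplotlib import pyplot as plt

    # Constants mirror Example 9 ("don't change this")
    create_qbb_setup(factor=5, dt=1 / 500.0, max_steps=2000)
    plt.show()  # display the two comparison subplots created by the helper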
Example 4
"""
Test the functionality of Pyrado using the Quanser Ball Balancer setup.
"""
import math

from pyrado.domain_randomization.utils import print_domain_params
from pyrado.environments.pysim.quanser_ball_balancer import QBallBalancerSim
from pyrado.policies.feed_forward.dummy import IdlePolicy
from pyrado.sampling.rollout import rollout
from pyrado.utils.data_types import RenderMode

if __name__ == "__main__":
    # Set up environment
    env = QBallBalancerSim(dt=1 / 500.0, max_steps=10000)
    env.reset(domain_param=dict(offset_th_x=50.0 / 180 * math.pi))
    print_domain_params(env.domain_param)

    # Set up policy
    policy = IdlePolicy(env.spec)

    # Simulate
    ro = rollout(env,
                 policy,
                 render_mode=RenderMode(text=True),
                 stop_on_done=True)
Example 5
import pytest
import torch as to

from pyrado.environments.pysim.ball_on_beam import BallOnBeamSim
from pyrado.environments.pysim.quanser_ball_balancer import QBallBalancerSim
from pyrado.environments.sim_base import SimEnv
from pyrado.exploration.stochastic_action import NormalActNoiseExplStrat
from pyrado.exploration.stochastic_params import NormalParamNoise
from pyrado.policies.base import Policy
from pyrado.policies.features import *


@pytest.mark.parametrize(
    "env",
    [
        BallOnBeamSim(dt=0.02, max_steps=1),
        QBallBalancerSim(dt=0.02, max_steps=1),
    ],
    ids=["bob", "qbb"],
)
@pytest.mark.parametrize("policy", ["linear_policy", "fnn_policy"],
                         ids=["lin", "fnn"],
                         indirect=True)
def test_noise_on_act(env: SimEnv, policy: Policy):
    for _ in range(100):
        # Init the exploration strategy
        act_noise_strat = NormalActNoiseExplStrat(policy,
                                                  std_init=0.5,
                                                  train_mean=True)

        # Set new parameters for the exploration noise
        std = to.ones(env.act_space.flat_dim) * to.rand(1)
Example 6
def default_qcpsu():
    return QCartPoleSwingUpSim(dt=0.002, max_steps=8000)
Example 7
def default_qcpst():
    return QCartPoleStabSim(dt=0.01, max_steps=300)
Example 8
def default_qbb():
    return QBallBalancerSim(dt=0.01, max_steps=500)
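pytest injects such fixtures by argument name, assuming the (truncated) `@pytest.fixture` decorator above the helper. A minimal illustrative sketch; the test body is not from the original suite:

def test_qbb_fixture(default_qbb):
    # default_qbb is the QBallBalancerSim instance created by the fixture above
    obs = default_qbb.reset()
    assert obs is not None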
Example 9
import numpy as np
from matplotlib import pyplot as plt
from pyrado.environments.pysim.quanser_ball_balancer import QBallBalancerSim
from pyrado.environment_wrappers.action_normalization import ActNormWrapper
from pyrado.environment_wrappers.downsampling import DownsamplingWrapper
from pyrado.sampling.rollout import rollout
from pyrado.policies.environment_specific import QBallBalancerPDCtrl
from pyrado.utils.data_types import RenderMode

if __name__ == '__main__':
    # Set up environment
    factor = 5  # don't change this
    dt = 1 / 500.  # don't change this
    max_steps = 2000  # don't change this
    init_state = np.array([0, 0, 0.1, 0.1, 0, 0, 0, 0])
    env = QBallBalancerSim(dt=dt, max_steps=max_steps)
    env = ActNormWrapper(env)

    # Set up policy
    policy = QBallBalancerPDCtrl(env.spec)

    # Simulate
    ro = rollout(env,
                 policy,
                 reset_kwargs=dict(domain_param=dict(dt=dt),
                                   init_state=init_state),
                 render_mode=RenderMode(video=True),
                 max_steps=max_steps)
    act_500Hz = ro.actions

    ro = rollout(env,
Example 10
from pyrado.utils.data_types import dict_arraylike_to_float
from pyrado.utils.experiments import load_experiment, wrap_like_other_env
from pyrado.utils.input_output import print_cbt


if __name__ == '__main__':
    # Parse command line arguments
    args = get_argparser().parse_args()

    if args.max_steps == pyrado.inf:
        args.max_steps = 2500
        print_cbt(f'Set maximum number of time steps to {args.max_steps}', 'y')

    if args.env_name == QBallBalancerSim.name:
        # Create the environment for evaluating
        env = QBallBalancerSim(dt=args.dt, max_steps=args.max_steps)

        # Get the experiments' directories to load from
        prefixes = [
            osp.join(pyrado.EXP_DIR, 'ENV_NAME', 'ALGO_NAME'),
        ]
        ex_names = [
            '',
        ]
        ex_labels = [
            '',
        ]

    elif args.env_name in [QCartPoleStabSim.name, QCartPoleSwingUpSim.name]:
        # Create the environment for evaluating
        if args.env_name == QCartPoleSwingUpSim.name:
Example 11
from pyrado.utils.input_output import print_cbt


if __name__ == '__main__':
    # Parse command line arguments
    args = get_argparser().parse_args()
    if args.max_steps == pyrado.inf:
        args.max_steps = 2000
        print_cbt(f'Set maximum number of time steps to {args.max_steps}', 'y')

    # Create one-dim evaluation grid
    param_spec = dict()

    if args.env_name == QBallBalancerSim.name:
        # Create the environment for evaluating
        env = QBallBalancerSim(dt=args.dt, max_steps=args.max_steps, load_experimental_tholds=True)

        # param_spec['g'] = np.linspace(8.91, 12.91, num=11, endpoint=True)
        # param_spec['m_ball'] = np.linspace(0.001, 0.033, num=11, endpoint=True)
        # param_spec['r_ball'] = np.linspace(0.01, 0.1, num=11, endpoint=True)
        # param_spec['r_arm'] = np.linspace(0.0254*0.3, 0.0254*1.7, num=11, endpoint=True)
        # param_spec['l_plate'] = np.linspace(0.275*0.3, 0.275*1.7, num=11, endpoint=True)
        # param_spec['J_l'] = np.linspace(5.2822e-5 * 0.5, 5.2822e-5 * 1.5, num=11, endpoint=True)
        # param_spec['J_m'] = np.linspace(4.6063e-7*0.5, 4.6063e-7*1.5, num=11, endpoint=True)
        # param_spec['K_g'] = np.linspace(70*0.5, 70*1.5, num=11, endpoint=True)
        # param_spec['eta_g'] = np.linspace(0.6, 1.0, num=11, endpoint=True)
        # param_spec['eta_m'] = np.linspace(0.49, 0.89, num=11, endpoint=True)
        # param_spec['k_m'] = np.linspace(0.0077*0.3, 0.0077*1.7, num=11, endpoint=True)
        # param_spec['k_m'] = np.linspace(0.004, 0.012, num=11, endpoint=True)
        # param_spec['R_m'] = np.linspace(2.6*0.5, 2.6*1.5, num=11, endpoint=True)
        # param_spec['B_eq'] = np.linspace(0.0, 0.2, num=11, endpoint=True)
Example 12
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env = QBallBalancerSim(dt=1 / 250.0, max_steps=1500)
    env = ActNormWrapper(env)

    # Learning rate scheduler
    lrs_gamma = trial.suggest_categorical("exp_lr_scheduler_gamma",
                                          [None, 0.99, 0.995, 0.999])
    if lrs_gamma is not None:
        lr_sched = lr_scheduler.ExponentialLR
        lr_sched_hparam = dict(gamma=lrs_gamma)
    else:
        lr_sched, lr_sched_hparam = None, dict()

    # Policy
    policy = FNNPolicy(
        spec=env.spec,
        hidden_sizes=trial.suggest_categorical("hidden_sizes_policy",
                                               [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical("hidden_nonlin_policy",
                                      ["to_tanh", "to_relu"])),
    )

    # Critic
    vfcn = FNN(
        input_size=env.obs_space.flat_dim,
        output_size=1,
        hidden_sizes=trial.suggest_categorical("hidden_sizes_critic",
                                               [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical("hidden_nonlin_critic",
                                      ["to_tanh", "to_relu"])),
    )
    critic_hparam = dict(
        batch_size=250,
        gamma=trial.suggest_uniform("gamma_critic", 0.99, 1.0),
        lamda=trial.suggest_uniform("lamda_critic", 0.95, 1.0),
        num_epoch=trial.suggest_int("num_epoch_critic", 1, 10),
        lr=trial.suggest_loguniform("lr_critic", 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical("standardize_adv_critic",
                                                  [True, False]),
        max_grad_norm=trial.suggest_categorical("max_grad_norm_critic",
                                                [None, 1.0, 5.0]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    critic = GAE(vfcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=300,
        batch_size=250,
        min_steps=trial.suggest_int("num_rollouts_algo", 10, 30) *
        env.max_steps,
        num_epoch=trial.suggest_int("num_epoch_algo", 1, 10),
        eps_clip=trial.suggest_uniform("eps_clip_algo", 0.05, 0.2),
        std_init=trial.suggest_uniform("std_init_algo", 0.5, 1.0),
        lr=trial.suggest_loguniform("lr_algo", 1e-5, 1e-3),
        max_grad_norm=trial.suggest_categorical("max_grad_norm_algo",
                                                [None, 1.0, 5.0]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    algo = PPO(osp.join(study_dir, f"trial_{trial.number}"), env, policy,
               critic, **algo_hparam)

    # Train without saving the results
    algo.train(snapshot_mode="latest", seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env,
                                     policy,
                                     num_workers=1,
                                     min_rollouts=min_rollouts)
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
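As the docstring's note explains, Optuna passes only `trial` to the objective, so the remaining arguments are bound with `functools.partial`. A sketch of the driver code; the study direction follows the docstring ("to maximize"), while the directory, trial count, and job count are placeholders:

import functools

import optuna

study = optuna.create_study(direction="maximize")
study.optimize(
    functools.partial(train_and_eval, study_dir="/tmp/qbb_ppo_study", seed=0),
    n_trials=100,
    n_jobs=4,  # the algorithm above sets num_workers=1 precisely to leave parallelism to Optuna
)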
Example 13
        domain_param = None

    # Extract the time if possible
    if hasattr(rollouts[0], "time"):
        dt = rollouts[0].time[1] - rollouts[0].time[0]  # dt is constant
    elif args.dt is not None:
        dt = args.dt
    else:
        raise pyrado.ValueErr(
            msg="There was no time field in the loaded rollout to infer the time step size from, nor has it "
            "been specified explicitly! Please provide the time step size using --dt."
        )

    if env_name == QBallBalancerSim.name:
        env = QBallBalancerSim(dt=dt)

    elif env_name == QCartPoleSwingUpSim.name:
        env = QCartPoleSwingUpSim(dt=dt)

    elif env_name == QQubeSwingUpSim.name:
        env = QQubeSwingUpSim(dt=dt)

    elif env_name == "wam-bic":  # avoid loading mujoco
        from pyrado.environments.mujoco.wam_bic import WAMBallInCupSim

        env = WAMBallInCupSim(num_dof=4)
        env.init_space = BoxSpace(-pyrado.inf,
                                  pyrado.inf,
                                  shape=env.init_space.shape)
Example 14
import numpy as np

from pyrado.domain_randomization.domain_parameter import NormalDomainParam, UniformDomainParam
from pyrado.domain_randomization.domain_randomizer import DomainRandomizer


def create_default_randomizer_qbb() -> DomainRandomizer:
    """
    Create the default randomizer for the `QBallBalancerSim`.

    :return: randomizer based on the nominal domain parameter values
    """
    from pyrado.environments.pysim.quanser_ball_balancer import QBallBalancerSim

    dp_nom = QBallBalancerSim.get_nominal_domain_param()
    return DomainRandomizer(
        NormalDomainParam(name="gravity_const",
                          mean=dp_nom["gravity_const"],
                          std=dp_nom["gravity_const"] / 10,
                          clip_lo=1e-4),
        NormalDomainParam(name="ball_mass",
                          mean=dp_nom["ball_mass"],
                          std=dp_nom["ball_mass"] / 5,
                          clip_lo=1e-4),
        NormalDomainParam(name="ball_radius",
                          mean=dp_nom["ball_radius"],
                          std=dp_nom["ball_radius"] / 5,
                          clip_lo=1e-3),
        NormalDomainParam(name="plate_length",
                          mean=dp_nom["plate_length"],
                          std=dp_nom["plate_length"] / 5,
                          clip_lo=5e-2),
        NormalDomainParam(name="arm_radius",
                          mean=dp_nom["arm_radius"],
                          std=dp_nom["arm_radius"] / 5,
                          clip_lo=1e-4),
        NormalDomainParam(name="gear_ratio",
                          mean=dp_nom["gear_ratio"],
                          std=dp_nom["gear_ratio"] / 4,
                          clip_lo=1e-2),
        NormalDomainParam(name="load_inertia",
                          mean=dp_nom["load_inertia"],
                          std=dp_nom["load_inertia"] / 4,
                          clip_lo=1e-6),
        NormalDomainParam(name="motor_inertia",
                          mean=dp_nom["motor_inertia"],
                          std=dp_nom["motor_inertia"] / 4,
                          clip_lo=1e-9),
        NormalDomainParam(name="motor_back_emf",
                          mean=dp_nom["motor_back_emf"],
                          std=dp_nom["motor_back_emf"] / 4,
                          clip_lo=1e-4),
        NormalDomainParam(name="motor_resistance",
                          mean=dp_nom["motor_resistance"],
                          std=dp_nom["motor_resistance"] / 4,
                          clip_lo=1e-4),
        UniformDomainParam(
            name="gear_efficiency",
            mean=dp_nom["gear_efficiency"],
            halfspan=dp_nom["gear_efficiency"] / 4,
            clip_lo=1e-4,
            clip_up=1,
        ),
        UniformDomainParam(
            name="motor_efficiency",
            mean=dp_nom["motor_efficiency"],
            halfspan=dp_nom["motor_efficiency"] / 4,
            clip_lo=1e-4,
            clip_up=1,
        ),
        UniformDomainParam(
            name="combined_damping",
            mean=dp_nom["combined_damping"],
            halfspan=dp_nom["combined_damping"] / 4,
            clip_lo=1e-4,
        ),
        UniformDomainParam(name="ball_damping",
                           mean=dp_nom["ball_damping"],
                           halfspan=dp_nom["ball_damping"] / 4,
                           clip_lo=1e-4),
        UniformDomainParam(name="voltage_thold_x_pos",
                           mean=dp_nom["voltage_thold_x_pos"],
                           halfspan=dp_nom["voltage_thold_x_pos"] / 3),
        UniformDomainParam(
            name="voltage_thold_x_neg",
            mean=dp_nom["voltage_thold_x_neg"],
            halfspan=abs(dp_nom["voltage_thold_x_neg"]) / 3,
        ),
        UniformDomainParam(name="voltage_thold_y_pos",
                           mean=dp_nom["voltage_thold_y_pos"],
                           halfspan=dp_nom["voltage_thold_y_pos"] / 3),
        UniformDomainParam(
            name="voltage_thold_y_neg",
            mean=dp_nom["voltage_thold_y_neg"],
            halfspan=abs(dp_nom["voltage_thold_y_neg"]) / 3,
        ),
        UniformDomainParam(name="offset_th_x",
                           mean=dp_nom["offset_th_x"],
                           halfspan=6.0 / 180 * np.pi),
        UniformDomainParam(name="offset_th_y",
                           mean=dp_nom["offset_th_y"],
                           halfspan=6.0 / 180 * np.pi),
    )
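The returned randomizer can be extended before the environment is wrapped; Example 17 below does exactly this for an action delay, and the same call applies here:

randomizer = create_default_randomizer_qbb()
# Mirrors Example 17: randomize an integer-valued action delay on top of the defaults
randomizer.add_domain_params(UniformDomainParam(name="act_delay", mean=5, halfspan=5, clip_lo=0, roundint=True))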
Example 15
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: Optional[int]):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env = QBallBalancerSim(dt=1/250., max_steps=1500)
    env = ActNormWrapper(env)

    # Policy
    policy = FNNPolicy(
        spec=env.spec,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_policy', [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_policy', ['to_tanh', 'to_relu'])),
    )

    # Critic
    value_fcn = FNN(
        input_size=env.obs_space.flat_dim,
        output_size=1,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_critic', [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])),
    )
    critic_hparam = dict(
        gamma=trial.suggest_uniform('gamma_critic', 0.99, 1.),
        lamda=trial.suggest_uniform('lamda_critic', 0.95, 1.),
        num_epoch=trial.suggest_int('num_epoch_critic', 1, 10),
        batch_size=100,
        lr=trial.suggest_loguniform('lr_critic', 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical('standardize_adv_critic', [True, False]),
        # max_grad_norm=5.,
        # lr_scheduler=scheduler.StepLR,
        # lr_scheduler_hparam=dict(step_size=10, gamma=0.9)
        # lr_scheduler=scheduler.ExponentialLR,
        # lr_scheduler_hparam=dict(gamma=0.99)
    )
    critic = GAE(value_fcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_sampler_envs=1,  # parallelize via optuna n_jobs
        max_iter=500,
        min_steps=25*env.max_steps,
        num_epoch=trial.suggest_int('num_epoch_algo', 1, 10),
        eps_clip=trial.suggest_uniform('eps_clip_algo', 0.05, 0.2),
        batch_size=100,
        std_init=0.9,
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
        # max_grad_norm=5.,
        # lr_scheduler=scheduler.StepLR,
        # lr_scheduler_hparam=dict(step_size=10, gamma=0.9)
        # lr_scheduler=scheduler.ExponentialLR,
        # lr_scheduler_hparam=dict(gamma=0.99)
    )
    algo = PPO(osp.join(ex_dir, f'trial_{trial.number}'), env, policy, critic, **algo_hparam)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelSampler(env, policy, num_envs=20, min_rollouts=min_rollouts)
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros])/min_rollouts

    return mean_ret
Example 16
    # Parse command line arguments
    args = get_argparser().parse_args()
    dt = args.dt if args.dt is not None else 0.01

    if args.env_name == QCartPoleSwingUpSim.name:
        env = QCartPoleSwingUpSim(dt=dt,
                                  max_steps=int(5 / dt),
                                  wild_init=False)
        state = np.array([0, 87 / 180 * np.pi, 0, 0])

    elif args.env_name == QQubeSwingUpSim.name:
        env = QQubeSwingUpSim(dt=dt, max_steps=int(5 / dt))
        state = np.array([5 / 180 * np.pi, 87 / 180 * np.pi, 0, 0])

    elif args.env_name == QBallBalancerSim.name:
        env = QBallBalancerSim(dt=dt, max_steps=int(5 / dt))
        state = np.array(
            [2 / 180 * np.pi, 2 / 180 * np.pi, 0.1, -0.08, 0, 0, 0, 0])

    elif args.env_name == OneMassOscillatorSim.name:
        env = OneMassOscillatorSim(dt=dt, max_steps=int(5 / dt))
        state = np.array([-0.7, 0])

    elif args.env_name == PendulumSim.name:
        env = PendulumSim(dt=dt, max_steps=int(5 / dt))
        state = np.array([87 / 180 * np.pi, 0])

    elif args.env_name == BallOnBeamSim.name:
        env = BallOnBeamSim(dt=dt, max_steps=int(5 / dt))
        state = np.array([-0.25, 0, 0, +20 / 180 * np.pi])
Example 17
# NOTE: the pyrado import paths here assume the older module layout this script was written against
import torch as to
from numpy import pi

from pyrado.algorithms.hc import HCNormal
from pyrado.algorithms.spota import SPOTA
from pyrado.domain_randomization.default_randomizers import get_default_randomizer
from pyrado.domain_randomization.domain_parameter import UniformDomainParam
from pyrado.environments.pysim.quanser_ball_balancer import QBallBalancerSim
from pyrado.environment_wrappers.action_delay import ActDelayWrapper
from pyrado.environment_wrappers.domain_randomization import DomainRandWrapperBuffer
from pyrado.environment_wrappers.observation_noise import GaussianObsNoiseWrapper
from pyrado.environment_wrappers.observation_partial import ObsPartialWrapper
from pyrado.logger.experiment import setup_experiment, save_list_of_dicts_to_yaml
from pyrado.policies.features import FeatureStack, identity_feat
from pyrado.policies.linear import LinearPolicy
from pyrado.sampling.sequences import *

if __name__ == '__main__':
    # Experiment (set seed before creating the modules)
    ex_dir = setup_experiment(QBallBalancerSim.name, f'{SPOTA.name}-{HCNormal.name}',
                              f'{LinearPolicy.name}_obsnoise-s_actedlay-10', seed=1001)

    # Environment and domain randomization
    env_hparams = dict(dt=1/100., max_steps=500)
    env = QBallBalancerSim(**env_hparams)
    env = GaussianObsNoiseWrapper(env, noise_std=[1/180*pi, 1/180*pi, 0.005, 0.005,  # [rad, rad, m, m, ...
                                                  10/180*pi, 10/180*pi, 0.05, 0.05])  # ... rad/s, rad/s, m/s, m/s]
    # env = ObsPartialWrapper(env, mask=[0, 0, 0, 0, 1, 1, 0, 0])
    env = ActDelayWrapper(env)
    randomizer = get_default_randomizer(env)
    randomizer.add_domain_params(UniformDomainParam(name='act_delay', mean=5, halfspan=5, clip_lo=0, roundint=True))
    env = DomainRandWrapperBuffer(env, randomizer)

    # Policy
    policy_hparam = dict(feats=FeatureStack([identity_feat]))
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Initialize with Quanser's PD gains
    init_policy_param_values = to.tensor([[-14., 0, -14*3.45, 0, 0, 0, -14*2.11, 0],
                                          [0, -14., 0, -14*3.45, 0, 0, 0, -14*2.11]])
Example 18
        (4, 'uniform'), (4, 'normal'), (4, 'Marsaglia'),
        (15, 'uniform'), (15, 'normal')
    ]
)
def test_sample_from_unit_sphere_surface(num_dim, method):
    s = sample_from_hyper_sphere_surface(num_dim, method)
    assert 0.95 <= to.norm(s, p=2) <= 1.05


@pytest.mark.sampling
@pytest.mark.parametrize(
    'env, policy', [
        (BallOnBeamSim(dt=0.02, max_steps=100),
         LinearPolicy(BallOnBeamSim(dt=0.02, max_steps=100).spec,
                      FeatureStack([const_feat, identity_feat, squared_feat]))),
        (QBallBalancerSim(dt=0.02, max_steps=100),
         LinearPolicy(QBallBalancerSim(dt=0.02, max_steps=100).spec,
                      FeatureStack([const_feat, identity_feat, squared_feat])))
    ], ids=['bob_linpol', 'qbb_linpol']
)
def test_rollout_wo_exploration(env, policy):
    ro = rollout(env, policy, render_mode=RenderMode())
    assert isinstance(ro, StepSequence)
    assert len(ro) <= env.max_steps


@pytest.mark.parametrize(
    'mean, cov', [
        (to.tensor([5., 7.]), to.tensor([[2., 0.], [0., 2.]])),
    ], ids=['2dim']
)