def _create_spaces(self):
    # State space (normalized time, since we do not have a simulation)
    self._state_space = BoxSpace(np.array([0.]), np.array([1.]))

    # Action space (PD controller on 3 joint positions and velocities)
    max_act = np.array([
        np.pi, np.pi, np.pi,  # [rad, rad, rad, ...
        10 * np.pi, 10 * np.pi, 10 * np.pi  # ... rad/s, rad/s, rad/s]
    ])
    self._act_space = BoxSpace(-max_act, max_act, labels=[
        r'$q_{1,des}$', r'$q_{3,des}$', r'$q_{5,des}$',
        r'$\dot{q}_{1,des}$', r'$\dot{q}_{3,des}$', r'$\dot{q}_{5,des}$'
    ])

    # Observation space (normalized time)
    self._obs_space = BoxSpace(np.array([0.]), np.array([1.]), labels=['$t$'])
def _create_spaces(self):
    # State space (normalized time, since we do not have a simulation)
    self._state_space = BoxSpace(np.array([0.]), np.array([1.]))

    # Action space (PD controller on joint positions and velocities)
    if self.num_dof == 4:
        self._act_space = act_space_wam_4dof
    elif self.num_dof == 7:
        self._act_space = act_space_wam_7dof

    # Observation space (normalized time)
    self._obs_space = BoxSpace(np.array([0.]), np.array([1.]), labels=['t'])
def skyline(
    dt: Union[int, float, np.ndarray],
    t_end: Union[int, float, np.ndarray],
    t_intvl_space: BoxSpace,
    val_space: BoxSpace,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Step function that randomly samples a value from the given range, and then holds this value for a time interval
    which is also randomly sampled given a range of time intervals. This procedure is repeated until the sequence is
    long enough, i.e. covers `t_end / dt` samples.

    :param dt: time step size
    :param t_end: final time
    :param t_intvl_space: 1-dim `BoxSpace` determining the range of time intervals that can be sampled
    :param val_space: 1-dim `BoxSpace` determining the range of values that can be sampled
    :return: array of time steps together with the associated array of values
    """
    if dt <= 0:
        raise pyrado.ValueErr(given=dt, g_constraint="0")
    if t_end < dt:
        raise pyrado.ValueErr(given=t_end, ge_constraint=f"{dt}")
    if not isinstance(t_intvl_space, BoxSpace):
        raise pyrado.TypeErr(given=t_intvl_space, expected_type=BoxSpace)
    if not isinstance(val_space, BoxSpace):
        raise pyrado.TypeErr(given=val_space, expected_type=BoxSpace)
    if not t_intvl_space.flat_dim == 1:
        raise pyrado.ShapeErr(given=t_intvl_space, expected_match=(1,))
    if not val_space.flat_dim == 1:
        raise pyrado.ShapeErr(given=val_space, expected_match=(1,))

    dt = np.asarray(dt, dtype=np.float32)
    t_end = np.asarray(t_end, dtype=np.float32)

    # First iteration
    t_intvl = t_intvl_space.sample_uniform()
    t_intvl = np.clip(t_intvl, dt, t_end + dt)
    t = np.arange(start=0.0, stop=t_intvl, step=dt)
    vals = val_space.sample_uniform() * np.ones_like(t)

    # Iterate until the time is up
    while t[-1] < t_end:
        t_intvl = t_intvl_space.sample_uniform()
        t_intvl = np.clip(t_intvl, dt, t_end - t[-1] + dt)
        t_new = np.arange(start=t[-1] + dt, stop=t[-1] + t_intvl, step=dt)
        t = np.concatenate([t, t_new])
        val_new = val_space.sample_uniform() * np.ones_like(t_new)
        vals = np.concatenate([vals, val_new])

    return t, vals
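# A minimal usage sketch of `skyline`, mirroring the call signatures used in the tests and
# example scripts further below; the concrete bounds here are arbitrary:
if __name__ == "__main__":
    t, vals = skyline(
        dt=0.02,
        t_end=20.0,
        t_intvl_space=BoxSpace(0.5, 3.0, shape=(1,)),
        val_space=BoxSpace(-2.0, 3.0, shape=(1,)),
    )
    assert len(t) == len(vals)  # one sampled value per time step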
def _create_spaces(self):
    # State space (joint positions and velocities)
    state_shape = (2 * self.num_dof,)
    state_up = np.full(state_shape, pyrado.inf)
    state_lo = np.full(state_shape, -pyrado.inf)
    self._state_space = BoxSpace(state_lo, state_up)

    # Action space (PD controller on joint positions and velocities)
    if self.num_dof == 4:
        self._act_space = act_space_wam_4dof
    elif self.num_dof == 7:
        self._act_space = act_space_wam_7dof

    # Observation space (normalized time)
    self._obs_space = BoxSpace(np.array([0.]), np.array([1.]), labels=['t'])
def obs_space(self) -> Space:
    lo = np.concatenate([WAM_Q_LIMITS_LO_7DOF[:self._num_dof], WAM_QD_LIMITS_LO_7DOF[:self._num_dof]])
    up = np.concatenate([WAM_Q_LIMITS_UP_7DOF[:self._num_dof], WAM_QD_LIMITS_UP_7DOF[:self._num_dof]])
    return BoxSpace(bound_lo=lo, bound_up=up)
def obs_space(self) -> Space:
    # Observation space (normalized time and optionally cup and ball position)
    obs_lo, obs_up, labels = [0.0], [1.0], ["t"]
    if self.observe_ball:
        obs_lo.extend([-3.0, -3.0])
        obs_up.extend([3.0, 3.0])
        labels.extend(["ball_x", "ball_z"])
    if self.observe_cup:
        obs_lo.extend([-3.0, -3.0])
        obs_up.extend([3.0, 3.0])
        labels.extend(["cup_x", "cup_z"])
    return BoxSpace(obs_lo, obs_up, labels=labels)
def test_bayrn_power(ex_dir, env: SimEnv, bayrn_hparam: dict):
    pyrado.set_seed(0)

    # Environments and domain randomization
    env_real = deepcopy(env)
    env_sim = DomainRandWrapperLive(env, create_zero_var_randomizer(env))
    dp_map = create_default_domain_param_map_qq()
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)
    env_real.domain_param = dict(mass_pend_pole=0.024 * 1.1, mass_rot_pole=0.095 * 1.1)
    env_real = wrap_like_other_env(env_real, env_sim)

    # Policy and subroutine
    policy_hparam = dict(energy_gain=0.587, ref_energy=0.827)
    policy = QQubeSwingUpAndBalanceCtrl(env_sim.spec, **policy_hparam)
    subrtn_hparam = dict(
        max_iter=1,
        pop_size=8,
        num_init_states_per_domain=1,
        num_is_samples=4,
        expl_std_init=0.1,
        num_workers=1,
    )
    subrtn = PoWER(ex_dir, env_sim, policy, **subrtn_hparam)

    # Set the boundaries for the GP
    dp_nom = inner_env(env_sim).get_nominal_domain_param()
    ddp_space = BoxSpace(
        bound_lo=np.array([0.8 * dp_nom["mass_pend_pole"], 1e-8, 0.8 * dp_nom["mass_rot_pole"], 1e-8]),
        bound_up=np.array([1.2 * dp_nom["mass_pend_pole"], 1e-7, 1.2 * dp_nom["mass_rot_pole"], 1e-7]),
    )

    # Create algorithm and train
    algo = BayRn(ex_dir, env_sim, env_real, subrtn, ddp_space, **bayrn_hparam, num_workers=1)
    algo.train()
    assert algo.curr_iter == algo.max_iter or algo.stopping_criterion_met()
def state_space(self) -> Space:
    lo = np.concatenate([WAM_Q_LIMITS_LO_7DOF[:self._num_dof], WAM_QD_LIMITS_LO_7DOF[:self._num_dof]])
    up = np.concatenate([WAM_Q_LIMITS_UP_7DOF[:self._num_dof], WAM_QD_LIMITS_UP_7DOF[:self._num_dof]])
    # The label count must match the sliced bounds, i.e. 2 * num_dof entries
    return BoxSpace(
        bound_lo=lo,
        bound_up=up,
        labels=[f"q_{i}" for i in range(1, self._num_dof + 1)] + [f"qd_{i}" for i in range(1, self._num_dof + 1)],
    )
def __init__(
    self,
    mapping: Dict[int, Tuple[str, str]],
    trafo_mask: Union[list, to.Tensor],
    prior: DomainRandomizer = None,
    scale_params: bool = False,
    use_cuda: bool = False,
):
    """
    Constructor

    :param mapping: mapping from subsequent integers to domain distribution parameters, where the first string of
                    the value tuple is the name of the domain parameter (e.g. mass, length) and the second string
                    is the name of the distribution parameter (e.g. mean, std). The integers are indices of the
                    numpy array which come from the algorithm.
    :param trafo_mask: every domain parameter that is set to `True` in this mask will be learned via a 'virtual'
                       parameter, i.e. in sqrt-space, and then finally squared to retrieve the domain parameter.
                       This transformation is useful to avoid setting a negative variance.
    :param prior: prior belief about the distribution parameters in the form of a `DomainRandomizer`
    :param scale_params: if `True`, the sqrt-transformed policy parameters are scaled in the range of
                         $[-0.5, 0.5]$. The advantage of this is to make the parameter-based exploration easier.
    :param use_cuda: `True` to move the policy to the GPU, `False` (default) to use the CPU
    """
    if not isinstance(mapping, dict):
        raise pyrado.TypeErr(given=mapping, expected_type=dict)
    if not len(trafo_mask) == len(mapping):
        raise pyrado.ShapeErr(given=trafo_mask, expected_match=mapping)

    self._mapping = mapping
    self.mask = to.tensor(trafo_mask, dtype=to.bool)
    self.prior = prior
    self._scale_params = scale_params

    # Define the parameter space by using the Policy.env_spec.act_space
    param_spec = EnvSpec(obs_space=EmptySpace(), act_space=BoxSpace(-pyrado.inf, pyrado.inf, shape=len(mapping)))

    # Call Policy's constructor
    super().__init__(param_spec, use_cuda)

    self.params = nn.Parameter(to.empty(param_spec.act_space.flat_dim), requires_grad=True)
    if self._scale_params:
        self.param_scaler = None  # initialized during init_param()

    self.init_param(prior=prior)
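# Hypothetical sketch of the inputs expected by the constructor above. The mapping format
# (index -> (domain param name, distribution param name)) matches the dp_map dicts used with
# MetaDomainRandWrapper in the BayRn scripts below; the concrete names are illustrative only:
example_mapping = {0: ("ball_mass", "mean"), 1: ("ball_mass", "std")}
example_trafo_mask = [False, True]  # learn the std via sqrt-space to keep it non-negative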
def state_space(self) -> Space:
    # State space (joint positions and velocities)
    state_lo = np.concatenate([WAM_Q_LIMITS_LO_7DOF[:self._num_dof], WAM_QD_LIMITS_LO_7DOF[:self._num_dof]])
    state_up = np.concatenate([WAM_Q_LIMITS_UP_7DOF[:self._num_dof], WAM_QD_LIMITS_UP_7DOF[:self._num_dof]])

    # Ball and cup (x,y,z)-space
    if self.observe_ball:
        state_lo = np.r_[state_lo, np.full((3,), -3.0)]
        state_up = np.r_[state_up, np.full((3,), 3.0)]
    if self.observe_cup:
        state_lo = np.r_[state_lo, np.full((3,), -3.0)]
        state_up = np.r_[state_up, np.full((3,), 3.0)]

    return BoxSpace(state_lo, state_up)
#     max_iter=300,
#     min_steps=23*env_sim.max_steps,
#     num_epoch=7,
#     eps_clip=0.0744,
#     batch_size=500,
#     std_init=0.9074,
#     lr=3.446e-04,
#     max_grad_norm=1.,
#     num_workers=12,
# )
# subrtn = PPO(ex_dir, env_sim, policy, critic, **subrtn_hparam)

# Set the boundaries for the GP
dp_nom = QQubeSwingUpSim.get_nominal_domain_param()
ddp_space = BoxSpace(
    bound_lo=np.array([0.8*dp_nom['Mp'], 1e-12, 0.8*dp_nom['Mr'], 1e-12]),
    bound_up=np.array([1.2*dp_nom['Mp'], 1e-11, 1.2*dp_nom['Mr'], 1e-11])
)

# Algorithm
bayrn_hparam = dict(
    max_iter=15,
    acq_fc='UCB',
    acq_param=dict(beta=0.25),
    acq_restarts=500,
    acq_samples=1000,
    num_init_cand=4,
    warmstart=False,
    num_eval_rollouts_real=10,  # sim-2-sim
    # thold_succ_subrtn=300,
)
env = OneMassOscillatorSim(**env_hparams, task_args=dict(state_des=np.array([0.5, 0])))
env = ActNormWrapper(env)

# Policy
policy_hparam = dict(
    shared_hidden_sizes=[32, 32],
    shared_hidden_nonlin=to.relu,
)
policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam)

# Critic
qfcn_hparam = dict(
    hidden_sizes=[32, 32],
    hidden_nonlin=to.relu
)
obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
qfcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **qfcn_hparam)
qfcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **qfcn_hparam)

# Algorithm
algo_hparam = dict(
    max_iter=1000*env.max_steps,
    memory_size=100*env.max_steps,
    gamma=0.995,
    num_batch_updates=1,
    tau=0.995,
    ent_coeff_init=0.2,
    learn_ent_coeff=True,
    target_update_intvl=5,
    standardize_rew=False,
    min_steps=1,
)
subrtn = PPO(ex_dir, env_sim, policy, critic, **subrtn_hparam)

# Set the boundaries for the GP
dp_nom = QQubeSwingUpSim.get_nominal_domain_param()
ddp_space = BoxSpace(
    bound_lo=np.array([
        0.8 * dp_nom["mass_pend_pole"], dp_nom["mass_pend_pole"] / 5000,
        0.8 * dp_nom["mass_rot_pole"], dp_nom["mass_rot_pole"] / 5000,
        0.8 * dp_nom["length_pend_pole"], dp_nom["length_pend_pole"] / 5000,
        0.8 * dp_nom["length_rot_pole"], dp_nom["length_rot_pole"] / 5000,
    ]),
    bound_up=np.array([
        1.2 * dp_nom["mass_pend_pole"], dp_nom["mass_pend_pole"] / 20,
        1.2 * dp_nom["mass_rot_pole"], dp_nom["mass_rot_pole"] / 20,
        1.2 * dp_nom["length_pend_pole"], dp_nom["length_pend_pole"] / 20,
        1.2 * dp_nom["length_rot_pole"], dp_nom["length_rot_pole"] / 20,
    ]),
)

# policy_init = to.load(osp.join(pyrado.EXP_DIR, QQubeSwingUpSim.name, PPO.name, 'EXP_NAME', 'policy.pt'))
# valuefcn_init = to.load(osp.join(pyrado.EXP_DIR, QQubeSwingUpSim.name, PPO.name, 'EXP_NAME', 'valuefcn.pt'))

# Algorithm
dp_nom = WAMBallInCupSim.get_nominal_domain_param()
ddp_space = BoxSpace(
    bound_lo=np.array([
        0.95 * dp_nom["rope_length"], dp_nom["rope_length"] / 1000,
        0.0 * dp_nom["rope_damping"], dp_nom["rope_damping"] / 100,
        0.85 * dp_nom["ball_mass"], dp_nom["ball_mass"] / 1000,
        0.0 * dp_nom["joint_2_dryfriction"], dp_nom["joint_2_dryfriction"] / 100,
        0.0 * dp_nom["joint_2_damping"], dp_nom["joint_2_damping"] / 100,
    ]),
    bound_up=np.array([
        1.05 * dp_nom["rope_length"], dp_nom["rope_length"] / 20,
        2 * dp_nom["rope_damping"], dp_nom["rope_damping"] / 2,
        1.15 * dp_nom["ball_mass"], dp_nom["ball_mass"] / 10,
        2 * dp_nom["joint_2_dryfriction"], dp_nom["joint_2_dryfriction"] / 2,
        2 * dp_nom["joint_2_damping"], dp_nom["joint_2_damping"] / 2,
    ]),
)
                alpha=0.2)
    plt.scatter(x.data.numpy(), obj_fcn().numpy(), s=16, color=colors[e])
    plt.xlabel('$x$')
    plt.ylabel('$f(x)$')
    plt.legend()
    plt.show()

    assert noisy_nonlin_fcn(x, f=f, noise_std=noise_std) < noisy_nonlin_fcn(x_init, f=f, noise_std=noise_std)


@pytest.mark.parametrize('dt', [0.1], ids=['0.1'])
@pytest.mark.parametrize('t_end', [6.], ids=['6.'])
@pytest.mark.parametrize(
    't_intvl_space', [
        BoxSpace(0.1, 0.11, shape=1),
        BoxSpace(0.123, 0.456, shape=1),
        BoxSpace(10., 20., shape=1),
    ],
    ids=['small_time_intvl', 'real_time_intvl', 'large_time_intvl'])
@pytest.mark.parametrize('val_space', [BoxSpace(-5., 3., shape=1)], ids=['-5_to_3'])
def test_skyline(dt: Union[int, float], t_end: Union[int, float], t_intvl_space: BoxSpace, val_space: BoxSpace):
    # Create the skyline function
    t, vals = skyline(dt, t_end, t_intvl_space, val_space)
    assert isinstance(t, np.ndarray) and isinstance(vals, np.ndarray)
    assert len(t) == len(vals)


def test_check_prompt():
from pyrado.policies.feed_forward.linear import LinearPolicy
from pyrado.policies.features import *
from pyrado.policies.recurrent.two_headed_rnn import TwoHeadedRNNPolicyBase
from pyrado.sampling.rollout import rollout
from pyrado.sampling.step_sequence import StepSequence
from pyrado.utils.data_sets import TimeSeriesDataSet
from pyrado.utils.data_types import RenderMode
from pyrado.utils.functions import skyline
from pyrado.utils.nn_layers import IndiNonlinLayer
from tests.conftest import m_needs_bullet, m_needs_mujoco, m_needs_rcs, m_needs_libtorch
from tests.environment_wrappers.mock_env import MockEnv


@pytest.fixture(
    scope='function',
    params=[
        (skyline(0.02, 20., BoxSpace(0.5, 3, shape=(1,)), BoxSpace(-2., 3., shape=(1,)))[1], 0.7, 50, False, True),
    ],
    ids=['skyline_0.8_50_notstdized_scaled'])
def tsdataset(request):
    return TimeSeriesDataSet(
        data=to.from_numpy(request.param[0]),
        ratio_train=request.param[1],
        window_size=request.param[2],
        standardize_data=request.param[3],
        scale_min_max_data=request.param[4]
    )


@pytest.mark.features
@pytest.mark.parametrize(
    std_init=0.7573286998997557,
    lr=6.999956625305722e-04,
    max_grad_norm=1.,
    num_workers=8,
    lr_scheduler=lr_scheduler.ExponentialLR,
    lr_scheduler_hparam=dict(gamma=0.999))
subrtn = PPO(ex_dir, env_sim, policy, critic, **subrtn_hparam)

# Set the boundaries for the GP
dp_nom = QQubeSwingUpSim.get_nominal_domain_param()
ddp_space = BoxSpace(
    bound_lo=np.array([
        0.8 * dp_nom['Mp'], dp_nom['Mp'] / 5000,
        0.8 * dp_nom['Mr'], dp_nom['Mr'] / 5000,
        0.8 * dp_nom['Lp'], dp_nom['Lp'] / 5000,
        0.8 * dp_nom['Lr'], dp_nom['Lr'] / 5000
    ]),
    bound_up=np.array([
        1.2 * dp_nom['Mp'], dp_nom['Mp'] / 20,
        1.2 * dp_nom['Mr'], dp_nom['Mr'] / 20,
        1.2 * dp_nom['Lp'], dp_nom['Lp'] / 20,
        1.2 * dp_nom['Lr'], dp_nom['Lr'] / 20
    ]))

# policy_init = to.load(osp.join(pyrado.EXP_DIR, QQubeSwingUpSim.name, PPO.name, 'EXP_NAME', 'policy.pt'))
# valuefcn_init = to.load(osp.join(pyrado.EXP_DIR, QQubeSwingUpSim.name, PPO.name, 'EXP_NAME', 'valuefcn.pt'))

# Algorithm
bayrn_hparam = dict(
    thold_succ=300.,
    max_iter=15,
    acq_fc='EI',
    # acq_param=dict(beta=0.2),
def state_space(self) -> Space:
    # Normalized time
    return BoxSpace(np.array([0.0]), np.array([1.0]), labels=["t"])
# Joint position limits [rad], tightened by a 5 deg safety margin
WAM_Q_LIMITS_LO_7DOF = np.array([-2.6, -2.0, -2.8, -0.9, -4.76, -1.6, -3.0]) + 5 * np.pi / 180
WAM_Q_LIMITS_UP_7DOF = np.array([+2.6, +2.0, +2.8, +3.1, +1.24, +1.6, +3.0]) - 5 * np.pi / 180

# Arbitrarily set velocity limits [rad/s]
WAM_QD_LIMITS_LO_7DOF = -np.array([4 * np.pi, 4 * np.pi, 4 * np.pi, 4 * np.pi, 4 * np.pi, 4 * np.pi, 4 * np.pi])
WAM_QD_LIMITS_UP_7DOF = +np.array([4 * np.pi, 4 * np.pi, 4 * np.pi, 4 * np.pi, 4 * np.pi, 4 * np.pi, 4 * np.pi])

# Torque limits
# 7 DoF
MAX_TORQUE_7DOF = np.array([150.0, 125.0, 40.0, 60.0, 5.0, 5.0, 2.0])
TORQUE_SPACE_WAM_7DOF = BoxSpace(-MAX_TORQUE_7DOF, MAX_TORQUE_7DOF)
# 4 DoF
MAX_TORQUE_4DOF = MAX_TORQUE_7DOF[:4]
TORQUE_SPACE_WAM_4DOF = BoxSpace(-MAX_TORQUE_4DOF, MAX_TORQUE_4DOF)

# Default PD gains from robcom / SL
WAM_PGAINS_7DOF = np.array([200.0, 300.0, 100.0, 100.0, 10.0, 10.0, 2.5])
WAM_DGAINS_7DOF = np.array([7.0, 15.0, 5.0, 2.5, 0.3, 0.3, 0.05])
WAM_PGAINS_4DOF = WAM_PGAINS_7DOF[:4]
WAM_DGAINS_4DOF = WAM_DGAINS_7DOF[:4]

# Desired initial joint angles
# 7 DoF
INIT_QPOS_DES_7DOF = np.array([0.0, 0.5876, 0.0, 1.36, 0.0, -0.321, -1.57])
# 4 DoF arm
INIT_QPOS_DES_4DOF = np.array([0.0, 0.6, 0.0, 1.25])
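# A small sketch of how the torque spaces above can be used: `BoxSpace.sample_uniform()`
# (the same method the skyline function relies on) draws a vector inside the bounds.
# The per-joint shape of the sample is an assumption consistent with the 7-dim bounds:
tau = TORQUE_SPACE_WAM_7DOF.sample_uniform()
assert tau.shape == (7,)  # one torque value per joint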
def state_space(self):
    max_state = np.array([100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0])
    return BoxSpace(-max_state, max_state, labels=["x_1", "x_2", "x_3", "x_4", "x_5", "x_6", "x_7", "x_8"])
# Select the data set
data_set_name = args.mode or "skyline"

# Experiment
ex_dir = setup_experiment(TSPred.name, ADNPolicy.name)

# Set seed if desired
pyrado.set_seed(args.seed, verbose=True)

# Load the data
if data_set_name == "skyline":
    dt = 0.01
    _, vals = skyline(
        dt=dt, t_end=20.0, t_intvl_space=BoxSpace(0.5, 3, shape=(1,)), val_space=BoxSpace(-2.0, 3.0, shape=(1,))
    )
    data = to.from_numpy(vals).to(dtype=to.get_default_dtype()).view(-1, 1)
else:
    data = pd.read_csv(osp.join(pyrado.PERMA_DIR, "misc", f"{data_set_name}.csv"))
    if data_set_name == "daily_min_temperatures":
        data = to.tensor(data["Temp"].values, dtype=to.get_default_dtype()).view(-1, 1)
        dt = 1.0
    elif data_set_name == "monthly_sunspots":
        data = to.tensor(data["Sunspots"].values, dtype=to.get_default_dtype()).view(-1, 1)
        dt = 1.0
    elif "oscillation" in data_set_name:
        data = to.tensor(data["Positions"].values,
    5: ('ball_mass', 'std'),
    6: ('joint_stiction', 'mean'),
    7: ('joint_stiction', 'halfspan'),
    8: ('joint_damping', 'mean'),
    9: ('joint_damping', 'halfspan'),
}
env_sim = MetaDomainRandWrapper(env_sim, dp_map)

# Set the boundaries for the GP (must be consistent with dp_map)
dp_nom = WAMBallInCupSim.get_nominal_domain_param()
ddp_space = BoxSpace(
    bound_lo=np.array([
        0.95*dp_nom['rope_length'], dp_nom['rope_length']/1000,
        0.*dp_nom['rope_damping'], dp_nom['rope_damping']/100,
        0.85*dp_nom['ball_mass'], dp_nom['ball_mass']/1000,
        0.*dp_nom['joint_stiction'], dp_nom['joint_stiction']/100,
        0.*dp_nom['joint_damping'], dp_nom['joint_damping']/100
    ]),
    bound_up=np.array([
        1.05*dp_nom['rope_length'], dp_nom['rope_length']/20,
        2*dp_nom['rope_damping'], dp_nom['rope_damping']/2,
        1.15*dp_nom['ball_mass'], dp_nom['ball_mass']/10,
        2*dp_nom['joint_stiction'], dp_nom['joint_stiction']/2,
        2*dp_nom['joint_damping'], dp_nom['joint_damping']/2
    ])
)

# Setting the ip address to None ensures that robcom does not try to connect to the server pc
env_real_hparams = dict(
    num_dof=4,
    max_steps=1750,
)
env_real = WAMBallInCupRealEpisodic(**env_real_hparams)

# Policy
policy_hparam = dict(
def test_snapshots_notmeta(ex_dir, env: SimEnv, policy, algo_class, algo_hparam):
    # Collect hyper-parameters, create algorithm, and train
    common_hparam = dict(max_iter=1, num_workers=1)
    common_hparam.update(algo_hparam)

    if issubclass(algo_class, ActorCritic):
        common_hparam.update(
            min_rollouts=3,
            critic=GAE(
                vfcn=FNNPolicy(
                    spec=EnvSpec(env.obs_space, ValueFunctionSpace), hidden_sizes=[16, 16], hidden_nonlin=to.tanh
                )
            ),
        )
    elif issubclass(algo_class, ParameterExploring):
        common_hparam.update(num_init_states_per_domain=1)
    elif issubclass(algo_class, (DQL, SAC)):
        common_hparam.update(memory_size=1000, num_updates_per_step=2, gamma=0.99, min_rollouts=1)
        fnn_hparam = dict(hidden_sizes=[8, 8], hidden_nonlin=to.tanh)
        if issubclass(algo_class, DQL):
            # Override the setting
            env = BallOnBeamDiscSim(env.dt, env.max_steps)
            net = FNN(
                input_size=DiscreteActQValPolicy.get_qfcn_input_size(env.spec),
                output_size=DiscreteActQValPolicy.get_qfcn_output_size(),
                **fnn_hparam,
            )
            policy = DiscreteActQValPolicy(spec=env.spec, net=net)
        else:
            # Override the setting
            env = ActNormWrapper(env)
            policy = TwoHeadedGRUPolicy(env.spec, shared_hidden_size=8, shared_num_recurrent_layers=1)
            obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
            common_hparam.update(qfcn_1=FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **fnn_hparam))
            common_hparam.update(qfcn_2=FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **fnn_hparam))
    else:
        raise NotImplementedError

    # Simulate training
    algo = algo_class(ex_dir, env, policy, **common_hparam)
    algo.policy.param_values += to.tensor([42.0])
    if isinstance(algo, ActorCritic):
        algo.critic.vfcn.param_values += to.tensor([42.0])

    # Save and load
    algo.save_snapshot(meta_info=None)
    algo_loaded = Algorithm.load_snapshot(load_dir=ex_dir)
    assert isinstance(algo_loaded, Algorithm)
    policy_loaded = algo_loaded.policy
    if isinstance(algo, ActorCritic):
        critic_loaded = algo_loaded.critic

    # Check
    assert all(algo.policy.param_values == policy_loaded.param_values)
    if isinstance(algo, ActorCritic):
        assert all(algo.critic.vfcn.param_values == critic_loaded.vfcn.param_values)

    # Load the experiment. Since we did not save any hyper-parameters, we ignore the errors when loading.
    env, policy, extra = load_experiment(ex_dir)
    assert isinstance(env, Env)
    assert isinstance(policy, Policy)
    assert isinstance(extra, dict)
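# Note on BoxSpace.cat as used above: it concatenates two box spaces so that the Q-function
# networks receive the stacked [obs, act] vector. A hedged, self-contained sketch (assuming
# flat_dim adds up, which is consistent with how the FNNPolicy input size is derived here):
_obs = BoxSpace(-np.ones(4), np.ones(4))
_act = BoxSpace(-np.ones(1), np.ones(1))
assert BoxSpace.cat([_obs, _act]).flat_dim == _obs.flat_dim + _act.flat_dim  # 4 + 1 = 5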
def default_randomizer():
    return DomainRandomizer(
        NormalDomainParam(name="mass", mean=1.2, std=0.1, clip_lo=10, clip_up=100),
        UniformDomainParam(name="special", mean=0, halfspan=42, clip_lo=-7.4, roundint=True),
        NormalDomainParam(name="length", mean=4, std=0.6, clip_up=50.1),
        UniformDomainParam(name="time_delay", mean=13, halfspan=6, clip_up=17, roundint=True),
        MultivariateNormalDomainParam(name="multidim", mean=10 * to.ones((2,)), cov=2 * to.eye(2), clip_up=11),
    )


# --------------
# Other Fixtures
# --------------


@pytest.fixture(
    scope="function",
    params=[
        (skyline(0.02, 20.0, BoxSpace(0.5, 3, shape=(1,)), BoxSpace(-2.0, 3.0, shape=(1,)))[1], 0.7, 50, False, True),
    ],
    ids=["skyline_0.8_50_notstdized_scaled"],
)
def dataset_ts(request):
    return TimeSeriesDataSet(
        data=to.from_numpy(request.param[0]).to(dtype=to.get_default_dtype()),
        ratio_train=request.param[1],
        window_size=request.param[2],
        standardize_data=request.param[3],
        scale_min_max_data=request.param[4],
    )
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(physicsEngine="Bullet", dt=1 / 100.0, max_steps=500)
    env = BallOnPlate2DSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        shared_hidden_sizes=trial.suggest_categorical(
            "shared_hidden_sizes_policy", [(16, 16), (32, 32), (64, 64), (16, 16, 16), (32, 32, 32)]
        ),
        shared_hidden_nonlin=fcn_from_str(
            trial.suggest_categorical("shared_hidden_nonlin_policy", ["to_tanh", "to_relu"])
        ),
    )
    policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam)

    # Critic
    qfcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical(
            "hidden_sizes_critic", [(16, 16), (32, 32), (64, 64), (16, 16, 16), (32, 32, 32)]
        ),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical("hidden_nonlin_critic", ["to_tanh", "to_relu"])),
    )
    obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
    qfcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **qfcn_hparam)
    qfcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **qfcn_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=100 * env.max_steps,
        min_steps=trial.suggest_categorical("min_steps_algo", [1]),  # 10, env.max_steps, 10*env.max_steps
        memory_size=trial.suggest_loguniform("memory_size_algo", 1e2 * env.max_steps, 1e4 * env.max_steps),
        tau=trial.suggest_uniform("tau_algo", 0.99, 1.0),
        ent_coeff_init=trial.suggest_uniform("ent_coeff_init_algo", 0.1, 0.9),
        learn_ent_coeff=trial.suggest_categorical("learn_ent_coeff_algo", [True, False]),
        standardize_rew=trial.suggest_categorical("standardize_rew_algo", [False]),
        gamma=trial.suggest_uniform("gamma_algo", 0.99, 1.0),
        target_update_intvl=trial.suggest_categorical("target_update_intvl_algo", [1, 5]),
        num_updates_per_step=trial.suggest_categorical("num_batch_updates_algo", [1, 5]),
        batch_size=trial.suggest_categorical("batch_size_algo", [128, 256, 512]),
        lr=trial.suggest_loguniform("lr_algo", 1e-5, 1e-3),
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f"trial_{trial.number}"))
    algo = SAC(study_dir, env, policy, qfcn_1, qfcn_2, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode="latest", seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env, policy, num_workers=1, min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
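# How this objective is typically wired up, as the docstring's functools.partial note suggests.
# A hedged sketch using the standard Optuna API; the trial count and study_dir value are
# illustrative, not taken from this repo:
if __name__ == "__main__":
    import functools

    import optuna

    study = optuna.create_study(direction="maximize")
    study.optimize(functools.partial(train_and_eval, study_dir="study_dir", seed=0), n_trials=10, n_jobs=1)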
if env_name == QBallBalancerSim.name:
    env = QBallBalancerSim(dt=dt)

elif env_name == QCartPoleSwingUpSim.name:
    env = QCartPoleSwingUpSim(dt=dt)

elif env_name == QQubeSwingUpSim.name:
    env = QQubeSwingUpSim(dt=dt)

elif env_name == "wam-bic":
    # Avoid loading mujoco
    from pyrado.environments.mujoco.wam_bic import WAMBallInCupSim

    env = WAMBallInCupSim(num_dof=4)
    env.init_space = BoxSpace(-pyrado.inf, pyrado.inf, shape=env.init_space.shape)

elif env_name == "wam-jsc":
    # Avoid loading mujoco
    from pyrado.environments.mujoco.wam_jsc import WAMJointSpaceCtrlSim

    env = WAMJointSpaceCtrlSim(num_dof=7)
    env.init_space = BoxSpace(-pyrado.inf, pyrado.inf, shape=env.init_space.shape)

elif env_name == "mg-ik":
    # Avoid loading _rcsenv
    from pyrado.environments.rcspysim.mini_golf import MiniGolfIKSim

    env = MiniGolfIKSim(dt=dt)
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: Optional[int]):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env = QQubeSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        shared_hidden_sizes=trial.suggest_categorical(
            'shared_hidden_sizes_policy', [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]
        ),
        shared_hidden_nonlin=fcn_from_str(
            trial.suggest_categorical('shared_hidden_nonlin_policy', ['to_tanh', 'to_relu'])
        ),
    )
    policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam)

    # Critic
    q_fcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical(
            'hidden_sizes_critic', [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]
        ),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])),
    )
    obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
    q_fcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **q_fcn_hparam)
    q_fcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **q_fcn_hparam)

    # Algorithm
    algo_hparam = dict(
        num_sampler_envs=1,  # parallelize via optuna n_jobs
        max_iter=100 * env.max_steps,
        min_steps=trial.suggest_categorical('min_steps_algo', [1]),  # , 10, env.max_steps, 10*env.max_steps
        memory_size=trial.suggest_loguniform('memory_size_algo', 1e2 * env.max_steps, 1e4 * env.max_steps),
        tau=trial.suggest_uniform('tau_algo', 0.99, 1.),
        alpha_init=trial.suggest_uniform('alpha_init_algo', 0.1, 0.9),
        learn_alpha=trial.suggest_categorical('learn_alpha_algo', [True, False]),
        standardize_rew=trial.suggest_categorical('standardize_rew_algo', [False]),
        gamma=trial.suggest_uniform('gamma_algo', 0.99, 1.),
        target_update_intvl=trial.suggest_categorical('target_update_intvl_algo', [1, 5]),
        num_batch_updates=trial.suggest_categorical('num_batch_updates_algo', [1, 5]),
        batch_size=trial.suggest_categorical('batch_size_algo', [128, 256, 512]),
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
    )
    csv_logger = create_csv_step_logger(osp.join(ex_dir, f'trial_{trial.number}'))
    algo = SAC(ex_dir, env, policy, q_fcn_1, q_fcn_2, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelSampler(env, policy, num_envs=1, min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
import numpy as np
import torch as to
from matplotlib import pyplot as plt
from matplotlib import ticker

import pyrado
from pyrado.policies.features import FeatureStack, RBFFeat
from pyrado.policies.linear import LinearPolicy
from pyrado.spaces import BoxSpace
from pyrado.utils.data_types import EnvSpec


if __name__ == '__main__':
    # Define some arbitrary EnvSpec
    obs_space = BoxSpace(bound_lo=np.array([-5., -12.]), bound_up=np.array([10., 6.]))
    act_space = BoxSpace(bound_lo=np.array([-1.]), bound_up=np.array([1.]))
    spec = EnvSpec(obs_space, act_space)

    num_fpd = 5
    num_eval_points = 500

    policy_hparam = dict(
        feats=FeatureStack([RBFFeat(num_feat_per_dim=num_fpd, bounds=obs_space.bounds, scale=None)])
    )
    policy = LinearPolicy(spec, **policy_hparam)

    eval_grid_0 = to.linspace(-5., 10, num_eval_points)
    eval_grid_1 = to.linspace(-12., 6, num_eval_points)
    eval_grid = to.stack([eval_grid_0, eval_grid_1], dim=1)

    feat_vals = to.empty(num_eval_points, num_fpd*obs_space.flat_dim)
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import numpy as np

from pyrado.spaces import BoxSpace


# 4 DoF arm, 2 DoF actuated
init_qpos_des_4dof = np.array([0., 0.6, 0., 1.25])
act_min_wam_4dof = np.array([-1.985, -0.9, -10 * np.pi, -10 * np.pi])
act_max_wam_4dof = np.array([1.985, np.pi, 10 * np.pi, 10 * np.pi])
labels_4dof = ['q_1_des', 'q_3_des', 'q_dot_1_des', 'q_dot_3_des']
act_space_wam_4dof = BoxSpace(act_min_wam_4dof, act_max_wam_4dof, labels=labels_4dof)

# 7 DoF arm, 3 DoF actuated
init_qpos_des_7dof = np.array([0., 0.5876, 0., 1.36, 0., -0.321, -1.57])
act_min_wam_7dof = np.array([-1.985, -0.9, -np.pi / 2, -10 * np.pi, -10 * np.pi, -10 * np.pi])
act_max_wam_7dof = np.array([1.985, np.pi, np.pi / 2, 10 * np.pi, 10 * np.pi, 10 * np.pi])
labels_7dof = ['q_1_des', 'q_3_des', 'q_5_des', 'q_dot_1_des', 'q_dot_3_des', 'q_dot_5_des']
act_space_wam_7dof = BoxSpace(act_min_wam_7dof, act_max_wam_7dof, labels=labels_7dof)
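# Minimal sanity sketch for the 7-DoF action space defined above: a uniformly drawn action
# stays within the specified position/velocity limits (checked directly against the bound
# arrays with numpy, so only `sample_uniform` from the BoxSpace API is assumed):
act = act_space_wam_7dof.sample_uniform()
assert np.all(act_min_wam_7dof <= act) and np.all(act <= act_max_wam_7dof)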
#     num_init_states_per_domain=6,
#     num_is_samples=10,
#     expl_std_init=2.0,
#     expl_std_min=0.02,
#     symm_sampling=False,
#     num_workers=32,
# )
# subrtn = PoWER(ex_dir, env_sim, policy, **subrtn_hparam)

# Set the boundaries for the GP
dp_nom = QQubeSwingUpSim.get_nominal_domain_param()
ddp_space = BoxSpace(
    bound_lo=np.array([0.8 * dp_nom["mass_pend_pole"], 1e-8, 0.8 * dp_nom["mass_rot_pole"], 1e-8]),
    bound_up=np.array([1.2 * dp_nom["mass_pend_pole"], 1e-7, 1.2 * dp_nom["mass_rot_pole"], 1e-7]),
)

# Algorithm
bayrn_hparam = dict(
    max_iter=15,
    acq_fc="UCB",
    acq_param=dict(beta=0.25),
    acq_restarts=500,
    acq_samples=1000,
    num_init_cand=4,
    warmstart=False,
    num_eval_rollouts_real=100,