def _eval_cand_and_ref_one_domain(self, i: int) -> tuple:
    """
    Evaluate the candidate and the k-th reference solution (see outer loop) in the i-th domain using nJ rollouts.

    :param i: index of the domain to evaluate in
    :return: average return values for the candidate and the k-th reference in the i-th domain
    """
    cand_ret_avg = 0.
    refs_ret_avg = 0.

    # Do nJ rollouts for each set of physics params
    for r in range(self.nJ):
        # Candidate solution
        set_seed(self.base_seed + i * self.nJ + r)
        # Set the circular index for the particular realization
        self._env_dr.ring_idx = i
        # Do the rollout and collect the return
        ro_cand = rollout(self._env_dr, self._subrtn_cand.policy, eval=True)
        cand_ret_avg += ro_cand.undiscounted_return()

        # Reference solution
        set_seed(self.base_seed + i * self.nJ + r)
        # Set the circular index for the particular realization
        self._env_dr.ring_idx = i
        # Do the rollout and collect the return
        ro_ref = rollout(self._env_dr, self._subrtn_refs.policy, eval=True)
        refs_ret_avg += ro_ref.undiscounted_return()

    return cand_ret_avg / self.nJ, refs_ret_avg / self.nJ  # average over nJ seeds
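The candidate and the reference are deliberately evaluated under the same seed and the same `ring_idx`, so both policies face an identical domain realization; this common-random-numbers pairing reduces the variance of the estimated optimality gap. A minimal, generic sketch of the same idea (the function names here are hypothetical, not part of the code above):

import numpy as np

def paired_gap_estimate(f_cand, f_ref, base_seed: int, num_rollouts: int) -> float:
    """Estimate E[f_cand - f_ref] with common random numbers (hypothetical helper)."""
    diffs = []
    for r in range(num_rollouts):
        rng = np.random.default_rng(base_seed + r)  # identical stream for both evaluations
        realization = rng.standard_normal()
        diffs.append(f_cand(realization) - f_ref(realization))  # both see the same realization
    return float(np.mean(diffs))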
def _handle_neg_samples(self, cand_rets: np.ndarray, refs_rets: np.ndarray, k: int, i: int) -> np.ndarray:
    """
    Process negative optimality gap samples by looking at the other reference solutions.

    :param cand_rets: array of the candidate's return values
    :param refs_rets: array of the references' return values
    :param k: index of the reference solution
    :param i: index of the domain
    :return refs_rets: if a better reference has been found, the associated value will be overwritten
    """
    if refs_rets[k, i] < cand_rets[k, i]:
        print_cbt(f'\nReference {k + 1} is worse than the candidate on domain realization {i + 1}.\n'  # 1-based index
                  'Trying to replace this reference at this realization with a different one', 'y')

        for other_k in range(self.nG):
            if other_k == k:
                # Do nothing for the bad solution that brought us here
                continue
            else:
                # Load a reference solution different from the k-th
                other_ref = pyrado.load(self._subrtn_refs._policy, 'policy', 'pt', self.save_dir,
                                        dict(prefix=f'iter_{self._curr_iter}', suffix=f'ref_{other_k}'))
                other_ref_ret = 0
                for r in range(self.nJ):
                    # Set the same random seed
                    pyrado.set_seed(self.base_seed + i * self.nJ + r)
                    # Set the circular index for the particular realization
                    self._env_dr.ring_idx = i
                    # Do the rollout and collect the return
                    ro_other_ref = rollout(self._env_dr, other_ref, eval=True)
                    other_ref_ret += ro_other_ref.undiscounted_return() / self.nJ  # average over nJ seeds

                # Store the value if it is better
                if other_ref_ret > refs_rets[k, i]:
                    refs_rets[k, i] = other_ref_ret
                    # If a better one was found, do not iterate over the remaining reference solutions
                    break

        if refs_rets[k, i] > cand_rets[k, i]:
            # Found a different reference that achieves a higher return than the candidate
            print_cbt('Successfully handled a negative OG sample', 'g')
        else:
            refs_rets[k, i] = cand_rets[k, i]  # forces the optimality gap sample to be 0
            print_cbt('Unsuccessfully handled a negative OG sample: Set the value to 0', 'r')
    else:
        # Everything is as it should be
        pass

    return refs_rets
def train(self, snapshot_mode: str = "latest", seed: int = None, meta_info: dict = None):
    """
    Train one/multiple policy/policies in a given environment.

    :param snapshot_mode: determines when the snapshots are stored (e.g. on every iteration or on new high-score)
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :param meta_info: is not `None` if this algorithm is run as a subroutine of a meta-algorithm,
                      contains a dict of information about the current iteration of the meta-algorithm
    """
    if self._policy is not None:
        print_cbt(
            f"{get_class_name(self)} started training a {get_class_name(self._policy)} "
            f"with {self._policy.num_param} parameters using the snapshot mode {snapshot_mode}.",
            "g",
        )
        # Set dropout and batch normalization layers to training mode
        self._policy.train()
    else:
        print_cbt(f"{get_class_name(self)} started training using the snapshot mode {snapshot_mode}.", "g")

    # Set all rngs' seeds
    if seed is not None:
        set_seed(seed, verbose=True)

    while not self.stopping_criterion_met():
        # Record current iteration to logger
        self.logger.add_value(self.iteration_key, self._curr_iter)

        # Acquire data, save the training progress, and update the parameters
        self.step(snapshot_mode, meta_info)

        # Update logger and print
        self.logger.record_step()

        # Increase the iteration counter
        self._curr_iter += 1

    if self.stopping_criterion_met():
        stopping_reason = "Stopping criteria met!"
    else:
        stopping_reason = "Maximum number of iterations reached!"

    if self._policy is not None:
        print_cbt(
            f"{get_class_name(self)} finished training a {get_class_name(self._policy)} "
            f"with {self._policy.num_param} parameters. {stopping_reason}",
            "g",
        )
        # Set dropout and batch normalization layers to evaluation mode
        self._policy.eval()
    else:
        print_cbt(f"{get_class_name(self)} finished training. {stopping_reason}", "g")
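Elsewhere in this collection (e.g. the Optuna objectives below) `train` is always called with an explicit snapshot mode and seed; a typical invocation, assuming `algo` is any concrete algorithm instance:

# Train with per-iteration snapshots and a fixed seed for reproducibility
algo.train(snapshot_mode='latest', seed=1001)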
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1 / 250., max_steps=1500)
    env = QQubeSwingUpSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(feats=FeatureStack([
        identity_feat, sign_feat, abs_feat, squared_feat, cubic_feat,
        ATan2Feat(1, 2), MultFeat([4, 5])
    ]))
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=50,
        pop_size=trial.suggest_int('pop_size', 50, 200),
        num_rollouts=trial.suggest_int('num_rollouts', 4, 10),
        num_is_samples=trial.suggest_int('num_is_samples', 5, 40),
        expl_std_init=trial.suggest_uniform('expl_std_init', 0.1, 0.5),
        symm_sampling=trial.suggest_categorical('symm_sampling', [True, False]),
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f'trial_{trial.number}'))
    algo = PoWER(osp.join(study_dir, f'trial_{trial.number}'), env, policy, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env, policy, num_workers=1, min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
def __init__(self,
             env_name: str,
             algo_name: str,
             extra_info: str = None,
             exp_id: str = None,
             timestamp: datetime = None,
             base_dir: str = pyrado.TEMP_DIR,
             seed: int = None):
    """
    Constructor

    :param env_name: environment trained on
    :param algo_name: algorithm trained with
    :param extra_info: additional information on the experiment (freeform)
    :param exp_id: combined timestamp and extra_info, usually the final folder name
    :param timestamp: experiment creation timestamp
    :param base_dir: base storage directory
    :param seed: seed value for the random number generators, pass None for no seeding
    """
    if exp_id is not None:
        # Try to parse extra_info from exp id
        sd = exp_id.split('--', 1)
        if len(sd) == 1:
            timestr = sd[0]
        else:
            timestr, extra_info = sd
        # Parse time string
        if '_' in timestr:
            timestamp = datetime.strptime(timestr, timestamp_format)
        else:
            timestamp = datetime.strptime(timestr, timestamp_date_format)
    else:
        # Create exp id from timestamp and info
        if timestamp is None:
            timestamp = datetime.now()
        exp_id = timestamp.strftime(timestamp_format)
        if extra_info is not None:
            exp_id = exp_id + '--' + extra_info

    # Store values
    self.env_name = env_name
    self.algo_name = algo_name
    self.extra_info = extra_info
    self.exp_id = exp_id
    self.timestamp = timestamp
    self.base_dir = base_dir
    self.seed = seed

    # Set the random seed
    if seed is not None:
        pyrado.set_seed(seed)
        print_cbt(f"Set the random number generators' seed to {seed}.", 'y')
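The second constructor branch derives the experiment id from the current time; a short usage sketch of that path (`Experiment` is a stand-in name here, since the class this `__init__` belongs to is not shown above):

# Creates an id of the form <timestamp>--ident-sin and seeds the rngs
exp = Experiment(env_name='bob', algo_name='hc_normal', extra_info='ident-sin', seed=42)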
def test_domain_param_transforms(env: SimEnv, trafo_class: Type):
    pyrado.set_seed(0)

    # Create a mask for a random domain parameter
    offset = 1
    idx = random.randint(0, len(env.supported_domain_param) - 1)
    sel_dp_change = list(env.supported_domain_param)[idx]
    sel_dp_fix = list(env.supported_domain_param)[(idx + offset) % len(env.supported_domain_param)]
    while (offset == 1
           or any([item in sel_dp_change for item in VORTEX_ONLY_DOMAIN_PARAM_LIST])
           or any([item in sel_dp_fix for item in VORTEX_ONLY_DOMAIN_PARAM_LIST])):
        idx = random.randint(0, len(env.supported_domain_param) - 1)
        sel_dp_change = list(env.supported_domain_param)[idx]
        sel_dp_fix = list(env.supported_domain_param)[(idx + offset) % len(env.supported_domain_param)]
        offset += 1
    mask = (sel_dp_change,)

    wenv = trafo_class(env, mask)
    assert isinstance(wenv, DomainParamTransform)

    # Check 5 random values
    for _ in range(5):
        # Change the selected domain parameter
        new_dp_val = random.random() * env.get_nominal_domain_param()[sel_dp_change]
        new_dp_val = abs(new_dp_val) + 1e-6  # due to the domain of the new params
        transformed_new_dp_val = wenv.forward(new_dp_val)
        wenv.domain_param = {sel_dp_change: transformed_new_dp_val}  # calls inverse transform
        if not isinstance(inner_env(wenv), SimPyEnv):
            wenv.reset()  # the RcsPySim and MujocoSim classes need to be reset to apply the new domain param

        # Test the actual domain param and the getters
        assert inner_env(wenv)._domain_param[sel_dp_change] == pytest.approx(new_dp_val, abs=1e-5)
        assert wenv.domain_param[sel_dp_change] == pytest.approx(new_dp_val, abs=1e-5)
        assert wenv.domain_param[sel_dp_fix] != pytest.approx(new_dp_val)
def reset(self, seed: int = None):
    """
    Reset the algorithm to its initial state. This should NOT reset learned policy parameters.
    By default, this resets the iteration count and the exploration strategy.
    Be sure to call this function if you override it.

    :param seed: seed value for the random number generators, pass `None` for no seeding
    """
    # Reset the exploration strategy if any
    if self.expl_strat is not None:
        self.expl_strat.reset_expl_params()

    # Reset internal variables
    self._curr_iter = 0
    self._cnt_samples = 0
    self._highest_avg_ret = -pyrado.inf

    # Set all rngs' seeds
    if seed is not None:
        set_seed(seed, verbose=True)
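Since the docstring asks subclasses overriding `reset` to call the base implementation, a minimal sketch of an override honoring that contract could look as follows (the replay buffer is a hypothetical piece of subclass state, and `Algorithm` is assumed to be the base class defining the method above):

class MyAlgo(Algorithm):
    def reset(self, seed: int = None):
        # Reset subclass-specific state first (hypothetical buffer)
        self._replay_buffer.clear()
        # Let the base class reset the iteration count, exploration strategy, and rng seeds
        super().reset(seed)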
def test_env_specific(env: Env):
    pyrado.set_seed(0)

    if "qbb" in env.name:
        policy = QBallBalancerPDCtrl(env.spec)
        policy.reset()
    elif "qcp" in env.name:
        policy = QCartPoleSwingUpAndBalanceCtrl(env.spec)
        policy.reset()
    elif "qq" in env.name:
        policy = QQubeSwingUpAndBalanceCtrl(env.spec)
        policy.reset()
    else:
        raise NotImplementedError

    # Sample an observation and do an action 10 times
    for _ in range(10):
        obs = env.obs_space.sample_uniform()
        obs = to.from_numpy(obs).to(dtype=to.get_default_dtype())
        act = policy(obs)
        assert isinstance(act, to.Tensor)
def test_action_statistics(env, policy):
    sigma = 1.  # with lower values like 0.1 we can observe violations of the tolerances

    # Create an action-based exploration strategy
    explstrat = NormalActNoiseExplStrat(policy, std_init=sigma)

    # Sample a deterministic rollout
    pyrado.set_seed(0)
    ro_policy = rollout(env, policy, eval=True, max_steps=1000, stop_on_done=False)
    ro_policy.torch()

    # Run the exploration strategy on the previously sampled rollout
    if policy.is_recurrent:
        if isinstance(policy, TwoHeadedPolicy):
            act_expl, _, _ = explstrat(ro_policy.observations)
        else:
            act_expl, _ = explstrat(ro_policy.observations)
        # Get the hidden states from the deterministic rollout
        hidden_states = ro_policy.hidden_states
    else:
        if isinstance(policy, TwoHeadedPolicy):
            act_expl, _ = explstrat(ro_policy.observations)
        else:
            act_expl = explstrat(ro_policy.observations)
        hidden_states = [0.] * ro_policy.length  # just something that does not violate the format

    ro_expl = StepSequence(
        actions=act_expl[:-1],  # truncate act due to last obs
        observations=ro_policy.observations,
        rewards=ro_policy.rewards,  # don't care but necessary
        hidden_states=hidden_states
    )

    # Compute action statistics and the ground truth
    actstats = compute_action_statistics(ro_expl, explstrat)
    gt_logprobs = Normal(loc=ro_policy.actions, scale=sigma).log_prob(ro_expl.actions)
    gt_entropy = Normal(loc=ro_policy.actions, scale=sigma).entropy()

    to.testing.assert_allclose(actstats.log_probs, gt_logprobs, rtol=1e-4, atol=1e-5)
    to.testing.assert_allclose(actstats.entropy, gt_entropy, rtol=1e-4, atol=1e-5)
def test_order_act_noise_act_norm(env: SimEnv):
    # First the noise wrapper, then the normalization wrapper
    wrapped_env_noise = GaussianActNoiseWrapper(env,
                                                noise_mean=0.2 * np.ones(env.act_space.shape),
                                                noise_std=0.1 * np.ones(env.act_space.shape))
    wrapped_env_noise_norm = ActNormWrapper(wrapped_env_noise)

    # First the normalization wrapper, then the noise wrapper
    wrapped_env_norm = ActNormWrapper(env)
    wrapped_env_norm_noise = GaussianActNoiseWrapper(wrapped_env_norm,
                                                     noise_mean=0.2 * np.ones(env.act_space.shape),
                                                     noise_std=0.1 * np.ones(env.act_space.shape))

    # Sample some values directly from the act_spaces
    for i in range(3):
        pyrado.set_seed(i)
        act_noise_norm = wrapped_env_noise_norm.act_space.sample_uniform()

        pyrado.set_seed(i)
        act_norm_noise = wrapped_env_norm_noise.act_space.sample_uniform()

        # These samples must be the same since they were not passed through the _process_act function
        assert np.allclose(act_noise_norm, act_norm_noise)

    # Process a sampled action
    for i in range(3):
        # Sample a small random action such that the de-normalization does not map it to the act_space limits
        rand_act = 0.01 * env.act_space.sample_uniform()

        pyrado.set_seed(i)
        wrapped_env_noise_norm.reset()
        obs_noise_norm, _, _, _ = wrapped_env_noise_norm.step(rand_act)

        pyrado.set_seed(i)
        wrapped_env_norm_noise.reset()
        obs_norm_noise, _, _, _ = wrapped_env_norm_noise.step(rand_act)

        # The order of processing (noise first vs. normalization first) must make a difference
        assert not np.allclose(obs_noise_norm, obs_norm_noise)
def adn_variant(dt, max_steps, max_dist_force, physics_engine, normalize_obs=True, obsnorm_cpp=True):
    pyrado.set_seed(1001)

    # Explicit normalization bounds
    elb = {
        'EffectorLoadCell_Fx': -100.,
        'EffectorLoadCell_Fz': -100.,
        'Effector_Xd': -1,
        'Effector_Zd': -1,
        'GD_DS0d': -1,
        'GD_DS1d': -1,
        'GD_DS2d': -1,
    }
    eub = {
        'GD_DS0': 3.,
        'GD_DS1': 3,
        'GD_DS2': 3,
        'EffectorLoadCell_Fx': 100.,
        'EffectorLoadCell_Fz': 100.,
        'Effector_Xd': .5,
        'Effector_Zd': .5,
        'GD_DS0d': .5,
        'GD_DS1d': .5,
        'GD_DS2d': .5,
        'PredCollCost_h50': 1000.
    }

    extra_kwargs = {}
    if normalize_obs and obsnorm_cpp:
        extra_kwargs['normalizeObservations'] = True
        extra_kwargs['obsNormOverrideLower'] = elb
        extra_kwargs['obsNormOverrideUpper'] = eub

    # Set up environment
    env = Planar3LinkTASim(physicsEngine=physics_engine,
                           dt=dt,
                           max_steps=max_steps,
                           max_dist_force=max_dist_force,
                           collisionAvoidanceIK=True,
                           taskCombinationMethod='sum',
                           **extra_kwargs)
    if normalize_obs and not obsnorm_cpp:
        env = ObsNormWrapper(env, explicit_lb=elb, explicit_ub=eub)

    # Set up random policy
    policy_hparam = dict(
        tau_init=0.2,
        activation_nonlin=to.sigmoid,
        potentials_dyn_fcn=pd_cubic,
    )
    policy = ADNPolicy(spec=env.spec, dt=dt, **policy_hparam)
    print_cbt('Running ADNPolicy with random initialization', 'c', bright=True)

    # Simulate and plot potentials
    ro = rollout(env, policy, render_mode=RenderMode(video=True), stop_on_done=True)
    plot_potentials(ro)

    return ro
                        figsize=(8, 12), sharex='col', tight_layout=True)

# Try to run several possible cases
for pe in ['Bullet', 'Vortex']:
    print_cbt(f'Running with {pe} physics engine', 'c', bright=True)

    if rcsenv.supportsPhysicsEngine(pe):
        env, policy = create_setup(pe, dt=0.01, max_steps=1000, max_dist_force=0.)

        # Simulate
        pyrado.set_seed(1)
        ro = rollout(env, policy, render_mode=RenderMode(video=True))

        # Render plots
        axs[0].plot(ro.observations[:, 0], label=pe)
        axs[1].plot(ro.observations[:, 1], label=pe)
        axs[2].plot(ro.observations[:, 2], label=pe)
        axs[0].legend()
        axs[1].legend()
        axs[2].legend()

# Show plots
axs[0].set_title('gBotKuka.xml')
axs[0].set_ylabel('plate x pos')
axs[1].set_ylabel('plate y pos')
axs[2].set_ylabel('plate z pos')
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: [int, None]):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env = QQubeSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        shared_hidden_sizes=trial.suggest_categorical(
            'shared_hidden_sizes_policy', [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]),
        shared_hidden_nonlin=fcn_from_str(
            trial.suggest_categorical('shared_hidden_nonlin_policy', ['to_tanh', 'to_relu'])),
    )
    policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam)

    # Critic
    q_fcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical(
            'hidden_sizes_critic', [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])),
    )
    obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
    q_fcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **q_fcn_hparam)
    q_fcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **q_fcn_hparam)

    # Algorithm
    algo_hparam = dict(
        num_sampler_envs=1,  # parallelize via optuna n_jobs
        max_iter=100 * env.max_steps,
        min_steps=trial.suggest_categorical('min_steps_algo', [1]),  # , 10, env.max_steps, 10*env.max_steps
        memory_size=trial.suggest_loguniform('memory_size_algo', 1e2 * env.max_steps, 1e4 * env.max_steps),
        tau=trial.suggest_uniform('tau_algo', 0.99, 1.),
        alpha_init=trial.suggest_uniform('alpha_init_algo', 0.1, 0.9),
        learn_alpha=trial.suggest_categorical('learn_alpha_algo', [True, False]),
        standardize_rew=trial.suggest_categorical('standardize_rew_algo', [False]),
        gamma=trial.suggest_uniform('gamma_algo', 0.99, 1.),
        target_update_intvl=trial.suggest_categorical('target_update_intvl_algo', [1, 5]),
        num_batch_updates=trial.suggest_categorical('num_batch_updates_algo', [1, 5]),
        batch_size=trial.suggest_categorical('batch_size_algo', [128, 256, 512]),
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
    )
    csv_logger = create_csv_step_logger(osp.join(ex_dir, f'trial_{trial.number}'))
    algo = SAC(ex_dir, env, policy, q_fcn_1, q_fcn_2, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelSampler(env, policy, num_envs=1, min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
Play around with PyTorch's 1-dim convolution class (in the context of using it for the NFPolicy class)

.. seealso::
    # https://towardsdatascience.com/a-basic-introduction-to-separable-convolutions-b99ec3102728
    # https://github.com/jayleicn/TVQAplus/blob/master/model/cnn.py
"""
import torch as to
import torch.nn as nn
from matplotlib import pyplot as plt

import pyrado
from pyrado.policies.initialization import init_param
from pyrado.utils.nn_layers import MirrConv1d


if __name__ == '__main__':
    pyrado.set_seed(0)

    hand_coded_filter = False  # use a ramp from 0 to 1 instead of random weights
    use_depth_wise_conv = False
    use_custom_mirr_layer = False
    use_custom_bell_init = True

    batch_size = 1
    num_neurons = 360  # each potential-based neuron is basically like time steps of a signal
    in_channels = 1  # number of input signals
    out_channels = 6  # number of filters
    if hand_coded_filter:
        out_channels = 1
    kernel_size = 16  # larger numbers smooth out and reduce the length of the output signal; use odd numbers
    padding_mode = 'circular'  # circular, reflective, zeros
    padding = kernel_size // 2 if padding_mode != 'circular' else kernel_size - 1
def test_out_of_bounds_base_seed(base_seed, sub_seed, sub_sub_seed, expected):
    assert pyrado.set_seed(base_seed, sub_seed, sub_sub_seed, verbose=True) == expected
    assert pyrado.get_base_seed() == base_seed
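The test shows that `pyrado.set_seed` accepts optional sub-seeds besides the base seed, and that the base seed can be queried afterwards; a short usage sketch based on that:

import pyrado

pyrado.set_seed(42, 3, 7, verbose=True)  # base seed plus two sub-seeds
assert pyrado.get_base_seed() == 42  # the base seed is what gets stored for retrieval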
def _run_set_seed(G, seed):
    """Ignore the global space and forward to `pyrado.set_seed()`."""
    pyrado.set_seed(seed)
Play around with PyTorch's 1-dim convolution class (in the context of using it for the NFPolicy class)

.. seealso::
    # https://towardsdatascience.com/a-basic-introduction-to-separable-convolutions-b99ec3102728
    # https://github.com/jayleicn/TVQAplus/blob/master/model/cnn.py
"""
import torch as to
import torch.nn as nn
from matplotlib import pyplot as plt

import pyrado
from pyrado.utils.nn_layers import MirrConv1d


if __name__ == '__main__':
    pyrado.set_seed(10)

    hand_coded_filter = False  # use a ramp from 0 to 1 instead of random weights
    use_depth_wise_conv = False
    use_custom_symm_init = True

    batch_size = 1
    num_neurons = 360  # each potential-based neuron is basically like time steps of a signal
    in_channels = 2  # number of input signals
    out_channels = 6  # number of filters
    if hand_coded_filter:
        out_channels = 1
    kernel_size = 17  # larger numbers smooth out and reduce the length of the output signal; use odd numbers
    padding_mode = 'circular'  # circular, reflective, zeros
    padding = kernel_size//2 if padding_mode != 'circular' else kernel_size - 1
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env = QBallBalancerSim(dt=1 / 250.0, max_steps=1500)
    env = ActNormWrapper(env)

    # Learning rate scheduler
    lrs_gamma = trial.suggest_categorical("exp_lr_scheduler_gamma", [None, 0.99, 0.995, 0.999])
    if lrs_gamma is not None:
        lr_sched = lr_scheduler.ExponentialLR
        lr_sched_hparam = dict(gamma=lrs_gamma)
    else:
        lr_sched, lr_sched_hparam = None, dict()

    # Policy
    policy = FNNPolicy(
        spec=env.spec,
        hidden_sizes=trial.suggest_categorical("hidden_sizes_policy", [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical("hidden_nonlin_policy", ["to_tanh", "to_relu"])),
    )

    # Critic
    vfcn = FNN(
        input_size=env.obs_space.flat_dim,
        output_size=1,
        hidden_sizes=trial.suggest_categorical("hidden_sizes_critic", [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical("hidden_nonlin_critic", ["to_tanh", "to_relu"])),
    )
    critic_hparam = dict(
        batch_size=250,
        gamma=trial.suggest_uniform("gamma_critic", 0.99, 1.0),
        lamda=trial.suggest_uniform("lamda_critic", 0.95, 1.0),
        num_epoch=trial.suggest_int("num_epoch_critic", 1, 10),
        lr=trial.suggest_loguniform("lr_critic", 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical("standardize_adv_critic", [True, False]),
        max_grad_norm=trial.suggest_categorical("max_grad_norm_critic", [None, 1.0, 5.0]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    critic = GAE(vfcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=300,
        batch_size=250,
        min_steps=trial.suggest_int("num_rollouts_algo", 10, 30) * env.max_steps,
        num_epoch=trial.suggest_int("num_epoch_algo", 1, 10),
        eps_clip=trial.suggest_uniform("eps_clip_algo", 0.05, 0.2),
        std_init=trial.suggest_uniform("std_init_algo", 0.5, 1.0),
        lr=trial.suggest_loguniform("lr_algo", 1e-5, 1e-3),
        max_grad_norm=trial.suggest_categorical("max_grad_norm_algo", [None, 1.0, 5.0]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    algo = PPO(osp.join(study_dir, f"trial_{trial.number}"), env, policy, critic, **algo_hparam)

    # Train without saving the results
    algo.train(snapshot_mode="latest", seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env, policy, num_workers=1, min_rollouts=min_rollouts)
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Load the data
    data_set_name = "oscillation_50Hz_initpos-0.5"
    data = pd.read_csv(osp.join(pyrado.PERMA_DIR, "misc", f"{data_set_name}.csv"))
    if data_set_name == "daily_min_temperatures":
        data = to.tensor(data["Temp"].values, dtype=to.get_default_dtype()).view(-1, 1)
    elif data_set_name == "monthly_sunspots":
        data = to.tensor(data["Sunspots"].values, dtype=to.get_default_dtype()).view(-1, 1)
    elif "oscillation" in data_set_name:
        data = to.tensor(data["Positions"].values, dtype=to.get_default_dtype()).view(-1, 1)
    else:
        raise pyrado.ValueErr(
            given=data_set_name,
            eq_constraint="'daily_min_temperatures', 'monthly_sunspots', "
                          "'oscillation_50Hz_initpos-0.5', or 'oscillation_100Hz_initpos-0.4'",
        )

    # Dataset
    data_set_hparam = dict(
        name=data_set_name,
        ratio_train=0.7,
        window_size=trial.suggest_int("dataset_window_size", 1, 100),
        standardize_data=False,
        scale_min_max_data=True,
    )
    dataset = TimeSeriesDataSet(data, **data_set_hparam)

    # Policy
    policy_hparam = dict(
        dt=0.02 if "oscillation" in data_set_name else 1.0,
        obs_layer=None,
        activation_nonlin=to.tanh,
        potentials_dyn_fcn=fcn_from_str(
            trial.suggest_categorical("policy_potentials_dyn_fcn", ["pd_linear", "pd_cubic"])
        ),
        tau_init=trial.suggest_loguniform("policy_tau_init", 1e-2, 1e3),
        tau_learnable=True,
        kappa_init=trial.suggest_categorical("policy_kappa_init", [0, 1e-4, 1e-2]),
        kappa_learnable=True,
        capacity_learnable=True,
        potential_init_learnable=trial.suggest_categorical("policy_potential_init_learnable", [True, False]),
        init_param_kwargs=trial.suggest_categorical("policy_init_param_kwargs", [None]),
        use_cuda=False,
    )
    policy = ADNPolicy(spec=EnvSpec(act_space=InfBoxSpace(shape=1), obs_space=InfBoxSpace(shape=1)), **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        windowed=trial.suggest_categorical("algo_windowed", [True, False]),
        max_iter=1000,
        optim_class=optim.Adam,
        optim_hparam=dict(
            lr=trial.suggest_uniform("optim_lr", 5e-4, 5e-2),
            eps=trial.suggest_uniform("optim_eps", 1e-8, 1e-5),
            weight_decay=trial.suggest_uniform("optim_weight_decay", 5e-5, 5e-3),
        ),
        loss_fcn=nn.MSELoss(),
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f"trial_{trial.number}"))
    algo = TSPred(study_dir, dataset, policy, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode="latest", seed=seed)

    # Evaluate
    num_init_samples = dataset.window_size
    _, loss_trn = TSPred.evaluate(
        policy,
        dataset.data_trn_inp,
        dataset.data_trn_targ,
        windowed=algo.windowed,
        num_init_samples=num_init_samples,
        cascaded=False,
    )
    _, loss_tst = TSPred.evaluate(
        policy,
        dataset.data_tst_inp,
        dataset.data_tst_targ,
        windowed=algo.windowed,
        num_init_samples=num_init_samples,
        cascaded=False,
    )

    return loss_trn
f"Did not find a recorded real trajectory (qpos_real_{mode} and qvel_real_{mode}) for this policy. " f"Run deployment/run_policy_wam.py to get real-world trajectories.", "y", bright=True, ) # Load the policy and the environment env, policy, _ = load_experiment(ex_dir, args) # Get nominal environment env = remove_all_dr_wrappers(env) env.domain_param = env.get_nominal_domain_param() env.stop_on_collision = False # Fix seed for reproducibility pyrado.set_seed(args.seed) # Use the recorded initial state from the real system init_state = env.init_space.sample_uniform() if real_data_exists: if input( "Use the recorded initial state from the real system? [y] / n " ).lower() == "" or "y": init_state[:env.num_dof] = qpos_real[0, :] # Define indices of actuated joints act_idcs = [1, 3, 5] if env.num_dof == 7 else [1, 3] # Do rollout in simulation ro = rollout(env, policy,
def bootstrap_ci(
    data: np.ndarray,
    stat_fcn: Callable,
    num_reps: int,
    alpha: float,
    ci_sides: int,
    bias_correction: bool = False,
    studentized: bool = False,
    seed: int = None,
):
    r"""
    Re-sample the input data using the nonparametric bootstrap method, compute bootstrap replications using
    `stat_fcn`, and compute a confidence interval on the statistic of interest given by `stat_fcn`, which needs to
    accept the argument `axis` (like numpy functions do).

    .. seealso::
        [1] https://projecteuclid.org/download/pdf_1/euclid.ss/1032280214
        [2] https://people.csail.mit.edu/tommi/papers/SteJaa-nips03.pdf
        [3] Cameron & Trivedi, "Microeconometrics: Methods and Applications", 2005, page 361
        [4] http://users.stat.umn.edu/~helwig/notes/bootci-Notes.pdf
        [5] https://www.diva-portal.org/smash/get/diva2:130905/FULLTEXT01.pdf
        [6] https://www.ethz.ch/content/dam/ethz/special-interest/math/statistics/sfs/Education/Advanced%20Studies%20in%20Applied%20Statistics/course-material-1719/Nonparametric%20Methods/lecture_2up.pdf
        [7] https://ocw.mit.edu/courses/mathematics/18-05-introduction-to-probability-and-statistics-spring-2014/readings/MIT18_05S14_Reading24.pdf

    :param data: data to bootstrap from (for now only 1D arrays supported)
    :param stat_fcn: function to compute a statistic of interest (e.g. mean, variance) on bootstrap samples
    :param num_reps: number of bootstrap replications, i.e. how often the data is re-sampled
    :param alpha: determines the confidence level $1 - \alpha \in [0, 1]$
    :param ci_sides: one or two-sided confidence interval
    :param bias_correction: bool to decide if the bias should be subtracted (see [2]). However, the confidence
                            intervals are constructed independent of the bias-correction (see [5, p.7]).
                            The bias-correction can be dangerous in practice: even though T_bc(D) is less biased
                            than T(D), the bias-corrected estimator may have substantially larger variance. This is
                            due to a possibly higher variability in the estimate of the bias, particularly when
                            computed from small data sets. Estimates of the bias-correction factor other than
                            stat_emp are possible, see [4].
    :param studentized: flag to determine if the method based on the t-distribution is used (leads to a wider ci)
    :param seed: value for the random number generators' seeds, pass `None` to skip seeding
    :return: mean of the bootstrap replications, and the confidence interval
    """
    if not isinstance(data, np.ndarray):
        raise pyrado.TypeErr(given=data, expected_type=np.ndarray)
    if not callable(stat_fcn):
        raise pyrado.TypeErr(given=stat_fcn, expected_type=Callable)
    if not isinstance(alpha, (int, float)):
        raise pyrado.TypeErr(given=alpha, expected_type=[int, float])
    if not (isinstance(num_reps, int) and num_reps > 0):
        raise pyrado.TypeErr(given=num_reps, expected_type=int)
    if not (ci_sides == 1 or ci_sides == 2):
        raise pyrado.ValueErr(given=ci_sides, eq_constraint="1 or 2")

    data = np.atleast_2d(data)
    if data.shape[0] == 1:
        data = np.transpose(data)  # correct for np.atleast_2d
    if data.ndim > 2:
        raise pyrado.ShapeErr(msg="The data array needs to be at max two-dimensional!")
    num_data_samples = data.shape[0]
    dim_data_samples = data.shape[1]

    # Set the seed if provided
    pyrado.set_seed(seed)

    # Get the bootstrap replications. The size of the samples drawn by the bootstrap method has to equal the size
    # of the input sample, since the variance of the statistic to be computed depends on the sample size.
    data_bs = np.stack(
        [data[np.random.choice(num_data_samples, num_data_samples, replace=True)] for _ in range(num_reps)], axis=2
    )
    # data_bs = np.random.choice(data, size=(num_data_samples, dim_data_samples, num_reps), replace=True)

    # Compute the statistic of interest based on the empirical distribution (input data)
    stat_emp = stat_fcn(data, axis=0)
    assert stat_emp.shape == (dim_data_samples,)

    # Compute the statistic of interest based on the resampled distribution -->> bootstrap replications
    stat_bs = stat_fcn(data_bs, axis=0)
    assert stat_bs.shape == (dim_data_samples, num_reps)

    # Correct for the bias introduced by bootstrapping
    if bias_correction:
        # Bias-corrected statistic (see (2) in [2], or (11.10) in [3])
        # repl_bc = stat_emp - bias, with bias = mean_repl - stat_emp
        stat_bs_bc = 2 * stat_emp.reshape(-1, 1) - np.mean(stat_bs, axis=1, keepdims=True)

        # Return the bias-corrected estimator based on the original sample a.k.a. empirical distribution,
        # but use the correction also for the bootstrap replications
        stat_ret = stat_bs_bc
    else:
        # Return the estimator based on the original sample a.k.a. empirical distribution
        stat_ret = stat_emp

    # Compute the deviation from the value of the statistic based on the empirical distribution (see [7]). This is
    # analogous to the deviation of the empirical value around the true population value,
    # i.e. delta = stat_emp - stat_pop.
    # Note: it makes no difference if one uses the percentile operator before or after this difference.
    delta_bs = stat_bs - stat_emp.reshape(-1, 1)
    assert delta_bs.shape == (dim_data_samples, num_reps)

    # Confidence interval with asymptotic refinement (a.k.a. percentile-t method)
    if studentized:
        # Compute the standard error of the original sample
        se_emp = np.std(data, axis=0, ddof=0) / np.sqrt(data.shape[0])  # for dividing by (n-1) set ddof=1
        assert se_emp.shape == (dim_data_samples,)
        if np.any(se_emp < 1e-9):
            print_cbt("The standard error of the empirical data (se_emp) is below 1e-9.", "y")

        # Compute the standard error of the replications for the bootstrapped t-statistic
        se_bs = np.std(data_bs, axis=0, ddof=0) / np.sqrt(data_bs.shape[0])
        assert se_bs.shape == (dim_data_samples, num_reps)
        if np.any(se_bs < 1e-9):  # use any for version 2 above
            print_cbt(
                "The standard error of the bootstrapped data (se_bs) is below 1e-9. "
                "Setting confidence interval bounds to infinity.",
                "y",
            )
            return stat_ret, -pyrado.inf, pyrado.inf

        # Compute the t-statistic of the replications
        t_bs = delta_bs / se_bs  # is consistent with [3, p. 360]
        t_bs.sort()

        # Two-sided confidence interval
        if ci_sides == 2:
            t_lo, t_up = np.percentile(t_bs, 100 * np.array([alpha / 2, 1 - alpha / 2]), axis=1)
        # One-sided confidence interval (lower and upper bound as if there would only be one of them)
        else:
            t_lo, t_up = np.percentile(t_bs, 100 * np.array([alpha, 1 - alpha]), axis=1)

        ci_lo = stat_emp - t_up * se_emp  # see [3, (11.6) p. 364]
        ci_up = stat_emp - t_lo * se_emp  # see [3, (11.6) p. 364]

    # Confidence interval without asymptotic refinement (a.k.a. basic method)
    else:
        delta_bs.sort()

        # Two-sided confidence interval
        if ci_sides == 2:
            delta_lo, delta_up = np.percentile(delta_bs, 100 * np.array([alpha / 2, 1 - alpha / 2]), axis=1)
        # One-sided confidence interval (lower and upper bound as if there would only be one of them)
        else:
            delta_lo, delta_up = np.percentile(delta_bs, 100 * np.array([alpha, 1 - alpha]), axis=1)

        ci_lo = stat_emp - delta_up
        ci_up = stat_emp - delta_lo

    assert ci_lo.shape == (dim_data_samples,)
    assert ci_up.shape == (dim_data_samples,)

    return stat_ret, ci_lo, ci_up
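A short usage sketch of `bootstrap_ci`, matching the signature and the three return values above; the data here is synthetic:

import numpy as np

data = np.random.default_rng(0).normal(loc=1.0, scale=0.5, size=100)
mean_est, ci_lo, ci_up = bootstrap_ci(data, stat_fcn=np.mean, num_reps=1000, alpha=0.05, ci_sides=2, seed=0)
print(f"mean estimate: {mean_est}, 95% confidence interval: [{ci_lo}, {ci_up}]")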
def rollout(
    env: Env,
    policy: Union[nn.Module, Policy, Callable],
    eval: bool = False,
    max_steps: Optional[int] = None,
    reset_kwargs: Optional[dict] = None,
    render_mode: RenderMode = RenderMode(),
    render_step: int = 1,
    no_reset: bool = False,
    no_close: bool = False,
    record_dts: bool = False,
    stop_on_done: bool = True,
    seed: Optional[int] = None,
    sub_seed: Optional[int] = None,
    sub_sub_seed: Optional[int] = None,
) -> StepSequence:
    """
    Perform a rollout (i.e. sample a trajectory) in the given environment using given policy.

    :param env: environment to use (`SimEnv` or `RealEnv`)
    :param policy: policy to determine the next action given the current observation.
                   This policy may be wrapped by an exploration strategy.
    :param eval: pass `False` if the rollout is executed during training, else `True`. Forwarded to PyTorch `Module`.
    :param max_steps: maximum number of time steps, if `None` the environment's property is used
    :param reset_kwargs: keyword arguments passed to environment's reset function
    :param render_mode: determines if the user sees an animation, console prints, or nothing
    :param render_step: rendering interval, renders every step if set to 1
    :param no_reset: do not reset the environment before running the rollout
    :param no_close: do not close (and disconnect) the environment after running the rollout
    :param record_dts: flag if the time intervals of different parts of one step should be recorded (for debugging)
    :param stop_on_done: set to false to ignore the environment's done flag (for debugging)
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :param sub_seed: sub-seed value for the random number generators, forwarded to `pyrado.set_seed`
    :param sub_sub_seed: sub-sub-seed value for the random number generators, forwarded to `pyrado.set_seed`
    :return: paths of the observations, actions, rewards, and information about the environment as well as the policy
    """
    # Check the input
    if not isinstance(env, Env):
        raise pyrado.TypeErr(given=env, expected_type=Env)
    # Don't restrain the policy type; it can be any callable
    if not isinstance(eval, bool):
        raise pyrado.TypeErr(given=eval, expected_type=bool)
    # The max_steps argument is checked by the environment's setter
    if not (isinstance(reset_kwargs, dict) or reset_kwargs is None):
        raise pyrado.TypeErr(given=reset_kwargs, expected_type=dict)
    if not isinstance(render_mode, RenderMode):
        raise pyrado.TypeErr(given=render_mode, expected_type=RenderMode)

    # Initialize the paths
    obs_hist = []
    act_hist = []
    act_app_hist = []
    rew_hist = []
    state_hist = []
    env_info_hist = []
    t_hist = []
    if isinstance(policy, Policy):
        if policy.is_recurrent:
            hidden_hist = []
        # If an ExplStrat is passed use the policy property, if a Policy is passed use it directly
        if isinstance(getattr(policy, "policy", policy), PotentialBasedPolicy):
            pot_hist = []
            stim_ext_hist = []
            stim_int_hist = []
        elif isinstance(getattr(policy, "policy", policy), TwoHeadedPolicy):
            head_2_hist = []
    if record_dts:
        dt_policy_hist = []
        dt_step_hist = []
        dt_remainder_hist = []

    # Override the number of steps to execute
    if max_steps is not None:
        env.max_steps = max_steps

    # Set all rngs' seeds (call before resetting)
    if seed is not None:
        pyrado.set_seed(seed, sub_seed, sub_sub_seed)

    # Reset the environment and pass the kwargs
    if reset_kwargs is None:
        reset_kwargs = dict()
    obs = np.zeros(env.obs_space.shape) if no_reset else env.reset(**reset_kwargs)

    # Set up rollout information
    rollout_info = dict(env_name=env.name, env_spec=env.spec)
    if isinstance(inner_env(env), SimEnv):
        rollout_info["domain_param"] = env.domain_param

    if isinstance(policy, Policy):
        # Reset the policy, i.e. the exploration strategy in case of step-based exploration.
        # In case the environment is a simulation, the current domain parameters are passed to the policy.
        # This allows the policy to update its internal model, e.g. for the energy-based swing-up controllers.
        if isinstance(env, SimEnv):
            policy.reset(domain_param=env.domain_param)
        else:
            policy.reset()

        # Set dropout and batch normalization layers to the right mode
        if eval:
            policy.eval()
        else:
            policy.train()

        # Check for a recurrent policy, which requires initializing the hidden state
        if policy.is_recurrent:
            hidden = policy.init_hidden()

    # Initialize animation
    env.render(render_mode, render_step=1)

    # Initialize the main loop variables
    done = False
    t = 0.0  # time starts at zero
    t_hist.append(t)
    if record_dts:
        t_post_step = time.time()  # first sample of remainder is useless

    # ----------
    # Begin loop
    # ----------

    # Terminate if the environment signals done; the loop also keeps track of the time
    while not (done and stop_on_done) and env.curr_step < env.max_steps:
        # Record step start time
        if record_dts or render_mode.video:
            t_start = time.time()  # dual purpose
        if record_dts:
            dt_remainder = t_start - t_post_step

        # Check observations
        if np.isnan(obs).any():
            env.render(render_mode, render_step=1)
            raise pyrado.ValueErr(
                msg=f"At least one observation value is NaN!"
                + tabulate(
                    [list(env.obs_space.labels), [*color_validity(obs, np.invert(np.isnan(obs)))]], headers="firstrow"
                )
            )

        # Get the agent's action
        obs_to = to.from_numpy(obs).type(to.get_default_dtype())  # policy operates on PyTorch tensors
        with to.no_grad():
            if isinstance(policy, Policy):
                if policy.is_recurrent:
                    if isinstance(getattr(policy, "policy", policy), TwoHeadedPolicy):
                        act_to, head_2_to, hidden_next = policy(obs_to, hidden)
                    else:
                        act_to, hidden_next = policy(obs_to, hidden)
                else:
                    if isinstance(getattr(policy, "policy", policy), TwoHeadedPolicy):
                        act_to, head_2_to = policy(obs_to)
                    else:
                        act_to = policy(obs_to)
            else:
                # If the policy is not of type Policy, it should still operate on PyTorch tensors
                act_to = policy(obs_to)

        act = act_to.detach().cpu().numpy()  # environment operates on numpy arrays

        # Check actions
        if np.isnan(act).any():
            env.render(render_mode, render_step=1)
            raise pyrado.ValueErr(
                msg=f"At least one action value is NaN!"
                + tabulate(
                    [list(env.act_space.labels), [*color_validity(act, np.invert(np.isnan(act)))]], headers="firstrow"
                )
            )

        # Record time after the action was calculated
        if record_dts:
            t_post_policy = time.time()

        # Ask the environment to perform the simulation step
        state = env.state.copy()
        obs_next, rew, done, env_info = env.step(act)

        # Get the potentially clipped action, i.e. the one that was actually applied in the environment
        act_app = env.limit_act(act)

        # Record time after the step, i.e. after the send and receive is completed
        if record_dts:
            t_post_step = time.time()
            dt_policy = t_post_policy - t_start
            dt_step = t_post_step - t_post_policy

        # Record data
        obs_hist.append(obs)
        act_hist.append(act)
        act_app_hist.append(act_app)
        rew_hist.append(rew)
        state_hist.append(state)
        env_info_hist.append(env_info)
        if record_dts:
            dt_policy_hist.append(dt_policy)
            dt_step_hist.append(dt_step)
            dt_remainder_hist.append(dt_remainder)
            t += dt_policy + dt_step + dt_remainder
        else:
            t += env.dt
        t_hist.append(t)
        if isinstance(policy, Policy):
            if policy.is_recurrent:
                hidden_hist.append(hidden)
                hidden = hidden_next
            # If an ExplStrat is passed use the policy property, if a Policy is passed use it directly
            if isinstance(getattr(policy, "policy", policy), PotentialBasedPolicy):
                pot_hist.append(hidden)
                stim_ext_hist.append(getattr(policy, "policy", policy).stimuli_external.detach().cpu().numpy())
                stim_int_hist.append(getattr(policy, "policy", policy).stimuli_internal.detach().cpu().numpy())
            elif isinstance(getattr(policy, "policy", policy), TwoHeadedPolicy):
                head_2_hist.append(head_2_to)

        # Store the observation for the next step (if done, this is the final observation)
        obs = obs_next

        # Render if wanted (actually renders the next state)
        env.render(render_mode, render_step)
        if render_mode.video:
            do_sleep = True
            if pyrado.mujoco_loaded:
                from pyrado.environments.mujoco.base import MujocoSimEnv

                if isinstance(env, MujocoSimEnv):
                    # MuJoCo environments seem to crash on time.sleep()
                    do_sleep = False
            if do_sleep:
                # Measure the time spent and sleep if needed
                t_end = time.time()
                t_sleep = env.dt + t_start - t_end
                if t_sleep > 0:
                    time.sleep(t_sleep)

    # --------
    # End loop
    # --------

    if not no_close:
        # Disconnect from EnvReal instance (does nothing for EnvSim instances)
        env.close()

    # Add the final observation to the observations list
    obs_hist.append(obs)
    state_hist.append(env.state.copy())

    # Return result object
    res = StepSequence(
        observations=obs_hist,
        actions=act_hist,
        actions_applied=act_app_hist,
        rewards=rew_hist,
        states=state_hist,
        time=t_hist,
        rollout_info=rollout_info,
        env_infos=env_info_hist,
        complete=True,  # the rollout function always returns complete paths
    )

    # Add special entries to the resulting rollout
    if isinstance(policy, Policy):
        if policy.is_recurrent:
            res.add_data("hidden_states", hidden_hist)
        if isinstance(getattr(policy, "policy", policy), PotentialBasedPolicy):
            res.add_data("potentials", pot_hist)
            res.add_data("stimuli_external", stim_ext_hist)
            res.add_data("stimuli_internal", stim_int_hist)
        elif isinstance(getattr(policy, "policy", policy), TwoHeadedPolicy):
            res.add_data("head_2", head_2_hist)
    if record_dts:
        res.add_data("dts_policy", dt_policy_hist)
        res.add_data("dts_step", dt_step_hist)
        res.add_data("dts_remainder", dt_remainder_hist)

    return res
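A compact usage sketch of `rollout`, consistent with the evaluation calls seen in the other snippets of this section (`env` and `policy` are assumed to exist):

# Deterministic evaluation rollout with a fixed seed and default (no) rendering
ro = rollout(env, policy, eval=True, seed=0, render_mode=RenderMode())
print(f"undiscounted return: {ro.undiscounted_return()}, length: {ro.length}")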
import torch as to
from botorch.acquisition import ExpectedImprovement, ProbabilityOfImprovement, UpperConfidenceBound
from botorch.fit import fit_gpytorch_model
from botorch.models import SingleTaskGP
from botorch.optim import optimize_acqf
from gpytorch.mlls import ExactMarginalLogLikelihood
from matplotlib import pyplot as plt
from tqdm import tqdm

from pyrado import set_seed
from pyrado.utils.functions import noisy_nonlin_fcn
from pyrado.utils.math import UnitCubeProjector


if __name__ == "__main__":
    # Adjustable experiment parameters
    set_seed(1001)
    num_init_samples = 4  # number of initial random points
    num_iter = 6  # number of BO updates
    noise_std = 0.0  # noise level
    acq_fcn = "EI"  # acquisition function (UCB / EI / PI)
    num_acq_restarts = 100  # number of restarts for optimizing the acquisition function
    num_acq_samples = 500  # number of samples used for optimizing the acquisition function
    ucb_beta = 0.1  # UCB coefficient (only used if UCB is selected)

    # Function boundaries
    x_min_raw, x_max_raw = (-2.0, 5.0)
    x_min, x_max = (0.0, 1.0)
    bounds_raw = to.tensor([[x_min_raw], [x_max_raw]])
    bounds = to.tensor([[x_min], [x_max]])
    uc = UnitCubeProjector(bounds_raw[0, :], bounds_raw[1, :])
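The fragment stops before the acquisition function is instantiated; a plausible continuation that maps the `acq_fcn` flag onto the imported BoTorch classes might look like this (the fitted model `gp` and the observed targets `y_observed` are assumptions, not shown above):

    # Sketch: select the acquisition function (gp is a fitted SingleTaskGP, y_observed the targets gathered so far)
    if acq_fcn == "UCB":
        acq = UpperConfidenceBound(gp, beta=ucb_beta)
    elif acq_fcn == "EI":
        acq = ExpectedImprovement(gp, best_f=y_observed.max())
    elif acq_fcn == "PI":
        acq = ProbabilityOfImprovement(gp, best_f=y_observed.max())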
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Load the data
    data_set_name = 'oscillation_50Hz_initpos-0.5'
    data = pd.read_csv(osp.join(pyrado.PERMA_DIR, 'time_series', f'{data_set_name}.csv'))
    if data_set_name == 'daily_min_temperatures':
        data = to.tensor(data['Temp'].values, dtype=to.get_default_dtype()).view(-1, 1)
    elif data_set_name == 'monthly_sunspots':
        data = to.tensor(data['Sunspots'].values, dtype=to.get_default_dtype()).view(-1, 1)
    elif 'oscillation' in data_set_name:
        data = to.tensor(data['Positions'].values, dtype=to.get_default_dtype()).view(-1, 1)
    else:
        raise pyrado.ValueErr(
            given=data_set_name,
            eq_constraint="'daily_min_temperatures', 'monthly_sunspots', "
                          "'oscillation_50Hz_initpos-0.5', or 'oscillation_100Hz_initpos-0.4'")

    # Dataset
    data_set_hparam = dict(
        name=data_set_name,
        ratio_train=0.7,
        window_size=trial.suggest_int('dataset_window_size', 1, 100),
        standardize_data=False,
        scale_min_max_data=True
    )
    dataset = TimeSeriesDataSet(data, **data_set_hparam)

    # Policy
    policy_hparam = dict(
        dt=0.02 if 'oscillation' in data_set_name else 1.,
        hidden_size=trial.suggest_int('policy_hidden_size', 2, 51),
        obs_layer=None,
        activation_nonlin=fcn_from_str(
            trial.suggest_categorical('policy_activation_nonlin', ['to_tanh', 'to_sigmoid'])),
        mirrored_conv_weights=trial.suggest_categorical('policy_mirrored_conv_weights', [True, False]),
        conv_out_channels=1,
        conv_kernel_size=None,
        conv_padding_mode=trial.suggest_categorical('policy_conv_padding_mode', ['zeros', 'circular']),
        tau_init=trial.suggest_loguniform('policy_tau_init', 1e-2, 1e3),
        tau_learnable=True,
        kappa_init=trial.suggest_categorical('policy_kappa_init', [0, 1e-4, 1e-2]),
        kappa_learnable=True,
        potential_init_learnable=trial.suggest_categorical('policy_potential_init_learnable', [True, False]),
        init_param_kwargs=trial.suggest_categorical('policy_init_param_kwargs', [None, dict(bell=True)]),
        use_cuda=False
    )
    policy = NFPolicy(spec=EnvSpec(act_space=InfBoxSpace(shape=1), obs_space=InfBoxSpace(shape=1)), **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        windowed=trial.suggest_categorical('algo_windowed', [True, False]),
        max_iter=1000,
        optim_class=optim.Adam,
        optim_hparam=dict(
            lr=trial.suggest_uniform('optim_lr', 5e-4, 5e-2),
            eps=trial.suggest_uniform('optim_eps', 1e-8, 1e-5),
            weight_decay=trial.suggest_uniform('optim_weight_decay', 5e-5, 5e-3)
        ),
        loss_fcn=nn.MSELoss(),
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f'trial_{trial.number}'))
    algo = TSPred(study_dir, dataset, policy, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    num_init_samples = dataset.window_size
    _, loss_trn = TSPred.evaluate(policy, dataset.data_trn_inp, dataset.data_trn_targ, windowed=algo.windowed,
                                  num_init_samples=num_init_samples, cascaded=False)
    _, loss_tst = TSPred.evaluate(policy, dataset.data_tst_inp, dataset.data_tst_targ, windowed=algo.windowed,
                                  num_init_samples=num_init_samples, cascaded=False)

    return loss_trn
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: [int, None]):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env = QBallBalancerSim(dt=1/250., max_steps=1500)
    env = ActNormWrapper(env)

    # Policy
    policy = FNNPolicy(
        spec=env.spec,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_policy', [[16, 16], [32, 32], [64, 64]]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_policy', ['to_tanh', 'to_relu'])),
    )

    # Critic
    value_fcn = FNN(
        input_size=env.obs_space.flat_dim,
        output_size=1,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_critic', [[16, 16], [32, 32], [64, 64]]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])),
    )
    critic_hparam = dict(
        gamma=trial.suggest_uniform('gamma_critic', 0.99, 1.),
        lamda=trial.suggest_uniform('lamda_critic', 0.95, 1.),
        num_epoch=trial.suggest_int('num_epoch_critic', 1, 10),
        batch_size=100,
        lr=trial.suggest_loguniform('lr_critic', 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical('standardize_adv_critic', [True, False]),
        # max_grad_norm=5.,
        # lr_scheduler=scheduler.StepLR,
        # lr_scheduler_hparam=dict(step_size=10, gamma=0.9)
        # lr_scheduler=scheduler.ExponentialLR,
        # lr_scheduler_hparam=dict(gamma=0.99)
    )
    critic = GAE(value_fcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_sampler_envs=1,  # parallelize via optuna n_jobs
        max_iter=500,
        min_steps=25*env.max_steps,
        num_epoch=trial.suggest_int('num_epoch_algo', 1, 10),
        eps_clip=trial.suggest_uniform('eps_clip_algo', 0.05, 0.2),
        batch_size=100,
        std_init=0.9,
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
        # max_grad_norm=5.,
        # lr_scheduler=scheduler.StepLR,
        # lr_scheduler_hparam=dict(step_size=10, gamma=0.9)
        # lr_scheduler=scheduler.ExponentialLR,
        # lr_scheduler_hparam=dict(gamma=0.99)
    )
    algo = PPO(osp.join(ex_dir, f'trial_{trial.number}'), env, policy, critic, **algo_hparam)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelSampler(env, policy, num_envs=20, min_rollouts=min_rollouts)
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros])/min_rollouts

    return mean_ret
""" First, we create an `Experiment`, which basically is a folder (by default in `Pyrado/data/temp`). The experiments are stored using the following scheme: <base_dir>/<env_name>/<algo_name>/<timestamp>--<extra_info>. This rule is only required for the automatic search for experiments (e.g. used in `sim_policy()`). This search function requires the individual experiment folders to start with `date_time`. Aside from this, you can name your experiments and folders however you like. Use the `load_experiment()` function to later oad your results. It will look for an environment as well as a policy file in the provided path. """ ex_dir = setup_experiment(BallOnBeamSim.name, f"{HCNormal.name}_{LinearPolicy.name}", "ident-sin") """ Additionally, you can set a seed for the random number generators. It is suggested to do so, if you want to compare changes of certain hyper-parameters to eliminate the effect of the initial state and the initial policy parameters (both are sampled randomly in most cases). """ pyrado.set_seed(seed=0, verbose=True) """ Set up the environment a.k.a. domain to train in. After creating the environment, you can apply various wrappers which are modular. Note that the order of wrappers might be of importance. For example, wrapping an environment with an `ObsNormWrapper` and then with an `GaussianObsNoiseWrapper` applies the noise on the normalized observations, and yields different results than the reverse order of wrapping. Environments in Pyrado can be of different types: (i) written in Python only (like the Quanser simulations or simple OpenAI Gym environments), (ii) wrapped as well as self-designed MuJoCo-based simulations, or (iii) self-designed robotic environments powered by Rcs using either the Bullet or Vortex physics engine. None of the simulations includes any computer vision aspects. It is all about dynamics-based interaction and (continuous) control. The degree of randomization for the environments varies strongly, since it is a lot of work to randomize them properly (including testing) and I have to graduate after all ;) """ env_hparams = dict(dt=1 / 50.0, max_steps=300) env = BallOnBeamSim(**env_hparams) env = ActNormWrapper(env)
f"{SysIdViaEpisodicRL.name}-{CEM.name}") subrtn, subrtn_hparam = create_cem_subrtn(ex_dir, env_sim, ddp_policy) elif args.mode == REPS.name: ex_dir = setup_experiment(env_real.name, f"{SysIdViaEpisodicRL.name}-{REPS.name}") subrtn, subrtn_hparam = create_reps_subrtn(ex_dir, env_sim, ddp_policy) elif args.mode == NES.name: ex_dir = setup_experiment(env_real.name, f"{SysIdViaEpisodicRL.name}-{NES.name}") subrtn, subrtn_hparam = create_nes_subrtn(ex_dir, env_sim, ddp_policy) else: raise NotImplementedError( "Select mode cem, reps, or nes via the command line argument -m") # Set the seed pyrado.set_seed(1001, verbose=True) # Set the hyper-parameters of SysIdViaEpisodicRL num_eval_rollouts = 5 algo_hparam = dict( metric=None, std_obs_filt=5, obs_dim_weight=[1, 1, 1, 1, 10, 10], num_rollouts_per_distr=len(dp_map) * 10, # former 50 num_workers=subrtn_hparam["num_workers"], ) # Save the environments and the hyper-parameters save_dicts_to_yaml( dict(env=env_hparams), dict(subrtn=subrtn_hparam, subrtn_name=subrtn.name),
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(physicsEngine="Bullet", dt=1 / 100.0, max_steps=500)
    env = BallOnPlate2DSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        shared_hidden_sizes=trial.suggest_categorical(
            "shared_hidden_sizes_policy", [(16, 16), (32, 32), (64, 64), (16, 16, 16), (32, 32, 32)]),
        shared_hidden_nonlin=fcn_from_str(
            trial.suggest_categorical("shared_hidden_nonlin_policy", ["to_tanh", "to_relu"])),
    )
    policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam)

    # Critic
    qfcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical(
            "hidden_sizes_critic", [(16, 16), (32, 32), (64, 64), (16, 16, 16), (32, 32, 32)]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical("hidden_nonlin_critic", ["to_tanh", "to_relu"])),
    )
    obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
    qfcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **qfcn_hparam)
    qfcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **qfcn_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=100 * env.max_steps,
        min_steps=trial.suggest_categorical("min_steps_algo", [1]),  # 10, env.max_steps, 10*env.max_steps
        memory_size=trial.suggest_loguniform("memory_size_algo", 1e2 * env.max_steps, 1e4 * env.max_steps),
        tau=trial.suggest_uniform("tau_algo", 0.99, 1.0),
        ent_coeff_init=trial.suggest_uniform("ent_coeff_init_algo", 0.1, 0.9),
        learn_ent_coeff=trial.suggest_categorical("learn_ent_coeff_algo", [True, False]),
        standardize_rew=trial.suggest_categorical("standardize_rew_algo", [False]),
        gamma=trial.suggest_uniform("gamma_algo", 0.99, 1.0),
        target_update_intvl=trial.suggest_categorical("target_update_intvl_algo", [1, 5]),
        num_updates_per_step=trial.suggest_categorical("num_batch_updates_algo", [1, 5]),
        batch_size=trial.suggest_categorical("batch_size_algo", [128, 256, 512]),
        lr=trial.suggest_loguniform("lr_algo", 1e-5, 1e-3),
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f"trial_{trial.number}"))
    algo = SAC(study_dir, env, policy, qfcn_1, qfcn_2, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode="latest", seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env, policy, num_workers=1, min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
parser.add_argument("--frequency", default=250, type=int) parser.set_defaults(max_steps=600) parser.add_argument("--ppo_iterations", default=150, type=int) parser.add_argument("--sprl_iterations", default=50, type=int) parser.add_argument("--cov_only", action="store_true") args = parser.parse_args() # Experiment (set seed before creating the modules) ex_dir = setup_experiment( QQubeSwingUpSim.name, f"{PPO.name}_{FNNPolicy.name}", f"{args.frequency}Hz_{args.max_steps}ROLen_{args.ppo_iterations}PPOIter_{args.sprl_iterations}SPRLIter_cov_only{args.cov_only}_seed_{args.seed}", ) # Set seed if desired pyrado.set_seed(args.seed, verbose=True) # Environment env_hparams = dict(dt=1 / float(args.frequency), max_steps=args.max_steps) env = QQubeSwingUpSim(**env_hparams) env = ActNormWrapper(env) # Policy policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh) # FNN # policy_hparam = dict(hidden_size=32, num_recurrent_layers=1) # LSTM & GRU policy = FNNPolicy(spec=env.spec, **policy_hparam) # policy = GRUPolicy(spec=env.spec, **policy_hparam) # Critic vfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.relu) # FNN # vfcn_hparam = dict(hidden_size=32, num_recurrent_layers=1) # LSTM & GRU
args = get_argparser().parse_args()

# Set up the example
ex_dir = osp.join(pyrado.EVAL_DIR, 'illustrative_example')
env = CatapultExample(m=1., g_M=3.71, k_M=1000., x_M=0.5, g_V=8.87, k_V=3000., x_V=1.5)
psi = 0.7  # true probability of drawing Venus
S = 100  # 100
N = 30  # 30
noise_th_scale = 0.15  # 0.15
set_seed(args.seed)
fig_size = tuple([0.75 * x for x in pyrado.figsize_thesis_1percol_18to10])

th_true_opt = env.opt_policy_param(1 - psi, psi)  # true probabilities instead of counts
J_true_opt = env.opt_est_expec_return(1 - psi, psi)  # true probabilities instead of counts
print(f'th_true_opt: {th_true_opt}')
print(f'J_true_opt: {J_true_opt}\n')

# Initialize containers
n_M_hist = np.empty((S, N))
n_V_hist = np.empty((S, N))
th_n_opt_hist = np.empty((S, N))
th_c_hist = np.empty((S, N))
Jhat_th_n_opt_hist = np.empty((S, N))