def __init__(self, wrapped_task: Task, mode: FinalRewMode, factor: float = 1e3):
    """
    Constructor

    :param wrapped_task: task to wrap
    :param mode: mode for calculating the final reward
    :param factor: value to scale the final reward, has no effect if `mode.time_dependent is True`
    """
    # Call TaskWrapper's constructor
    super().__init__(wrapped_task)

    if not isinstance(mode, FinalRewMode):
        raise pyrado.TypeErr(given=mode, expected_type=FinalRewMode)
    if mode.user_input and (
        mode.always_positive or mode.always_negative or mode.state_dependent or mode.time_dependent
    ):
        print_cbt('If user_input is True, all other specifications in FinalRewMode are ignored.', 'w')

    self.mode = mode
    self.factor = factor
    self._yielded_final_rew = False
def eval_damping():
    """ Plot joint trajectories for different joint damping parameters """
    # Load experiment and remove possible randomization wrappers
    ex_dir = ask_for_experiment()
    env, policy, _ = load_experiment(ex_dir)
    env = inner_env(env)
    env.domain_param = WAMBallInCupSim.get_nominal_domain_param()

    data = []
    t = []
    dampings = [0., 1e-2, 1e-1, 1e0]
    print_cbt(f'Run policy for damping coefficients: {dampings}')
    for d in dampings:
        env.reset(domain_param=dict(joint_damping=d))
        ro = rollout(env, policy, render_mode=RenderMode(video=False), eval=True)
        t.append(ro.env_infos['t'])
        data.append(ro.env_infos['qpos'])

    fig, ax = plt.subplots(3, sharex='all')
    ls = ['k-', 'b--', 'g-.', 'r:']  # line style setting for better visibility
    for i, idx in enumerate([1, 3, 5]):
        for j in range(len(dampings)):
            ax[i].plot(t[j], data[j][:, idx], ls[j], label=f'damping: {dampings[j]}')
        if i == 0:
            ax[i].legend()
        ax[i].set_ylabel(f'joint {idx} pos [rad]')
    ax[2].set_xlabel('time [s]')
    plt.suptitle('Evaluation of joint damping coefficient')
    plt.show()
def check_ball_collisions(self, verbose: bool = False) -> bool:
    """
    Check if an undesired collision with the ball occurs.

    :param verbose: print messages on collision
    :return: `True` if the ball collides with something other than the central parts of the cup
    """
    for i in range(self.sim.data.ncon):
        # Get current contact object
        contact = self.sim.data.contact[i]

        # Extract body-id and body-name of both contact geoms
        body1 = self.model.geom_bodyid[contact.geom1]
        body1_name = self.model.body_names[body1]
        body2 = self.model.geom_bodyid[contact.geom2]
        body2_name = self.model.body_names[body2]

        # Evaluate if the ball collides with part of the WAM (collision bodies)
        # or the connection of WAM and cup (geom ids)
        c1 = body1_name == 'ball' and (body2_name in self._collision_bodies or
                                       contact.geom2 in self._collision_geom_ids)
        c2 = body2_name == 'ball' and (body1_name in self._collision_bodies or
                                       contact.geom1 in self._collision_geom_ids)

        if c1 or c2:
            if verbose:
                print_cbt(f'Undesired collision of {body1_name} and {body2_name} detected!', 'y')
            return True

    return False
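# Minimal, self-contained sketch of the same mujoco_py contact-inspection pattern
# (it is also used by check_ball_in_cup() below). The toy XML model here is made
# up for the demo, it is not the WAM model; requires mujoco_py to be installed.
import mujoco_py

_DEMO_XML = """
<mujoco>
  <worldbody>
    <geom name="floor" type="plane" size="1 1 0.1"/>
    <body name="ball" pos="0 0 1">
      <joint type="free"/>
      <geom type="sphere" size="0.05"/>
    </body>
  </worldbody>
</mujoco>
"""

def _demo_contact_inspection():
    sim = mujoco_py.MjSim(mujoco_py.load_model_from_xml(_DEMO_XML))
    for _ in range(500):  # let the ball fall onto the floor
        sim.step()
    for i in range(sim.data.ncon):
        con = sim.data.contact[i]
        # Map both contact geoms to the names of the bodies they belong to
        body1_name = sim.model.body_names[sim.model.geom_bodyid[con.geom1]]
        body2_name = sim.model.body_names[sim.model.geom_bodyid[con.geom2]]
        print(f'contact: {body1_name} <-> {body2_name}')  # e.g. world <-> ball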
def contains(self, cand: np.ndarray, verbose: bool = False) -> bool:
    """
    Check if a candidate is an element of at least one of the subspaces.

    :param cand: candidate to check
    :param verbose: print a message and re-check every subspace if the candidate is not contained
    :return: `True` if at least one subspace contains the candidate
    """
    valid = any(s.contains(cand) for s in self._spaces)
    if not valid and verbose:
        print_cbt(f'Violated all of the {len(self._spaces)} subspaces!', 'r')
        for s in self._spaces:
            s.contains(cand, verbose)
    return valid
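# Self-contained sketch of the any-subspace check above, using a toy 1D space
# stand-in (IntervalSpace is made up for this demo, not a pyrado class).
import numpy as np

class IntervalSpace:
    def __init__(self, lo: float, up: float):
        self.lo, self.up = lo, up

    def contains(self, cand: np.ndarray, verbose: bool = False) -> bool:
        inside = bool(self.lo <= cand[0] <= self.up)
        if verbose and not inside:
            print(f'{cand[0]} is outside [{self.lo}, {self.up}]')
        return inside

subspaces = [IntervalSpace(-2., -1.), IntervalSpace(1., 2.)]
print(any(s.contains(np.array([1.5])) for s in subspaces))  # True, second subspace
print(any(s.contains(np.array([0.0])) for s in subspaces))  # False, violates all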
def _keys_in_columns(df: DataFrame, *cands, verbose: bool) -> bool:
    """
    Check if all candidate keys are column names of a data frame.

    :param df: data frame to check
    :param cands: candidate keys, e.g. names of columns to plot
    :param verbose: print a message if a candidate is missing
    :return: `True` if all candidates are columns of the data frame
    """
    if any(c not in df.columns for c in cands):
        if verbose:
            print_cbt(f"Did not find {list(cands)} in the data frame. Skipped the associated plot.")
        return False
    else:
        return True
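# Quick sanity check of the helper above (requires pandas; the column names are
# illustrative).
import pandas as pd

df = pd.DataFrame({"t": [0.0, 0.01, 0.02], "qpos_0": [0.1, 0.2, 0.3]})
print(_keys_in_columns(df, "t", "qpos_0", verbose=True))  # True
print(_keys_in_columns(df, "qvel_0", verbose=True))  # False, and a message is printed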
def check_ball_in_cup(self, *args, verbose: bool = False) -> bool:
    """
    Check if the ball is in the cup.

    :param verbose: print messages when the ball is in the cup
    :return: `True` if the ball is in the cup
    """
    # Look up the geom id of the cup's inner part once, outside the contact loop
    cup_inner_id = self.model._geom_name2id['cup_inner']

    for i in range(self.sim.data.ncon):
        # Get current contact object
        contact = self.sim.data.contact[i]

        # Extract body-id and body-name of both contact geoms
        body1 = self.model.geom_bodyid[contact.geom1]
        body1_name = self.model.body_names[body1]
        body2 = self.model.geom_bodyid[contact.geom2]
        body2_name = self.model.body_names[body2]

        # Evaluate if the ball is in contact with the inner part of the cup
        c1 = body1_name == 'ball' and contact.geom2 == cup_inner_id
        c2 = body2_name == 'ball' and contact.geom1 == cup_inner_id

        if c1 or c2:
            if verbose:
                print_cbt(f'The ball is in the cup at time step {self.curr_step}.', 'y')
            return True

    return False
def adapt(self, domain_distr_param: str, domain_distr_param_value: to.Tensor):
    """
    Update this domain parameter.

    :param domain_distr_param: distribution parameter to update, e.g. mean or std
    :param domain_distr_param_value: new value of the distribution parameter
    """
    # Set the attributes
    super().adapt(domain_distr_param, domain_distr_param_value)

    # Re-create the distributions, otherwise the changes will have no effect
    if domain_distr_param in ["target_mean", "target_cov_chol_flat"]:
        try:
            self._target_distr = MultivariateNormal(self.target_mean, self.target_cov, validate_args=True)
        except ValueError as err:
            print_cbt(
                f"Inputs that lead to the ValueError from PyTorch Distributions:\n"
                f"target_distribution; domain_distr_param = {domain_distr_param}\n"
                f"loc = {self.target_mean}\ncov = {self.target_cov}"
            )
            raise err
    if domain_distr_param in ["context_mean", "context_cov_chol_flat"]:
        try:
            self._context_distr = MultivariateNormal(self.context_mean, self.context_cov, validate_args=True)
        except ValueError as err:
            print_cbt(
                f"Inputs that lead to the ValueError from PyTorch Distributions:\n"
                f"init_distribution; domain_distr_param = {domain_distr_param}\n"
                f"loc = {self.context_mean}\ncov = {self.context_cov}"
            )
            raise err
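# Why the distributions must be re-created: a torch MultivariateNormal factorizes
# the covariance into a Cholesky factor at construction time, so later changes to
# the underlying tensors do not propagate. Plain-PyTorch demonstration:
import torch as to
from torch.distributions import MultivariateNormal

cov = to.eye(2)
distr = MultivariateNormal(to.zeros(2), cov, validate_args=True)
cov.mul_(4.0)  # change the covariance in place ...
print(distr.covariance_matrix)  # ... the distribution still uses the identity

distr = MultivariateNormal(to.zeros(2), cov, validate_args=True)  # re-create, as in adapt()
print(distr.covariance_matrix)  # now 4 * identity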
def reset(self, init_state: np.ndarray = None, domain_param: dict = None) -> np.ndarray:
    # Create robcom GoTo process
    gt = self._client.create(robcom.Goto, 'RIGHT_ARM', '')

    # Move to the initial state within 5 seconds
    gt.add_step(5., self.init_pose_des)
    print_cbt('Moving the Barrett WAM to the initial position.', 'c', bright=True)

    # Start the process and wait for completion
    gt.start()
    gt.wait_for_completion()
    print_cbt('Reached the initial position.', 'c')

    # Reset the task which also resets the reward function if necessary
    self._task.reset(env_spec=self.spec)

    input('Hit enter to continue.')

    # Reset the time steps
    self._curr_step = 0
    self.state = np.array([self._curr_step / self.max_steps])

    return self.observe(self.state)
def calibrate(self):
    if self._calibrated:
        return
    print_cbt("Calibrating the cart-pole ...", "c")

    # Go to the left
    with completion_context("Going to the left", color="c", bright=True):
        obs, _, _, _ = self.step(np.zeros(self.act_space.shape))
        ctrl = QCartPoleGoToLimCtrl(obs, positive=True)
        while not ctrl.done:
            act = ctrl(obs)
            obs, _, _, _ = self.step(act)
        if ctrl.success:
            self._norm_x_lim[1] = obs[0]
        else:
            raise RuntimeError("Going to the left limit failed!")

    # Go to the right
    with completion_context("Going to the right", color="c", bright=True):
        obs, _, _, _ = self.step(np.zeros(self.act_space.shape))
        ctrl = QCartPoleGoToLimCtrl(obs, positive=False)
        while not ctrl.done:
            act = ctrl(obs)
            obs, _, _, _ = self.step(act)
        if ctrl.success:
            self._norm_x_lim[0] = obs[0]
        else:
            raise RuntimeError("Going to the right limit failed!")

    # Activate the absolute cart position
    self._calibrated = True
def wrap_like_other_env(env_targ: Env, env_src: [SimEnv, EnvWrapper]) -> Env:
    """
    Wrap a given real environment like its simulated counterpart (except the domain randomization of course).

    :param env_targ: target environment e.g. environment representing the physical device
    :param env_src: source environment e.g. simulation environment used for training
    :return: target environment wrapped like the source environment
    """
    if env_src.dt > env_targ.dt:
        ds_factor = int(env_src.dt / env_targ.dt)
        env_targ = DownsamplingWrapper(env_targ, ds_factor)
        print_cbt(f'Wrapped the env with a DownsamplingWrapper of factor {ds_factor}.', 'c')

    if typed_env(env_src, ActNormWrapper) is not None:
        env_targ = ActNormWrapper(env_targ)
        print_cbt('Wrapped the env with an ActNormWrapper.', 'c')

    if typed_env(env_src, ObsNormWrapper) is not None:
        env_targ = ObsNormWrapper(env_targ)
        print_cbt('Wrapped the env with an ObsNormWrapper.', 'c')
    elif typed_env(env_src, ObsRunningNormWrapper) is not None:
        env_targ = ObsRunningNormWrapper(env_targ)
        print_cbt('Wrapped the env with an ObsRunningNormWrapper.', 'c')

    if typed_env(env_src, ObsPartialWrapper) is not None:
        env_targ = ObsPartialWrapper(env_targ, mask=typed_env(env_src, ObsPartialWrapper).keep_mask, keep_selected=True)
        print_cbt('Wrapped the env with an ObsPartialWrapper.', 'c')

    return env_targ
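# Worked example of the downsampling factor (the time steps are illustrative):
# a simulation running at dt = 0.01 s and a device running at dt = 0.002 s give
# ds_factor = 5, i.e. presumably one policy step spans 5 device steps.
dt_sim, dt_real = 0.01, 0.002
print(int(dt_sim / dt_real))  # 5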
def rollout_dummy_rbf_policy_4dof():
    # Environment
    env = WAMBallInCupSim(
        num_dof=4,
        max_steps=3000,
        # Note, when tuning the task args: the `R` matrices are now 4x4 for the 4-dof WAM
        task_args=dict(R=np.zeros((4, 4)), R_dev=np.diag([0.2, 0.2, 1e-2, 1e-2])),
    )

    # Stabilize the ball and print out the stable state
    env.reset()
    act = np.zeros(env.spec.act_space.flat_dim)
    for i in range(1500):
        env.step(act)
        env.render(mode=RenderMode(video=True))

    # Print out the actual positions for the 4-dof WAM (only needed to set up the hard-coded values in the class)
    print('Ball pos:', env.sim.data.get_body_xpos('ball'))
    print('Cup goal:', env.sim.data.get_site_xpos('cup_goal'))
    print('Joint pos (incl. first rope angle):', env.sim.data.qpos[:5])

    # Apply DualRBFLinearPolicy and plot the joint states over the desired ones
    rbf_hparam = dict(num_feat_per_dim=7, bounds=(np.array([0.]), np.array([1.])))
    policy = DualRBFLinearPolicy(env.spec, rbf_hparam, dim_mask=2)

    done, param = False, None
    while not done:
        ro = rollout(env, policy, render_mode=RenderMode(video=True), eval=True, reset_kwargs=dict(domain_param=param))
        print_cbt(f'Return: {ro.undiscounted_return()}', 'g', bright=True)
        done, _, param = after_rollout_query(env, policy, ro)
def __init__(self,
             num_feat_per_dim: int,
             bounds: [Sequence[np.ndarray], Sequence[to.Tensor], Sequence[float]],
             scale: float = None,
             state_wise_norm: bool = True):
    """
    Constructor

    :param num_feat_per_dim: number of radial basis functions, identical for every dimension of the input
    :param bounds: lower and upper bound for the Gaussians' centers, the input dimension is inferred from them
    :param scale: scaling factor for the squared distance, if `None` the factor is determined such that two
                  neighboring RBFs have a value of 0.2 at the other's center
    :param state_wise_norm: `True` to apply the normalization across input state dimensions separately (every
                            dimension sums to one), or `False` to jointly normalize them
    """
    if not num_feat_per_dim > 1:
        raise pyrado.ValueErr(given=num_feat_per_dim, g_constraint='1')
    if not len(bounds) == 2:
        raise pyrado.ShapeErr(given=bounds, expected_match=np.empty(2))

    # Get the bounds, e.g. from the observation space, and then clip them in case they are infinite
    bounds_to = [None, None]
    for i, b in enumerate(bounds):
        if isinstance(b, np.ndarray):
            bounds_to[i] = to.from_numpy(b)
        elif isinstance(b, to.Tensor):
            bounds_to[i] = b.clone()
        elif isinstance(b, (int, float)):
            bounds_to[i] = to.tensor(b, dtype=to.get_default_dtype()).view(1)
        else:
            raise pyrado.TypeErr(given=b, expected_type=[np.ndarray, to.Tensor, int, float])
    if any([any(np.isinf(b)) for b in bounds_to]):
        bound_lo, bound_up = [to.clamp(b, min=-1e6, max=1e6) for b in bounds_to]
        print_cbt('Clipped the bounds of the RBF centers to [-1e6, 1e6].', 'y')
    else:
        bound_lo, bound_up = bounds_to

    # Create a matrix with the center locations of the Gaussians
    num_dim = len(bound_lo)
    self.num_feat = num_feat_per_dim * num_dim
    self.centers = to.empty(num_feat_per_dim, num_dim)
    for i in range(num_dim):
        # Features along the columns
        self.centers[:, i] = to.linspace(bound_lo[i], bound_up[i], num_feat_per_dim)

    if scale is None:
        # Determine the scale such that two neighboring RBFs evaluate to 0.2 at each other's center
        delta_center = self.centers[1, :] - self.centers[0, :]
        self.scale = -to.log(to.tensor(0.2)) / to.pow(delta_center, 2)
    else:
        self.scale = scale

    self._state_wise_norm = state_wise_norm
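# Self-contained check of the default scale heuristic (plain PyTorch): with
# scale = -ln(0.2) / delta^2, an RBF evaluated at its neighbor's center, one grid
# spacing delta away, gives exp(-scale * delta^2) = 0.2 as promised above.
import torch as to

centers = to.linspace(-1.0, 1.0, 5)  # num_feat_per_dim = 5, bounds = (-1, 1)
delta = centers[1] - centers[0]
scale = -to.log(to.tensor(0.2)) / delta**2
print(to.exp(-scale * (centers[1] - centers[0])**2))  # tensor(0.2000)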
def __init__(self,
             save_dir: str,
             env: Env,
             policy: Policy,
             max_iter: int,
             num_rollouts: int,
             pop_size: [int, None] = None,
             num_workers: int = 4,
             logger: StepLogger = None):
    """
    Constructor

    :param save_dir: directory to save the snapshots i.e. the results in
    :param env: the environment in which the policy operates
    :param policy: policy to be updated
    :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
    :param num_rollouts: number of rollouts per policy parameter set
    :param pop_size: number of solutions in the population, pass `None` to use a default that scales
                     logarithmically with the number of policy parameters
    :param num_workers: number of environments for parallel sampling
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    """
    if not isinstance(env, Env):
        raise pyrado.TypeErr(given=env, expected_type=Env)
    if not (isinstance(pop_size, int) or pop_size is None):
        raise pyrado.TypeErr(given=pop_size, expected_type=int)
    if isinstance(pop_size, int) and pop_size <= 0:
        raise pyrado.ValueErr(given=pop_size, g_constraint='0')

    # Call Algorithm's constructor
    super().__init__(save_dir, max_iter, policy, logger)

    # Store the inputs
    self._env = env
    self.num_rollouts = num_rollouts

    # Auto-select the population size if needed
    if pop_size is None:
        pop_size = 4 + int(3 * np.log(policy.num_param))
        print_cbt(f'Initialized population size to {pop_size}.', 'y')
    self.pop_size = pop_size

    # Create the sampler
    self.sampler = ParameterExplorationSampler(
        env,
        policy,
        num_workers=num_workers,
        num_rollouts_per_param=num_rollouts,
    )

    # Stopping criterion
    self.ret_avg_stack = 1e3 * np.random.randn(20)  # stack size = 20
    self.thold_ret_std = 1e-1  # algorithm terminates if below for multiple iterations

    # Save the best policy (this is not the mean for policy parameter exploration)
    self.best_policy_param = policy.param_values.clone()

    # Set this in subclasses
    self._expl_strat = None
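# The default population size is the rule of thumb known from CMA-ES,
# 4 + floor(3 * ln(n)) for n policy parameters; a worked example (the parameter
# count is illustrative):
import numpy as np

num_policy_params = 100
print(4 + int(3 * np.log(num_policy_params)))  # 17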
def close(self):
    """ Send one zero action and close the communication. """
    if self._qsoc.is_open():
        self.step(np.zeros(self.act_space.shape))
        self._qsoc.close()
        print_cbt('Closed the connection to the Quanser device.', 'c', bright=True)
def _compute_candidate(self, nc: int):
    """
    Train and save one candidate solution to a pt-file.

    :param nc: number of domains used for training the candidate solution
    """
    # Do a warm start if desired
    self._subrtn_cand.init_modules(
        self.warmstart_cand,
        prefix=f"iter_{self._curr_iter - 1}",
        suffix="cand",
        policy_param_init=self.cand_policy_param_init,
        valuefcn_param_init=self.cand_critic_param_init,
    )

    # Sample sets of physics params xi_{1}, ..., xi_{nc}
    self.env_dr.fill_buffer(nc)
    env_params_cand = self.env_dr.randomizer.get_params()
    joblib.dump(env_params_cand, osp.join(self.save_dir, f"iter_{self._curr_iter}_env_params_cand.pkl"))
    print("Randomized parameters for the candidate solution:")
    print_domain_params(env_params_cand)

    # Reset the subroutine algorithm which includes resetting the exploration
    self._cnt_samples += self._subrtn_cand.sample_count
    self._subrtn_cand.reset()
    print("Reset candidate exploration noise.")

    pol_param_before = self._subrtn_cand.policy.param_values.clone()
    if isinstance(self._subrtn_cand, ActorCritic):
        # Set dropout and batch normalization layers to training mode
        self._subrtn_cand.critic.vfcn.train()
        critic_param_before = self._subrtn_cand.critic.vfcn.param_values.clone()

    # Solve the (approx) stochastic program SP_nc for the sampled physics parameter sets
    print_cbt(f"\nIteration {self._curr_iter} | Candidate solution\n", "c", bright=True)
    self._subrtn_cand.train(snapshot_mode="best", meta_info=dict(prefix=f"iter_{self._curr_iter}", suffix="cand"))

    if (self._subrtn_cand.policy.param_values == pol_param_before).all():
        warn("The candidate's policy parameters did not change during training!", UserWarning)
    if isinstance(self._subrtn_cand, ActorCritic):  # check the candidate subroutine, matching the guard above
        if (self._subrtn_cand.critic.vfcn.param_values == critic_param_before).all():
            warn("The candidate's critic parameters did not change during training!", UserWarning)

    print_cbt("Learned an approx solution for SP_nc.\n", "y")
def load(name: str, load_dir: PathLike, prefix: str = "", suffix: str = "", obj=None, verbose: bool = False):
    """
    Load an object using a prefix or suffix, depending on the meta information.

    :param name: name of the object for loading including the file extension, e.g. 'policy.pt' for PyTorch modules
                 like a Pyrado `Policy` instance
    :param load_dir: directory to load from
    :param prefix: prefix for altering the name, e.g. 'iter_0_...'
    :param suffix: suffix for altering the name, e.g. '..._ref'
    :param obj: PyTorch module to load into, this can be `None` except for the case when the file stores a
                module's `state_dict`, which must be loaded into an existing module
    :param verbose: if `True`, print the path of what has been loaded

    .. seealso:: https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-model-for-inference
    """
    if not isinstance(name, str):
        raise TypeErr(given=name, expected_type=str)
    if not osp.isdir(load_dir):
        raise PathErr(given=load_dir)
    if not isinstance(prefix, str):
        raise TypeErr(given=prefix, expected_type=str)
    elif prefix != "":
        # A valid non-default prefix was given
        prefix = prefix + "_"
    if not isinstance(suffix, str):
        raise TypeErr(given=suffix, expected_type=str)
    elif suffix != "":
        # A valid non-default suffix was given
        suffix = "_" + suffix

    # Infer the file type
    file_ext = name[name.rfind(".") + 1:]
    if file_ext not in ["pt", "npy", "pkl"]:
        raise ValueErr(msg="Only pt, npy, and pkl files are currently supported!")

    # Load the data
    name_wo_file_ext = name[:name.rfind(".")]  # use rfind() to stay consistent with the extension extraction above
    name_load = f"{prefix}{name_wo_file_ext}{suffix}.{file_ext}"
    obj_ = _load_fcn(path=osp.join(load_dir, name_load), extension=file_ext)
    assert obj_ is not None

    if isinstance(obj_, dict) and file_ext == "pt":
        # PyTorch saves the state_dict as an OrderedDict
        obj.load_state_dict(obj_)
    else:
        obj = obj_

    if verbose:
        print_cbt(f"Loaded {osp.join(load_dir, name_load)}", "g")

    return obj
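# How prefix and suffix alter the file name, in isolation (the names are
# illustrative):
name, prefix, suffix = "policy.pt", "iter_0", "ref"
file_ext = name[name.rfind(".") + 1:]
name_wo_file_ext = name[:name.rfind(".")]
print(f"{prefix}_{name_wo_file_ext}_{suffix}.{file_ext}")  # iter_0_policy_ref.pt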
def plot_features(ro: StepSequence, policy: Policy):
    """
    Plot all features given the policy and the observation trajectories.

    :param ro: input rollout
    :param policy: linear policy used during the rollout
    """
    if not isinstance(policy, LinearPolicy):
        print_cbt('Plotting the feature values is only supported for linear policies!', 'y')
        return

    if hasattr(ro, 'observations'):
        # Use the recorded time stamps if possible
        t = ro.env_infos.get('t', np.arange(0, ro.length)) if hasattr(ro, 'env_infos') else np.arange(0, ro.length)

        # Recover the features from the observations
        feat_vals = policy.eval_feats(to.from_numpy(ro.observations))
        dim_feat = range(feat_vals.shape[1])
        if len(dim_feat) <= 6:
            divisor = 2
        elif len(dim_feat) <= 12:
            divisor = 4
        else:
            divisor = 8
        num_cols = int(np.ceil(len(dim_feat) / divisor))
        num_rows = int(np.ceil(len(dim_feat) / num_cols))

        fig, axs = plt.subplots(num_rows, num_cols, figsize=(num_cols * 5, num_rows * 3), constrained_layout=True)
        fig.suptitle('Feature values over time')
        plt.subplots_adjust(hspace=.5)
        colors = plt.get_cmap('tab20')(np.linspace(0, 1, len(dim_feat)))

        if len(dim_feat) == 1:
            axs.plot(t, feat_vals[:-1, dim_feat[0]], label=_get_obs_label(ro, dim_feat[0]))
            axs.legend()
        else:
            axs = np.asarray(axs).reshape(num_rows, num_cols)  # guard against 1-dim axes arrays
            for i in range(num_rows):
                for j in range(num_cols):
                    if j + i * num_cols < len(dim_feat):
                        # Omit the last observation for simplicity
                        axs[i, j].plot(t, feat_vals[:-1, j + i * num_cols],
                                       label=rf'$\phi_{{{j + i * num_cols}}}$',
                                       c=colors[j + i * num_cols])
                        axs[i, j].legend()
                    else:
                        # We might create more subplots than there are features
                        pass
        plt.show()
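# The subplot grid heuristic in isolation (the feature counts are illustrative):
import numpy as np

for n in (4, 10, 20):
    divisor = 2 if n <= 6 else (4 if n <= 12 else 8)
    num_cols = int(np.ceil(n / divisor))
    num_rows = int(np.ceil(n / num_cols))
    print(f'{n} features -> {num_rows} x {num_cols} grid')  # 2 x 2, 4 x 3, 7 x 3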
def _is_curr_task_done(self, state: np.ndarray, act: np.ndarray, remaining_steps: int, verbose: bool = False) -> float:
    """
    Check if the current task is done. If so, move to the next one and return the final reward of this task.

    :param state: current state
    :param act: current action
    :param remaining_steps: number of time steps left in the episode
    :param verbose: print messages on success or failure
    :return: final reward of the current subtask
    """
    if (not self.succeeded_tasks[self._idx_curr]
            and not self.failed_tasks[self._idx_curr]
            and self._tasks[self._idx_curr].is_done(state)):
        # Task has not been marked done yet, but is now done
        if self._tasks[self._idx_curr].has_succeeded(state):
            # Check off the successfully completed task
            self.succeeded_tasks[self._idx_curr] = True
            if verbose:
                print_cbt(f"Task {self._idx_curr} has succeeded (is done) at state {state}", "g")
        elif self._tasks[self._idx_curr].has_failed(state):
            # Check off the unsuccessfully completed task
            self.failed_tasks[self._idx_curr] = True
            if verbose:
                print_cbt(f"Task {self._idx_curr} has failed (is done) at state {state}", "r")
        else:
            raise pyrado.ValueErr(msg=f"Task {self._idx_curr} neither succeeded nor failed but is done!")

        # Memorize the current reward
        if self.hold_rew_when_done:
            self.held_rews[self._idx_curr] = self._tasks[self._idx_curr].step_rew(state, act, remaining_steps=0)

        # Give the final reward for completing the current subtask
        task_final_rew = self._tasks[self._idx_curr].final_rew(state, remaining_steps)

        # Advance to the next task
        self.idx_curr = (self._idx_curr + 1) % len(self)
    else:
        task_final_rew = 0.0

    return task_final_rew
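# The wrap-around advance at the end of the method, in isolation (three subtasks
# assumed for illustration): after the last subtask, the index starts over at 0
# instead of running out of bounds.
num_tasks = 3
idx_curr = 2  # the last subtask just finished
print((idx_curr + 1) % num_tasks)  # 0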