def __call__(self, dp_values: to.Tensor = None) -> Tuple[to.Tensor, StepSequence]:
    """
    Yield one rollout from the pre-recorded buffer of rollouts, and compute the features of the data used for sbi.

    :param dp_values: ignored, just here for the interface compatibility
    :return: features computed from the time series data, and the complete rollout
    """
    print_cbt_once(f"Using pre-recorded target domain rollouts from {self.rollouts_dir}", "g")

    # Get the pre-recorded rollout and advance the index
    if not isinstance(self.rollouts_rec, list):
        raise pyrado.TypeErr(given=self.rollouts_rec, expected_type=list)
    if not isinstance(self.rollouts_rec[0], StepSequence):
        raise pyrado.TypeErr(given=self.rollouts_rec[0], expected_type=StepSequence)
    ro = self.rollouts_rec[self._ring_idx]
    self._ring_idx = (self._ring_idx + 1) % self.num_rollouts

    # Pre-processing
    ro.torch()

    # Assemble the data
    data_real = to.cat([ro.states[:-1, :], ro.get_data_values(self._action_field)], dim=1)
    if self._embedding.requires_target_domain_data:
        data_real = to.cat([data_real, data_real], dim=1)

    # Compute the features
    data_real = data_real.unsqueeze(0)  # only one target domain rollout
    data_real = self._embedding(Embedding.pack(data_real))  # shape [1, dim_feat]

    # Check the shape (here no batching and always one rollout)
    if data_real.shape[0] != 1 or data_real.ndim != 2:
        raise pyrado.ShapeErr(given=data_real, expected_match=(1, -1))

    return data_real, ro
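# Illustrative sketch (not part of the library): the ring-buffer indexing above serves the pre-recorded
# rollouts one at a time and wraps around once all of them have been used. The names below
# (`num_rollouts`, `ring_idx`) are local to this example.
def _ring_buffer_demo():
    num_rollouts = 3
    ring_idx = 0
    served = []
    for _ in range(7):  # request more rollouts than are stored
        served.append(ring_idx)
        ring_idx = (ring_idx + 1) % num_rollouts
    assert served == [0, 1, 2, 0, 1, 2, 0]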
def test_print_cbt_once(color, bright):
    # Reset the flag for this test
    print_cbt_once.has_run = False

    msg = 'You should only read this once per color and brightness'
    for i in range(10):
        print_cbt_once(msg, color, bright, tag='tag', end='\n')
        if i > 0:
            assert print_cbt_once.has_run
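# Hedged sketch of the print-once behavior this test relies on; pyrado's actual `print_cbt_once` may be
# implemented differently (e.g. keyed on color and brightness). The idea is to store a flag as a function
# attribute so that repeated calls become no-ops.
def _print_once_sketch(msg: str):
    if not getattr(_print_once_sketch, "has_run", False):
        print(msg)
        _print_once_sketch.has_run = True


def _print_once_demo():
    _print_once_sketch("printed once")
    _print_once_sketch("printed once")  # silent on the second call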
def step(self, snapshot_mode: str, meta_info: dict = None):
    if self._memory.isempty:
        # Warm-up phase
        print_cbt_once("Collecting samples until the replay memory is full.", "w")
        # Sample steps and store them in the replay memory
        ros = self.sampler_init.sample()
        self._memory.push(ros)
    else:
        # Sample steps and store them in the replay memory
        ros = self.sampler.sample()
        self._memory.push(ros)
    self._cnt_samples += sum([ro.length for ro in ros])  # don't count the evaluation samples

    # Log metrics computed from the old policy (before the update)
    if self._curr_iter % self.logger.print_intvl == 0:
        ros = self.sampler_eval.sample()
        rets = [ro.undiscounted_return() for ro in ros]
        ret_max = np.max(rets)
        ret_med = np.median(rets)
        ret_avg = np.mean(rets)
        ret_min = np.min(rets)
        ret_std = np.std(rets)
    else:
        ret_max, ret_med, ret_avg, ret_min, ret_std = 5 * [-pyrado.inf]  # dummy values
    self.logger.add_value("max return", ret_max, 4)
    self.logger.add_value("median return", ret_med, 4)
    self.logger.add_value("avg return", ret_avg, 4)
    self.logger.add_value("min return", ret_min, 4)
    self.logger.add_value("std return", ret_std, 4)
    self.logger.add_value("avg memory reward", self._memory.avg_reward(), 4)
    self.logger.add_value("avg rollout length", np.mean([ro.length for ro in ros]), 4)
    self.logger.add_value("num total samples", self._cnt_samples)

    # Save snapshot data
    self.make_snapshot(snapshot_mode, float(ret_avg), meta_info)

    # Use data in the memory to update the policy and the Q-functions
    self.update()
def get_V_tholds(cls, load_experiments: bool = True) -> dict:
    """If available, the voltage thresholds computed from measurements, else use default values."""
    # Hard-coded default thresholds
    tholds = dict(V_thold_x_pos=0.28, V_thold_x_neg=-0.10, V_thold_y_pos=0.28, V_thold_y_neg=-0.074)

    if load_experiments:
        if cls.measured_tholds is None:
            ex_dir = osp.join(pyrado.EVAL_DIR, 'volt_thold_qbb')
            if osp.exists(ex_dir) and osp.isdir(ex_dir) and os.listdir(ex_dir):
                print_cbt_once('Found measured thresholds, using the averages.', 'g')
                # Calculate the cumulative running average
                cma = np.zeros((2, 2))
                i = 0.
                for f in os.listdir(ex_dir):
                    if f.endswith('.npy'):
                        i += 1.
                        cma = cma + (np.load(osp.join(ex_dir, f)) - cma) / i
                tholds['V_thold_x_pos'] = cma[0, 1]
                tholds['V_thold_x_neg'] = cma[0, 0]
                tholds['V_thold_y_pos'] = cma[1, 1]
                tholds['V_thold_y_neg'] = cma[1, 0]
            else:
                print_cbt_once('No measured thresholds found, falling back to default values.', 'y')

            # Cache results for future calls
            cls.measured_tholds = tholds
        else:
            tholds = cls.measured_tholds

    return tholds
def get_voltage_tholds(cls, load_experiments: bool = True) -> dict:
    """If available, the voltage thresholds computed from measurements, else use default values."""
    # Hard-coded default thresholds
    tholds = dict(
        voltage_thold_x_pos=0.28, voltage_thold_x_neg=-0.10, voltage_thold_y_pos=0.28, voltage_thold_y_neg=-0.074
    )

    if load_experiments:
        if cls.measured_tholds is None:
            ex_dir = osp.join(pyrado.EVAL_DIR, "volt_thold_qbb")
            if osp.exists(ex_dir) and osp.isdir(ex_dir) and os.listdir(ex_dir):
                print_cbt_once("Found measured thresholds, using the averages.", "g")
                # Calculate the cumulative running average over all recorded threshold files
                cma = np.zeros((2, 2))
                i = 0.0
                for f in filter(lambda f: f.endswith(".npy"), os.listdir(ex_dir)):
                    i += 1.0
                    cma = cma + (np.load(osp.join(ex_dir, f)) - cma) / i
                tholds["voltage_thold_x_pos"] = cma[0, 1]
                tholds["voltage_thold_x_neg"] = cma[0, 0]
                tholds["voltage_thold_y_pos"] = cma[1, 1]
                tholds["voltage_thold_y_neg"] = cma[1, 0]
            else:
                print_cbt_once("No measured thresholds found, falling back to default values.", "y")

            # Cache results for future calls
            cls.measured_tholds = tholds
        else:
            tholds = cls.measured_tholds

    return tholds
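# Worked example (illustration only): the incremental update `cma = cma + (x - cma) / i` used above is the
# cumulative running average, i.e. after processing all measurement files it equals the plain arithmetic mean.
import numpy as np


def _cma_demo():
    samples = [np.array([[0.1, 0.3], [0.2, 0.4]]), np.array([[0.3, 0.1], [0.0, 0.2]])]
    cma = np.zeros((2, 2))
    for i, x in enumerate(samples, start=1):
        cma = cma + (x - cma) / i
    assert np.allclose(cma, np.mean(samples, axis=0))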
def __init__(
    self,
    save_dir: str,
    env: Env,
    policy: Policy,
    max_iter: int,
    pop_size: Optional[int],
    num_rollouts: int,
    num_is_samples: int,
    expl_std_init: float,
    expl_std_min: float = 0.01,
    symm_sampling: bool = False,
    num_workers: int = 4,
    logger: Optional[StepLogger] = None,
):
    r"""
    Constructor

    :param save_dir: directory to save the snapshots i.e. the results in
    :param env: the environment in which the policy operates
    :param policy: policy to be updated
    :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
    :param pop_size: number of solutions in the population
    :param num_rollouts: number of rollouts per policy sample
    :param num_is_samples: number of samples (policy parameter sets & returns) for importance sampling
    :param expl_std_init: initial standard deviation for the exploration strategy
    :param expl_std_min: minimal standard deviation for the exploration strategy
    :param symm_sampling: use an exploration strategy which samples symmetric populations
    :param num_workers: number of environments for parallel sampling
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    """
    if not isinstance(policy, LinearPolicy):
        print_cbt_once('PoWER was designed for linear policies.', 'y')

    # Call ParameterExploring's constructor
    super().__init__(
        save_dir,
        env,
        policy,
        max_iter,
        num_rollouts,
        pop_size=pop_size,
        num_workers=num_workers,
        logger=logger,
    )

    # Explore using normal noise
    self._expl_strat = NormalParamNoise(
        self._policy.num_param,
        full_cov=True,
        std_init=expl_std_init,
        std_min=expl_std_min,
    )
    if symm_sampling:
        # Exploration strategy based on symmetrical normally distributed noise
        if self.pop_size % 2 != 0:
            # Symmetric buffer needs to have an even number of samples
            self.pop_size += 1
        self._expl_strat = SymmParamExplStrat(self._expl_strat)

    # Initialize the memory for importance sampling
    self.num_is_samples = min(pop_size, num_is_samples)
    self.is_mem_ret = 1e-6 * to.ones(self.num_is_samples)  # has to be initialized > 0 due to first covariance update
    self.is_mem_params = to.zeros(self.num_is_samples, self._policy.num_param)
    self.is_mem_W = to.zeros(self.num_is_samples, self._policy.num_param, self._policy.num_param)
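# Hedged sketch (assumption, not the library's exact update rule): PoWER's importance-sampling memory is
# typically filled with the `num_is_samples` best parameter sets seen so far, ranked by return, which are
# then used to reweight the mean and covariance of the exploration distribution. The helper below only
# illustrates that selection step; its name and signature are hypothetical.
import torch as to


def _select_is_samples(returns: to.Tensor, params: to.Tensor, num_is_samples: int):
    """Return the best `num_is_samples` returns and the associated parameter sets."""
    top_rets, idcs = to.topk(returns, k=min(num_is_samples, returns.numel()))
    return top_rets, params[idcs]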
def step(self, act: np.ndarray) -> tuple:
    # Start robcom direct-control process
    if self._curr_step == 0:
        print_cbt('Executing trajectory on Barrett WAM', color='c', bright=True)
        self._dc.start()

    info = dict(t=self._curr_step * self._dt, act_raw=act)

    # Current reward depending on the (measurable) state and the current (unlimited) action
    remaining_steps = self._max_steps - (self._curr_step + 1) if self._max_steps is not pyrado.inf else 0
    self._curr_rew = self._task.step_rew(self.state, act, remaining_steps)  # always 0 for wam-bic-real

    # Limit the action
    act = self.limit_act(act)

    # The policy operates on specific indices `self.idcs_act`, i.e. joint 1 and 3 (and 5)
    self._qpos_des[self.idcs_act] = self.qpos_des_init[self.idcs_act] + act[:len(self.idcs_act)]
    self._qvel_des[self.idcs_act] = act[len(self.idcs_act):]

    # Send desired positions and velocities to robcom
    self._dc.groups.set(robcom.JointDesState.POS, self._qpos_des)
    self._dc.groups.set(robcom.JointDesState.VEL, self._qvel_des)
    self._dc.send_updates()

    # Sleep to keep the control frequency
    to_sleep = self._dt - (time.time() - self._t)
    if to_sleep > 0.:
        time.sleep(to_sleep)
    else:
        print_cbt_once('The step call was too slow for the control frequency', color='y')
    self._t = time.time()

    # Get current joint angles and angular velocities
    qpos, qvel = self._get_joint_state()
    self.qpos_real[self._curr_step] = qpos
    self.qvel_real[self._curr_step] = qvel
    self.state = np.concatenate([qpos, qvel])

    # Update the current step counter
    self._curr_step += 1

    # A GoallessTask only signals done when has_failed() is true, i.e. the state is out of bounds
    done = self._task.is_done(self.state)  # always False for wam-bic-real

    # Check if the maximum number of time steps has been exceeded
    if self._curr_step >= self._max_steps:
        done = True

    # Add the final reward if done
    if done:
        # Ask the user to enter the final reward
        self._curr_rew += self._task.final_rew(self.state, remaining_steps)

        # Stop robcom direct-control process
        self._dc.stop()

        # Stop robcom data streaming
        self._client.set(robcom.Streaming, False)

    return self.observe(self.state), self._curr_rew, done, info
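# Minimal sketch of the fixed-rate loop pattern used above (illustration only, names are local to this
# example): sleep for whatever time remains in the current control period, and warn if the computation
# already took longer than the period allows.
import time


def _run_at_fixed_rate(step_fn, dt: float, num_steps: int):
    t_last = time.time()
    for _ in range(num_steps):
        step_fn()
        to_sleep = dt - (time.time() - t_last)
        if to_sleep > 0.0:
            time.sleep(to_sleep)
        else:
            print("step was too slow for the desired control frequency")
        t_last = time.time()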
def __init__(
    self,
    save_dir: pyrado.PathLike,
    env: Env,
    policy: Policy,
    max_iter: int,
    eps: float,
    num_init_states_per_domain: int,
    pop_size: Optional[int],
    expl_std_init: float,
    expl_std_min: float = 0.01,
    num_domains: int = 1,
    symm_sampling: bool = False,
    softmax_transform: bool = False,
    use_map: bool = True,
    optim_mode: Optional[str] = "scipy",
    num_epoch_dual: int = 1000,
    lr_dual: float = 5e-4,
    num_workers: int = 4,
    logger: Optional[StepLogger] = None,
):
    r"""
    Constructor

    :param save_dir: directory to save the snapshots i.e. the results in
    :param env: the environment in which the policy operates
    :param policy: policy to be updated
    :param eps: bound on the KL divergence between policy updates, e.g. 0.1
    :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
    :param pop_size: number of solutions in the population
    :param num_init_states_per_domain: number of rollouts to cover the variance over initial states
    :param num_domains: number of rollouts due to the variance over domain parameters
    :param expl_std_init: initial standard deviation for the exploration strategy
    :param expl_std_min: minimal standard deviation for the exploration strategy
    :param symm_sampling: use an exploration strategy which samples symmetric populations
    :param softmax_transform: pass `True` to use a softmax to transform the returns, else use a shifted
                              exponential
    :param use_map: use maximum a-posteriori likelihood (`True`) or maximum likelihood (`False`) update rule
    :param optim_mode: choose the type of optimizer: 'torch' for a SGD-based optimizer or 'scipy' for the
                       SLSQP optimizer from scipy (recommended)
    :param num_epoch_dual: number of epochs for the minimization of the dual functions, ignored if
                           `optim_mode = 'scipy'`
    :param lr_dual: learning rate for the dual's optimizer, ignored if `optim_mode = 'scipy'`
    :param num_workers: number of environments for parallel sampling
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    """
    if not isinstance(policy, (LinearPolicy, DomainDistrParamPolicy)):
        print_cbt_once("REPS was designed for linear policies.", "y")

    # Call ParameterExploring's constructor
    super().__init__(
        save_dir=save_dir,
        env=env,
        policy=policy,
        max_iter=max_iter,
        num_init_states_per_domain=num_init_states_per_domain,
        num_domains=num_domains,
        pop_size=pop_size,
        num_workers=num_workers,
        logger=logger,
    )

    # Store the inputs
    self.eps = eps
    self.softmax_transform = softmax_transform
    self.use_map = use_map

    # Explore using normal noise
    self._expl_strat = NormalParamNoise(
        self._policy.num_param,
        full_cov=True,
        std_init=expl_std_init,
        std_min=expl_std_min,
        use_cuda=self._policy.device != "cpu",
    )
    if symm_sampling:
        # Exploration strategy based on symmetrical normally distributed noise
        if self.pop_size % 2 != 0:
            # Symmetric buffer needs to have an even number of samples
            self.pop_size += 1
        self._expl_strat = SymmParamExplStrat(self._expl_strat)

    # Dual optimization
    self.num_epoch_dual = num_epoch_dual
    self._log_eta = to.tensor([0.0], requires_grad=True)
    self.optim_mode = optim_mode.lower()
    if self.optim_mode == "scipy":
        pass
    elif self.optim_mode == "torch":
        self.optim_dual = to.optim.SGD([{"params": self._log_eta}], lr=lr_dual, momentum=0.8, weight_decay=1e-4)
        # self.optim_dual = to.optim.Adam([{'params': self._log_eta}], lr=lr_dual, eps=1e-5)  # used in [2], but unstable here
    else:
        raise pyrado.ValueErr(given=optim_mode, eq_constraint=["scipy", "torch"])
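# Hedged illustration of the role of `eta` in REPS (not the library's exact dual or weighting code):
# sample weights are obtained from a temperature-scaled exponential of the (shifted) returns, and
# parameterizing the temperature as `eta = exp(log_eta)` keeps it strictly positive during
# gradient-based optimization. The function name below is hypothetical.
import torch as to


def _reps_weights(returns: to.Tensor, log_eta: to.Tensor) -> to.Tensor:
    eta = to.exp(log_eta)
    shifted = returns - returns.max()  # shift the returns for numerical stability
    return to.exp(shifted / eta)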