def __call__(self, dp_values: to.Tensor = None) -> Tuple[to.Tensor, StepSequence]:
        """
        Yield one rollout from the pre-recorded buffer of rollouts, and compute the features of the data used for sbi.

        :param dp_values: ignored, just here for the interface compatibility
        :return: features computed from the time series data, and the complete rollout
        """
        print_cbt_once(f"Using pre-recorded target domain rollouts to from {self.rollouts_dir}", "g")

        # Get the pre-recorded rollout and advance the ring index
        if not isinstance(self.rollouts_rec, list):
            raise pyrado.TypeErr(given=self.rollouts_rec, expected_type=list)
        if not isinstance(self.rollouts_rec[0], StepSequence):
            raise pyrado.TypeErr(given=self.rollouts_rec[0], expected_type=StepSequence)

        ro = self.rollouts_rec[self._ring_idx]
        self._ring_idx = (self._ring_idx + 1) % self.num_rollouts

        # Pre-processing
        ro.torch()

        # Assemble the data
        data_real = to.cat([ro.states[:-1, :], ro.get_data_values(self._action_field)], dim=1)
        if self._embedding.requires_target_domain_data:
            data_real = to.cat([data_real, data_real], dim=1)

        # Compute the features
        data_real = data_real.unsqueeze(0)  # only one target domain rollout
        data_real = self._embedding(Embedding.pack(data_real))  # shape [1, dim_feat]

        # Check shape (here no batching and always one rollout)
        if data_real.shape[0] != 1 or data_real.ndim != 2:
            raise pyrado.ShapeErr(given=data_real, expected_match=(1, -1))

        return data_real, ro
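
The modulo update of `self._ring_idx` above cycles through the buffer of pre-recorded rollouts, so repeated calls yield them in order and wrap around at the end. A minimal, self-contained sketch of that ring-index pattern (the class `RingIterator` is illustrative and not part of pyrado):

class RingIterator:
    """Cycle through a fixed list, wrapping around after the last element."""

    def __init__(self, items: list):
        self._items = items
        self._idx = 0

    def next(self):
        item = self._items[self._idx]
        self._idx = (self._idx + 1) % len(self._items)  # wrap around
        return item


rollouts = ["ro_0", "ro_1", "ro_2"]
it = RingIterator(rollouts)
print([it.next() for _ in range(5)])  # ['ro_0', 'ro_1', 'ro_2', 'ro_0', 'ro_1']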
Example #2
def test_print_cbt_once(color, bright):
    # Reset the flag for this test
    print_cbt_once.has_run = False

    msg = 'You should only read this once per color and brightness'
    for i in range(10):
        print_cbt_once(msg, color, bright, tag='tag', end='\n')
        if i > 0:
            assert print_cbt_once.has_run
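
The test above relies on `print_cbt_once` remembering whether it has already printed via a `has_run` attribute set on the function object, which is why the flag is reset before the loop. A minimal sketch of how such a print-once wrapper could look (an illustration of the pattern, not pyrado's actual implementation):

def print_once(msg: str, *args, **kwargs):
    """Print the message only on the first call; afterwards do nothing."""
    if not getattr(print_once, "has_run", False):
        print(msg, *args, **kwargs)
        print_once.has_run = True


print_once("hello")  # prints
print_once("hello")  # silent, has_run is already True
print_once.has_run = False  # reset, e.g. at the start of a test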
Example #3
    def step(self, snapshot_mode: str, meta_info: dict = None):
        if self._memory.isempty:
            # Warm-up phase
            print_cbt_once("Collecting samples until replay memory if full.",
                           "w")
            # Sample steps and store them in the replay memory
            ros = self.sampler_init.sample()
            self._memory.push(ros)
        else:
            # Sample steps and store them in the replay memory
            ros = self.sampler.sample()
            self._memory.push(ros)
        self._cnt_samples += sum([ro.length for ro in ros])  # don't count the evaluation samples

        # Log metrics computed from the old policy (before the update)
        if self._curr_iter % self.logger.print_intvl == 0:
            ros = self.sampler_eval.sample()
            rets = [ro.undiscounted_return() for ro in ros]
            ret_max = np.max(rets)
            ret_med = np.median(rets)
            ret_avg = np.mean(rets)
            ret_min = np.min(rets)
            ret_std = np.std(rets)
        else:
            ret_max, ret_med, ret_avg, ret_min, ret_std = 5 * [-pyrado.inf]  # dummy values
        self.logger.add_value("max return", ret_max, 4)
        self.logger.add_value("median return", ret_med, 4)
        self.logger.add_value("avg return", ret_avg, 4)
        self.logger.add_value("min return", ret_min, 4)
        self.logger.add_value("std return", ret_std, 4)
        self.logger.add_value("avg memory reward", self._memory.avg_reward(),
                              4)
        self.logger.add_value("avg rollout length",
                              np.mean([ro.length for ro in ros]), 4)
        self.logger.add_value("num total samples", self._cnt_samples)

        # Save snapshot data
        self.make_snapshot(snapshot_mode, float(ret_avg), meta_info)

        # Use data in the memory to update the policy and the Q-functions
        self.update()
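
The branch at the top of `step` implements a warm-up phase: while the replay memory is still empty, an initial sampler fills it before the regular sampler and the policy update take over. A self-contained sketch of that fill-then-sample pattern with a toy memory (`ToyReplayMemory` is illustrative, not pyrado's class):

from collections import deque
import random

class ToyReplayMemory:
    def __init__(self, capacity: int):
        self._buf = deque(maxlen=capacity)

    @property
    def isempty(self) -> bool:
        return len(self._buf) == 0

    def push(self, samples):
        self._buf.extend(samples)

    def sample(self, n: int):
        return random.sample(list(self._buf), min(n, len(self._buf)))


memory = ToyReplayMemory(capacity=1000)
if memory.isempty:
    memory.push(range(100))  # warm-up: fill before the first update
batch = memory.sample(32)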
Example #4
    def get_V_tholds(cls, load_experiments: bool = True) -> dict:
        """ If available, the voltage thresholds computed from measurements, else use default values. """
        # Hard-coded default thresholds
        tholds = dict(V_thold_x_pos=0.28,
                      V_thold_x_neg=-0.10,
                      V_thold_y_pos=0.28,
                      V_thold_y_neg=-0.074)

        if load_experiments:
            if cls.measured_tholds is None:
                ex_dir = osp.join(pyrado.EVAL_DIR, 'volt_thold_qbb')
                if osp.exists(ex_dir) and osp.isdir(ex_dir) and os.listdir(ex_dir):
                    print_cbt_once('Found measured thresholds, using the averages.', 'g')
                    # Calculate cumulative running average
                    cma = np.zeros((2, 2))
                    i = 0.
                    for f in os.listdir(ex_dir):
                        if f.endswith('.npy'):
                            i += 1.
                            cma = cma + (np.load(osp.join(ex_dir, f)) - cma) / i
                    tholds['V_thold_x_pos'] = cma[0, 1]
                    tholds['V_thold_x_neg'] = cma[0, 0]
                    tholds['V_thold_y_pos'] = cma[1, 1]
                    tholds['V_thold_y_neg'] = cma[1, 0]
                else:
                    print_cbt_once('No measured thresholds found, falling back to default values.', 'y')

                # Cache results for future calls
                cls.measured_tholds = tholds
            else:
                tholds = cls.measured_tholds

        return tholds
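
The update `cma = cma + (x - cma) / i` is the standard cumulative running average: after processing the i-th file, `cma` equals the arithmetic mean of all values seen so far. A quick numpy check of that recurrence:

import numpy as np

rng = np.random.default_rng(0)
values = rng.normal(size=(5, 2, 2))  # e.g. five (2, 2) threshold measurements

cma = np.zeros((2, 2))
for i, x in enumerate(values, start=1):
    cma = cma + (x - cma) / i  # incremental mean update

assert np.allclose(cma, values.mean(axis=0))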
Example #5
    def get_voltage_tholds(cls, load_experiments: bool = True) -> dict:
        """If available, the voltage thresholds computed from measurements, else use default values."""
        # Hard-coded default thresholds
        tholds = dict(voltage_thold_x_pos=0.28,
                      voltage_thold_x_neg=-0.10,
                      voltage_thold_y_pos=0.28,
                      voltage_thold_y_neg=-0.074)

        if load_experiments:
            if cls.measured_tholds is None:
                ex_dir = osp.join(pyrado.EVAL_DIR, "volt_thold_qbb")
                if osp.exists(ex_dir) and osp.isdir(ex_dir) and os.listdir(ex_dir):
                    print_cbt_once("Found measured thresholds, using the averages.", "g")
                    # Calculate cumulative running average
                    cma = np.zeros((2, 2))
                    i = 0.0
                    for f in filter(lambda f: f.endswith(".npy"),
                                    os.listdir(".npy")):
                        i += 1.0
                        cma = cma + (np.load(osp.join(ex_dir, f)) - cma) / i
                    tholds["voltage_thold_x_pos"] = cma[0, 1]
                    tholds["voltage_thold_x_neg"] = cma[0, 0]
                    tholds["voltage_thold_y_pos"] = cma[1, 1]
                    tholds["voltage_thold_y_neg"] = cma[1, 0]
                else:
                    print_cbt_once("No measured thresholds found, falling back to default values.", "y")

                # Cache results for future calls
                cls.measured_tholds = tholds
            else:
                tholds = cls.measured_tholds

        return tholds
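
Both variants cache the loaded thresholds in the class attribute `measured_tholds`, so the measurement directory is scanned only on the first call. A minimal sketch of this class-level caching pattern outside of pyrado (`CalibrationStore` and its values are purely illustrative):

class CalibrationStore:
    _cache = None  # class-level cache, shared by all calls

    @classmethod
    def get_thresholds(cls) -> dict:
        if cls._cache is None:
            # The expensive load happens only once
            cls._cache = {"x_pos": 0.28, "x_neg": -0.10}
        return cls._cache


first = CalibrationStore.get_thresholds()   # loads and caches
second = CalibrationStore.get_thresholds()  # returns the cached dict
assert first is second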
Example #6
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 pop_size: Optional[int],
                 num_rollouts: int,
                 num_is_samples: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 symm_sampling: bool = False,
                 num_workers: int = 4,
                 logger: Optional[StepLogger] = None):
        r"""
        Constructor

        :param save_dir: directory in which to save the snapshots, i.e. the results
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param pop_size: number of solutions in the population
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_rollouts: number of rollouts per policy sample
        :param num_is_samples: number of samples (policy parameter sets & returns) for importance sampling
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(policy, LinearPolicy):
            print_cbt_once('PoWER was designed for linear policies.', 'y')

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Initialize memory for importance sampling
        self.num_is_samples = min(pop_size, num_is_samples)
        # The returns have to be initialized > 0 due to the first covariance update
        self.is_mem_ret = 1e-6 * to.ones(self.num_is_samples)
        self.is_mem_params = to.zeros(self.num_is_samples, self._policy.num_param)
        self.is_mem_W = to.zeros(self.num_is_samples, self._policy.num_param, self._policy.num_param)
Example #7
    def step(self, act: np.ndarray) -> tuple:
        # Start robcom direct-control process
        if self._curr_step == 0:
            print_cbt('Executing trajectory on Barrett WAM', color='c', bright=True)
            self._dc.start()

        info = dict(t=self._curr_step * self._dt, act_raw=act)

        # Current reward depending on the (measurable) state and the current (unlimited) action
        remaining_steps = self._max_steps - (self._curr_step + 1) if self._max_steps is not pyrado.inf else 0
        self._curr_rew = self._task.step_rew(self.state, act, remaining_steps)  # always 0 for wam-bic-real

        # Limit the action
        act = self.limit_act(act)

        # The policy operates on specific indices `self.idcs_act`, i.e. joint 1 and 3 (and 5)
        self._qpos_des[self.idcs_act] = self.qpos_des_init[self.idcs_act] + act[:len(self.idcs_act)]
        self._qvel_des[self.idcs_act] = act[len(self.idcs_act):]

        # Send desired positions and velocities to robcom
        self._dc.groups.set(robcom.JointDesState.POS, self._qpos_des)
        self._dc.groups.set(robcom.JointDesState.VEL, self._qvel_des)
        self._dc.send_updates()

        # Sleep to keep the frequency
        to_sleep = self._dt - (time.time() - self._t)
        if to_sleep > 0.:
            time.sleep(to_sleep)
        else:
            print_cbt_once('The step call was too slow for the control frequency', color='y')
        self._t = time.time()

        # Get current joint angles and angular velocities
        qpos, qvel = self._get_joint_state()
        self.qpos_real[self._curr_step] = qpos
        self.qvel_real[self._curr_step] = qvel
        self.state = np.concatenate([qpos, qvel])

        # Update current step and state
        self._curr_step += 1

        # A GoallessTask only signals done when has_failed() is true, i.e. the state is out of bounds
        done = self._task.is_done(self.state)  # always false for wam-bic-real

        # Check if exceeded max time steps
        if self._curr_step >= self._max_steps:
            done = True

        # Add final reward if done
        if done:
            # Ask the user to enter the final reward
            self._curr_rew += self._task.final_rew(self.state, remaining_steps)

            # Stop robcom direct-control process
            self._dc.stop()

            # Stop robcom data streaming
            self._client.set(robcom.Streaming, False)

        return self.observe(self.state), self._curr_rew, done, info
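
The sleep logic keeps the control loop at the desired frequency: each step sleeps for whatever is left of the period `dt`, and warns once if the computation alone already exceeded it. A minimal self-contained sketch of that fixed-rate loop:

import time

dt = 0.02  # desired period, i.e. a 50 Hz control frequency
t_last = time.time()

for step in range(5):
    time.sleep(0.001)  # stand-in for computing and sending the control command

    to_sleep = dt - (time.time() - t_last)
    if to_sleep > 0.0:
        time.sleep(to_sleep)  # wait out the remainder of the period
    else:
        print("step computation was too slow for the control frequency")
    t_last = time.time()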
Example #8
    def __init__(
        self,
        save_dir: pyrado.PathLike,
        env: Env,
        policy: Policy,
        max_iter: int,
        eps: float,
        num_init_states_per_domain: int,
        pop_size: Optional[int],
        expl_std_init: float,
        expl_std_min: float = 0.01,
        num_domains: int = 1,
        symm_sampling: bool = False,
        softmax_transform: bool = False,
        use_map: bool = True,
        optim_mode: Optional[str] = "scipy",
        num_epoch_dual: int = 1000,
        lr_dual: float = 5e-4,
        num_workers: int = 4,
        logger: Optional[StepLogger] = None,
    ):
        r"""
        Constructor

        :param save_dir: directory in which to save the snapshots, i.e. the results
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param eps: bound on the KL divergence between policy updates, e.g. 0.1
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_init_states_per_domain: number of rollouts to cover the variance over initial states
        :param num_domains: number of rollouts due to the variance over domain parameters
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param softmax_transform: pass `True` to use a softmax to transform the returns, else use a shifted exponential
        :param use_map: use maximum a-posteriori likelihood (`True`) or maximum likelihood (`False`) update rule
        :param optim_mode: choose the type of optimizer: 'torch' for an SGD-based optimizer or 'scipy' for the SLSQP
                           optimizer from scipy (recommended)
        :param num_epoch_dual: number of epochs for the minimization of the dual functions, ignored if
                               `optim_mode = 'scipy'`
        :param lr_dual: learning rate for the dual's optimizer, ignored if `optim_mode = 'scipy'`
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(policy, (LinearPolicy, DomainDistrParamPolicy)):
            print_cbt_once("REPS was designed for linear policies.", "y")

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir=save_dir,
            env=env,
            policy=policy,
            max_iter=max_iter,
            num_init_states_per_domain=num_init_states_per_domain,
            num_domains=num_domains,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        # Store the inputs
        self.eps = eps
        self.softmax_transform = softmax_transform
        self.use_map = use_map

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
            use_cuda=self._policy.device != "cpu",
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Dual optimization
        self.num_epoch_dual = num_epoch_dual
        self._log_eta = to.tensor([0.0], requires_grad=True)
        self.optim_mode = optim_mode.lower()
        if self.optim_mode == "scipy":
            pass
        elif self.optim_mode == "torch":
            self.optim_dual = to.optim.SGD([{"params": self._log_eta}], lr=lr_dual, momentum=0.8, weight_decay=1e-4)
            # self.optim_dual = to.optim.Adam([{'params': self._log_eta}], lr=lr_dual, eps=1e-5)  # used in [2], but unstable here
        else:
            raise pyrado.ValueErr(given=optim_mode, eq_constraint=["scipy", "torch"])
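
In REPS, the dual variable eta (stored here as `_log_eta` so that exponentiation keeps it positive) acts as a temperature that turns returns into importance weights, roughly w_i proportional to exp((R_i - max R) / eta). A conceptual sketch of that weighting, not pyrado's implementation of the dual optimization:

import torch as to

def reps_weights(returns: to.Tensor, log_eta: to.Tensor) -> to.Tensor:
    """Turn returns into non-negative importance weights using a temperature eta."""
    eta = to.exp(log_eta)  # parametrize via the log to guarantee eta > 0
    shifted = returns - returns.max()  # shift for numerical stability
    w = to.exp(shifted / eta)
    return w / w.sum()  # normalize to sum to one


returns = to.tensor([1.0, 2.0, 5.0, 3.0])
print(reps_weights(returns, log_eta=to.tensor(0.0)))  # higher return -> higher weight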