Code example #1
File: sbi_base.py  Project: fdamken/SimuRLacra
    def __setstate__(self, state):
        # Call Algorithm's __setstate__()
        super().__setstate__(state)

        # Reconstruct the simulator for sbi
        try:
            rollouts_real = pyrado.load("rollouts_real.pkl",
                                        self._save_dir,
                                        prefix=f"iter_{self._curr_iter}")
        except FileNotFoundError:
            try:
                rollouts_real = pyrado.load(
                    "rollouts_real.pkl",
                    self._save_dir,
                    prefix=f"iter_{self._curr_iter - 1}")
            except (FileNotFoundError, RuntimeError, pyrado.PathErr,
                    pyrado.TypeErr, pyrado.ValueErr):
                rollouts_real = None
        self._setup_sbi(state["_sbi_prior"],
                        rollouts_real)  # sbi_prior is fine as it is

        # Reconstruct the tensorboard printer with the one from this algorithm
        summary_writer = state["_logger"].printers[2].writer
        assert isinstance(summary_writer, SummaryWriter)
        self.__dict__["_subrtn_sbi"]._summary_writer = summary_writer

        # Set the internal sbi construction callable given the predefined posterior hyper-parameter.
        self.__dict__["_subrtn_sbi"]._build_neural_net = sbiutils.posterior_nn(
            **self.posterior_hparam)
Code example #2
def _load_experiment(ex_dir: pyrado.PathLike):
    # Load the algorithm
    algo = Algorithm.load_snapshot(ex_dir)
    if not isinstance(algo, (NPDR, BayesSim)):
        raise pyrado.TypeErr(given=algo, expected_type=(NPDR, BayesSim))

    # Load the prior and the data
    prior = pyrado.load("prior.pt", ex_dir)
    data_real = pyrado.load("data_real.pt", ex_dir)

    # Load the posteriors
    posteriors = [
        SBIBase.load_posterior(ex_dir, idx_round=i, verbose=True)
        for i in range(algo.num_sbi_rounds)
    ]
    posteriors = remove_none_from_list(
        posteriors)  # in case the algorithm terminated early

    if data_real.shape[0] > len(posteriors):
        print_cbt(
            f"Found {data_real.shape[0]} data sets but {len(posteriors)} posteriors. Truncated the superfluous data.",
            "y",
        )
        data_real = data_real[:len(posteriors), :]

    # Artificially repeat the data (which was the same for every round) to later be able to use the same code
    data_real = data_real.repeat(len(posteriors), 1)
    assert data_real.shape[0] == len(posteriors)

    return algo, prior, data_real, posteriors
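
A minimal, hedged usage sketch for the helper above. The experiment directory below is a placeholder, and `import os` plus the imports of the surrounding module (pyrado, NPDR, BayesSim, SBIBase) are assumed:

import os

# Hypothetical driver, not part of the repository
ex_dir = os.path.join(pyrado.TEMP_DIR, "some_env", "npdr", "")  # placeholder path
algo, prior, data_real, posteriors = _load_experiment(ex_dir)
print(f"Loaded {len(posteriors)} posterior(s); data_real has shape {tuple(data_real.shape)}.")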
Code example #3
File: bayrn.py  Project: fdamken/SimuRLacra
    def train_argmax_policy(
        load_dir: pyrado.PathLike,
        env_sim: MetaDomainRandWrapper,
        subrtn: Algorithm,
        num_restarts: int,
        num_samples: int,
        policy_param_init: to.Tensor = None,
        valuefcn_param_init: to.Tensor = None,
        subrtn_snapshot_mode: str = "best",
    ) -> Policy:
        """
        Train a policy based on the maximizer of the posterior mean.

        :param load_dir: directory to load from
        :param env_sim: simulation environment
        :param subrtn: algorithm which performs the policy / value-function optimization
        :param num_restarts: number of restarts for the optimization of the acquisition function
        :param num_samples: number of samples for the optimization of the acquisition function
        :param policy_param_init: initial policy parameter values for the subroutine, set `None` to be random
        :param valuefcn_param_init: initial value function parameter values for the subroutine, set `None` to be random
        :param subrtn_snapshot_mode: snapshot mode for saving during training of the subroutine
        :return: the final BayRn policy
        """
        # Load the required data
        cands = pyrado.load("candidates.pt", load_dir)
        cands_values = pyrado.load("candidates_values.pt",
                                   load_dir).unsqueeze(1)
        ddp_space = pyrado.load("ddp_space.pkl", load_dir)

        if cands.shape[0] > cands_values.shape[0]:
            print_cbt(
                f"There are {cands.shape[0]} candidates but only {cands_values.shape[0]} evaluations. Ignoring the"
                f"candidates without evaluation for computing the argmax.",
                "y",
            )
            cands = cands[:cands_values.shape[0], :]

        # Find the maximizer
        argmax_cand = BayRn.argmax_posterior_mean(cands, cands_values,
                                                  ddp_space, num_restarts,
                                                  num_samples)

        # Set the domain randomizer
        env_sim.adapt_randomizer(argmax_cand.numpy())

        # Reset the subroutine algorithm which includes resetting the exploration
        subrtn.reset()

        # Do a warm start
        subrtn.init_modules(warmstart=True,
                            policy_param_init=policy_param_init,
                            valuefcn_param_init=valuefcn_param_init)

        subrtn.train(snapshot_mode=subrtn_snapshot_mode,
                     meta_info=dict(suffix="argmax"))
        return subrtn.policy
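
A hedged sketch of calling this routine after a finished BayRn run. `load_dir`, `env_sim`, and `ppo` are placeholders for objects of an existing experiment, calling it as a static method on `BayRn` is an assumption, and the file name `policy_argmax.pt` is hypothetical:

policy_argmax = BayRn.train_argmax_policy(
    load_dir=load_dir,  # directory of the finished BayRn experiment (placeholder)
    env_sim=env_sim,    # the MetaDomainRandWrapper used during training
    subrtn=ppo,         # the policy / value-function optimization subroutine, e.g. PPO
    num_restarts=500,
    num_samples=1000,
)
pyrado.save(policy_argmax, "policy_argmax.pt", load_dir, use_state_dict=True)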
Code example #4
    def eval_init_policies(self):
        """
        Execute the trained initial policies on the target device and store the estimated return per candidate.
        The number of initial policies to evaluate is the number of found policies.
        """
        # Crawl through the experiment's directory
        for root, dirs, files in os.walk(self.save_dir):
            dirs.clear()  # prevents walk() from going into subdirectories
            found_policies = [p for p in files if p.startswith('init_') and p.endswith('_policy.pt')]
            found_cands = [c for c in files if c.startswith('init_') and c.endswith('_candidate.pt')]
        if not len(found_policies) == len(found_cands):
            raise pyrado.ValueErr(msg='Found a different number of initial policies than candidates!')
        elif len(found_policies) == 0:
            raise pyrado.ValueErr(msg='No policies or candidates found!')

        num_init_cand = len(found_cands)
        cands_values = to.empty(num_init_cand)

        # Load all found candidates to save them into a single tensor
        found_cands = natural_sort(found_cands)  # the order is important since it determines the rows of the tensor
        cands = to.stack([to.load(osp.join(self.save_dir, c)) for c in found_cands])

        # Evaluate learned policies from random candidates on the target environment (real-world) system
        for i in range(num_init_cand):
            policy = pyrado.load(self.policy, 'policy', 'pt', self.save_dir, meta_info=dict(prefix=f'init_{i}'))
            cands_values[i] = self.eval_policy(self.save_dir, self._env_real, policy, self.mc_estimator,
                                               prefix=f'init_{i}', num_rollouts=self.num_eval_rollouts_real)

        # Save the candidates and their returns into tensors (policy is saved during training or exists already)
        # pyrado.save(cands, 'candidates', 'pt', self._save_dir, meta_info)
        pyrado.save(cands_values, 'candidates_values', 'pt', self.save_dir, meta_info=None)
        self.cands, self.cands_values = cands, cands_values
Code example #5
File: dql.py  Project: arlene-kuehn/SimuRLacra
    def init_modules(self,
                     warmstart: bool,
                     suffix: str = '',
                     prefix: str = None,
                     **kwargs):
        # Initialize the policy
        super().init_modules(warmstart, suffix, prefix, **kwargs)

        if prefix is None:
            prefix = f'iter_{self._curr_iter - 1}'

        tpi = kwargs.get('target_param_init', None)

        if warmstart and tpi is not None:
            self.qfcn_targ.init_param(tpi)
        elif warmstart and tpi is None and self._curr_iter > 0:
            self.qfcn_targ = pyrado.load(self.qfcn_targ,
                                         'qfcn_target',
                                         'pt',
                                         self.save_dir,
                                         meta_info=dict(prefix=prefix,
                                                        suffix=suffix))
        else:
            # Reset the target Q-function
            self.qfcn_targ.init_param()
Code example #6
File: spota.py  Project: arlene-kuehn/SimuRLacra
    def _handle_neg_samples(self, cand_rets: np.ndarray, refs_rets: np.ndarray,
                            k: int, i: int) -> np.ndarray:
        """
        Process negative optimality gap samples by Looking at the other Reference Solutions

        :param cand_rets: array of the candidate's return values
        :param refs_rets: array of the references' return values
        :param k: index of the reference solution
        :param i: index of the domain
        :return refs_rets: if a better reference has been found, the associated value will be overwritten
        """
        if refs_rets[k, i] < cand_rets[k, i]:
            print_cbt(
                f'\nReference {k + 1} is worse than the candidate on domain realization {i + 1}.\n'  # 1-based index
                'Trying to replace this reference at this realization with a different one',
                'y')
            for other_k in range(self.nG):
                if other_k == k:
                    # Do nothing for the bad solution that brought us here
                    continue
                else:
                    # Load a reference solution different from the k-th
                    other_ref = pyrado.load(
                        self._subrtn_refs._policy, 'policy', 'pt',
                        self.save_dir,
                        dict(prefix=f'iter_{self._curr_iter}',
                             suffix=f'ref_{other_k}'))
                    other_ref_ret = 0
                    for r in range(self.nJ):
                        # Set the same random seed
                        pyrado.set_seed(self.base_seed + i * self.nJ + r)
                        # Set the circular index for the particular realization
                        self.env_dr.ring_idx = i
                        # Do the rollout and collect the return
                        ro_other_ref = rollout(self.env_dr,
                                               other_ref,
                                               eval=True)
                        other_ref_ret += ro_other_ref.undiscounted_return(
                        ) / self.nJ  # average over nJ seeds
                    # Store the value if value is better
                    if other_ref_ret > refs_rets[k, i]:
                        refs_rets[k, i] = other_ref_ret
                        # If a better one was found, do not iterate over the remaining reference solutions
                        break

            if refs_rets[k, i] > cand_rets[k, i]:
                # Found a different reference that achieves a higher return than the candidate
                print_cbt('Successfully handled a negative OG sample', 'g')
            else:
                refs_rets[k, i] = cand_rets[
                    k, i]  # forces optimality gap sample to be 0
                print_cbt(
                    'Unsuccessfully handled a negative OG sample: Set the value to 0',
                    'r')

        else:
            # Everything is as it should be
            pass

        return refs_rets
Code example #7
    def init_modules(self,
                     warmstart: bool,
                     suffix: str = '',
                     prefix: str = None,
                     **kwargs):
        if prefix is None:
            prefix = f'iter_{self._curr_iter - 1}'

        ppi = kwargs.get('policy_param_init', None)
        vpi = kwargs.get('valuefcn_param_init', None)

        if warmstart and ppi is not None and vpi is not None:
            self._policy.init_param(ppi)
            self._critic.vfcn.init_param(vpi)
            print_cbt('Learning given a fixed parameter initialization.', 'w')

        elif warmstart and ppi is None and self._curr_iter > 0:
            self._policy = pyrado.load(self._policy,
                                       'policy',
                                       'pt',
                                       self.save_dir,
                                       meta_info=dict(prefix=prefix,
                                                      suffix=suffix))
            self._critic.vfcn = pyrado.load(self._critic.vfcn,
                                            'vfcn',
                                            'pt',
                                            self.save_dir,
                                            meta_info=dict(prefix=prefix,
                                                           suffix=suffix))
            print_cbt(
                f'Learning given the results from iteration {self._curr_iter - 1}',
                'w')

        else:
            # Reset the policy
            self._policy.init_param()
            self._critic.vfcn.init_param()
            print_cbt('Learning from scratch.', 'w')
Code example #8
File: sac.py  Project: fdamken/SimuRLacra
    def init_modules(self, warmstart: bool, suffix: str = "", prefix: str = None, **kwargs):
        # Initialize the policy
        super().init_modules(warmstart, suffix, prefix, **kwargs)

        if prefix is None:
            prefix = f"iter_{self._curr_iter - 1}"

        t1pi = kwargs.get("target1_param_init", None)
        t2pi = kwargs.get("target2_param_init", None)

        if warmstart and None not in (t1pi, t2pi):
            self.qfcn_targ_1.init_param(t1pi)
            self.qfcn_targ_2.init_param(t2pi)
        elif warmstart and None in (t1pi, t2pi) and self._curr_iter > 0:
            self.qfcn_targ_1 = pyrado.load(
                "qfcn_target1.pt", self.save_dir, prefix=prefix, suffix=suffix, obj=self.qfcn_targ_1
            )
            self.qfcn_targ_2 = pyrado.load(
                "qfcn_target2.pt", self.save_dir, prefix=prefix, suffix=suffix, obj=self.qfcn_targ_2
            )
        else:
            # Reset the target Q-functions
            self.qfcn_targ_1.init_param()
            self.qfcn_targ_2.init_param()
Code example #9
File: actor_critic.py  Project: fdamken/SimuRLacra
    def init_modules(self, warmstart: bool, suffix: str = "", prefix: str = None, **kwargs):
        if prefix is None:
            prefix = f"iter_{self._curr_iter - 1}"

        ppi = kwargs.get("policy_param_init", None)
        vpi = kwargs.get("valuefcn_param_init", None)

        if warmstart and ppi is not None and vpi is not None:
            self._policy.init_param(ppi)
            self._critic.vfcn.init_param(vpi)
            print_cbt("Learning given an fixed parameter initialization.", "w")

        elif warmstart and ppi is None and self._curr_iter > 0:
            self._policy = pyrado.load("policy.pt", self.save_dir, prefix=prefix, suffix=suffix, obj=self._policy)
            self._critic.vfcn = pyrado.load(
                "vfcn.pt", self.save_dir, prefix=prefix, suffix=suffix, obj=self._critic.vfcn
            )
            print_cbt(f"Learning given the results from iteration {self._curr_iter - 1}", "w")

        else:
            # Reset the policy
            self._policy.init_param()
            self._critic.vfcn.init_param()
            print_cbt("Learning from scratch.", "w")
Code example #10
File: sbi_base.py  Project: fdamken/SimuRLacra
    def get_latest_proposal_prev_iter(
            self) -> Union[sbiutils.BoxUniform, DirectPosterior]:
        """
        Get either the prior or the conditioned posterior from the (last round of) previous iteration.

        :return: proposal for simulating with sbi
        """
        if self._curr_iter == 0 or (hasattr(self, "reset_proposal_each_iter")
                                    and self.reset_proposal_each_iter):
            proposal = self._sbi_prior
        else:
            prefix = f"iter_{self._curr_iter - 1}_round_{self.num_sbi_rounds - 1}"
            proposal = pyrado.load("posterior.pt",
                                   self._save_dir,
                                   prefix=prefix)
        return proposal
Code example #11
def load_rollouts_from_dir(
    ex_dir: str,
    key: Optional[str] = "rollout",
    file_exts: Tuple[str] = ("pt", "pkl")
) -> Tuple[List[StepSequence], List[str]]:
    """
    Crawl through the given directory, sort the files, and load all rollouts, i.e. all files that include the key.

    :param ex_dir: directory, e.g. an experiment folder
    :param key: word or part of a word that needs to be in the name of a file for it to be loaded
    :param file_exts: file extensions to be considered for loading
    :return: list of loaded rollouts, and list of file names without extension
    """
    if not osp.isdir(ex_dir):
        raise pyrado.PathErr(given=ex_dir)
    if not isinstance(key, str):
        raise pyrado.TypeErr(given=key, expected_type=str)
    if not is_iterable(file_exts):
        raise pyrado.TypeErr(given=file_exts, expected_type=Iterable)

    rollouts = []
    names = []
    for root, dirs, files in os.walk(ex_dir):
        dirs.clear()  # prevents walk() from going into subdirectories
        natural_sort(files)
        for f in files:
            f_ext = f[f.rfind(".") + 1:]
            if key in f and f_ext in file_exts:
                name = f[:f.rfind(".")]
                names.append(name)
                rollouts.append(pyrado.load(f"{name}.{f_ext}", load_dir=root))

    if not rollouts:
        raise pyrado.ValueErr(msg="No rollouts have been found!")

    if isinstance(rollouts[0], list):
        if not check_all_types_equal(rollouts):
            raise pyrado.TypeErr(
                msg=
                "Some rollout savings contain lists of rollouts, others don't!"
            )
        # The rollout files contain lists of rollouts, flatten them
        rollouts = list(itertools.chain(*rollouts))

    return rollouts, names
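
A hedged usage sketch; the evaluation directory is a placeholder modeled on the paths used in the other examples:

rollouts, names = load_rollouts_from_dir(
    os.path.join(pyrado.EVAL_DIR, "qq-su_ectrl_250Hz"),  # placeholder directory
    key="rollout",
)
print(f"Loaded {len(rollouts)} rollout(s): {names}")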
Code example #12
    def __init__(
        self,
        rollouts_dir: str,
        embedding: Embedding,
        num_segments: int = None,
        len_segments: int = None,
        rand_init_rollout: bool = True,
    ):
        """
        Constructor

        :param rollouts_dir: directory where to find the pre-recorded rollouts
        :param num_segments: number of segments into which the rollouts are split. For every segment, the initial
                             state of the simulation is reset, and thus the features of the trajectories are computed
                             separately for every segment. Either specify `num_segments` or `len_segments`.
        :param embedding: embedding used for pre-processing the data before (later) passing it to the posterior
        :param len_segments: length of the segments into which the rollouts are split. For every segment, the initial
                             state of the simulation is reset, and thus the features of the trajectories are computed
                             separately for every segment. Either specify `num_segments` or `len_segments`.
        :param rand_init_rollout: if `True`, choose the first rollout at random, and then cycle through the list
        """
        if not os.path.isdir(rollouts_dir):
            raise pyrado.PathErr(given=rollouts_dir)

        Serializable._init(self, locals())

        super().__init__(None, None, embedding, num_segments, len_segments)

        # Crawl through the directory and load every file that starts with the word rollout
        rollouts_rec = []
        for root, dirs, files in os.walk(rollouts_dir):
            dirs.clear()  # prevents walk() from going into subdirectories
            rollouts_rec = [pyrado.load(name=f, load_dir=root) for f in files if f.startswith("rollout")]
            check_all_lengths_equal(rollouts_rec)
        if not rollouts_rec:
            raise pyrado.ValueErr(msg="No rollouts have been found!")

        self.rollouts_dir = rollouts_dir
        self.rollouts_rec = rollouts_rec
        self._ring_idx = np.random.randint(0, len(rollouts_rec)) if rand_init_rollout else 0
        self._set_action_field(self.rollouts_rec)
Code example #13
    def init_modules(self,
                     warmstart: bool,
                     suffix: str = '',
                     prefix: str = None,
                     **kwargs):
        """
        Initialize the algorithm's learnable modules, e.g. a policy or value function.
        Overwrite this method if the algorithm uses a learnable module aside from the policy, e.g. a value function.

        :param warmstart: if `True`, the algorithm starts learning with an initialization. This can either be a
                          fixed parameter vector, or the results of the previous iteration
        :param suffix: keyword for `meta_info` when loading from previous iteration
        :param prefix: keyword for `meta_info` when loading from previous iteration
        :param kwargs: keyword arguments for initialization, e.g. `policy_param_init` or `valuefcn_param_init`
        """
        if prefix is None:
            prefix = f'iter_{self._curr_iter - 1}'

        ppi = kwargs.get('policy_param_init', None)

        if warmstart and ppi is not None:
            self._policy.init_param(ppi)
            print_cbt('Learning given a fixed parameter initialization.', 'w')

        elif warmstart and ppi is None and self._curr_iter > 0:
            self._policy = pyrado.load(self._policy,
                                       'policy',
                                       'pt',
                                       self.save_dir,
                                       meta_info=dict(prefix=prefix,
                                                      suffix=suffix))
            print_cbt(
                f'Learning given the results from iteration {self._curr_iter - 1}',
                'w')

        else:
            # Reset the policy
            self._policy.init_param()
            print_cbt('Learning from scratch.', 'w')
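
A hedged sketch of the three branches described in the docstring above; `algo` stands for any Algorithm instance and `ppi` for a parameter tensor matching the policy's size:

algo.init_modules(warmstart=True, policy_param_init=ppi)  # warm-start from a given parameter vector
algo.init_modules(warmstart=True)                         # warm-start from the previous iteration's snapshot
algo.init_modules(warmstart=False)                        # reset the policy and learn from scratch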
Code example #14
from pyrado.plotting.distribution import draw_posterior_pairwise_scatter
from pyrado.utils.argparser import get_argparser

if __name__ == "__main__":
    # Parse command line arguments
    args = get_argparser().parse_args()
    plt.rc("text", usetex=args.use_tex)
    if not isinstance(args.num_samples, int) or args.num_samples < 1:
        raise pyrado.ValueErr(given=args.num_samples, ge_constraint="1")

    # NPDR
    ex_dir_npdr = os.path.join(pyrado.TEMP_DIR, "mg-ik", "npdr_time", "")
    algo = Algorithm.load_snapshot(ex_dir_npdr)
    if not isinstance(algo, NPDR):
        raise pyrado.TypeErr(given=algo, expected_type=NPDR)
    env_sim = inner_env(pyrado.load("env_sim.pkl", ex_dir_npdr))
    prior_npdr = pyrado.load("prior.pt", ex_dir_npdr)
    posterior_npdr = algo.load_posterior(ex_dir_npdr,
                                         idx_iter=0,
                                         idx_round=6,
                                         obj=None,
                                         verbose=True)  # CHOICE
    data_real_npdr = pyrado.load(f"data_real.pt",
                                 ex_dir_npdr,
                                 prefix="iter_0",
                                 verbose=True)  # CHOICE
    domain_params_npdr, log_probs = SBIBase.eval_posterior(
        posterior_npdr,
        data_real_npdr,
        args.num_samples,
        normalize_posterior=False,  # not necessary here
Code example #15
        args.policy_name = f"iter_{args.iter}_policy"
    if args.init:
        args.policy_name = "init_policy"
    env_sim, policy, extra = load_experiment(ex_dir, args)

    # Create the domain parameter mapping
    dp_mapping = dict()
    if extra is not None:
        dp_counter = 0
        for key in sorted(extra["hparams"]["dp_mapping"].keys()):
            dp = extra["hparams"]["dp_mapping"][key]
            if dp in extra["hparams"]["dp_selection"]:
                dp_mapping[dp_counter] = dp
                dp_counter += 1

    pyrado.load(f"{args.policy_name}.pt", ex_dir, obj=policy)

    # Reset the policy's domain parameter if desired
    prior, posterior = None, None
    if args.src_domain_param == "ml":
        ml_domain_param = pyrado.load("ml_domain_param.pkl",
                                      ex_dir,
                                      prefix=f"iter_{args.iter}")
        policy.reset(**dict(domain_param=ml_domain_param))
    elif args.src_domain_param == "posterior":
        prefix_str = "" if args.iter == -1 and args.round == -1 else f"iter_{args.iter}_round_{args.round}"
        posterior = pyrado.load("posterior.pt", ex_dir, prefix=prefix_str)
    elif args.src_domain_param == "prior":
        prior = pyrado.load("prior.pt", ex_dir)
    elif args.src_domain_param == "nominal":
        policy.reset(**dict(domain_param=env_sim.get_nominal_domain_param()))
Code example #16
File: sbi_base.py  Project: fdamken/SimuRLacra
    def train_policy_sim(self,
                         domain_params: to.Tensor,
                         prefix: str,
                         cnt_rep: int,
                         use_rec_init_states: bool = True) -> float:
        """
        Train a policy in simulation for given hyper-parameters from the domain randomizer.

        :param domain_params: domain parameters sampled from the posterior [shape N x D where N is the number of
                              samples and D is the number of domain parameters]
        :param prefix: set a prefix to the saved file name, use "" for no prefix
        :param cnt_rep: current repetition count, coming from the wrapper function
        :param use_rec_init_states: if `True`, the previous rollout will be loaded to extract the initial states, and
                                    sync them with the recorded ones
        :return: estimated return of the trained policy in the target domain
        """
        if not (domain_params.ndim == 2
                and domain_params.shape[1] == len(self.dp_mapping)):
            raise pyrado.ShapeErr(given=domain_params,
                                  expected_match=(-1, len(self.dp_mapping)))

        # Insert the domain parameters into the wrapped environment's buffer
        self.fill_domain_param_buffer(self._env_sim_trn, self.dp_mapping,
                                      domain_params)

        # Set the initial state spaces of the simulation environment to match the observed initial states
        if use_rec_init_states:
            rollouts_real = pyrado.load("rollouts_real.pkl",
                                        self._save_dir,
                                        prefix=prefix)
            init_states_real = np.stack(
                [ro.states[0, :] for ro in rollouts_real])
            if not init_states_real.shape == (
                    len(rollouts_real),
                    self._env_sim_trn.state_space.flat_dim):
                raise pyrado.ShapeErr(
                    given=init_states_real,
                    expected_match=(len(rollouts_real),
                                    self._env_sim_trn.state_space.flat_dim))
            inner_env(
                self._env_sim_trn).init_space = DiscreteSpace(init_states_real)
            print_cbt(
                "The simulation environment's initial states have been set to the recorded ones.",
                "w")

        # Reset the subroutine algorithm which includes resetting the exploration
        self._cnt_samples += self._subrtn_policy.sample_count
        self._subrtn_policy.reset()

        # Propagate the updated training environment to the SamplerPool's workers
        if hasattr(self._subrtn_policy, "sampler"):
            self._subrtn_policy.sampler.reinit(env=self._env_sim_trn)
        else:
            raise pyrado.KeyErr(keys="sampler", container=self._subrtn_policy)

        # Do a warm start, but randomly reset the policy parameters if training failed once
        self._subrtn_policy.init_modules(self.warmstart and cnt_rep == 0)

        # Train a policy in simulation using the subroutine
        self._subrtn_policy.train(
            snapshot_mode=self._subrtn_policy_snapshot_mode,
            meta_info=dict(prefix=prefix))

        # Return the estimated return of the trained policy in simulation
        assert len(self._env_sim_trn.buffer) == self.num_eval_samples
        self._env_sim_trn.ring_idx = 0  # don't reset the buffer to eval on the same domains as trained
        avg_ret_sim = self.eval_policy(None, self._env_sim_trn,
                                       self._subrtn_policy.policy, prefix,
                                       self.num_eval_samples)
        return float(avg_ret_sim)
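
A hedged sketch of how this method is driven from the outer loop (compare the `step()` method in code example #22): sample domain parameters from the current posterior, then train the behavioral policy under them. `algo`, `posterior`, and `data_real` are placeholders for objects of a running experiment:

domain_params, _ = SBIBase.eval_posterior(
    posterior, data_real, algo.num_eval_samples, normalize_posterior=False
)  # returns domain parameters and their log-probabilities
avg_ret_sim = algo.train_policy_sim(
    domain_params.squeeze(0), prefix=f"iter_{algo.curr_iter}", cnt_rep=0
)
print(f"Estimated return in simulation: {avg_ret_sim:.2f}")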
Code example #17
.. seealso::
    [1] https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html
    [2] https://pytorch.org/tutorials/advanced/cpp_export.html
    [3] https://pytorch.org/docs/stable/jit.html
"""

import pyrado
from pyrado.logger.experiment import ask_for_experiment
from pyrado.utils.argparser import get_argparser
from pyrado.utils.experiments import cpp_export, load_experiment


if __name__ == "__main__":
    # Parse command line arguments
    args = get_argparser().parse_args()

    # Get the experiment's directory to load from
    ex_dir = ask_for_experiment(hparam_list=args.show_hparams) if args.dir is None else args.dir

    # Load the policy (trained in simulation)
    try:
        # First try to load a "proper" experiment
        env, policy, _ = load_experiment(ex_dir)
    except (pyrado.PathErr, FileNotFoundError):
        # Try to load the policy and environment directly
        policy = pyrado.load("policy.pt", ex_dir, verbose=True)  # no state_dict loading
        env = pyrado.load("env.pkl", ex_dir, verbose=True)

    # Export the policy to C++ and the experiment's config
    cpp_export(ex_dir, policy, env)
Code example #18
File: sbi_base.py  Project: fdamken/SimuRLacra
    def collect_data_real(
        save_dir: Optional[pyrado.PathLike],
        env: Union[Env, str],
        policy: Policy,
        embedding: Embedding,
        num_rollouts: int,
        num_segments: int = None,
        len_segments: int = None,
        prefix: str = "",
    ) -> Tuple[to.Tensor, List[StepSequence]]:
        """
        Roll-out a (behavioral) policy on the target system for later use with the sbi module, and save the data
        computed from the recorded rollouts.
        This method is static to facilitate evaluation of specific policies in hindsight.

        :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
        :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance,
                    in case you want to use pre-recorded rollouts pass the path to the parent folder as string
        :param policy: policy to evaluate
        :param embedding: embedding used for pre-processing the data before passing it to the posterior
        :param num_rollouts: number of rollouts to collect on the target system
        :param num_segments: number of segments into which the rollouts are split. For every segment, the initial
                             state of the simulation is reset, and thus the features of the trajectories are computed
                             separately for every segment. Either specify `num_segments` or `len_segments`.
        :param len_segments: length of the segments into which the rollouts are split. For every segment, the initial
                             state of the simulation is reset, and thus the features of the trajectories are computed
                             separately for every segment. Either specify `num_segments` or `len_segments`.
        :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
        :return: data from the real-world rollouts a.k.a. set of $x_o$ of shape [num_iter, num_rollouts_per_iter,
                 time_series_length, dim_data], and the real-world rollouts
        """
        if not (isinstance(inner_env(env), RealEnv)
                or isinstance(inner_env(env), SimEnv) or isinstance(env, str)):
            raise pyrado.TypeErr(given=inner_env(env),
                                 expected_type=[RealEnv, SimEnv, str])

        # Evaluate sequentially (necessary for sim-to-real experiments)
        if isinstance(env, str):
            rollout_worker = RecRolloutSamplerForSBI(env,
                                                     embedding,
                                                     num_segments,
                                                     len_segments,
                                                     rand_init_rollout=False)
        else:
            rollout_worker = RealRolloutSamplerForSBI(env, policy, embedding,
                                                      num_segments,
                                                      len_segments)

        # Initialize data containers
        data_real = None
        rollouts_real = None
        num_found_rollouts = 0
        if save_dir is not None:
            try:
                data_real = pyrado.load("data_real.pt",
                                        save_dir,
                                        prefix=prefix)
                rollouts_real = pyrado.load("rollouts_real.pkl",
                                            save_dir,
                                            prefix=prefix)
                if not data_real.shape[0] == len(rollouts_real):
                    raise pyrado.ShapeErr(
                        msg=
                        f"Found {data_real.shape[0]} entries in data_real.pt, but {len(rollouts_real)} rollouts in "
                        f"rollouts_real.pkl!")
                num_found_rollouts = len(rollouts_real)
                print_cbt(
                    f"Found {num_found_rollouts} rollout(s) in {save_dir}.",
                    "w")
            except FileNotFoundError:
                pass  # in the first attempt no files can be found

        collect_str = f"Collecting data" if prefix == "" else f"Collecting data using {prefix}_policy"
        for _ in tqdm(
                range(num_found_rollouts, num_rollouts),
                total=num_rollouts,
                desc=Fore.CYAN + Style.BRIGHT + collect_str + Style.RESET_ALL,
                unit="rollouts",
                file=sys.stdout,
        ):
            # Do the rollout
            data, rollout = rollout_worker()

            # Fill data container
            if data_real is None or rollouts_real is None:
                data_real = data  # data is of shape [1, dim_feat]
                rollouts_real = [rollout]
            else:
                data_real = to.cat(
                    [data_real, data],
                    dim=1)  # stack to final shape [1, num_rollouts * dim_feat]
                rollouts_real.append(rollout)

            # Optionally save the data (do this at every iteration to continue)
            if save_dir is not None:
                pyrado.save(data_real, "data_real.pt", save_dir, prefix=prefix)
                pyrado.save(rollouts_real,
                            "rollouts_real.pkl",
                            save_dir,
                            prefix=prefix)

        if data_real.shape != (1, num_rollouts * embedding.dim_output):
            raise pyrado.ShapeErr(given=data_real,
                                  expected_match=(1, num_rollouts *
                                                  embedding.dim_output))

        return data_real, rollouts_real
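
A hedged sketch of calling this static method; the save directory, environment, policy, and embedding are placeholders for objects loaded from an existing experiment:

data_real, rollouts_real = SBIBase.collect_data_real(
    save_dir=ex_dir,   # where data_real.pt and rollouts_real.pkl are stored (placeholder)
    env=env_real,      # a RealEnv/SimEnv instance, or a path to pre-recorded rollouts
    policy=policy,
    embedding=embedding,
    num_rollouts=5,
    num_segments=1,
    prefix="iter_0",
)
print(f"Collected data of shape {tuple(data_real.shape)} from {len(rollouts_real)} rollout(s).")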
Code example #19
File: sbi_base.py  Project: fdamken/SimuRLacra
    def load_posterior(
        load_dir: pyrado.PathLike,
        idx_iter: int = -1,
        idx_round: int = -1,
        obj: Optional[Any] = None,
        verbose: bool = False,
    ) -> Optional[DirectPosterior]:
        """
        Load the posterior of a given iteration (and round).

        :param load_dir: experiment's directory to crawl through
        :param idx_iter: iteration to load, to load the latest pass -1
        :param idx_round: round to load, to load the latest pass -1, ignored if the experiment was not multi-round
        :param obj: object for state dict loading, forwarded to `pyrado.load()`, by default no state dict loading
        :param verbose: if `True`, print the path of what has been loaded, forwarded to `pyrado.load()`
        :return: loaded sbi posterior, or `None` if there is no posterior with the given iteration / round index
        """
        if not os.path.isdir(load_dir):
            raise pyrado.PathErr(given=load_dir)
        if not isinstance(idx_iter, int):
            raise pyrado.TypeErr(given=idx_iter, expected_type=int)
        if not isinstance(idx_round, int):
            raise pyrado.TypeErr(given=idx_round, expected_type=int)

        if idx_iter == -1:
            # Check what is the latest iteration
            cnt_iter_max = -1
            for root, dirs, files in os.walk(load_dir):
                dirs.clear()  # prevents walk() from going into subdirectories
                for f in files:
                    if f.startswith("iter_") and f.endswith("_posterior.pt"):
                        cnt_iter = int(f[f.find("iter_") + len("iter_")])
                        cnt_iter_max = cnt_iter if cnt_iter > cnt_iter_max else cnt_iter_max
            idx_iter = cnt_iter_max

        # Check if the experiment was run in a multi-round setting
        multi_round_setting = False
        for root, dirs, files in os.walk(load_dir):
            dirs.clear()  # prevents walk() from going into subdirectories
            for f in files:
                if f.startswith(f"iter_") and "round" in f:
                    multi_round_setting = True
                    break

        if multi_round_setting:
            if idx_round == -1:
                # Check what is the latest round
                cnt_round_max = -1
                for root, dirs, files in os.walk(load_dir):
                    dirs.clear(
                    )  # prevents walk() from going into subdirectories
                    for f in files:
                        if "round" in f and f.endswith("_posterior.pt"):
                            cnt_round = int(f[f.find("round_") +
                                              len("round_")])
                            cnt_round_max = cnt_round if cnt_round > cnt_round_max else cnt_round_max
                idx_round = cnt_round_max

        # Check before loading, and print a warning message if there can not be a posterior with the obtained indices
        if idx_iter == -1:
            print_cbt(
                f"Invalid iteration index {idx_iter}! Check if there is a posterior in {load_dir}.",
                "r")
        if idx_round == -1 and multi_round_setting:
            print_cbt(
                f"Invalid round index {idx_round}! Check if there is a posterior in {load_dir}.",
                "r")

        # Load the current posterior
        str_round = f"_round_{idx_round}" if multi_round_setting else ""
        try:
            posterior = pyrado.load(
                name=f"iter_{idx_iter}{str_round}_posterior.pt",
                load_dir=load_dir,
                obj=obj,
                verbose=verbose)
        except FileNotFoundError:
            print_cbt("No posterior was loaded.", "y")
            posterior = None

        return posterior
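
A hedged sketch of loading the most recent posterior from an experiment directory and sampling from it; the directory and the observation `data_real` are placeholders, and the `sample()` call assumes the usual sbi `DirectPosterior` interface:

posterior = SBIBase.load_posterior(ex_dir, idx_iter=-1, idx_round=-1, verbose=True)
if posterior is not None:
    domain_param_samples = posterior.sample((1000,), x=data_real)  # condition on the observed data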
Code example #20
    # Experiment
    ex_dir = setup_experiment(TSPred.name, LSTMPolicy.name)

    # Set seed if desired
    pyrado.set_seed(args.seed, verbose=True)

    # Load the data
    if data_set_name == "skyline":
        dt = 0.01
        _, vals = skyline(dt=dt,
                          t_end=20.0,
                          t_intvl_space=BoxSpace(0.5, 3, shape=(1, )),
                          val_space=BoxSpace(-2.0, 3.0, shape=(1, )))
        data = to.from_numpy(vals).to(dtype=to.get_default_dtype()).view(-1, 1)
    elif "qq-su" in data_set_name:
        data = pyrado.load("rollout_real_2021-04-14_18-34-53.pkl",
                           osp.join(pyrado.EVAL_DIR, "qq-su_ectrl_250Hz"))
        assert isinstance(data, StepSequence)
        assert hasattr(data, "states")
        states = to.from_numpy(data.states).to(dtype=to.get_default_dtype())
        actions = to.from_numpy(data.actions).to(dtype=to.get_default_dtype())
        data = to.cat([states[:-1], actions], dim=1)  # truncate final state
    else:
        data = pd.read_csv(
            osp.join(pyrado.PERMA_DIR, "misc", f"{data_set_name}.csv"))
        if data_set_name == "daily_min_temperatures":
            data = to.tensor(data["Temp"].values,
                             dtype=to.get_default_dtype()).view(-1, 1)
        elif data_set_name == "monthly_sunspots":
            data = to.tensor(data["Sunspots"].values,
                             dtype=to.get_default_dtype()).view(-1, 1)
        elif "oscillation" in data_set_name:
Code example #21
File: spota.py  Project: arlene-kuehn/SimuRLacra
    def _estimate_ucbog(self, nr: int):
        """
        Collect the returns with synchronized random seeds and estimate the pessimistic and optimistic bound.
        
        :param nr: number of domains used for training the reference solutions
        :return: upper confidence bound on the optimality gap (UCBOG)
        """
        # Init containers
        cand_rets = np.zeros((self.nG, nr))
        refs_rets = np.zeros((self.nG, nr))

        # Loop over all reference solutions
        for k in range(self.nG):
            print_cbt(
                f'Estimating the UCBOG | Reference {k + 1} of {self.nG} ...',
                'c')
            # Load the domain parameters corresponding to the k-th reference solution
            env_params_ref = joblib.load(
                osp.join(self.save_dir,
                         f'iter_{self._curr_iter}_env_params_ref_{k}.pkl'))
            self.env_dr.buffer = env_params_ref

            # Load the policies (makes a difference for snapshot_mode = best)
            self._subrtn_cand._policy = pyrado.load(
                self._subrtn_cand._policy, 'policy', 'pt', self.save_dir,
                dict(prefix=f'iter_{self._curr_iter}', suffix='cand'))
            self._subrtn_refs._policy = pyrado.load(
                self._subrtn_refs._policy, 'policy', 'pt', self.save_dir,
                dict(prefix=f'iter_{self._curr_iter}', suffix=f'ref_{k}'))

            # Loop over all domain realizations of the reference solutions
            for i in tqdm(range(nr),
                          total=nr,
                          desc=f'Reference {k + 1}',
                          unit='domains',
                          file=sys.stdout,
                          leave=False):
                # Evaluate solutions
                cand_rets[k, i], refs_rets[
                    k, i] = self._eval_cand_and_ref_one_domain(i)

                # Process negative optimality samples
                refs_rets = self._handle_neg_samples(cand_rets, refs_rets, k,
                                                     i)

        # --------------
        # Optimality Gap
        # --------------

        # This is similar to the difference of the means that is used to calculate the optimality gap in eq. (9) in [2]
        self.Gn_diffs = np.subtract(
            refs_rets,
            cand_rets)  # optimistic bound - pessimistic bound; dim = nG x nr
        Gn_samples = np.mean(self.Gn_diffs, axis=1)  # dim = 1 x nr
        Gn_est = np.mean(
            Gn_samples
        )  # sample mean of the original (non-bootstrapped) samples

        ratio_neg_diffs = 1 - np.count_nonzero(
            self.Gn_diffs
        ) / self.Gn_diffs.size  # assuming zeros come from clipping

        print_cbt(f'diffs (optimistic - pessimistic bound):\n{self.Gn_diffs}',
                  'y')
        print_cbt(
            f'\n{100*ratio_neg_diffs}% of the diffs would have been negative and were set to 0\n',
            'r',
            bright=True)

        if ratio_neg_diffs == 1:
            # All diffs are negative
            ci_bs = [
                0, float('inf')
            ]  # such that the UCBOG comparison in stopping_criterion_met() does not break
            log_dict = {
                'Gn_est': np.NaN,
                'UCBOG': np.NaN,
                'ratio_neg_diffs': np.NaN
            }
        else:
            # Apply bootstrapping
            m_bs, ci_bs = bootstrap_ci(np.ravel(self.Gn_diffs), np.mean,
                                       self.num_bs_reps, self.alpha, 1,
                                       self.studentized_ci)
            print(f'm_bs: {m_bs}, ci_bs: {ci_bs}')
            print_cbt(f'\nOG (point estimate): {Gn_est} \nUCBOG: {ci_bs[1]}\n',
                      'y',
                      bright=True)
            log_dict = {
                'Gn_est': Gn_est,
                'UCBOG': ci_bs[1],
                'ratio_neg_diffs': ratio_neg_diffs
            }

        # Log the optimality gap data
        mode = 'w' if self.curr_iter == 0 else 'a'
        with open(osp.join(self.save_dir, 'OG_log.csv'), mode,
                  newline='') as csvfile:
            fieldnames = list(log_dict.keys())
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            if self.curr_iter == 0:
                writer.writeheader()
            writer.writerow(log_dict)

        # Store the current UCBOG estimated from all samples
        self.ucbog = ci_bs[1]
Code example #22
    def step(self, snapshot_mode: str = "latest", meta_info: dict = None):
        # Save snapshot to save the correct iteration count
        self.save_snapshot()

        if self.curr_checkpoint == -1:
            if self._subrtn_policy is not None and self._train_initial_policy:
                # Add dummy values for variables that are logged later
                self.logger.add_value("avg log prob", -pyrado.inf)

                # Train the behavioral policy using the samples obtained from the prior.
                # Repeat the training if the resulting policy did not exceed the success threshold.
                domain_params = self._sbi_prior.sample(
                    sample_shape=(self.num_eval_samples, ))
                print_cbt(
                    "Training the initial policy using domain parameter sets sampled from prior.",
                    "c")
                wrapped_trn_fcn = until_thold_exceeded(
                    self.thold_succ_subrtn,
                    self.max_subrtn_rep)(self.train_policy_sim)
                wrapped_trn_fcn(
                    domain_params, prefix="init",
                    use_rec_init_states=False)  # overrides policy.pt

            self.reached_checkpoint()  # setting counter to 0

        if self.curr_checkpoint == 0:
            # Check if the rollout files already exist
            if (osp.isfile(
                    osp.join(self._save_dir,
                             f"iter_{self.curr_iter}_data_real.pt"))
                    and osp.isfile(osp.join(self._save_dir, "data_real.pt"))
                    and osp.isfile(
                        osp.join(self._save_dir, "rollouts_real.pkl"))):
                # Rollout files do exist (can be when continuing a previous experiment)
                self._curr_data_real = pyrado.load(
                    "data_real.pt",
                    self._save_dir,
                    prefix=f"iter_{self.curr_iter}")
                print_cbt(
                    f"Loaded existing rollout data for iteration {self.curr_iter}.",
                    "w")

            else:
                # If the policy depends on the domain-parameters, reset the policy with the
                # most likely dp-params from the previous round.
                pyrado.load(
                    "policy.pt",
                    self._save_dir,
                    prefix=f"iter_{self._curr_iter - 1}"
                    if self.curr_iter != 0 else "init",
                    obj=self._policy,
                )
                if self.curr_iter != 0:
                    ml_domain_param = pyrado.load(
                        "ml_domain_param.pkl",
                        self.save_dir,
                        prefix=f"iter_{self._curr_iter - 1}")
                    self._policy.reset(**dict(domain_param=ml_domain_param))

                # Rollout files do not exist yet (usual case)
                self._curr_data_real, _ = SBIBase.collect_data_real(
                    self.save_dir,
                    self._env_real,
                    self._policy,
                    self._embedding,
                    prefix=f"iter_{self._curr_iter}",
                    num_rollouts=self.num_real_rollouts,
                    num_segments=self.num_segments,
                    len_segments=self.len_segments,
                )

                # Save the target domain data
                if self._curr_iter == 0:
                    # Append the first set of data
                    pyrado.save(self._curr_data_real, "data_real.pt",
                                self._save_dir)
                else:
                    # Append and save all data
                    prev_data = pyrado.load("data_real.pt", self._save_dir)
                    data_real_hist = to.cat([prev_data, self._curr_data_real],
                                            dim=0)
                    pyrado.save(data_real_hist, "data_real.pt", self._save_dir)

            # Initialize sbi simulator and prior
            self._setup_sbi(
                prior=self._sbi_prior,
                rollouts_real=pyrado.load("rollouts_real.pkl",
                                          self._save_dir,
                                          prefix=f"iter_{self._curr_iter}"),
            )

            self.reached_checkpoint()  # setting counter to 1

        if self.curr_checkpoint == 1:
            # Instantiate the sbi subroutine to retrain from scratch each iteration
            if self.reset_sbi_routine_each_iter:
                self._initialize_subrtn_sbi(
                    subrtn_sbi_class=SNPE_A,
                    num_components=self._num_components)

            # Initialize the proposal with the prior
            proposal = self._sbi_prior

            # Multi-round sbi
            for idx_r in range(self.num_sbi_rounds):
                # Sample parameters proposal, and simulate these parameters to obtain the data
                domain_param, data_sim = simulate_for_sbi(
                    simulator=self._sbi_simulator,
                    proposal=proposal,
                    num_simulations=self.num_sim_per_round,
                    simulation_batch_size=self.simulation_batch_size,
                    num_workers=self.num_workers,
                )
                self._cnt_samples += self.num_sim_per_round * self._env_sim_sbi.max_steps

                # Append simulations and proposals for sbi
                self._subrtn_sbi.append_simulations(
                    domain_param,
                    data_sim,
                    proposal=
                    proposal,  # do not pass proposal arg for SNLE or SNRE
                )

                # Train the posterior
                density_estimator = self._subrtn_sbi.train(
                    final_round=idx_r == self.num_sbi_rounds - 1,
                    component_perturbation=self._component_perturbation,
                    **self.subrtn_sbi_training_hparam,
                )
                posterior = self._subrtn_sbi.build_posterior(
                    density_estimator=density_estimator,
                    **self.subrtn_sbi_sampling_hparam)

                # Save the posterior of this iteration before tailoring it to the data (when it is still amortized)
                if idx_r == 0:
                    pyrado.save(
                        posterior,
                        "posterior.pt",
                        self._save_dir,
                        prefix=f"iter_{self._curr_iter}",
                    )

                # Set proposal of the next round to focus on the next data set.
                # set_default_x() expects dim [1, num_rollouts * data_samples]
                proposal = posterior.set_default_x(self._curr_data_real)

                # Save the posterior tailored to each round
                pyrado.save(
                    posterior,
                    "posterior.pt",
                    self._save_dir,
                    prefix=f"iter_{self._curr_iter}_round_{idx_r}",
                )

                # Override the latest posterior
                pyrado.save(posterior, "posterior.pt", self._save_dir)

            self.reached_checkpoint()  # setting counter to 2

        if self.curr_checkpoint == 2:
            # Logging (the evaluation can be time-intensive)
            posterior = pyrado.load("posterior.pt", self._save_dir)
            self._curr_domain_param_eval, log_probs = SBIBase.eval_posterior(
                posterior,
                self._curr_data_real,
                self.num_eval_samples,
                calculate_log_probs=True,
                normalize_posterior=self.normalize_posterior,
                subrtn_sbi_sampling_hparam=self.subrtn_sbi_sampling_hparam,
            )
            self.logger.add_value("avg log prob", to.mean(log_probs), 4)
            self.logger.add_value("num total samples", self._cnt_samples)

            # Extract the most likely domain parameter set out of all target domain data sets
            current_domain_param = self._env_sim_sbi.domain_param
            idx_ml = to.argmax(log_probs).item()
            dp_vals = self._curr_domain_param_eval[idx_ml //
                                                   self.num_eval_samples,
                                                   idx_ml %
                                                   self.num_eval_samples, :]
            dp_vals = to.atleast_1d(dp_vals).numpy()
            ml_domain_param = dict(
                zip(self.dp_mapping.values(), dp_vals.tolist()))

            # Update the unchanged domain parameters with the most likely ones obtained from the posterior
            current_domain_param.update(ml_domain_param)
            pyrado.save(current_domain_param,
                        "ml_domain_param.pkl",
                        self.save_dir,
                        prefix=f"iter_{self._curr_iter}")

            self.reached_checkpoint()  # setting counter to 3

        if self.curr_checkpoint == 3:
            # Policy optimization
            if self._subrtn_policy is not None:
                pyrado.load(
                    "policy.pt",
                    self._save_dir,
                    prefix=f"iter_{self._curr_iter - 1}"
                    if self.curr_iter != 0 else "init",
                    obj=self._policy,
                )
                # Train the behavioral policy using the posterior samples obtained before.
                # Repeat the training if the resulting policy did not exceed the success threshold.
                print_cbt(
                    "Training the next policy using domain parameter sets sampled from the current posterior.",
                    "c")
                wrapped_trn_fcn = until_thold_exceeded(
                    self.thold_succ_subrtn,
                    self.max_subrtn_rep)(self.train_policy_sim)
                wrapped_trn_fcn(self._curr_domain_param_eval.squeeze(0),
                                prefix=f"iter_{self._curr_iter}",
                                use_rec_init_states=True)
            else:
                # save prefixed policy either way
                pyrado.save(self.policy,
                            "policy.pt",
                            self.save_dir,
                            prefix=f"iter_{self._curr_iter}",
                            use_state_dict=True)

            self.reached_checkpoint()  # setting counter to 0

        # Save snapshot data
        self.make_snapshot(snapshot_mode, None, meta_info)
Code example #23
File: simopt.py  Project: arlene-kuehn/SimuRLacra
    def step(self, snapshot_mode: str = 'latest', meta_info: dict = None):
        # Save snapshot to save the correct iteration count
        self.save_snapshot()

        if self.curr_checkpoint == 0:
            if self._curr_iter == 0:
                # First iteration, use the policy parameters (initialized from a prior)
                cand = self._subrtn_distr.policy.transform_to_ddp_space(
                    self._subrtn_distr.policy.param_values)
                self.cands = cand.unsqueeze(0)
            else:
                # Select the latest domain distribution parameter set
                assert isinstance(self.cands, to.Tensor)
                cand = self.cands[-1, :].clone()
            print_cbt(
                f'Current domain distribution parameters: {cand.detach().cpu().numpy()}',
                'g')

            # Train and evaluate the behavioral policy, repeat if the policy did not exceed the success threshold
            wrapped_trn_fcn = until_thold_exceeded(
                self.thold_succ_subrtn.item(),
                self.max_subrtn_rep)(self.train_policy_sim)
            wrapped_trn_fcn(cand, prefix=f'iter_{self._curr_iter}')

            # Save the latest behavioral policy
            self._subrtn_policy.save_snapshot()
            self.reached_checkpoint()  # setting counter to 1

        if self.curr_checkpoint == 1:
            # Evaluate the current policy in the target domain
            policy = pyrado.load(
                self.policy,
                'policy',
                'pt',
                self.save_dir,
                meta_info=dict(prefix=f'iter_{self._curr_iter}'))
            self.eval_behav_policy(self.save_dir, self._env_real, policy,
                                   f'iter_{self._curr_iter}',
                                   self.num_eval_rollouts, None)
            # if self._curr_iter == 0:
            #     # First iteration, also evaluate the random initialization
            #     self.cands_values = SimOpt.eval_ddp_policy(
            #         rollouts_real, self._env_sim, self.num_eval_rollouts, self._subrtn_distr, self._subrtn_policy
            #     )
            #     self.cands_values = to.tensor(self.cands_values).unsqueeze(0)
            self.reached_checkpoint()  # setting counter to 2

        if self.curr_checkpoint == 2:
            # Train and evaluate the policy that represents domain parameter distribution
            rollouts_real = pyrado.load(
                None,
                'rollouts_real',
                'pkl',
                self.save_dir,
                meta_info=dict(prefix=f'iter_{self._curr_iter}'))
            curr_cand_value = self.train_ddp_policy(
                rollouts_real, prefix=f'iter_{self._curr_iter}')
            if self._curr_iter == 0:
                self.cands_values = to.tensor(curr_cand_value).unsqueeze(0)
            else:
                self.cands_values = to.cat([
                    self.cands_values,
                    to.tensor(curr_cand_value).unsqueeze(0)
                ],
                                           dim=0)
            pyrado.save(self.cands_values, 'candidates_values', 'pt',
                        self.save_dir, meta_info)

            # The next candidate is the current search distribution and not the best policy parameter set (is saved)
            next_cand = self._subrtn_distr.policy.transform_to_ddp_space(
                self._subrtn_distr.policy.param_values)
            self.cands = to.cat([self.cands, next_cand.unsqueeze(0)], dim=0)
            pyrado.save(self.cands, 'candidates', 'pt', self.save_dir,
                        meta_info)

            # Save the latest domain distribution parameter policy
            self._subrtn_distr.save_snapshot(
                meta_info=dict(prefix='ddp', rollouts_real=rollouts_real))
            self.reached_checkpoint()  # setting counter to 0
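
The step() methods in this collection are driven by the curr_checkpoint / reached_checkpoint() counter, which lets an interrupted experiment resume at the exact sub-step where it stopped. The following is a hedged sketch of that idiom; Pyrado's InterruptableAlgorithm is the authoritative implementation and also persists the counter as part of its snapshots.

# Hedged sketch of the checkpoint counter driving the step() methods in this collection.
class CheckpointCounter:
    def __init__(self, num_checkpoints: int, init_checkpoint: int = 0):
        # Negative initial checkpoints (e.g. -2 in the BayRn example further below) model one-time
        # setup phases that are not repeated in later iterations.
        self._num_checkpoints = num_checkpoints
        self._curr_checkpoint = init_checkpoint

    @property
    def curr_checkpoint(self) -> int:
        return self._curr_checkpoint

    def reached_checkpoint(self) -> None:
        # Advance by one; after the last regular checkpoint, wrap around to 0 so that the next
        # algorithm iteration starts at the first regular phase, skipping the setup phases.
        self._curr_checkpoint += 1
        if self._curr_checkpoint >= self._num_checkpoints:
            self._curr_checkpoint = 0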
Code Example #24
if __name__ == "__main__":
    # Parse command line arguments
    parser = get_argparser()
    parser.set_defaults(
        animation=True)  # different default value for this script
    args = parser.parse_args()
    if not isinstance(args.num_samples, int) or args.num_samples < 1:
        raise pyrado.ValueErr(given=args.num_samples, ge_constraint="1")

    # Get the experiment's directory to load from
    ex_dir = ask_for_experiment(
        hparam_list=args.show_hparams) if args.dir is None else args.dir

    # Load the experiment
    env, policy, kwout = load_experiment(ex_dir, args)
    env_real = pyrado.load("env_real.pkl", ex_dir)
    data_real = kwout["data_real"]
    if args.iter == -1:
        # This script is not made to evaluate multiple iterations at once, thus we always select the data of a single iteration
        data_real = to.atleast_2d(data_real[args.iter])

    # Override the time step size if specified
    if args.dt is not None:
        env.dt = args.dt

    # Use the environments number of steps in case of the default argument (inf)
    max_steps = env.max_steps if args.max_steps == pyrado.inf else args.max_steps

    # Check which algorithm was used in the experiment
    algo = Algorithm.load_snapshot(load_dir=ex_dir, load_name="algo")
    if not isinstance(algo, (NPDR, BayesSim)):
        raise pyrado.TypeErr(given=algo, expected_type=(NPDR, BayesSim))
Code Example #25
    if isinstance(inner_env(env_real), SimEnv):
        # Use actual ground truth domain param if sim-2-sim setting
        domain_params = env_real.domain_param
    else:
        # Use nominal domain param if sim-2-real setting
        domain_params = inner_env(env_sim).get_nominal_domain_param()
    for dp_name, dp_val in domain_params.items():
        if dp_name in labels_sel_dims[0]:
            gt_val_x = dp_val
        try:
            if dp_name == labels_sel_dims[1]:
                gt_val_y = dp_val
        except IndexError:
            # Only one dimension was selected, so there is no ground-truth value for the y-axis
            gt_val_y = None

    cands = pyrado.load("candidates.pt", ex_dir)
    cands_values = pyrado.load("candidates_values.pt", ex_dir).unsqueeze(1)
    ddp_space = pyrado.load("ddp_space.pkl", ex_dir)

    dim_cand = cands.shape[1]  # number of domain distribution parameters
    if dim_cand % 2 != 0:
        raise pyrado.ShapeErr(msg="The dimension of domain distribution parameters must be a multiple of 2!")

    # Select dimensions to plot (ignored for 1D mode)
    if len(args.idcs) == 1:
        # Plot 1D
        x_label = labels_sel_dims[0]  # could override manually here
        y_label = r"$\hat{J}^{\textrm{real}}$"
        fig, ax = plt.subplots(1, figsize=(6, 4), constrained_layout=True)

    elif len(args.idcs) == 2:
Code Example #26
    plt.rc("text", usetex=args.use_tex)
    if not isinstance(args.num_samples, int) or args.num_samples < 1:
        raise pyrado.ValueErr(given=args.num_samples, ge_constraint="1")

    # Get the experiment's directory to load from
    ex_dir = ask_for_experiment(
        hparam_list=args.show_hparams) if args.dir is None else args.dir

    # Load the algorithm
    algo = Algorithm.load_snapshot(ex_dir)
    if not isinstance(algo, (NPDR, BayesSim)):
        raise pyrado.TypeErr(given=algo, expected_type=(NPDR, BayesSim))

    # Load the environments, the policy, and the posterior
    env_sim, policy, kwout = load_experiment(ex_dir, args)
    env_real = pyrado.load("env_real.pkl", ex_dir)
    prior = kwout["prior"]
    posterior = kwout["posterior"]
    data_real = kwout["data_real"]

    if args.mode.lower() == "evolution-round" and args.iter == -1:
        args.iter = algo.curr_iter
        print_cbt(
            "Set the evaluation iteration to the latest iteration of the algorithm.",
            "y")

    # Load the sequence of posteriors if desired
    if args.mode.lower() == "evolution-iter":
        posterior = [
            SBIBase.load_posterior(ex_dir, idx_iter=i, verbose=True)
            for i in range(algo.max_iter)
        ]
Code Example #27
def load_experiment(
        ex_dir: str,
        args: Any = None) -> (Union[SimEnv, EnvWrapper], Policy, dict):
    """
    Load the (training) environment and the policy.
    This helper function first tries to read the hyper-parameters yaml-file in the experiment's directory to infer
    which entities should be loaded. If no file was found, we fall back to a heuristic and hope for the best.

    :param ex_dir: experiment's parent directory
    :param args: arguments from the argument parser, pass `None` to fall back to the values from the default argparser
    :return: environment, policy, and optional output (e.g. valuefcn)
    """
    env, policy, extra = None, None, dict()

    if args is None:
        # Fall back to default arguments. By passing [], we ignore the command line arguments
        args = get_argparser().parse_args([])

    # Hyper-parameters
    hparams_file_name = 'hyperparams.yaml'
    try:
        hparams = load_dict_from_yaml(osp.join(ex_dir, hparams_file_name))
        extra['hparams'] = hparams
    except (pyrado.PathErr, FileNotFoundError, KeyError):
        print_cbt(
            f'Did not find {hparams_file_name} in {ex_dir} or could not crawl the loaded hyper-parameters.',
            'y',
            bright=True)

    # Algorithm specific
    algo = Algorithm.load_snapshot(load_dir=ex_dir, load_name='algo')
    if isinstance(algo, BayRn):
        # Environment
        env = pyrado.load(None, 'env_sim', 'pkl', ex_dir, None)
        print_cbt(f"Loaded {osp.join(ex_dir, 'env_sim.pkl')}.", 'g')
        if hasattr(env, 'randomizer'):
            last_cand = to.load(osp.join(ex_dir, 'candidates.pt'))[-1, :]
            env.adapt_randomizer(last_cand.numpy())
            print_cbt(f'Loaded the domain randomizer\n{env.randomizer}', 'w')
        else:
            print_cbt('Loaded environment has no randomizer.', 'r')
        # Policy
        policy = pyrado.load(algo.policy, f'{args.policy_name}', 'pt', ex_dir,
                             None)
        print_cbt(f"Loaded {osp.join(ex_dir, f'{args.policy_name}.pt')}", 'g')
        # Extra (value function)
        if isinstance(algo.subroutine, ActorCritic):
            extra['vfcn'] = pyrado.load(algo.subroutine.critic.vfcn,
                                        f'{args.vfcn_name}', 'pt', ex_dir,
                                        None)
            print_cbt(f"Loaded {osp.join(ex_dir, f'{args.vfcn_name}.pt')}",
                      'g')

    elif isinstance(algo, SPOTA):
        # Environment
        env = pyrado.load(None, 'env', 'pkl', ex_dir, None)
        print_cbt(f"Loaded {osp.join(ex_dir, 'env.pkl')}.", 'g')
        if hasattr(env, 'randomizer'):
            if not isinstance(env.randomizer, DomainRandWrapperBuffer):
                raise pyrado.TypeErr(given=env.randomizer,
                                     expected_type=DomainRandWrapperBuffer)
            typed_env(env, DomainRandWrapperBuffer).fill_buffer(100)
            print_cbt(
                f"Loaded {osp.join(ex_dir, 'env.pkl')} and filled it with 100 random instances.",
                'g')
        else:
            print_cbt('Loaded environment has no randomizer.', 'r')
        # Policy
        policy = pyrado.load(algo.subroutine_cand.policy,
                             f'{args.policy_name}', 'pt', ex_dir, None)
        print_cbt(f"Loaded {osp.join(ex_dir, f'{args.policy_name}.pt')}", 'g')
        # Extra (value function)
        if isinstance(algo.subroutine_cand, ActorCritic):
            extra['vfcn'] = pyrado.load(algo.subroutine_cand.critic.vfcn,
                                        f'{args.vfcn_name}', 'pt', ex_dir,
                                        None)
            print_cbt(f"Loaded {osp.join(ex_dir, f'{args.vfcn_name}.pt')}",
                      'g')

    elif isinstance(algo, SimOpt):
        # Environment
        env = pyrado.load(None, 'env_sim', 'pkl', ex_dir, None)
        print_cbt(f"Loaded {osp.join(ex_dir, 'env_sim.pkl')}.", 'g')
        if hasattr(env, 'randomizer'):
            last_cand = to.load(osp.join(ex_dir, 'candidates.pt'))[-1, :]
            env.adapt_randomizer(last_cand.numpy())
            print_cbt(f'Loaded the domain randomizer\n{env.randomizer}', 'w')
        else:
            print_cbt('Loaded environment has no randomizer.', 'r')
        # Policy
        policy = pyrado.load(algo.subroutine_policy.policy,
                             f'{args.policy_name}', 'pt', ex_dir, None)
        print_cbt(f"Loaded {osp.join(ex_dir, f'{args.policy_name}.pt')}", 'g')
        # Extra (domain parameter distribution policy)
        extra['ddp_policy'] = pyrado.load(algo.subroutine_distr.policy,
                                          'ddp_policy', 'pt', ex_dir, None)

    elif isinstance(algo, (EPOpt, UDR)):
        # Environment
        env = pyrado.load(None, 'env_sim', 'pkl', ex_dir, None)
        if hasattr(env, 'randomizer'):
            if not isinstance(env.randomizer, DomainRandWrapperLive):
                raise pyrado.TypeErr(given=env.randomizer,
                                     expected_type=DomainRandWrapperLive)
            print_cbt(
                f"Loaded {osp.join(ex_dir, 'env.pkl')} with DomainRandWrapperLive randomizer.",
                'g')
        else:
            print_cbt('Loaded environment has no randomizer.', 'y')
        # Policy
        policy = pyrado.load(algo.policy, f'{args.policy_name}', 'pt', ex_dir,
                             None)
        print_cbt(f"Loaded {osp.join(ex_dir, f'{args.policy_name}.pt')}", 'g')
        # Extra (value function)
        if isinstance(algo.subroutine, ActorCritic):
            extra['vfcn'] = pyrado.load(algo.subroutine.critic.vfcn,
                                        f'{args.vfcn_name}', 'pt', ex_dir,
                                        None)
            print_cbt(f"Loaded {osp.join(ex_dir, f'{args.vfcn_name}.pt')}",
                      'g')

    elif isinstance(algo, ActorCritic):
        # Environment
        env = pyrado.load(None, 'env', 'pkl', ex_dir, None)
        # Policy
        policy = pyrado.load(algo.policy, f'{args.policy_name}', 'pt', ex_dir,
                             None)
        print_cbt(f"Loaded {osp.join(ex_dir, f'{args.policy_name}.pt')}", 'g')
        # Extra (value function)
        extra['vfcn'] = pyrado.load(algo.critic.vfcn, f'{args.vfcn_name}',
                                    'pt', ex_dir, None)
        print_cbt(f"Loaded {osp.join(ex_dir, f'{args.vfcn_name}.pt')}", 'g')

    elif isinstance(algo, ParameterExploring):
        # Environment
        env = pyrado.load(None, 'env', 'pkl', ex_dir, None)
        # Policy
        policy = pyrado.load(algo.policy, f'{args.policy_name}', 'pt', ex_dir,
                             None)
        print_cbt(f"Loaded {osp.join(ex_dir, f'{args.policy_name}.pt')}", 'g')

    elif isinstance(algo, ValueBased):
        # Environment
        env = pyrado.load(None, 'env', 'pkl', ex_dir, None)
        # Policy
        policy = pyrado.load(algo.policy, f'{args.policy_name}', 'pt', ex_dir,
                             None)
        print_cbt(f"Loaded {osp.join(ex_dir, f'{args.policy_name}.pt')}", 'g')
        # Target value functions
        if isinstance(algo, DQL):
            extra['qfcn_target'] = pyrado.load(algo.qfcn_targ, 'qfcn_target',
                                               'pt', ex_dir, None)
            print_cbt(f"Loaded {osp.join(ex_dir, 'qfcn_target.pt')}", 'g')
        elif isinstance(algo, SAC):
            extra['qfcn_target1'] = pyrado.load(algo.qfcn_targ_1,
                                                'qfcn_target1', 'pt', ex_dir,
                                                None)
            extra['qfcn_target2'] = pyrado.load(algo.qfcn_targ_2,
                                                'qfcn_target2', 'pt', ex_dir,
                                                None)
            print_cbt(
                f"Loaded {osp.join(ex_dir, 'qfcn_target1.pt')} and {osp.join(ex_dir, 'qfcn_target2.pt')}",
                'g')
        else:
            raise NotImplementedError

    elif isinstance(algo, SVPG):
        # Environment
        env = pyrado.load(None, 'env', 'pkl', ex_dir, None)
        # Policy
        policy = pyrado.load(algo.policy, f'{args.policy_name}', 'pt', ex_dir,
                             None)
        print_cbt(f"Loaded {osp.join(ex_dir, f'{args.policy_name}.pt')}", 'g')
        # Extra (particles)
        for idx, p in enumerate(algo.particles):
            extra[f'particle{idx}'] = pyrado.load(algo.particles[idx],
                                                  f'particle_{idx}', 'pt',
                                                  ex_dir, None)

    elif isinstance(algo, TSPred):
        # Dataset
        extra['dataset'] = to.load(osp.join(ex_dir, 'dataset.pt'))
        # Policy
        policy = pyrado.load(algo.policy, f'{args.policy_name}', 'pt', ex_dir,
                             None)

    else:
        raise pyrado.TypeErr(msg='No matching algorithm name found while loading the experiment!')

    # Check if the return types are correct. They can be None, too.
    if env is not None and not isinstance(env, (SimEnv, EnvWrapper)):
        raise pyrado.TypeErr(given=env, expected_type=[SimEnv, EnvWrapper])
    if policy is not None and not isinstance(policy, Policy):
        raise pyrado.TypeErr(given=policy, expected_type=Policy)
    if extra is not None and not isinstance(extra, dict):
        raise pyrado.TypeErr(given=extra, expected_type=dict)

    return env, policy, extra
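
A typical way to call this helper from an evaluation script is sketched below. The argument handling mirrors the scripts above; the import locations are assumptions based on the Pyrado layout and may need adjusting.

# Hedged usage sketch for load_experiment(); import paths are assumptions.
from pyrado.logger.experiment import ask_for_experiment
from pyrado.utils.argparser import get_argparser

if __name__ == "__main__":
    args = get_argparser().parse_args()
    ex_dir = ask_for_experiment(hparam_list=args.show_hparams) if args.dir is None else args.dir

    env, policy, extra = load_experiment(ex_dir, args)
    print(f"Loaded environment {type(env).__name__} and policy {type(policy).__name__} from {ex_dir}")
    if "vfcn" in extra:
        print("The experiment also contained a value function.")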
Code Example #28
def load_experiment(
    ex_dir: str,
    args: Any = None
) -> Tuple[Optional[Union[SimEnv, EnvWrapper]], Optional[Policy],
           Optional[dict]]:
    """
    Load the (training) environment and the policy.
    This helper function first tries to read the hyper-parameters yaml-file in the experiment's directory to infer
    which entities should be loaded. If no file was found, we fall back to a heuristic and hope for the best.

    :param ex_dir: experiment's parent directory
    :param args: arguments from the argument parser, pass `None` to fall back to the values from the default argparser
    :return: environment, policy, and optional output (e.g. valuefcn)
    """
    env, policy, extra = None, None, dict()

    if args is None:
        # Fall back to default arguments. By passing [], we ignore the command line arguments
        args = get_argparser().parse_args([])

    # Hyper-parameters
    extra["hparams"] = load_hyperparameters(ex_dir)

    # Algorithm specific
    algo = Algorithm.load_snapshot(load_dir=ex_dir, load_name="algo")

    if algo.name == "spota":
        # Environment
        env = pyrado.load("env.pkl", ex_dir)
        if getattr(env, "randomizer", None) is not None:
            if not isinstance(env, DomainRandWrapperBuffer):
                raise pyrado.TypeErr(given=env,
                                     expected_type=DomainRandWrapperBuffer)
            typed_env(env, DomainRandWrapperBuffer).fill_buffer(10)
            print_cbt(
                f"Loaded the domain randomizer\n{env.randomizer}\nand filled it with 10 random instances.",
                "w")
        else:
            print_cbt("Loaded environment has no randomizer, or it is None.",
                      "r")
        # Policy
        policy = pyrado.load(algo.subroutine_cand.policy,
                             f"{args.policy_name}.pt",
                             ex_dir,
                             verbose=True)
        # Extra (value function)
        if isinstance(algo.subroutine_cand, ActorCritic):
            extra["vfcn"] = pyrado.load(algo.subroutine_cand.critic.vfcn,
                                        f"{args.vfcn_name}.pt",
                                        ex_dir,
                                        verbose=True)

    elif algo.name == "bayrn":
        # Environment
        env = pyrado.load("env_sim.pkl", ex_dir)
        if hasattr(env, "randomizer"):
            last_cand = to.load(osp.join(ex_dir, "candidates.pt"))[-1, :]
            env.adapt_randomizer(last_cand.numpy())
            print_cbt(f"Loaded the domain randomizer\n{env.randomizer}", "w")
        else:
            print_cbt("Loaded environment has no randomizer, or it is None.",
                      "r")
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt",
                             ex_dir,
                             obj=algo.policy,
                             verbose=True)
        # Extra (value function)
        if isinstance(algo.subroutine, ActorCritic):
            extra["vfcn"] = pyrado.load(f"{args.vfcn_name}.pt",
                                        ex_dir,
                                        obj=algo.subroutine.critic.vfcn,
                                        verbose=True)

    elif algo.name == "simopt":
        # Environment
        env = pyrado.load("env_sim.pkl", ex_dir)
        if getattr(env, "randomizer", None) is not None:
            last_cand = to.load(osp.join(ex_dir, "candidates.pt"))[-1, :]
            env.adapt_randomizer(last_cand.numpy())
            print_cbt(f"Loaded the domain randomizer\n{env.randomizer}", "w")
        else:
            print_cbt("Loaded environment has no randomizer, or it is None.",
                      "r")
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt",
                             ex_dir,
                             obj=algo.subroutine_policy.policy,
                             verbose=True)
        # Extra (domain parameter distribution policy)
        extra["ddp_policy"] = pyrado.load("ddp_policy.pt",
                                          ex_dir,
                                          obj=algo.subroutine_distr.policy,
                                          verbose=True)

    elif algo.name in ["epopt", "udr"]:
        # Environment
        env = pyrado.load("env_sim.pkl", ex_dir)
        if getattr(env, "randomizer", None) is not None:
            if not isinstance(env, DomainRandWrapperLive):
                raise pyrado.TypeErr(given=env,
                                     expected_type=DomainRandWrapperLive)
            print_cbt(f"Loaded the domain randomizer\n{env.randomizer}", "w")
        else:
            print_cbt("Loaded environment has no randomizer, or it is None.",
                      "y")
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt",
                             ex_dir,
                             obj=algo.policy,
                             verbose=True)
        # Extra (value function)
        if isinstance(algo.subroutine, ActorCritic):
            extra["vfcn"] = pyrado.load(f"{args.vfcn_name}.pt",
                                        ex_dir,
                                        obj=algo.subroutine.critic.vfcn,
                                        verbose=True)

    elif algo.name in ["bayessim", "npdr"]:
        # Environment
        env = pyrado.load("env_sim.pkl", ex_dir)
        if getattr(env, "randomizer", None) is not None:
            if not isinstance(env, DomainRandWrapperBuffer):
                raise pyrado.TypeErr(given=env,
                                     expected_type=DomainRandWrapperBuffer)
            typed_env(env, DomainRandWrapperBuffer).fill_buffer(10)
            print_cbt(
                f"Loaded the domain randomizer\n{env.randomizer}\nand filled it with 10 random instances.",
                "w")
        else:
            print_cbt("Loaded environment has no randomizer, or it is None.",
                      "y")
            env = remove_all_dr_wrappers(env, verbose=True)
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt",
                             ex_dir,
                             obj=algo.policy,
                             verbose=True)
        # Extra (prior, posterior, data)
        extra["prior"] = pyrado.load("prior.pt", ex_dir, verbose=True)
        # By default load the latest posterior (latest iteration and the last round)
        try:
            extra["posterior"] = algo.load_posterior(ex_dir,
                                                     args.iter,
                                                     args.round,
                                                     obj=None,
                                                     verbose=True)
            # Load the complete data or the data of the given iteration
            prefix = "" if args.iter == -1 else f"iter_{args.iter}"
            extra["data_real"] = pyrado.load(f"data_real.pt",
                                             ex_dir,
                                             prefix=prefix,
                                             verbose=True)
        except FileNotFoundError:
            pass

    elif algo.name in ["a2c", "ppo", "ppo2"]:
        # Environment
        env = pyrado.load("env.pkl", ex_dir)
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt",
                             ex_dir,
                             obj=algo.policy,
                             verbose=True)
        # Extra (value function)
        extra["vfcn"] = pyrado.load(f"{args.vfcn_name}.pt",
                                    ex_dir,
                                    obj=algo.critic.vfcn,
                                    verbose=True)

    elif algo.name in ["hc", "pepg", "power", "cem", "reps", "nes"]:
        # Environment
        env = pyrado.load("env.pkl", ex_dir)
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt",
                             ex_dir,
                             obj=algo.policy,
                             verbose=True)

    elif algo.name in ["dql", "sac"]:
        # Environment
        env = pyrado.load("env.pkl", ex_dir)
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt",
                             ex_dir,
                             obj=algo.policy,
                             verbose=True)
        # Target value functions
        if algo.name == "dql":
            extra["qfcn_target"] = pyrado.load("qfcn_target.pt",
                                               ex_dir,
                                               obj=algo.qfcn_targ,
                                               verbose=True)
        elif algo.name == "sac":
            extra["qfcn_target1"] = pyrado.load("qfcn_target1.pt",
                                                ex_dir,
                                                obj=algo.qfcn_targ_1,
                                                verbose=True)
            extra["qfcn_target2"] = pyrado.load("qfcn_target2.pt",
                                                ex_dir,
                                                obj=algo.qfcn_targ_2,
                                                verbose=True)
        else:
            raise NotImplementedError

    elif algo.name == "svpg":
        # Environment
        env = pyrado.load("env.pkl", ex_dir)
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt",
                             ex_dir,
                             obj=algo.policy,
                             verbose=True)
        # Extra (particles)
        for idx, p in enumerate(algo.particles):
            extra[f"particle{idx}"] = pyrado.load(f"particle_{idx}.pt",
                                                  ex_dir,
                                                  obj=algo.particles[idx],
                                                  verbose=True)

    elif algo.name == "tspred":
        # Dataset
        extra["dataset"] = to.load(osp.join(ex_dir, "dataset.pt"))
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt",
                             ex_dir,
                             obj=algo.policy,
                             verbose=True)

    elif algo.name == "sprl":
        # Environment
        env = pyrado.load("env.pkl", ex_dir)
        print_cbt(f"Loaded {osp.join(ex_dir, 'env.pkl')}.", "g")
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt", ex_dir, obj=algo.policy)
        print_cbt(f"Loaded {osp.join(ex_dir, f'{args.policy_name}.pt')}", "g")
        # Extra (value function)
        if isinstance(algo._subroutine, ActorCritic):
            extra["vfcn"] = pyrado.load(f"{args.vfcn_name}.pt",
                                        ex_dir,
                                        obj=algo._subroutine.critic.vfcn,
                                        verbose=True)

    elif algo.name == "pddr":
        # Environment
        env = pyrado.load("env.pkl", ex_dir)
        # Policy
        policy = pyrado.load(f"{args.policy_name}.pt",
                             ex_dir,
                             obj=algo.policy,
                             verbose=True)
        # Teachers
        extra["teacher_policies"] = algo.teacher_policies
        extra["teacher_envs"] = algo.teacher_envs
        extra["teacher_expl_strats"] = algo.teacher_expl_strats
        extra["teacher_critics"] = algo.teacher_critics
        extra["teacher_ex_dirs"] = algo.teacher_ex_dirs

    else:
        raise pyrado.TypeErr(msg="No matching algorithm name found while loading the experiment!")

    # Check if the return types are correct. They can be None, too.
    if env is not None and not isinstance(env, (SimEnv, EnvWrapper)):
        raise pyrado.TypeErr(given=env, expected_type=[SimEnv, EnvWrapper])
    if policy is not None and not isinstance(policy, Policy):
        raise pyrado.TypeErr(given=policy, expected_type=Policy)
    if extra is not None and not isinstance(extra, dict):
        raise pyrado.TypeErr(given=extra, expected_type=dict)

    return env, policy, extra
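
Compared to the older version earlier in this collection, the hyper-parameter handling is factored out into load_hyperparameters(). The sketch below reconstructs such a helper from the try/except block of that older snippet; the function name appears in the code above, but its body and the import locations are assumptions, not Pyrado's verbatim implementation.

# Hedged sketch of a load_hyperparameters() helper; body and imports are assumptions.
import os.path as osp

import pyrado
from pyrado.logger.experiment import load_dict_from_yaml
from pyrado.utils.input_output import print_cbt


def load_hyperparameters(ex_dir: str, file_name: str = "hyperparams.yaml") -> dict:
    """Try to read the experiment's hyper-parameter file, returning an empty dict on failure."""
    try:
        return load_dict_from_yaml(osp.join(ex_dir, file_name))
    except (pyrado.PathErr, FileNotFoundError, KeyError):
        print_cbt(f"Did not find {file_name} in {ex_dir}.", "y", bright=True)
        return dict()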
Code Example #29
    def step(self, snapshot_mode: str = 'latest', meta_info: dict = None):
        # Save snapshot to save the correct iteration count
        self.save_snapshot()

        if self.curr_checkpoint == -2:
            # Train the initial policies in the source domain
            self.train_init_policies()
            self.reached_checkpoint()  # setting counter to -1

        if self.curr_checkpoint == -1:
            # Evaluate the initial policies in the target domain
            self.eval_init_policies()
            self.reached_checkpoint()  # setting counter to 0

        if self.curr_checkpoint == 0:
            # Normalize the input data and standardize the output data
            cands_norm = self.ddp_projector.project_to(self.cands)
            cands_values_stdized = standardize(self.cands_values).unsqueeze(1)

            # Create and fit the GP model
            gp = SingleTaskGP(cands_norm, cands_values_stdized)
            gp.likelihood.noise_covar.register_constraint('raw_noise', GreaterThan(1e-5))
            mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
            fit_gpytorch_model(mll)
            print_cbt('Fitted the GP.', 'g')

            # Acquisition functions
            if self.acq_fcn_type == 'UCB':
                acq_fcn = UpperConfidenceBound(gp, beta=self.acq_param.get('beta', 0.1), maximize=True)
            elif self.acq_fcn_type == 'EI':
                acq_fcn = ExpectedImprovement(gp, best_f=cands_values_stdized.max().item(), maximize=True)
            elif self.acq_fcn_type == 'PI':
                acq_fcn = ProbabilityOfImprovement(gp, best_f=cands_values_stdized.max().item(), maximize=True)
            else:
                raise pyrado.ValueErr(given=self.acq_fcn_type, eq_constraint="'UCB', 'EI', 'PI'")

            # Optimize acquisition function and get new candidate point
            cand_norm, acq_value = optimize_acqf(
                acq_function=acq_fcn,
                bounds=to.stack([to.zeros(self.ddp_space.flat_dim), to.ones(self.ddp_space.flat_dim)]),
                q=1,
                num_restarts=self.acq_restarts,
                raw_samples=self.acq_samples
            )
            next_cand = self.ddp_projector.project_back(cand_norm)
            print_cbt(f'Found the next candidate: {next_cand.numpy()}', 'g')
            self.cands = to.cat([self.cands, next_cand], dim=0)
            pyrado.save(self.cands, 'candidates', 'pt', self.save_dir, meta_info)
            self.reached_checkpoint()  # setting counter to 1

        if self.curr_checkpoint == 1:
            # Train and evaluate a new policy, repeat if the resulting policy did not exceed the success threshold
            wrapped_trn_fcn = until_thold_exceeded(
                self.thold_succ_subrtn.item(), self.max_subrtn_rep
            )(self.train_policy_sim)
            wrapped_trn_fcn(self.cands[-1, :], prefix=f'iter_{self._curr_iter}')
            self.reached_checkpoint()  # setting counter to 2

        if self.curr_checkpoint == 2:
            # Evaluate the current policy in the target domain
            policy = pyrado.load(self.policy, 'policy', 'pt', self.save_dir,
                                        meta_info=dict(prefix=f'iter_{self._curr_iter}'))
            self.curr_cand_value = self.eval_policy(
                self.save_dir, self._env_real, policy, self.mc_estimator, f'iter_{self._curr_iter}',
                self.num_eval_rollouts_real
            )
            self.cands_values = to.cat([self.cands_values, self.curr_cand_value.view(1)], dim=0)
            pyrado.save(self.cands_values, 'candidates_values', 'pt', self.save_dir, meta_info)

            # Store the argmax after training and evaluating
            curr_argmax_cand = BayRn.argmax_posterior_mean(
                self.cands, self.cands_values.unsqueeze(1), self.ddp_space, self.acq_restarts, self.acq_samples
            )
            self.argmax_cand = to.cat([self.argmax_cand, curr_argmax_cand], dim=0)
            pyrado.save(self.argmax_cand, 'candidates_argmax', 'pt', self.save_dir, meta_info)
            self.reached_checkpoint()  # setting counter to 0
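
The checkpoint-0 branch above follows a standard BoTorch pattern: fit a SingleTaskGP to the normalized candidates, pick an acquisition function, and maximize it over the unit cube. The self-contained toy example below reproduces that pattern with random data; all data, shapes, and hyper-parameters are illustrative only.

# Hedged, self-contained toy example of the BoTorch pattern used in the checkpoint-0 branch above.
import torch as to
from botorch.acquisition import UpperConfidenceBound
from botorch.fit import fit_gpytorch_model
from botorch.models import SingleTaskGP
from botorch.optim import optimize_acqf
from gpytorch.constraints import GreaterThan
from gpytorch.mlls import ExactMarginalLogLikelihood

# Fake candidates (8 points in [0, 1]^2) and a noisy toy objective
train_x = to.rand(8, 2, dtype=to.double)
train_y = train_x.sum(dim=1, keepdim=True) + 0.05 * to.randn(8, 1, dtype=to.double)

# Fit the GP, constraining the observation noise as in the snippet above
gp = SingleTaskGP(train_x, train_y)
gp.likelihood.noise_covar.register_constraint("raw_noise", GreaterThan(1e-5))
mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
fit_gpytorch_model(mll)

# Maximize the UCB acquisition function over the unit cube to get the next candidate
acq_fcn = UpperConfidenceBound(gp, beta=0.1, maximize=True)
cand, acq_value = optimize_acqf(
    acq_function=acq_fcn,
    bounds=to.stack([to.zeros(2, dtype=to.double), to.ones(2, dtype=to.double)]),
    q=1,
    num_restarts=5,
    raw_samples=64,
)
print(f"Next candidate: {cand.numpy()}, acquisition value: {acq_value.item():.4f}")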
Code Example #30
    if isinstance(inner_env(env_real), SimEnv):
        # Use actual ground truth domain param if sim-2-sim setting
        domain_params = env_real.domain_param
    else:
        # Use nominal domain param if sim-2-real setting
        domain_params = inner_env(env_sim).get_nominal_domain_param()
    for dp_name, dp_val in domain_params.items():
        if dp_name in labels_sel_dims[0]:
            gt_val_x = dp_val
        try:
            if dp_name == labels_sel_dims[1]:
                gt_val_y = dp_val
        except IndexError:
            # Only one dimension was selected, so there is no ground-truth value for the y-axis
            gt_val_y = None

    cands = pyrado.load(None, 'candidates', 'pt', ex_dir)
    cands_values = pyrado.load(None, 'candidates_values', 'pt',
                               ex_dir).unsqueeze(1)
    ddp_space = pyrado.load(None, 'ddp_space', 'pkl', ex_dir)

    dim_cand = cands.shape[1]  # number of domain distribution parameters
    if dim_cand % 2 != 0:
        raise pyrado.ShapeErr(msg='The dimension of domain distribution parameters must be a multiple of 2!')

    # Select dimensions to plot (ignored for 1D mode)
    if len(args.idcs) == 1:
        # Plot 1D
        x_label = labels_sel_dims[0]  # could override manually here