def eval_init_policies(self):
    """
    Execute the trained initial policies on the target device and store the estimated return per candidate.
    The number of initial policies to evaluate is the number of found policies.

    Only the top level of `self.save_dir` is searched for files named `init_*_policy.pt` and
    `init_*_candidate.pt`. The results are stored in `self.cands` and `self.cands_values`, the latter is also
    saved to `self.save_dir`.

    :raises pyrado.ValueErr: if the number of found policies and candidates differs, or if none were found
                             (which includes the case that `self.save_dir` could not be listed at all)
    """
    # Only inspect the top level of the experiment's directory. `os.walk` silently yields nothing if the directory
    # can not be listed, thus fall back to an empty listing such that the checks below raise a meaningful error.
    _, _, files = next(os.walk(self.save_dir), (self.save_dir, [], []))
    found_policies = [p for p in files if p.startswith('init_') and p.endswith('_policy.pt')]
    found_cands = [c for c in files if c.startswith('init_') and c.endswith('_candidate.pt')]

    if len(found_policies) != len(found_cands):
        raise pyrado.ValueErr(msg='Found a different number of initial policies than candidates!')
    elif len(found_policies) == 0:
        raise pyrado.ValueErr(msg='No policies or candidates found!')

    num_init_cand = len(found_cands)
    cands_values = to.empty(num_init_cand)

    # Load all found candidates to save them into a single tensor
    found_cands = natural_sort(found_cands)  # the order is important since it determines the rows of the tensor
    cands = to.stack([to.load(osp.join(self.save_dir, c)) for c in found_cands])

    # Evaluate learned policies from random candidates on the target environment (real-world) system
    for i in range(num_init_cand):
        policy = pyrado.load(self.policy, 'policy', 'pt', self.save_dir, meta_info=dict(prefix=f'init_{i}'))
        cands_values[i] = self.eval_policy(
            self.save_dir, self._env_real, policy, self.mc_estimator, prefix=f'init_{i}',
            num_rollouts=self.num_eval_rollouts_real
        )

    # Save the candidates' returns into a tensor. The stacked candidates are only kept in memory here, and the
    # policies are saved during training or exist already.
    pyrado.save(cands_values, 'candidates_values', 'pt', self.save_dir, meta_info=None)
    self.cands, self.cands_values = cands, cands_values
def train_init_policies(self):
    """
    Draw `self.num_init_cand` random domain distribution parameter sets (a.k.a. candidates) and train one policy
    per candidate. The stacked candidates are saved to `self.save_dir` and stored in `self.cands`.
    """
    num_cands = self.num_init_cand
    cands = to.empty(num_cands, self.ddp_space.shape[0])

    for idx in range(num_cands):
        print_cbt(f"Generating initial domain instance and policy {idx + 1} of {num_cands} ...", "g", bright=True)

        # Draw the domain distribution parameters uniformly from the given space
        sample = self.ddp_space.sample_uniform()
        cands[idx] = to.from_numpy(sample)
        print_cbt(f"Randomly sampled the next candidate: {cands[idx].numpy()}", "g")

        # Wrap the training function such that it is repeated if the success threshold was not exceeded
        repeat_until_success = until_thold_exceeded(self.thold_succ_subrtn.item(), self.max_subrtn_rep)
        train_fcn = repeat_until_success(self.train_policy_sim)
        train_fcn(cands[idx], prefix=f"init_{idx}")

    # The policies are saved during training (or exist already), thus only the candidates are stored here
    pyrado.save(cands, "candidates.pt", self.save_dir)
    self.cands = cands
def save_snapshot(self, meta_info: dict = None):
    """
    Save the policy, and additionally the real environment if this algorithm is not run as a subroutine.

    :param meta_info: `None` if this algorithm is not a subroutine of another algorithm, forwarded to the parent
    """
    super().save_snapshot(meta_info)

    pyrado.save(self.policy, "policy.pt", self.save_dir)

    if meta_info is not None:
        # Run as a subroutine of another algorithm, the environment is not saved in this case
        return
    pyrado.save(self.env_real, "env.pkl", self.save_dir)
def save_snapshot(self, meta_info: dict = None):
    """
    Save the exploration strategy's policy, and additionally the environment if this algorithm is not run as a
    subroutine.

    :param meta_info: `None` if this algorithm is not a subroutine of another algorithm, forwarded to the parent
                      and to `pyrado.save`
    """
    super().save_snapshot(meta_info)

    behavior_policy = self._expl_strat.policy
    pyrado.save(behavior_policy, 'policy', 'pt', self.save_dir, meta_info)

    if meta_info is not None:
        # Run as a subroutine of another algorithm, the environment is not saved in this case
        return
    pyrado.save(self._env, 'env', 'pkl', self.save_dir, meta_info)
def eval_policy(
    save_dir: Optional[pyrado.PathLike],
    env: Env,
    policy: Policy,
    prefix: str,
    num_rollouts: int,
    num_workers: int = 1,
) -> to.Tensor:
    """
    Evaluate a policy either in the source or in the target domain.
    This method is static to facilitate evaluation of specific policies in hindsight.

    :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
    :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance
    :param policy: policy to evaluate
    :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
    :param num_rollouts: number of rollouts to collect on the target system
    :param num_workers: number of environments for the parallel sampler (only used for SimEnv)
    :return: estimated return in the target domain, i.e. the mean of the undiscounted returns
    :raises pyrado.TypeErr: if the inner environment is neither a `RealEnv` nor a `SimEnv`
    """
    if save_dir is not None:
        print_cbt(f"Executing {prefix}_policy ...", "c", bright=True)

    # Unwrap once, the type of the inner environment decides how the rollouts are collected
    unwrapped_env = inner_env(env)

    if isinstance(unwrapped_env, RealEnv):
        # Evaluate sequentially when evaluating on a real-world device
        rets_real = [rollout(env, policy, eval=True).undiscounted_return() for _ in range(num_rollouts)]

    elif isinstance(unwrapped_env, SimEnv):
        # Create a parallel sampler when evaluating in a simulation
        sampler = ParallelRolloutSampler(env, policy, num_workers=num_workers, min_rollouts=num_rollouts)
        ros = sampler.sample(eval=True)
        rets_real = [ro.undiscounted_return() for ro in ros]

    else:
        raise pyrado.TypeErr(given=unwrapped_env, expected_type=[RealEnv, SimEnv])

    rets_real = to.as_tensor(rets_real, dtype=to.get_default_dtype())

    if save_dir is not None:
        # Save and print the evaluation results
        pyrado.save(rets_real, "returns_real.pt", save_dir, prefix=prefix)
        print_cbt("Target domain performance", bright=True)
        # Convert every statistic to a Python number such that the table is formatted uniformly
        print(
            tabulate(
                [
                    ["mean return", to.mean(rets_real).item()],
                    ["std return", to.std(rets_real).item()],
                    ["min return", to.min(rets_real).item()],
                    ["max return", to.max(rets_real).item()],
                ]
            )
        )

    return to.mean(rets_real)
def save_snapshot(self, meta_info: dict = None):
    """
    Save every particle to its own file, and additionally the environment if this algorithm is not run as a
    subroutine.

    :param meta_info: `None` if this algorithm is not a subroutine of another algorithm, forwarded to the parent
                      and to `pyrado.save`
    """
    super().save_snapshot(meta_info)

    # One file per particle, named after the particle's position in the list
    for num, particle in enumerate(self.particles):
        file_name = f'particle_{num}'
        pyrado.save(particle, file_name, 'pt', self.save_dir, meta_info)

    if meta_info is not None:
        # Run as a subroutine of another algorithm, the environment is not saved in this case
        return
    pyrado.save(self._env, 'env', 'pkl', self.save_dir, meta_info)
def save_snapshot(self, meta_info: dict = None):
    """
    Save the environment and trigger the snapshot of the SVPG instance held in `self.svpg`.

    :param meta_info: must be `None`, since this algorithm can not be run as a subroutine
    :raises pyrado.ValueErr: if `meta_info` is not `None`
    """
    super().save_snapshot(meta_info)

    if meta_info is not None:
        # Only reached if another algorithm tries to use this one as a subroutine
        raise pyrado.ValueErr(msg=f"{self.name} is not supposed be run as a subrtn!")

    pyrado.save(self.env, "env.pkl", self.save_dir)
    self.svpg.save_snapshot(meta_info)
def save_snapshot(self, meta_info: dict = None):
    """
    Save the simulation environment. The subroutines save their own snapshots during their training.

    :param meta_info: must be `None`, since this algorithm can not be run as a subroutine
    :raises pyrado.ValueErr: if `meta_info` is not `None`
    """
    super().save_snapshot(meta_info)

    if meta_info is not None:
        # Only reached if another algorithm tries to use this one as a subroutine
        raise pyrado.ValueErr(msg=f"{self.name} is not supposed be run as a subrtn!")

    pyrado.save(self._env_sim, "env_sim.pkl", self._save_dir)
def save_snapshot(self, meta_info: dict = None):
    """
    Save a copy of the policy carrying the best parameters found so far, and additionally the environment if
    this algorithm is not run as a subroutine.

    :param meta_info: `None` if this algorithm is not a subroutine of another algorithm, forwarded to the parent
                      and to `pyrado.save`
    """
    super().save_snapshot(meta_info)

    # Work on a copy such that the parameters of the policy in use stay untouched
    snapshot_policy = deepcopy(self._policy)
    snapshot_policy.param_values = self.best_policy_param
    pyrado.save(snapshot_policy, 'policy', 'pt', self.save_dir, meta_info)

    if meta_info is not None:
        # Run as a subroutine of another algorithm, the environment is not saved in this case
        return
    pyrado.save(self._env, 'env', 'pkl', self.save_dir, meta_info)
def save_snapshot(self, meta_info: dict = None):
    """
    Save the simulated and the real environment as well as the policy.
    Policies of every iteration are saved by the subroutine in `train_policy_sim()`.

    :param meta_info: must be `None`, since this algorithm can not be run as a subroutine
    :raises pyrado.ValueErr: if `meta_info` is not `None`
    """
    super().save_snapshot(meta_info)

    if meta_info is not None:
        # Only reached if another algorithm tries to use this one as a subroutine
        raise pyrado.ValueErr(msg=f'{self.name} is not supposed be run as a subroutine!')

    # Dump the simulated environment first, then the real one
    env_sim_path = osp.join(self.save_dir, 'env_sim.pkl')
    joblib.dump(self._env_sim, env_sim_path)
    env_real_path = osp.join(self.save_dir, 'env_real.pkl')
    joblib.dump(self._env_real, env_real_path)

    pyrado.save(self.policy, 'policy', 'pt', self.save_dir, None)
def save_snapshot(self, meta_info: dict = None):
    """
    Save the randomized environment and, separately, its randomizer.

    :param meta_info: must be `None`, since this algorithm can not be run as a subroutine
    :raises pyrado.ValueErr: if `meta_info` is not `None`
    """
    super().save_snapshot(meta_info)

    if meta_info is not None:
        # Only reached if another algorithm tries to use this one as a subroutine
        raise pyrado.ValueErr(msg=f'{self.name} is not supposed be run as a subroutine!')

    pyrado.save(self.env_dr, 'env', 'pkl', self.save_dir, meta_info)
    pyrado.save(self.env_dr.randomizer, 'randomizer', 'pkl', self.save_dir, meta_info)
def save_snapshot(self, meta_info: dict = None):
    """
    Save the policy, either directly or by delegating to the policy optimization subroutine if there is one.

    :param meta_info: must be `None`, since this algorithm can not be run as a subroutine
    :raises pyrado.ValueErr: if `meta_info` is not `None`
    """
    super().save_snapshot(meta_info)

    if meta_info is not None:
        # Only reached if another algorithm tries to use this one as a subroutine
        raise pyrado.ValueErr(msg=f"{self.name} is not supposed be run as a subroutine!")

    if self._subrtn_policy is not None:
        # The policy optimization subroutine takes care of saving
        self._subrtn_policy.save_snapshot()
        return

    # There is no policy optimization subroutine updating the policy, thus it is saved here
    pyrado.save(self._policy, "policy.pt", self.save_dir, use_state_dict=True)
def save_snapshot(self, meta_info: dict = None):
    """
    Save the exploration strategy's policy and the critic's value function. If this algorithm is not run as a
    subroutine, the environment is saved as well, otherwise the files are named using the prefix and suffix from
    `meta_info`.

    :param meta_info: `None` if this algorithm is not a subroutine of another algorithm, else a dict from which
                      the optional entries `"prefix"` and `"suffix"` are read (both default to `""`)
    """
    super().save_snapshot(meta_info)

    if meta_info is None:
        # Not a subroutine of another algorithm: also save the environment, and use the plain file names
        pyrado.save(self._env, "env.pkl", self.save_dir)
        naming = dict()
    else:
        # Subroutine of another algorithm: the calling algorithm controls the file names
        naming = dict(prefix=meta_info.get("prefix", ""), suffix=meta_info.get("suffix", ""))

    pyrado.save(self._expl_strat.policy, "policy.pt", self.save_dir, use_state_dict=True, **naming)
    pyrado.save(self._critic.vfcn, "vfcn.pt", self.save_dir, use_state_dict=True, **naming)
def train_policy_sim(self, cand: to.Tensor, prefix: str, cnt_rep: int) -> float:
    """
    Train a policy in simulation for given hyper-parameters from the domain randomizer.

    :param cand: hyper-parameters for the domain parameter distribution (need be compatible with the randomizer)
    :param prefix: set a prefix to the saved file name, use "" for no prefix
    :param cnt_rep: current repetition count, coming from the wrapper function
    :return: average undiscounted return of the trained policy, estimated from rollouts in `self._env_sim`
    """
    # Store the candidate as a flat tensor before anything else happens
    flat_cand = cand.view(-1)
    pyrado.save(flat_cand, "candidate.pt", self.save_dir, prefix=prefix)

    # Configure the domain randomizer according to the candidate
    ddp_values = cand.detach().cpu().numpy()
    self._env_sim.adapt_randomizer(ddp_values)

    # Account for the samples used so far, then reset the subroutine (this includes the exploration)
    self._cnt_samples += self._subrtn_policy.sample_count
    self._subrtn_policy.reset()

    # Warm start only on the first try, i.e. randomly reset the policy parameters if the training failed once
    do_warmstart = self.warmstart and cnt_rep == 0
    self._subrtn_policy.init_modules(
        do_warmstart,
        policy_param_init=self.policy_param_init,
        valuefcn_param_init=self.valuefcn_param_init,
    )

    # Run the subroutine to train a policy in simulation
    self._subrtn_policy.train(snapshot_mode=self.subrtn_snapshot_mode, meta_info=dict(prefix=prefix))

    # Estimate the return of the trained policy in simulation, nothing is saved since the directory is `None`
    ros = self.eval_behav_policy(None, self._env_sim, self._subrtn_policy.policy, prefix, self.num_eval_rollouts)
    rets_sim = to.tensor([ro.undiscounted_return() for ro in ros])
    return float(to.mean(rets_sim))
def save_snapshot(self, meta_info: Optional[dict] = None):
    """
    Save a copy of the policy carrying the best parameters found so far. If this algorithm is not run as a
    subroutine, the environment is saved as well, otherwise the policy file is named using the prefix and suffix
    from `meta_info`.

    :param meta_info: `None` if this algorithm is not a subroutine of another algorithm, else a dict from which
                      the optional entries `"prefix"` and `"suffix"` are read (both default to `""`)
    """
    super().save_snapshot(meta_info)

    # Work on a copy such that the parameters of the policy in use stay untouched
    snapshot_policy = deepcopy(self._policy)
    snapshot_policy.param_values = self.best_policy_param

    standalone = meta_info is None
    if standalone:
        naming = dict()
    else:
        # Subroutine of another algorithm: the calling algorithm controls the file name
        naming = dict(prefix=meta_info.get("prefix", ""), suffix=meta_info.get("suffix", ""))
    pyrado.save(snapshot_policy, "policy.pt", self.save_dir, use_state_dict=True, **naming)

    if standalone:
        pyrado.save(self._env, "env.pkl", self.save_dir)
def save_snapshot(self, meta_info: dict = None):
    """
    Save the subroutine's snapshots, the simulation environment, and the real-world rollouts handed over by the
    calling algorithm. In between, the current and the best fitted domain parameter distribution are printed.

    .. note::
        As a side effect, the randomizer of the subroutine's environment is left adapted to the best fitted
        domain distribution parameters.

    :param meta_info: dict which must contain the entry `"rollouts_real"`, and may contain the entry `"prefix"`
                      (defaults to `""`) that is used for naming the saved files
    :raises pyrado.ValueErr: if `meta_info` is `None`
    :raises pyrado.KeyErr: if `meta_info` has no entry `"rollouts_real"`, note that this is only checked after
                           the other files have been written
    """
    super().save_snapshot(meta_info)

    if meta_info is None:
        # Fail with an explicit message instead of an AttributeError from calling `None.get()` below
        raise pyrado.ValueErr(msg="meta_info must not be None, since it has to contain the key 'rollouts_real'!")

    # ParameterExploring subroutine saves the best policy (in this case a DomainDistrParamPolicy)
    prefix = meta_info.get("prefix", "")
    if prefix != "":
        self._subrtn.save_snapshot(meta_info=dict(prefix=f"{prefix}_ddp"))  # save iter_X_ddp_policy.pt
    self._subrtn.save_snapshot(meta_info=dict(prefix="ddp"))  # override ddp_policy.pt

    joblib.dump(self._subrtn.env, osp.join(self.save_dir, "env_sim.pkl"))

    # Print the current search distribution's mean
    cpp = self._subrtn.policy.transform_to_ddp_space(self._subrtn.policy.param_values)
    self._subrtn.env.adapt_randomizer(domain_distr_param_values=cpp.detach().cpu().numpy())
    print_cbt(f"Current policy domain parameter distribution\n{self._subrtn.env.randomizer}", "g")

    # Set the randomizer to best fitted domain distribution
    cbp = self._subrtn.policy.transform_to_ddp_space(self._subrtn.best_policy_param)
    self._subrtn.env.adapt_randomizer(domain_distr_param_values=cbp.detach().cpu().numpy())
    print_cbt(f"Best fitted domain parameter distribution\n{self._subrtn.env.randomizer}", "g")

    if "rollouts_real" not in meta_info:
        raise pyrado.KeyErr(keys="rollouts_real", container=meta_info)
    pyrado.save(meta_info["rollouts_real"], "rollouts_real.pkl", self.save_dir, prefix=prefix)
def save_snapshot(self, meta_info: dict = None):
    """
    Save both target Q-functions. If this algorithm is run as a subroutine, the files are named using the prefix
    and suffix from `meta_info`.

    :param meta_info: `None` if this algorithm is not a subroutine of another algorithm, else a dict from which
                      the optional entries `"prefix"` and `"suffix"` are read (both default to `""`)
    """
    super().save_snapshot(meta_info)

    if meta_info is None:
        # Not a subroutine of another algorithm: use the plain file names
        naming = dict()
    else:
        # Subroutine of another algorithm: the calling algorithm controls the file names
        naming = dict(prefix=meta_info.get("prefix", ""), suffix=meta_info.get("suffix", ""))

    pyrado.save(self.qfcn_targ_1, "qfcn_target1.pt", self.save_dir, use_state_dict=True, **naming)
    pyrado.save(self.qfcn_targ_2, "qfcn_target2.pt", self.save_dir, use_state_dict=True, **naming)
def save_snapshot(self, meta_info: dict = None):
    """
    Save the simulated and the real environment as well as the policy.
    Policies of every iteration are saved by the subroutine in `train_policy_sim()`.

    :param meta_info: must be `None`, since this algorithm can not be run as a subroutine
    :raises pyrado.ValueErr: if `meta_info` is not `None`
    """
    super().save_snapshot(meta_info)

    if meta_info is not None:
        # Only reached if another algorithm tries to use this one as a subroutine
        raise pyrado.ValueErr(msg=f"{self.name} is not supposed be run as a subroutine!")

    # Save the simulated environment first, then the real one
    for env, file_name in ((self._env_sim, "env_sim.pkl"), (self._env_real, "env_real.pkl")):
        pyrado.save(env, file_name, self._save_dir)
    pyrado.save(self.policy, "policy.pt", self.save_dir, use_state_dict=True)
def save_snapshot(self, meta_info: dict = None):
    """
    Save the policy and the data set. If this algorithm is run as a subroutine, the files are named using the
    prefix and suffix from `meta_info` and the policy is saved via its state dict, otherwise the plain file names
    are used and the policy is saved with `use_state_dict=False`.

    :param meta_info: `None` if this algorithm is not a subroutine of another algorithm, else a dict from which
                      the optional entries `"prefix"` and `"suffix"` are read (both default to `""`)
    """
    # NOTE(review): `meta_info` is not forwarded to the parent's method here - confirm that this is intended
    super().save_snapshot()

    if meta_info is None:
        # Not a subroutine of another algorithm
        naming = dict()
        as_state_dict = False
    else:
        # Subroutine of another algorithm: the calling algorithm controls the file names
        naming = dict(prefix=meta_info.get("prefix", ""), suffix=meta_info.get("suffix", ""))
        as_state_dict = True

    pyrado.save(self._policy, "policy.pt", self.save_dir, use_state_dict=as_state_dict, **naming)
    pyrado.save(self.dataset, "dataset.pt", self.save_dir, **naming)
def save_snapshot(self, meta_info: dict = None):
    """
    Save every particle to its own file. If this algorithm is not run as a subroutine, the environment is saved as
    well, otherwise the files are named using the prefix and suffix from `meta_info`.

    :param meta_info: `None` if this algorithm is not a subroutine of another algorithm, else a dict from which
                      the optional entries `"prefix"` and `"suffix"` are read (both default to `""`)
    """
    super().save_snapshot(meta_info)

    standalone = meta_info is None
    if standalone:
        # Not a subroutine of another algorithm: the environment is saved before the particles
        pyrado.save(self._env, "env.pkl", self.save_dir)

    for num, particle in enumerate(self.particles):
        if standalone:
            naming = dict()
        else:
            # Subroutine of another algorithm: the calling algorithm controls the file names
            naming = dict(prefix=meta_info.get("prefix", ""), suffix=meta_info.get("suffix", ""))
        pyrado.save(particle, f"particle_{num}.pt", self.save_dir, use_state_dict=True, **naming)
def __init__(self,
             save_dir: str,
             env_sim: MetaDomainRandWrapper,
             env_real: [RealEnv, EnvWrapper],
             subrtn: Algorithm,
             ddp_space: BoxSpace,
             max_iter: int,
             acq_fc: str,
             acq_restarts: int,
             acq_samples: int,
             acq_param: dict = None,
             num_init_cand: int = 5,
             mc_estimator: bool = True,
             num_eval_rollouts_real: int = 5,
             num_eval_rollouts_sim: int = 50,
             thold_succ: float = pyrado.inf,
             thold_succ_subrtn: float = -pyrado.inf,
             warmstart: bool = True,
             policy_param_init: Optional[to.Tensor] = None,
             valuefcn_param_init: Optional[to.Tensor] = None,
             subrtn_snapshot_mode: str = 'best',
             logger: Optional[StepLogger] = None):
    r"""
    Constructor

    .. note::
        If you want to continue an experiment, use the `load_dir` argument for the `train` call. If you want to
        initialize every of the policies with a pre-trained policy parameters use `policy_param_init`.

    :param save_dir: directory to save the snapshots i.e. the results in
    :param env_sim: randomized simulation environment a.k.a. source domain
    :param env_real: real-world environment a.k.a. target domain
    :param subrtn: algorithm which performs the policy / value-function optimization
    :param ddp_space: space holding the boundaries for the domain distribution parameters
    :param max_iter: maximum number of iterations
    :param acq_fc: Acquisition Function
                   'UCB': Upper Confidence Bound (default $\beta = 0.1$)
                   'EI': Expected Improvement
                   'PI': Probability of Improvement
    :param acq_restarts: number of restarts for optimizing the acquisition function
    :param acq_samples: number of initial samples for optimizing the acquisition function
    :param acq_param: hyper-parameter for the acquisition function, e.g. $\beta$ for UCB
    :param num_init_cand: number of initial policies to train, ignored if `init_dir` is provided
    :param mc_estimator: estimate the return with a sample average (`True`) or a lower confidence
                         bound (`False`) obtained from bootstrapping
    :param num_eval_rollouts_real: number of rollouts in the target domain to estimate the return
    :param num_eval_rollouts_sim: number of rollouts in simulation to estimate the return after training
    :param thold_succ: success threshold on the real system's return for BayRn, stop the algorithm if exceeded
    :param thold_succ_subrtn: success threshold on the simulated system's return for the subroutine, repeat the
                              subroutine until the threshold is exceeded or the for a given number of iterations
    :param warmstart: initialize the policy parameters with the one of the previous iteration. This option has no
                      effect for initial policies and can be overruled by passing init policy params explicitly.
    :param policy_param_init: initial policy parameter values for the subroutine, set `None` to be random
    :param valuefcn_param_init: initial value function parameter values for the subroutine, set `None` to be random
    :param subrtn_snapshot_mode: snapshot mode for saving during training of the subroutine
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    :raises pyrado.TypeErr: if `env_sim` contains no `MetaDomainRandWrapper`, `subrtn` is no `Algorithm`, or
                            `ddp_space` is no `BoxSpace`
    :raises pyrado.ValueErr: if `num_init_cand` is smaller than 1
    """
    if typed_env(env_sim, MetaDomainRandWrapper) is None:
        raise pyrado.TypeErr(given=env_sim, expected_type=MetaDomainRandWrapper)
    if not isinstance(subrtn, Algorithm):
        raise pyrado.TypeErr(given=subrtn, expected_type=Algorithm)
    if not isinstance(ddp_space, BoxSpace):
        raise pyrado.TypeErr(given=ddp_space, expected_type=BoxSpace)
    if num_init_cand < 1:
        raise pyrado.ValueErr(given=num_init_cand, ge_constraint='1')

    # Call InterruptableAlgorithm's constructor, handing over the subroutine's policy
    super().__init__(num_checkpoints=2, init_checkpoint=-2, save_dir=save_dir, max_iter=max_iter,
                     policy=subrtn.policy, logger=logger)

    self._env_sim = env_sim
    self._env_real = env_real
    self._subrtn = subrtn
    self._subrtn.save_name = 'subrtn'
    self.ddp_space = ddp_space
    self.ddp_projector = UnitCubeProjector(to.from_numpy(self.ddp_space.bound_lo),
                                           to.from_numpy(self.ddp_space.bound_up))
    self.cands = None  # called x in the context of GPs
    self.cands_values = None  # called y in the context of GPs
    self.argmax_cand = to.Tensor()
    self.acq_fcn_type = acq_fc.upper()
    self.acq_restarts = acq_restarts
    self.acq_samples = acq_samples
    self.acq_param = acq_param
    self.num_init_cand = num_init_cand
    self.mc_estimator = mc_estimator
    self.valuefcn_param_init = valuefcn_param_init.detach() if valuefcn_param_init is not None else None
    self.warmstart = warmstart
    self.num_eval_rollouts_real = num_eval_rollouts_real
    self.num_eval_rollouts_sim = num_eval_rollouts_sim
    self.subrtn_snapshot_mode = subrtn_snapshot_mode
    self.thold_succ = to.tensor([thold_succ])
    self.thold_succ_subrtn = to.tensor([thold_succ_subrtn])
    self.max_subrtn_rep = 3  # number of tries to exceed thold_succ_subrtn during training in simulation
    self.curr_cand_value = -pyrado.inf  # for the stopping criterion

    # Store the initial policy parameters as a tensor that is detached from any computation graph.
    # `Tensor.detach()` is not an in-place operation, hence its result must be assigned.
    if policy_param_init is None:
        self.policy_param_init = None
    elif to.is_tensor(policy_param_init):
        self.policy_param_init = policy_param_init.detach()
    else:
        self.policy_param_init = to.tensor(policy_param_init)

    # Save initial environments and the domain distribution parameter space
    self.save_snapshot(meta_info=None)
    pyrado.save(self.ddp_space, 'ddp_space', 'pkl', self.save_dir)
def evaluate_policy(args, ex_dir):
    """
    Helper function to evaluate the policy from an experiment in the associated environment.

    The policy is rolled out on a grid of domain parameter values which depends on the type of the environment.
    The metrics are printed, and a summary as well as the data frame holding the results are saved to a new
    time-stamped subfolder of `ex_dir/eval_domain_grid`.

    :param args: parsed command line arguments, read are `num_rollouts_per_config`, `num_workers`, and `seed`;
                 the whole object is also passed on to `load_experiment`
    :param ex_dir: experiment directory to load the environment and the policy from
    :raises NotImplementedError: if there is no evaluation grid defined for the type of the loaded environment
    """
    env, policy, _ = load_experiment(ex_dir, args)

    # Create multi-dim evaluation grid
    param_spec = dict()
    param_spec_dim = None

    if isinstance(inner_env(env), BallOnPlateSim):
        param_spec["ball_radius"] = np.linspace(0.02, 0.08, num=2, endpoint=True)
        param_spec["ball_rolling_friction_coefficient"] = np.linspace(0.0295, 0.9, num=2, endpoint=True)

    elif isinstance(inner_env(env), QQubeSwingUpSim):
        eval_num = 200
        # Use nominal values for all other parameters.
        for param, nominal_value in env.get_nominal_domain_param().items():
            param_spec[param] = nominal_value
        # param_spec["gravity_const"] = np.linspace(5.0, 15.0, num=eval_num, endpoint=True)
        param_spec["damping_pend_pole"] = np.linspace(0.0, 0.0001, num=eval_num, endpoint=True)
        param_spec["damping_rot_pole"] = np.linspace(0.0, 0.0006, num=eval_num, endpoint=True)
        # Only two parameters are varied, the others are fixed to their nominal values
        param_spec_dim = 2

    elif isinstance(inner_env(env), QBallBalancerSim):
        # param_spec["gravity_const"] = np.linspace(7.91, 11.91, num=11, endpoint=True)
        # param_spec["ball_mass"] = np.linspace(0.003, 0.3, num=11, endpoint=True)
        # param_spec["ball_radius"] = np.linspace(0.01, 0.1, num=11, endpoint=True)
        param_spec["plate_length"] = np.linspace(0.275, 0.275, num=11, endpoint=True)
        param_spec["arm_radius"] = np.linspace(0.0254, 0.0254, num=11, endpoint=True)
        # param_spec["load_inertia"] = np.linspace(5.2822e-5*0.5, 5.2822e-5*1.5, num=11, endpoint=True)
        # param_spec["motor_inertia"] = np.linspace(4.6063e-7*0.5, 4.6063e-7*1.5, num=11, endpoint=True)
        # param_spec["gear_ratio"] = np.linspace(60, 80, num=11, endpoint=True)
        # param_spec["gear_efficiency"] = np.linspace(0.6, 1.0, num=11, endpoint=True)
        # param_spec["motor_efficiency"] = np.linspace(0.49, 0.89, num=11, endpoint=True)
        # param_spec["motor_back_emf"] = np.linspace(0.006, 0.066, num=11, endpoint=True)
        # param_spec["motor_resistance"] = np.linspace(2.6*0.5, 2.6*1.5, num=11, endpoint=True)
        # param_spec["combined_damping"] = np.linspace(0.0, 0.05, num=11, endpoint=True)
        # param_spec["friction_coeff"] = np.linspace(0, 0.015, num=11, endpoint=True)
        # param_spec["voltage_thold_x_pos"] = np.linspace(0.0, 1.0, num=11, endpoint=True)
        # param_spec["voltage_thold_x_neg"] = np.linspace(-1., 0.0, num=11, endpoint=True)
        # param_spec["voltage_thold_y_pos"] = np.linspace(0.0, 1.0, num=11, endpoint=True)
        # param_spec["voltage_thold_y_neg"] = np.linspace(-1.0, 0, num=11, endpoint=True)
        # param_spec["offset_th_x"] = np.linspace(-5/180*np.pi, 5/180*np.pi, num=11, endpoint=True)
        # param_spec["offset_th_y"] = np.linspace(-5/180*np.pi, 5/180*np.pi, num=11, endpoint=True)

    else:
        raise NotImplementedError

    # Always add an action delay wrapper (with 0 delay by default)
    if typed_env(env, ActDelayWrapper) is None:
        env = ActDelayWrapper(env)
    # param_spec['act_delay'] = np.linspace(0, 30, num=11, endpoint=True, dtype=int)

    add_info = "-".join(param_spec.keys())

    # Create multidimensional results grid and ensure right number of rollouts
    param_list = param_grid(param_spec)
    param_list *= args.num_rollouts_per_config

    # Fix initial state (set to None if it should not be fixed)
    init_state = np.array([0.0, 0.0, 0.0, 0.0])

    # Create sampler
    pool = SamplerPool(args.num_workers)
    if args.seed is not None:
        pool.set_seed(args.seed)
        print_cbt(f"Set the random number generators' seed to {args.seed}.", "w")
    else:
        print_cbt("No seed was set", "y")

    # Sample rollouts
    ros = eval_domain_params(pool, env, policy, param_list, init_state)

    # Compute metrics
    lod = []
    for ro in ros:
        d = dict(**ro.rollout_info["domain_param"], ret=ro.undiscounted_return(), len=ro.length)
        # Simply remove the observation noise from the domain parameters. The entries are removed independently
        # of each other, such that a missing mean does not prevent the removal of the standard deviation.
        d.pop("obs_noise_mean", None)
        d.pop("obs_noise_std", None)
        lod.append(d)

    df = pd.DataFrame(lod)
    metrics = dict(
        avg_len=df["len"].mean(),
        avg_ret=df["ret"].mean(),
        median_ret=df["ret"].median(),
        min_ret=df["ret"].min(),
        max_ret=df["ret"].max(),
        std_ret=df["ret"].std(),
    )
    pprint(metrics, indent=4)

    # Create subfolder and save
    timestamp = datetime.datetime.now()
    add_info = timestamp.strftime(pyrado.timestamp_format) + "--" + add_info
    save_dir = osp.join(ex_dir, "eval_domain_grid", add_info)
    os.makedirs(save_dir, exist_ok=True)

    save_dicts_to_yaml(
        {"ex_dir": str(ex_dir)},
        {"varied_params": list(param_spec.keys())},
        {"num_rpp": args.num_rollouts_per_config, "seed": args.seed},
        {"metrics": dict_arraylike_to_float(metrics)},
        save_dir=save_dir,
        file_name="summary",
    )
    # The file name carries the dimension of the grid, which is the number of specified parameters unless it was
    # set explicitly above
    pyrado.save(df, f"df_sp_grid_{len(param_spec) if param_spec_dim is None else param_spec_dim}d.pkl", save_dir)
def step(self, snapshot_mode: str = 'latest', meta_info: dict = None):
    """
    Perform one step of the algorithm, organized in sections which are guarded by the checkpoint counter.

    The sections at the checkpoints -2 and -1 train and evaluate the initial policies. The section at checkpoint 0
    fits a GP to the candidates and their values, and optimizes the acquisition function to obtain the next
    candidate. The sections at the checkpoints 1 and 2 train a policy for the latest candidate in simulation and
    evaluate it in the target domain.

    :param snapshot_mode: not used within this method
    :param meta_info: passed on to `pyrado.save` when storing the candidates, their values, and the argmax
    :raises pyrado.ValueErr: if the acquisition function type is not one of 'UCB', 'EI', or 'PI'
    """
    # Save snapshot to save the correct iteration count
    self.save_snapshot()

    if self.curr_checkpoint == -2:
        # Train the initial policies in the source domain
        self.train_init_policies()
        self.reached_checkpoint()  # setting counter to -1

    if self.curr_checkpoint == -1:
        # Evaluate the initial policies in the target domain
        self.eval_init_policies()
        self.reached_checkpoint()  # setting counter to 0

    if self.curr_checkpoint == 0:
        # Normalize the input data and standardize the output data
        cands_norm = self.ddp_projector.project_to(self.cands)
        cands_values_stdized = standardize(self.cands_values).unsqueeze(1)

        # Create and fit the GP model
        gp = SingleTaskGP(cands_norm, cands_values_stdized)
        gp.likelihood.noise_covar.register_constraint('raw_noise', GreaterThan(1e-5))
        mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
        fit_gpytorch_model(mll)
        print_cbt('Fitted the GP.', 'g')

        # Acquisition functions
        if self.acq_fcn_type == 'UCB':
            # The hyper-parameters are optional, fall back to the default beta if none were given
            acq_param = self.acq_param if self.acq_param is not None else dict()
            acq_fcn = UpperConfidenceBound(gp, beta=acq_param.get('beta', 0.1), maximize=True)
        elif self.acq_fcn_type == 'EI':
            acq_fcn = ExpectedImprovement(gp, best_f=cands_values_stdized.max().item(), maximize=True)
        elif self.acq_fcn_type == 'PI':
            acq_fcn = ProbabilityOfImprovement(gp, best_f=cands_values_stdized.max().item(), maximize=True)
        else:
            raise pyrado.ValueErr(given=self.acq_fcn_type, eq_constraint="'UCB', 'EI', 'PI'")

        # Optimize acquisition function and get new candidate point
        cand_norm, acq_value = optimize_acqf(
            acq_function=acq_fcn,
            bounds=to.stack([to.zeros(self.ddp_space.flat_dim), to.ones(self.ddp_space.flat_dim)]),
            q=1,
            num_restarts=self.acq_restarts,
            raw_samples=self.acq_samples
        )
        next_cand = self.ddp_projector.project_back(cand_norm)
        print_cbt(f'Found the next candidate: {next_cand.numpy()}', 'g')
        self.cands = to.cat([self.cands, next_cand], dim=0)
        pyrado.save(self.cands, 'candidates', 'pt', self.save_dir, meta_info)
        self.reached_checkpoint()  # setting counter to 1

    if self.curr_checkpoint == 1:
        # Train and evaluate a new policy, repeat if the resulting policy did not exceed the success threshold
        wrapped_trn_fcn = until_thold_exceeded(
            self.thold_succ_subrtn.item(), self.max_subrtn_rep
        )(self.train_policy_sim)
        wrapped_trn_fcn(self.cands[-1, :], prefix=f'iter_{self._curr_iter}')
        self.reached_checkpoint()  # setting counter to 2

    if self.curr_checkpoint == 2:
        # Evaluate the current policy in the target domain
        policy = pyrado.load(self.policy, 'policy', 'pt', self.save_dir,
                             meta_info=dict(prefix=f'iter_{self._curr_iter}'))
        self.curr_cand_value = self.eval_policy(
            self.save_dir, self._env_real, policy, self.mc_estimator, f'iter_{self._curr_iter}',
            self.num_eval_rollouts_real
        )
        self.cands_values = to.cat([self.cands_values, self.curr_cand_value.view(1)], dim=0)
        pyrado.save(self.cands_values, 'candidates_values', 'pt', self.save_dir, meta_info)

        # Store the argmax after training and evaluating
        curr_argmax_cand = BayRn.argmax_posterior_mean(
            self.cands, self.cands_values.unsqueeze(1), self.ddp_space, self.acq_restarts, self.acq_samples
        )
        self.argmax_cand = to.cat([self.argmax_cand, curr_argmax_cand], dim=0)
        pyrado.save(self.argmax_cand, 'candidates_argmax', 'pt', self.save_dir, meta_info)
        self.reached_checkpoint()  # setting counter to 0
if __name__ == "__main__":
    # Parse command line arguments
    args = get_argparser().parse_args()

    if not osp.isfile(args.file):
        raise pyrado.PathErr(given=args.file)
    if args.dir is None:
        # Use the file's directory by default
        args.dir = osp.dirname(args.file)
    elif not osp.isdir(args.dir):
        raise pyrado.PathErr(given=args.dir)

    df = pd.read_csv(args.file)

    # Create the environment, since its specification and task are needed to reconstruct the rollout
    if args.env_name == MiniGolfIKSim.name:
        env = MiniGolfIKSim()
    elif args.env_name == MiniGolfJointCtrlSim.name:
        env = MiniGolfJointCtrlSim()
    else:
        raise NotImplementedError

    # Cast the rollout from a DataFrame to a StepSequence
    reconstructed = StepSequence.from_pandas(df, env.spec, task=env.task)

    # Name the output after the input file without its directory and extension. Using os.path makes this
    # independent of the path separator and of the length of the extension. The target directory is always set
    # at this point, thus there is no need to check it again.
    suffix = osp.splitext(osp.basename(args.file))[0]
    pyrado.save(reconstructed, f"rollout_{suffix}.pkl", args.dir, verbose=True)
def __init__(
    self,
    save_dir: pyrado.PathLike,
    env_sim: SimEnv,
    env_real: Union[Env, str],
    policy: Policy,
    dp_mapping: Mapping[int, str],
    prior: Distribution,
    embedding: Embedding,
    num_checkpoints: int,
    init_checkpoint: int,
    max_iter: int,
    num_real_rollouts: int,
    num_sim_per_round: int,
    num_segments: int = None,
    len_segments: int = None,
    stop_on_done: bool = True,
    use_rec_act: bool = True,
    num_sbi_rounds: int = 1,
    reset_sbi_routine_each_iter: bool = False,
    reset_proposal_each_iter: bool = False,
    num_eval_samples: Optional[int] = None,
    posterior_hparam: Optional[dict] = None,
    subrtn_sbi_training_hparam: Optional[dict] = None,
    subrtn_sbi_sampling_hparam: Optional[dict] = None,
    simulation_batch_size: int = 1,
    normalize_posterior: bool = True,
    subrtn_policy: Optional[Algorithm] = None,
    subrtn_policy_snapshot_mode: str = "latest",
    train_initial_policy: bool = True,
    thold_succ_subrtn: float = -pyrado.inf,
    warmstart: bool = True,
    num_workers: int = 4,
    logger: Optional[StepLogger] = None,
):
    """
    Constructor

    :param save_dir: directory to save the snapshots i.e. the results in
    :param env_sim: randomized simulation environment a.k.a. source domain
    :param env_real: real-world environment a.k.a. target domain, this can be a `RealEnv` (sim-to-real setting), a
                     `SimEnv` (sim-to-sim setting), or a directory to load a pre-recorded set of rollouts from
    :param policy: policy used for sampling the rollout, if `subrtn_policy` is not `None` this policy is not only
                   used for generating the target domain rollouts, but also optimized in simulation
    :param dp_mapping: mapping from subsequent integers (starting at 0) to domain parameter names (e.g. mass)
    :param prior: distribution used by sbi as a prior, its event shape must match the length of `dp_mapping`
    :param embedding: embedding used for pre-processing the data before passing it to the posterior
    :param num_checkpoints: total number of checkpoints
    :param init_checkpoint: initial value of the cyclic counter, negative values can be used to mark sections that
                            should only be executed once
    :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
    :param num_real_rollouts: number of real-world rollouts received by sbi, i.e. from every rollout exactly one
                              data set is computed
    :param num_sim_per_round: number of simulations done by sbi per round (i.e. iteration over the same target
                              domain data set)
    :param num_segments: number of segments in which the rollouts are split into. For every segment, the initial
                         state of the simulation is reset, and thus for every set the features of the trajectories
                         are computed separately. Either specify `num_segments` or `len_segments`.
    :param len_segments: length of the segments in which the rollouts are split into. For every segment, the initial
                         state of the simulation is reset, and thus for every set the features of the trajectories
                         are computed separately. Either specify `num_segments` or `len_segments`.
    :param stop_on_done: if `True`, the rollouts are stopped as soon as they hit the state or observation space
                         boundaries. This behavior is safe, but can lead to short trajectories which are eventually
                         padded with zeroes. Choose `False` to ignore the boundaries (dangerous on the real system).
    :param use_rec_act: if `True` the recorded actions from the target domain are used to generate the rollout
                        during simulation (feed-forward). If `False` the policy is used to generate (potentially)
                        state-dependent actions (feed-back).
    :param num_sbi_rounds: set to an integer > 1 to use multi-round sbi. This way the posteriors (saved as
                           `..._round_NUMBER...`) will be tailored to the data of that round, where `NUMBER`
                           counts up each round (modulo `num_real_rollouts`). If `num_sbi_rounds` = 1, the posterior
                           is called amortized (it has never seen any target domain data).
    :param reset_sbi_routine_each_iter: if `True` the sbi subroutine instance is recreated every iteration.
                                        Use this flag to train the posterior each iteration from scratch.
    :param reset_proposal_each_iter: flag that is only stored by this constructor, presumably it resets the sbi
                                     proposal every iteration (confirm where the attribute is read)
    :param num_eval_samples: number of samples for evaluating the posterior in `eval_posterior()`, by default
                             `10 * 2**len(dp_mapping)`
    :param posterior_hparam: hyper parameters for creating the posterior's density estimator
    :param subrtn_sbi_training_hparam: dict forwarded to sbi's `PosteriorEstimator.train()` function like
                                       `training_batch_size`, `learning_rate`, `retrain_from_scratch_each_round`, etc.
    :param subrtn_sbi_sampling_hparam: dict of hyper parameters for sampling, stored as
                                       `subrtn_sbi_sampling_hparam` (presumably forwarded to sbi when sampling from
                                       the posterior, confirm where the attribute is read)
    :param simulation_batch_size: batch size forwarded to the sbi toolbox, requires batched simulator
    :param normalize_posterior: if `True` the normalization of the posterior density is enforced by sbi
    :param subrtn_policy: algorithm which performs the optimization of the behavioral policy (and value-function)
    :param subrtn_policy_snapshot_mode: snapshot mode for saving during policy optimization
    :param train_initial_policy: choose if a policy should be pretrained in the first iteration before collecting
                                 real rollouts. Choose `False`, if you want to use a pre-defined policy. Only stored
                                 if a policy subroutine is given.
    :param thold_succ_subrtn: success threshold on the simulated system's return for the subroutine, repeat the
                              subroutine until the threshold is exceeded or for a given number of iterations
    :param warmstart: initialize the policy (and value function) parameters with the one of the previous iteration.
                      This behavior can also be overruled by passing `init_policy_params` (and
                      `valuefcn_param_init`) explicitly.
    :param num_workers: number of environments for parallel sampling
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    :raises pyrado.TypeErr: if `env_sim` is randomized, `prior` is a `Normal`, or a hyper-parameter is not a dict
    :raises pyrado.ShapeErr: if the prior's event shape does not match the number of domain parameters
    :raises pyrado.ValueErr: if the policy subroutine optimizes a different policy than the given one
    """
    if not isinstance(inner_env(env_sim), SimEnv) or (
        isinstance(env_sim, DomainRandWrapper) and not isinstance(env_sim, ActDelayWrapper)
    ):
        raise pyrado.TypeErr(
            msg="The given env_sim must be a non-randomized simulation environment, "
            "except for wrappers that add a domain parameter!"
        )
    if isinstance(prior, Normal):
        raise pyrado.TypeErr(
            msg="The sbi framework requires MultivariateNormal instead of Normal distributions for the prior."
        )
    if not prior.event_shape[0] == len(dp_mapping):
        raise pyrado.ShapeErr(given=prior.event_shape, expected_match=dp_mapping)

    # All optional hyper-parameter sets are treated alike: None becomes an empty dict, anything else must be a dict
    if posterior_hparam is None:
        posterior_hparam = dict()
    elif not isinstance(posterior_hparam, dict):
        raise pyrado.TypeErr(given=posterior_hparam, expected_type=dict)
    if subrtn_sbi_training_hparam is None:
        subrtn_sbi_training_hparam = dict()
    elif not isinstance(subrtn_sbi_training_hparam, dict):
        raise pyrado.TypeErr(given=subrtn_sbi_training_hparam, expected_type=dict)
    if subrtn_sbi_sampling_hparam is None:
        subrtn_sbi_sampling_hparam = dict()
    elif not isinstance(subrtn_sbi_sampling_hparam, dict):
        raise pyrado.TypeErr(given=subrtn_sbi_sampling_hparam, expected_type=dict)

    # Call InterruptableAlgorithm's constructor
    super().__init__(
        num_checkpoints=num_checkpoints,
        init_checkpoint=init_checkpoint,
        save_dir=save_dir,
        max_iter=max_iter,
        policy=policy,
        logger=logger,
    )

    self._env_sim_sbi = env_sim  # will be randomized explicitly by sbi
    # The training environment is a separate deep copy, thus it does not share state with the one used by sbi
    self._env_sim_trn = DomainRandWrapperBuffer(copy.deepcopy(env_sim), randomizer=None, selection="cyclic")
    self._env_real = env_real
    self.dp_mapping = dp_mapping
    self._embedding = embedding
    self.num_sim_per_round = num_sim_per_round
    self.num_real_rollouts = num_real_rollouts
    self.num_segments = num_segments
    self.len_segments = len_segments
    self.stop_on_done = stop_on_done
    self.use_rec_act = use_rec_act
    self.reset_sbi_routine_each_iter = reset_sbi_routine_each_iter
    self.reset_proposal_each_iter = reset_proposal_each_iter
    self.num_sbi_rounds = num_sbi_rounds
    # The default number of evaluation samples grows exponentially with the number of domain parameters
    self.num_eval_samples = num_eval_samples or 10 * 2 ** len(dp_mapping)
    self.simulation_batch_size = simulation_batch_size
    self.normalize_posterior = normalize_posterior
    self._subrtn_sbi = None
    self.subrtn_sbi_training_hparam = subrtn_sbi_training_hparam or dict()
    # Previously this argument was accepted but silently dropped
    self.subrtn_sbi_sampling_hparam = subrtn_sbi_sampling_hparam or dict()
    self.posterior_hparam = posterior_hparam or dict()
    self.thold_succ_subrtn = float(thold_succ_subrtn)
    self.max_subrtn_rep = 3  # number of tries to exceed thold_succ_subrtn during training in simulation
    self.warmstart = warmstart
    self.num_workers = int(num_workers)

    # Temporary containers
    self._curr_data_real = None
    self._curr_domain_param_eval = None

    # Initialize sbi simulator and prior
    self._sbi_simulator = None  # to be set in step()
    self._sbi_prior = None  # to be set in step()
    self._setup_sbi(prior=prior)

    # Optional policy optimization subroutine
    self._subrtn_policy = subrtn_policy
    if isinstance(self._subrtn_policy, Algorithm):
        self._subrtn_policy_snapshot_mode = subrtn_policy_snapshot_mode
        self._subrtn_policy.save_name = "subrtn_policy"
        self._train_initial_policy = train_initial_policy
        # Check that the behavioral policy is the one that is being updated
        if self._subrtn_policy.policy is not self.policy:
            raise pyrado.ValueErr(
                msg="The policy of the policy subroutine is not the same as the one used by "
                "the system identification (sbi) subroutine!"
            )

    # Save initial environments, the embedding, and the prior
    pyrado.save(self._env_sim_trn, "env_sim.pkl", self._save_dir)
    pyrado.save(self._env_real, "env_real.pkl", self._save_dir)
    pyrado.save(embedding, "embedding.pt", self._save_dir)
    pyrado.save(prior, "prior.pt", self._save_dir)
    pyrado.save(policy, "init_policy.pt", self._save_dir, use_state_dict=True)
def step(self, snapshot_mode: str, meta_info: dict = None, parallel: bool = True):
    """
    Perform one iteration: roll out every SVPG particle, update the subroutine (`self._subrtn`) with the randomized
    trajectories of each particle, train the discriminator of the reward generator on all collected trajectories,
    and update the particles once the warm-up time has passed.

    :param snapshot_mode: snapshot mode forwarded to `make_snapshot()` of this algorithm
    :param meta_info: meta information forwarded to `make_snapshot()` of this algorithm
    :param parallel: if `True`, every particle does a fixed number of `lite_step()` calls and the visited states
                     are evaluated at once via `eval_states()`. If `False`, the wrapped environment is stepped
                     until it signals `done`.
    """
    rand_trajs = []
    ref_trajs = []
    ros = []
    visited = []
    for i in range(self.svpg.num_particles):
        done = False
        svpg_env = self.svpg_wrapper
        state = svpg_env.reset()
        states = []
        actions = []
        rewards = []
        infos = []
        rand_trajs_now = []
        if parallel:
            with to.no_grad():
                # Hard-coded horizon of 10 steps per particle
                for t in range(10):
                    action = (self.svpg.expl_strats[i](to.as_tensor(state, dtype=to.get_default_dtype())).detach().cpu().numpy())
                    state = svpg_env.lite_step(action)
                    states.append(state)
                    actions.append(action)
                visited.append(states)
                # Evaluate all visited states at once, this yields the rewards and both kinds of trajectories
                rewards, rand_trajs_now, ref_trajs_now = svpg_env.eval_states(states)
                rand_trajs += rand_trajs_now
                ref_trajs += ref_trajs_now
                ros.append(StepSequence(observations=states, actions=actions, rewards=rewards))
        else:
            with to.no_grad():
                while not done:
                    action = (self.svpg.expl_strats[i](to.as_tensor(state, dtype=to.get_default_dtype())).detach().cpu().numpy())
                    state, reward, done, info = svpg_env.step(action)
                    print(self.params.array_to_dict(state), " => ", reward)
                    states.append(state)
                    rewards.append(reward)
                    actions.append(action)
                    infos.append(info)
                    # Here, the trajectories are reported by the environment via the info dict
                    rand_trajs += info["rand"]
                    ref_trajs += info["ref"]
                ros.append(StepSequence(observations=states, actions=actions, rewards=rewards))
        self.logger.add_value(f"SVPG_agent_{i}_mean_reward", np.mean(rewards))
        # Indexing with i relies on exactly one rollout being appended per particle above
        ros[i].torch(data_type=to.DoubleTensor)
        # NOTE(review): rand_trajs_now is only filled in the parallel branch, i.e. for parallel=False the
        # subroutine is updated with an empty list. Confirm that this is intended.
        for rt in rand_trajs_now:
            rt.torch(data_type=to.double)
            rt.observations = rt.observations.double().detach()
            rt.actions = rt.actions.double().detach()
        self._subrtn.update(rand_trajs_now)

    # Logging
    rets = [ro.undiscounted_return() for ro in rand_trajs]
    ret_avg = np.mean(rets)
    ret_med = np.median(rets)
    ret_std = np.std(rets)
    self.logger.add_value("avg rollout len", np.mean([ro.length for ro in rand_trajs]))
    self.logger.add_value("avg return", ret_avg)
    self.logger.add_value("median return", ret_med)
    self.logger.add_value("std return", ret_std)

    # Flatten and combine all randomized and reference trajectories for discriminator
    flattened_randomized = StepSequence.concat(rand_trajs)
    flattened_randomized.torch(data_type=to.double)
    flattened_reference = StepSequence.concat(ref_trajs)
    flattened_reference.torch(data_type=to.double)
    self.reward_generator.train(flattened_reference, flattened_randomized, self.num_discriminator_epoch)
    pyrado.save(self.reward_generator.discriminator, "discriminator.pt", self.save_dir, prefix="adr", use_state_dict=True)

    # The particles are only updated after the warm-up phase
    if self.curr_time_step > self.warm_up_time:
        # Update the particles
        # List of lists to comply with interface
        self.svpg.update(list(map(lambda x: [x], ros)))
    flattened_randomized.torch(data_type=to.double)
    flattened_randomized.observations = flattened_randomized.observations.double().detach()
    flattened_randomized.actions = flattened_randomized.actions.double().detach()

    # np.save(f'{self.save_dir}actions{self.curr_iter}', flattened_randomized.actions)
    # Both snapshots use the average return over the randomized trajectories of this iteration
    self.make_snapshot(snapshot_mode, float(ret_avg), meta_info)
    self._subrtn.make_snapshot(snapshot_mode="best", curr_avg_ret=float(ret_avg))
    self.curr_time_step += 1
def collect_data_real(
    save_dir: Optional[pyrado.PathLike],
    env: Union[Env, str],
    policy: Policy,
    embedding: Embedding,
    num_rollouts: int,
    num_segments: int = None,
    len_segments: int = None,
    prefix: str = "",
) -> Tuple[to.Tensor, List[StepSequence]]:
    """
    Roll-out a (behavioral) policy on the target system for later use with the sbi module, and save the data
    computed from the recorded rollouts. If data from a previous attempt is found in `save_dir`, only the missing
    rollouts are collected.
    This method is static to facilitate evaluation of specific policies in hindsight.

    :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is loaded or saved
    :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance,
                in case you want to use pre-recorded rollouts pass the path to the parent folder as string
    :param policy: policy to evaluate
    :param embedding: embedding used for pre-processing the data before passing it to the posterior
    :param num_rollouts: number of rollouts to collect on the target system
    :param num_segments: number of segments in which the rollouts are split into. For every segment, the initial
                         state of the simulation is reset, and thus for every set the features of the trajectories
                         are computed separately. Either specify `num_segments` or `len_segments`.
    :param len_segments: length of the segments in which the rollouts are split into. For every segment, the initial
                         state of the simulation is reset, and thus for every set the features of the trajectories
                         are computed separately. Either specify `num_segments` or `len_segments`.
    :param prefix: prefix of the files to load and save, e.g. to mark the evaluation of an initial policy
    :return: data from the real-world rollouts a.k.a. set of $x_o$ of shape [1, num_rollouts * dim_feat], and the
             real-world rollouts
    :raises pyrado.TypeErr: if `env` is neither a real or simulated environment nor a string
    :raises pyrado.ShapeErr: if previously stored data does not match the stored rollouts, or if the final data
                             does not have the expected shape
    """
    if not (isinstance(env, str) or isinstance(inner_env(env), (RealEnv, SimEnv))):
        raise pyrado.TypeErr(given=inner_env(env), expected_type=[RealEnv, SimEnv, str])

    # Evaluate sequentially (necessary for sim-to-real experiments)
    if isinstance(env, str):
        rollout_worker = RecRolloutSamplerForSBI(env, embedding, num_segments, len_segments, rand_init_rollout=False)
    else:
        rollout_worker = RealRolloutSamplerForSBI(env, policy, embedding, num_segments, len_segments)

    # Initialize data containers
    data_real = None
    rollouts_real = None
    num_found_rollouts = 0

    if save_dir is not None:
        try:
            data_real = pyrado.load("data_real.pt", save_dir, prefix=prefix)
            rollouts_real = pyrado.load("rollouts_real.pkl", save_dir, prefix=prefix)
        except FileNotFoundError:
            # In the first attempt no files can be found, start from scratch
            data_real = None
            rollouts_real = None
        else:
            # The features of all rollouts are concatenated along the second dimension (see below), hence the
            # stored data must be of shape [1, num_found_rollouts * dim_feat]. Comparing the first dimension with
            # the number of rollouts would reject every valid data set that contains more than one rollout.
            expected_shape = (1, len(rollouts_real) * embedding.dim_output)
            if tuple(data_real.shape) != expected_shape:
                raise pyrado.ShapeErr(
                    msg=f"Found data of shape {tuple(data_real.shape)} in data_real.pt, but {len(rollouts_real)} "
                    f"rollouts in rollouts_real.pkl, i.e. expected the shape {expected_shape}!"
                )
            num_found_rollouts = len(rollouts_real)
            print_cbt(f"Found {num_found_rollouts} rollout(s) in {save_dir}.", "w")

    collect_str = "Collecting data" if not prefix else f"Collecting data using {prefix}_policy"
    for _ in tqdm(
        range(num_found_rollouts, num_rollouts),
        total=num_rollouts,
        initial=num_found_rollouts,  # let the progress bar start at the rollouts that have been found
        desc=Fore.CYAN + Style.BRIGHT + collect_str + Style.RESET_ALL,
        unit="rollouts",
        file=sys.stdout,
    ):
        # Do the rollout
        data, rollout = rollout_worker()

        # Fill data container
        if data_real is None or rollouts_real is None:
            data_real = data  # data is of shape [1, dim_feat]
            rollouts_real = [rollout]
        else:
            data_real = to.cat([data_real, data], dim=1)  # stack to final shape [1, num_rollouts * dim_feat]
            rollouts_real.append(rollout)

        # Optionally save the data (do this at every iteration to continue)
        if save_dir is not None:
            pyrado.save(data_real, "data_real.pt", save_dir, prefix=prefix)
            pyrado.save(rollouts_real, "rollouts_real.pkl", save_dir, prefix=prefix)

    if data_real.shape != (1, num_rollouts * embedding.dim_output):
        raise pyrado.ShapeErr(given=data_real, expected_match=(1, num_rollouts * embedding.dim_output))

    return data_real, rollouts_real
def save_snapshot(self, meta_info: dict = None):
    """
    Save the snapshot of the parent class and additionally the target Q-function.

    :param meta_info: meta information, passed on unchanged to the parent's `save_snapshot()` and to `pyrado.save()`
    """
    # Let the parent class store everything it is responsible for first
    super().save_snapshot(meta_info)

    # Additionally store the target Q-function in the algorithm's save directory
    target_qfcn = self.qfcn_targ
    pyrado.save(target_qfcn, "qfcn_target", "pt", self.save_dir, meta_info)
def eval_policy(
    save_dir: Optional[pyrado.PathLike],
    env: Union[RealEnv, SimEnv, MetaDomainRandWrapper],
    policy: Policy,
    mc_estimator: bool,
    prefix: str,
    num_rollouts: int,
    num_workers: int = 4,
) -> to.Tensor:
    """
    Evaluate a policy either in the source or in the target domain.
    This method is static to facilitate evaluation of specific policies in hindsight.

    :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved or printed
    :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance
    :param policy: policy to evaluate
    :param mc_estimator: estimate the return with a sample average (`True`) or a lower confidence bound (`False`)
                         obtained from bootstrapping
    :param prefix: prefix of the saved file and of the policy name in the console output, e.g. to mark the
                   evaluation of an initial policy
    :param num_rollouts: number of rollouts to collect on the target system
    :param num_workers: number of environments for the parallel sampler (only used for a `SimEnv`)
    :return: estimated return in the target domain
    :raises pyrado.TypeErr: if the innermost environment is neither a `RealEnv` nor a `SimEnv`
    """
    if save_dir is not None:
        print_cbt(f"Executing {prefix}_policy ...", "c", bright=True)

    rets_real = to.zeros(num_rollouts)
    if isinstance(inner_env(env), RealEnv):
        # Evaluate sequentially when conducting a sim-to-real experiment
        for i in range(num_rollouts):
            rets_real[i] = rollout(env, policy, eval=True).undiscounted_return()
            # If a reward of -1 is given, skip evaluation ahead and set all returns to zero
            if rets_real[i] == -1:
                print_cbt("Set all returns for this policy to zero.", color="c")
                rets_real = to.zeros(num_rollouts)
                break
    elif isinstance(inner_env(env), SimEnv):
        # Create a parallel sampler when conducting a sim-to-sim experiment
        sampler = ParallelRolloutSampler(env, policy, num_workers=num_workers, min_rollouts=num_rollouts)
        ros = sampler.sample()
        # Only the first num_rollouts rollouts are used, the sampler is asked for at least that many
        for i in range(num_rollouts):
            rets_real[i] = ros[i].undiscounted_return()
    else:
        raise pyrado.TypeErr(given=inner_env(env), expected_type=[RealEnv, SimEnv])

    if save_dir is not None:
        # Save and print the evaluation results
        pyrado.save(rets_real, "returns_real.pt", save_dir, prefix=prefix)
        print_cbt("Target domain performance", bright=True)
        # Convert every statistic to a Python float such that the table shows numbers instead of tensor reprs
        print(
            tabulate(
                [
                    ["mean return", to.mean(rets_real).item()],
                    ["std return", to.std(rets_real).item()],
                    ["min return", to.min(rets_real).item()],
                    ["max return", to.max(rets_real).item()],
                ]
            )
        )

    if mc_estimator:
        return to.mean(rets_real)

    # Use the lower bound of a one-sided bootstrap confidence interval as a pessimistic estimate
    _, ci_lo, _ = bootstrap_ci(
        rets_real.numpy(), np.mean, num_reps=1000, alpha=0.05, ci_sides=1, studentized=False
    )
    # as_tensor() handles arrays like from_numpy() does, but additionally accepts NumPy scalars
    return to.as_tensor(ci_lo)
def __init__(
    self,
    save_dir: pyrado.PathLike,
    env: Union[SimEnv, StateAugmentationWrapper],
    subrtn: Algorithm,
    policy: Policy,
    expl_strat: StochasticActionExplStrat,
    max_iter: int,
    num_rollouts: int = None,
    steps_num: int = None,
    apply_dynamics_noise: bool = False,
    dyn_eps: float = 0.01,
    dyn_phi: float = 0.1,
    halfspan: float = 0.25,
    apply_proccess_noise: bool = False,
    proc_eps: float = 0.01,
    proc_phi: float = 0.05,
    apply_observation_noise: bool = False,
    obs_eps: float = 0.01,
    obs_phi: float = 0.05,
    torch_observation: bool = True,
    logger: Optional[StepLogger] = None,
):
    """
    Constructor

    .. note::
        `expl_strat`, `num_rollouts`, and `steps_num` are part of the interface, but not used by this constructor.

    :param save_dir: directory to save the snapshots i.e. the results in
    :param env: the environment in which the agent should be trained, must be a `StateAugmentationWrapper` if
                `apply_dynamics_noise` is `True`
    :param subrtn: algorithm which performs the policy / value-function optimization
    :param policy: policy to be updated
    :param expl_strat: the exploration strategy
    :param max_iter: the maximum number of iterations, must be a positive integer
    :param num_rollouts: the number of rollouts to be performed for each update step
    :param steps_num: the number of steps to be performed for each update step
    :param apply_dynamics_noise: whether adversarially generated dynamics noise should be applied
    :param dyn_eps: the intensity of generated dynamics noise
    :param dyn_phi: the probability of applying dynamics noise
    :param halfspan: the halfspan of the uniform random distribution used to sample
    :param apply_proccess_noise: whether adversarially generated process noise should be applied
    :param proc_eps: the intensity of generated process noise
    :param proc_phi: the probability of applying process noise
    :param apply_observation_noise: whether adversarially generated observation noise should be applied
    :param obs_eps: the intensity of generated observation noise
    :param obs_phi: the probability of applying observation noise
    :param torch_observation: flag forwarded to the `AdversarialStateWrapper`, presumably it selects a
                              differentiable observation (confirm against the wrapper)
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    :raises pyrado.TypeErr: if `subrtn` is not an `Algorithm`, `max_iter` is not an integer, or the dynamics noise
                            is requested for an environment that is not a `StateAugmentationWrapper`
    :raises pyrado.ValueErr: if `max_iter` is not positive
    """
    # Explicit exceptions instead of asserts, since asserts are removed when running Python with -O
    if not isinstance(subrtn, Algorithm):
        raise pyrado.TypeErr(given=subrtn, expected_type=Algorithm)
    if not isinstance(max_iter, int):
        raise pyrado.TypeErr(given=max_iter, expected_type=int)
    if max_iter <= 0:
        raise pyrado.ValueErr(msg=f"The maximum number of iterations must be positive, but is {max_iter}!")

    super().__init__(save_dir, max_iter, policy, logger)

    # Initialize adversarial wrappers, every activated noise type wraps the result of the previous one
    if apply_dynamics_noise:
        if not isinstance(env, StateAugmentationWrapper):
            raise pyrado.TypeErr(given=env, expected_type=StateAugmentationWrapper)
        env = AdversarialDynamicsWrapper(env, self.policy, dyn_eps, dyn_phi, halfspan)
    if apply_proccess_noise:
        env = AdversarialStateWrapper(env, self.policy, proc_eps, proc_phi, torch_observation=torch_observation)
    if apply_observation_noise:
        env = AdversarialObservationWrapper(env, self.policy, obs_eps, obs_phi)

    self._env = env  # TODO @Robin: how do you make sure that the newly wrapped env is used by the subroutine?

    # Subroutine
    self._subrtn = subrtn
    self._subrtn.save_name = "subrtn"

    pyrado.save(self._env, "env.pkl", self.save_dir)