def plot_actions(ro: StepSequence, env: Env):
    """
    Plot every action dimension of the given rollout over time.

    For each dimension two curves are drawn: the action as it is passed to the environment, and the action after
    applying the environment's `limit_act()`. If the environment chain contains an `ActNormWrapper`, the recorded
    actions are first mapped from [-1, 1] to the inner environment's action bounds.
    Nothing is plotted if the rollout has no `actions` attribute.

    :param ro: input rollout
    :param env: environment (used for getting the clipped action values)
    """
    if not hasattr(ro, 'actions'):
        return
    if not isinstance(ro.actions, np.ndarray):
        raise pyrado.TypeErr(given=ro.actions, expected_type=np.ndarray)

    num_act_dims = ro.actions.shape[1]

    # Prefer the recorded time stamps, fall back to the step indices otherwise
    if hasattr(ro, 'env_infos'):
        t = ro.env_infos.get('t', np.arange(0, ro.length))
    else:
        t = np.arange(0, ro.length)

    fig, axs = plt.subplots(num_act_dims, figsize=(8, 12))
    fig.suptitle('Actions over Time')
    colors = plt.get_cmap('tab20')(np.linspace(0, 1, num_act_dims))

    if typed_env(env, ActNormWrapper) is None:
        act_denorm = ro.actions
        act_clipped = np.array([env.limit_act(a) for a in ro.actions[:]])
    else:
        # Undo the normalization, i.e. map -1 to the lower and +1 to the upper bound
        lb, ub = inner_env(env).act_space.bounds
        act_denorm = lb + (ro.actions[:] + 1.) * (ub - lb) / 2
        act_clipped = np.array([inner_env(env).limit_act(a) for a in act_denorm])

    legend_kwargs = dict(bbox_to_anchor=(0, 1.0, 1, -0.1), loc='lower left', mode='expand', ncol=2)

    if num_act_dims == 1:
        # plt.subplots returns a single axis object in this case
        axs.plot(t, act_denorm, label=_get_act_label(ro, 0) + ' (to env)')
        axs.plot(t, act_clipped, label=_get_act_label(ro, 0) + ' (clipped)', c='k', ls='--')
        axs.legend(**legend_kwargs)
    else:
        for i, ax in enumerate(axs):
            ax.plot(t, act_denorm[:, i], label=_get_act_label(ro, i) + ' (to env)', c=colors[i])
            ax.plot(t, act_clipped[:, i], label=_get_act_label(ro, i) + ' (clipped)', c='k', ls='--')
            ax.legend(**legend_kwargs)

    plt.subplots_adjust(hspace=1.2)
    plt.show()
def eval_policy(save_dir: [str, None],
                env: [RealEnv, SimEnv, MetaDomainRandWrapper],
                policy: Policy,
                mc_estimator: bool,
                prefix: str,
                num_rollouts: int,
                num_parallel_envs: int = 1) -> to.Tensor:
    """
    Evaluate a policy on the target system (real-world platform).
    This method is static to facilitate evaluation of specific policies in hindsight.

    :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
    :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance
    :param policy: policy to evaluate
    :param mc_estimator: estimate the return with a sample average (`True`) or a lower confidence bound (`False`)
                         obtained from bootstrapping
    :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
    :param num_rollouts: number of rollouts to collect on the target system
    :param num_parallel_envs: number of environments for the parallel sampler (only used for SimEnv)
    :return: estimated return in the target domain
    """
    if save_dir is not None:
        print_cbt(f'Executing {prefix}_policy ...', 'c', bright=True)

    rets_real = to.zeros(num_rollouts)
    target_env = inner_env(env)

    if isinstance(target_env, RealEnv):
        # Sim-to-real: the rollouts have to be executed one after another
        for idx in range(num_rollouts):
            rets_real[idx] = rollout(env, policy, eval=True).undiscounted_return()
            if rets_real[idx] == -1:
                # A return of -1 aborts the evaluation, all returns of this policy are reset to zero
                print_cbt('Set all returns for this policy to zero.', color='c')
                rets_real = to.zeros(num_rollouts)
                break

    elif isinstance(target_env, SimEnv):
        # Sim-to-sim: collect the rollouts with a parallel sampler
        sampler = ParallelRolloutSampler(env, policy, num_workers=num_parallel_envs, min_rollouts=num_rollouts)
        ros = sampler.sample()
        for idx in range(num_rollouts):
            rets_real[idx] = ros[idx].undiscounted_return()

    else:
        raise pyrado.TypeErr(given=target_env, expected_type=[RealEnv, SimEnv])

    if save_dir is not None:
        # Store and report the evaluation results
        to.save(rets_real, osp.join(save_dir, f'{prefix}_returns_real.pt'))

        print_cbt('Target domain performance', bright=True)
        summary = [
            ['mean return', to.mean(rets_real).item()],
            ['std return', to.std(rets_real)],
            ['min return', to.min(rets_real)],
            ['max return', to.max(rets_real)],
        ]
        print(tabulate(summary))

    if not mc_estimator:
        # One-sided bootstrap confidence interval, the element at index 1 of the result is returned
        ci = bootstrap_ci(rets_real.numpy(), np.mean, num_reps=1000, alpha=0.05, ci_sides=1, studentized=False)
        return to.from_numpy(ci[1])
    return to.mean(rets_real)
def cpp_export(
    save_dir: pyrado.PathLike,
    policy: Policy,
    env: Optional[SimEnv] = None,
    policy_export_name: str = "policy_export",
    write_policy_node: bool = True,
    policy_node_name: str = "policy",
):
    """
    Convenience function to export the policy via the policy's `script()` method (PyTorch scripting or tracing), and
    the experiment's XML configuration if the environment is an `RcsSim`.

    :param save_dir: directory to save in, must exist
    :param policy: (trained) policy
    :param env: environment the policy was trained in
    :param policy_export_name: name of the exported policy file without the file type ending
    :param write_policy_node: if `True`, write the PyTorch-based control policy into the experiment's XML
                              configuration. This requires the experiment's XML configuration file to exist.
    :param policy_node_name: name of the control policies node in the XML file, e.g. 'policy' or 'preStrikePolicy'
    """
    if not osp.isdir(save_dir):
        raise pyrado.PathErr(given=save_dir)
    if not isinstance(policy, Policy):
        raise pyrado.TypeErr(given=policy, expected_type=Policy)
    if not isinstance(policy_export_name, str):
        raise pyrado.TypeErr(given=policy_export_name, expected_type=str)

    # Convert the policy (cast to double precision) into a script module, which can be evaluated like a regular
    # PyTorch module, and serialize it into the given directory
    policy_file_name = f"{policy_export_name}.pt"
    policy_export_file = osp.join(save_dir, policy_file_name)
    scripted_policy = policy.double().script()
    scripted_policy.save(policy_export_file)  # former .zip, and before that .pth
    print_cbt(f"Exported the loaded policy to {policy_export_file}", "g", bright=True)

    # Export the experiment config for C++ (only available for Rcs-based simulations)
    exp_export_file = osp.join(save_dir, "ex_config_export.xml")
    if env is not None and isinstance(inner_env(env), RcsSim):
        inner_env(env).save_config_xml(exp_export_file)
        print_cbt(f"Exported experiment configuration to {exp_export_file}", "g", bright=True)

    # Re-open the XML file to append the node referencing the exported policy
    if not (write_policy_node and osp.isfile(exp_export_file)):
        return
    tree = et.parse(exp_export_file)
    policy_node = et.Element(policy_node_name)
    policy_node.set("type", "torch")
    policy_node.set("file", policy_file_name)
    tree.getroot().append(policy_node)
    tree.write(exp_export_file)
    print_cbt(f"Added {policy_export_name}.pt to the experiment configuration.", "g")
def eval_policy(
    save_dir: Optional[pyrado.PathLike],
    env: Env,
    policy: Policy,
    prefix: str,
    num_rollouts: int,
    num_workers: int = 1,
) -> to.Tensor:
    """
    Evaluate a policy either in the source or in the target domain.
    This method is static to facilitate evaluation of specific policies in hindsight.

    :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
    :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance
    :param policy: policy to evaluate
    :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
    :param num_rollouts: number of rollouts to collect on the target system
    :param num_workers: number of environments for the parallel sampler (only used for SimEnv)
    :return: mean of the undiscounted returns over all collected rollouts
    """
    if save_dir is not None:
        print_cbt(f"Executing {prefix}_policy ...", "c", bright=True)

    base_env = inner_env(env)
    if isinstance(base_env, RealEnv):
        # On a real-world device the rollouts have to be executed one after another
        rets_real = [rollout(env, policy, eval=True).undiscounted_return() for _ in range(num_rollouts)]

    elif isinstance(base_env, SimEnv):
        # In simulation the rollouts can be collected in parallel
        sampler = ParallelRolloutSampler(env, policy, num_workers=num_workers, min_rollouts=num_rollouts)
        rets_real = [ro.undiscounted_return() for ro in sampler.sample(eval=True)]

    else:
        raise pyrado.TypeErr(given=base_env, expected_type=[RealEnv, SimEnv])

    rets_real = to.as_tensor(rets_real, dtype=to.get_default_dtype())

    if save_dir is not None:
        # Save and print the evaluation results
        pyrado.save(rets_real, "returns_real.pt", save_dir, prefix=prefix)
        print_cbt("Target domain performance", bright=True)
        summary = [
            ["mean return", to.mean(rets_real).item()],
            ["std return", to.std(rets_real)],
            ["min return", to.min(rets_real)],
            ["max return", to.max(rets_real)],
        ]
        print(tabulate(summary))

    return to.mean(rets_real)
def step(self, act: np.ndarray) -> tuple:
    """
    Step the wrapped environment, then (if `decide_apply()` says so) perturb the state adversarially and recompute
    the observation from the perturbed state.

    If the wrapped environment chain contains a `StateAugmentationWrapper`, only the first `offset` entries of the
    observation are recomputed, and the remaining entries are passed to `get_arpl_grad()` as non-observed part.
    Without such a wrapper, the non-observed part is empty and the complete observation is recomputed.

    :param act: action passed on to the wrapped environment
    :return: observation, reward, done flag, and info dict as returned by the wrapped environment's `step()`, with
             the observation being updated as described above
    """
    obs, reward, done, info = self.wrapped_env.step(act)

    saw = typed_env(self.wrapped_env, StateAugmentationWrapper)
    # Guard against a missing wrapper before accessing its offset, then the whole observation counts as observed.
    # NOTE(review): assumes obs is a 1-dim numpy array -- confirm
    offset = saw.offset if saw is not None else obs.shape[0]
    nonobserved = to.from_numpy(obs[offset:])

    adversarial = self.get_arpl_grad(self.state, nonobserved)
    if self.decide_apply():
        self.state += adversarial.view(-1).numpy()

    if saw is not None:
        # Keep the augmented (non-observed) part, only refresh the part that is computed from the state
        obs[:offset] = inner_env(self).observe(self.state)
    else:
        obs = inner_env(self).observe(self.state)
    return obs, reward, done, info
def eval_damping():
    """
    Run the loaded policy once per joint damping coefficient and plot the resulting trajectories of three joints.
    """
    # Load the experiment and strip all wrappers (e.g. possible randomization wrappers)
    ex_dir = ask_for_experiment()
    env, policy, _ = load_experiment(ex_dir)
    env = inner_env(env)
    env.domain_param = WAMBallInCupSim.get_nominal_domain_param()

    dampings = [0., 1e-2, 1e-1, 1e0]
    print_cbt(f'Run policy for damping coefficients: {dampings}')

    # Collect the time stamps and joint positions of one rollout per damping coefficient
    time_stamps = []
    joint_pos = []
    for damping in dampings:
        env.reset(domain_param=dict(joint_damping=damping))
        ro = rollout(env, policy, render_mode=RenderMode(video=False), eval=True)
        time_stamps.append(ro.env_infos['t'])
        joint_pos.append(ro.env_infos['qpos'])

    fig, ax = plt.subplots(3, sharex='all')
    line_styles = ['k-', 'b--', 'g-.', 'r:']  # line style setting for better visibility
    for row, joint_idx in enumerate([1, 3, 5]):
        for t, qpos, fmt, damping in zip(time_stamps, joint_pos, line_styles, dampings):
            ax[row].plot(t, qpos[:, joint_idx], fmt, label=f'damping: {damping}')
        if row == 0:
            ax[row].legend()
        ax[row].set_ylabel(f'joint {joint_idx} pos [rad]')
    ax[2].set_xlabel('time [s]')
    plt.suptitle('Evaluation of joint damping coefficient')
    plt.show()
def _params_as_tensor(self):
    """
    Get the values of the selected domain parameters.

    :return: the stored nominal values if `fixed` is set, else a numpy array holding the current domain parameter
             values of the inner environment, ordered like `self._params`
    """
    if not self.fixed:
        current = [inner_env(self.wrapped_env).domain_param[name] for name in self._params]
        return np.array(current)
    return self._nominal
def test_domain_param_transforms(env: SimEnv, trafo_class: Type):
    """
    Check that setting a transformed domain parameter value on a `DomainParamTransform` wrapper results in the
    untransformed value being stored in the inner environment, and that another domain parameter stays different.

    :param env: simulation environment to wrap
    :param trafo_class: domain parameter transformation wrapper class to test
    """
    pyrado.set_seed(0)

    def _draw_pair(shift: int):
        # Randomly select the parameter to change, and pick the one `shift` positions further (cyclic) to stay fixed
        names = list(env.supported_domain_param)
        sel = random.randint(0, len(names) - 1)
        return names[sel], names[(sel + shift) % len(names)]

    def _is_vortex_only(name: str) -> bool:
        return any(item in name for item in VORTEX_ONLY_DOMAIN_PARAM_LIST)

    # Create a mask for a random domain parameter. The first draw is always repeated once (offset == 1), and the
    # drawing continues as long as one of the two parameters is only supported by Vortex.
    offset = 1
    sel_dp_change, sel_dp_fix = _draw_pair(offset)
    while offset == 1 or _is_vortex_only(sel_dp_change) or _is_vortex_only(sel_dp_fix):
        sel_dp_change, sel_dp_fix = _draw_pair(offset)
        offset += 1

    mask = (sel_dp_change,)
    wenv = trafo_class(env, mask)
    assert isinstance(wenv, DomainParamTransform)

    # Check 5 random values
    for _ in range(5):
        # Change the selected domain parameter, the offset keeps the value in the domain of the new params
        new_dp_val = abs(random.random() * env.get_nominal_domain_param()[sel_dp_change]) + 1e-6
        transformed_new_dp_val = wenv.forward(new_dp_val)
        wenv.domain_param = {sel_dp_change: transformed_new_dp_val}  # calls inverse transform
        if not isinstance(inner_env(wenv), SimPyEnv):
            # The RcsPySim and MujocoSim classes need to be reset to apply the new domain param
            wenv.reset()

        # Test the actual domain param and the getters
        stored_dp_val = inner_env(wenv)._domain_param[sel_dp_change]
        assert stored_dp_val == pytest.approx(new_dp_val, abs=1e-5)
        assert wenv.domain_param[sel_dp_change] == pytest.approx(new_dp_val, abs=1e-5)
        assert wenv.domain_param[sel_dp_fix] != pytest.approx(new_dp_val)
def __init__(self, env, params: Sequence[str] = None):
    """
    Constructor

    :param env: (wrapped) environment, its inner environment provides the nominal domain parameters
    :param params: names of the domain parameters to select, pass `None` or an empty list to select all nominal
                   domain parameters
    """
    self._params = None

    # An empty list is handled like no selection at all
    if isinstance(params, list) and not params:
        params = None

    self._all_nominal = inner_env(env).get_nominal_domain_param()
    self.params = self._all_nominal.keys() if params is None else params
def __init__(self, wrapped_env: Env, domain_param: Sequence[str] = None, fixed: bool = False):
    """
    Constructor

    :param wrapped_env: the environment to be wrapped
    :param domain_param: list of domain parameter names to include in the observation, pass `None` to select all
    :param fixed: fix the parameters
    """
    # Must stay the first statement such that locals() only contains the constructor arguments
    Serializable._init(self, locals())
    EnvWrapper.__init__(self, wrapped_env)

    if domain_param is None:
        # Select all domain parameters of the inner environment
        self._params = list(inner_env(self.wrapped_env).domain_param.keys())
    else:
        self._params = domain_param

    # Store the nominal values of the selected parameters as an array, ordered like the parameter names
    nominal_dict = inner_env(self.wrapped_env).get_nominal_domain_param()
    self._nominal = np.array([nominal_dict[name] for name in self._params])
    self.fixed = fixed
def __init__(self, wrapped_env: Env, params=None, fixed=False):
    """
    Constructor

    :param wrapped_env: the environment to be wrapped
    :param params: names of the domain parameters to select, pass `None` to select all domain parameters of the
                   inner environment
    :param fixed: flag stored as `self.fixed`, presumably to keep the parameters at their nominal values --
                  TODO confirm against the methods reading it
    """
    # Must stay the first statement such that locals() only contains the constructor arguments
    Serializable._init(self, locals())
    EnvWrapper.__init__(self, wrapped_env)

    if params is not None:
        self._params = params
    else:
        self._params = list(inner_env(self.wrapped_env).domain_param.keys())

    # Write to the attribute that was just assigned instead of going through `self.nominal`, which is not defined
    # in this constructor and would have to be provided elsewhere in the class.
    # NOTE(review): assumes `nominal` merely exposes `_nominal` -- confirm
    self._nominal = inner_env(self.wrapped_env).get_nominal_domain_param()
    # TODO ATTENTION! THIS CAN BE DEADLY! The nominal time step is overwritten with 1/100 regardless of the value
    # reported by the wrapped environment. @Robin, why are you doing this?
    self._nominal['dt'] = 1 / 100.

    # From here on, the nominal values are an array ordered like the selected parameter names
    self._nominal = np.array([self._nominal[k] for k in self._params])
    self.fixed = fixed
def test_combination():
    """
    Stack several environment wrappers on a `QCartPoleSwingUpSim` and check their combined behavior: cycling through
    the domain parameter buffer, action and observation normalization, partial observations, action noise, action
    delay, as well as finding and removing wrappers from the chain.
    """

    def _dummy_rollout(some_env):
        # All rollouts use a dummy policy, the same seed, and no rendering
        return rollout(some_env, DummyPolicy(some_env.spec), eval=True, seed=0, render_mode=RenderMode())

    env = QCartPoleSwingUpSim(dt=1/50., max_steps=20)

    # Domain randomization with a buffer of 3 domains
    randomizer = create_default_randomizer(env)
    env_rand = DomainRandWrapperBuffer(env, randomizer)
    env_rand.fill_buffer(num_domains=3)

    dp_before = []
    dp_after = []
    for i in range(4):
        dp_before.append(env_rand.domain_param)
        _dummy_rollout(env_rand)
        dp_after.append(env_rand.domain_param)
        assert dp_after[i] != dp_before[i]
    # The 4th rollout must use the same domain parameters as the 1st one
    assert dp_after[0] == dp_after[3]

    # Action and observation normalization (with some explicitly specified observation bounds)
    explicit_lb = {'x_dot': -213., 'theta_dot': -42.}
    explicit_ub = {'x_dot': 213., 'theta_dot': 42., 'x': 0.123}
    env_norm = ObsNormWrapper(ActNormWrapper(env), explicit_lb=explicit_lb, explicit_ub=explicit_ub)
    act_lb, act_ub = env_norm.act_space.bounds
    assert all(act_lb == -1)
    assert all(act_ub == 1)
    obs_lb, obs_ub = env_norm.obs_space.bounds
    assert all(obs_lb == -1)
    assert all(obs_ub == 1)

    ro_rand = _dummy_rollout(env_rand)
    ro_norm = _dummy_rollout(env_norm)
    assert np.allclose(env_norm._process_obs(ro_rand.observations), ro_norm.observations)

    # Partial observations
    env_part = ObsPartialWrapper(env_norm, idcs=['x_dot', 'cos_theta'])
    ro_part = _dummy_rollout(env_part)

    # Gaussian action noise: same commanded actions, but different observations
    env_noise = GaussianActNoiseWrapper(env_part,
                                        noise_mean=0.5*np.ones(env_part.act_space.shape),
                                        noise_std=0.1*np.ones(env_part.act_space.shape))
    ro_noise = _dummy_rollout(env_noise)
    assert np.allclose(ro_part.actions, ro_noise.actions)
    assert not np.allclose(ro_part.observations, ro_noise.observations)

    # Action delay: same commanded actions, but different observations
    env_delay = ActDelayWrapper(env_part, delay=3)
    ro_delay = _dummy_rollout(env_delay)
    assert np.allclose(ro_part.actions, ro_delay.actions)
    assert not np.allclose(ro_part.observations, ro_delay.observations)

    # Inspecting and modifying the wrapper chain
    assert isinstance(inner_env(env_delay), QCartPoleSwingUpSim)
    assert typed_env(env_delay, ObsPartialWrapper) is not None
    assert isinstance(env_delay, ActDelayWrapper)
    env_no_delay = remove_env(env_delay, ActDelayWrapper)
    assert not isinstance(env_no_delay, ActDelayWrapper)
def test_bayrn_power(ex_dir, env: SimEnv, bayrn_hparam: dict):
    """
    Run BayRn with PoWER as policy optimization subroutine in a sim-to-sim setting, where the target domain is a copy
    of the simulation with slightly increased pole masses.

    :param ex_dir: experiment directory used for saving
    :param env: simulation environment
    :param bayrn_hparam: hyper-parameters forwarded to the BayRn constructor
    """
    pyrado.set_seed(0)

    # The copy for the target domain has to be made before wrapping the source domain environment
    env_real = deepcopy(env)

    # Source domain: randomized simulation whose distribution parameters are set by BayRn
    env_sim = DomainRandWrapperLive(env, create_zero_var_randomizer(env))
    env_sim = MetaDomainRandWrapper(env_sim, create_default_domain_param_map_qq())

    # Target domain: both masses are 10% heavier, same wrapper chain as the source domain
    env_real.domain_param = dict(mass_pend_pole=0.024 * 1.1, mass_rot_pole=0.095 * 1.1)
    env_real = wrap_like_other_env(env_real, env_sim)

    # Policy and subroutine
    policy = QQubeSwingUpAndBalanceCtrl(env_sim.spec, energy_gain=0.587, ref_energy=0.827)
    subrtn = PoWER(
        ex_dir,
        env_sim,
        policy,
        max_iter=1,
        pop_size=8,
        num_init_states_per_domain=1,
        num_is_samples=4,
        expl_std_init=0.1,
        num_workers=1,
    )

    # Set the boundaries for the GP
    dp_nom = inner_env(env_sim).get_nominal_domain_param()
    lower = [0.8 * dp_nom["mass_pend_pole"], 1e-8, 0.8 * dp_nom["mass_rot_pole"], 1e-8]
    upper = [1.2 * dp_nom["mass_pend_pole"], 1e-7, 1.2 * dp_nom["mass_rot_pole"], 1e-7]
    ddp_space = BoxSpace(bound_lo=np.array(lower), bound_up=np.array(upper))

    # Create algorithm and train
    algo = BayRn(ex_dir, env_sim, env_real, subrtn, ddp_space, **bayrn_hparam, num_workers=1)
    algo.train()

    assert algo.curr_iter == algo.max_iter or algo.stopping_criterion_met()
def get_arpl_grad(self, state, nonobserved):
    """
    Compute the adversarial state perturbation as the scaled sign of the gradient of the negative L2-norm of the
    policy output with respect to the state.

    :param state: state to differentiate with respect to, either a numpy array or a PyTorch tensor. A tensor which
                  already requires gradients is used as is, i.e. its `grad` attribute is written by this method.
    :param nonobserved: tensor which is concatenated to the observation before it is passed to the policy
    :return: `self._eps` times the sign of the gradient, same shape as the state
    :raises ValueError: if the state is neither a numpy array nor a PyTorch tensor
    """
    if isinstance(state, np.ndarray):
        state_tensor = to.tensor(state, requires_grad=True)
    elif isinstance(state, to.Tensor):
        if state.requires_grad:
            state_tensor = state
        else:
            # A tensor that does not track gradients would end up without a `grad` attribute after the backward
            # pass, thus differentiate with respect to a detached copy that does
            state_tensor = state.detach().clone().requires_grad_(True)
    else:
        raise ValueError("state could not be converted to a torch tensor")

    # Optionally compute the observation from the state such that the gradient flows through the observation function
    if self.torch_observation:
        observation = inner_env(self).observe(state_tensor, dtype=to.Tensor)
    else:
        observation = state_tensor

    mean_arpl = self._policy.forward(to.cat((observation, nonobserved)))
    l2_norm_mean = -to.norm(mean_arpl, p=2, dim=0)
    l2_norm_mean.backward()

    state_grad = state_tensor.grad
    return self._eps * to.sign(state_grad)
def __init__(self, env: Env, policy: Policy, num_workers: int, num_rollouts_per_param: int, seed: int = None):
    """
    Constructor

    :param env: environment to sample from, domain randomization wrappers are removed from the chain since the
                domain parameters are sampled explicitly later
    :param policy: policy used for sampling
    :param num_workers: number of parallel samplers
    :param num_rollouts_per_param: number of rollouts per policy parameter set (and init state if specified)
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :raises pyrado.TypeErr: if `num_rollouts_per_param` is not an integer, or if the environment has a domain
                            randomization wrapper but its inner environment is not a `SimEnv`
    :raises pyrado.ValueErr: if `num_rollouts_per_param` is smaller than 1
    """
    if not isinstance(num_rollouts_per_param, int):
        raise pyrado.TypeErr(given=num_rollouts_per_param, expected_type=int)
    if num_rollouts_per_param < 1:
        raise pyrado.ValueErr(given=num_rollouts_per_param, ge_constraint='1')

    # Must be called before any additional local variable is created
    Serializable._init(self, locals())

    # Check environment for domain randomization wrappers (stops after finding the outermost)
    self._dr_wrapper = typed_env(env, DomainRandWrapper)
    if self._dr_wrapper is not None:
        # Raise explicitly instead of asserting, since assertions are skipped when running Python with -O
        if not isinstance(inner_env(env), SimEnv):
            raise pyrado.TypeErr(given=inner_env(env), expected_type=SimEnv)
        # Remove them all from the env chain since we sample the domain parameter later explicitly
        env = remove_all_dr_wrappers(env)

    self.env, self.policy = env, policy
    self.num_rollouts_per_param = num_rollouts_per_param

    # Create parallel pool. We use one thread per environment because it's easier.
    self.pool = SamplerPool(num_workers)

    # Set all rngs' seeds
    if seed is not None:
        self.pool.set_seed(seed)

    # Distribute environments. We use pickle to make sure a copy is created for n_envs = 1
    self.pool.invoke_all(_pes_init, pickle.dumps(self.env), pickle.dumps(self.policy))
def create_default_randomizer(env: [SimEnv, EnvWrapper]) -> DomainRandomizer:
    """
    Create the default randomizer depending on the passed environment.

    The registry is searched with the (module, class name) pair of the inner environment's class and of all its base
    classes in method resolution order. The first registered entry is called to create the randomizer.

    :param env: (wrapped) environment that should be perturbed
    :return: default randomizer
    :raises ValueError: if there is no registry entry for any class in the environment's class hierarchy
    """
    env_type = type(inner_env(env))

    # Going through all base types is more or less equivalent to an isinstance check
    for cand_type in env_type.__mro__:
        registry_key = (cand_type.__module__, cand_type.__name__)
        randomizer_fcn = default_randomizer_registry.get(registry_key)
        if randomizer_fcn:
            return randomizer_fcn()

    raise ValueError(f'No default randomizer settings for env of type {env_type}!')
def __init__(self, wrapped_env: Union[SimEnv, EnvWrapper], randomizer: Optional[DomainRandomizer]):
    """
    Constructor

    :param wrapped_env: environment to wrap, its inner environment must be a `SimEnv`
    :param randomizer: `DomainRandomizer` object holding the probability distribution of all randomizable domain
                       parameters, pass `None` if you want to subclass wrapping another `DomainRandWrapper`
                       and use its randomizer
    :raises pyrado.TypeErr: if the inner environment is not a `SimEnv`, or if the randomizer is neither `None` nor
                            a `DomainRandomizer`
    """
    env_is_sim = isinstance(inner_env(wrapped_env), SimEnv)
    if not env_is_sim:
        raise pyrado.TypeErr(given=wrapped_env, expected_type=SimEnv)

    randomizer_is_valid = randomizer is None or isinstance(randomizer, DomainRandomizer)
    if not randomizer_is_valid:
        raise pyrado.TypeErr(given=randomizer, expected_type=DomainRandomizer)

    Serializable._init(self, dict(self=self, wrapped_env=wrapped_env, randomizer=randomizer))

    # Invoke EnvWrapper's constructor
    super().__init__(wrapped_env)

    self._randomizer = randomizer
from pyrado.plotting.distribution import draw_posterior_pairwise_scatter from pyrado.utils.argparser import get_argparser if __name__ == "__main__": # Parse command line arguments args = get_argparser().parse_args() plt.rc("text", usetex=args.use_tex) if not isinstance(args.num_samples, int) or args.num_samples < 1: raise pyrado.ValueErr(given=args.num_samples, ge_constraint="1") # NPDR ex_dir_npdr = os.path.join(pyrado.TEMP_DIR, "mg-ik", "npdr_time", "") algo = Algorithm.load_snapshot(ex_dir_npdr) if not isinstance(algo, NPDR): raise pyrado.TypeErr(given=algo, expected_type=NPDR) env_sim = inner_env(pyrado.load("env_sim.pkl", ex_dir_npdr)) prior_npdr = pyrado.load("prior.pt", ex_dir_npdr) posterior_npdr = algo.load_posterior(ex_dir_npdr, idx_iter=0, idx_round=6, obj=None, verbose=True) # CHOICE data_real_npdr = pyrado.load(f"data_real.pt", ex_dir_npdr, prefix="iter_0", verbose=True) # CHOICE domain_params_npdr, log_probs = SBIBase.eval_posterior( posterior_npdr, data_real_npdr, args.num_samples, normalize_posterior=False, # not necessary here
def train_policy_sim(self, domain_params: to.Tensor, prefix: str, cnt_rep: int, use_rec_init_states: bool = True) -> float:
    """
    Train a policy in simulation on the given domain parameters using the policy optimization subroutine, and
    evaluate it afterwards on the same domain parameters.

    :param domain_params: domain parameters sampled from the posterior [shape N x D where N is the number of
                          samples and D is the number of domain parameters]
    :param prefix: set a prefix to the saved file name, use "" for no prefix
    :param cnt_rep: current repetition count, coming from the wrapper function
    :param use_rec_init_states: if `True`, the previous rollouts will be loaded to extract the initial states, and
                                the simulation's initial state space is set to these recorded ones
    :return: estimated return of the trained policy, obtained from `eval_policy()` on the simulation environment
             used for training
    """
    num_dp = len(self.dp_mapping)
    if domain_params.ndim != 2 or domain_params.shape[1] != num_dp:
        raise pyrado.ShapeErr(given=domain_params, expected_match=(-1, num_dp))

    # Insert the domain parameters into the wrapped environment's buffer
    self.fill_domain_param_buffer(self._env_sim_trn, self.dp_mapping, domain_params)

    # Set the initial state spaces of the simulation environment to match the observed initial states
    if use_rec_init_states:
        rollouts_real = pyrado.load("rollouts_real.pkl", self._save_dir, prefix=prefix)
        init_states_real = np.stack([ro.states[0, :] for ro in rollouts_real])
        expected_shape = (len(rollouts_real), self._env_sim_trn.state_space.flat_dim)
        if init_states_real.shape != expected_shape:
            raise pyrado.ShapeErr(given=init_states_real, expected_match=expected_shape)
        inner_env(self._env_sim_trn).init_space = DiscreteSpace(init_states_real)
        print_cbt("The simulation environment's initial states have been set to the recorded ones.", "w")

    # Account for the samples of the previous run before resetting the subroutine (includes the exploration)
    self._cnt_samples += self._subrtn_policy.sample_count
    self._subrtn_policy.reset()

    # Propagate the updated training environment to the SamplerPool's workers
    if not hasattr(self._subrtn_policy, "sampler"):
        raise pyrado.KeyErr(keys="sampler", container=self._subrtn_policy)
    self._subrtn_policy.sampler.reinit(env=self._env_sim_trn)

    # Do a warm start, but randomly reset the policy parameters if training failed once
    do_warmstart = self.warmstart and cnt_rep == 0
    self._subrtn_policy.init_modules(do_warmstart)

    # Train a policy in simulation using the subroutine
    self._subrtn_policy.train(snapshot_mode=self._subrtn_policy_snapshot_mode, meta_info=dict(prefix=prefix))

    # Evaluate the trained policy in simulation. Only the ring index is reset, not the buffer, in order to evaluate
    # on the same domains as trained.
    assert len(self._env_sim_trn.buffer) == self.num_eval_samples
    self._env_sim_trn.ring_idx = 0
    avg_ret_sim = self.eval_policy(
        None, self._env_sim_trn, self._subrtn_policy.policy, prefix, self.num_eval_samples
    )
    return float(avg_ret_sim)
def __init__(
    self,
    save_dir: pyrado.PathLike,
    env_sim: SimEnv,
    env_real: Union[Env, str],
    policy: Policy,
    dp_mapping: Mapping[int, str],
    prior: Distribution,
    embedding: Embedding,
    num_checkpoints: int,
    init_checkpoint: int,
    max_iter: int,
    num_real_rollouts: int,
    num_sim_per_round: int,
    num_segments: int = None,
    len_segments: int = None,
    stop_on_done: bool = True,
    use_rec_act: bool = True,
    num_sbi_rounds: int = 1,
    reset_sbi_routine_each_iter: bool = False,
    reset_proposal_each_iter: bool = False,
    num_eval_samples: Optional[int] = None,
    posterior_hparam: Optional[dict] = None,
    subrtn_sbi_training_hparam: Optional[dict] = None,
    subrtn_sbi_sampling_hparam: Optional[dict] = None,
    simulation_batch_size: int = 1,
    normalize_posterior: bool = True,
    subrtn_policy: Optional[Algorithm] = None,
    subrtn_policy_snapshot_mode: str = "latest",
    train_initial_policy: bool = True,
    thold_succ_subrtn: float = -pyrado.inf,
    warmstart: bool = True,
    num_workers: int = 4,
    logger: Optional[StepLogger] = None,
):
    """
    Constructor

    :param save_dir: directory to save the snapshots i.e. the results in
    :param env_sim: randomized simulation environment a.k.a. source domain
    :param env_real: real-world environment a.k.a. target domain, this can be a `RealEnv` (sim-to-real setting), a
                     `SimEnv` (sim-to-sim setting), or a directory to load a pre-recorded set of rollouts from
    :param policy: policy used for sampling the rollout, if subrtn_policy is not `None` this policy is not only used
                   for generating the target domain rollouts, but also optimized in simulation
    :param dp_mapping: mapping from subsequent integers (starting at 0) to domain parameter names (e.g. mass)
    :param prior: distribution used by sbi as a prior, must not be a `Normal` distribution, and the first dimension
                  of its event shape must equal the number of domain parameters
    :param embedding: embedding used for pre-processing the data before passing it to the posterior
    :param num_checkpoints: total number of checkpoints
    :param init_checkpoint: initial value of the cyclic counter, defaults to 0, negative values can be used to mark
                            sections that should only be executed once
    :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
    :param num_real_rollouts: number of real-world rollouts received by sbi, i.e. from every rollout exactly one
                              data set is computed
    :param num_sim_per_round: number of simulations done by sbi per round (i.e. iteration over the same target
                              domain data set)
    :param num_segments: number of segments in which the rollouts are split into. For every segment, the initial
                         state of the simulation is reset, and thus for every set the features of the trajectories
                         are computed separately. Either specify `num_segments` or `len_segments`.
    :param len_segments: length of the segments in which the rollouts are split into. For every segment, the initial
                         state of the simulation is reset, and thus for every set the features of the trajectories
                         are computed separately. Either specify `num_segments` or `len_segments`.
    :param stop_on_done: if `True`, the rollouts are stopped as soon as they hit the state or observation space
                         boundaries. This behavior is safe, but can lead to short trajectories which are eventually
                         padded with zeroes. Choose `False` to ignore the boundaries (dangerous on the real system).
    :param use_rec_act: if `True` the recorded actions from the target domain are used to generate the rollout
                        during simulation (feed-forward). If `False` the policy is used to generate (potentially)
                        state-dependent actions (feed-back).
    :param num_sbi_rounds: set to an integer > 1 to use multi-round sbi. This way the posteriors (saved as
                           `..._round_NUMBER...`) will be tailored to the data of that round, where `NUMBER`
                           counts up each round (modulo `num_real_rollouts`). If `num_sbi_rounds` = 1, the posterior
                           is called amortized (it has never seen any target domain data).
    :param reset_sbi_routine_each_iter: if `True` the sbi subroutine instance is recreated every iteration.
                                        Use this flag to train the posterior each iteration from scratch.
    :param reset_proposal_each_iter: flag which is only stored by the constructor, presumably it makes the sbi
                                     routine start every iteration from the initial proposal -- TODO confirm
    :param num_eval_samples: number of samples for evaluating the posterior in `eval_posterior()`, if `None` (or 0)
                             it is set to `10 * 2**len(dp_mapping)`
    :param posterior_hparam: hyper parameters for creating the posterior's density estimator
    :param subrtn_sbi_training_hparam: dict forwarded to sbi's `PosteriorEstimator.train()` function like
                                       `training_batch_size`, `learning_rate`, `retrain_from_scratch_each_round`, etc.
    :param subrtn_sbi_sampling_hparam: dict with hyper-parameters for sampling from the posterior, only stored by
                                       the constructor as `self.subrtn_sbi_sampling_hparam`
    :param simulation_batch_size: batch size forwarded to the sbi toolbox, requires batched simulator
    :param normalize_posterior: if `True` the normalization of the posterior density is enforced by sbi
    :param subrtn_policy: algorithm which performs the optimization of the behavioral policy (and value-function)
    :param subrtn_policy_snapshot_mode: snapshot mode for saving during policy optimization
    :param train_initial_policy: choose if a policy should be pretrained in the first iteration before collecting
                                 real rollouts. Choose `False`, if you want to use a pre-defined policy.
    :param thold_succ_subrtn: success threshold on the simulated system's return for the subroutine, repeat the
                              subroutine until the threshold is exceeded or for a given number of iterations
    :param warmstart: initialize the policy (and value function) parameters with the one of the previous iteration.
                      This behavior can also be overruled by passing `init_policy_params` (and
                      `valuefcn_param_init`) explicitly.
    :param num_workers: number of environments for parallel sampling
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    """
    if not isinstance(inner_env(env_sim), SimEnv) or (
        isinstance(env_sim, DomainRandWrapper) and not isinstance(env_sim, ActDelayWrapper)
    ):
        raise pyrado.TypeErr(
            msg="The given env_sim must be a non-randomized simulation environment, "
            "except for wrappers that add a domain parameter!"
        )
    if isinstance(prior, Normal):
        raise pyrado.TypeErr(
            msg="The sbi framework requires MultivariateNormal instead of Normal distributions for the prior."
        )
    if not prior.event_shape[0] == len(dp_mapping):
        # Report the expected shape as a tuple instead of passing the mapping itself
        raise pyrado.ShapeErr(given=prior.event_shape, expected_match=(len(dp_mapping),))
    if posterior_hparam is None:
        posterior_hparam = dict()
    elif not isinstance(posterior_hparam, dict):
        raise pyrado.TypeErr(given=posterior_hparam, expected_type=dict)
    if subrtn_sbi_training_hparam is None:
        subrtn_sbi_training_hparam = dict()
    elif not isinstance(subrtn_sbi_training_hparam, dict):
        raise pyrado.TypeErr(given=subrtn_sbi_training_hparam, expected_type=dict)

    # Call InterruptableAlgorithm's constructor
    super().__init__(
        num_checkpoints=num_checkpoints,
        init_checkpoint=init_checkpoint,
        save_dir=save_dir,
        max_iter=max_iter,
        policy=policy,
        logger=logger,
    )

    self._env_sim_sbi = env_sim  # will be randomized explicitly by sbi
    # Separate copy for training the policy, its domain parameter buffer is filled later and cycled through
    self._env_sim_trn = DomainRandWrapperBuffer(copy.deepcopy(env_sim), randomizer=None, selection="cyclic")
    self._env_real = env_real
    self.dp_mapping = dp_mapping
    self._embedding = embedding
    self.num_sim_per_round = num_sim_per_round
    self.num_real_rollouts = num_real_rollouts
    self.num_segments = num_segments
    self.len_segments = len_segments
    self.stop_on_done = stop_on_done
    self.use_rec_act = use_rec_act
    self.reset_sbi_routine_each_iter = reset_sbi_routine_each_iter
    self.reset_proposal_each_iter = reset_proposal_each_iter
    self.num_sbi_rounds = num_sbi_rounds
    self.num_eval_samples = num_eval_samples or 10 * 2**len(dp_mapping)
    self.simulation_batch_size = simulation_batch_size
    self.normalize_posterior = normalize_posterior
    self._subrtn_sbi = None
    self.subrtn_sbi_training_hparam = subrtn_sbi_training_hparam or dict()
    # Keep the sampling hyper-parameters instead of silently discarding the argument
    self.subrtn_sbi_sampling_hparam = subrtn_sbi_sampling_hparam or dict()
    self.posterior_hparam = posterior_hparam or dict()
    self.thold_succ_subrtn = float(thold_succ_subrtn)
    self.max_subrtn_rep = 3  # number of tries to exceed thold_succ_subrtn during training in simulation
    self.warmstart = warmstart
    self.num_workers = int(num_workers)

    # Temporary containers
    self._curr_data_real = None
    self._curr_domain_param_eval = None

    # Initialize sbi simulator and prior
    self._sbi_simulator = None  # to be set in step()
    self._sbi_prior = None  # to be set in step()
    self._setup_sbi(prior=prior)

    # Optional policy optimization subroutine
    self._subrtn_policy = subrtn_policy
    if isinstance(self._subrtn_policy, Algorithm):
        self._subrtn_policy_snapshot_mode = subrtn_policy_snapshot_mode
        self._subrtn_policy.save_name = "subrtn_policy"
        self._train_initial_policy = train_initial_policy
        # Check that the behavioral policy is the one that is being updated
        if self._subrtn_policy.policy is not self.policy:
            raise pyrado.ValueErr(
                msg="The policy is the policy subroutine is not the same as the one used by "
                "the system identification (sbi) subroutine!"
            )

    # Save initial environments, the embedding, and the prior
    pyrado.save(self._env_sim_trn, "env_sim.pkl", self._save_dir)
    pyrado.save(self._env_real, "env_real.pkl", self._save_dir)
    pyrado.save(embedding, "embedding.pt", self._save_dir)
    pyrado.save(prior, "prior.pt", self._save_dir)
    pyrado.save(policy, "init_policy.pt", self._save_dir, use_state_dict=True)
def collect_data_real(
    save_dir: Optional[pyrado.PathLike],
    env: Union[Env, str],
    policy: Policy,
    embedding: Embedding,
    num_rollouts: int,
    num_segments: int = None,
    len_segments: int = None,
    prefix: str = "",
) -> Tuple[to.Tensor, List[StepSequence]]:
    """
    Roll-out a (behavioral) policy on the target system for later use with the sbi module, and save the data
    computed from the recorded rollouts. Rollouts that were already stored in `save_dir` are loaded first, and only
    the missing ones are collected.
    This function does not depend on any instance state, which facilitates the evaluation of specific policies in
    hindsight.

    :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is loaded or saved
    :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance,
                in case you want to use pre-recorded rollouts pass the path to the parent folder as string
    :param policy: policy to evaluate
    :param embedding: embedding used for pre-processing the data before passing it to the posterior
    :param num_rollouts: number of rollouts to collect on the target system
    :param num_segments: number of segments in which the rollouts are split into. For every segment, the initial
                         state of the simulation is reset, and thus for every set the features of the trajectories
                         are computed separately. Either specify `num_segments` or `len_segments`.
    :param len_segments: length of the segments in which the rollouts are split into. For every segment, the initial
                         state of the simulation is reset, and thus for every set the features of the trajectories
                         are computed separately. Either specify `num_segments` or `len_segments`.
    :param prefix: prefix forwarded to `pyrado.load()` and `pyrado.save()`; it is also mentioned in the progress
                   message unless it is the empty string
    :return: data from the real-world rollouts a.k.a. set of $x_o$ of shape [1, num_rollouts * dim_feat],
             and the list of real-world rollouts
    :raises pyrado.TypeErr: if `env` is neither a real environment, a simulation environment, nor a string
    :raises pyrado.ShapeErr: if previously stored data and rollouts do not match, or if the final data does not
                             have the expected shape
    """
    if not (isinstance(inner_env(env), RealEnv) or isinstance(inner_env(env), SimEnv) or isinstance(env, str)):
        raise pyrado.TypeErr(given=inner_env(env), expected_type=[RealEnv, SimEnv, str])

    # Evaluate sequentially (necessary for sim-to-real experiments)
    if isinstance(env, str):
        rollout_worker = RecRolloutSamplerForSBI(env, embedding, num_segments, len_segments, rand_init_rollout=False)
    else:
        rollout_worker = RealRolloutSamplerForSBI(env, policy, embedding, num_segments, len_segments)

    # Initialize data containers
    data_real = None
    rollouts_real = None
    num_found_rollouts = 0
    if save_dir is not None:
        try:
            data_real = pyrado.load("data_real.pt", save_dir, prefix=prefix)
            rollouts_real = pyrado.load("rollouts_real.pkl", save_dir, prefix=prefix)
            # The features of all rollouts are concatenated along dim 1 (see below), hence the number of stored
            # rollouts must be checked against the second dimension, not the first one (which is always 1)
            num_feat_expected = len(rollouts_real) * embedding.dim_output
            if not data_real.shape[1] == num_feat_expected:
                raise pyrado.ShapeErr(
                    msg=f"Found {data_real.shape[1]} features in data_real.pt, but expected {num_feat_expected} "
                    f"for the {len(rollouts_real)} rollouts in rollouts_real.pkl!"
                )
            num_found_rollouts = len(rollouts_real)
            print_cbt(f"Found {num_found_rollouts} rollout(s) in {save_dir}.", "w")
        except FileNotFoundError:
            pass  # in the first attempt no files can be found

    collect_str = "Collecting data" if prefix == "" else f"Collecting data using {prefix}_policy"
    for _ in tqdm(
        range(num_found_rollouts, num_rollouts),
        total=num_rollouts,
        initial=num_found_rollouts,  # start the counter at the loaded rollouts such that the bar can reach total
        desc=Fore.CYAN + Style.BRIGHT + collect_str + Style.RESET_ALL,
        unit="rollouts",
        file=sys.stdout,
    ):
        # Do the rollout
        data, rollout = rollout_worker()

        # Fill data container
        if data_real is None or rollouts_real is None:
            data_real = data  # data is of shape [1, dim_feat]
            rollouts_real = [rollout]
        else:
            data_real = to.cat([data_real, data], dim=1)  # stack to final shape [1, num_rollouts * dim_feat]
            rollouts_real.append(rollout)

        # Optionally save the data (do this at every iteration to continue)
        if save_dir is not None:
            pyrado.save(data_real, "data_real.pt", save_dir, prefix=prefix)
            pyrado.save(rollouts_real, "rollouts_real.pkl", save_dir, prefix=prefix)

    if data_real.shape != (1, num_rollouts * embedding.dim_output):
        raise pyrado.ShapeErr(given=data_real, expected_match=(1, num_rollouts * embedding.dim_output))

    return data_real, rollouts_real
# init_state = np.array([1/np.sqrt(2), -1/np.sqrt(2)]) # init_state = np.array([0., -1.]) # init_state = np.array([-1/np.sqrt(2), -1/np.sqrt(2)]) # init_state = np.array([-1., 0.]) # init_state = np.array([-1/np.sqrt(2), 1/np.sqrt(2)]) init_state = np.array([0., 1.]) init_state *= 0.103125 # distance scaling [m] pdctrl.reset(state_des=init_state) print_cbt( f'Set up the PD-controller for the QBallBalancerReal environment.\nDesired state: {init_state}', 'c') ros = [] for r in range(args.num_runs): # Run PD-controller on the device to get the ball into position env_real = inner_env(env_real) # since we are reusing it print_cbt('Running the PD-controller ...', 'c', bright=True) rollout(env_real, pdctrl, eval=True, max_steps=2000, render_mode=RenderMode()) env_real.reset() # Wrap the real environment in the same way as done during training (do this after the PD controller finished) env_real = wrap_like_other_env(env_real, env_sim) # Run learned policy on the device print_cbt('Running the evaluation policy ...', 'c', bright=True) ros.append( rollout(env_real,
from pyrado.utils.experiments import wrap_like_other_env, load_experiment
from pyrado.utils.input_output import print_cbt
from pyrado.utils.argparser import get_argparser


if __name__ == '__main__':
    # Parse command line arguments
    args = get_argparser().parse_args()

    # Get the experiment's directory to load from
    ex_dir = ask_for_experiment()

    # Load the policy (trained in simulation) and the environment (for constructing the real-world counterpart)
    env_sim, policy, _ = load_experiment(ex_dir)

    # Pair every supported simulation class with the real-world class that is created for it
    sim_core = inner_env(env_sim)
    counterparts = (
        (QBallBalancerSim, QBallBalancerReal),
        (QCartPoleSim, QCartPoleReal),
        (QQubeSim, QQubeReal),
    )
    for sim_cls, real_cls in counterparts:
        if isinstance(sim_core, sim_cls):
            env_real = real_cls(dt=args.dt, max_steps=args.max_steps)
            break
    else:
        # None of the supported simulation classes matched
        raise pyrado.TypeErr(given=env_sim, expected_type=[QBallBalancerSim, QCartPoleSim, QQubeSim])
    print_cbt(f'Set up env {env_real.name}.', 'c')

    # Finally wrap the env in the same as done during training
    env_real = wrap_like_other_env(env_real, env_sim)

    # Run on device
def set_adv(self, params):
    """
    Write perturbed values into the domain parameters of the innermost environment.

    The names in `self._params` are paired element-wise with the entries of `params` (pairing stops at the shorter
    of the two). For every pair, the nominal value stored in `self._nominal` plus the given entry is assigned to
    the equally named item of the inner environment's `domain_param`.

    :param params: offsets which are added to the nominal values, ordered like `self._params`
    """
    for name, offset in zip(self._params, params):
        perturbed = self._nominal[name] + offset
        inner_env(self.wrapped_env).domain_param[name] = perturbed
from pyrado.utils.experiments import wrap_like_other_env, load_experiment
from pyrado.utils.input_output import print_cbt
from pyrado.utils.argparser import get_argparser


if __name__ == '__main__':
    # Parse command line arguments
    args = get_argparser().parse_args()

    # Get the experiment's directory to load from
    ex_dir = ask_for_experiment()

    # Load the policy (trained in simulation) and the environment (for constructing the real-world counterpart)
    env_sim, policy, _ = load_experiment(ex_dir)

    # Only the ball-in-cup simulation has a real-world counterpart in this script
    if not isinstance(inner_env(env_sim), WAMBallInCupSim):
        raise pyrado.TypeErr(given=env_sim, expected_type=WAMBallInCupSim)

    # If `max_steps` (or `dt`) are not explicitly set using `args`, use the same as in the simulation
    if args.max_steps < pyrado.inf:
        max_steps = args.max_steps
    else:
        max_steps = env_sim.max_steps
    if args.dt is not None:
        dt = args.dt
    else:
        dt = env_sim.dt
    env_real = WAMBallInCupReal(dt=dt, max_steps=max_steps)

    # Finally wrap the env in the same as done during training
    env_real = wrap_like_other_env(env_real, env_sim)

    # Run on device
    # NOTE(review): `done` is never updated within the visible loop body, i.e. the loop only ends by an exception
    # or an interrupt. Presumably a query that sets `done` follows here; confirm against the complete script.
    done = False
    while not done:
        ro = rollout(env_real, policy, eval=True)
        print_cbt(f'Return: {ro.undiscounted_return()}', 'g', bright=True)
def plot_actions(ro: StepSequence, env: Env):
    """
    Plot every action dimension of the given rollout over time, using one subplot per dimension.

    Each subplot shows the actions as they are passed to the environment (de-normalized if an `ActNormWrapper` is
    found in the environment chain) together with the same actions after being clipped by `limit_act()`.
    Nothing is plotted if the rollout has no `actions` attribute.

    :param ro: input rollout
    :param env: environment (used for getting the clipped action values)
    :raises pyrado.TypeErr: if the rollout's actions are not stored as a numpy array
    """
    if hasattr(ro, "actions"):
        actions = ro.actions
        if not isinstance(actions, np.ndarray):
            raise pyrado.TypeErr(given=actions, expected_type=np.ndarray)
        dim_act = actions.shape[1]

        # Use recorded time stamps if possible, the last time stamp is dropped in both cases
        fallback_time = np.arange(0, ro.length + 1)
        t = getattr(ro, "time", fallback_time)[:-1]

        # Create the grid of subplots and make sure that it can be indexed by row and column
        num_rows, num_cols = num_rows_cols_from_length(dim_act, transposed=True)
        fig, axs = plt.subplots(num_rows, num_cols, figsize=(10, 8), tight_layout=True)
        fig.canvas.manager.set_window_title("Actions over Time")
        axs = correct_atleast_2d(np.atleast_2d(axs))
        colors = plt.get_cmap("tab20")(np.linspace(0, 1, dim_act))

        if typed_env(env, ActNormWrapper) is None:
            # The actions are passed on as they are, and clipped by the given environment
            act_denorm = actions
            act_clipped = np.array([env.limit_act(act) for act in actions])
        else:
            # Map the actions from [-1, 1] to the bounds of the innermost environment before clipping them
            core_env = inner_env(env)
            lb, ub = core_env.act_space.bounds
            act_denorm = lb + (actions + 1.0) * (ub - lb) / 2
            act_clipped = np.array([core_env.limit_act(act) for act in act_denorm])

        if dim_act == 1:
            ax = axs[0, 0]
            ax.plot(t, act_denorm, label="to env")
            ax.plot(t, act_clipped, label="clipped", c="k", ls="--")
            ax.legend(ncol=2)
            ax.set_ylabel(_get_act_label(ro, 0))
        else:
            for idx_a in range(dim_act):
                # Fill the grid row by row
                ax = axs[divmod(idx_a, num_cols)]
                ax.plot(t, act_denorm[:, idx_a], label="to env", c=colors[idx_a])
                ax.plot(t, act_clipped[:, idx_a], label="clipped", c="k", ls="--")
                ax.legend(ncol=2)
                ax.set_ylabel(_get_act_label(ro, idx_a))

        # Create the legends once more for all axes of the figure
        if dim_act < 8:  # otherwise it gets too cluttered
            for fig_ax in fig.get_axes():
                fig_ax.legend(ncol=2)

        plt.subplots_adjust(hspace=0.2)
def experiment_w_distruber(env_real: RealEnv, env_sim: SimEnv):
    """
    Run the learned policy on the device in three parts, which are separated by two disturber rollouts.

    The policy rollouts are executed on the real environment wrapped like the simulation environment, while the
    disturber rollouts are executed on the unwrapped (inner) real environment. None of the rollouts resets or
    closes the environment. The names `policy`, `args`, `disturber_pos`, `disturber_neg`, and `steps_disturb` are
    not parameters, they are taken from the enclosing (module) scope.

    :param env_real: real-world environment to run on
    :param env_sim: simulation environment, used as a template for wrapping the real environment
    :return: concatenation of the three policy rollouts, the disturber rollouts are not included
    """

    def _policy_part(env):
        # Wrap the environment in the same as done during training, then run the learned policy on the device
        env = wrap_like_other_env(env, env_sim)
        print_cbt('Running the evaluation policy ...', 'c', bright=True)
        part = rollout(
            env,
            policy,
            eval=True,
            max_steps=args.max_steps // 3,
            render_mode=RenderMode(),
            no_reset=True,
            no_close=True,
        )
        return env, part

    def _disturb(env, disturber, message):
        # Strip the wrappers since the environment is reused, then run the given disturber
        env = inner_env(env)
        print_cbt(message, 'c', bright=True)
        rollout(
            env,
            disturber,
            eval=True,
            max_steps=steps_disturb,
            render_mode=RenderMode(),
            no_reset=True,
            no_close=True,
        )
        return env

    env_real, part_1 = _policy_part(env_real)
    env_real = _disturb(env_real, disturber_pos, 'Running the 1st disturber ...')
    env_real, part_2 = _policy_part(env_real)
    env_real = _disturb(env_real, disturber_neg, 'Running the 2nd disturber ...')
    env_real, part_3 = _policy_part(env_real)

    return StepSequence.concat([part_1, part_2, part_3])
# Parse command line arguments
parser = get_argparser()
parser.add_argument("--render3D", action="store_true", default=False, help="render the GP in 3D")
args = parser.parse_args()
plt.rc("text", usetex=args.use_tex)

# Get the experiment's directory to load from
ex_dir = ask_for_experiment(hparam_list=args.show_hparams) if args.dir is None else args.dir

# Load the simulation environment, which must contain a MetaDomainRandWrapper to provide the domain parameter mapping
env_sim = joblib.load(osp.join(ex_dir, "env_sim.pkl"))
if typed_env(env_sim, MetaDomainRandWrapper) is None:
    raise pyrado.TypeErr(given=env_sim, expected_type=MetaDomainRandWrapper)

# Names of the domain parameters that belong to the selected indices
labels_sel_dims = [env_sim.dp_mapping[idx][0] for idx in args.idcs]

env_real = joblib.load(osp.join(ex_dir, "env_real.pkl"))
if isinstance(inner_env(env_real), SimEnv):
    # Use actual ground truth domain param if sim-2-sim setting
    domain_params = env_real.domain_param
else:
    # Use nominal domain param if sim-2-real setting
    domain_params = inner_env(env_sim).get_nominal_domain_param()

# Look up the ground truth values of the selected dimensions. Both values stay `None` if there is no match, and the
# y-value is always `None` if only one dimension was selected.
gt_val_x = None
gt_val_y = None
has_y_dim = len(labels_sel_dims) > 1
for dp_name, dp_val in domain_params.items():
    # Compare for equality, since a membership test between two strings would also match sub-strings
    if dp_name == labels_sel_dims[0]:
        gt_val_x = dp_val
    if has_y_dim and dp_name == labels_sel_dims[1]:
        gt_val_y = dp_val

cands = pyrado.load("candidates.pt", ex_dir)
from pyrado.environments.rcspysim.base import RcsSim
from pyrado.logger.experiment import ask_for_experiment
from pyrado.utils.argparser import get_argparser
from pyrado.utils.experiments import load_experiment
from pyrado.utils.input_output import print_cbt


if __name__ == '__main__':
    # Parse command line arguments
    args = get_argparser().parse_args()

    # Get the experiment's directory to load from, only ask the user if it was not passed explicitly
    if args.ex_dir is None:
        ex_dir = ask_for_experiment()
    else:
        ex_dir = args.ex_dir

    # Load the policy (trained in simulation)
    env, policy, _ = load_experiment(ex_dir)

    # Use torch.jit.trace / torch.jit.script (the latter if recurrent) to generate a torch.jit.ScriptModule
    ts_module = policy.script()  # can be evaluated like a regular PyTorch module

    # Serialize the script module to a file and save it in the same directory we loaded the policy from
    policy_export_file = osp.join(ex_dir, 'policy_export.pt')  # former .zip, and before that .pth
    ts_module.save(policy_export_file)
    print_cbt(f'Exported the loaded policy to\n{policy_export_file}', 'g', bright=True)

    # Export the experiment config for C++, this is only done for Rcs-based simulations
    sim_core = inner_env(env)
    if isinstance(sim_core, RcsSim):
        exp_export_file = osp.join(ex_dir, f'ex_{env.name}_export.xml')
        sim_core.save_config_xml(exp_export_file)
        print_cbt(f'Exported experiment configuration to\n{exp_export_file}', 'g', bright=True)
def __init__(
    self,
    env: Union[SimEnv, EnvWrapper],
    policy: Policy,
    num_init_states_per_domain: int,
    num_domains: int,
    num_workers: int,
    seed: Optional[int] = None,
):
    """
    Constructor

    Validates the rollout counts, strips all domain randomization wrappers from the environment (the outermost
    one is kept as a separate reference), creates the pool of workers, and initializes every worker with a
    pickled copy of the environment and the policy.

    :param env: environment to sample from
    :param policy: policy used for sampling
    :param num_init_states_per_domain: number of rollouts to cover the variance over initial states
    :param num_domains: number of rollouts due to the variance over domain parameters
    :param num_workers: number of parallel samplers
    :param seed: seed value for the random number generators, pass `None` for no seeding; defaults to the last seed
                 that was set with `pyrado.set_seed`
    :raises pyrado.TypeErr: if `num_init_states_per_domain` or `num_domains` is not an integer
    :raises pyrado.ValueErr: if `num_init_states_per_domain` or `num_domains` is smaller than 1
    """
    if not isinstance(num_init_states_per_domain, int):
        raise pyrado.TypeErr(given=num_init_states_per_domain, expected_type=int)
    if num_init_states_per_domain < 1:
        raise pyrado.ValueErr(given=num_init_states_per_domain, ge_constraint="1")
    if not isinstance(num_domains, int):
        raise pyrado.TypeErr(given=num_domains, expected_type=int)
    if num_domains < 1:
        raise pyrado.ValueErr(given=num_domains, ge_constraint="1")

    # This call receives locals(), hence no additional local variable must be defined before it
    Serializable._init(self, locals())

    # Check environment for domain randomization wrappers (stops after finding the outermost)
    dr_wrapper = typed_env(env, DomainRandWrapper)
    self._dr_wrapper = dr_wrapper
    if dr_wrapper is not None:
        assert isinstance(inner_env(env), SimEnv)
        # Remove them all from the env chain since we sample the domain parameter later explicitly
        env = remove_all_dr_wrappers(env)

    self.env = env
    self.policy = policy
    self.num_init_states_per_domain = num_init_states_per_domain
    self.num_domains = num_domains

    # Enforce the "spawn" start method, unless it is already set (the original note relates this to using cuda)
    current_start_method = mp.get_start_method(allow_none=True)
    if current_start_method != "spawn":
        mp.set_start_method("spawn", force=True)

    # Create parallel pool. We use one thread per environment because it's easier.
    self.pool = SamplerPool(num_workers)

    # NOTE(review): the default value of `seed` is `None`, so the base seed is only fetched if the caller passes
    # `NO_SEED_PASSED` explicitly (unless that sentinel is `None` itself); confirm that this is intended
    self._seed = pyrado.get_base_seed() if seed is NO_SEED_PASSED else seed

    # Initialize with -1 such that we start with the 0-th sample. Incrementing after sampling may cause issues when
    # the sampling crashes and the sample count is not incremented.
    self._sample_count = -1

    # Distribute environments. We use pickle to make sure a copy is created for n_envs = 1
    pickled_env = pickle.dumps(self.env)
    pickled_policy = pickle.dumps(self.policy)
    self.pool.invoke_all(_pes_init, pickled_env, pickled_policy)