def __init__(
    self,
    env,
    policy,
    num_workers: int,
    *,
    min_rollouts: int = None,
    min_steps: int = None,
    show_progress_bar: bool = True,
    seed: int = NO_SEED_PASSED,
):
    """
    Constructor

    :param env: environment to sample rollouts from
    :param policy: policy (can also be an exploration strategy) that acts in the environment
    :param num_workers: number of parallel samplers
    :param min_rollouts: minimum number of complete rollouts to sample
    :param min_steps: minimum total number of steps to sample
    :param show_progress_bar: if `True`, display a progress bar using `tqdm`
    :param seed: seed value for the random number generators, pass `None` for no seeding; defaults to the last
                 seed that was set with `pyrado.set_seed`
    """
    Serializable._init(self, locals())
    super().__init__(min_rollouts=min_rollouts, min_steps=min_steps)

    self.env = env
    self.policy = policy
    self.show_progress_bar = show_progress_bar

    # CUDA tensors can only be shared between processes that were started via spawn
    if mp.get_start_method(allow_none=True) != "spawn":
        mp.set_start_method("spawn", force=True)

    # One worker thread per environment keeps the pool's bookkeeping simple
    self.pool = SamplerPool(num_workers)

    # Resolve the seed sentinel: an omitted seed falls back to pyrado's base seed
    self._seed = pyrado.get_base_seed() if seed is NO_SEED_PASSED else seed

    # Start at -1 so the first sampling run uses count 0; incrementing after sampling could leave the counter
    # out of sync if a run crashes before the increment happens
    self._sample_count = -1

    # Ship pickled copies of env and policy to every worker (guarantees a true copy even for a single env)
    self.pool.invoke_all(_ps_init, pickle.dumps(self.env), pickle.dumps(self.policy))
def __init__(self, env: Env, policy: Policy, num_workers: int, num_rollouts_per_param: int, seed: int = None):
    """
    Constructor

    :param env: environment to sample from
    :param policy: policy used for sampling
    :param num_workers: number of parallel samplers
    :param num_rollouts_per_param: number of rollouts per policy parameter set (and init state if specified)
    :param seed: seed value for the random number generators, pass `None` for no seeding
    """
    # Validate the rollout count before touching any state
    if not isinstance(num_rollouts_per_param, int):
        raise pyrado.TypeErr(given=num_rollouts_per_param, expected_type=int)
    if num_rollouts_per_param < 1:
        raise pyrado.ValueErr(given=num_rollouts_per_param, ge_constraint='1')

    Serializable._init(self, locals())

    # Remember the outermost domain randomization wrapper, then strip all of them from the chain,
    # since the domain parameters are sampled explicitly later on
    self._dr_wrapper = typed_env(env, DomainRandWrapper)
    if self._dr_wrapper is not None:
        assert isinstance(inner_env(env), SimEnv)
        env = remove_all_dr_wrappers(env)

    self.env = env
    self.policy = policy
    self.num_rollouts_per_param = num_rollouts_per_param

    # One worker thread per environment keeps the parallel pool simple
    self.pool = SamplerPool(num_workers)

    # Seed every worker's random number generators if requested
    if seed is not None:
        self.pool.set_seed(seed)

    # Hand pickled copies of env and policy to the workers (ensures a true copy even for a single env)
    self.pool.invoke_all(_pes_init, pickle.dumps(self.env), pickle.dumps(self.policy))
def __init__(
    self,
    wrapped_env: Env,
    parameters: Sequence[DomainParam],
    inner_policy: Policy,
    discriminator,
    step_length: float = 0.01,
    horizon: int = 50,
    num_rollouts_per_config: int = 8,
    num_workers: int = 4,
):
    """
    Constructor

    :param wrapped_env: the environment to wrap
    :param parameters: the physics parameters that should be randomized
    :param inner_policy: the policy the subroutine is trained on
    :param discriminator: discriminator telling reference environments apart from randomized ones
    :param step_length: the step size
    :param horizon: an svpg horizon
    :param num_rollouts_per_config: number of trajectories to sample per physics configuration
    :param num_workers: number of environments for parallel sampling
    """
    Serializable._init(self, locals())
    EnvWrapper.__init__(self, wrapped_env)

    # Randomized physics parameters plus the policy/discriminator used by the subroutine
    self.parameters: Sequence[DomainParam] = parameters
    self.inner_policy = inner_policy
    self.discriminator = discriminator

    # Sampling configuration
    self.pool = SamplerPool(num_workers)
    self.num_trajs = num_rollouts_per_config
    self.svpg_max_step_length = step_length
    self.max_steps = 8

    # SVPG bookkeeping
    self.svpg_state = None
    self.count = 0
    self.horizon = horizon
    self.horizon_count = 0

    # Normalized adapter spaces: one unit-bounded dimension per randomized parameter
    num_param = len(parameters)
    self._adapter_obs_space = BoxSpace(-np.ones(num_param), np.ones(num_param))
    self._adapter_act_space = BoxSpace(-np.ones(num_param), np.ones(num_param))
def __init__(self, env, policy, num_workers: int, *,
             min_rollouts: int = None, min_steps: int = None,
             show_progress_bar: bool = True, seed: int = None):
    """
    Constructor

    :param env: environment to sample rollouts from
    :param policy: policy (can also be an exploration strategy) that acts in the environment
    :param num_workers: number of parallel samplers
    :param min_rollouts: minimum number of complete rollouts to sample
    :param min_steps: minimum total number of steps to sample
    :param show_progress_bar: if `True`, display a progress bar using `tqdm`
    :param seed: seed value for the random number generators, pass `None` for no seeding
    """
    Serializable._init(self, locals())
    super().__init__(min_rollouts=min_rollouts, min_steps=min_steps)

    self.env = env
    self.policy = policy
    self.show_progress_bar = show_progress_bar

    # CUDA tensors require the spawn start method to be shared across processes
    if self.policy.device == 'cuda':
        mp.set_start_method('spawn', force=True)

    # One worker thread per environment keeps the pool simple
    self.pool = SamplerPool(num_workers)

    # Seed the random number generators if requested
    if seed is not None:
        self.set_seed(seed)

    # Ship pickled copies of env and policy to every worker (guarantees a true copy even for n_envs=1)
    self.pool.invoke_all(_ps_init, pickle.dumps(self.env), pickle.dumps(self.policy))
def __init__(self, env, policy, num_envs: int, *,
             min_rollouts: int = None, min_steps: int = None,
             bernoulli_reset: bool = None, seed: int = None):
    """
    Constructor

    :param env: environment to sample rollouts from
    :param policy: policy (can also be an exploration strategy) that acts in the environment
    :param num_envs: number of parallel samplers
    :param min_rollouts: minimum number of complete rollouts to sample.
    :param min_steps: minimum total number of steps to sample.
    :param bernoulli_reset: probability for resetting after the current time step
    :param seed: seed to use; every subprocess is seeded with seed+thread_number
    """
    Serializable._init(self, locals())
    super().__init__(min_rollouts=min_rollouts, min_steps=min_steps)

    self.env = env
    self.policy = policy
    self.bernoulli_reset = bernoulli_reset

    # CUDA tensors require the spawn start method to be shared across processes
    if self.policy.device == 'cuda':
        mp.set_start_method('spawn', force=True)

    # One worker thread per environment keeps the pool simple
    self.pool = SamplerPool(num_envs)

    # Seed the workers' random number generators if requested
    if seed is not None:
        self.pool.set_seed(seed)

    # Ship pickled copies of env, policy, and reset probability to every worker (true copy even for n_envs=1)
    self.pool.invoke_all(_ps_init, pickle.dumps(self.env), pickle.dumps(self.policy),
                         pickle.dumps(self.bernoulli_reset))
def __init__(self,
             wrapped_env: Env,
             parameters: ADR.PhysicsParameters,
             inner_policy: Policy,
             discriminator: RewardGenerator,
             step_length=0.01,
             horizon=50,
             num_trajs_per_config=8,
             num_sampler_envs: int = 4):
    """
    Constructor

    :param wrapped_env: the environment to wrap
    :param parameters: which physics parameters should be randomized
    :param inner_policy: the policy to train the subroutine on
    :param discriminator: the discriminator to distinguish reference envs from randomized ones
    :param step_length: the step size
    :param horizon: an svpg horizon
    :param num_trajs_per_config: number of trajectories to sample per physics configuration
    :param num_sampler_envs: the number of samplers operating in parallel
    """
    Serializable._init(self, locals())
    EnvWrapper.__init__(self, wrapped_env)
    self.parameters = parameters
    self.pool = SamplerPool(num_sampler_envs)
    self.inner_policy = inner_policy
    self.state = None  # SVPG state; initialized lazily elsewhere
    self.count = 0
    self.num_trajs = num_trajs_per_config
    self.svpg_max_step_length = step_length
    self.discriminator = discriminator
    self.max_steps = 8  # NOTE(review): hard-coded adapter episode length — confirm this is intentional
    # Normalized observation/action spaces: one unit-bounded dimension per randomized physics parameter
    self._adapter_obs_space = BoxSpace(-np.ones(self.parameters.length), np.ones(self.parameters.length))
    self._adapter_act_space = BoxSpace(-np.ones(self.parameters.length), np.ones(self.parameters.length))
    self.horizon = horizon
    self.horizon_count = 0
def __init__(self,
             save_dir: str,
             env: Env,
             subroutine: Algorithm,
             max_iter: int,
             svpg_particle_hparam: dict,
             num_svpg_particles: int,
             num_discriminator_epoch: int,
             batch_size: int,
             svpg_learning_rate: float = 3e-4,
             svpg_temperature: float = 10,
             svpg_evaluation_steps: int = 10,
             svpg_horizon: int = 50,
             svpg_kl_factor: float = 0.03,
             svpg_warmup: int = 0,
             svpg_serial: bool = False,
             num_sampler_envs: int = 4,
             num_trajs_per_config: int = 8,
             max_step_length: float = 0.05,
             randomized_params=None,
             logger: StepLogger = None):
    """
    Constructor

    :param save_dir: directory to save the snapshots i.e. the results in
    :param env: the environment to train in
    :param subroutine: algorithm which performs the policy / value-function optimization
    :param max_iter: maximum number of iterations
    :param svpg_particle_hparam: SVPG particle hyperparameters
    :param num_svpg_particles: number of SVPG particles
    :param num_discriminator_epoch: epochs in discriminator training
    :param batch_size: batch size for training
    :param svpg_learning_rate: SVPG particle optimizers' learning rate
    :param svpg_temperature: SVPG temperature coefficient (how strong is the influence of the particles on each other)
    :param svpg_evaluation_steps: how many configurations to sample between training
    :param svpg_horizon: how many steps until the particles are reset
    :param svpg_kl_factor: kl reward coefficient
    :param svpg_warmup: number of iterations without SVPG training in the beginning
    :param svpg_serial: serial mode (see SVPG)
    :param num_sampler_envs: number of environments for parallel sampling
    :param num_trajs_per_config: number of trajectories to sample from each config
    :param max_step_length: maximum change of physics parameters per step
    :param randomized_params: which parameters to randomize
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    """
    # Reject invalid inputs before constructing any state
    if not isinstance(env, Env):
        raise pyrado.TypeErr(given=env, expected_type=Env)
    if not isinstance(subroutine, Algorithm):
        raise pyrado.TypeErr(given=subroutine, expected_type=Algorithm)
    if not isinstance(subroutine.policy, Policy):
        raise pyrado.TypeErr(given=subroutine.policy, expected_type=Policy)

    # Call Algorithm's constructor
    super().__init__(save_dir, max_iter, subroutine.policy, logger)

    self.log_loss = True

    # Store the inputs
    self.env = env
    self.subroutine = subroutine
    self.num_particles = num_svpg_particles
    self.num_discriminator_epoch = num_discriminator_epoch
    self.batch_size = batch_size
    self.num_trajs_per_config = num_trajs_per_config
    self.warm_up_time = svpg_warmup
    self.svpg_evaluation_steps = svpg_evaluation_steps
    self.svpg_temperature = svpg_temperature
    self.svpg_lr = svpg_learning_rate
    self.svpg_max_step_length = max_step_length
    self.svpg_horizon = svpg_horizon
    self.svpg_kl_factor = svpg_kl_factor

    # Get the number of params
    self.params = self.PhysicsParameters(env, randomized_params)
    self.num_params = self.params.length

    # Initialize the sampler
    self.pool = SamplerPool(num_sampler_envs)

    # Initialize reward generator
    self.reward_generator = RewardGenerator(
        env.spec, self.batch_size, reward_multiplier=1, lr=1e-3, logger=self.logger
    )

    # Initialize step counter
    self.curr_time_step = 0

    # Initialize logbook; random values are placeholders until real configurations are logged
    self.sim_instances_full_horizon = np.random.random_sample(
        (self.num_particles, self.svpg_horizon, self.svpg_evaluation_steps, self.num_params)
    )

    # Wrap the environment so the SVPG particles can act on the physics parameters
    self.svpg_wrapper = SVPGAdapter(
        env,
        self.params,
        subroutine.expl_strat,
        self.reward_generator,
        horizon=self.svpg_horizon,
        num_trajs_per_config=self.num_trajs_per_config,
        num_sampler_envs=num_sampler_envs,
    )

    # Initialize SVPG
    self.svpg = SVPG(
        save_dir,
        self.svpg_wrapper,
        svpg_particle_hparam,
        max_iter,
        self.num_particles,
        self.svpg_temperature,
        self.svpg_lr,
        self.svpg_horizon,
        serial=svpg_serial,
        num_sampler_envs=num_sampler_envs,
        logger=logger
    )
def __init__(
    self,
    env: Union[SimEnv, EnvWrapper],
    policy: Policy,
    num_init_states_per_domain: int,
    num_domains: int,
    num_workers: int,
    seed: Optional[int] = NO_SEED_PASSED,
):
    """
    Constructor

    :param env: environment to sample from
    :param policy: policy used for sampling
    :param num_init_states_per_domain: number of rollouts to cover the variance over initial states
    :param num_domains: number of rollouts due to the variance over domain parameters
    :param num_workers: number of parallel samplers
    :param seed: seed value for the random number generators, pass `None` for no seeding; defaults to the last seed
                 that was set with `pyrado.set_seed`
    """
    if not isinstance(num_init_states_per_domain, int):
        raise pyrado.TypeErr(given=num_init_states_per_domain, expected_type=int)
    if num_init_states_per_domain < 1:
        raise pyrado.ValueErr(given=num_init_states_per_domain, ge_constraint="1")
    if not isinstance(num_domains, int):
        raise pyrado.TypeErr(given=num_domains, expected_type=int)
    if num_domains < 1:
        raise pyrado.ValueErr(given=num_domains, ge_constraint="1")

    Serializable._init(self, locals())

    # Check environment for domain randomization wrappers (stops after finding the outermost)
    self._dr_wrapper = typed_env(env, DomainRandWrapper)
    if self._dr_wrapper is not None:
        assert isinstance(inner_env(env), SimEnv)
        # Remove them all from the env chain since we sample the domain parameter later explicitly
        env = remove_all_dr_wrappers(env)

    self.env, self.policy = env, policy
    self.num_init_states_per_domain = num_init_states_per_domain
    self.num_domains = num_domains

    # Set method to spawn if using cuda
    if mp.get_start_method(allow_none=True) != "spawn":
        mp.set_start_method("spawn", force=True)

    # Create parallel pool. We use one thread per environment because it's easier.
    self.pool = SamplerPool(num_workers)

    # BUG FIX: the default was `None`, which made the `NO_SEED_PASSED` branch below unreachable when the argument
    # was omitted, contradicting the documented behavior (defaulting to `pyrado.get_base_seed()`). The default is
    # now the sentinel itself; passing `None` explicitly still disables seeding.
    if seed is NO_SEED_PASSED:
        seed = pyrado.get_base_seed()
    self._seed = seed

    # Initialize with -1 such that we start with the 0-th sample. Incrementing after sampling may cause issues
    # when the sampling crashes and the sample count is not incremented.
    self._sample_count = -1

    # Distribute environments. We use pickle to make sure a copy is created for n_envs = 1
    self.pool.invoke_all(_pes_init, pickle.dumps(self.env), pickle.dumps(self.policy))
env_sim_list = []
policy_list = []
for ex_dir in ex_dirs:
    env_sim, policy, _ = load_experiment(ex_dir, args)
    policy_list.append(policy)
    # BUG FIX: the loaded simulation environments were never collected, so `zip(env_sim_list, policy_list)`
    # below was always empty and no policy ever got evaluated
    env_sim_list.append(env_sim)

# Fix initial state (set to None if it should not be fixed)
init_state_list = [None] * args.num_ro_per_config

# Create empty data frame
df = pd.DataFrame(columns=['policy', 'ret', 'len'])

# Evaluate all policies
for i, (env_sim, policy) in enumerate(zip(env_sim_list, policy_list)):
    # Create a new sampler pool for every policy to synchronize the random seeds i.e. init states
    pool = SamplerPool(args.num_workers)

    # Seed the sampler
    if args.seed is not None:
        pool.set_seed(args.seed)
        print_cbt(f"Set the random number generators' seed to {args.seed}.", 'w')
    else:
        print_cbt('No seed was set', 'y')

    # Add the same wrappers as during training
    env = wrap_like_other_env(env, env_sim)

    # Sample rollouts
    ros = eval_randomized_domain(pool, env, pert, policy, init_state_list)  # internally calls DomainRandWrapperLive

    # Compute results metrics
policies.append(policy) # Create one-dim results grid and ensure right number of rollouts param_list = param_grid(param_spec) param_list *= args.num_ro_per_config # Fix initial state (set to None if it should not be fixed) init_state = None # Crate empty data frame df = pd.DataFrame(columns=['policy', 'ret', 'len', varied_param_key]) # Evaluate all policies for i, policy in enumerate(policies): # Create a new sampler pool for every policy to synchronize the random seeds i.e. init states pool = SamplerPool(args.num_envs) # Seed the sampler if args.seed is not None: pool.set_seed(args.seed) print_cbt(f'Set seed to {args.seed}', 'y') else: print_cbt('No seed was set', 'r', bright=True) # Add an action normalization wrapper if the policy was trained with one env = conditional_actnorm_wrapper(env, ex_dirs, i) # Sample rollouts ros = eval_domain_params(pool, env, policy, param_list, init_state) # Compute results metrics
def __init__(
    self,
    save_dir: pyrado.PathLike,
    env: Env,
    subrtn: Algorithm,
    max_iter: int,
    svpg_particle_hparam: dict,
    num_svpg_particles: int,
    num_discriminator_epoch: int,
    batch_size: int,
    svpg_learning_rate: float = 3e-4,
    svpg_temperature: float = 10,
    svpg_evaluation_steps: int = 10,
    svpg_horizon: int = 50,
    svpg_kl_factor: float = 0.03,
    svpg_warmup: int = 0,
    svpg_serial: bool = False,
    num_workers: int = 4,
    num_trajs_per_config: int = 8,
    max_step_length: float = 0.05,
    randomized_params: Sequence[str] = None,
    logger: Optional[StepLogger] = None,
):
    """
    Constructor

    :param save_dir: directory to save the snapshots i.e. the results in
    :param env: the environment to train in
    :param subrtn: algorithm which performs the policy / value-function optimization
    :param max_iter: maximum number of iterations
    :param svpg_particle_hparam: SVPG particle hyperparameters
    :param num_svpg_particles: number of SVPG particles
    :param num_discriminator_epoch: epochs in discriminator training
    :param batch_size: batch size for training
    :param svpg_learning_rate: SVPG particle optimizers' learning rate
    :param svpg_temperature: SVPG temperature coefficient (how strong is the influence of the particles on each
                             other)
    :param svpg_evaluation_steps: how many configurations to sample between training
    :param svpg_horizon: how many steps until the particles are reset
    :param svpg_kl_factor: kl reward coefficient
    :param svpg_warmup: number of iterations without SVPG training in the beginning
    :param svpg_serial: serial mode (see SVPG)
    :param num_workers: number of environments for parallel sampling
    :param num_trajs_per_config: number of trajectories to sample from each config
    :param max_step_length: maximum change of physics parameters per step
    :param randomized_params: which parameters to randomize; pass `None` or an empty list to randomize all
                              nominal domain parameters of the wrapped environment
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    """
    if not isinstance(env, Env):
        raise pyrado.TypeErr(given=env, expected_type=Env)
    if not isinstance(subrtn, Algorithm):
        raise pyrado.TypeErr(given=subrtn, expected_type=Algorithm)
    if not isinstance(subrtn.policy, Policy):
        raise pyrado.TypeErr(given=subrtn.policy, expected_type=Policy)

    # Call Algorithm's constructor
    super().__init__(save_dir, max_iter, subrtn.policy, logger)

    self.log_loss = True

    # Store the inputs
    self.env = env
    self._subrtn = subrtn
    self._subrtn.save_name = "subrtn"
    self.num_particles = num_svpg_particles
    self.num_discriminator_epoch = num_discriminator_epoch
    self.batch_size = batch_size
    self.num_trajs_per_config = num_trajs_per_config
    self.warm_up_time = svpg_warmup
    self.svpg_evaluation_steps = svpg_evaluation_steps
    self.svpg_temperature = svpg_temperature
    self.svpg_lr = svpg_learning_rate
    self.svpg_max_step_length = max_step_length
    self.svpg_horizon = svpg_horizon
    self.svpg_kl_factor = svpg_kl_factor
    self.pool = SamplerPool(num_workers)
    self.curr_time_step = 0

    # Get the number of params
    # BUG FIX: the default `randomized_params=None` previously crashed in the list comprehension below, because
    # only an empty *list* triggered the fallback to the nominal domain parameters. `None` is now treated the
    # same way as an empty list.
    if randomized_params is None or len(randomized_params) == 0:
        randomized_params = inner_env(self.env).get_nominal_domain_param().keys()
    self.params = [DomainParam(param, 1) for param in randomized_params]
    self.num_params = len(self.params)

    # Initialize reward generator
    self.reward_generator = RewardGenerator(env.spec, self.batch_size, reward_multiplier=1, lr=1e-3, logger=self.logger)

    # Initialize logbook; random values are placeholders until real configurations are logged
    self.sim_instances_full_horizon = np.random.random_sample(
        (self.num_particles, self.svpg_horizon, self.svpg_evaluation_steps, self.num_params))

    # Wrap the environment so the SVPG particles can act on the physics parameters, then create the SVPG subroutine
    self.svpg_wrapper = SVPGAdapter(
        env,
        self.params,
        subrtn.expl_strat,
        self.reward_generator,
        horizon=self.svpg_horizon,
        num_rollouts_per_config=self.num_trajs_per_config,
        num_workers=num_workers,
    )
    self.svpg = SVPG(
        save_dir,
        self.svpg_wrapper,
        svpg_particle_hparam,
        max_iter,
        self.num_particles,
        self.svpg_temperature,
        self.svpg_lr,
        self.svpg_horizon,
        serial=svpg_serial,
        num_workers=num_workers,
        logger=logger,
    )
    self.svpg.save_name = "subrtn_svpg"
def evaluate_policy(args, ex_dir):
    """Helper function to evaluate the policy from an experiment in the associated environment."""
    env, policy, _ = load_experiment(ex_dir, args)

    # Create multi-dim evaluation grid
    param_spec = dict()
    param_spec_dim = None

    if isinstance(inner_env(env), BallOnPlateSim):
        param_spec["ball_radius"] = np.linspace(0.02, 0.08, num=2, endpoint=True)
        param_spec["ball_rolling_friction_coefficient"] = np.linspace(0.0295, 0.9, num=2, endpoint=True)

    elif isinstance(inner_env(env), QQubeSwingUpSim):
        eval_num = 200
        # Use nominal values for all other parameters.
        for param, nominal_value in env.get_nominal_domain_param().items():
            param_spec[param] = nominal_value
        # param_spec["gravity_const"] = np.linspace(5.0, 15.0, num=eval_num, endpoint=True)
        param_spec["damping_pend_pole"] = np.linspace(0.0, 0.0001, num=eval_num, endpoint=True)
        param_spec["damping_rot_pole"] = np.linspace(0.0, 0.0006, num=eval_num, endpoint=True)
        param_spec_dim = 2

    elif isinstance(inner_env(env), QBallBalancerSim):
        # Only plate_length and arm_radius are active (held at their nominal value); the remaining
        # candidate parameters are kept here commented out for future sweeps
        # param_spec["gravity_const"] = np.linspace(7.91, 11.91, num=11, endpoint=True)
        # param_spec["ball_mass"] = np.linspace(0.003, 0.3, num=11, endpoint=True)
        # param_spec["ball_radius"] = np.linspace(0.01, 0.1, num=11, endpoint=True)
        param_spec["plate_length"] = np.linspace(0.275, 0.275, num=11, endpoint=True)
        param_spec["arm_radius"] = np.linspace(0.0254, 0.0254, num=11, endpoint=True)
        # param_spec["load_inertia"] = np.linspace(5.2822e-5*0.5, 5.2822e-5*1.5, num=11, endpoint=True)
        # param_spec["motor_inertia"] = np.linspace(4.6063e-7*0.5, 4.6063e-7*1.5, num=11, endpoint=True)
        # param_spec["gear_ratio"] = np.linspace(60, 80, num=11, endpoint=True)
        # param_spec["gear_efficiency"] = np.linspace(0.6, 1.0, num=11, endpoint=True)
        # param_spec["motor_efficiency"] = np.linspace(0.49, 0.89, num=11, endpoint=True)
        # param_spec["motor_back_emf"] = np.linspace(0.006, 0.066, num=11, endpoint=True)
        # param_spec["motor_resistance"] = np.linspace(2.6*0.5, 2.6*1.5, num=11, endpoint=True)
        # param_spec["combined_damping"] = np.linspace(0.0, 0.05, num=11, endpoint=True)
        # param_spec["friction_coeff"] = np.linspace(0, 0.015, num=11, endpoint=True)
        # param_spec["voltage_thold_x_pos"] = np.linspace(0.0, 1.0, num=11, endpoint=True)
        # param_spec["voltage_thold_x_neg"] = np.linspace(-1., 0.0, num=11, endpoint=True)
        # param_spec["voltage_thold_y_pos"] = np.linspace(0.0, 1.0, num=11, endpoint=True)
        # param_spec["voltage_thold_y_neg"] = np.linspace(-1.0, 0, num=11, endpoint=True)
        # param_spec["offset_th_x"] = np.linspace(-5/180*np.pi, 5/180*np.pi, num=11, endpoint=True)
        # param_spec["offset_th_y"] = np.linspace(-5/180*np.pi, 5/180*np.pi, num=11, endpoint=True)

    else:
        raise NotImplementedError

    # Always add an action delay wrapper (with 0 delay by default)
    if typed_env(env, ActDelayWrapper) is None:
        env = ActDelayWrapper(env)
    # param_spec['act_delay'] = np.linspace(0, 30, num=11, endpoint=True, dtype=int)

    add_info = "-".join(param_spec.keys())

    # Create multidimensional results grid and ensure right number of rollouts
    param_list = param_grid(param_spec)
    param_list *= args.num_rollouts_per_config

    # Fix initial state (set to None if it should not be fixed)
    # NOTE(review): assumes a 4-dimensional state for all of the environments above — confirm
    init_state = np.array([0.0, 0.0, 0.0, 0.0])

    # Create sampler
    pool = SamplerPool(args.num_workers)
    if args.seed is not None:
        pool.set_seed(args.seed)
        print_cbt(f"Set the random number generators' seed to {args.seed}.", "w")
    else:
        print_cbt("No seed was set", "y")

    # Sample rollouts
    ros = eval_domain_params(pool, env, policy, param_list, init_state)

    # Compute metrics
    lod = []
    for ro in ros:
        d = dict(**ro.rollout_info["domain_param"], ret=ro.undiscounted_return(), len=ro.length)
        # Simply remove the observation noise from the domain parameters; absent keys are tolerated
        try:
            d.pop("obs_noise_mean")
            d.pop("obs_noise_std")
        except KeyError:
            pass
        lod.append(d)

    df = pd.DataFrame(lod)
    metrics = dict(
        avg_len=df["len"].mean(),
        avg_ret=df["ret"].mean(),
        median_ret=df["ret"].median(),
        min_ret=df["ret"].min(),
        max_ret=df["ret"].max(),
        std_ret=df["ret"].std(),
    )
    pprint(metrics, indent=4)

    # Create subfolder (time-stamped, listing the varied parameters) and save
    timestamp = datetime.datetime.now()
    add_info = timestamp.strftime(pyrado.timestamp_format) + "--" + add_info
    save_dir = osp.join(ex_dir, "eval_domain_grid", add_info)
    os.makedirs(save_dir, exist_ok=True)

    save_dicts_to_yaml(
        {"ex_dir": str(ex_dir)},
        {"varied_params": list(param_spec.keys())},
        {"num_rpp": args.num_rollouts_per_config, "seed": args.seed},
        {"metrics": dict_arraylike_to_float(metrics)},
        save_dir=save_dir,
        file_name="summary",
    )
    pyrado.save(df, f"df_sp_grid_{len(param_spec) if param_spec_dim is None else param_spec_dim}d.pkl", save_dir)