def test_parallel_sampling_deterministic_wo_min_steps(
    env: SimEnv,
    policy: Policy,
    min_rollouts: Optional[int],
    init_states: Optional[int],
    domain_params: Optional[List[dict]],
):
    """Sampling with a fixed seed must yield identical batches regardless of the number of parallel workers."""
    env.max_steps = 20

    # Turn the requested number of initial states into concrete samples from the state space.
    if init_states is not None:
        init_states = [env.spec.state_space.sample_uniform() for _ in range(init_states)]

    all_rollouts = []
    for num_workers in (1, 2, 4):
        # Wrap the policy in an exploration strategy to check that this works too (it should, since the policy
        # gets pickled and distributed to the workers anyway).
        sampler = ParallelRolloutSampler(
            env,
            NormalActNoiseExplStrat(policy, std_init=1.0),
            num_workers=num_workers,
            min_rollouts=min_rollouts,
            seed=0,
        )
        all_rollouts.append(sampler.sample(init_states=init_states, domain_params=domain_params))

    # Within one batch, distinct rollouts must differ, i.e. the workers must not all reuse the same seed.
    for ros in all_rollouts:
        for ro_a in ros:
            for ro_b in ros:
                if ro_a is ro_b:
                    continue
                if isinstance(policy, IdlePolicy):
                    # The idle policy is deterministic and always outputs the zero action, so the actions are
                    # not compared here. Furthermore, the environment is deterministic conditioned on its
                    # initial state, so with fixed initial states the rollouts coincide; only compare when the
                    # initial states were not set explicitly.
                    if init_states is None:
                        assert ro_a.rewards != pytest.approx(ro_b.rewards)
                        assert ro_a.observations != pytest.approx(ro_b.observations)
                else:
                    assert ro_a.rewards != pytest.approx(ro_b.rewards)
                    assert ro_a.observations != pytest.approx(ro_b.observations)
                    assert ro_a.actions != pytest.approx(ro_b.actions)

    # Across all tested worker counts, the sampled batches must be pairwise identical.
    for ros_a in all_rollouts:
        for ros_b in all_rollouts:
            assert len(ros_a) == len(ros_b)
            for ro_a, ro_b in zip(ros_a, ros_b):
                assert ro_a.rewards == pytest.approx(ro_b.rewards)
                assert ro_a.observations == pytest.approx(ro_b.observations)
                assert ro_a.actions == pytest.approx(ro_b.actions)
def test_parallel_sampling_deterministic_w_min_steps(
    env: SimEnv,
    policy: Policy,
    min_rollouts: Optional[int],
    min_steps: int,
    domain_params: Optional[List[dict]],
):
    """Sampling with a minimum step count must be deterministic w.r.t. the number of parallel workers."""
    env.max_steps = 20

    all_rollouts = []
    for num_workers in (1, 2, 4):
        # Wrap the policy in an exploration strategy to check that this works too (it should, since the policy
        # gets pickled and distributed to the workers anyway).
        sampler = ParallelRolloutSampler(
            env,
            NormalActNoiseExplStrat(policy, std_init=1.0),
            num_workers=num_workers,
            min_rollouts=min_rollouts,
            min_steps=min_steps * env.max_steps,
            seed=0,
        )
        all_rollouts.append(sampler.sample(domain_params=domain_params))

    # Within one batch, distinct rollouts must differ, i.e. the workers must not all reuse the same seed.
    for ros in all_rollouts:
        for ro_a in ros:
            for ro_b in ros:
                if ro_a is ro_b:
                    continue
                # The idle policy is deterministic and always outputs the zero action, hence the comparison is
                # skipped entirely for it.
                if not isinstance(policy, IdlePolicy):
                    assert ro_a.rewards != pytest.approx(ro_b.rewards)
                    assert ro_a.observations != pytest.approx(ro_b.observations)
                    assert ro_a.actions != pytest.approx(ro_b.actions)

    # Across all tested worker counts, the sampled batches must be pairwise identical and must respect the
    # requested minimum number of steps (and rollouts, if given).
    for ros_a in all_rollouts:
        for ros_b in all_rollouts:
            total_steps_a = sum([len(ro) for ro in ros_a])
            total_steps_b = sum([len(ro) for ro in ros_b])
            assert total_steps_a == total_steps_b
            assert total_steps_a >= min_steps * env.max_steps
            assert total_steps_b >= min_steps * env.max_steps
            assert len(ros_a) == len(ros_b)
            if min_rollouts is not None:
                assert len(ros_a) >= min_rollouts
                assert len(ros_b) >= min_rollouts
            for ro_a, ro_b in zip(ros_a, ros_b):
                assert ro_a.rewards == pytest.approx(ro_b.rewards)
                assert ro_a.observations == pytest.approx(ro_b.observations)
                assert ro_a.actions == pytest.approx(ro_b.actions)
def test_parallel_sampling_deterministic_smoke_test_w_min_steps(
    tmpdir_factory, env: SimEnv, policy: Policy, algo, min_rollouts: int, min_steps: int
):
    """Train a full algorithm and check that logged results and rollouts depend on the seed, not the workers."""
    env.max_steps = 20

    logging_results = []
    rollout_results: List[List[List[List[StepSequence]]]] = []
    for seed in (0, 1):
        # One entry per seed; each collects one result per worker count.
        logging_results.append((seed, []))
        rollout_results.append([])
        for num_workers in (1, 2, 4):
            pyrado.set_seed(seed)
            policy.init_param(None)

            # Every run gets its own experiment directory for logging.
            ex_dir = str(tmpdir_factory.mktemp(f"seed={seed}-num_workers={num_workers}"))
            set_log_prefix_dir(ex_dir)

            # Build the critic afresh for every run.
            vfcn = FNN(input_size=env.obs_space.flat_dim, output_size=1, hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
            critic = GAE(vfcn, gamma=0.98, lamda=0.95, batch_size=32, lr=1e-3, standardize_adv=False)

            alg = algo(
                ex_dir,
                env,
                policy,
                critic,
                max_iter=3,
                min_rollouts=min_rollouts,
                min_steps=min_steps * env.max_steps,
                num_workers=num_workers,
            )
            # Record all sampled rollouts so they can be compared after training.
            alg.sampler = RolloutSavingWrapper(alg.sampler)
            alg.train()

            with open(f"{ex_dir}/progress.csv") as f:
                logging_results[-1][1].append(str(f.read()))
            rollout_results[-1].append(alg.sampler.rollouts)

    # For a fixed seed, the observations must be identical for every worker count.
    for rollouts in rollout_results:
        for ros_a in rollouts:
            for ros_b in rollouts:
                assert len(ros_a) == len(ros_b)
                for ro_a, ro_b in zip(ros_a, ros_b):
                    assert len(ro_a) == len(ro_b)
                    for r_a, r_b in zip(ro_a, ro_b):
                        assert r_a.observations == pytest.approx(r_b.observations)

    # Different seeds must actually produce different logged results.
    for seed_a, results_a in logging_results:
        for seed_b, results_b in logging_results:
            if seed_a == seed_b:
                continue
            for result_a in results_a:
                for result_b in results_b:
                    if result_a is not result_b:
                        assert result_a != result_b

    # Identical seeds must produce identical logged results.
    for _, results in logging_results:
        for result_a in results:
            for result_b in results:
                assert result_a == result_b
def test_npdr_and_bayessim(
    ex_dir,
    algo_name: str,
    env: SimEnv,
    num_segments: int,
    len_segments: int,
    num_real_rollouts: int,
    num_sbi_rounds: int,
    use_rec_act: bool,
):
    """Smoke-test the NPDR and BayesSim algorithms on a simulated sim-to-real setup."""
    pyrado.set_seed(0)

    # Create a fake ground truth target domain by perturbing the nominal domain parameters.
    env_real = deepcopy(env)
    dp_nom = env.get_nominal_domain_param()
    env_real.domain_param = dict(
        mass_pend_pole=dp_nom["mass_pend_pole"] * 1.2,
        length_pend_pole=dp_nom["length_pend_pole"] * 0.8,
    )

    # Reduce the number of steps to make this test run faster.
    env.max_steps = 40
    env_real.max_steps = 40

    # Policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Define a mapping: index - domain parameter
    dp_mapping = {1: "mass_pend_pole", 2: "length_pend_pole"}

    # Box-uniform prior spanning +/- 50% around the nominal parameter values.
    prior = sbiutils.BoxUniform(
        low=to.tensor([dp_nom["mass_pend_pole"] * 0.5, dp_nom["length_pend_pole"] * 0.5]),
        high=to.tensor([dp_nom["mass_pend_pole"] * 1.5, dp_nom["length_pend_pole"] * 1.5]),
    )

    # Time series embedding
    embedding = BayesSimEmbedding(
        env.spec,
        RolloutSamplerForSBI.get_dim_data(env.spec),
        downsampling_factor=3,
    )

    # Posterior (normalizing flow)
    posterior_hparam = dict(model="maf", embedding_net=nn.Identity(), hidden_features=20, num_transforms=3)

    # Policy optimization subroutine
    subrtn_policy = HCNormal(
        ex_dir,
        env,
        policy,
        max_iter=1,
        pop_size=2,
        num_init_states_per_domain=1,
        num_domains=2,
        expl_std_init=0.1,
        expl_factor=1.1,
        num_workers=1,
    )

    # Hyper-parameters shared by both algorithms.
    algo_hparam = dict(
        max_iter=1,
        num_sim_per_round=20,
        num_real_rollouts=num_real_rollouts,
        num_sbi_rounds=num_sbi_rounds,
        simulation_batch_size=1,
        normalize_posterior=False,
        num_eval_samples=2,
        num_segments=num_segments,
        len_segments=len_segments,
        use_rec_act=use_rec_act,
        stop_on_done=True,
        subrtn_sbi_training_hparam=dict(max_num_epochs=1),  # only train for 1 iteration
        # subrtn_sbi_sampling_hparam=dict(sample_with_mcmc=True, mcmc_parameters=dict(warmup_steps=20)),
        num_workers=1,
    )

    skip = False
    if algo_name == NPDR.name:
        algo = NPDR(
            save_dir=ex_dir,
            env_sim=env,
            env_real=env_real,
            policy=policy,
            dp_mapping=dp_mapping,
            prior=prior,
            embedding=embedding,
            subrtn_sbi_class=SNPE_C,
            posterior_hparam=posterior_hparam,
            subrtn_policy=subrtn_policy,
            **algo_hparam,
        )
    elif algo_name == BayesSim.name:
        # We are not checking multi-round SNPE-A since it has known issues.
        if algo_hparam["num_sbi_rounds"] > 1:
            skip = True
        algo = BayesSim(
            save_dir=ex_dir,
            env_sim=env,
            env_real=env_real,
            policy=policy,
            dp_mapping=dp_mapping,
            embedding=embedding,
            prior=prior,
            subrtn_policy=subrtn_policy,
            **algo_hparam,
        )
    else:
        raise NotImplementedError

    if not skip:
        algo.train()
        # Just checking the interface here.
        assert algo.curr_iter == algo.max_iter
def test_sbi_embedding(
    ex_dir,
    env: SimEnv,
    embedding_name: str,
    num_segments: int,
    len_segments: int,
    stop_on_done: bool,
    state_mask_labels: Union[None, List[str]],
    act_mask_labels: Union[None, List[str]],
):
    """Construct every supported time series embedding and check the shape of the simulated SBI data."""
    pyrado.set_seed(0)

    # Reduce the number of steps to make this test run faster.
    env.max_steps = 80

    # Policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Define a mapping: index - domain parameter
    dp_mapping = {1: "mass_pend_pole", 2: "length_pend_pole"}

    # Arguments common to all embedding types.
    dim_data = RolloutSamplerForSBI.get_dim_data(env.spec)
    mask_kwargs = dict(state_mask_labels=state_mask_labels, act_mask_labels=act_mask_labels)

    # Time series embedding
    if embedding_name == LastStepEmbedding.name:
        embedding = LastStepEmbedding(env.spec, dim_data, **mask_kwargs)
    elif embedding_name == AllStepsEmbedding.name:
        embedding = AllStepsEmbedding(env.spec, dim_data, env.max_steps, downsampling_factor=3, **mask_kwargs)
    elif embedding_name == DeltaStepsEmbedding.name:
        embedding = DeltaStepsEmbedding(env.spec, dim_data, env.max_steps, downsampling_factor=3, **mask_kwargs)
    elif embedding_name == BayesSimEmbedding.name:
        embedding = BayesSimEmbedding(env.spec, dim_data, downsampling_factor=3, **mask_kwargs)
    elif embedding_name == DynamicTimeWarpingEmbedding.name:
        embedding = DynamicTimeWarpingEmbedding(env.spec, dim_data, downsampling_factor=3, **mask_kwargs)
    elif embedding_name == RNNEmbedding.name:
        embedding = RNNEmbedding(
            env.spec,
            dim_data,
            hidden_size=10,
            num_recurrent_layers=1,
            output_size=1,
            len_rollouts=env.max_steps,
            downsampling_factor=1,
            **mask_kwargs,
        )
    else:
        raise NotImplementedError

    sampler = SimRolloutSamplerForSBI(
        env,
        policy,
        dp_mapping,
        embedding,
        num_segments,
        len_segments,
        stop_on_done,
        rollouts_real=None,
        use_rec_act=False,
    )

    # Test with 7 domain parameter sets.
    data_sim = sampler(to.abs(to.randn(7, 2)))
    assert data_sim.shape == (7, embedding.dim_output)
def test_combination(env: SimEnv):
    """Chain several environment wrappers and check that each of them has the expected effect on rollouts."""
    pyrado.set_seed(0)
    env.max_steps = 20

    # Randomize the domain with a buffer of 3 domains, then sample 4 rollouts so the buffer wraps around.
    randomizer = create_default_randomizer(env)
    env_r = DomainRandWrapperBuffer(env, randomizer)
    env_r.fill_buffer(num_domains=3)
    dp_before = []
    dp_after = []
    for idx in range(4):
        dp_before.append(env_r.domain_param)
        rollout(env_r, DummyPolicy(env_r.spec), eval=True, seed=0, render_mode=RenderMode())
        dp_after.append(env_r.domain_param)
        # Each rollout advances the buffer, so the domain parameters must have changed.
        assert dp_after[idx] != dp_before[idx]
    # With a buffer of size 3, the 1st and the 4th rollout use the same domain.
    assert dp_after[0] == dp_after[3]

    # Normalize actions and observations; the normalized spaces must be bounded by +/- 1.
    env_rn = ActNormWrapper(env)
    elb = {"x_dot": -213.0, "theta_dot": -42.0}
    eub = {"x_dot": 213.0, "theta_dot": 42.0, "x": 0.123}
    env_rn = ObsNormWrapper(env_rn, explicit_lb=elb, explicit_ub=eub)
    alb, aub = env_rn.act_space.bounds
    assert all(alb == -1)
    assert all(aub == 1)
    olb, oub = env_rn.obs_space.bounds
    assert all(olb == -1)
    assert all(oub == 1)

    # The normalized rollout's observations must equal the processed raw observations.
    ro_r = rollout(env_r, DummyPolicy(env_r.spec), eval=True, seed=0, render_mode=RenderMode())
    ro_rn = rollout(env_rn, DummyPolicy(env_rn.spec), eval=True, seed=0, render_mode=RenderMode())
    assert np.allclose(env_rn._process_obs(ro_r.observations), ro_rn.observations)

    # Mask out two observation dimensions.
    env_rnp = ObsPartialWrapper(env_rn, idcs=[env.obs_space.labels[2], env.obs_space.labels[3]])
    ro_rnp = rollout(env_rnp, DummyPolicy(env_rnp.spec), eval=True, seed=0, render_mode=RenderMode())

    # Adding Gaussian action noise must change the resulting rollout.
    env_rnpa = GaussianActNoiseWrapper(
        env_rnp,
        noise_mean=0.5 * np.ones(env_rnp.act_space.shape),
        noise_std=0.1 * np.ones(env_rnp.act_space.shape),
    )
    ro_rnpa = rollout(env_rnpa, DummyPolicy(env_rnpa.spec), eval=True, seed=0, render_mode=RenderMode())
    assert not np.allclose(ro_rnp.observations, ro_rnpa.observations)  # the action noise changed the rollout

    # Delaying the actions keeps the commanded actions but changes the observations.
    env_rnpd = ActDelayWrapper(env_rnp, delay=3)
    ro_rnpd = rollout(env_rnpd, DummyPolicy(env_rnpd.spec), eval=True, seed=0, render_mode=RenderMode())
    assert np.allclose(ro_rnp.actions, ro_rnpd.actions)
    assert not np.allclose(ro_rnp.observations, ro_rnpd.observations)

    # Check unwrapping utilities, then remove the delay wrapper again.
    assert type(inner_env(env_rnpd)) == type(env)
    assert typed_env(env_rnpd, ObsPartialWrapper) is not None
    assert isinstance(env_rnpd, ActDelayWrapper)
    env_rnpdr = remove_env(env_rnpd, ActDelayWrapper)
    assert not isinstance(env_rnpdr, ActDelayWrapper)