Code Example #1
def test_parallel_sampling_deterministic_wo_min_steps(
    env: SimEnv,
    policy: Policy,
    min_rollouts: Optional[int],
    init_states: Optional[int],
    domain_params: Optional[List[dict]],
):
    env.max_steps = 20

    if init_states is not None:
        init_states = [
            env.spec.state_space.sample_uniform() for _ in range(init_states)
        ]

    nums_workers = (1, 2, 4)

    all_rollouts = []
    for num_workers in nums_workers:
        # Use an exploration strategy to test that this works too (it should, as the policy gets pickled and
        # distributed to the workers anyway).
        all_rollouts.append(
            ParallelRolloutSampler(
                env,
                NormalActNoiseExplStrat(policy, std_init=1.0),
                num_workers=num_workers,
                min_rollouts=min_rollouts,
                seed=0,
            ).sample(init_states=init_states, domain_params=domain_params))

    # Test that the rollouts are actually different, i.e., that the same seed is not reused across rollouts.
    for ros in all_rollouts:
        for ro_a, ro_b in [(a, b) for a in ros for b in ros if a is not b]:
            # The idle policy is deterministic and always outputs the zero action. Hence, do not check that the
            # actions are different when using the idle policy.
            if isinstance(policy, IdlePolicy):
                # The Quanser Ball Balancer is a deterministic environment (conditioned on the initial state). As the
                # idle policy is a deterministic policy, this will result in the rollouts being equivalent for each
                # initial state, so do not check for differences if the initial states were set.
                if init_states is None:
                    assert ro_a.rewards != pytest.approx(ro_b.rewards)
                    assert ro_a.observations != pytest.approx(
                        ro_b.observations)
            else:
                assert ro_a.rewards != pytest.approx(ro_b.rewards)
                assert ro_a.observations != pytest.approx(ro_b.observations)
                assert ro_a.actions != pytest.approx(ro_b.actions)

    # Test that the rollouts are equal for all numbers of workers.
    for ros_a, ros_b in [(a, b) for a in all_rollouts for b in all_rollouts]:
        assert len(ros_a) == len(ros_b)
        for ro_a, ro_b in zip(ros_a, ros_b):
            assert ro_a.rewards == pytest.approx(ro_b.rewards)
            assert ro_a.observations == pytest.approx(ro_b.observations)
            assert ro_a.actions == pytest.approx(ro_b.actions)
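
The pytest parametrization driving this test is not part of the listing above. A minimal sketch of how the numeric parameters might be supplied, with purely hypothetical values (the env and policy fixtures are assumed to come from the project's conftest):

import pytest

# Hypothetical parametrization; the decorators of the real test are not shown in this listing.
@pytest.mark.parametrize("min_rollouts", [None, 4])
@pytest.mark.parametrize("init_states", [None, 2])  # number of initial states to sample, or None
@pytest.mark.parametrize("domain_params", [None, [dict(some_domain_param=1.0)]])  # placeholder name
def test_parallel_sampling_deterministic_wo_min_steps(env, policy, min_rollouts, init_states, domain_params):
    ...  # body as in Code Example #1 above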
Code Example #2
def test_parallel_sampling_deterministic_w_min_steps(
    env: SimEnv,
    policy: Policy,
    min_rollouts: Optional[int],
    min_steps: int,
    domain_params: Optional[List[dict]],
):
    env.max_steps = 20

    nums_workers = (1, 2, 4)

    all_rollouts = []
    for num_workers in nums_workers:
        # Use an exploration strategy to test that this works too (it should, as the policy gets pickled and
        # distributed to the workers anyway).
        all_rollouts.append(
            ParallelRolloutSampler(
                env,
                NormalActNoiseExplStrat(policy, std_init=1.0),
                num_workers=num_workers,
                min_rollouts=min_rollouts,
                min_steps=min_steps * env.max_steps,
                seed=0,
            ).sample(domain_params=domain_params))

    # Test that the rollouts are actually different, i.e., that the same seed is not reused across rollouts.
    for ros in all_rollouts:
        for ro_a, ro_b in [(a, b) for a in ros for b in ros if a is not b]:
            # The idle policy is deterministic and always outputs the zero action. Hence, do not check that the
            # actions are different when using the idle policy.
            if not isinstance(policy, IdlePolicy):
                assert ro_a.rewards != pytest.approx(ro_b.rewards)
                assert ro_a.observations != pytest.approx(ro_b.observations)
                assert ro_a.actions != pytest.approx(ro_b.actions)

    # Test that the rollouts are equal for all numbers of workers.
    for ros_a, ros_b in [(a, b) for a in all_rollouts for b in all_rollouts]:
        assert sum([len(ro) for ro in ros_a]) == sum([len(ro) for ro in ros_b])
        assert sum([len(ro) for ro in ros_a]) >= min_steps * env.max_steps
        assert sum([len(ro) for ro in ros_b]) >= min_steps * env.max_steps
        assert len(ros_a) == len(ros_b)
        if min_rollouts is not None:
            assert len(ros_a) >= min_rollouts
            assert len(ros_b) >= min_rollouts
        for ro_a, ro_b in zip(ros_a, ros_b):
            assert ro_a.rewards == pytest.approx(ro_b.rewards)
            assert ro_a.observations == pytest.approx(ro_b.observations)
            assert ro_a.actions == pytest.approx(ro_b.actions)
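
The step budget passed as min_steps above is a multiple of env.max_steps, so the minimum number of rollouts the sampler has to return can be bounded with simple arithmetic (values below are hypothetical):

import math

# Hypothetical back-of-the-envelope check of the step budget used above.
max_steps = 20                                   # as set in the test
min_steps_factor = 3                             # hypothetical parametrization value
total_min_steps = min_steps_factor * max_steps   # 60 steps requested in total
# Each rollout is capped at max_steps, so the sampler must return at least this many rollouts
# to satisfy the budget; the sum-of-lengths assertions in the test verify exactly that.
min_num_rollouts = math.ceil(total_min_steps / max_steps)   # = 3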
Code Example #3
def test_parallel_sampling_deterministic_smoke_test_w_min_steps(
        tmpdir_factory, env: SimEnv, policy: Policy, algo, min_rollouts: int,
        min_steps: int):
    env.max_steps = 20

    seeds = (0, 1)
    nums_workers = (1, 2, 4)

    logging_results = []
    rollout_results: List[List[List[List[StepSequence]]]] = []
    for seed in seeds:
        logging_results.append((seed, []))
        rollout_results.append([])
        for num_workers in nums_workers:
            pyrado.set_seed(seed)
            policy.init_param(None)
            ex_dir = str(
                tmpdir_factory.mktemp(
                    f"seed={seed}-num_workers={num_workers}"))
            set_log_prefix_dir(ex_dir)
            vfcn = FNN(input_size=env.obs_space.flat_dim,
                       output_size=1,
                       hidden_sizes=[16, 16],
                       hidden_nonlin=to.tanh)
            critic = GAE(vfcn,
                         gamma=0.98,
                         lamda=0.95,
                         batch_size=32,
                         lr=1e-3,
                         standardize_adv=False)
            alg = algo(
                ex_dir,
                env,
                policy,
                critic,
                max_iter=3,
                min_rollouts=min_rollouts,
                min_steps=min_steps * env.max_steps,
                num_workers=num_workers,
            )
            alg.sampler = RolloutSavingWrapper(alg.sampler)
            alg.train()
            with open(f"{ex_dir}/progress.csv") as f:
                logging_results[-1][1].append(str(f.read()))
            rollout_results[-1].append(alg.sampler.rollouts)

    # Test that the observations are equal for all numbers of workers.
    for rollouts in rollout_results:
        for ros_a, ros_b in [(a, b) for a in rollouts for b in rollouts]:
            assert len(ros_a) == len(ros_b)
            for ro_a, ro_b in zip(ros_a, ros_b):
                assert len(ro_a) == len(ro_b)
                for r_a, r_b in zip(ro_a, ro_b):
                    assert r_a.observations == pytest.approx(r_b.observations)

    # Test that different seeds actually produce different results.
    for results_a, results_b in [(a, b) for seed_a, a in logging_results
                                 for seed_b, b in logging_results
                                 if seed_a != seed_b]:
        for result_a, result_b in [(a, b) for a in results_a for b in results_b
                                   if a is not b]:
            assert result_a != result_b

    # Test that same seeds produce same results.
    for _, results in logging_results:
        for result_a, result_b in [(a, b) for a in results for b in results]:
            assert result_a == result_b
Code Example #4
def test_npdr_and_bayessim(
    ex_dir,
    algo_name: str,
    env: SimEnv,
    num_segments: int,
    len_segments: int,
    num_real_rollouts: int,
    num_sbi_rounds: int,
    use_rec_act: bool,
):
    pyrado.set_seed(0)

    # Create a fake ground truth target domain
    env_real = deepcopy(env)
    dp_nom = env.get_nominal_domain_param()
    env_real.domain_param = dict(mass_pend_pole=dp_nom["mass_pend_pole"] * 1.2,
                                 length_pend_pole=dp_nom["length_pend_pole"] *
                                 0.8)

    # Reduce the number of steps to make this test run faster
    env.max_steps = 40
    env_real.max_steps = 40

    # Policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Define a mapping: index - domain parameter
    dp_mapping = {1: "mass_pend_pole", 2: "length_pend_pole"}

    # Prior
    prior_hparam = dict(
        low=to.tensor(
            [dp_nom["mass_pend_pole"] * 0.5,
             dp_nom["length_pend_pole"] * 0.5]),
        high=to.tensor(
            [dp_nom["mass_pend_pole"] * 1.5,
             dp_nom["length_pend_pole"] * 1.5]),
    )
    prior = sbiutils.BoxUniform(**prior_hparam)

    # Time series embedding
    embedding = BayesSimEmbedding(
        env.spec,
        RolloutSamplerForSBI.get_dim_data(env.spec),
        downsampling_factor=3,
    )

    # Posterior (normalizing flow)
    posterior_hparam = dict(model="maf",
                            embedding_net=nn.Identity(),
                            hidden_features=20,
                            num_transforms=3)

    # Policy optimization subroutine
    subrtn_policy_hparam = dict(
        max_iter=1,
        pop_size=2,
        num_init_states_per_domain=1,
        num_domains=2,
        expl_std_init=0.1,
        expl_factor=1.1,
        num_workers=1,
    )
    subrtn_policy = HCNormal(ex_dir, env, policy, **subrtn_policy_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=1,
        num_sim_per_round=20,
        num_real_rollouts=num_real_rollouts,
        num_sbi_rounds=num_sbi_rounds,
        simulation_batch_size=1,
        normalize_posterior=False,
        num_eval_samples=2,
        num_segments=num_segments,
        len_segments=len_segments,
        use_rec_act=use_rec_act,
        stop_on_done=True,
        subrtn_sbi_training_hparam=dict(
            max_num_epochs=1),  # only train for 1 epoch
        # subrtn_sbi_sampling_hparam=dict(sample_with_mcmc=True, mcmc_parameters=dict(warmup_steps=20)),
        num_workers=1,
    )
    skip = False
    if algo_name == NPDR.name:
        algo = NPDR(
            save_dir=ex_dir,
            env_sim=env,
            env_real=env_real,
            policy=policy,
            dp_mapping=dp_mapping,
            prior=prior,
            embedding=embedding,
            subrtn_sbi_class=SNPE_C,
            posterior_hparam=posterior_hparam,
            subrtn_policy=subrtn_policy,
            **algo_hparam,
        )
    elif algo_name == BayesSim.name:
        # We are not checking multi-round SNPE-A since it has known issues
        if algo_hparam["num_sbi_rounds"] > 1:
            skip = True
        algo = BayesSim(
            save_dir=ex_dir,
            env_sim=env,
            env_real=env_real,
            policy=policy,
            dp_mapping=dp_mapping,
            embedding=embedding,
            prior=prior,
            subrtn_policy=subrtn_policy,
            **algo_hparam,
        )
    else:
        raise NotImplementedError

    if not skip:
        algo.train()
        # Just checking the interface here
        assert algo.curr_iter == algo.max_iter
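
The prior in the example above is a box-uniform distribution spanning ±50% of the nominal pendulum mass and length. A minimal sketch of drawing candidates from such a prior, assuming the sbi BoxUniform exposes the usual torch-distribution sample interface and using hypothetical bounds:

import torch as to
from sbi import utils as sbiutils

# Sketch only: rebuild a prior analogous to the one above and draw a few candidates.
prior = sbiutils.BoxUniform(low=to.tensor([0.01, 0.05]), high=to.tensor([0.03, 0.15]))  # hypothetical bounds
candidates = prior.sample((5,))   # shape (5, 2); columns ordered as in dp_mapping
assert candidates.shape == (5, 2)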
Code Example #5
def test_sbi_embedding(
    ex_dir,
    env: SimEnv,
    embedding_name: str,
    num_segments: int,
    len_segments: int,
    stop_on_done: bool,
    state_mask_labels: Union[None, List[str]],
    act_mask_labels: Union[None, List[str]],
):
    pyrado.set_seed(0)

    # Reduce the number of steps to make this test run faster
    env.max_steps = 80

    # Policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Define a mapping: index - domain parameter
    dp_mapping = {1: "mass_pend_pole", 2: "length_pend_pole"}

    # Time series embedding
    if embedding_name == LastStepEmbedding.name:
        embedding = LastStepEmbedding(
            env.spec,
            RolloutSamplerForSBI.get_dim_data(env.spec),
            state_mask_labels=state_mask_labels,
            act_mask_labels=act_mask_labels,
        )
    elif embedding_name == AllStepsEmbedding.name:
        embedding = AllStepsEmbedding(
            env.spec,
            RolloutSamplerForSBI.get_dim_data(env.spec),
            env.max_steps,
            downsampling_factor=3,
            state_mask_labels=state_mask_labels,
            act_mask_labels=act_mask_labels,
        )
    elif embedding_name == DeltaStepsEmbedding.name:
        embedding = DeltaStepsEmbedding(
            env.spec,
            RolloutSamplerForSBI.get_dim_data(env.spec),
            env.max_steps,
            downsampling_factor=3,
            state_mask_labels=state_mask_labels,
            act_mask_labels=act_mask_labels,
        )
    elif embedding_name == BayesSimEmbedding.name:
        embedding = BayesSimEmbedding(
            env.spec,
            RolloutSamplerForSBI.get_dim_data(env.spec),
            downsampling_factor=3,
            state_mask_labels=state_mask_labels,
            act_mask_labels=act_mask_labels,
        )
    elif embedding_name == DynamicTimeWarpingEmbedding.name:
        embedding = DynamicTimeWarpingEmbedding(
            env.spec,
            RolloutSamplerForSBI.get_dim_data(env.spec),
            downsampling_factor=3,
            state_mask_labels=state_mask_labels,
            act_mask_labels=act_mask_labels,
        )
    elif embedding_name == RNNEmbedding.name:
        embedding = RNNEmbedding(
            env.spec,
            RolloutSamplerForSBI.get_dim_data(env.spec),
            hidden_size=10,
            num_recurrent_layers=1,
            output_size=1,
            len_rollouts=env.max_steps,
            downsampling_factor=1,
            state_mask_labels=state_mask_labels,
            act_mask_labels=act_mask_labels,
        )
    else:
        raise NotImplementedError

    sampler = SimRolloutSamplerForSBI(
        env,
        policy,
        dp_mapping,
        embedding,
        num_segments,
        len_segments,
        stop_on_done,
        rollouts_real=None,
        use_rec_act=False,
    )

    # Test with 7 domain parameter sets
    data_sim = sampler(to.abs(to.randn(7, 2)))
    assert data_sim.shape == (7, embedding.dim_output)
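
The tensor passed to the sampler above holds one row per candidate domain-parameter set, with columns corresponding to the entries of dp_mapping (mass_pend_pole, length_pend_pole). A small sketch with explicit, hypothetical values instead of random draws:

import torch as to

# Hypothetical candidate domain parameters, one row per set: (mass_pend_pole, length_pend_pole).
candidates = to.tensor([
    [0.02, 0.10],
    [0.03, 0.15],
])
# data_sim = sampler(candidates)  # would have shape (2, embedding.dim_output)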
Code Example #6
def test_combination(env: SimEnv):
    pyrado.set_seed(0)
    env.max_steps = 20

    randomizer = create_default_randomizer(env)
    env_r = DomainRandWrapperBuffer(env, randomizer)
    env_r.fill_buffer(num_domains=3)

    dp_before = []
    dp_after = []
    for i in range(4):
        dp_before.append(env_r.domain_param)
        rollout(env_r,
                DummyPolicy(env_r.spec),
                eval=True,
                seed=0,
                render_mode=RenderMode())
        dp_after.append(env_r.domain_param)
        assert dp_after[i] != dp_before[i]
    assert dp_after[0] == dp_after[3]

    env_rn = ActNormWrapper(env)
    elb = {"x_dot": -213.0, "theta_dot": -42.0}
    eub = {"x_dot": 213.0, "theta_dot": 42.0, "x": 0.123}
    env_rn = ObsNormWrapper(env_rn, explicit_lb=elb, explicit_ub=eub)
    alb, aub = env_rn.act_space.bounds
    assert all(alb == -1)
    assert all(aub == 1)
    olb, oub = env_rn.obs_space.bounds
    assert all(olb == -1)
    assert all(oub == 1)

    ro_r = rollout(env_r,
                   DummyPolicy(env_r.spec),
                   eval=True,
                   seed=0,
                   render_mode=RenderMode())
    ro_rn = rollout(env_rn,
                    DummyPolicy(env_rn.spec),
                    eval=True,
                    seed=0,
                    render_mode=RenderMode())
    assert np.allclose(env_rn._process_obs(ro_r.observations),
                       ro_rn.observations)

    env_rnp = ObsPartialWrapper(
        env_rn, idcs=[env.obs_space.labels[2], env.obs_space.labels[3]])
    ro_rnp = rollout(env_rnp,
                     DummyPolicy(env_rnp.spec),
                     eval=True,
                     seed=0,
                     render_mode=RenderMode())

    env_rnpa = GaussianActNoiseWrapper(
        env_rnp,
        noise_mean=0.5 * np.ones(env_rnp.act_space.shape),
        noise_std=0.1 * np.ones(env_rnp.act_space.shape))
    ro_rnpa = rollout(env_rnpa,
                      DummyPolicy(env_rnpa.spec),
                      eval=True,
                      seed=0,
                      render_mode=RenderMode())
    assert not np.allclose(
        ro_rnp.observations,
        ro_rnpa.observations)  # the action noise changed the rollout

    env_rnpd = ActDelayWrapper(env_rnp, delay=3)
    ro_rnpd = rollout(env_rnpd,
                      DummyPolicy(env_rnpd.spec),
                      eval=True,
                      seed=0,
                      render_mode=RenderMode())
    assert np.allclose(ro_rnp.actions, ro_rnpd.actions)
    assert not np.allclose(ro_rnp.observations, ro_rnpd.observations)

    assert type(inner_env(env_rnpd)) == type(env)
    assert typed_env(env_rnpd, ObsPartialWrapper) is not None
    assert isinstance(env_rnpd, ActDelayWrapper)
    env_rnpdr = remove_env(env_rnpd, ActDelayWrapper)
    assert not isinstance(env_rnpdr, ActDelayWrapper)
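
The wrappers in this test are stacked in a specific order; the following comment-only summary restates the chain as constructed above:

# Wrapper chain, innermost to outermost:
#   env (SimEnv)
#     -> ActNormWrapper                              (action normalization)
#     -> ObsNormWrapper(explicit_lb, explicit_ub)    (env_rn)
#     -> ObsPartialWrapper(idcs=...)                 (env_rnp, drops two observation dimensions)
#     -> ActDelayWrapper(delay=3)                    (env_rnpd)
# env_r (DomainRandWrapperBuffer) and env_rnpa (GaussianActNoiseWrapper around env_rnp)
# are separate branches used only for the randomization and action-noise comparisons.
# remove_env(env_rnpd, ActDelayWrapper) strips just the delay layer, which the final
# isinstance assertions confirm.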