Example #1
    def eval_policy(save_dir: [str, None],
                    env: [RealEnv, SimEnv, MetaDomainRandWrapper],
                    policy: Policy,
                    mc_estimator: bool,
                    prefix: str,
                    num_rollouts: int,
                    num_parallel_envs: int = 1) -> to.Tensor:
        """
        Evaluate a policy on the target system (real-world platform).
        This method is static to facilitate evaluation of specific policies in hindsight.

        :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
        :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance
        :param policy: policy to evaluate
        :param mc_estimator: estimate the return with a sample average (`True`) or a lower confidence
                             bound (`False`) obtained from bootstrapping
        :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
        :param num_rollouts: number of rollouts to collect on the target system
        :param num_parallel_envs: number of environments for the parallel sampler (only used for SimEnv)
        :return: estimated return in the target domain
        """
        if save_dir is not None:
            print_cbt(f'Executing {prefix}_policy ...', 'c', bright=True)

        rets_real = to.zeros(num_rollouts)
        if isinstance(inner_env(env), RealEnv):
            # Evaluate sequentially when conducting a sim-to-real experiment
            for i in range(num_rollouts):
                rets_real[i] = rollout(env, policy, eval=True).undiscounted_return()
                # If a reward of -1 is given, skip the remaining evaluations and set all returns to zero
                if rets_real[i] == -1:
                    print_cbt('Set all returns for this policy to zero.', color='c')
                    rets_real = to.zeros(num_rollouts)
                    break
        elif isinstance(inner_env(env), SimEnv):
            # Create a parallel sampler when conducting a sim-to-sim experiment
            sampler = ParallelRolloutSampler(env, policy, num_workers=num_parallel_envs, min_rollouts=num_rollouts)
            ros = sampler.sample()
            for i in range(num_rollouts):
                rets_real[i] = ros[i].undiscounted_return()
        else:
            raise pyrado.TypeErr(given=inner_env(env), expected_type=[RealEnv, SimEnv])

        if save_dir is not None:
            # Save the evaluation results
            to.save(rets_real, osp.join(save_dir, f'{prefix}_returns_real.pt'))

            print_cbt('Target domain performance', bright=True)
            print(tabulate([['mean return', to.mean(rets_real).item()],
                            ['std return', to.std(rets_real)],
                            ['min return', to.min(rets_real)],
                            ['max return', to.max(rets_real)]]))

        if mc_estimator:
            return to.mean(rets_real)
        else:
            return to.from_numpy(bootstrap_ci(rets_real.numpy(), np.mean,
                                              num_reps=1000, alpha=0.05, ci_sides=1, studentized=False)[1])
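A note on the `mc_estimator` flag above: with `False`, the estimate is a one-sided lower confidence bound on the mean return obtained from bootstrapping. The numpy sketch below only illustrates that idea with a percentile bootstrap of the sample mean; it is not pyrado's `bootstrap_ci` implementation, and the dummy returns are made up for the example.

import numpy as np

def bootstrap_lower_bound(returns: np.ndarray, num_reps: int = 1000, alpha: float = 0.05) -> float:
    # Resample the returns with replacement and compute the mean of every resample
    rng = np.random.default_rng(0)
    idcs = rng.integers(0, len(returns), size=(num_reps, len(returns)))
    means = returns[idcs].mean(axis=1)
    # One-sided lower confidence bound: the alpha-quantile of the bootstrapped means
    return float(np.quantile(means, alpha))

returns = np.array([420.0, 380.0, 455.0, 300.0, 410.0])  # dummy returns
print(bootstrap_lower_bound(returns))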
Example #2
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1 / 250., max_steps=1500)
    env = QQubeSwingUpSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(feats=FeatureStack([
        identity_feat, sign_feat, abs_feat, squared_feat, cubic_feat,
        ATan2Feat(1, 2),
        MultFeat([4, 5])
    ]))
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=50,
        pop_size=trial.suggest_int('pop_size', 50, 200),
        num_rollouts=trial.suggest_int('num_rollouts', 4, 10),
        num_is_samples=trial.suggest_int('num_is_samples', 5, 40),
        expl_std_init=trial.suggest_uniform('expl_std_init', 0.1, 0.5),
        symm_sampling=trial.suggest_categorical('symm_sampling',
                                                [True, False]),
    )
    csv_logger = create_csv_step_logger(
        osp.join(study_dir, f'trial_{trial.number}'))
    algo = PoWER(osp.join(study_dir, f'trial_{trial.number}'),
                 env,
                 policy,
                 **algo_hparam,
                 logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env, policy, num_workers=1,
        min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
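The docstring's note about `functools.partial` refers to the usual way of binding the extra arguments before handing the objective to Optuna. A minimal sketch of how this objective could be wired into a study; the study directory, trial count, and seed below are placeholders, not values from the original script.

import functools

import optuna

if __name__ == "__main__":
    study_dir = "/tmp/qqsu_power_hpo"  # hypothetical study directory
    study = optuna.create_study(direction="maximize")
    study.optimize(functools.partial(train_and_eval, study_dir=study_dir, seed=1001),
                   n_trials=100, n_jobs=4)
    print(study.best_params)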
Example #3
    def eval_policy(
        save_dir: Optional[pyrado.PathLike],
        env: Env,
        policy: Policy,
        prefix: str,
        num_rollouts: int,
        num_workers: int = 1,
    ) -> to.Tensor:
        """
        Evaluate a policy either in the source or in the target domain.
        This method is static to facilitate evaluation of specific policies in hindsight.

        :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
        :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance
        :param policy: policy to evaluate
        :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
        :param num_rollouts: number of rollouts to collect on the target system
        :param num_workers: number of environments for the parallel sampler (only used for SimEnv)
        :return: estimated return in the target domain
        """
        if save_dir is not None:
            print_cbt(f"Executing {prefix}_policy ...", "c", bright=True)

        if isinstance(inner_env(env), RealEnv):
            # Evaluate sequentially when evaluating on a real-world device
            rets_real = []
            for i in range(num_rollouts):
                rets_real.append(
                    rollout(env, policy, eval=True).undiscounted_return())

        elif isinstance(inner_env(env), SimEnv):
            # Create a parallel sampler when evaluating in a simulation
            sampler = ParallelRolloutSampler(env,
                                             policy,
                                             num_workers=num_workers,
                                             min_rollouts=num_rollouts)
            ros = sampler.sample(eval=True)
            rets_real = [ro.undiscounted_return() for ro in ros]
        else:
            raise pyrado.TypeErr(given=inner_env(env),
                                 expected_type=[RealEnv, SimEnv])

        rets_real = to.as_tensor(rets_real, dtype=to.get_default_dtype())

        if save_dir is not None:
            # Save and print the evaluation results
            pyrado.save(rets_real, "returns_real.pt", save_dir, prefix=prefix)
            print_cbt("Target domain performance", bright=True)
            print(
                tabulate([
                    ["mean return", to.mean(rets_real).item()],
                    ["std return", to.std(rets_real)],
                    ["min return", to.min(rets_real)],
                    ["max return", to.max(rets_real)],
                ]))

        return to.mean(rets_real)
Example #4
def test_parallel_rollout_sampler(env: SimEnv, policy: Policy,
                                  num_workers: int):
    min_rollouts = num_workers * 2  # make sure every worker samples at least once
    sampler = ParallelRolloutSampler(env,
                                     policy,
                                     num_workers,
                                     min_rollouts=min_rollouts)
    ros = sampler.sample()
    assert isinstance(ros, list)
    assert len(ros) >= min_rollouts
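In the test suite, `env`, `policy`, and `num_workers` are supplied via fixtures and parametrization. Below is a hedged sketch of how such a test could be run standalone; the environment and policy are placeholders borrowed from the other examples here, not the project's actual test fixtures.

import pytest

from pyrado.environments.pysim.ball_on_beam import BallOnBeamSim
from pyrado.policies.features import FeatureStack, identity_feat
from pyrado.policies.feed_back.linear import LinearPolicy
from pyrado.sampling.parallel_rollout_sampler import ParallelRolloutSampler

@pytest.mark.parametrize("num_workers", [1, 2, 4])
def test_parallel_rollout_sampler_smoke(num_workers: int):
    # Placeholder environment and policy (the project's tests use shared fixtures instead)
    env = BallOnBeamSim(dt=0.02, max_steps=100)
    policy = LinearPolicy(env.spec, FeatureStack(identity_feat))
    min_rollouts = num_workers * 2  # make sure every worker samples at least once
    sampler = ParallelRolloutSampler(env, policy, num_workers=num_workers, min_rollouts=min_rollouts)
    ros = sampler.sample()
    assert isinstance(ros, list)
    assert len(ros) >= min_rollouts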
Example #5
def test_cuda_sampling_w_dr(env: SimEnv, policy: Policy, num_workers: int):
    randomizer = create_default_randomizer(env)
    env = DomainRandWrapperLive(env, randomizer)

    sampler = ParallelRolloutSampler(env,
                                     policy,
                                     num_workers=num_workers,
                                     min_rollouts=4)
    samples = sampler.sample()

    assert samples is not None
Example #6
def test_parallel_sampling_deterministic_wo_min_steps(
    env: SimEnv,
    policy: Policy,
    min_rollouts: Optional[int],
    init_states: Optional[int],
    domain_params: Optional[List[dict]],
):
    env.max_steps = 20

    if init_states is not None:
        init_states = [
            env.spec.state_space.sample_uniform() for _ in range(init_states)
        ]

    nums_workers = (1, 2, 4)

    all_rollouts = []
    for num_workers in nums_workers:
        # Use an exploration strategy to test that this works too (it should, as the policy gets pickled and
        # distributed anyway).
        all_rollouts.append(
            ParallelRolloutSampler(
                env,
                NormalActNoiseExplStrat(policy, std_init=1.0),
                num_workers=num_workers,
                min_rollouts=min_rollouts,
                seed=0,
            ).sample(init_states=init_states, domain_params=domain_params))

    # Test that the rollouts are actually different, i.e., that not the same seed is used for all rollouts.
    for ros in all_rollouts:
        for ro_a, ro_b in [(a, b) for a in ros for b in ros if a is not b]:
            # The idle policy is deterministic and always outputs the zero action. Hence, do not check that the actions
            # are different when using the idle policy.
            if isinstance(policy, IdlePolicy):
                # The Quanser Ball Balancer is a deterministic environment (conditioned on the initial state). As the
                # idle policy is a deterministic policy, this will result in the rollouts being equivalent for each
                # initial state, so do not check for differences if the initial states were set.
                if init_states is None:
                    assert ro_a.rewards != pytest.approx(ro_b.rewards)
                    assert ro_a.observations != pytest.approx(
                        ro_b.observations)
            else:
                assert ro_a.rewards != pytest.approx(ro_b.rewards)
                assert ro_a.observations != pytest.approx(ro_b.observations)
                assert ro_a.actions != pytest.approx(ro_b.actions)

    # Test that the rollouts for all number of workers are equal.
    for ros_a, ros_b in [(a, b) for a in all_rollouts for b in all_rollouts]:
        assert len(ros_a) == len(ros_b)
        for ro_a, ro_b in zip(ros_a, ros_b):
            assert ro_a.rewards == pytest.approx(ro_b.rewards)
            assert ro_a.observations == pytest.approx(ro_b.observations)
            assert ro_a.actions == pytest.approx(ro_b.actions)
Example #7
    def __init__(self,
                 save_dir: str,
                 env: SimEnv,
                 policy: Policy,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 num_workers: int = 4,
                 logger: StepLogger = None,
                 ball_z_dim_mismatch: bool = True):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment in which the policy operates
        :param policy: policy which this algorithm is creating
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_workers: number of environments for parallel sampling
        :param ball_z_dim_mismatch: only useful for BallOnPlate5DSim,
                                    set to True if the controller does not have the z component (relative position)
                                    of the ball in the state vector, i.e. state is 14-dim instead of 16-dim
        """
        if not isinstance(env, SimEnv):
            raise pyrado.TypeErr(given=env, expected_type=SimEnv)
        if not isinstance(policy, LinearPolicy):
            raise pyrado.TypeErr(given=policy, expected_type=LinearPolicy)

        # Call Algorithm's constructor
        super().__init__(save_dir, 1, policy, logger)

        # Store the inputs
        self._env = env
        self.ball_z_dim_mismatch = ball_z_dim_mismatch

        self.sampler = ParallelRolloutSampler(
            env, self._policy,
            num_workers=num_workers,
            min_steps=min_steps,
            min_rollouts=min_rollouts
        )
        self.eigvals = np.array([pyrado.inf])  # initialize with something positive
Example #8
def test_adr_reward_generator(env):
    reference_env = env
    random_env = deepcopy(env)
    reward_generator = RewardGenerator(
        env_spec=random_env.spec,
        batch_size=256,
        reward_multiplier=1,
        lr=5e-3,
    )
    policy = FNNPolicy(reference_env.spec,
                       hidden_sizes=[16, 16],
                       hidden_nonlin=to.tanh)
    dr = create_default_randomizer_omo()
    dr.randomize(num_samples=1)
    random_env.domain_param = dr.get_params(fmt="dict", dtype="numpy")
    reference_sampler = ParallelRolloutSampler(reference_env,
                                               policy,
                                               num_workers=1,
                                               min_steps=1000)
    random_sampler = ParallelRolloutSampler(random_env,
                                            policy,
                                            num_workers=1,
                                            min_steps=1000)

    losses = []
    for i in range(200):
        reference_traj = StepSequence.concat(reference_sampler.sample())
        random_traj = StepSequence.concat(random_sampler.sample())
        losses.append(reward_generator.train(reference_traj, random_traj, 10))
    assert losses[-1] < losses[0]
Example #9
def test_sequential_equals_parallel(env: SimEnv, policy: Policy,
                                    num_simulations: int):
    # Do the rollouts explicitly sequentially without a sampler
    # Do not set the init state to check if this was sampled correctly
    ros_sequential = []
    for i in range(num_simulations):
        ros_sequential.append(rollout(env, policy, eval=True, seed=i))

    # Do the rollouts in parallel with a sampler. Create one worker for every rollout
    # Do not set the init state to check if this was sampled correctly
    sampler = ParallelRolloutSampler(env,
                                     policy,
                                     num_workers=num_simulations,
                                     min_rollouts=num_simulations,
                                     seed=0)
    ros_parallel = sampler.sample()
    assert len(ros_parallel) == num_simulations

    for ro_s in ros_sequential:
        # The parallel rollouts are not necessarily in the same order as the sequential ones, thus compare to all
        assert any([
            ro_s.observations == pytest.approx(ro_p.observations)
            for ro_p in ros_parallel
        ])
Example #10
def test_sequential_equals_parallel(env: SimEnv, policy: Policy,
                                    num_rollouts: int, num_workers: int):
    # Do the rollouts explicitly sequentially without a sampler.
    # Do not set the init state to check if this was sampled correctly.
    ros_sequential = []
    for i in range(num_rollouts):
        ros_sequential.append(
            rollout(env, policy, eval=True, seed=0, sub_seed=0,
                    sub_sub_seed=i))

    # Do the rollouts in parallel with a sampler.
    # Do not set the init state to check if this was sampled correctly.
    sampler = ParallelRolloutSampler(env,
                                     policy,
                                     num_workers=num_workers,
                                     min_rollouts=num_rollouts,
                                     seed=0)
    ros_parallel = sampler.sample()
    assert len(ros_parallel) == num_rollouts

    for ro_s, ro_p in zip(ros_sequential, ros_parallel):
        assert ro_s.rewards == pytest.approx(ro_p.rewards)
        assert ro_s.observations == pytest.approx(ro_p.observations)
        assert ro_s.actions == pytest.approx(ro_p.actions)
Example #11
def test_parallel_sampling_deterministic_w_min_steps(
    env: SimEnv,
    policy: Policy,
    min_rollouts: Optional[int],
    min_steps: int,
    domain_params: Optional[List[dict]],
):
    env.max_steps = 20

    nums_workers = (1, 2, 4)

    all_rollouts = []
    for num_workers in nums_workers:
        # Use an exploration strategy to test that this works too (it should, as the policy gets pickled and
        # distributed anyway).
        all_rollouts.append(
            ParallelRolloutSampler(
                env,
                NormalActNoiseExplStrat(policy, std_init=1.0),
                num_workers=num_workers,
                min_rollouts=min_rollouts,
                min_steps=min_steps * env.max_steps,
                seed=0,
            ).sample(domain_params=domain_params))

    # Test that the rollouts are actually different, i.e., that not the same seed is used for all rollouts.
    for ros in all_rollouts:
        for ro_a, ro_b in [(a, b) for a in ros for b in ros if a is not b]:
            # The idle policy is deterministic and always outputs the zero action. Hence, do not check that the actions
            # are different when using the idle policy.
            if not isinstance(policy, IdlePolicy):
                assert ro_a.rewards != pytest.approx(ro_b.rewards)
                assert ro_a.observations != pytest.approx(ro_b.observations)
                assert ro_a.actions != pytest.approx(ro_b.actions)

    # Test that the rollouts for all number of workers are equal.
    for ros_a, ros_b in [(a, b) for a in all_rollouts for b in all_rollouts]:
        assert sum([len(ro) for ro in ros_a]) == sum([len(ro) for ro in ros_b])
        assert sum([len(ro) for ro in ros_a]) >= min_steps * env.max_steps
        assert sum([len(ro) for ro in ros_b]) >= min_steps * env.max_steps
        assert len(ros_a) == len(ros_b)
        if min_rollouts is not None:
            assert len(ros_a) >= min_rollouts
            assert len(ros_b) >= min_rollouts
        for ro_a, ro_b in zip(ros_a, ros_b):
            assert ro_a.rewards == pytest.approx(ro_b.rewards)
            assert ro_a.observations == pytest.approx(ro_b.observations)
            assert ro_a.actions == pytest.approx(ro_b.actions)
Example #12
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: DiscreteActQValPolicy,
                 memory_size: int,
                 eps_init: float,
                 eps_schedule_gamma: float,
                 gamma: float,
                 max_iter: int,
                 num_batch_updates: int,
                 target_update_intvl: int = 5,
                 num_init_memory_steps: int = None,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 batch_size: int = 256,
                 num_workers: int = 4,
                 max_grad_norm: float = 0.5,
                 lr: float = 5e-4,
                 lr_scheduler=None,
                 lr_scheduler_hparam: [dict, None] = None,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment in which the policy operates
        :param policy: (current) Q-network updated by this algorithm
        :param memory_size: number of transitions in the replay memory buffer
        :param eps_init: initial value for the probability of taking a random action, constant if `eps_schedule_gamma=1`
        :param eps_schedule_gamma: temporal discount factor for the exponential decay of epsilon
        :param gamma: temporal discount factor for the state values
        :param max_iter: number of iterations (policy updates)
        :param num_batch_updates: number of batch updates per algorithm step
        :param target_update_intvl: number of iterations that pass before updating the qfcn_targ network
        :param num_init_memory_steps: number of samples used to initially fill the replay buffer with, pass `None` to
                                      fill the buffer completely
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param batch_size: number of samples per policy update batch
        :param num_workers: number of environments for parallel sampling
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param lr: (initial) learning rate for the optimizer, which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(policy, DiscreteActQValPolicy):
            raise pyrado.TypeErr(given=policy,
                                 expected_type=DiscreteActQValPolicy)

        # Call ValueBased's constructor
        super().__init__(save_dir, env, policy, memory_size, gamma, max_iter,
                         num_batch_updates, target_update_intvl,
                         num_init_memory_steps, min_rollouts, min_steps,
                         batch_size, num_workers, max_grad_norm, logger)

        self.qfcn_targ = deepcopy(
            self._policy).eval()  # will not be trained using the optimizer
        self.eps = eps_init

        # Create sampler for exploration during training
        self._expl_strat = EpsGreedyExplStrat(self._policy, eps_init,
                                              eps_schedule_gamma)
        self.sampler_trn = ParallelRolloutSampler(
            self._env,
            self._expl_strat,
            num_workers=num_workers if min_steps != 1 else 1,
            min_steps=min_steps,
            min_rollouts=min_rollouts)

        # Q-function optimizer
        self.optim = to.optim.RMSprop([{
            'params': self._policy.parameters()
        }],
                                      lr=lr)

        # Learning rate scheduler
        self._lr_scheduler = lr_scheduler
        self._lr_scheduler_hparam = lr_scheduler_hparam
        if lr_scheduler is not None:
            self._lr_scheduler = lr_scheduler(self.optim,
                                              **lr_scheduler_hparam)
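The `eps_schedule_gamma` parameter above is described as the temporal discount factor of an exponential epsilon decay. Assuming the exploration strategy multiplies epsilon by this factor once per iteration (an assumed schedule form, not taken from this snippet), the effect can be sketched as follows.

# Assumed schedule form: eps_k = eps_init * eps_schedule_gamma ** k
eps_init = 1.0
eps_schedule_gamma = 0.99
for k in (0, 50, 100, 200):
    print(f"iteration {k:3d}: eps = {eps_init * eps_schedule_gamma ** k:.4f}")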
Example #13
if __name__ == "__main__":
    # Set up environment
    dp_gt = dict(m=2.0, k=20.0, d=0.8)  # ground truth
    dp_init = dict(m=1.0, k=22.0, d=0.4)  # initial guess
    dt = 1 / 50.0
    env = OneMassOscillatorSim(dt=dt, max_steps=400)
    env.reset(domain_param=dp_gt)

    # Set up policy
    # policy = IdlePolicy(env.spec)
    policy = DummyPolicy(env.spec)

    # Sample
    sampler = ParallelRolloutSampler(env,
                                     policy,
                                     num_workers=4,
                                     min_rollouts=50,
                                     seed=1)
    ros = sampler.sample()

    # Create a model for learning the domain parameters
    model = OneMassOscillatorDomainParamEstimator(dt=dt,
                                                  dp_init=dp_init,
                                                  num_epoch=50,
                                                  batch_size=10)

    model.update(ros)

    print_cbt(f"true domain param   : {dp_gt}", "g")
    print_cbt(f"initial domain param: {dp_init}", "y")
    print_cbt(f"learned domain param: {model.dp_est.detach().cpu().numpy()}",
Example #14
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(physicsEngine="Bullet", dt=1 / 100.0, max_steps=500)
    env = BallOnPlate2DSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        shared_hidden_sizes=trial.suggest_categorical(
            "shared_hidden_sizes_policy", [(16, 16), (32, 32), (64, 64),
                                           (16, 16, 16), (32, 32, 32)]),
        shared_hidden_nonlin=fcn_from_str(
            trial.suggest_categorical("shared_hidden_nonlin_policy",
                                      ["to_tanh", "to_relu"])),
    )
    policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam)

    # Critic
    qfcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical("hidden_sizes_critic",
                                               [(16, 16), (32, 32), (64, 64),
                                                (16, 16, 16), (32, 32, 32)]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical("hidden_nonlin_critic",
                                      ["to_tanh", "to_relu"])),
    )
    obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
    qfcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace),
                       **qfcn_hparam)
    qfcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace),
                       **qfcn_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=100 * env.max_steps,
        min_steps=trial.suggest_categorical(
            "min_steps_algo", [1]),  # 10, env.max_steps, 10*env.max_steps
        memory_size=trial.suggest_loguniform("memory_size_algo",
                                             1e2 * env.max_steps,
                                             1e4 * env.max_steps),
        tau=trial.suggest_uniform("tau_algo", 0.99, 1.0),
        ent_coeff_init=trial.suggest_uniform("ent_coeff_init_algo", 0.1, 0.9),
        learn_ent_coeff=trial.suggest_categorical("learn_ent_coeff_algo",
                                                  [True, False]),
        standardize_rew=trial.suggest_categorical("standardize_rew_algo",
                                                  [False]),
        gamma=trial.suggest_uniform("gamma_algo", 0.99, 1.0),
        target_update_intvl=trial.suggest_categorical(
            "target_update_intvl_algo", [1, 5]),
        num_updates_per_step=trial.suggest_categorical(
            "num_batch_updates_algo", [1, 5]),
        batch_size=trial.suggest_categorical("batch_size_algo",
                                             [128, 256, 512]),
        lr=trial.suggest_loguniform("lr_algo", 1e-5, 1e-3),
    )
    csv_logger = create_csv_step_logger(
        osp.join(study_dir, f"trial_{trial.number}"))
    algo = SAC(study_dir,
               env,
               policy,
               qfcn_1,
               qfcn_2,
               **algo_hparam,
               logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode="latest", seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env, policy, num_workers=1,
        min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
Example #15
    def __init__(
        self,
        save_dir: pyrado.PathLike,
        env: Env,
        policy: Policy,
        critic: GAE,
        max_iter: int,
        min_rollouts: int = None,
        min_steps: int = None,
        num_epoch: int = 3,
        eps_clip: float = 0.1,
        batch_size: int = 64,
        std_init: float = 1.0,
        num_workers: int = 4,
        max_grad_norm: Optional[float] = None,
        lr: float = 5e-4,
        lr_scheduler=None,
        lr_scheduler_hparam: [dict, None] = None,
        logger: StepLogger = None,
    ):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param critic: advantage estimation function $A(s,a) = Q(s,a) - V(s)$
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_epoch: number of iterations over all gathered samples during one policy update
        :param eps_clip: max/min probability ratio, see [1]
        :param batch_size: number of samples per policy update batch
        :param std_init: initial standard deviation on the actions for the exploration noise
        :param num_workers: number of environments for parallel sampling
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param lr: (initial) learning rate for the optimizer, which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created

        .. note::
            The Adam optimizer computes individual learning rates for all parameters. Thus, the learning rate scheduler
            schedules the maximum learning rate.
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        assert isinstance(policy, Policy)

        # Call ActorCritic's constructor
        super().__init__(env, policy, critic, save_dir, max_iter, logger)

        # Store the inputs
        self.num_epoch = num_epoch
        self.eps_clip = eps_clip
        self.batch_size = batch_size
        self.max_grad_norm = max_grad_norm

        # Initialize
        self.log_loss = True
        self._expl_strat = NormalActNoiseExplStrat(self._policy,
                                                   std_init=std_init)
        self._sampler = ParallelRolloutSampler(env,
                                               self._expl_strat,
                                               num_workers=num_workers,
                                               min_steps=min_steps,
                                               min_rollouts=min_rollouts)
        self.optim = to.optim.Adam(
            [{
                "params": self._expl_strat.policy.parameters()
            }, {
                "params": self._expl_strat.noise.parameters()
            }],
            lr=lr,
            eps=1e-5,
        )
        self._lr_scheduler = lr_scheduler
        self._lr_scheduler_hparam = lr_scheduler_hparam
        if lr_scheduler is not None:
            self._lr_scheduler = lr_scheduler(self.optim,
                                              **lr_scheduler_hparam)
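`eps_clip` above bounds the probability ratio in the clipped surrogate objective referenced as [1]. The following is a minimal PyTorch sketch of that loss term for illustration only; it is not the update code of this class, and the tensor names are assumptions.

import torch as to

def ppo_clip_loss(log_probs_new: to.Tensor, log_probs_old: to.Tensor, adv: to.Tensor, eps_clip: float) -> to.Tensor:
    # Probability ratio between the new and the old policy
    ratio = to.exp(log_probs_new - log_probs_old)
    # Pessimistic (element-wise minimum) of the unclipped and the clipped surrogate
    surr = to.min(ratio * adv, to.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * adv)
    return -to.mean(surr)  # negated since optimizers minimize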
Example #16
from tabulate import tabulate

from pyrado.environment_wrappers.action_normalization import ActNormWrapper
from pyrado.environments.pysim.ball_on_beam import BallOnBeamSim
from pyrado.policies.features import FeatureStack, identity_feat, squared_feat
from pyrado.policies.feed_back.linear import LinearPolicy
from pyrado.sampling.parallel_rollout_sampler import ParallelRolloutSampler

if __name__ == "__main__":
    # Set up environment
    env = BallOnBeamSim(dt=0.02, max_steps=500)
    env = ActNormWrapper(env)

    # Set up policy
    feats = FeatureStack(identity_feat, squared_feat)
    policy = LinearPolicy(env.spec, feats)

    # Set up sampler
    sampler = ParallelRolloutSampler(env,
                                     policy,
                                     num_workers=2,
                                     min_rollouts=2000)

    # Sample and print
    ros = sampler.sample()
    print(
        tabulate({
            "StepSequence count": len(ros),
            "Step count": sum(map(len, ros)),
        }.items()))
Example #17
class ARPL(Algorithm):
    """
    Adversarially Robust Policy Learning (ARPL)

    .. seealso::
        A. Mandlekar, Y. Zhu, A. Garg, L. Fei-Fei, S. Savarese, "Adversarially Robust Policy Learning:
        Active Construction of Physically-Plausible Perturbations", IROS, 2017
    """

    name: str = 'arpl'

    def __init__(self,
                 save_dir: str,
                 env: [SimEnv, StateAugmentationWrapper],
                 subrtn: Algorithm,
                 policy: Policy,
                 expl_strat: StochasticActionExplStrat,
                 max_iter: int,
                 num_rollouts: int = None,
                 steps_num: int = None,
                 apply_dynamics_noise: bool = False,
                 dyn_eps: float = 0.01,
                 dyn_phi: float = 0.1,
                 halfspan: float = 0.25,
                 apply_proccess_noise: bool = False,
                 proc_eps: float = 0.01,
                 proc_phi: float = 0.05,
                 apply_observation_noise: bool = False,
                 obs_eps: float = 0.01,
                 obs_phi: float = 0.05,
                 torch_observation: bool = True,
                 num_workers: int = 4,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment in which the agent should be trained
        :param subrtn: algorithm which performs the policy / value-function optimization
        :param policy: policy to be updated
        :param expl_strat: the exploration strategy
        :param max_iter: the maximum number of iterations
        :param num_rollouts: the number of rollouts to be performed for each update step
        :param steps_num: the number of steps to be performed for each update step
        :param apply_dynamics_noise: whether adversarially generated dynamics noise should be applied
        :param dyn_eps: the intensity of generated dynamics noise
        :param dyn_phi: the probability of applying dynamics noise
        :param halfspan: the halfspan of the uniform random distribution used to sample
        :param apply_proccess_noise: whether adversarially generated process noise should be applied
        :param proc_eps: the intensity of generated process noise
        :param proc_phi: the probability of applying process noise
        :param apply_observation_noise: whether adversarially generated observation noise should be applied
        :param obs_eps: the intensity of generated observation noise
        :param obs_phi: the probability of applying observation noise
        :param torch_observation: a function to provide a differentiable observation
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        assert isinstance(subrtn, Algorithm)
        assert isinstance(max_iter, int) and max_iter > 0

        super().__init__(save_dir, max_iter, policy, logger)

        # Initialize adversarial wrappers
        if apply_dynamics_noise:
            assert isinstance(env, StateAugmentationWrapper)
            env = AdversarialDynamicsWrapper(env, self.policy, dyn_eps,
                                             dyn_phi, halfspan)
        if apply_proccess_noise:
            env = AdversarialStateWrapper(env,
                                          self.policy,
                                          proc_eps,
                                          proc_phi,
                                          torch_observation=torch_observation)
        if apply_observation_noise:
            env = AdversarialObservationWrapper(env, self.policy, obs_eps,
                                                obs_phi)

        self.num_rollouts = num_rollouts
        self.sampler = ParallelRolloutSampler(
            env,
            expl_strat,
            num_workers=num_workers,
            min_steps=steps_num,
            min_rollouts=num_rollouts,
        )

        # Subroutine
        self._subrtn = subrtn
        self._subrtn.save_name = 'subrtn'

    @property
    def sample_count(self) -> int:
        return self._subrtn.sample_count

    def step(self, snapshot_mode: str, meta_info: dict = None):
        rollouts = self.sampler.sample()
        rets = [ro.undiscounted_return() for ro in rollouts]
        ret_avg = np.mean(rets)
        ret_med = np.median(rets)
        ret_std = np.std(rets)
        self.logger.add_value('avg return', ret_avg)
        self.logger.add_value('median return', ret_med)
        self.logger.add_value('std return', ret_std)
        self.logger.add_value('num total samples', self._cnt_samples)
        self.logger.add_value('avg rollout len',
                              np.mean([ro.length for ro in rollouts]))

        # Sub-routine
        self._subrtn.update(rollouts)
        self._subrtn.logger.record_step()
        self._subrtn.make_snapshot(snapshot_mode, ret_avg.item())

    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        if meta_info is None:
            # This algorithm instance is not a subroutine of a meta-algorithm
            self._subrtn.save_snapshot(meta_info)
        else:
            raise pyrado.ValueErr(
                msg=f'{self.name} is not supposed be run as a subroutine!')
Example #18
class SysIdViaEpisodicRL(Algorithm):
    """
    Wrapper to frame black-box system identification as an episodic reinforcement learning problem

    .. note::
        This algorithm was designed as a subroutine of SimOpt. However, it could also be used independently.
    """

    name: str = "sysiderl"
    iteration_key: str = "sysiderl_iteration"  # logger's iteration key

    def __init__(
        self,
        subrtn: ParameterExploring,
        behavior_policy: Policy,
        num_rollouts_per_distr: int,
        metric: Union[Callable[[np.ndarray], np.ndarray], None],
        obs_dim_weight: Union[list, np.ndarray],
        std_obs_filt: int = 5,
        w_abs: float = 0.5,
        w_sq: float = 1.0,
        num_workers: int = 4,
        base_seed: int = 1001,
    ):
        """
        Constructor

        :param subrtn: wrapped algorithm to fit the domain parameter distribution
        :param behavior_policy: lower level policy used to generate the rollouts
        :param num_rollouts_per_distr: number of rollouts per domain distribution parameter set
        :param metric: functional mapping from differences in observations to value
        :param obs_dim_weight: (diagonal) weight matrix for the different observation dimensions for the default metric
        :param std_obs_filt: number of standard deviations for the Gaussian filter applied to the observations
        :param w_abs: weight for the mean absolute errors for the default metric
        :param w_sq: weight for the mean squared errors for the default metric
        :param num_workers: number of environments for parallel sampling
        :param base_seed: seed to set for the parallel sampler in every iteration
        """
        if not isinstance(subrtn, ParameterExploring):
            raise pyrado.TypeErr(given=subrtn,
                                 expected_type=ParameterExploring)
        if not isinstance(subrtn.env, MetaDomainRandWrapper):
            raise pyrado.TypeErr(given=subrtn.env,
                                 expected_type=MetaDomainRandWrapper)
        if not isinstance(subrtn.policy, DomainDistrParamPolicy):
            raise pyrado.TypeErr(given=subrtn.policy,
                                 expected_type=DomainDistrParamPolicy)
        if not isinstance(behavior_policy, Policy):
            raise pyrado.TypeErr(given=behavior_policy, expected_type=Policy)
        if subrtn.policy.num_param != len(subrtn.env.dp_mapping):
            raise pyrado.ShapeErr(
                msg=
                f"Number of policy parameters {subrtn.policy.num_param} does not match the"
                f"number of domain distribution parameters {len(subrtn.env.dp_mapping)}!"
            )
        if subrtn.sampler.num_init_states_per_domain != 1:
            # Only sample one rollout in every domain. This is possible since we are synchronizing the init state.
            raise pyrado.ValueErr(
                given=subrtn.sampler.num_init_states_per_domain,
                eq_constraint="1")
        if num_rollouts_per_distr < 2:
            raise pyrado.ValueErr(given=num_rollouts_per_distr,
                                  g_constraint="1")
        if len(obs_dim_weight) != subrtn.env.obs_space.flat_dim:
            raise pyrado.ShapeErr(given=obs_dim_weight,
                                  expected_match=subrtn.env.obs_space)

        # Call Algorithm's constructor
        super().__init__(subrtn.save_dir, subrtn.max_iter, subrtn.policy,
                         subrtn.logger)

        self._subrtn = subrtn
        self._subrtn.save_name = "subrtn"
        self._behavior_policy = behavior_policy
        self.obs_dim_weight = np.diag(
            obs_dim_weight
        )  # weighting factor between the different observations
        self.std_obs_filt = std_obs_filt
        if metric is None or metric == "None":
            self.metric = partial(self.weighted_l1_l2_metric,
                                  w_abs=w_abs,
                                  w_sq=w_sq,
                                  obs_dim_weight=self.obs_dim_weight)
        else:
            self.metric = metric

        # Get and optionally clip the observation bounds of the environment
        elb, eub = subrtn.env.obs_space.bound_lo, subrtn.env.obs_space.bound_up
        elb, eub = self.override_obs_bounds(elb, eub,
                                            subrtn.env.obs_space.labels)
        self.obs_normalizer = UnitCubeProjector(bound_lo=elb, bound_up=eub)

        # Create the sampler used to execute the same policy as on the real system in the meta-randomized env
        self.base_seed = base_seed
        self.behavior_sampler = ParallelRolloutSampler(self._subrtn.env,
                                                       self._behavior_policy,
                                                       num_workers=num_workers,
                                                       min_rollouts=1,
                                                       seed=base_seed)
        self.num_rollouts_per_distr = num_rollouts_per_distr

    @property
    def subrtn(self) -> ParameterExploring:
        """Get the subroutine used for updating the domain parameter distribution."""
        return self._subrtn

    def reset(self, seed: int = None):
        # Reset internal variables inherited from Algorithm
        self._curr_iter = 0
        self._cnt_samples = 0
        self._highest_avg_ret = -pyrado.inf

        # Forward to subroutine
        self._subrtn.reset(seed)

    def step(self, snapshot_mode: str, meta_info: dict = None):
        if "rollouts_real" not in meta_info:
            raise pyrado.KeyErr(keys="rollouts_real", container=meta_info)

        # Extract the initial states from the real rollouts
        rollouts_real = meta_info["rollouts_real"]
        init_states_real = [ro.states[0, :] for ro in rollouts_real]

        # Sample new policy parameters a.k.a domain distribution parameters
        param_sets = self._subrtn.expl_strat.sample_param_sets(
            nominal_params=self._subrtn.policy.param_values,
            num_samples=self._subrtn.pop_size,
            include_nominal_params=True,
        )

        # Iterate over every domain parameter distribution. We basically mimic the ParameterExplorationSampler here,
        # but we need to adapt the randomizer (and not just the domain parameters) for every policy param set
        param_samples = []
        loss_hist = []
        for idx_ps, ps in enumerate(param_sets):
            # Update the randomizer to use the new domain distribution parameters
            new_ddp_vals = self._subrtn.policy.transform_to_ddp_space(ps)
            self._subrtn.env.adapt_randomizer(
                domain_distr_param_values=new_ddp_vals.detach().cpu().numpy())
            self._subrtn.env.randomizer.randomize(
                num_samples=self.num_rollouts_per_distr)
            sampled_domain_params = self._subrtn.env.randomizer.get_params()

            # Sample the rollouts
            rollouts_sim = self.behavior_sampler.sample(init_states_real,
                                                        sampled_domain_params,
                                                        eval=True)

            # Iterate over the batches of simulated rollouts that share the same initial state
            for idx_real, idcs_sim in enumerate(
                    gen_ordered_batch_idcs(self.num_rollouts_per_distr,
                                           len(rollouts_sim),
                                           sorted=True)):
                # Clip the rollouts, yielding two lists of pairwise equally long rollouts
                ros_real_tr, ros_sim_tr = self.truncate_rollouts(
                    [rollouts_real[idx_real]],
                    rollouts_sim[slice(idcs_sim[0], idcs_sim[-1] + 1)])

                # Check the validity of the initial states. The domain parameters will be different.
                assert len(ros_real_tr) == len(ros_sim_tr) == len(idcs_sim)
                assert check_all_equal([ro.states[0, :] for ro in ros_real_tr])
                assert check_all_equal([ro.states[0, :] for ro in ros_sim_tr])
                assert all([
                    np.allclose(r.states[0, :], s.states[0, :])
                    for r, s in zip(ros_real_tr, ros_sim_tr)
                ])

                # Compute the losses
                losses = np.asarray([
                    self.loss_fcn(ro_r, ro_s)
                    for ro_r, ro_s in zip(ros_real_tr, ros_sim_tr)
                ])

                if np.all(losses == 0.0):
                    raise pyrado.ValueErr(
                        msg=
                        "All SysIdViaEpisodicRL losses are equal to zero! Most likely the domain"
                        "randomization is too extreme, such that every trajectory is done after"
                        "one step. Check the exploration strategy.")

                # Handle zero losses by setting them to the maximum current loss
                losses[losses == 0] = np.max(losses)
                loss_hist.extend(losses)

                # We need to assign the loss value to the simulated rollout, but this one can be of a different
                # length than the real-world rollouts as well as of different length than the original
                # (non-truncated) simulated rollout. Thus, we simply write the loss value into the first step.
                for i, l in zip(range(idcs_sim[0], idcs_sim[-1] + 1), losses):
                    rollouts_sim[i].rewards[:] = 0.0
                    rollouts_sim[i].rewards[0] = -l

            # Collect the results
            param_samples.append(
                ParameterSample(params=ps, rollouts=rollouts_sim))

        # Bind the parameter samples and their rollouts in the usual container
        param_samp_res = ParameterSamplingResult(param_samples)
        self._cnt_samples += sum(
            [len(ro) for pss in param_samp_res for ro in pss.rollouts])

        # Log metrics computed from the old policy (before the update)
        loss_hist = np.asarray(loss_hist)
        self.logger.add_value("min sysid loss", np.min(loss_hist), 6)
        self.logger.add_value("median sysid loss", np.median(loss_hist), 6)
        self.logger.add_value("avg sysid loss", np.mean(loss_hist), 6)
        self.logger.add_value("max sysid loss", np.max(loss_hist), 6)
        self.logger.add_value("std sysid loss", np.std(loss_hist), 6)

        # Extract the best policy parameter sample for saving it later
        self._subrtn.best_policy_param = param_samp_res.parameters[np.argmax(
            param_samp_res.mean_returns)].clone()

        # Save snapshot data
        self.make_snapshot(snapshot_mode,
                           float(np.max(param_samp_res.mean_returns)),
                           meta_info)

        # Update the wrapped algorithm's update method
        self._subrtn.update(
            param_samp_res,
            ret_avg_curr=param_samp_res[0].mean_undiscounted_return)

    @staticmethod
    def override_obs_bounds(bound_lo: np.ndarray, bound_up: np.ndarray,
                            labels: np.ndarray) -> (np.ndarray, np.ndarray):
        """
        Default overriding method for the bounds of an observation space. This is necessary when the observations
        are scaled with their range, e.g. to compare a deviation over different kinds of observations like position and
        angular velocity. Thus, infinite bounds are not feasible.

        :param bound_lo: lower bound of the observation space
        :param bound_up: upper bound of the observation space
        :param labels: label for each dimension of the observation space to override
        :return: clipped lower and upper bound
        """
        bound_lo = ObsNormWrapper.override_bounds(bound_lo, {
            "theta_dot": -20.0,
            "alpha_dot": -20.0
        }, labels)
        bound_up = ObsNormWrapper.override_bounds(bound_up, {
            "theta_dot": 20.0,
            "alpha_dot": 20.0
        }, labels)
        return bound_lo, bound_up

    @staticmethod
    def weighted_l1_l2_metric(err: np.ndarray, w_abs: float, w_sq: float,
                              obs_dim_weight: np.ndarray):
        """
        Compute the weighted linear combination of the observation error's MAE and MSE, averaged over time

        .. note::
            In contrast to [1], we are using the mean absolute error and the mean squared error instead of the L1 and
            the L2 norm. The reason for this is that longer time series would be punished otherwise.

        :param err: error signal with time steps along the first dimension
        :param w_abs: weight for the mean absolute errors
        :param w_sq: weight for the mean squared errors
        :param obs_dim_weight: (diagonal) weight matrix for the different observation dimensions
        :return: weighted linear combination of the error's MAE and MSE, averaged over time
        """
        err_w = np.matmul(err, obs_dim_weight)
        return w_abs * np.mean(np.abs(err_w), axis=0) + w_sq * np.mean(
            np.power(err_w, 2), axis=0)

    def loss_fcn(self, rollout_real: StepSequence,
                 rollout_sim: StepSequence) -> float:
        """
        Compute the discrepancy between two time sequences of observations using the given metric.
        Be sure to align and truncate the rollouts beforehand.

        :param rollout_real: (concatenated) real-world rollout containing the observations
        :param rollout_sim: (concatenated) simulated rollout containing the observations
        :return: discrepancy cost summed over the observation dimensions
        """
        if len(rollout_real) != len(rollout_sim):
            raise pyrado.ShapeErr(given=rollout_real,
                                  expected_match=rollout_sim)

        # Extract the observations
        real_obs = rollout_real.get_data_values("observations",
                                                truncate_last=True)
        sim_obs = rollout_sim.get_data_values("observations",
                                              truncate_last=True)

        # Filter the observations
        real_obs = gaussian_filter1d(real_obs, self.std_obs_filt, axis=0)
        sim_obs = gaussian_filter1d(sim_obs, self.std_obs_filt, axis=0)

        # Normalize the signals
        real_obs_norm = self.obs_normalizer.project_to(real_obs)
        sim_obs_norm = self.obs_normalizer.project_to(sim_obs)

        # Compute loss based on the error
        loss_per_obs_dim = self.metric(real_obs_norm - sim_obs_norm)
        assert len(loss_per_obs_dim) == real_obs.shape[1]
        assert all(loss_per_obs_dim >= 0)
        return sum(loss_per_obs_dim)

    @staticmethod
    def truncate_rollouts(
        rollouts_real: Sequence[StepSequence],
        rollouts_sim: Sequence[StepSequence],
        replicate: bool = True
    ) -> Tuple[Sequence[StepSequence], Sequence[StepSequence]]:
        """
        In case (some of the) rollouts fail or succeed in one domain but not in the other, we truncate the longer
        observation sequence. When truncating, we compare each of the M real rollouts to each of the N simulated
        rollouts, thus replicating the real rollouts N times and the simulated rollouts M times.

        :param rollouts_real: M real-world rollouts of different length if `replicate = True`, else K real-world
                              rollouts of different length
        :param rollouts_sim: N simulated rollouts of different length if `replicate = True`, else K simulated
                              rollouts of different length
        :param replicate: if `False` the i-th rollout from `rollouts_real` is (only) compared with the i-th rollout from
                          `rollouts_sim`, in this case the number of rollouts and the initial states have to match
        :return: MxN real-world rollouts and MxN simulated rollouts of equal length if `replicate = True`, else
                 K real-world rollouts and K simulated rollouts of equal length
        """
        if not isinstance(rollouts_real[0], Iterable):
            raise pyrado.TypeErr(given=rollouts_real[0],
                                 expected_type=Iterable)
        if not isinstance(rollouts_sim[0], Iterable):
            raise pyrado.TypeErr(given=rollouts_sim[0], expected_type=Iterable)
        if not replicate and len(rollouts_real) != len(rollouts_sim):
            raise pyrado.ShapeErr(
                msg=
                "In case of a one on one comparison, the number of rollouts needs to be equal!"
            )

        # Choose the function used to pair up the rollouts for the comparison
        comp_fcn = product if replicate else zip

        # Go over all combinations rollouts individually
        rollouts_real_tr = []
        rollouts_sim_tr = []
        for ro_r, ro_s in comp_fcn(rollouts_real, rollouts_sim):
            # Handle rollouts of different length, assuming that they are starting at the same state
            if ro_r.length < ro_s.length:
                rollouts_real_tr.append(ro_r)
                rollouts_sim_tr.append(ro_s[:ro_r.length])
            elif ro_r.length > ro_s.length:
                rollouts_real_tr.append(ro_r[:ro_s.length])
                rollouts_sim_tr.append(ro_s)
            else:
                rollouts_real_tr.append(ro_r)
                rollouts_sim_tr.append(ro_s)

        return rollouts_real_tr, rollouts_sim_tr

    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        # ParameterExploring subroutine saves the best policy (in this case a DomainDistrParamPolicy)
        prefix = meta_info.get("prefix", "")
        if prefix != "":
            self._subrtn.save_snapshot(meta_info=dict(
                prefix=f"{prefix}_ddp"))  # save iter_X_ddp_policy.pt
        self._subrtn.save_snapshot(
            meta_info=dict(prefix="ddp"))  # override ddp_policy.pt

        joblib.dump(self._subrtn.env, osp.join(self.save_dir, "env_sim.pkl"))

        # Set the randomizer to the current domain distribution (given by the search distribution's mean) and print it
        cpp = self._subrtn.policy.transform_to_ddp_space(
            self._subrtn.policy.param_values)
        self._subrtn.env.adapt_randomizer(
            domain_distr_param_values=cpp.detach().cpu().numpy())
        print_cbt(
            f"Current policy domain parameter distribution\n{self._subrtn.env.randomizer}",
            "g")

        # Set the randomizer to the best fitted domain distribution
        cbp = self._subrtn.policy.transform_to_ddp_space(
            self._subrtn.best_policy_param)
        self._subrtn.env.adapt_randomizer(
            domain_distr_param_values=cbp.detach().cpu().numpy())
        print_cbt(
            f"Best fitted domain parameter distribution\n{self._subrtn.env.randomizer}",
            "g")

        if "rollouts_real" not in meta_info:
            raise pyrado.KeyErr(keys="rollouts_real", container=meta_info)
        pyrado.save(meta_info["rollouts_real"],
                    "rollouts_real.pkl",
                    self.save_dir,
                    prefix=prefix)
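# Illustration (not pyrado code) of the pairing logic documented in `truncate_rollouts` above:
# with `replicate=True` every real rollout is compared with every simulated one (M*N pairs via
# itertools.product), otherwise they are matched one-on-one (K pairs via zip), and each pair is
# cut to the shorter of the two lengths. Plain integers stand in for the actual rollouts here.
from itertools import product

real_lengths = [100, 80]      # M = 2 real-world rollouts (lengths only)
sim_lengths = [90, 120, 70]   # N = 3 simulated rollouts (lengths only)
for replicate in (True, False):
    comp_fcn = product if replicate else zip
    pairs = [min(lr, ls) for lr, ls in comp_fcn(real_lengths, sim_lengths)]
    print(replicate, pairs)   # 6 truncated lengths for product, 2 for zip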
Example #19
0
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 critic: GAE,
                 max_iter: int,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 vfcn_coeff: float = 0.5,
                 entropy_coeff: float = 1e-3,
                 batch_size: int = 32,
                 std_init: float = 1.0,
                 max_grad_norm: float = None,
                 num_workers: int = 4,
                 lr: float = 5e-4,
                 lr_scheduler=None,
                 lr_scheduler_hparam: [dict, None] = None,
                 logger: StepLogger = None):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param critic: advantage estimation function $A(s,a) = Q(s,a) - V(s)$
        :param max_iter: number of iterations (policy updates)
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param vfcn_coeff: weighting factor of the value function term in the combined loss, specific to PPO2
        :param entropy_coeff: weighting factor of the entropy term in the combined loss, specific to PPO2
        :param batch_size: number of samples per policy update batch
        :param std_init: initial standard deviation on the actions for the exploration noise
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param num_workers: number of environments for parallel sampling
        :param lr: (initial) learning rate for the optimizer which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        # Call ActorCritic's constructor
        super().__init__(env, policy, critic, save_dir, max_iter, logger)

        # Store the inputs
        self.min_rollouts = min_rollouts
        self.min_steps = min_steps
        self.vfcn_coeff = vfcn_coeff
        self.entropy_coeff = entropy_coeff
        self.batch_size = batch_size
        self.max_grad_norm = max_grad_norm

        # Initialize
        self._expl_strat = NormalActNoiseExplStrat(self._policy, std_init=std_init)
        self.sampler = ParallelRolloutSampler(
            env, self.expl_strat,
            num_workers=num_workers,
            min_steps=min_steps,
            min_rollouts=min_rollouts
        )
        self.optim = to.optim.RMSprop(
            [{'params': self._policy.parameters()},
             {'params': self.expl_strat.noise.parameters()},
             {'params': self._critic.vfcn.parameters()}],
            lr=lr, eps=1e-5
        )
        self._lr_scheduler = lr_scheduler
        self._lr_scheduler_hparam = lr_scheduler_hparam
        if lr_scheduler is not None:
            self._lr_scheduler = lr_scheduler(self.optim, **lr_scheduler_hparam)
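# Hedged usage sketch of the `lr_scheduler` / `lr_scheduler_hparam` pair accepted by the
# constructor above: the scheduler class is instantiated lazily with the optimizer and the
# hyper-parameter dict. The linear module below is a stand-in, not a pyrado policy.
import torch as to
from torch.optim import lr_scheduler

optim = to.optim.RMSprop(to.nn.Linear(4, 2).parameters(), lr=5e-4, eps=1e-5)
sched_cls, sched_hparam = lr_scheduler.ExponentialLR, dict(gamma=0.999)
sched = sched_cls(optim, **sched_hparam)  # mirrors lr_scheduler(self.optim, **lr_scheduler_hparam)
for _ in range(3):
    optim.step()  # one (dummy) optimizer update ...
    sched.step()  # ... followed by one scheduler step per epoch
print(sched.get_last_lr())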
Example #20
0
class LQR(Algorithm):
    """ Linear Quadratic Regulator created using the control module """

    name: str = 'lqr'

    def __init__(self,
                 save_dir: str,
                 env: SimEnv,
                 policy: Policy,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 num_workers: int = 4,
                 logger: StepLogger = None,
                 ball_z_dim_mismatch: bool = True):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy which this algorithm is creating
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_workers: number of environments for parallel sampling
        :param ball_z_dim_mismatch: only useful for BallOnPlate5DSim,
                                    set to True if the controller does not have the z component (relative position)
                                    of the ball in the state vector, i.e. state is 14-dim instead of 16-dim
        """
        if not isinstance(env, SimEnv):
            raise pyrado.TypeErr(given=env, expected_type=SimEnv)
        if not isinstance(policy, LinearPolicy):
            raise pyrado.TypeErr(given=policy, expected_type=LinearPolicy)

        # Call Algorithm's constructor
        super().__init__(save_dir, 1, policy, logger)

        # Store the inputs
        self._env = env
        self.ball_z_dim_mismatch = ball_z_dim_mismatch

        self.sampler = ParallelRolloutSampler(
            env, self._policy,
            num_workers=num_workers,
            min_steps=min_steps,
            min_rollouts=min_rollouts
        )
        self.eigvals = np.array([pyrado.inf])  # initialize with something positive

    def step(self, snapshot_mode: str, meta_info: dict = None):

        if isinstance(inner_env(self._env), BallOnPlate5DSim):
            ctrl_gains = to.tensor([
                [0.1401, 0, 0, 0, -0.09819, -0.1359, 0, 0.545, 0, 0, 0, -0.01417, -0.04427, 0],
                [0, 0.1381, 0, 0.2518, 0, 0, -0.2142, 0, 0.5371, 0, 0.03336, 0, 0, -0.1262],
                [0, 0, 0.1414, 0.0002534, 0, 0, -0.0002152, 0, 0, 0.5318, 0, 0, 0, -0.0001269],
                [0, -0.479, -0.0004812, 39.24, 0, 0, -15.44, 0, -1.988, -0.001934, 9.466, 0, 0, -13.14],
                [0.3039, 0, 0, 0, 25.13, 15.66, 0, 1.284, 0, 0, 0, 7.609, 6.296, 0]
            ])

            # Compensate for the mismatching state definition
            if self.ball_z_dim_mismatch:
                ctrl_gains = insert_tensor_col(ctrl_gains, 7, to.zeros((5, 1)))  # ball z position
                ctrl_gains = insert_tensor_col(ctrl_gains, -1, to.zeros((5, 1)))  # ball z velocity

        elif isinstance(inner_env(self._env), QBallBalancerSim):
            # Since the control module can be tricky to install (using Anaconda is recommended), we only import it if needed
            import control

            # System modeling
            dp = self._env.domain_param
            dp['J_eq'] = self._env._J_eq
            dp['B_eq_v'] = self._env._B_eq_v
            dp['c_kin'] = self._env._c_kin
            dp['zeta'] = self._env._zeta
            dp['A_m'] = self._env._A_m

            A = np.zeros((self._env.obs_space.flat_dim, self._env.obs_space.flat_dim))
            A[:self._env.obs_space.flat_dim//2, self._env.obs_space.flat_dim//2:] = \
                np.eye(self._env.obs_space.flat_dim//2)
            A[4, 4] = -dp['B_eq_v']/dp['J_eq']
            A[5, 5] = -dp['B_eq_v']/dp['J_eq']
            A[6, 0] = dp['c_kin']*dp['m_ball']*dp['g']*dp['r_ball']**2/dp['zeta']
            A[6, 6] = -dp['c_kin']*dp['r_ball']**2/dp['zeta']
            A[7, 1] = dp['c_kin']*dp['m_ball']*dp['g']*dp['r_ball']**2/dp['zeta']
            A[7, 7] = -dp['c_kin']*dp['r_ball']**2/dp['zeta']
            B = np.zeros((self._env.obs_space.flat_dim, self._env.act_space.flat_dim))
            B[4, 0] = dp['A_m']/dp['J_eq']
            B[5, 1] = dp['A_m']/dp['J_eq']
            # C = np.zeros((self._env.obs_space.flat_dim // 2, self._env.obs_space.flat_dim))
            # C[:self._env.obs_space.flat_dim // 2, :self._env.obs_space.flat_dim // 2] =
            # np.eye(self._env.obs_space.flat_dim // 2)
            # D = np.zeros((self._env.obs_space.flat_dim // 2, self._env.act_space.flat_dim))

            # Get the weighting matrices from the environment
            if isinstance(self._env.task.rew_fcn, QuadrErrRewFcn):
                # The environment uses a reward function compatible with the LQR
                Q = self._env.task.rew_fcn.Q
                R = self._env.task.rew_fcn.R
            else:
                # The environment does not use a reward function compatible with the LQR, apply some fine tuning
                Q = np.diag([1e2, 1e2, 5e2, 5e2, 1e-2, 1e-2, 5e+0, 5e+0])
                R = np.diag([1e-2, 1e-2])

            # Solve the continuous time Riccati eq
            K, _, self.eigvals = control.lqr(A, B, Q, R)  # for discrete system pass dt
            ctrl_gains = to.from_numpy(K).to(to.get_default_dtype())

        else:
            raise pyrado.TypeErr(given=inner_env(self._env), expected_type=[BallOnPlate5DSim, QBallBalancerSim])

        # Assign the controller gains
        self._policy.init_param(-1*ctrl_gains)  # in classical control it is u = -K*x; here a = psi(s)*s

        # Sample rollouts to evaluate the LQR
        ros = self.sampler.sample()

        # Logging
        rets = [ro.undiscounted_return() for ro in ros]
        self.logger.add_value('max return', np.max(rets), 4)
        self.logger.add_value('median return', np.median(rets), 4)
        self.logger.add_value('min return', np.min(rets), 4)
        self.logger.add_value('avg return', np.mean(rets), 4)
        self.logger.add_value('std return', np.std(rets), 4)
        self.logger.add_value('avg rollout len', np.mean([ro.length for ro in ros]), 4)
        self.logger.add_value('num total samples', self._cnt_samples)
        self.logger.add_value('min mag policy param',
                              self._policy.param_values[to.argmin(abs(self._policy.param_values))])
        self.logger.add_value('max mag policy param',
                              self._policy.param_values[to.argmax(abs(self._policy.param_values))])

        # Save snapshot data
        self.make_snapshot(snapshot_mode, float(np.mean(rets)), meta_info)

    def stopping_criterion_met(self) -> bool:
        """ Checks if the all eigenvalues of the closed loop system are negative. """
        return (self.eigvals < 0).all()

    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        if meta_info is None:
            # This algorithm instance is not a subroutine of another algorithm
            pyrado.save(self._env, 'env', 'pkl', self.save_dir, meta_info)
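# Hedged, self-contained sketch of the LQR computation in `step()` above, using SciPy instead
# of the `control` package (which the comment notes can be tricky to install). The double
# integrator below is a stand-in for the Quanser model.
import numpy as np
from scipy.linalg import solve_continuous_are

A = np.array([[0.0, 1.0], [0.0, 0.0]])
B = np.array([[0.0], [1.0]])
Q = np.diag([1.0, 0.1])
R = np.array([[1e-2]])

P = solve_continuous_are(A, B, Q, R)    # continuous-time Riccati equation
K = np.linalg.solve(R, B.T @ P)         # optimal gains, classical control law u = -K @ x
eigvals = np.linalg.eigvals(A - B @ K)  # closed-loop eigenvalues
assert (eigvals.real < 0).all()         # negative real parts -> stable closed loop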
Example #21
0
    def __init__(
        self,
        save_dir: pyrado.PathLike,
        env: Env,
        particle_hparam: dict,
        max_iter: int,
        num_particles: int,
        temperature: float,
        lr: float,
        horizon: int,
        std_init: float = 1.0,
        min_rollouts: int = None,
        min_steps: int = 10000,
        num_workers: int = 4,
        serial: bool = True,
        logger: StepLogger = None,
    ):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param particle_hparam: hyper-parameters for particle template construction
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_particles: number of distinct particles
        :param temperature: the temperature of the SVGD update, which determines how strongly the particles are
                            coupled during training
        :param lr: the learning rate for the update of the particles
        :param horizon: horizon for each particle
        :param std_init: initial standard deviation for the exploration
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_workers: number of environments for parallel sampling
        :param serial: if `False`, serial mode is switched off, which can be used to partly control the flow of SVPG
                       from outside
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not isinstance(particle_hparam, dict):
            raise pyrado.TypeErr(given=particle_hparam, expected_type=dict)
        if not all([key in particle_hparam for key in ["actor", "vfcn", "critic"]]):
            raise AttributeError("particle_hparam must contain the keys 'actor', 'vfcn', and 'critic'")

        # Call Algorithm's constructor
        super().__init__(save_dir, max_iter, policy=None, logger=logger)

        # Store the inputs
        self._env = env
        self.num_particles = num_particles
        self.horizon = horizon
        self.lr = lr
        self.temperature = temperature
        self.serial = serial

        # Prepare placeholders for particles
        self.particles = [None] * num_particles
        self.particleSteps = [None] * num_particles
        self.expl_strats = [None] * num_particles
        self.optimizers = [None] * num_particles
        self.fixed_particles = [None] * num_particles
        self.fixed_expl_strats = [None] * num_particles
        self.samplers = [None] * num_particles
        self.count = 0
        self.update_count = 0

        # Particle factory
        actor = FNNPolicy(spec=env.spec, **particle_hparam["actor"])
        vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **particle_hparam["vfcn"])
        critic = GAE(vfcn, **particle_hparam["critic"])
        self.register_as_logger_parent(critic)
        particle = SVPGParticle(env.spec, actor, critic)

        for i in range(self.num_particles):
            self.particles[i] = deepcopy(particle)
            self.particles[i].init_param()
            self.expl_strats[i] = NormalActNoiseExplStrat(self.particles[i].actor, std_init)
            self.optimizers[i] = to.optim.Adam(self.expl_strats[i].parameters(), lr=self.lr)
            self.fixed_particles[i] = deepcopy(self.particles[i])
            self.fixed_expl_strats[i] = deepcopy(self.expl_strats[i])
            self.particleSteps[i] = 0

            if self.serial:
                self.samplers[i] = ParallelRolloutSampler(
                    env, self.expl_strats[i], num_workers, min_rollouts=min_rollouts, min_steps=min_steps
                )
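# Minimal sketch of the particle-factory pattern above: one template module is deep-copied per
# particle, each copy is re-initialized and gets its own Adam optimizer, so the particles only
# interact through the SVGD update. The linear layer is a stand-in for the SVPGParticle used above.
from copy import deepcopy
import torch as to

template = to.nn.Linear(4, 2)
particles = [deepcopy(template) for _ in range(3)]
for p in particles:
    to.nn.init.xavier_uniform_(p.weight)  # stand-in for init_param()
optimizers = [to.optim.Adam(p.parameters(), lr=1e-3) for p in particles]
assert particles[0].weight.data_ptr() != particles[1].weight.data_ptr()  # no parameter sharing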
Example #22
0
    def __init__(
        self,
        save_dir: str,
        env: Env,
        policy: Policy,
        lr: float = 5e-4,
        std_init: float = 0.15,
        min_steps: int = 1500,
        num_epochs: int = 10,
        max_iter: int = 500,
        num_teachers: int = 8,
        teacher_extra: Optional[dict] = None,
        teacher_policy: Optional[Policy] = None,
        teacher_algo: Optional[callable] = None,
        teacher_algo_hparam: Optional[dict] = None,
        randomizer: Optional[DomainRandomizer] = None,
        logger: Optional[StepLogger] = None,
        num_workers: int = 4,
    ):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param lr: (initial) learning rate for the optimizer which can be modified by the scheduler.
                    By default, the learning rate is constant.
        :param std_init: initial standard deviation on the actions for the exploration noise
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_epochs: number of epochs (how often we iterate over the same batch)
        :param max_iter: number of iterations (policy updates)
        :param num_teachers: number of teachers that are used for distillation
        :param teacher_extra: extra dict from PDDRTeachers algo. If provided, teachers are loaded from there
        :param teacher_policy: policy to be updated (is duplicated for each teacher)
        :param teacher_algo: algorithm class to be used for training the teachers
        :param teacher_algo_hparam: hyper-params to be used for teacher_algo
        :param randomizer: randomizer for sampling the teacher domain parameters; if `None`, the environment's default
                           one is used
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        :param num_workers: number of environments for parallel sampling
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not isinstance(policy, Policy):
            raise pyrado.TypeErr(given=policy, expected_type=Policy)

        # Call Algorithm's constructor.
        super().__init__(
            num_checkpoints=1, init_checkpoint=-1, save_dir=save_dir, max_iter=max_iter, policy=policy, logger=logger
        )

        # Store the inputs
        self.env_real = env
        self.min_steps = min_steps
        self.num_epochs = num_epochs
        self.num_teachers = num_teachers
        self.num_workers = num_workers

        self.teacher_policies = []
        self.teacher_envs = []
        self.teacher_expl_strats = []
        self.teacher_critics = []
        self.teacher_ex_dirs = []

        # Teachers
        if teacher_policy is not None and teacher_algo is not None and teacher_algo_hparam is not None:
            if not isinstance(teacher_policy, Policy):
                raise pyrado.TypeErr(given=teacher_policy, expected_type=Policy)
            if not issubclass(teacher_algo, Algorithm):
                raise pyrado.TypeErr(given=teacher_algo, expected_type=Algorithm)

            if randomizer is None:
                self.randomizer = create_default_randomizer(env)
            else:
                assert isinstance(randomizer, DomainRandomizer)
                self.randomizer = randomizer

            self.set_random_envs()

            # Prepare folders
            self.teacher_ex_dirs = [os.path.join(self.save_dir, f"teachers_{idx}") for idx in range(self.num_teachers)]
            for idx in range(self.num_teachers):
                os.makedirs(self.teacher_ex_dirs[idx], exist_ok=True)

            # Create teacher algos
            self.algos = [
                teacher_algo(
                    save_dir=self.teacher_ex_dirs[idx],
                    env=self.teacher_envs[idx],
                    policy=deepcopy(teacher_policy),
                    logger=None,
                    **deepcopy(teacher_algo_hparam),
                )
                for idx in range(self.num_teachers)
            ]
        elif teacher_extra is not None:
            self.unpack_teachers(teacher_extra)
            assert self.num_teachers == len(self.teacher_policies)
            self.reset_checkpoint()
        else:
            self.load_teachers()
            if self.num_teachers < len(self.teacher_policies):
                print(
                    f"You have loaded {len(self.teacher_policies)} teachers. Only the first {self.num_teachers} will be used!"
                )
                self.prune_teachers()
            assert self.num_teachers == len(self.teacher_policies)
            self.reset_checkpoint()

        # Student
        self._expl_strat = NormalActNoiseExplStrat(self._policy, std_init=std_init)
        self.optimizer = to.optim.Adam([{"params": self.policy.parameters()}], lr=lr)

        # Environments
        self.samplers = [
            ParallelRolloutSampler(
                self.teacher_envs[t],
                deepcopy(self._expl_strat),
                num_workers=self.num_workers,
                min_steps=self.min_steps,
            )
            for t in range(self.num_teachers)
        ]

        self.teacher_weights = np.ones(self.num_teachers)

        # Distillation loss criterion
        self.criterion = to.nn.KLDivLoss(log_target=True, reduction="batchmean")
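# Hedged sketch of the distillation criterion created above: KLDivLoss with log_target=True
# expects log-probabilities for both arguments and returns the KL divergence D(teacher || student)
# averaged over the batch. The logits below are toy data, not outputs of the actual policies.
import torch as to

criterion = to.nn.KLDivLoss(log_target=True, reduction="batchmean")
student_logits, teacher_logits = to.randn(32, 5), to.randn(32, 5)
loss = criterion(
    to.log_softmax(student_logits, dim=-1),  # input: student log-probs
    to.log_softmax(teacher_logits, dim=-1),  # target: teacher log-probs
)
print(loss.item())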
Example #23
0
    def __init__(self,
                 save_dir: str,
                 env: [SimEnv, StateAugmentationWrapper],
                 subrtn: Algorithm,
                 policy: Policy,
                 expl_strat: StochasticActionExplStrat,
                 max_iter: int,
                 num_rollouts: int = None,
                 steps_num: int = None,
                 apply_dynamics_noise: bool = False,
                 dyn_eps: float = 0.01,
                 dyn_phi: float = 0.1,
                 halfspan: float = 0.25,
                 apply_proccess_noise: bool = False,
                 proc_eps: float = 0.01,
                 proc_phi: float = 0.05,
                 apply_observation_noise: bool = False,
                 obs_eps: float = 0.01,
                 obs_phi: float = 0.05,
                 torch_observation: bool = True,
                 num_workers: int = 4,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment in which the agent should be trained
        :param subrtn: algorithm which performs the policy / value-function optimization
        :param policy: policy to be updated
        :param expl_strat: the exploration strategy
        :param max_iter: the maximum number of iterations
        :param num_rollouts: the number of rollouts to be performed for each update step
        :param steps_num: the number of steps to be performed for each update step
        :param apply_dynamics_noise: whether adversarially generated dynamics noise should be applied
        :param dyn_eps: the intensity of generated dynamics noise
        :param dyn_phi: the probability of applying dynamics noise
        :param halfspan: the halfspan of the uniform random distribution used to sample
        :param apply_proccess_noise: whether adversarially generated process noise should be applied
        :param proc_eps: the intensity of generated process noise
        :param proc_phi: the probability of applying process noise
        :param apply_observation_noise: whether adversarially generated observation noise should be applied
        :param obs_eps: the intensity of generated observation noise
        :param obs_phi: the probability of applying observation noise
        :param torch_observation: whether to compute the observations with PyTorch such that they are differentiable
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        assert isinstance(subrtn, Algorithm)
        assert isinstance(max_iter, int) and max_iter > 0

        super().__init__(save_dir, max_iter, policy, logger)

        # Initialize adversarial wrappers
        if apply_dynamics_noise:
            assert isinstance(env, StateAugmentationWrapper)
            env = AdversarialDynamicsWrapper(env, self.policy, dyn_eps,
                                             dyn_phi, halfspan)
        if apply_proccess_noise:
            env = AdversarialStateWrapper(env,
                                          self.policy,
                                          proc_eps,
                                          proc_phi,
                                          torch_observation=torch_observation)
        if apply_observation_noise:
            env = AdversarialObservationWrapper(env, self.policy, obs_eps,
                                                obs_phi)

        self.num_rollouts = num_rollouts
        self.sampler = ParallelRolloutSampler(
            env,
            expl_strat,
            num_workers=num_workers,
            min_steps=steps_num,
            min_rollouts=num_rollouts,
        )

        # Subroutine
        self._subrtn = subrtn
        self._subrtn.save_name = 'subrtn'
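# Generic, hedged illustration of the eps/phi convention documented above: with probability `phi`
# an adversarial perturbation of intensity `eps` is added to the observation, here in FGSM style
# (sign of the gradient of a scalar score). This is only a sketch, not the
# AdversarialObservationWrapper implementation.
import torch as to

def perturb(obs: to.Tensor, score_fcn, eps: float = 0.01, phi: float = 0.05) -> to.Tensor:
    if to.rand(1).item() > phi:
        return obs                        # most of the time: leave the observation untouched
    obs = obs.clone().requires_grad_(True)
    score_fcn(obs).backward()             # gradient of the (assumed differentiable) score
    return (obs + eps * obs.grad.sign()).detach()

print(perturb(to.randn(4), lambda o: (o ** 2).sum(), eps=0.01, phi=1.0))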
Example #24
0
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: TwoHeadedPolicy,
                 qfcn_1: Policy,
                 qfcn_2: Policy,
                 memory_size: int,
                 gamma: float,
                 max_iter: int,
                 num_batch_updates: Optional[int] = None,
                 tau: float = 0.995,
                 ent_coeff_init: float = 0.2,
                 learn_ent_coeff: bool = True,
                 target_update_intvl: int = 1,
                 num_init_memory_steps: int = None,
                 standardize_rew: bool = True,
                 rew_scale: Union[int, float] = 1.,
                 min_rollouts: int = None,
                 min_steps: int = None,
                 batch_size: int = 256,
                 num_workers: int = 4,
                 max_grad_norm: float = 5.,
                 lr: float = 3e-4,
                 lr_scheduler=None,
                 lr_scheduler_hparam: Optional[dict] = None,
                 logger: StepLogger = None):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param qfcn_1: state-action value function $Q(s,a)$, the associated target Q-function is created from a
                        re-initialized copy of this one
        :param qfcn_2: state-action value function $Q(s,a)$, the associated target Q-function is created from a
                        re-initialized copy of this one
        :param memory_size: number of transitions in the replay memory buffer, e.g. 1000000
        :param gamma: temporal discount factor for the state values
        :param max_iter: number of iterations (policy updates)
        :param num_batch_updates: number of (batched) gradient updates per algorithm step
        :param tau: interpolation factor for averaging the target networks, used for the soft update a.k.a. polyak
                    update, between 0 and 1
        :param ent_coeff_init: initial weighting factor of the entropy term in the loss function
        :param learn_ent_coeff: adapt the weighting factor of the entropy term
        :param target_update_intvl: number of iterations that pass before updating the target network
        :param num_init_memory_steps: number of samples used to initially fill the replay buffer with, pass `None` to
                                      fill the buffer completely
        :param standardize_rew: if `True`, the rewards are standardized to be $\sim N(0,1)$
        :param rew_scale: scaling factor for the rewards, defaults to no scaling
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param batch_size: number of samples per policy update batch
        :param num_workers: number of environments for parallel sampling
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param lr: (initial) learning rate for the optimizer which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param lr_scheduler: learning rate scheduler type for the policy and the Q-functions that does one step
                             per `update()` call
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if typed_env(env, ActNormWrapper) is None:
            raise pyrado.TypeErr(msg='SAC requires an environment wrapped by an ActNormWrapper!')
        if not isinstance(qfcn_1, Policy):
            raise pyrado.TypeErr(given=qfcn_1, expected_type=Policy)
        if not isinstance(qfcn_2, Policy):
            raise pyrado.TypeErr(given=qfcn_2, expected_type=Policy)

        # Call ValueBased's constructor
        super().__init__(save_dir, env, policy, memory_size, gamma, max_iter, num_batch_updates, target_update_intvl,
                         num_init_memory_steps, min_rollouts, min_steps, batch_size, num_workers, max_grad_norm, logger)

        self.qfcn_1 = qfcn_1
        self.qfcn_2 = qfcn_2
        self.qfcn_targ_1 = deepcopy(self.qfcn_1).eval()  # will not be trained using an optimizer
        self.qfcn_targ_2 = deepcopy(self.qfcn_2).eval()  # will not be trained using an optimizer
        self.tau = tau
        self.learn_ent_coeff = learn_ent_coeff
        self.standardize_rew = standardize_rew
        self.rew_scale = rew_scale

        # Create sampler for exploration during training
        self._expl_strat = SACExplStrat(self._policy)
        self.sampler_trn = ParallelRolloutSampler(
            self._env, self._expl_strat,
            num_workers=num_workers if min_steps != 1 else 1,
            min_steps=min_steps,  # in [2] this would be 1
            min_rollouts=min_rollouts,  # in [2] this would be None
        )

        # Q-function optimizers
        self._optim_policy = to.optim.Adam([{'params': self._policy.parameters()}], lr=lr, eps=1e-5)
        self._optim_qfcns = to.optim.Adam([{'params': self.qfcn_1.parameters()},
                                           {'params': self.qfcn_2.parameters()}], lr=lr, eps=1e-5)

        # Automatic entropy tuning
        log_ent_coeff_init = to.log(to.tensor(ent_coeff_init, device=policy.device, dtype=to.get_default_dtype()))
        if learn_ent_coeff:
            self._log_ent_coeff = nn.Parameter(log_ent_coeff_init, requires_grad=True)
            self._ent_coeff_optim = to.optim.Adam([{'params': self._log_ent_coeff}], lr=lr, eps=1e-5)
            self.target_entropy = -to.prod(to.tensor(env.act_space.shape))
        else:
            self._log_ent_coeff = log_ent_coeff_init

        # Learning rate scheduler
        self._lr_scheduler_policy = lr_scheduler
        self._lr_scheduler_hparam = lr_scheduler_hparam
        if lr_scheduler is not None:
            self._lr_scheduler_policy = lr_scheduler(self._optim_policy, **lr_scheduler_hparam)
            self._lr_scheduler_qfcns = lr_scheduler(self._optim_qfcns, **lr_scheduler_hparam)
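# Hedged sketch of the polyak (soft) target update that `tau` above controls: with tau close to 1
# the target Q-functions trail the online ones slowly. The convention
# targ <- tau * targ + (1 - tau) * online is an assumption here, and the linear layers are
# stand-ins for the actual Q-function networks.
import torch as to

q_online, q_target = to.nn.Linear(4, 1), to.nn.Linear(4, 1)
tau = 0.995
with to.no_grad():
    for p_targ, p_onl in zip(q_target.parameters(), q_online.parameters()):
        p_targ.mul_(tau).add_((1.0 - tau) * p_onl)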
Example #25
0
    def __init__(
        self,
        subrtn: ParameterExploring,
        behavior_policy: Policy,
        num_rollouts_per_distr: int,
        metric: Union[Callable[[np.ndarray], np.ndarray], None],
        obs_dim_weight: Union[list, np.ndarray],
        std_obs_filt: int = 5,
        w_abs: float = 0.5,
        w_sq: float = 1.0,
        num_workers: int = 4,
        base_seed: int = 1001,
    ):
        """
        Constructor

        :param subrtn: wrapped algorithm to fit the domain parameter distribution
        :param behavior_policy: lower level policy used to generate the rollouts
        :param num_rollouts_per_distr: number of rollouts per domain distribution parameter set
        :param metric: function mapping the difference between real and simulated observations to a cost value per
                       observation dimension; pass `None` to use the default weighted L1/L2 metric
        :param obs_dim_weight: (diagonal) weight matrix for the different observation dimensions for the default metric
        :param std_obs_filt: standard deviation (in time steps) of the Gaussian filter applied to the observations
        :param w_abs: weight for the mean absolute errors for the default metric
        :param w_sq: weight for the mean squared errors for the default metric
        :param num_workers: number of environments for parallel sampling
        :param base_seed: seed to set for the parallel sampler in every iteration
        """
        if not isinstance(subrtn, ParameterExploring):
            raise pyrado.TypeErr(given=subrtn,
                                 expected_type=ParameterExploring)
        if not isinstance(subrtn.env, MetaDomainRandWrapper):
            raise pyrado.TypeErr(given=subrtn.env,
                                 expected_type=MetaDomainRandWrapper)
        if not isinstance(subrtn.policy, DomainDistrParamPolicy):
            raise pyrado.TypeErr(given=subrtn.policy,
                                 expected_type=DomainDistrParamPolicy)
        if not isinstance(behavior_policy, Policy):
            raise pyrado.TypeErr(given=behavior_policy, expected_type=Policy)
        if subrtn.policy.num_param != len(subrtn.env.dp_mapping):
            raise pyrado.ShapeErr(
                msg=f"Number of policy parameters {subrtn.policy.num_param} does not match the "
                    f"number of domain distribution parameters {len(subrtn.env.dp_mapping)}!"
            )
        if subrtn.sampler.num_init_states_per_domain != 1:
            # Only sample one rollout in every domain. This is possible since we are synchronizing the init state.
            raise pyrado.ValueErr(
                given=subrtn.sampler.num_init_states_per_domain,
                eq_constraint="1")
        if num_rollouts_per_distr < 2:
            raise pyrado.ValueErr(given=num_rollouts_per_distr,
                                  g_constraint="1")
        if len(obs_dim_weight) != subrtn.env.obs_space.flat_dim:
            raise pyrado.ShapeErr(given=obs_dim_weight,
                                  expected_match=subrtn.env.obs_space)

        # Call Algorithm's constructor
        super().__init__(subrtn.save_dir, subrtn.max_iter, subrtn.policy,
                         subrtn.logger)

        self._subrtn = subrtn
        self._subrtn.save_name = "subrtn"
        self._behavior_policy = behavior_policy
        self.obs_dim_weight = np.diag(obs_dim_weight)  # weighting factor between the different observations
        self.std_obs_filt = std_obs_filt
        if metric is None or metric == "None":
            self.metric = partial(self.weighted_l1_l2_metric,
                                  w_abs=w_abs,
                                  w_sq=w_sq,
                                  obs_dim_weight=self.obs_dim_weight)
        else:
            self.metric = metric

        # Get and optionally clip the observation bounds of the environment
        elb, eub = subrtn.env.obs_space.bound_lo, subrtn.env.obs_space.bound_up
        elb, eub = self.override_obs_bounds(elb, eub,
                                            subrtn.env.obs_space.labels)
        self.obs_normalizer = UnitCubeProjector(bound_lo=elb, bound_up=eub)

        # Create the sampler used to execute the same policy as on the real system in the meta-randomized env
        self.base_seed = base_seed
        self.behavior_sampler = ParallelRolloutSampler(self._subrtn.env,
                                                       self._behavior_policy,
                                                       num_workers=num_workers,
                                                       min_rollouts=1,
                                                       seed=base_seed)
        self.num_rollouts_per_distr = num_rollouts_per_distr
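# Self-contained numpy sketch (not necessarily pyrado's implementation) of the default weighted
# L1/L2 metric referred to by `w_abs`, `w_sq` and `obs_dim_weight` above: per observation
# dimension, mean absolute and mean squared errors are mixed and then weighted by the diagonal matrix.
import numpy as np

def weighted_l1_l2(err: np.ndarray, w_abs: float = 0.5, w_sq: float = 1.0,
                   obs_dim_weight: np.ndarray = None) -> np.ndarray:
    """Map an error signal of shape [num_steps, num_obs_dims] to one non-negative value per dimension."""
    obs_dim_weight = np.eye(err.shape[1]) if obs_dim_weight is None else obs_dim_weight
    loss = w_abs * np.mean(np.abs(err), axis=0) + w_sq * np.mean(err ** 2, axis=0)
    return loss @ obs_dim_weight

print(weighted_l1_l2(np.random.randn(500, 3)))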
Example #26
0
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env = QBallBalancerSim(dt=1 / 250.0, max_steps=1500)
    env = ActNormWrapper(env)

    # Learning rate scheduler
    lrs_gamma = trial.suggest_categorical("exp_lr_scheduler_gamma",
                                          [None, 0.99, 0.995, 0.999])
    if lrs_gamma is not None:
        lr_sched = lr_scheduler.ExponentialLR
        lr_sched_hparam = dict(gamma=lrs_gamma)
    else:
        lr_sched, lr_sched_hparam = None, dict()

    # Policy
    policy = FNNPolicy(
        spec=env.spec,
        hidden_sizes=trial.suggest_categorical("hidden_sizes_policy",
                                               [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical("hidden_nonlin_policy",
                                      ["to_tanh", "to_relu"])),
    )

    # Critic
    vfcn = FNN(
        input_size=env.obs_space.flat_dim,
        output_size=1,
        hidden_sizes=trial.suggest_categorical("hidden_sizes_critic",
                                               [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical("hidden_nonlin_critic",
                                      ["to_tanh", "to_relu"])),
    )
    critic_hparam = dict(
        batch_size=250,
        gamma=trial.suggest_uniform("gamma_critic", 0.99, 1.0),
        lamda=trial.suggest_uniform("lamda_critic", 0.95, 1.0),
        num_epoch=trial.suggest_int("num_epoch_critic", 1, 10),
        lr=trial.suggest_loguniform("lr_critic", 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical("standardize_adv_critic",
                                                  [True, False]),
        max_grad_norm=trial.suggest_categorical("max_grad_norm_critic",
                                                [None, 1.0, 5.0]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    critic = GAE(vfcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=300,
        batch_size=250,
        min_steps=trial.suggest_int("num_rollouts_algo", 10, 30) *
        env.max_steps,
        num_epoch=trial.suggest_int("num_epoch_algo", 1, 10),
        eps_clip=trial.suggest_uniform("eps_clip_algo", 0.05, 0.2),
        std_init=trial.suggest_uniform("std_init_algo", 0.5, 1.0),
        lr=trial.suggest_loguniform("lr_algo", 1e-5, 1e-3),
        max_grad_norm=trial.suggest_categorical("max_grad_norm_algo",
                                                [None, 1.0, 5.0]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    algo = PPO(osp.join(study_dir, f"trial_{trial.number}"), env, policy,
               critic, **algo_hparam)

    # Train without saving the results
    algo.train(snapshot_mode="latest", seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env,
                                     policy,
                                     num_workers=1,
                                     min_rollouts=min_rollouts)
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
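# Hedged sketch of the functools.partial trick mentioned in the docstring above: Optuna only
# passes the `trial` argument, so `study_dir` and `seed` are bound beforehand. The directory and
# trial count are made-up values for illustration.
import functools
import optuna

objective = functools.partial(train_and_eval, study_dir="/tmp/qbb_ppo_study", seed=101)
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10, n_jobs=1)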
Example #27
0
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.
    
    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1/100., max_steps=600)
    env = QQubeSwingUpSim(**env_hparams)
    env = ActNormWrapper(env)

    # Learning rate scheduler
    lrs_gamma = trial.suggest_categorical('exp_lr_scheduler_gamma', [None, 0.995, 0.999])
    if lrs_gamma is not None:
        lr_sched = lr_scheduler.ExponentialLR
        lr_sched_hparam = dict(gamma=lrs_gamma)
    else:
        lr_sched, lr_sched_hparam = None, dict()

    # Policy
    policy_hparam = dict(
        hidden_sizes=trial.suggest_categorical('hidden_sizes_policy', [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_policy', ['to_tanh', 'to_relu'])),
    )  # FNN
    # policy_hparam = dict(
    #     hidden_size=trial.suggest_categorical('hidden_size_policy', [16, 32, 64]),
    #     num_recurrent_layers=trial.suggest_categorical('num_recurrent_layers_policy', [1, 2]),
    # )  # LSTM & GRU
    policy = FNNPolicy(spec=env.spec, **policy_hparam)
    # policy = GRUPolicy(spec=env.spec, **policy_hparam)

    # Critic
    vfcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical('hidden_sizes_critic', [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])),
    )
    # vfcn_hparam = dict(
    #     hidden_size=trial.suggest_categorical('hidden_size_critic', [16, 32, 64]),
    #     num_recurrent_layers=trial.suggest_categorical('num_recurrent_layers_critic', [1, 2]),
    # )  # LSTM & GRU
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    # vfcn = GRUPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        batch_size=500,
        gamma=trial.suggest_uniform('gamma_critic', 0.98, 1.),
        lamda=trial.suggest_uniform('lamda_critic', 0.95, 1.),
        num_epoch=trial.suggest_int('num_epoch_critic', 1, 10),
        lr=trial.suggest_loguniform('lr_critic', 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical('standardize_adv_critic', [False]),
        max_grad_norm=trial.suggest_categorical('max_grad_norm_critic', [None, 1., 5.]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam
    )
    critic = GAE(vfcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=250,
        batch_size=500,
        min_steps=trial.suggest_int('num_rollouts_algo', 10, 30)*env.max_steps,
        num_epoch=trial.suggest_int('num_epoch_algo', 1, 10),
        eps_clip=trial.suggest_uniform('eps_clip_algo', 0.05, 0.2),
        std_init=trial.suggest_uniform('std_init_algo', 0.5, 1.0),
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
        max_grad_norm=trial.suggest_categorical('max_grad_norm_algo', [None, 1., 5.]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f'trial_{trial.number}'))
    algo = PPO(osp.join(study_dir, f'trial_{trial.number}'), env, policy, critic, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env, policy, num_workers=1,
                                     min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros])/min_rollouts

    return mean_ret
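# Tiny illustration of the evaluation statistic computed at the end of both objective functions
# above: the undiscounted return of a rollout is simply the sum of its rewards, shown here for a
# made-up reward sequence next to its discounted counterpart.
rewards = [1.0, 0.5, 0.25]
gamma = 0.99
undisc_ret = sum(rewards)
disc_ret = sum(gamma ** i * r for i, r in enumerate(rewards))
print(undisc_ret, disc_ret)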
Example #29
0
class ValueBased(Algorithm, ABC):
    """Base class of all value-based algorithms"""
    def __init__(
        self,
        save_dir: pyrado.PathLike,
        env: Env,
        policy: Union[Policy, TwoHeadedPolicy],
        memory_size: int,
        gamma: float,
        max_iter: int,
        num_updates_per_step: int,
        target_update_intvl: int,
        num_init_memory_steps: int,
        min_rollouts: int,
        min_steps: int,
        batch_size: int,
        eval_intvl: int,
        max_grad_norm: float,
        num_workers: int,
        logger: StepLogger,
    ):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param memory_size: number of transitions in the replay memory buffer, e.g. 1000000
        :param gamma: temporal discount factor for the state values
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_updates_per_step: number of (batched) gradient updates per algorithm step
        :param target_update_intvl: number of iterations that pass before updating the target network
        :param num_init_memory_steps: number of samples used to initially fill the replay buffer with, pass `None` to
                                      fill the buffer completely
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param batch_size: number of samples per policy update batch
        :param eval_intvl: interval in which the evaluation rollouts are collected, also the interval in which the
                           logger prints the summary statistics
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not isinstance(memory_size, int):
            raise pyrado.TypeErr(given=memory_size, expected_type=int)
        if not (num_init_memory_steps is None
                or isinstance(num_init_memory_steps, int)):
            raise pyrado.TypeErr(given=num_init_memory_steps,
                                 expected_type=int)

        if logger is None:
            # Create logger that only logs every logger_print_intvl steps of the algorithm
            logger = StepLogger(print_intvl=eval_intvl)
            logger.printers.append(ConsolePrinter())
            logger.printers.append(
                CSVPrinter(osp.join(save_dir, "progress.csv")))
            logger.printers.append(TensorBoardPrinter(osp.join(save_dir,
                                                               "tb")))

        # Call Algorithm's constructor
        super().__init__(save_dir, max_iter, policy, logger)

        self._env = env
        self._memory = ReplayMemory(memory_size)
        self.gamma = gamma
        self.target_update_intvl = target_update_intvl
        self.batch_size = batch_size
        self.max_grad_norm = max_grad_norm
        if num_init_memory_steps is None:
            self.num_init_memory_steps = memory_size
        else:
            self.num_init_memory_steps = max(
                min(num_init_memory_steps, memory_size), batch_size)

        # Heuristic for number of gradient updates per step
        if num_updates_per_step is None:
            self.num_batch_updates = ceil(
                min_steps /
                env.max_steps) if min_steps is not None else min_rollouts
        else:
            self.num_batch_updates = num_updates_per_step

        # Create sampler for initial filling of the replay memory
        if policy.is_recurrent:
            self.init_expl_policy = RecurrentDummyPolicy(
                env.spec, policy.hidden_size)
        else:
            self.init_expl_policy = DummyPolicy(env.spec)
        self.sampler_init = ParallelRolloutSampler(
            self._env,
            self.init_expl_policy,
            num_workers=num_workers,
            min_steps=self.num_init_memory_steps,
        )

        # Create sampler for the evaluation rollouts
        self.sampler_eval = ParallelRolloutSampler(
            self._env,
            self._policy,
            num_workers=num_workers,
            min_steps=None,
            min_rollouts=100,
            show_progress_bar=True,
        )

        self._expl_strat = None  # must be implemented by subclass
        self._sampler = None  # must be implemented by subclass

    @property
    def expl_strat(self) -> Union[SACExplStrat, EpsGreedyExplStrat]:
        return self._expl_strat

    @property
    def memory(self) -> ReplayMemory:
        """Get the replay memory."""
        return self._memory

    def step(self, snapshot_mode: str, meta_info: dict = None):
        if self._memory.isempty:
            # Warm-up phase
            print_cbt_once("Collecting samples until replay memory if full.",
                           "w")
            # Sample steps and store them in the replay memory
            ros = self.sampler_init.sample()
            self._memory.push(ros)
        else:
            # Sample steps and store them in the replay memory
            ros = self.sampler.sample()
            self._memory.push(ros)
        self._cnt_samples += sum([ro.length for ro in ros])  # don't count the evaluation samples

        # Log metrics computed from the old policy (before the update)
        if self._curr_iter % self.logger.print_intvl == 0:
            ros = self.sampler_eval.sample()
            rets = [ro.undiscounted_return() for ro in ros]
            ret_max = np.max(rets)
            ret_med = np.median(rets)
            ret_avg = np.mean(rets)
            ret_min = np.min(rets)
            ret_std = np.std(rets)
        else:
            ret_max, ret_med, ret_avg, ret_min, ret_std = 5 * [-pyrado.inf]  # dummy values
        self.logger.add_value("max return", ret_max, 4)
        self.logger.add_value("median return", ret_med, 4)
        self.logger.add_value("avg return", ret_avg, 4)
        self.logger.add_value("min return", ret_min, 4)
        self.logger.add_value("std return", ret_std, 4)
        self.logger.add_value("avg memory reward", self._memory.avg_reward(),
                              4)
        self.logger.add_value("avg rollout length",
                              np.mean([ro.length for ro in ros]), 4)
        self.logger.add_value("num total samples", self._cnt_samples)

        # Save snapshot data
        self.make_snapshot(snapshot_mode, float(ret_avg), meta_info)

        # Use data in the memory to update the policy and the Q-functions
        self.update()

    @abstractmethod
    def update(self):
        raise NotImplementedError

    def reset(self, seed: Optional[int] = None):
        # Reset the exploration strategy, internal variables and the random seeds
        super().reset(seed)

        # Re-initialize samplers in case env or policy changed
        self.sampler_init.reinit(self._env, self.init_expl_policy)
        self.sampler.reinit(self._env, self._expl_strat)
        self.sampler_eval.reinit(self._env, self._policy)

        # Reset the replay memory
        self._memory.reset()

    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        if meta_info is None:
            # This algorithm instance is not a subroutine of another algorithm
            pyrado.save(self._env, "env.pkl", self.save_dir)
            pyrado.save(self._expl_strat.policy,
                        "policy.pt",
                        self.save_dir,
                        use_state_dict=True)
        else:
            pyrado.save(
                self._expl_strat.policy,
                "policy.pt",
                self.save_dir,
                prefix=meta_info.get("prefix", ""),
                suffix=meta_info.get("suffix", ""),
                use_state_dict=True,
            )
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.
    
    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom
        arguments (see the usage sketch after this function).

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environments
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env_real = QQubeSwingUpSim(**env_hparams)
    env_real.domain_param = dict(
        Mr=0.095 * 0.9,  # 0.095*0.9 = 0.0855
        Mp=0.024 * 1.1,  # 0.024*1.1 = 0.0264
        Lr=0.085 * 0.9,  # 0.085*0.9 = 0.0765
        Lp=0.129 * 1.1,  # 0.129*1.1 = 0.1419
    )
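    # The "real" environment is a second simulation whose nominal masses and lengths are perturbed
    # by +/-10 %, i.e. a sim-2-sim setup with a fixed target domain that the identification has to recover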

    env_sim = QQubeSwingUpSim(**env_hparams)
    randomizer = DomainRandomizer(
        NormalDomainParam(name='Mr', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Mp', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Lr', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Lp', mean=0., std=1e6, clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ('Mr', 'mean'),
        1: ('Mr', 'std'),
        2: ('Mp', 'mean'),
        3: ('Mp', 'std'),
        4: ('Lr', 'mean'),
        5: ('Lr', 'std'),
        6: ('Lp', 'mean'),
        7: ('Lp', 'std')
    }
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)
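    # dp_map links each output dimension of the DomainDistrParamPolicy (created below) to one
    # (domain parameter, distribution parameter) pair; trafo_mask marks all 8 entries to be optimized
    # in a transformed space, presumably to keep the means and standard deviations positive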

    # Subroutine for policy improvement
    behav_policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    behav_policy = FNNPolicy(spec=env_sim.spec, **behav_policy_hparam)
    vfcn_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace),
                     **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9885,
        lamda=0.9648,
        num_epoch=2,
        batch_size=500,
        standardize_adv=False,
        lr=5.792e-4,
        max_grad_norm=1.,
    )
    critic = GAE(vfcn, **critic_hparam)
    subrtn_policy_hparam = dict(
        max_iter=200,
        min_steps=3 * 23 * env_sim.max_steps,
        num_epoch=7,
        eps_clip=0.0744,
        batch_size=500,
        std_init=0.9074,
        lr=3.446e-04,
        max_grad_norm=1.,
        num_workers=1,
    )
    subrtn_policy = PPO(study_dir, env_sim, behav_policy, critic,
                        **subrtn_policy_hparam)

    # Subroutine for system identification
    prior_std_denom = trial.suggest_uniform('prior_std_denom', 5, 20)
    prior = DomainRandomizer(
        NormalDomainParam(name='Mr', mean=0.095, std=0.095 / prior_std_denom),
        NormalDomainParam(name='Mp', mean=0.024, std=0.024 / prior_std_denom),
        NormalDomainParam(name='Lr', mean=0.085, std=0.085 / prior_std_denom),
        NormalDomainParam(name='Lp', mean=0.129, std=0.129 / prior_std_denom),
    )
    ddp_policy = DomainDistrParamPolicy(
        mapping=dp_map,
        trafo_mask=trafo_mask,
        prior=prior,
        scale_params=trial.suggest_categorical('ddp_policy_scale_params',
                                               [True, False]),
    )
    subsubrtn_distr_hparam = dict(
        max_iter=trial.suggest_categorical('subsubrtn_distr_max_iter', [20]),
        pop_size=trial.suggest_int('pop_size', 50, 500),
        num_rollouts=1,
        num_is_samples=trial.suggest_int('num_is_samples', 5, 20),
        expl_std_init=trial.suggest_loguniform('expl_std_init', 1e-3, 1e-1),
        expl_std_min=trial.suggest_categorical('expl_std_min', [1e-4]),
        extra_expl_std_init=trial.suggest_loguniform('extra_expl_std_init', 1e-3, 1e-1),
        extra_expl_decay_iter=trial.suggest_int('extra_expl_decay_iter', 0,
                                                10),
        num_workers=1,
    )
    csv_logger = create_csv_step_logger(
        osp.join(study_dir, f'trial_{trial.number}'))
    subsubrtn_distr = CEM(study_dir,
                          env_sim,
                          ddp_policy,
                          **subsubrtn_distr_hparam,
                          logger=csv_logger)
    obs_vel_weight = trial.suggest_loguniform('obs_vel_weight', 1, 100)
    subrtn_distr_hparam = dict(
        metric=None,
        obs_dim_weight=[1, 1, 1, 1, obs_vel_weight, obs_vel_weight],
        num_rollouts_per_distr=trial.suggest_int('num_rollouts_per_distr', 20,
                                                 100),
        num_workers=1,
    )
    subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr, behav_policy,
                                      **subrtn_distr_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=trial.suggest_categorical('algo_max_iter', [10]),
        num_eval_rollouts=trial.suggest_categorical('algo_num_eval_rollouts',
                                                    [5]),
        warmstart=trial.suggest_categorical('algo_warmstart', [True]),
        thold_succ_subrtn=trial.suggest_categorical('algo_thold_succ_subrtn',
                                                    [50]),
        subrtn_snapshot_mode='latest',
    )
    algo = SimOpt(study_dir,
                  env_sim,
                  env_real,
                  subrtn_policy,
                  subrtn_distr,
                  **algo_hparam,
                  logger=csv_logger)

    # Jeeeha
    algo.train(seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env_real, algo.policy, num_workers=1,
        min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
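

# Editor's sketch (not part of the original example): how such an objective is typically handed to Optuna.
# The study name, directory, number of trials, and seed below are assumptions for illustration only.
if __name__ == '__main__':
    import functools
    import os

    import optuna

    study_dir = osp.join('data', 'optuna', 'qq-su_simopt')  # hypothetical location for the study's artifacts
    os.makedirs(study_dir, exist_ok=True)

    study = optuna.create_study(study_name='simopt_hparam_search', direction='maximize')
    # functools.partial binds the extra arguments, since Optuna only passes the `trial` object
    study.optimize(functools.partial(train_and_eval, study_dir=study_dir, seed=0), n_trials=100)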