Example no. 1
    def _eval_cand_and_ref_one_domain(self, i: int) -> tuple:
        """
        Evaluate the candidate and the k-th reference solution (see outer loop) in the i-th domain using nJ rollouts.

        :param i: index of the domain to evaluate in
        :return: average return values for the candidate and the k-th reference in the i-th domain
        """
        cand_ret_avg = 0.
        refs_ret_avg = 0.

        # Do nJ rollouts for each set of physics params
        for r in range(self.nJ):

            # Candidate solution
            set_seed(self.base_seed + i * self.nJ + r)
            # Set the circular index for the particular realization
            self._env_dr.ring_idx = i
            # Do the rollout and collect the return
            ro_cand = rollout(self._env_dr,
                              self._subrtn_cand.policy,
                              eval=True)
            cand_ret_avg += ro_cand.undiscounted_return()

            # Reference solution
            set_seed(self.base_seed + i * self.nJ + r)
            # Set the circular index for the particular realization
            self._env_dr.ring_idx = i
            # Do the rollout and collect the return
            ro_ref = rollout(self._env_dr, self._subrtn_refs.policy, eval=True)
            refs_ret_avg += ro_ref.undiscounted_return()

        return cand_ret_avg / self.nJ, refs_ret_avg / self.nJ  # average over nJ seeds
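The re-seeding with the same value before the candidate's and the reference's rollout makes both policies face the identical domain realization and noise, so their comparison is free of sampling noise. Below is a minimal, self-contained sketch of this paired-seed trick; the toy `noisy_return` function and the numeric values are made up for illustration and are not pyrado code.

import random


def noisy_return(policy_gain: float) -> float:
    """Toy stand-in for a rollout return: the true value plus seeded noise."""
    return policy_gain + random.random()


base_seed, i, nJ = 42, 3, 5  # hypothetical values mirroring the loop above
cand_avg, refs_avg = 0.0, 0.0
for r in range(nJ):
    random.seed(base_seed + i * nJ + r)  # same seed for the candidate ...
    cand_avg += noisy_return(1.2)
    random.seed(base_seed + i * nJ + r)  # ... and for the reference rollout
    refs_avg += noisy_return(1.0)

print(cand_avg / nJ - refs_avg / nJ)  # 0.2 (up to float precision): the shared noise cancels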
Example no. 2
    def _handle_neg_samples(self, cand_rets: np.ndarray, refs_rets: np.ndarray,
                            k: int, i: int) -> np.ndarray:
        """
        Process negative optimality gap samples by looking at the other reference solutions.

        :param cand_rets: array of the candidate's return values
        :param refs_rets: array of the references' return values
        :param k: index of the reference solution
        :param i: index of the domain
        :return refs_rets: if a better reference has been found, the associated value will be overwritten
        """
        if refs_rets[k, i] < cand_rets[k, i]:
            print_cbt(
                f'\nReference {k + 1} is worse than the candidate on domain realization {i + 1}.\n'  # 1-based index
                'Trying to replace this reference at this realization with a different one',
                'y')
            for other_k in range(self.nG):
                if other_k == k:
                    # Do nothing for the bad solution that brought us here
                    continue
                else:
                    # Load a reference solution different from the k-th
                    other_ref = pyrado.load(
                        self._subrtn_refs._policy, 'policy', 'pt',
                        self.save_dir,
                        dict(prefix=f'iter_{self._curr_iter}',
                             suffix=f'ref_{other_k}'))
                    other_ref_ret = 0
                    for r in range(self.nJ):
                        # Set the same random seed
                        pyrado.set_seed(self.base_seed + i * self.nJ + r)
                        # Set the circular index for the particular realization
                        self.env_dr.ring_idx = i
                        # Do the rollout and collect the return
                        ro_other_ref = rollout(self.env_dr,
                                               other_ref,
                                               eval=True)
                        other_ref_ret += ro_other_ref.undiscounted_return() / self.nJ  # average over nJ seeds
                    # Store the value if it is better
                    if other_ref_ret > refs_rets[k, i]:
                        refs_rets[k, i] = other_ref_ret
                        # If a better one was found, do not iterate over the remaining reference solutions
                        break

            if refs_rets[k, i] > cand_rets[k, i]:
                # Found a different reference that achieves a higher return than the candidate
                print_cbt('Successfully handled a negative OG sample', 'g')
            else:
                refs_rets[k, i] = cand_rets[k, i]  # forces optimality gap sample to be 0
                print_cbt(
                    'Unsuccessfully handled a negative OG sample: Set the value to 0',
                    'r')

        else:
            # Everything is as it should be
            pass

        return refs_rets
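After this handling, every stored reference return is at least as high as the candidate's, so the per-domain optimality gap samples are non-negative. A small numpy sketch of how such gap samples could be formed from the two arrays afterwards (the downstream computation is an assumption, it is not part of the snippet above):

import numpy as np

# Hypothetical return arrays after _handle_neg_samples: nG references x n domains
cand_rets = np.array([[10.0, 12.0], [11.0, 9.0]])
refs_rets = np.array([[11.5, 12.0], [13.0, 9.5]])  # entry [0, 1] was set equal to the candidate's return

og_samples = refs_rets - cand_rets  # per-reference, per-domain optimality gap samples
assert np.all(og_samples >= 0.0)  # guaranteed by the handling above
print(og_samples.mean())  # a simple estimate of the expected optimality gap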
Example no. 3
    def train(self,
              snapshot_mode: str = "latest",
              seed: int = None,
              meta_info: dict = None):
        """
        Train one or multiple policies in a given environment.

        :param snapshot_mode: determines when the snapshots are stored (e.g. on every iteration or on new high-score)
        :param seed: seed value for the random number generators, pass `None` for no seeding
        :param meta_info: is not `None` if this algorithm is run as a subroutine of a meta-algorithm,
                          contains a dict of information about the current iteration of the meta-algorithm
        """
        if self._policy is not None:
            print_cbt(
                f"{get_class_name(self)} started training a {get_class_name(self._policy)} "
                f"with {self._policy.num_param} parameters using the snapshot mode {snapshot_mode}.",
                "g",
            )
            # Set dropout and batch normalization layers to training mode
            self._policy.train()
        else:
            print_cbt(
                f"{get_class_name(self)} started training using the snapshot mode {snapshot_mode}.",
                "g")

        # Set all rngs' seeds
        if seed is not None:
            set_seed(seed, verbose=True)

        while self._curr_iter < self.max_iter and not self.stopping_criterion_met():
            # Record current iteration to logger
            self.logger.add_value(self.iteration_key, self._curr_iter)

            # Acquire data, save the training progress, and update the parameters
            self.step(snapshot_mode, meta_info)

            # Update logger and print
            self.logger.record_step()

            # Increase the iteration counter
            self._curr_iter += 1

        if self.stopping_criterion_met():
            stopping_reason = "Stopping criteria met!"
        else:
            stopping_reason = "Maximum number of iterations reached!"

        if self._policy is not None:
            print_cbt(
                f"{get_class_name(self)} finished training a {get_class_name(self._policy)} "
                f"with {self._policy.num_param} parameters. {stopping_reason}",
                "g",
            )
            # Set dropout and batch normalization layers to evaluation mode
            self._policy.eval()
        else:
            print_cbt(
                f"{get_class_name(self)} finished training. {stopping_reason}",
                "g")
Example no. 4
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1 / 250., max_steps=1500)
    env = QQubeSwingUpSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(feats=FeatureStack([
        identity_feat, sign_feat, abs_feat, squared_feat, cubic_feat,
        ATan2Feat(1, 2),
        MultFeat([4, 5])
    ]))
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=50,
        pop_size=trial.suggest_int('pop_size', 50, 200),
        num_rollouts=trial.suggest_int('num_rollouts', 4, 10),
        num_is_samples=trial.suggest_int('num_is_samples', 5, 40),
        expl_std_init=trial.suggest_uniform('expl_std_init', 0.1, 0.5),
        symm_sampling=trial.suggest_categorical('symm_sampling',
                                                [True, False]),
    )
    csv_logger = create_csv_step_logger(
        osp.join(study_dir, f'trial_{trial.number}'))
    algo = PoWER(osp.join(study_dir, f'trial_{trial.number}'),
                 env,
                 policy,
                 **algo_hparam,
                 logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env, policy, num_workers=1,
        min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
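The note in the docstring refers to binding the extra arguments before handing the objective to Optuna. A hedged sketch of how such a study is typically launched; the study directory, the trial budget, and the number of jobs are made-up values:

import functools

import optuna

if __name__ == "__main__":
    study_dir = "hyperparams/qq-su_power"  # hypothetical study directory
    study = optuna.create_study(direction="maximize")
    # Bind study_dir and seed so that Optuna only needs to pass the trial
    study.optimize(functools.partial(train_and_eval, study_dir=study_dir, seed=0),
                   n_trials=100, n_jobs=4)
    print(study.best_params)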
Example no. 5
    def __init__(self,
                 env_name: str,
                 algo_name: str,
                 extra_info: str = None,
                 exp_id: str = None,
                 timestamp: datetime = None,
                 base_dir: str = pyrado.TEMP_DIR,
                 seed: int = None):
        """
        Constructor

        :param env_name: environment trained on
        :param algo_name: algorithm trained with
        :param extra_info: additional information on the experiment (freeform)
        :param exp_id: combined timestamp and extra_info, usually the final folder name.
        :param timestamp: experiment creation timestamp
        :param base_dir: base storage directory
        :param seed: seed value for the random number generators, pass None for no seeding
        """
        if exp_id is not None:
            # Try to parse extra_info from exp id
            sd = exp_id.split('--', 1)
            if len(sd) == 1:
                timestr = sd[0]
            else:
                timestr, extra_info = sd
            # Parse time string
            if '_' in timestr:
                timestamp = datetime.strptime(timestr, timestamp_format)
            else:
                timestamp = datetime.strptime(timestr, timestamp_date_format)
        else:
            # Create exp id from timestamp and info
            if timestamp is None:
                timestamp = datetime.now()
            exp_id = timestamp.strftime(timestamp_format)

            if extra_info is not None:
                exp_id = exp_id + '--' + extra_info

        # Store values
        self.env_name = env_name
        self.algo_name = algo_name
        self.extra_info = extra_info
        self.exp_id = exp_id
        self.timestamp = timestamp
        self.base_dir = base_dir
        self.seed = seed

        # Set the random seed
        if seed is not None:
            pyrado.set_seed(seed)
            print_cbt(f"Set the random number generators' seed to {seed}.",
                      'y')
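The constructor recovers the time string and the free-form info by splitting the experiment id at the first `'--'`. A tiny standalone sketch of that parsing; the folder name and the format string are assumptions, since the actual `timestamp_format` constant is not shown in the snippet:

from datetime import datetime

exp_id = "2021-03-01_14-05-37--nominal-seed7"  # hypothetical folder name
timestr, extra_info = exp_id.split('--', 1)
timestamp = datetime.strptime(timestr, '%Y-%m-%d_%H-%M-%S')  # assumed format
print(timestamp, extra_info)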
Example no. 6
def test_domain_param_transforms(env: SimEnv, trafo_class: Type):
    pyrado.set_seed(0)

    # Create a mask for a random domain parameter
    offset = 1
    idx = random.randint(0, len(env.supported_domain_param) - 1)
    sel_dp_change = list(env.supported_domain_param)[idx]
    sel_dp_fix = list(env.supported_domain_param)[(idx + offset) % len(env.supported_domain_param)]
    while (offset == 1 or any([
            item in sel_dp_change for item in VORTEX_ONLY_DOMAIN_PARAM_LIST
    ]) or any([item in sel_dp_fix for item in VORTEX_ONLY_DOMAIN_PARAM_LIST])):
        idx = random.randint(0, len(env.supported_domain_param) - 1)
        sel_dp_change = list(env.supported_domain_param)[idx]
        sel_dp_fix = list(env.supported_domain_param)[(idx + offset) % len(env.supported_domain_param)]
        offset += 1

    mask = (sel_dp_change, )
    wenv = trafo_class(env, mask)
    assert isinstance(wenv, DomainParamTransform)

    # Check 5 random values
    for _ in range(5):
        # Change the selected domain parameter
        new_dp_val = random.random() * env.get_nominal_domain_param()[sel_dp_change]
        new_dp_val = abs(new_dp_val) + 1e-6  # due to the domain of the new params
        transformed_new_dp_val = wenv.forward(new_dp_val)
        wenv.domain_param = {
            sel_dp_change: transformed_new_dp_val
        }  # calls inverse transform
        if not isinstance(inner_env(wenv), SimPyEnv):
            wenv.reset()  # the RcsPySim and MujocoSim classes need to be reset to apply the new domain param

        # Test the actual domain param and the getters
        assert inner_env(wenv)._domain_param[sel_dp_change] == pytest.approx(new_dp_val, abs=1e-5)
        assert wenv.domain_param[sel_dp_change] == pytest.approx(new_dp_val, abs=1e-5)
        assert wenv.domain_param[sel_dp_fix] != pytest.approx(new_dp_val)
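The test sets the domain parameter through the wrapper, which applies the inverse transform, and then expects the inner environment to hold the original value again. A standalone sketch of that round-trip property with a log/exp pair; these are generic functions, not the `DomainParamTransform` API:

import math


def forward(value: float) -> float:
    """Map a strictly positive domain parameter to an unconstrained space."""
    return math.log(value)


def inverse(transformed: float) -> float:
    """Map back from the unconstrained space to the parameter's domain."""
    return math.exp(transformed)


new_dp_val = 0.37
assert abs(inverse(forward(new_dp_val)) - new_dp_val) < 1e-5  # the round trip asserted in the test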
Example no. 7
    def reset(self, seed: int = None):
        """
        Reset the algorithm to its initial state. This should NOT reset learned policy parameters.
        By default, this resets the iteration count and the exploration strategy.
        Be sure to call this function if you override it.

        :param seed: seed value for the random number generators, pass `None` for no seeding
        """
        # Reset the exploration strategy if any
        if self.expl_strat is not None:
            self.expl_strat.reset_expl_params()

        # Reset internal variables
        self._curr_iter = 0
        self._cnt_samples = 0
        self._highest_avg_ret = -pyrado.inf

        # Set all rngs' seeds
        if seed is not None:
            set_seed(seed, verbose=True)
Example no. 8
def test_env_specific(env: Env):
    pyrado.set_seed(0)

    if "qbb" in env.name:
        policy = QBallBalancerPDCtrl(env.spec)
        policy.reset()
    elif "qcp" in env.name:
        policy = QCartPoleSwingUpAndBalanceCtrl(env.spec)
        policy.reset()
    elif "qq" in env.name:
        policy = QQubeSwingUpAndBalanceCtrl(env.spec)
        policy.reset()
    else:
        raise NotImplementedError

    # Sample an observation and do an action 10 times
    for _ in range(10):
        obs = env.obs_space.sample_uniform()
        obs = to.from_numpy(obs).to(dtype=to.get_default_dtype())
        act = policy(obs)
        assert isinstance(act, to.Tensor)
Example no. 9
def test_action_statistics(env, policy):
    sigma = 1.  # with lower values like 0.1 we can observe violations of the tolerances

    # Create an action-based exploration strategy
    explstrat = NormalActNoiseExplStrat(policy, std_init=sigma)

    # Sample a deterministic rollout
    pyrado.set_seed(0)
    ro_policy = rollout(env, policy, eval=True, max_steps=1000, stop_on_done=False)
    ro_policy.torch()

    # Run the exploration strategy on the previously sampled rollout
    if policy.is_recurrent:
        if isinstance(policy, TwoHeadedPolicy):
            act_expl, _, _ = explstrat(ro_policy.observations)
        else:
            act_expl, _ = explstrat(ro_policy.observations)
        # Get the hidden states from the deterministic rollout
        hidden_states = ro_policy.hidden_states
    else:
        if isinstance(policy, TwoHeadedPolicy):
            act_expl, _ = explstrat(ro_policy.observations)
        else:
            act_expl = explstrat(ro_policy.observations)
        hidden_states = [0.]*ro_policy.length  # just something that does not violate the format

    ro_expl = StepSequence(
        actions=act_expl[:-1],  # truncate act due to last obs
        observations=ro_policy.observations,
        rewards=ro_policy.rewards,  # don't care but necessary
        hidden_states=hidden_states
    )

    # Compute action statistics and the ground truth
    actstats = compute_action_statistics(ro_expl, explstrat)
    gt_logprobs = Normal(loc=ro_policy.actions, scale=sigma).log_prob(ro_expl.actions)
    gt_entropy = Normal(loc=ro_policy.actions, scale=sigma).entropy()

    to.testing.assert_allclose(actstats.log_probs, gt_logprobs, rtol=1e-4, atol=1e-5)
    to.testing.assert_allclose(actstats.entropy, gt_entropy, rtol=1e-4, atol=1e-5)
Example no. 10
def test_order_act_noise_act_norm(env: SimEnv):
    # First noise wrapper then normalization wrapper
    wrapped_env_noise = GaussianActNoiseWrapper(
        env,
        noise_mean=0.2 * np.ones(env.act_space.shape),
        noise_std=0.1 * np.ones(env.act_space.shape))
    wrapped_env_noise_norm = ActNormWrapper(wrapped_env_noise)

    # First normalization wrapper then noise wrapper
    wrapped_env_norm = ActNormWrapper(env)
    wrapped_env_norm_noise = GaussianActNoiseWrapper(
        wrapped_env_norm,
        noise_mean=0.2 * np.ones(env.act_space.shape),
        noise_std=0.1 * np.ones(env.act_space.shape))

    # Sample some values directly from the act_spaces
    for i in range(3):
        pyrado.set_seed(i)
        act_noise_norm = wrapped_env_noise_norm.act_space.sample_uniform()

        pyrado.set_seed(i)
        act_norm_noise = wrapped_env_norm_noise.act_space.sample_uniform()

        # These samples must be the same since they were not passed to the _process_act function
        assert np.allclose(act_noise_norm, act_norm_noise)

    # Process a sampled action
    for i in range(3):
        # Sample a small random action such that the de-normalization does not map it to the act_space limits
        rand_act = 0.01 * env.act_space.sample_uniform()

        pyrado.set_seed(i)
        wrapped_env_noise_norm.reset()
        obs_noise_norm, _, _, _ = wrapped_env_noise_norm.step(rand_act)

        pyrado.set_seed(i)
        wrapped_env_norm_noise.reset()
        obs_norm_noise, _, _, _ = wrapped_env_norm_noise.step(rand_act)

        # The order of processing (first normalization or first randomization) must make a difference
        assert not np.allclose(obs_noise_norm, obs_norm_noise)
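The final assert relies on action normalization and additive noise not commuting. A toy numeric sketch of the two orders, using a simplified model in which "normalization" is a plain rescaling from [-1, 1] to the action limits and the noise is a fixed offset (assumptions, not the wrappers' exact math):

scale = 5.0  # half-range of a hypothetical action space
act = 0.01  # small normalized action, as sampled in the test
noise = 0.2  # the noise mean used above

noise_in_env_units = scale * act + noise  # noise added after the de-normalization
noise_in_norm_units = scale * (act + noise)  # noise added before the de-normalization
assert noise_in_env_units != noise_in_norm_units
print(noise_in_env_units, noise_in_norm_units)  # 0.25 vs. 1.05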
Example no. 11
def adn_variant(dt,
                max_steps,
                max_dist_force,
                physics_engine,
                normalize_obs=True,
                obsnorm_cpp=True):
    pyrado.set_seed(1001)

    # Explicit normalization bounds
    elb = {
        'EffectorLoadCell_Fx': -100.,
        'EffectorLoadCell_Fz': -100.,
        'Effector_Xd': -1,
        'Effector_Zd': -1,
        'GD_DS0d': -1,
        'GD_DS1d': -1,
        'GD_DS2d': -1,
    }
    eub = {
        'GD_DS0': 3.,
        'GD_DS1': 3,
        'GD_DS2': 3,
        'EffectorLoadCell_Fx': 100.,
        'EffectorLoadCell_Fz': 100.,
        'Effector_Xd': .5,
        'Effector_Zd': .5,
        'GD_DS0d': .5,
        'GD_DS1d': .5,
        'GD_DS2d': .5,
        'PredCollCost_h50': 1000.
    }

    extra_kwargs = {}
    if normalize_obs and obsnorm_cpp:
        extra_kwargs['normalizeObservations'] = True
        extra_kwargs['obsNormOverrideLower'] = elb
        extra_kwargs['obsNormOverrideUpper'] = eub

    # Set up environment
    env = Planar3LinkTASim(physicsEngine=physics_engine,
                           dt=dt,
                           max_steps=max_steps,
                           max_dist_force=max_dist_force,
                           collisionAvoidanceIK=True,
                           taskCombinationMethod='sum',
                           **extra_kwargs)

    if normalize_obs and not obsnorm_cpp:
        env = ObsNormWrapper(env, explicit_lb=elb, explicit_ub=eub)

    # Set up random policy
    policy_hparam = dict(
        tau_init=0.2,
        activation_nonlin=to.sigmoid,
        potentials_dyn_fcn=pd_cubic,
    )
    policy = ADNPolicy(spec=env.spec, dt=dt, **policy_hparam)
    print_cbt('Running ADNPolicy with random initialization', 'c', bright=True)

    # Simulate and plot potentials
    ro = rollout(env,
                 policy,
                 render_mode=RenderMode(video=True),
                 stop_on_done=True)
    plot_potentials(ro)

    return ro
Example no. 12
                            figsize=(8, 12),
                            sharex='col',
                            tight_layout=True)

    # Try to run several possible cases
    for pe in ['Bullet', 'Vortex']:
        print_cbt(f'Running with {pe} physics engine', 'c', bright=True)

        if rcsenv.supportsPhysicsEngine(pe):
            env, policy = create_setup(pe,
                                       dt=0.01,
                                       max_steps=1000,
                                       max_dist_force=0.)

            # Simulate
            pyrado.set_seed(1)
            ro = rollout(env, policy, render_mode=RenderMode(video=True))

            # Render plots
            axs[0].plot(ro.observations[:, 0], label=pe)
            axs[1].plot(ro.observations[:, 1], label=pe)
            axs[2].plot(ro.observations[:, 2], label=pe)
            axs[0].legend()
            axs[1].legend()
            axs[2].legend()

    # Show plots
    axs[0].set_title('gBotKuka.xml')
    axs[0].set_ylabel('plate x pos')
    axs[1].set_ylabel('plate y pos')
    axs[2].set_ylabel('plate z pos')
Example no. 13
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: [int, None]):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env = QQubeSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        shared_hidden_sizes=trial.suggest_categorical(
            'shared_hidden_sizes_policy',
            [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]),
        shared_hidden_nonlin=fcn_from_str(
            trial.suggest_categorical('shared_hidden_nonlin_policy',
                                      ['to_tanh', 'to_relu'])),
    )
    policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam)

    # Critic
    q_fcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical(
            'hidden_sizes_critic',
            [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical('hidden_nonlin_critic',
                                      ['to_tanh', 'to_relu'])),
    )
    obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
    q_fcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace),
                        **q_fcn_hparam)
    q_fcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace),
                        **q_fcn_hparam)

    # Algorithm
    algo_hparam = dict(
        num_sampler_envs=1,  # parallelize via optuna n_jobs
        max_iter=100 * env.max_steps,
        min_steps=trial.suggest_categorical(
            'min_steps_algo', [1]),  # , 10, env.max_steps, 10*env.max_steps
        memory_size=trial.suggest_loguniform('memory_size_algo',
                                             1e2 * env.max_steps,
                                             1e4 * env.max_steps),
        tau=trial.suggest_uniform('tau_algo', 0.99, 1.),
        alpha_init=trial.suggest_uniform('alpha_init_algo', 0.1, 0.9),
        learn_alpha=trial.suggest_categorical('learn_alpha_algo',
                                              [True, False]),
        standardize_rew=trial.suggest_categorical('standardize_rew_algo',
                                                  [False]),
        gamma=trial.suggest_uniform('gamma_algo', 0.99, 1.),
        target_update_intvl=trial.suggest_categorical(
            'target_update_intvl_algo', [1, 5]),
        num_batch_updates=trial.suggest_categorical('num_batch_updates_algo',
                                                    [1, 5]),
        batch_size=trial.suggest_categorical('batch_size_algo',
                                             [128, 256, 512]),
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
    )
    csv_logger = create_csv_step_logger(
        osp.join(ex_dir, f'trial_{trial.number}'))
    algo = SAC(ex_dir,
               env,
               policy,
               q_fcn_1,
               q_fcn_2,
               **algo_hparam,
               logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelSampler(
        env, policy, num_envs=1,
        min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
Example no. 14
Play around with PyTorch's 1-dim convolution class (in the context of using it for the NFPolicy class)

.. seealso::
    # https://towardsdatascience.com/a-basic-introduction-to-separable-convolutions-b99ec3102728
    # https://github.com/jayleicn/TVQAplus/blob/master/model/cnn.py
"""
import torch as to
import torch.nn as nn
from matplotlib import pyplot as plt

import pyrado
from pyrado.policies.initialization import init_param
from pyrado.utils.nn_layers import MirrConv1d

if __name__ == '__main__':
    pyrado.set_seed(0)

    hand_coded_filter = False  # use a ramp from 0 to 1 instead of random weights
    use_depth_wise_conv = False
    use_custom_mirr_layer = False
    use_custom_bell_init = True

    batch_size = 1
    num_neurons = 360  # each potential-based neuron is basically like time steps of a signal
    in_channels = 1  # number of input signals
    out_channels = 6  # number of filters
    if hand_coded_filter:
        out_channels = 1
    kernel_size = 16  # a larger number smooths out and reduces the length of the output signal; use odd numbers
    padding_mode = 'circular'  # circular, reflective, zeros
    padding = kernel_size // 2 if padding_mode != 'circular' else kernel_size - 1
Example no. 15
def test_out_of_bounds_base_seed(base_seed, sub_seed, sub_sub_seed, expected):
    assert pyrado.set_seed(base_seed, sub_seed, sub_sub_seed,
                           verbose=True) == expected
    assert pyrado.get_base_seed() == base_seed
Example no. 16
def _run_set_seed(G, seed):
    """Ignore global space, and forward to `pyrado.set_seed()`"""
    pyrado.set_seed(seed)
Example no. 17
Play around with PyTorch's 1-dim convolution class (in the context of using it for the NFPolicy class)

.. seealso::
    # https://towardsdatascience.com/a-basic-introduction-to-separable-convolutions-b99ec3102728
    # https://github.com/jayleicn/TVQAplus/blob/master/model/cnn.py
"""
import torch as to
import torch.nn as nn
from matplotlib import pyplot as plt

import pyrado
from pyrado.utils.nn_layers import MirrConv1d


if __name__ == '__main__':
    pyrado.set_seed(10)

    hand_coded_filter = False  # use a ramp from 0 to 1 instead of random weights
    use_depth_wise_conv = False
    use_custom_symm_init = True

    batch_size = 1
    num_neurons = 360  # each potential-based neuron is basically like time steps of a signal
    in_channels = 2  # number of input signals
    out_channels = 6  # number of filters
    if hand_coded_filter:
        out_channels = 1
    kernel_size = 17  # a larger number smooths out and reduces the length of the output signal; use odd numbers
    padding_mode = 'circular'  # circular, reflective, zeros
    padding = kernel_size//2 if padding_mode != 'circular' else kernel_size - 1
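A minimal sketch of the convolution configured above, using a plain `nn.Conv1d` instead of pyrado's `MirrConv1d`, to show the shape of the resulting output signal; the hyper-parameter values are taken from the script:

import torch as to
import torch.nn as nn

batch_size, in_channels, num_neurons = 1, 2, 360
kernel_size = 17
padding = kernel_size - 1  # as computed above for the circular padding mode

conv = nn.Conv1d(in_channels, out_channels=6, kernel_size=kernel_size,
                 padding=padding, padding_mode='circular', bias=False)
signal = to.randn(batch_size, in_channels, num_neurons)
print(conv(signal).shape)  # torch.Size([1, 6, 376]) with these settings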
Example no. 18
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env = QBallBalancerSim(dt=1 / 250.0, max_steps=1500)
    env = ActNormWrapper(env)

    # Learning rate scheduler
    lrs_gamma = trial.suggest_categorical("exp_lr_scheduler_gamma",
                                          [None, 0.99, 0.995, 0.999])
    if lrs_gamma is not None:
        lr_sched = lr_scheduler.ExponentialLR
        lr_sched_hparam = dict(gamma=lrs_gamma)
    else:
        lr_sched, lr_sched_hparam = None, dict()

    # Policy
    policy = FNNPolicy(
        spec=env.spec,
        hidden_sizes=trial.suggest_categorical("hidden_sizes_policy",
                                               [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical("hidden_nonlin_policy",
                                      ["to_tanh", "to_relu"])),
    )

    # Critic
    vfcn = FNN(
        input_size=env.obs_space.flat_dim,
        output_size=1,
        hidden_sizes=trial.suggest_categorical("hidden_sizes_critic",
                                               [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical("hidden_nonlin_critic",
                                      ["to_tanh", "to_relu"])),
    )
    critic_hparam = dict(
        batch_size=250,
        gamma=trial.suggest_uniform("gamma_critic", 0.99, 1.0),
        lamda=trial.suggest_uniform("lamda_critic", 0.95, 1.0),
        num_epoch=trial.suggest_int("num_epoch_critic", 1, 10),
        lr=trial.suggest_loguniform("lr_critic", 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical("standardize_adv_critic",
                                                  [True, False]),
        max_grad_norm=trial.suggest_categorical("max_grad_norm_critic",
                                                [None, 1.0, 5.0]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    critic = GAE(vfcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=300,
        batch_size=250,
        min_steps=trial.suggest_int("num_rollouts_algo", 10, 30) *
        env.max_steps,
        num_epoch=trial.suggest_int("num_epoch_algo", 1, 10),
        eps_clip=trial.suggest_uniform("eps_clip_algo", 0.05, 0.2),
        std_init=trial.suggest_uniform("std_init_algo", 0.5, 1.0),
        lr=trial.suggest_loguniform("lr_algo", 1e-5, 1e-3),
        max_grad_norm=trial.suggest_categorical("max_grad_norm_algo",
                                                [None, 1.0, 5.0]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    algo = PPO(osp.join(study_dir, f"trial_{trial.number}"), env, policy,
               critic, **algo_hparam)

    # Train without saving the results
    algo.train(snapshot_mode="latest", seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env,
                                     policy,
                                     num_workers=1,
                                     min_rollouts=min_rollouts)
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
Example no. 19
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Load the data
    data_set_name = "oscillation_50Hz_initpos-0.5"
    data = pd.read_csv(osp.join(pyrado.PERMA_DIR, "misc", f"{data_set_name}.csv"))
    if data_set_name == "daily_min_temperatures":
        data = to.tensor(data["Temp"].values, dtype=to.get_default_dtype()).view(-1, 1)
    elif data_set_name == "monthly_sunspots":
        data = to.tensor(data["Sunspots"].values, dtype=to.get_default_dtype()).view(-1, 1)
    elif "oscillation" in data_set_name:
        data = to.tensor(data["Positions"].values, dtype=to.get_default_dtype()).view(-1, 1)
    else:
        raise pyrado.ValueErr(
            given=data_set_name,
            eq_constraint="'daily_min_temperatures', 'monthly_sunspots', "
            "'oscillation_50Hz_initpos-0.5', or 'oscillation_100Hz_initpos-0.4",
        )

    # Dataset
    data_set_hparam = dict(
        name=data_set_name,
        ratio_train=0.7,
        window_size=trial.suggest_int("dataset_window_size", 1, 100),
        standardize_data=False,
        scale_min_max_data=True,
    )
    dataset = TimeSeriesDataSet(data, **data_set_hparam)

    # Policy
    policy_hparam = dict(
        dt=0.02 if "oscillation" in data_set_name else 1.0,
        obs_layer=None,
        activation_nonlin=to.tanh,
        potentials_dyn_fcn=fcn_from_str(
            trial.suggest_categorical("policy_potentials_dyn_fcn", ["pd_linear", "pd_cubic"])
        ),
        tau_init=trial.suggest_loguniform("policy_tau_init", 1e-2, 1e3),
        tau_learnable=True,
        kappa_init=trial.suggest_categorical("policy_kappa_init", [0, 1e-4, 1e-2]),
        kappa_learnable=True,
        capacity_learnable=True,
        potential_init_learnable=trial.suggest_categorical("policy_potential_init_learnable", [True, False]),
        init_param_kwargs=trial.suggest_categorical("policy_init_param_kwargs", [None]),
        use_cuda=False,
    )
    policy = ADNPolicy(spec=EnvSpec(act_space=InfBoxSpace(shape=1), obs_space=InfBoxSpace(shape=1)), **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        windowed=trial.suggest_categorical("algo_windowed", [True, False]),
        max_iter=1000,
        optim_class=optim.Adam,
        optim_hparam=dict(
            lr=trial.suggest_uniform("optim_lr", 5e-4, 5e-2),
            eps=trial.suggest_uniform("optim_eps", 1e-8, 1e-5),
            weight_decay=trial.suggest_uniform("optim_weight_decay", 5e-5, 5e-3),
        ),
        loss_fcn=nn.MSELoss(),
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f"trial_{trial.number}"))
    algo = TSPred(study_dir, dataset, policy, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode="latest", seed=seed)

    # Evaluate
    num_init_samples = dataset.window_size
    _, loss_trn = TSPred.evaluate(
        policy,
        dataset.data_trn_inp,
        dataset.data_trn_targ,
        windowed=algo.windowed,
        num_init_samples=num_init_samples,
        cascaded=False,
    )
    _, loss_tst = TSPred.evaluate(
        policy,
        dataset.data_tst_inp,
        dataset.data_tst_targ,
        windowed=algo.windowed,
        num_init_samples=num_init_samples,
        cascaded=False,
    )

    return loss_trn
            f"Did not find a recorded real trajectory (qpos_real_{mode} and qvel_real_{mode}) for this policy. "
            f"Run deployment/run_policy_wam.py to get real-world trajectories.",
            "y",
            bright=True,
        )

    # Load the policy and the environment
    env, policy, _ = load_experiment(ex_dir, args)

    # Get nominal environment
    env = remove_all_dr_wrappers(env)
    env.domain_param = env.get_nominal_domain_param()
    env.stop_on_collision = False

    # Fix seed for reproducibility
    pyrado.set_seed(args.seed)

    # Use the recorded initial state from the real system
    init_state = env.init_space.sample_uniform()
    if real_data_exists:
        if input("Use the recorded initial state from the real system? [y] / n ").lower() in ("", "y"):
            init_state[:env.num_dof] = qpos_real[0, :]

    # Define indices of actuated joints
    act_idcs = [1, 3, 5] if env.num_dof == 7 else [1, 3]

    # Do rollout in simulation
    ro = rollout(env,
                 policy,
Example no. 21
def bootstrap_ci(
    data: np.ndarray,
    stat_fcn: Callable,
    num_reps: int,
    alpha: float,
    ci_sides: int,
    bias_correction: bool = False,
    studentized: bool = False,
    seed: int = None,
):
    r"""
    Re-sample the input data using the nonparametric bootstrap method, compute bootstrap replications using `stat_fcn`,
    and compute a confidence interval on the statistic of interest given by `stat_fcn`, which needs to accept the
    argument `axis` (like numpy functions do).

    .. seealso::
        [1] https://projecteuclid.org/download/pdf_1/euclid.ss/1032280214
        [2] https://people.csail.mit.edu/tommi/papers/SteJaa-nips03.pdf
        [3] Cameron & Trivedi, "Microeconometrics: Methods and Applications", 2005, page 361
        [4] http://users.stat.umn.edu/~helwig/notes/bootci-Notes.pdf
        [5] https://www.diva-portal.org/smash/get/diva2:130905/FULLTEXT01.pdf
        [6] https://www.ethz.ch/content/dam/ethz/special-interest/math/statistics/sfs/Education/Advanced%20Studies%20in%20Applied%20Statistics/course-material-1719/Nonparametric%20Methods/lecture_2up.pdf
        [7] https://ocw.mit.edu/courses/mathematics/18-05-introduction-to-probability-and-statistics-spring-2014/readings/MIT18_05S14_Reading24.pdf

    :param data: data to bootstrap from (for now only 1D arrays supported)
    :param stat_fcn: function to compute a statistic of interest (e.g. mean, variance) on bootstrap samples
    :param num_reps: number of bootstrap replications, i.e. how often the data is re-sampled
    :param alpha: determines the confidence level $1 - \alpha \in [0, 1]$
    :param ci_sides: one or two-sided confidence interval
    :param bias_correction: bool to decide if the bias should be subtracted (see [2]). However, the confidence intervals
                            are constructed independent of the bias-correction (see [5, p.7]).
                            The bias-correction can be dangerous in practice. Even though T_bc(D) is less biased than
                            T(D), the bias-corrected estimator may have substantially larger variance. This is due to a
                            possibly higher variability in the estimate of the bias, particularly when computed from
                            small data sets.
                            Other estimates of the bias-correction factor than stat_emp possible, see [4].
    :param studentized: flag to determine if the method based on the t-distribution is used (leads to a wider ci)
    :param seed: value for the random number generators' seeds, pass `None` to skip seeding
    :return: mean of the bootstrap replications, and the confidence interval
    """
    if not isinstance(data, np.ndarray):
        raise pyrado.TypeErr(given=data, expected_type=np.ndarray)
    if not callable(stat_fcn):
        raise pyrado.TypeErr(given=stat_fcn, expected_type=Callable)
    if not isinstance(alpha, (int, float)):
        raise pyrado.TypeErr(given=alpha, expected_type=[int, float])
    if not (isinstance(num_reps, int) and num_reps > 0):
        raise pyrado.TypeErr(given=num_reps, expected_type=int)
    if not (ci_sides == 1 or ci_sides == 2):
        raise pyrado.ValueErr(given=ci_sides, eq_constraint="1 or 2")

    data = np.atleast_2d(data)
    if data.shape[0] == 1:
        data = np.transpose(data)  # correct for np.atleast_2d
    if data.ndim > 2:
        raise pyrado.ShapeErr(msg="The data array needs to be at max two-dimensional!")
    num_data_samples = data.shape[0]
    dim_data_samples = data.shape[1]

    # Set the seed if provided
    pyrado.set_seed(seed)

    # Get the bootstrap replications. The size of the samples drawn by the bootstrap method have to be equal input
    # sample, since the variance of the statistic to be computed depends on sample size
    data_bs = np.stack(
        [data[np.random.choice(num_data_samples, num_data_samples, replace=True)] for _ in range(num_reps)], axis=2
    )
    # data_bs = np.random.choice(data, size=(num_data_samples, dim_data_samples, num_reps), replace=True)

    # Compute the statistic of interest based on the empirical distribution (input data)
    stat_emp = stat_fcn(data, axis=0)
    assert stat_emp.shape == (dim_data_samples,)

    # Compute the statistic of interest based on the resampled distribution -->> bootstrap replications
    stat_bs = stat_fcn(data_bs, axis=0)
    assert stat_bs.shape == (dim_data_samples, num_reps)

    # Correct for the bias introduced by bootstrapping
    if bias_correction:
        # bias-corrected statistic (see (2) in [2], or (11.10) in [3])
        stat_bs_bc = 2 * stat_emp.reshape(-1, 1) - np.mean(
            stat_bs
        )  # repl_bc = stat_emp - bias, with bias = mean_repl - stat_emp
        # Return the bias-corrected estimator based on the original sample a.k.a. empirical distribution,
        # but use the correction also for the bootstrap replications
        stat_ret = stat_bs_bc
    else:
        # Return the estimator based on the original sample a.k.a. empirical distribution
        stat_ret = stat_emp

    # Compute the deviation to the value of the statistic based on the empirical distribution (see [7]). This is
    # analogous to the deviation of the empirical value around the true population value,
    # i.e. delta = stat_emp - stat_pop
    # Note: it makes no difference if one uses the percentile operator before or after this difference
    delta_bs = stat_bs - stat_emp.reshape(-1, 1)
    assert delta_bs.shape == (dim_data_samples, num_reps)

    # Confidence interval with asymptotic refinement (a.k.a. percentile-t method)
    if studentized:
        # Compute the standard error of the original sample
        se_emp = np.std(data, axis=0, ddof=0) / np.sqrt(data.shape[0])  # for dividing by (n-1) set ddof=1
        assert se_emp.shape == (dim_data_samples,)
        if np.any(se_emp < 1e-9):
            print_cbt("The standard error of the empirical data (se_emp) is below 1e-9.", "y")

        # Compute the standard error of the replications for the bootstrapped t-statistic
        se_bs = np.std(data_bs, axis=0, ddof=0) / np.sqrt(data_bs.shape[0])
        assert se_bs.shape == (dim_data_samples, num_reps)
        if np.any(se_bs < 1e-9):  # use any for version 2 above
            print_cbt(
                "The standard error of the bootstrapped data (se_bs) is below 1e-9. "
                "Setting confidence interval bounds to infinity.",
                "y",
            )
            return stat_ret, -pyrado.inf, pyrado.inf

        # Compute the t-statistic of the replications
        t_bs = delta_bs / se_bs  # is consistent with [3, p. 360]

        t_bs.sort()
        # Two-sided confidence interval
        if ci_sides == 2:
            t_lo, t_up = np.percentile(t_bs, 100 * np.array([alpha / 2, 1 - alpha / 2]), axis=1)
        # One-sided confidence interval  (lower and upper bound as if there would only be one of them)
        else:
            t_lo, t_up = np.percentile(t_bs, 100 * np.array([alpha, 1 - alpha]), axis=1)

        ci_lo = stat_emp - t_up * se_emp  # see [3, (11.6) p. 364]
        ci_up = stat_emp - t_lo * se_emp  # see [3, (11.6) p. 364]

    # Confidence interval without asymptotic refinement (a.k.a. basic method)
    else:
        delta_bs.sort()
        # Two-sided confidence interval
        if ci_sides == 2:
            delta_lo, delta_up = np.percentile(delta_bs, 100 * np.array([alpha / 2, 1 - alpha / 2]), axis=1)
        # One-sided confidence interval (lower and upper bound as if there would only be one of them)
        else:
            delta_lo, delta_up = np.percentile(delta_bs, 100 * np.array([alpha, 1 - alpha]), axis=1)

        ci_lo = stat_emp - delta_up
        ci_up = stat_emp - delta_lo

    assert ci_lo.shape == (dim_data_samples,)
    assert ci_up.shape == (dim_data_samples,)

    return stat_ret, ci_lo, ci_up
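A short usage sketch of `bootstrap_ci`: a two-sided 95% bootstrap confidence interval on the mean of a 1-dim sample; the data itself is synthetic:

import numpy as np

data = np.random.default_rng(0).normal(loc=2.0, scale=1.0, size=100)
mean_bs, ci_lo, ci_up = bootstrap_ci(data, stat_fcn=np.mean, num_reps=1000, alpha=0.05, ci_sides=2, seed=0)
print(f"mean: {mean_bs}, 95% CI: [{ci_lo}, {ci_up}]")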
Example no. 22
def rollout(
    env: Env,
    policy: Union[nn.Module, Policy, Callable],
    eval: bool = False,
    max_steps: Optional[int] = None,
    reset_kwargs: Optional[dict] = None,
    render_mode: RenderMode = RenderMode(),
    render_step: int = 1,
    no_reset: bool = False,
    no_close: bool = False,
    record_dts: bool = False,
    stop_on_done: bool = True,
    seed: Optional[int] = None,
    sub_seed: Optional[int] = None,
    sub_sub_seed: Optional[int] = None,
) -> StepSequence:
    """
    Perform a rollout (i.e. sample a trajectory) in the given environment using given policy.

    :param env: environment to use (`SimEnv` or `RealEnv`)
    :param policy: policy to determine the next action given the current observation.
                   This policy may be wrapped by an exploration strategy.
    :param eval: pass `False` if the rollout is executed during training, else `True`. Forwarded to PyTorch `Module`.
    :param max_steps: maximum number of time steps, if `None` the environment's property is used
    :param reset_kwargs: keyword arguments passed to environment's reset function
    :param render_mode: determines if the user sees an animation, console prints, or nothing
    :param render_step: rendering interval, renders every step if set to 1
    :param no_reset: do not reset the environment before running the rollout
    :param no_close: do not close (and disconnect) the environment after running the rollout
    :param record_dts: flag if the time intervals of different parts of one step should be recorded (for debugging)
    :param stop_on_done: set to false to ignore the environment's done flag (for debugging)
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :param sub_seed: sub-seed value for the random number generators, forwarded to `pyrado.set_seed`
    :param sub_sub_seed: sub-sub-seed value for the random number generators, forwarded to `pyrado.set_seed`
    :return: paths of the observations, actions, rewards, and information about the environment as well as the policy
    """
    # Check the input
    if not isinstance(env, Env):
        raise pyrado.TypeErr(given=env, expected_type=Env)
    # Don't restrain policy type, can be any callable
    if not isinstance(eval, bool):
        raise pyrado.TypeErr(given=eval, expected_type=bool)
    # The max_steps argument is checked by the environment's setter
    if not (isinstance(reset_kwargs, dict) or reset_kwargs is None):
        raise pyrado.TypeErr(given=reset_kwargs, expected_type=dict)
    if not isinstance(render_mode, RenderMode):
        raise pyrado.TypeErr(given=render_mode, expected_type=RenderMode)

    # Initialize the paths
    obs_hist = []
    act_hist = []
    act_app_hist = []
    rew_hist = []
    state_hist = []
    env_info_hist = []
    t_hist = []
    if isinstance(policy, Policy):
        if policy.is_recurrent:
            hidden_hist = []
        # If an ExplStrat is passed use the policy property, if a Policy is passed use it directly
        if isinstance(getattr(policy, "policy", policy), PotentialBasedPolicy):
            pot_hist = []
            stim_ext_hist = []
            stim_int_hist = []
        elif isinstance(getattr(policy, "policy", policy), TwoHeadedPolicy):
            head_2_hist = []
        if record_dts:
            dt_policy_hist = []
            dt_step_hist = []
            dt_remainder_hist = []

    # Override the number of steps to execute
    if max_steps is not None:
        env.max_steps = max_steps

    # Set all rngs' seeds (call before resetting)
    if seed is not None:
        pyrado.set_seed(seed, sub_seed, sub_sub_seed)

    # Reset the environment and pass the kwargs
    if reset_kwargs is None:
        reset_kwargs = dict()
    obs = np.zeros(env.obs_space.shape) if no_reset else env.reset(**reset_kwargs)

    # Setup rollout information
    rollout_info = dict(env_name=env.name, env_spec=env.spec)
    if isinstance(inner_env(env), SimEnv):
        rollout_info["domain_param"] = env.domain_param

    if isinstance(policy, Policy):
        # Reset the policy, i.e. the exploration strategy in case of step-based exploration.
        # In case the environment is a simulation, the current domain parameters are passed to the policy. This allows
        # the policy to update its internal model, e.g. for the energy-based swing-up controllers
        if isinstance(env, SimEnv):
            policy.reset(domain_param=env.domain_param)
        else:
            policy.reset()

        # Set dropout and batch normalization layers to the right mode
        if eval:
            policy.eval()
        else:
            policy.train()

        # Check for recurrent policy, which requires initializing the hidden state
        if policy.is_recurrent:
            hidden = policy.init_hidden()

    # Initialize animation
    env.render(render_mode, render_step=1)

    # Initialize the main loop variables
    done = False
    t = 0.0  # time starts at zero
    t_hist.append(t)
    if record_dts:
        t_post_step = time.time()  # first sample of remainder is useless

    # ----------
    # Begin loop
    # ----------

    # Terminate if the environment signals done, it also keeps track of the time
    while not (done and stop_on_done) and env.curr_step < env.max_steps:
        # Record step start time
        if record_dts or render_mode.video:
            t_start = time.time()  # dual purpose
        if record_dts:
            dt_remainder = t_start - t_post_step

        # Check observations
        if np.isnan(obs).any():
            env.render(render_mode, render_step=1)
            raise pyrado.ValueErr(
                msg=f"At least one observation value is NaN!"
                + tabulate(
                    [list(env.obs_space.labels), [*color_validity(obs, np.invert(np.isnan(obs)))]], headers="firstrow"
                )
            )

        # Get the agent's action
        obs_to = to.from_numpy(obs).type(to.get_default_dtype())  # policy operates on PyTorch tensors
        with to.no_grad():
            if isinstance(policy, Policy):
                if policy.is_recurrent:
                    if isinstance(getattr(policy, "policy", policy), TwoHeadedPolicy):
                        act_to, head_2_to, hidden_next = policy(obs_to, hidden)
                    else:
                        act_to, hidden_next = policy(obs_to, hidden)
                else:
                    if isinstance(getattr(policy, "policy", policy), TwoHeadedPolicy):
                        act_to, head_2_to = policy(obs_to)
                    else:
                        act_to = policy(obs_to)
            else:
                # If the policy is not of type Policy, it should still operate on PyTorch tensors
                act_to = policy(obs_to)

        act = act_to.detach().cpu().numpy()  # environment operates on numpy arrays

        # Check actions
        if np.isnan(act).any():
            env.render(render_mode, render_step=1)
            raise pyrado.ValueErr(
                msg=f"At least one action value is NaN!"
                + tabulate(
                    [list(env.act_space.labels), [*color_validity(act, np.invert(np.isnan(act)))]], headers="firstrow"
                )
            )

        # Record time after the action was calculated
        if record_dts:
            t_post_policy = time.time()

        # Ask the environment to perform the simulation step
        state = env.state.copy()
        obs_next, rew, done, env_info = env.step(act)

        # Get the potentially clipped action, i.e. the one that was actually done in the environment
        act_app = env.limit_act(act)

        # Record time after the step i.e. the send and receive is completed
        if record_dts:
            t_post_step = time.time()
            dt_policy = t_post_policy - t_start
            dt_step = t_post_step - t_post_policy

        # Record data
        obs_hist.append(obs)
        act_hist.append(act)
        act_app_hist.append(act_app)
        rew_hist.append(rew)
        state_hist.append(state)
        env_info_hist.append(env_info)
        if record_dts:
            dt_policy_hist.append(dt_policy)
            dt_step_hist.append(dt_step)
            dt_remainder_hist.append(dt_remainder)
            t += dt_policy + dt_step + dt_remainder
        else:
            t += env.dt
        t_hist.append(t)
        if isinstance(policy, Policy):
            if policy.is_recurrent:
                hidden_hist.append(hidden)
                hidden = hidden_next
            # If an ExplStrat is passed use the policy property, if a Policy is passed use it directly
            if isinstance(getattr(policy, "policy", policy), PotentialBasedPolicy):
                pot_hist.append(hidden)
                stim_ext_hist.append(getattr(policy, "policy", policy).stimuli_external.detach().cpu().numpy())
                stim_int_hist.append(getattr(policy, "policy", policy).stimuli_internal.detach().cpu().numpy())
            elif isinstance(getattr(policy, "policy", policy), TwoHeadedPolicy):
                head_2_hist.append(head_2_to)

        # Store the observation for next step (if done, this is the final observation)
        obs = obs_next

        # Render if wanted (actually renders the next state)
        env.render(render_mode, render_step)
        if render_mode.video:
            do_sleep = True
            if pyrado.mujoco_loaded:
                from pyrado.environments.mujoco.base import MujocoSimEnv

                if isinstance(env, MujocoSimEnv):
                    # MuJoCo environments seem to crash on time.sleep()
                    do_sleep = False
            if do_sleep:
                # Measure time spent and sleep if needed
                t_end = time.time()
                t_sleep = env.dt + t_start - t_end
                if t_sleep > 0:
                    time.sleep(t_sleep)

    # --------
    # End loop
    # --------

    if not no_close:
        # Disconnect from EnvReal instance (does nothing for EnvSim instances)
        env.close()

    # Add final observation to observations list
    obs_hist.append(obs)
    state_hist.append(env.state.copy())

    # Return result object
    res = StepSequence(
        observations=obs_hist,
        actions=act_hist,
        actions_applied=act_app_hist,
        rewards=rew_hist,
        states=state_hist,
        time=t_hist,
        rollout_info=rollout_info,
        env_infos=env_info_hist,
        complete=True,  # the rollout function always returns complete paths
    )

    # Add special entries to the resulting rollout
    if isinstance(policy, Policy):
        if policy.is_recurrent:
            res.add_data("hidden_states", hidden_hist)
        if isinstance(getattr(policy, "policy", policy), PotentialBasedPolicy):
            res.add_data("potentials", pot_hist)
            res.add_data("stimuli_external", stim_ext_hist)
            res.add_data("stimuli_internal", stim_int_hist)
        elif isinstance(getattr(policy, "policy", policy), TwoHeadedPolicy):
            res.add_data("head_2", head_2_hist)
    if record_dts:
        res.add_data("dts_policy", dt_policy_hist)
        res.add_data("dts_step", dt_step_hist)
        res.add_data("dts_remainder", dt_remainder_hist)

    return res
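A minimal usage sketch of `rollout()`, reusing the Quanser Qube environment and controller that appear in the earlier examples (imports as in those snippets); this is an illustration under those assumptions, not a test:

env = QQubeSwingUpSim(dt=1 / 250.0, max_steps=1500)
policy = QQubeSwingUpAndBalanceCtrl(env.spec)

ro = rollout(env, policy, eval=True, seed=0, render_mode=RenderMode())
print(f"steps: {ro.length}, return: {ro.undiscounted_return()}")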
Example no. 23
import torch as to
from botorch.acquisition import ExpectedImprovement, ProbabilityOfImprovement, UpperConfidenceBound
from botorch.fit import fit_gpytorch_model
from botorch.models import SingleTaskGP
from botorch.optim import optimize_acqf
from gpytorch.mlls import ExactMarginalLogLikelihood
from matplotlib import pyplot as plt
from tqdm import tqdm

from pyrado import set_seed
from pyrado.utils.functions import noisy_nonlin_fcn
from pyrado.utils.math import UnitCubeProjector

if __name__ == "__main__":
    # Adjustable experiment parameters
    set_seed(1001)
    num_init_samples = 4  # number of initial random points
    num_iter = 6  # number of BO updates
    noise_std = 0.0  # noise level
    acq_fcn = "EI"  # acquisition function (UCB / EI / PI)
    num_acq_restarts = 100  # number of restarts for optimizing the acquisition function
    num_acq_samples = 500  # number of samples used for optimizing the acquisition function
    ucb_beta = 0.1  # UCB coefficient (only necessary if UCB is used)

    # Function boundaries
    x_min_raw, x_max_raw = (-2.0, 5.0)
    x_min, x_max = (0.0, 1.0)
    bounds_raw = to.tensor([[x_min_raw], [x_max_raw]])
    bounds = to.tensor([[x_min], [x_max]])
    uc = UnitCubeProjector(bounds_raw[0, :], bounds_raw[1, :])
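The example stops before the actual optimization loop. The following continuation is a hedged sketch built only from the components imported above; the call signature of `noisy_nonlin_fcn` (a tensor input plus a `noise_std` keyword), the `project_back` method name of `UnitCubeProjector`, and the sign convention (negating the function so that the acquisition maximization minimizes it) are assumptions.

    # Initial design in the unit cube; evaluate the noisy function in the raw space
    # (note: botorch recommends double precision; cast the tensors if you see dtype warnings)
    X = to.rand(num_init_samples, 1)
    Y = -noisy_nonlin_fcn(uc.project_back(X), noise_std=noise_std)  # negate, since the acquisition maximizes

    for _ in tqdm(range(num_iter)):
        # Fit a GP model to all data collected so far
        gp = SingleTaskGP(X, Y)
        mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
        fit_gpytorch_model(mll)

        # Construct the acquisition function (EI here; UCB would additionally use ucb_beta)
        acq = ExpectedImprovement(model=gp, best_f=Y.max())

        # Optimize the acquisition function within the unit cube
        cand, _ = optimize_acqf(
            acq_function=acq, bounds=bounds, q=1,
            num_restarts=num_acq_restarts, raw_samples=num_acq_samples,
        )

        # Evaluate the new candidate and append it to the data set
        y_new = -noisy_nonlin_fcn(uc.project_back(cand), noise_std=noise_std)
        X = to.cat([X, cand], dim=0)
        Y = to.cat([Y, y_new], dim=0)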
Example no. 24
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.
    
    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Load the data
    data_set_name = 'oscillation_50Hz_initpos-0.5'
    data = pd.read_csv(osp.join(pyrado.PERMA_DIR, 'time_series', f'{data_set_name}.csv'))
    if data_set_name == 'daily_min_temperatures':
        data = to.tensor(data['Temp'].values, dtype=to.get_default_dtype()).view(-1, 1)
    elif data_set_name == 'monthly_sunspots':
        data = to.tensor(data['Sunspots'].values, dtype=to.get_default_dtype()).view(-1, 1)
    elif 'oscillation' in data_set_name:
        data = to.tensor(data['Positions'].values, dtype=to.get_default_dtype()).view(-1, 1)
    else:
        raise pyrado.ValueErr(
            given=data_set_name, eq_constraint="'daily_min_temperatures', 'monthly_sunspots', "
                                               "'oscillation_50Hz_initpos-0.5', or 'oscillation_100Hz_initpos-0.4")

    # Dataset
    data_set_hparam = dict(
        name=data_set_name,
        ratio_train=0.7,
        window_size=trial.suggest_int('dataset_window_size', 1, 100),
        standardize_data=False,
        scale_min_max_data=True
    )
    dataset = TimeSeriesDataSet(data, **data_set_hparam)

    # Policy
    policy_hparam = dict(
        dt=0.02 if 'oscillation' in data_set_name else 1.,
        hidden_size=trial.suggest_int('policy_hidden_size', 2, 51),
        obs_layer=None,
        activation_nonlin=fcn_from_str(
            trial.suggest_categorical('policy_activation_nonlin', ['to_tanh', 'to_sigmoid'])),
        mirrored_conv_weights=trial.suggest_categorical('policy_mirrored_conv_weights', [True, False]),
        conv_out_channels=1,
        conv_kernel_size=None,
        conv_padding_mode=trial.suggest_categorical('policy_conv_padding_mode', ['zeros', 'circular']),
        tau_init=trial.suggest_loguniform('policy_tau_init', 1e-2, 1e3),
        tau_learnable=True,
        kappa_init=trial.suggest_categorical('policy_kappa_init', [0, 1e-4, 1e-2]),
        kappa_learnable=True,
        potential_init_learnable=trial.suggest_categorical('policy_potential_init_learnable', [True, False]),
        init_param_kwargs=trial.suggest_categorical('policy_init_param_kwargs', [None, dict(bell=True)]),
        use_cuda=False
    )
    policy = NFPolicy(spec=EnvSpec(act_space=InfBoxSpace(shape=1), obs_space=InfBoxSpace(shape=1)), **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        windowed=trial.suggest_categorical('algo_windowed', [True, False]),
        max_iter=1000,
        optim_class=optim.Adam,
        optim_hparam=dict(
            lr=trial.suggest_uniform('optim_lr', 5e-4, 5e-2),
            eps=trial.suggest_uniform('optim_eps', 1e-8, 1e-5),
            weight_decay=trial.suggest_uniform('optim_weight_decay', 5e-5, 5e-3)
        ),
        loss_fcn=nn.MSELoss(),
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f'trial_{trial.number}'))
    algo = TSPred(study_dir, dataset, policy, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    num_init_samples = dataset.window_size
    _, loss_trn = TSPred.evaluate(policy, dataset.data_trn_inp, dataset.data_trn_targ, windowed=algo.windowed,
                                  num_init_samples=num_init_samples, cascaded=False)
    _, loss_tst = TSPred.evaluate(policy, dataset.data_tst_inp, dataset.data_tst_targ, windowed=algo.windowed,
                                  num_init_samples=num_init_samples, cascaded=False)

    return loss_trn
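As the docstring notes, Optuna passes only the `trial`, so the remaining arguments must be bound beforehand. A minimal wiring sketch, assuming this objective is minimized since it returns a training loss; the study directory, seed, and trial count are placeholders.

import functools

import optuna

study_dir = 'path/to/study'  # placeholder
study = optuna.create_study(direction='minimize')  # the objective returns a loss; use 'maximize' for returns
study.optimize(functools.partial(train_and_eval, study_dir=study_dir, seed=1001), n_trials=100)
print(study.best_params)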
Example no. 25
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: [int, None]):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env = QBallBalancerSim(dt=1/250., max_steps=1500)
    env = ActNormWrapper(env)

    # Policy
    policy = FNNPolicy(
        spec=env.spec,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_policy', [[16, 16], [32, 32], [64, 64]]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_policy', ['to_tanh', 'to_relu'])),
    )

    # Critic
    value_fcn = FNN(
        input_size=env.obs_space.flat_dim,
        output_size=1,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_critic', [[16, 16], [32, 32], [64, 64]]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])),
    )
    critic_hparam = dict(
        gamma=trial.suggest_uniform('gamma_critic', 0.99, 1.),
        lamda=trial.suggest_uniform('lamda_critic', 0.95, 1.),
        num_epoch=trial.suggest_int('num_epoch_critic', 1, 10),
        batch_size=100,
        lr=trial.suggest_loguniform('lr_critic', 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical('standardize_adv_critic', [True, False]),
        # max_grad_norm=5.,
        # lr_scheduler=scheduler.StepLR,
        # lr_scheduler_hparam=dict(step_size=10, gamma=0.9)
        # lr_scheduler=scheduler.ExponentialLR,
        # lr_scheduler_hparam=dict(gamma=0.99)
    )
    critic = GAE(value_fcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_sampler_envs=1,  # parallelize via optuna n_jobs
        max_iter=500,
        min_steps=25*env.max_steps,
        num_epoch=trial.suggest_int('num_epoch_algo', 1, 10),
        eps_clip=trial.suggest_uniform('eps_clip_algo', 0.05, 0.2),
        batch_size=100,
        std_init=0.9,
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
        # max_grad_norm=5.,
        # lr_scheduler=scheduler.StepLR,
        # lr_scheduler_hparam=dict(step_size=10, gamma=0.9)
        # lr_scheduler=scheduler.ExponentialLR,
        # lr_scheduler_hparam=dict(gamma=0.99)
    )
    algo = PPO(osp.join(ex_dir, f'trial_{trial.number}'), env, policy, critic, **algo_hparam)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelSampler(env, policy, num_envs=20, min_rollouts=min_rollouts)
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros])/min_rollouts

    return mean_ret
Example no. 26
"""
First, we create an `Experiment`, which is essentially a folder (by default in `Pyrado/data/temp`). Experiments are
stored using the following scheme: <base_dir>/<env_name>/<algo_name>/<timestamp>--<extra_info>.
This scheme is only required for the automatic search for experiments (e.g. used in `sim_policy()`), which expects the
individual experiment folders to start with `date_time`. Aside from this, you can name your experiments and folders
however you like. Use the `load_experiment()` function to load your results later (see the sketch after the
`setup_experiment()` call below); it looks for an environment as well as a policy file in the provided path.
"""
ex_dir = setup_experiment(BallOnBeamSim.name,
                          f"{HCNormal.name}_{LinearPolicy.name}", "ident-sin")
"""
Additionally, you can set a seed for the random number generators. This is recommended when you compare the effect of
different hyper-parameters, since it eliminates the influence of the initial state and the initial policy parameters
(both are sampled randomly in most cases).
"""
pyrado.set_seed(seed=0, verbose=True)
"""
Set up the environment, a.k.a. the domain to train in. After creating the environment, you can apply various modular
wrappers. Note that the order of the wrappers can matter: for example, wrapping an environment with an `ObsNormWrapper`
and then with a `GaussianObsNoiseWrapper` applies the noise to the normalized observations, which yields different
results than the reverse order of wrapping (a short sketch contrasting both orders follows the environment setup below).
Environments in Pyrado can be of different types: (i) written in Python only (like the Quanser simulations or simple
OpenAI Gym environments), (ii) wrapped as well as self-designed MuJoCo-based simulations, or (iii) self-designed
robotic environments powered by Rcs using either the Bullet or Vortex physics engine. None of the simulations includes
any computer vision aspects. It is all about dynamics-based interaction and (continuous) control. The degree of
randomization for the environments varies strongly, since it is a lot of work to randomize them properly (including
testing) and I have to graduate after all ;)
"""
env_hparams = dict(dt=1 / 50.0, max_steps=300)
env = BallOnBeamSim(**env_hparams)
env = ActNormWrapper(env)
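Regarding the wrapper-order remark above, here is a short sketch contrasting the two orders. The wrapper import paths, the `noise_std` keyword, and whether `ObsNormWrapper` works without explicit bounds for this environment are assumptions; check your pyrado version.

import numpy as np

from pyrado.environment_wrappers.observation_noise import GaussianObsNoiseWrapper  # import path is an assumption
from pyrado.environment_wrappers.observation_normalization import ObsNormWrapper  # import path is an assumption

noise_std = 0.01 * np.ones(env.obs_space.flat_dim)

# Variant A: normalize first, then add noise -> the noise acts on the normalized observations
env_a = GaussianObsNoiseWrapper(ObsNormWrapper(BallOnBeamSim(**env_hparams)), noise_std=noise_std)

# Variant B: add noise first, then normalize -> the noise is rescaled by the normalization
env_b = ObsNormWrapper(GaussianObsNoiseWrapper(BallOnBeamSim(**env_hparams), noise_std=noise_std))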
Example no. 27
                                  f"{SysIdViaEpisodicRL.name}-{CEM.name}")
        subrtn, subrtn_hparam = create_cem_subrtn(ex_dir, env_sim, ddp_policy)
    elif args.mode == REPS.name:
        ex_dir = setup_experiment(env_real.name,
                                  f"{SysIdViaEpisodicRL.name}-{REPS.name}")
        subrtn, subrtn_hparam = create_reps_subrtn(ex_dir, env_sim, ddp_policy)
    elif args.mode == NES.name:
        ex_dir = setup_experiment(env_real.name,
                                  f"{SysIdViaEpisodicRL.name}-{NES.name}")
        subrtn, subrtn_hparam = create_nes_subrtn(ex_dir, env_sim, ddp_policy)
    else:
        raise NotImplementedError(
            "Select mode cem, reps, or nes via the command line argument -m")

    # Set the seed
    pyrado.set_seed(1001, verbose=True)

    # Set the hyper-parameters of SysIdViaEpisodicRL
    num_eval_rollouts = 5
    algo_hparam = dict(
        metric=None,
        std_obs_filt=5,
        obs_dim_weight=[1, 1, 1, 1, 10, 10],
        num_rollouts_per_distr=len(dp_map) * 10,  # former 50
        num_workers=subrtn_hparam["num_workers"],
    )

    # Save the environments and the hyper-parameters
    save_dicts_to_yaml(
        dict(env=env_hparams),
        dict(subrtn=subrtn_hparam, subrtn_name=subrtn.name),
Example no. 28
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(physicsEngine="Bullet", dt=1 / 100.0, max_steps=500)
    env = BallOnPlate2DSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        shared_hidden_sizes=trial.suggest_categorical(
            "shared_hidden_sizes_policy", [(16, 16), (32, 32), (64, 64),
                                           (16, 16, 16), (32, 32, 32)]),
        shared_hidden_nonlin=fcn_from_str(
            trial.suggest_categorical("shared_hidden_nonlin_policy",
                                      ["to_tanh", "to_relu"])),
    )
    policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam)

    # Critic
    qfcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical("hidden_sizes_critic",
                                               [(16, 16), (32, 32), (64, 64),
                                                (16, 16, 16), (32, 32, 32)]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical("hidden_nonlin_critic",
                                      ["to_tanh", "to_relu"])),
    )
    obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
    qfcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace),
                       **qfcn_hparam)
    qfcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace),
                       **qfcn_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=100 * env.max_steps,
        min_steps=trial.suggest_categorical(
            "min_steps_algo", [1]),  # 10, env.max_steps, 10*env.max_steps
        memory_size=trial.suggest_loguniform("memory_size_algo",
                                             1e2 * env.max_steps,
                                             1e4 * env.max_steps),
        tau=trial.suggest_uniform("tau_algo", 0.99, 1.0),
        ent_coeff_init=trial.suggest_uniform("ent_coeff_init_algo", 0.1, 0.9),
        learn_ent_coeff=trial.suggest_categorical("learn_ent_coeff_algo",
                                                  [True, False]),
        standardize_rew=trial.suggest_categorical("standardize_rew_algo",
                                                  [False]),
        gamma=trial.suggest_uniform("gamma_algo", 0.99, 1.0),
        target_update_intvl=trial.suggest_categorical(
            "target_update_intvl_algo", [1, 5]),
        num_updates_per_step=trial.suggest_categorical(
            "num_batch_updates_algo", [1, 5]),
        batch_size=trial.suggest_categorical("batch_size_algo",
                                             [128, 256, 512]),
        lr=trial.suggest_loguniform("lr_algo", 1e-5, 1e-3),
    )
    csv_logger = create_csv_step_logger(
        osp.join(study_dir, f"trial_{trial.number}"))
    algo = SAC(study_dir,
               env,
               policy,
               qfcn_1,
               qfcn_2,
               **algo_hparam,
               logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode="latest", seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env, policy, num_workers=1,
        min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
Example no. 29
    parser.add_argument("--frequency", default=250, type=int)
    parser.set_defaults(max_steps=600)
    parser.add_argument("--ppo_iterations", default=150, type=int)
    parser.add_argument("--sprl_iterations", default=50, type=int)
    parser.add_argument("--cov_only", action="store_true")
    args = parser.parse_args()

    # Experiment (set seed before creating the modules)
    ex_dir = setup_experiment(
        QQubeSwingUpSim.name,
        f"{PPO.name}_{FNNPolicy.name}",
        f"{args.frequency}Hz_{args.max_steps}ROLen_{args.ppo_iterations}PPOIter_{args.sprl_iterations}SPRLIter_cov_only{args.cov_only}_seed_{args.seed}",
    )

    # Set seed if desired
    pyrado.set_seed(args.seed, verbose=True)

    # Environment
    env_hparams = dict(dt=1 / float(args.frequency), max_steps=args.max_steps)
    env = QQubeSwingUpSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)  # FNN
    # policy_hparam = dict(hidden_size=32, num_recurrent_layers=1)  # LSTM & GRU
    policy = FNNPolicy(spec=env.spec, **policy_hparam)
    # policy = GRUPolicy(spec=env.spec, **policy_hparam)

    # Critic
    vfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.relu)  # FNN
    # vfcn_hparam = dict(hidden_size=32, num_recurrent_layers=1)  # LSTM & GRU
Example no. 30
    args = get_argparser().parse_args()

    # Set up the example
    ex_dir = osp.join(pyrado.EVAL_DIR, 'illustrative_example')
    env = CatapultExample(m=1.,
                          g_M=3.71,
                          k_M=1000.,
                          x_M=0.5,
                          g_V=8.87,
                          k_V=3000.,
                          x_V=1.5)
    psi = 0.7  # true probability of drawing Venus
    S = 100  # 100
    N = 30  # 30
    noise_th_scale = 0.15  # 0.15
    set_seed(args.seed)
    fig_size = tuple([0.75 * x for x in pyrado.figsize_thesis_1percol_18to10])

    th_true_opt = env.opt_policy_param(
        1 - psi, psi)  # true probabilities instead of counts
    J_true_opt = env.opt_est_expec_return(
        1 - psi, psi)  # true probabilities instead of counts
    print(f'th_true_opt: {th_true_opt}')
    print(f'J_true_opt:   {J_true_opt}\n')

    # Initialize containers
    n_M_hist = np.empty((S, N))
    n_V_hist = np.empty((S, N))
    th_n_opt_hist = np.empty((S, N))
    th_c_hist = np.empty((S, N))
    Jhat_th_n_opt_hist = np.empty((S, N))