Ejemplo n.º 1
0
    def __init__(
        self, spec: EnvSpec, feats: FeatureStack, init_param_kwargs: Optional[dict] = None, use_cuda: bool = False
    ):
        """
        Set up a policy consisting of a feature stack followed by one bias-free linear layer.

        :param spec: specification of environment
        :param feats: stack of feature functions, must be an instance of `FeatureStack`
        :param init_param_kwargs: additional keyword arguments forwarded to `init_param()`, `None` means no extra
                                  arguments
        :param use_cuda: `True` to move the module to the GPU, `False` (default) to use the CPU
        :raises pyrado.TypeErr: if `feats` is not a `FeatureStack`
        """
        # Reject invalid feature containers before anything else is set up
        if not isinstance(feats, FeatureStack):
            raise pyrado.TypeErr(given=feats, expected_type=FeatureStack)

        # Initialize the base class
        super().__init__(spec, use_cuda)

        num_obs = spec.obs_space.flat_dim
        num_act = spec.act_space.flat_dim

        # The linear layer maps the evaluated features to the (flat) action
        self._feats = feats
        self.num_active_feat = feats.get_num_feat(num_obs)
        self.net = nn.Linear(self.num_active_feat, num_act, bias=False)

        # Run the custom initialization after PyTorch's own network parameter initialization
        if init_param_kwargs is None:
            init_param_kwargs = dict()
        self.init_param(None, **init_param_kwargs)
        self.to(self.device)
Ejemplo n.º 2
0
    def __init__(self,
                 spec: EnvSpec,
                 feats: FeatureStack,
                 init_param_kwargs: dict = None,
                 use_cuda: bool = False):
        """
        Constructor

        :param spec: specification of environment
        :param feats: list of feature functions, must be an instance of `FeatureStack`
        :param init_param_kwargs: additional keyword arguments for the policy parameter initialization,
                                  `None` (default) is treated like an empty dict
        :param use_cuda: forwarded to the base class constructor; presumably selects the GPU (`True`) or the CPU
                         (`False`, default) as `self.device` -- confirm against the base class
        :raises pyrado.TypeErr: if `feats` is not a `FeatureStack`
        """
        # Validate the input before touching the base class, such that an invalid call fails fast
        if not isinstance(feats, FeatureStack):
            raise pyrado.TypeErr(given=feats, expected_type=FeatureStack)

        super().__init__(spec, use_cuda)

        # Store inputs
        self._num_act = spec.act_space.flat_dim
        self._num_obs = spec.obs_space.flat_dim

        # Bias-free linear layer on top of the evaluated features
        self._feats = feats
        self.num_active_feat = feats.get_num_feat(self._num_obs)
        self.net = nn.Linear(self.num_active_feat, self._num_act, bias=False)

        # Call custom initialization function after PyTorch network parameter initialization
        init_param_kwargs = init_param_kwargs if init_param_kwargs is not None else dict()
        self.init_param(None, **init_param_kwargs)
        self.to(self.device)
Ejemplo n.º 3
0
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value, i.e. the mean undiscounted return over all evaluation rollouts
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1 / 250., max_steps=1500)
    env = QQubeSwingUpSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(feats=FeatureStack([
        identity_feat, sign_feat, abs_feat, squared_feat, cubic_feat,
        ATan2Feat(1, 2),
        MultFeat([4, 5])
    ]))
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=50,
        pop_size=trial.suggest_int('pop_size', 50, 200),
        num_rollouts=trial.suggest_int('num_rollouts', 4, 10),
        num_is_samples=trial.suggest_int('num_is_samples', 5, 40),
        expl_std_init=trial.suggest_uniform('expl_std_init', 0.1, 0.5),
        symm_sampling=trial.suggest_categorical('symm_sampling',
                                                [True, False]),
    )
    # The logger and the algorithm share the same trial-specific directory, so build the path only once
    trial_dir = osp.join(study_dir, f'trial_{trial.number}')
    csv_logger = create_csv_step_logger(trial_dir)
    algo = PoWER(trial_dir, env, policy, **algo_hparam, logger=csv_logger)

    # Train; NOTE(review): `snapshot_mode='latest'` suggests that the latest snapshot is still stored -- confirm
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env, policy, num_workers=1,
        min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()

    # Judging by its name, `min_rollouts` is only a lower bound on the number of returned rollouts, hence average
    # over the rollouts that were actually collected instead of the requested minimum
    mean_ret = sum(r.undiscounted_return() for r in ros) / len(ros)

    return mean_ret
Ejemplo n.º 4
0
def create_nonrecurrent_policy():
    """
    Build a `LinearPolicy` on a small hand-made environment specification.

    :return: linear policy using the constant, identity, and squared features
    """
    # Both spaces are bounded by -1 and 1; the third argument (4 and 3) presumably sets their size -- confirm
    obs_space = BoxSpace(-1, 1, 4)
    act_space = BoxSpace(-1, 1, 3)
    spec = EnvSpec(obs_space, act_space)

    feats = FeatureStack(const_feat, identity_feat, squared_feat)
    return LinearPolicy(spec, feats)
Ejemplo n.º 5
0
def create_lin_setup(physicsEngine: str, dt: float, max_steps: int,
                     checkJointLimits: bool):
    """
    Create a `MiniGolfIKSim` environment and a `LinearPolicy` that only uses the constant feature.

    :param physicsEngine: passed to the environment as `physicsEngine`
    :param dt: passed to the environment as `dt`
    :param max_steps: passed to the environment as `max_steps`
    :param checkJointLimits: passed to the environment as `checkJointLimits`
    :return: tuple of the environment and the policy
    """
    # Set up environment (physics node and fixed initial state are always switched on)
    sim_kwargs = dict(
        usePhysicsNode=True,
        physicsEngine=physicsEngine,
        dt=dt,
        max_steps=max_steps,
        checkJointLimits=checkJointLimits,
        fixedInitState=True,
    )
    env = MiniGolfIKSim(**sim_kwargs)

    # Set up policy
    feats = FeatureStack([const_feat])
    policy = LinearPolicy(env.spec, feats)

    # Original annotation of the values: X (abs), Y (rel), Z (abs), C (abs)
    # NOTE(review): the annotation lists four components, but only three values are assigned -- confirm
    policy.param_values = to.tensor([0.6, 0.0, 0.03])

    return env, policy
Ejemplo n.º 6
0
def create_bob_setup():
    """
    Assemble the ball-on-beam setup consisting of a randomized simulation, a second simulation instance with a
    changed domain parameter acting as the real system, and the two policies.

    :return: tuple of the wrapped simulation environment, the "real" environment, the environment hyper-parameters,
             the mapping from indices to domain distribution parameters, the behavioral policy, and the policy over
             the domain distribution parameters
    """
    # Environments
    env_hparams = {'dt': 1 / 100., 'max_steps': 500}

    # The "real" environment is a second simulation instance with a different value for `g`
    env_real = BallOnBeamSim(**env_hparams)
    env_real.domain_param = {
        # 'l_beam': 1.95,
        # 'ang_offset': -0.03,
        'g': 10.81,
    }

    # The simulation is wrapped by a live domain randomizer and then by the meta-level wrapper
    env_sim = BallOnBeamSim(**env_hparams)
    sim_randomizer = DomainRandomizer(
        # NormalDomainParam(name='l_beam', mean=0, std=1e-12, clip_lo=1.5, clip_up=3.5),
        # UniformDomainParam(name='ang_offset', mean=0, halfspan=1e-12),
        NormalDomainParam(name='g', mean=0, std=1e-12),
    )
    env_sim = DomainRandWrapperLive(env_sim, sim_randomizer)

    # Index -> (domain parameter name, name of the distribution parameter)
    dp_map = {
        # 0: ('l_beam', 'mean'), 1: ('l_beam', 'std'),
        # 2: ('ang_offset', 'mean'), 3: ('ang_offset', 'halfspan')
        0: ('g', 'mean'),
        1: ('g', 'std'),
    }
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Policies (the behavioral policy needs to be deterministic)
    behavior_feats = FeatureStack([identity_feat, sin_feat])
    behavior_policy = LinearPolicy(env_sim.spec, feats=behavior_feats)
    behavior_policy.param_values = to.tensor(
        [3.8090, -3.8036, -1.0786, -2.4510, -0.9875, -1.3252, 3.1503, 1.4443]
    )

    # Prior over the domain parameters, handed to the domain distribution parameter policy
    prior = DomainRandomizer(
        # NormalDomainParam(name='l_beam', mean=2.05, std=2.05/10),
        # UniformDomainParam(name='ang_offset', mean=0.03, halfspan=0.03/10),
        NormalDomainParam(name='g', mean=8.81, std=8.81 / 10),
    )
    # trafo_mask = [False, True, False, True]
    trafo_mask = [True, True]
    ddp_policy = DomainDistrParamPolicy(
        mapping=dp_map, trafo_mask=trafo_mask, prior=prior, scale_params=True
    )

    return env_sim, env_real, env_hparams, dp_map, behavior_policy, ddp_policy
Ejemplo n.º 7
0
    ex_dir = setup_experiment(BallOnBeamSim.name,
                              f"{REPS.name}_{LinearPolicy.name}")

    # Set seed if desired
    pyrado.set_seed(args.seed, verbose=True)

    # Environment
    env_hparams = dict(dt=1 / 100.0, max_steps=500)
    env = BallOnBeamSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        # feats=FeatureStack(RFFeat(env.obs_space.flat_dim, num_feat=1000, bandwidth=1/env.obs_space.bound_up))
        # feats=FeatureStack(RBFFeat(num_feat_per_dim=20, bounds=env.obs_space.bounds, scale=0.8)),
        feats=FeatureStack(identity_feat, sin_feat))
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=500,
        eps=0.2,
        pop_size=10 * policy.num_param,
        num_init_states_per_domain=10,
        expl_std_init=0.2,
        expl_std_min=0.02,
        num_epoch_dual=1000,
        optim_mode="scipy",
        lr_dual=1e-3,
        use_map=True,
        num_workers=8,
Ejemplo n.º 8
0
from pyrado.policies.linear import LinearPolicy


if __name__ == '__main__':
    # Experiment (set seed before creating the modules)
    ex_dir = setup_experiment(BallOnBeamSim.name, REPS.name, LinearPolicy.name, seed=1001)

    # Environment
    env_hparams = dict(dt=1/100., max_steps=500)
    env = BallOnBeamSim(**env_hparams)

    # Policy
    policy_hparam = dict(
        # feats=FeatureStack([RandFourierFeat(env.obs_space.flat_dim, num_feat=100, bandwidth=env.obs_space.bound_up)])
        # feats=FeatureStack([RBFFeat(num_feat_per_dim=20, bounds=env.obs_space.bounds, scale=0.8)]),
        feats=FeatureStack([identity_feat, sin_feat])
    )
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=200,
        eps=0.1,
        gamma=0.995,
        pop_size=20*policy.num_param,
        num_rollouts=10,
        expl_std_init=1.0,
        expl_std_min=0.02,
        num_epoch_dual=500,
        grad_free_optim=True,
        lr_dual=1e-4,
Ejemplo n.º 9
0
from pyrado.policies.linear import LinearPolicy
from pyrado.spaces import BoxSpace
from pyrado.utils.data_types import EnvSpec


if __name__ == '__main__':
    # Define some arbitrary EnvSpec
    obs_space = BoxSpace(bound_lo=np.array([-5., -12.]), bound_up=np.array([10., 6.]))
    act_space = BoxSpace(bound_lo=np.array([-1.]), bound_up=np.array([1.]))
    spec = EnvSpec(obs_space, act_space)

    num_fpd = 5
    num_eval_points = 500

    policy_hparam = dict(
        feats=FeatureStack([RBFFeat(num_feat_per_dim=num_fpd, bounds=obs_space.bounds, scale=None)])
    )
    policy = LinearPolicy(spec, **policy_hparam)

    eval_grid_0 = to.linspace(-5., 10, num_eval_points)
    eval_grid_1 = to.linspace(-12., 6, num_eval_points)
    eval_grid = to.stack([eval_grid_0, eval_grid_1], dim=1)

    feat_vals = to.empty(num_eval_points, num_fpd*obs_space.flat_dim)
    # Feed evaluation samples one by one
    for i, obs in enumerate(eval_grid):
        feat_vals[i, :] = policy.eval_feats(obs)

    feat_vals_batch = policy.eval_feats(eval_grid)

    if (feat_vals == feat_vals_batch).all():
Ejemplo n.º 10
0
robotic environments powered by Rcs using either the Bullet or Vortex physics engine. None of the simulations includes
any computer vision aspects. It is all about dynamics-based interaction and (continuous) control. The degree of
randomization for the environments varies strongly, since it is a lot of work to randomize them properly (including
testing) and I have to graduate after all ;)
"""
env_hparams = dict(dt=1 / 50.0, max_steps=300)
env = BallOnBeamSim(**env_hparams)
env = ActNormWrapper(env)
"""
Set up the policy after the environment since it needs to know the dimensions of the policy's observation and action
space. There are many different policy architectures available under `Pyrado/pyrado/policies`, which significantly
vary in terms of required hyper-parameters. You can find some examples at `Pyrado/scripts/training`.
Note that all policies must inherit from `Policy` which inherits from `torch.nn.Module`. Moreover, all `Policy`
instances are deterministic. The exploration is handled separately (see `Pyrado/pyrado/exploration`).
"""
policy_hparam = dict(feats=FeatureStack(identity_feat, sin_feat))
policy = LinearPolicy(spec=env.spec, **policy_hparam)
"""
Specify the algorithm you want to use for learning the policy parameters.
For deterministic sampling, you need to set `num_workers=1`. If `num_workers>1`, PyTorch's multiprocessing
library will be used to parallelize sampling from the environment on the CPU. The resulting behavior is non-deterministic,
i.e. even for the same random seed, you will get different results. Moreover, it is advised to set `num_workers` to 1
if you want to debug your code.
The algorithms can be categorized in two different types: one type randomizes the action every step (their exploration
strategy inherits from `StochasticActionExplStrat`), and the other type randomizes the policy parameters once every
rollout (their exploration strategy inherits from `StochasticParamExplStrat`). It goes without saying that every
algorithm has different hyper-parameters. However, they all use the same `rollout()` function to generate their data.
"""
algo_hparam = dict(
    max_iter=8,
    pop_size=20,
Ejemplo n.º 11
0
    # Parse the command line arguments (only `args.seed` is used below)
    args = get_argparser().parse_args()

    # Experiment (set seed before creating the modules)
    ex_dir = setup_experiment(OneMassOscillatorSim.name, f'{PEPG.name}_{LinearPolicy.name}')

    # Set seed if desired
    pyrado.set_seed(args.seed, verbose=True)

    # Environment
    env_hparams = dict(dt=1/50., max_steps=200)
    # Pass the desired state directly via `task_args`. It was previously wrapped in a second `task_args` dict, i.e.
    # nested one level too deep, where the environment presumably would not find it.
    env = OneMassOscillatorSim(**env_hparams, task_args=dict(state_des=np.array([0.5, 0])))
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        feats=FeatureStack([const_feat, identity_feat])
    )
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=100,
        num_rollouts=8,
        pop_size=60,
        expl_std_init=1.0,
        clip_ratio_std=0.05,
        normalize_update=False,
        transform_returns=True,
        lr=1e-2,
        num_workers=8,
    )
Ejemplo n.º 12
0
"""
from tabulate import tabulate

from pyrado.environment_wrappers.action_normalization import ActNormWrapper
from pyrado.environments.pysim.ball_on_beam import BallOnBeamSim
from pyrado.policies.features import FeatureStack, identity_feat, squared_feat
from pyrado.policies.feed_back.linear import LinearPolicy
from pyrado.sampling.parallel_rollout_sampler import ParallelRolloutSampler

if __name__ == "__main__":
    # Ball-on-beam simulation wrapped by the action normalization wrapper
    env = ActNormWrapper(BallOnBeamSim(dt=0.02, max_steps=500))

    # Linear policy on top of the identity and the squared features
    feats = FeatureStack(identity_feat, squared_feat)
    policy = LinearPolicy(env.spec, feats)

    # Sampler using two workers, asked for a minimum of 2000 rollouts
    sampler = ParallelRolloutSampler(env, policy, num_workers=2, min_rollouts=2000)

    # Sample once, then print the number of rollouts and the summed length of all rollouts as a two-row table
    ros = sampler.sample()
    summary_rows = [
        ("StepSequence count", len(ros)),
        ("Step count", sum(len(ro) for ro in ros)),
    ]
    print(tabulate(summary_rows))
Ejemplo n.º 13
0
    # Experiment (set seed before creating the modules); the directory is named after the environment and the
    # algorithm-policy combination
    ex_dir = setup_experiment(
        QCartPoleSwingUpSim.name,
        f'{REPS.name}_{LinearPolicy.name}',
    )

    # Set seed if desired
    pyrado.set_seed(args.seed, verbose=True)

    # Environments (simulation wrapped by the action normalization wrapper)
    env_hparams = {'dt': 1/250., 'max_steps': 3000, 'long': False}
    env = ActNormWrapper(QCartPoleSwingUpSim(**env_hparams))

    # Policy
    policy_hparam = {
        # 'feats': FeatureStack([RandFourierFeat(env.obs_space.flat_dim, num_feat=20, bandwidth=env.obs_space.bound_up)])
        'feats': FeatureStack([
            const_feat,
            identity_feat,
            sign_feat,
            abs_feat,
            squared_feat,
            cubic_feat,
            ATan2Feat(1, 2),
            MultFeat([3, 4]),
        ]),
    }
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Algorithm (the population size scales with the number of policy parameters)
    algo_hparam = {
        'max_iter': 500,
        'eps': 1.0,
        'pop_size': 20*policy.num_param,
        'num_rollouts': 4,
        'expl_std_init': 0.2,
        'expl_std_min': 0.02,
        'use_map': True,
        'optim_mode': 'scipy',
        'num_workers': 12,
    }
Ejemplo n.º 14
0
    # Experiment (set seed before creating the modules); the directory is named after the environment and the
    # algorithm-policy combination
    ex_dir = setup_experiment(OneMassOscillatorSim.name, f"{PEPG.name}_{LinearPolicy.name}")

    # Set seed if desired
    pyrado.set_seed(args.seed, verbose=True)

    # Environment (the desired state is handed over via the task arguments)
    env_hparams = {"dt": 1 / 50.0, "max_steps": 200}
    env = ActNormWrapper(
        OneMassOscillatorSim(**env_hparams, task_args={"state_des": np.array([0.5, 0])})
    )

    # Policy
    policy_hparam = {"feats": FeatureStack(const_feat, identity_feat)}
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = {
        "max_iter": 100,
        "num_init_states_per_domain": 8,
        "pop_size": 60,
        "expl_std_init": 1.0,
        "clip_ratio_std": 0.05,
        "normalize_update": False,
        "transform_returns": True,
        "lr": 1e-2,
        "num_workers": 8,
    }
    algo = PEPG(ex_dir, env, policy, **algo_hparam)
Ejemplo n.º 15
0
def get_lin_ctrl(env: SimEnv, ctrl_type: str, ball_z_dim_mismatch: bool = True) -> LinearPolicy:
    """
    Construct a linear controller specified by its controller gains and wrap it as a `LinearPolicy`.

    For `BallOnPlate5DSim`, the gains are hard-coded (parameters by Markus Lamprecht, clipped gains < 1e-5 to 0).
    For `QBallBalancerSim`, the gains are either Quanser's PD gains or obtained by solving a continuous-time LQR
    problem using the `control` package, which is only imported in that case.

    :param env: environment, its inner environment must be a `BallOnPlate5DSim` or a `QBallBalancerSim`
    :param ctrl_type: type of the controller (case-insensitive): 'lqr' or 'h2' for `BallOnPlate5DSim`,
                      'pd' or 'lqr' for `QBallBalancerSim`
    :param ball_z_dim_mismatch: only useful for `BallOnPlate5DSim`,
                                set to `True` if the given controller does not have the z component (relative
                                position) of the ball in the state vector, i.e. state is 14-dim instead of 16-dim
    :return: controller compatible with Pyrado Policy
    :raises pyrado.ValueErr: if `ctrl_type` is not supported for the given environment
    :raises pyrado.TypeErr: if the inner environment is neither of the two supported types
    """
    from pyrado.environments.rcspysim.ball_on_plate import BallOnPlate5DSim

    if isinstance(inner_env(env), BallOnPlate5DSim):
        # Get the controller gains (K-matrix)
        ctrl_name = ctrl_type.lower()
        if ctrl_name == 'lqr':
            gain_rows = [
                [0.1401, 0, 0, 0, -0.09819, -0.1359, 0, 0.545, 0, 0, 0, -0.01417, -0.04427, 0],
                [0, 0.1381, 0, 0.2518, 0, 0, -0.2142, 0, 0.5371, 0, 0.03336, 0, 0, -0.1262],
                [0, 0, 0.1414, 0.0002534, 0, 0, -0.0002152, 0, 0, 0.5318, 0, 0, 0, -0.0001269],
                [0, -0.479, -0.0004812, 39.24, 0, 0, -15.44, 0, -1.988, -0.001934, 9.466, 0, 0, -13.14],
                [0.3039, 0, 0, 0, 25.13, 15.66, 0, 1.284, 0, 0, 0, 7.609, 6.296, 0]
            ]
        elif ctrl_name == 'h2':
            gain_rows = [
                [-73.88, -2.318, 39.49, -4.270, 12.25, 0.9779, 0.2564, 35.11, 5.756, 0.8661, -0.9898, 1.421, 3.132,
                 -0.01899],
                [-24.45, 0.7202, -10.58, 2.445, -0.6957, 2.1619, -0.3966, -61.66, -3.254, 5.356, 0.1908, 12.88,
                 6.142, -0.3812],
                [-101.8, -9.011, 64.345, -5.091, 17.83, -2.636, 0.9506, -44.28, 3.206, 37.59, 2.965, -32.65, -21.68,
                 -0.1133],
                [-59.56, 1.56, -0.5794, 26.54, -2.503, 3.827, -7.534, 9.999, 1.143, -16.96, 8.450, -5.302, 4.620,
                 -10.32],
                [-107.1, 0.4359, 19.03, -9.601, 20.33, 10.36, 0.2285, -74.98, -2.136, 7.084, -1.240, 62.62, 33.66,
                 1.790]
            ]
        else:
            raise pyrado.ValueErr(given=ctrl_type, eq_constraint="'lqr' or 'h2'")
        ctrl_gains = to.tensor(gain_rows)

        # Compensate for the mismatching state definition by inserting zero-gain columns
        # (index 7: ball z position, index -1: ball z velocity)
        if ball_z_dim_mismatch:
            for col_idx in (7, -1):
                ctrl_gains = insert_tensor_col(ctrl_gains, col_idx, to.zeros((5, 1)))

    elif isinstance(inner_env(env), QBallBalancerSim):
        # Get the controller gains (K-matrix)
        ctrl_name = ctrl_type.lower()
        if ctrl_name == 'pd':
            # Quanser gains (the original Quanser controller includes action clipping)
            quanser_gains = to.tensor([[-14., 0, -14*3.45, 0, 0, 0, -14*2.11, 0],
                                       [0, -14., 0, -14*3.45, 0, 0, 0, -14*2.11]])
            ctrl_gains = -quanser_gains

        elif ctrl_name == 'lqr':
            # Since the control module can be tricky to install (recommended using anaconda), we only load it if
            # needed
            import control

            # System modeling: the upper right block of A is the identity matrix
            num_obs = env.obs_space.flat_dim
            half = num_obs//2
            A = np.zeros((num_obs, num_obs))
            A[:half, half:] = np.eye(half)

            # The entries (4, 4) and (5, 5) share one value, the same holds for the row pair 6 and 7
            A[4, 4] = A[5, 5] = -env.B_eq_v/env.J_eq
            A[6, 0] = A[7, 1] = env.c_kin*env.m_ball*env.g*env.r_ball**2/env.zeta
            A[6, 6] = A[7, 7] = -env.c_kin*env.r_ball**2/env.zeta

            B = np.zeros((num_obs, env.act_space.flat_dim))
            B[4, 0] = B[5, 1] = env.A_m/env.J_eq
            # C = np.zeros((env.obs_space.flat_dim // 2, env.obs_space.flat_dim))
            # C[:env.obs_space.flat_dim // 2, :env.obs_space.flat_dim // 2] = np.eye(env.obs_space.flat_dim // 2)
            # D = np.zeros((env.obs_space.flat_dim // 2, env.act_space.flat_dim))

            # Get the weighting matrices from the environment's reward function
            rew_fcn = env.task.rew_fcn
            Q = rew_fcn.Q
            R = rew_fcn.R
            # Q = np.diag([1e2, 1e2, 5e2, 5e2, 1e-2, 1e-2, 1e+1, 1e+1])

            # Solve the continuous time Riccati eq
            K, _, _ = control.lqr(A, B, Q, R)  # for discrete system pass dt
            ctrl_gains = to.from_numpy(K).to(to.get_default_dtype())

        else:
            raise pyrado.ValueErr(given=ctrl_type, eq_constraint="'pd', 'lqr'")

    else:
        raise pyrado.TypeErr(given=inner_env(env), expected_type=BallOnPlate5DSim)

    # Reconstruct the controller as a linear policy on the identity feature
    ctrl = LinearPolicy(env.spec, FeatureStack([identity_feat]))
    ctrl.init_param(-1*ctrl_gains)  # in classical control it is u = -K*x; here a = psi(s)*s
    return ctrl
Ejemplo n.º 16
0
    def __init__(self,
                 spec: EnvSpec,
                 rbf_hparam: dict,
                 dim_mask: int = 2,
                 init_param_kwargs: dict = None,
                 use_cuda: bool = False):
        """
        Constructor

        :param spec: specification of environment
        :param rbf_hparam: hyper-parameters for the RBF-features, see `RBFFeat`, must contain `num_feat_per_dim`
        :param dim_mask: number of RBF features to mask out at the beginning and the end of every dimension,
                         pass 1 to remove the first and the last features for the policy, pass 0 to use all
                         RBF features. Masking out RBFs makes sense if you want to obtain a smooth starting behavior.
        :param init_param_kwargs: additional keyword arguments for the policy parameter initialization
        :param use_cuda: `True` to move the policy to the GPU, `False` (default) to use the CPU
        :raises pyrado.ShapeErr: if the action space has an odd number of dimensions
        :raises pyrado.ValueErr: if `dim_mask` is negative or larger than half the number of features per dimension
        """
        if not spec.act_space.flat_dim % 2 == 0:
            raise pyrado.ShapeErr(
                msg=
                "DualRBFLinearPolicy only works with an even number of actions, since we are using the time "
                "derivative of the features to create the second half of the outputs. This is done to use "
                "forward() in order to obtain the joint position and the joint velocities. Check the action space "
                "of the environment if the second halt of the actions space are velocities!"
            )
        if not (0 <= dim_mask <= rbf_hparam["num_feat_per_dim"] // 2):
            raise pyrado.ValueErr(
                given=dim_mask,
                ge_constraint="0",
                le_constraint=f"{rbf_hparam['num_feat_per_dim']//2}")

        # Construct the RBF features
        self._feats = RBFFeat(**rbf_hparam)

        # Call LinearPolicy's constructor (custom parts will be overridden later)
        super().__init__(spec, FeatureStack(self._feats), init_param_kwargs,
                         use_cuda)

        # Override custom parts
        self._feats = RBFFeat(**rbf_hparam)
        self.dim_mask = dim_mask
        if self.dim_mask > 0:
            self.num_active_feat = self._feats.num_feat - 2 * self.dim_mask * spec.obs_space.flat_dim
        else:
            self.num_active_feat = self._feats.num_feat
        # Only half of the actions are produced by the network, see the error message above for the reasoning
        self.net = nn.Linear(self.num_active_feat,
                             self.env_spec.act_space.flat_dim // 2,
                             bias=False)

        # Create mask to deactivate the first and the last `dim_mask` features of every input dimension.
        # The guard is essential: for `dim_mask == 0` the slice `[-0:, :]` is identical to `[0:, :]`, i.e. it would
        # select every row and thereby deactivate all features instead of none.
        self.feats_mask = to.ones(self._feats.centers.shape, dtype=to.bool)
        if self.dim_mask > 0:
            self.feats_mask[:self.dim_mask, :] = False
            self.feats_mask[-self.dim_mask:, :] = False
        self.feats_mask = self.feats_mask.t().reshape(
            -1)  # reshape the same way as in RBFFeat

        # Call custom initialization function after PyTorch network parameter initialization
        init_param_kwargs = init_param_kwargs if init_param_kwargs is not None else dict(
        )
        self.init_param(None, **init_param_kwargs)
        self.to(self.device)
Ejemplo n.º 17
0
    # Experiment directory named after the environment, the algorithm combination, and the policy setup
    # NOTE(review): 'actedlay' looks like a typo of 'actdelay'; left untouched since it is part of the experiment name
    ex_dir = setup_experiment(QBallBalancerSim.name, f'{SPOTA.name}-{HCNormal.name}',
                              f'{LinearPolicy.name}_obsnoise-s_actedlay-10', seed=1001)

    # Environment and domain randomization
    env_hparams = dict(dt=1/100., max_steps=500)
    env = QBallBalancerSim(**env_hparams)
    env = GaussianObsNoiseWrapper(env, noise_std=[1/180*pi, 1/180*pi, 0.005, 0.005,  # [rad, rad, m, m, ...
                                                  10/180*pi, 10/180*pi, 0.05, 0.05])  # ... rad/s, rad/s, m/s, m/s]
    # env = ObsPartialWrapper(env, mask=[0, 0, 0, 0, 1, 1, 0, 0])
    env = ActDelayWrapper(env)
    # The action delay is randomized in addition to the default domain parameters
    randomizer = get_default_randomizer(env)
    randomizer.add_domain_params(UniformDomainParam(name='act_delay', mean=5, halfspan=5, clip_lo=0, roundint=True))
    env = DomainRandWrapperBuffer(env, randomizer)

    # Policy
    policy_hparam = dict(feats=FeatureStack([identity_feat]))
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Initialize with Quanser's PD gains
    init_policy_param_values = to.tensor([[-14., 0, -14*3.45, 0, 0, 0, -14*2.11, 0],
                                          [0, -14., 0, -14*3.45, 0, 0, 0, -14*2.11]])

    # Algorithm
    subrtn_hparam_cand = dict(
        max_iter=100,
        num_rollouts=0,  # will be overwritten by SPOTA
        pop_size=50,
        expl_factor=1.1,
        expl_std_init=0.5,
    )
Ejemplo n.º 18
0
        len_rollouts=env_sim.max_steps,
        # recurrent_network_type=nn.RNN,
        # only_last_output=True,
        # hidden_size=20,
        # num_recurrent_layers=1,
        # output_size=1,
    )
    embedding = create_embedding(DeltaStepsEmbedding.name, env_sim.spec,
                                 **embedding_hparam)

    # Posterior (normalizing flow)
    posterior_hparam = dict(model="maf", hidden_features=20, num_transforms=4)

    # Policy
    policy_hparam = dict(
        feats=FeatureStack(const_feat, identity_feat, sign_feat, squared_feat,
                           MultFeat((0, 2)), MultFeat((1, 2))))
    policy = LinearPolicy(spec=env_sim.spec, **policy_hparam)

    # Policy optimization subroutine
    subrtn_policy_hparam = dict(
        max_iter=5,
        pop_size=5 * policy.num_param,
        num_domains=20,
        num_init_states_per_domain=1,
        expl_factor=1.05,
        expl_std_init=1.0,
        num_workers=args.num_workers,
    )
    subrtn_policy = HCNormal(ex_dir, env_sim, policy, **subrtn_policy_hparam)

    # Algorithm