def test_truncate_rollouts(env, num_real_ros, num_sim_ros, max_real_steps,
                           max_sim_steps):
    policy = DummyPolicy(env.spec)
    ros_real = []
    ros_sim = []

    # Create the rollout data
    for _ in range(num_real_ros):
        ros_real.append(
            rollout(env,
                    policy,
                    eval=True,
                    max_steps=max_real_steps,
                    stop_on_done=True))
    for _ in range(num_sim_ros):
        ros_sim.append(
            rollout(env,
                    policy,
                    eval=True,
                    max_steps=max_sim_steps,
                    stop_on_done=True))

    # Truncate them
    ros_real_tr, ros_sim_tr = SysIdViaEpisodicRL.truncate_rollouts(
        ros_real, ros_sim)

    # Check that the right number of rollouts was obtained
    assert len(ros_real_tr) == len(ros_sim_tr)

    for ro_r, ro_s in zip(ros_real_tr, ros_sim_tr):
        # All individual truncated rollouts have the correct length
        assert ro_r.length == ro_s.length
Example 2
    def eval_ddp_policy(
        rollouts_real: Sequence[StepSequence],
        env_sim: MetaDomainRandWrapper,
        num_rollouts: int,
        subrtn_distr: SysIdViaEpisodicRL,
        subrtn_policy: Algorithm,
    ) -> float:
        """
        Evaluate the policy that fits the domain parameter distribution to the observed rollouts.

        :param rollouts_real: recorded real-world rollouts
        :param env_sim: randomized simulation environment a.k.a. source domain
        :param num_rollouts: number of rollouts to collect on the target domain
        :param subrtn_distr: algorithm which performs the optimization of the domain parameter distribution policy
        :param subrtn_policy: algorithm which performs the optimization of the behavioral policy (and value-function)
        :return: average system identification loss
        """
        # Run rollouts in simulation with the same initial states as the real-world rollouts
        assert env_sim.randomizer is subrtn_distr.subrtn.env.randomizer
        init_states_real = np.array([ro.states[0, :] for ro in rollouts_real])
        rollouts_sim = SimOpt.eval_behav_policy(None, env_sim,
                                                subrtn_policy.policy, "",
                                                num_rollouts, init_states_real)

        # Truncate the rollouts, yielding two lists of pairwise equally long rollouts
        ros_real_tr, ros_sim_tr = SysIdViaEpisodicRL.truncate_rollouts(
            rollouts_real, rollouts_sim, replicate=False)
        assert len(ros_real_tr) == len(ros_sim_tr)
        assert all([
            np.allclose(r.states[0, :], s.states[0, :])
            for r, s in zip(ros_real_tr, ros_sim_tr)
        ])

        # Return the average loss
        losses = [
            subrtn_distr.loss_fcn(ro_r, ro_s)
            for ro_r, ro_s in zip(ros_real_tr, ros_sim_tr)
        ]
        return float(np.mean(np.asarray(losses)))
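A minimal, hypothetical call-site sketch for the routine above, mirroring the manual training loops shown in the later examples (all objects such as `rollouts_real`, `env_sim`, `subrtn_distr`, and `subrtn_policy` are assumed to already exist):

# Hedged usage sketch: compare the system identification loss before and after
# one SysIdViaEpisodicRL step on the same set of recorded real-world rollouts.
loss_pre = eval_ddp_policy(rollouts_real, env_sim, num_rollouts=5,
                           subrtn_distr=subrtn_distr, subrtn_policy=subrtn_policy)
subrtn_distr.step(snapshot_mode="latest",
                  meta_info=dict(rollouts_real=rollouts_real))
loss_post = eval_ddp_policy(rollouts_real, env_sim, num_rollouts=5,
                            subrtn_distr=subrtn_distr, subrtn_policy=subrtn_policy)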
Example 3
        obs_dim_weight=[1, 1, 1, 1, 10, 10],
        num_rollouts_per_distr=len(dp_map) * 10,  # former 50
        num_workers=subrtn_hparam["num_workers"],
    )

    # Save the environments and the hyper-parameters
    save_dicts_to_yaml(
        dict(env=env_hparams),
        dict(subrtn=subrtn_hparam, subrtn_name=subrtn.name),
        dict(algo=algo_hparam,
             algo_name=SysIdViaEpisodicRL.name,
             dp_map=dp_map),
        save_dir=ex_dir,
    )

    algo = SysIdViaEpisodicRL(subrtn, behavior_policy, **algo_hparam)

    # Jeeeha
    while algo.curr_iter < algo.max_iter and not algo.stopping_criterion_met():
        algo.logger.add_value(algo.iteration_key, algo.curr_iter)

        # Create fake real-world data
        ro_real = []
        for _ in range(num_eval_rollouts):
            ro_real.append(rollout(env_real, behavior_policy, eval=True))

        algo.step(snapshot_mode="latest",
                  meta_info=dict(rollouts_real=ro_real))
        algo.logger.record_step()
        algo._curr_iter += 1
Example 4
        eta_mean=1.0,
        num_init_states_per_domain=1,
        expl_std_init=5e-2,
        expl_std_min=1e-4,
        num_workers=num_workers,
    )
    subsubrtn_distr = NES(ex_dir, env_sim, ddp_policy,
                          **subsubrtn_distr_hparam)
    subrtn_distr_hparam = dict(
        metric=None,
        obs_dim_weight=[1.0, 1.0, 1.0, 1.0, 2.0, 2.0],
        num_rollouts_per_distr=len(dp_map) * 10,
        num_workers=num_workers,
    )
    subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr,
                                      behavior_policy=behav_policy,
                                      **subrtn_distr_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=15,
        num_eval_rollouts=5,
        warmstart=True,
        thold_succ_subrtn=100,
        subrtn_snapshot_mode="latest",
    )
    algo = SimOpt(ex_dir, env_sim, env_real, subrtn_policy, subrtn_distr,
                  **algo_hparam)

    # Save the environments and the hyper-parameters
    save_dicts_to_yaml(
Example 5
def test_simopt_cem_ppo(ex_dir, env: SimEnv):
    pyrado.set_seed(0)

    # Environments
    env_real = deepcopy(env)
    env_real = ActNormWrapper(env_real)
    env_sim = ActNormWrapper(env)
    randomizer = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole",
                          mean=0.0,
                          std=1e6,
                          clip_lo=1e-3),
        NormalDomainParam(name="mass_pend_pole",
                          mean=0.0,
                          std=1e6,
                          clip_lo=1e-3),
        NormalDomainParam(name="length_rot_pole",
                          mean=0.0,
                          std=1e6,
                          clip_lo=1e-3),
        NormalDomainParam(name="length_pend_pole",
                          mean=0.0,
                          std=1e6,
                          clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ("mass_rot_pole", "mean"),
        1: ("mass_rot_pole", "std"),
        2: ("mass_pend_pole", "mean"),
        3: ("mass_pend_pole", "std"),
        4: ("length_rot_pole", "mean"),
        5: ("length_rot_pole", "std"),
        6: ("length_pend_pole", "mean"),
        7: ("length_pend_pole", "std"),
    }
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Subroutine for policy improvement
    behav_policy_hparam = dict(hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
    behav_policy = FNNPolicy(spec=env_sim.spec, **behav_policy_hparam)
    vfcn_hparam = dict(hidden_sizes=[16, 16], hidden_nonlin=to.relu)
    vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace),
                     **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.99,
        lamda=0.98,
        num_epoch=2,
        batch_size=128,
        standardize_adv=True,
        lr=8e-4,
        max_grad_norm=5.0,
    )
    critic = GAE(vfcn, **critic_hparam)
    subrtn_policy_hparam = dict(
        max_iter=2,
        eps_clip=0.13,
        min_steps=4 * env_sim.max_steps,
        num_epoch=3,
        batch_size=128,
        std_init=0.75,
        lr=3e-04,
        max_grad_norm=1.0,
        num_workers=1,
    )
    subrtn_policy = PPO(ex_dir, env_sim, behav_policy, critic,
                        **subrtn_policy_hparam)

    prior = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole", mean=0.095, std=0.095 / 10),
        NormalDomainParam(name="mass_pend_pole", mean=0.024, std=0.024 / 10),
        NormalDomainParam(name="length_rot_pole", mean=0.085, std=0.085 / 10),
        NormalDomainParam(name="length_pend_pole", mean=0.129, std=0.129 / 10),
    )
    ddp_policy_hparam = dict(mapping=dp_map,
                             trafo_mask=trafo_mask,
                             scale_params=True)
    ddp_policy = DomainDistrParamPolicy(prior=prior, **ddp_policy_hparam)
    subsubrtn_distr_hparam = dict(
        max_iter=2,
        pop_size=10,
        num_init_states_per_domain=1,
        num_is_samples=8,
        expl_std_init=1e-2,
        expl_std_min=1e-5,
        extra_expl_std_init=1e-2,
        extra_expl_decay_iter=5,
        num_workers=1,
    )
    subsubrtn_distr = CEM(ex_dir, env_sim, ddp_policy,
                          **subsubrtn_distr_hparam)
    subrtn_distr_hparam = dict(
        metric=None,
        obs_dim_weight=[1, 1, 1, 1, 10, 10],
        num_rollouts_per_distr=3,
        num_workers=1,
    )
    subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr,
                                      behavior_policy=behav_policy,
                                      **subrtn_distr_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=1,
        num_eval_rollouts=5,
        warmstart=True,
    )
    algo = SimOpt(ex_dir, env_sim, env_real, subrtn_policy, subrtn_distr,
                  **algo_hparam)
    algo.train()

    assert algo.curr_iter == algo.max_iter
Example 6
def test_sysidasrl_reps(ex_dir, env: SimEnv, num_eval_rollouts: int):
    pyrado.set_seed(0)

    def eval_ddp_policy(rollouts_real):
        init_states_real = np.array([ro.states[0, :] for ro in rollouts_real])
        rollouts_sim = []
        for i, _ in enumerate(range(num_eval_rollouts)):
            rollouts_sim.append(
                rollout(env_sim,
                        behavior_policy,
                        eval=True,
                        reset_kwargs=dict(init_state=init_states_real[i, :])))

        # Truncate the rollouts, yielding two lists of pairwise equally long rollouts
        ros_real_tr, ros_sim_tr = algo.truncate_rollouts(rollouts_real,
                                                         rollouts_sim,
                                                         replicate=False)
        assert len(ros_real_tr) == len(ros_sim_tr)
        assert all([
            np.allclose(r.states[0, :], s.states[0, :])
            for r, s in zip(ros_real_tr, ros_sim_tr)
        ])

        # Return the average loss
        losses = [
            algo.loss_fcn(ro_r, ro_s)
            for ro_r, ro_s in zip(ros_real_tr, ros_sim_tr)
        ]
        return float(np.mean(np.asarray(losses)))

    # Environments
    env_real = deepcopy(env)
    env_real.domain_param = dict(ang_offset=-2 * np.pi / 180)

    env_sim = deepcopy(env)
    randomizer = DomainRandomizer(
        UniformDomainParam(name="ang_offset", mean=0, halfspan=1e-6), )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {0: ("ang_offset", "mean"), 1: ("ang_offset", "halfspan")}
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    assert env_real is not env_sim

    # Policies (the behavioral policy needs to be deterministic)
    behavior_policy = LinearPolicy(env_sim.spec,
                                   feats=FeatureStack(identity_feat))
    prior = DomainRandomizer(
        UniformDomainParam(name="ang_offset",
                           mean=1 * np.pi / 180,
                           halfspan=1 * np.pi / 180), )
    ddp_policy = DomainDistrParamPolicy(mapping=dp_map,
                                        trafo_mask=[False, True],
                                        prior=prior)

    # Subroutine
    subrtn_hparam = dict(
        max_iter=2,
        eps=1.0,
        pop_size=100,
        num_init_states_per_domain=1,
        expl_std_init=5e-2,
        expl_std_min=1e-4,
        num_workers=1,
    )
    subrtn = REPS(ex_dir, env_sim, ddp_policy, **subrtn_hparam)

    algo_hparam = dict(metric=None,
                       obs_dim_weight=np.ones(env_sim.obs_space.shape),
                       num_rollouts_per_distr=5,
                       num_workers=1)
    algo = SysIdViaEpisodicRL(subrtn, behavior_policy, **algo_hparam)

    rollouts_real_tst = []
    for _ in range(num_eval_rollouts):
        rollouts_real_tst.append(rollout(env_real, behavior_policy, eval=True))
    loss_pre = eval_ddp_policy(rollouts_real_tst)

    # Mimic training
    while algo.curr_iter < algo.max_iter and not algo.stopping_criterion_met():
        algo.logger.add_value(algo.iteration_key, algo.curr_iter)

        # Create fake real-world data
        rollouts_real = []
        for _ in range(num_eval_rollouts):
            rollouts_real.append(rollout(env_real, behavior_policy, eval=True))

        algo.step(snapshot_mode="latest",
                  meta_info=dict(rollouts_real=rollouts_real))

        algo.logger.record_step()
        algo._curr_iter += 1

    loss_post = eval_ddp_policy(rollouts_real_tst)
    assert loss_post <= loss_pre  # the loss does not have to improve at every single step
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.
    
    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environments
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env_real = QQubeSwingUpSim(**env_hparams)
    env_real.domain_param = dict(
        Mr=0.095 * 0.9,  # 0.095*0.9 = 0.0855
        Mp=0.024 * 1.1,  # 0.024*1.1 = 0.0264
        Lr=0.085 * 0.9,  # 0.085*0.9 = 0.0765
        Lp=0.129 * 1.1,  # 0.129*1.1 = 0.1419
    )

    env_sim = QQubeSwingUpSim(**env_hparams)
    randomizer = DomainRandomizer(
        NormalDomainParam(name='Mr', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Mp', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Lr', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Lp', mean=0., std=1e6, clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ('Mr', 'mean'),
        1: ('Mr', 'std'),
        2: ('Mp', 'mean'),
        3: ('Mp', 'std'),
        4: ('Lr', 'mean'),
        5: ('Lr', 'std'),
        6: ('Lp', 'mean'),
        7: ('Lp', 'std')
    }
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Subroutine for policy improvement
    behav_policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    behav_policy = FNNPolicy(spec=env_sim.spec, **behav_policy_hparam)
    vfcn_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace),
                     **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9885,
        lamda=0.9648,
        num_epoch=2,
        batch_size=500,
        standardize_adv=False,
        lr=5.792e-4,
        max_grad_norm=1.,
    )
    critic = GAE(vfcn, **critic_hparam)
    subrtn_policy_hparam = dict(
        max_iter=200,
        min_steps=3 * 23 * env_sim.max_steps,
        num_epoch=7,
        eps_clip=0.0744,
        batch_size=500,
        std_init=0.9074,
        lr=3.446e-04,
        max_grad_norm=1.,
        num_workers=1,
    )
    subrtn_policy = PPO(study_dir, env_sim, behav_policy, critic,
                        **subrtn_policy_hparam)

    # Subroutine for system identification
    prior_std_denom = trial.suggest_uniform('prior_std_denom', 5, 20)
    prior = DomainRandomizer(
        NormalDomainParam(name='Mr', mean=0.095, std=0.095 / prior_std_denom),
        NormalDomainParam(name='Mp', mean=0.024, std=0.024 / prior_std_denom),
        NormalDomainParam(name='Lr', mean=0.085, std=0.085 / prior_std_denom),
        NormalDomainParam(name='Lp', mean=0.129, std=0.129 / prior_std_denom),
    )
    ddp_policy = DomainDistrParamPolicy(
        mapping=dp_map,
        trafo_mask=trafo_mask,
        prior=prior,
        scale_params=trial.suggest_categorical('ddp_policy_scale_params',
                                               [True, False]),
    )
    subsubrtn_distr_hparam = dict(
        max_iter=trial.suggest_categorical('subsubrtn_distr_max_iter', [20]),
        pop_size=trial.suggest_int('pop_size', 50, 500),
        num_rollouts=1,
        num_is_samples=trial.suggest_int('num_is_samples', 5, 20),
        expl_std_init=trial.suggest_loguniform('expl_std_init', 1e-3, 1e-1),
        expl_std_min=trial.suggest_categorical('expl_std_min', [1e-4]),
        # Use a distinct Optuna name so this is sampled independently of 'expl_std_init'
        extra_expl_std_init=trial.suggest_loguniform('extra_expl_std_init',
                                                     1e-3, 1e-1),
        extra_expl_decay_iter=trial.suggest_int('extra_expl_decay_iter', 0,
                                                10),
        num_workers=1,
    )
    csv_logger = create_csv_step_logger(
        osp.join(study_dir, f'trial_{trial.number}'))
    subsubrtn_distr = CEM(study_dir,
                          env_sim,
                          ddp_policy,
                          **subsubrtn_distr_hparam,
                          logger=csv_logger)
    obs_vel_weight = trial.suggest_loguniform('obs_vel_weight', 1, 100)
    subrtn_distr_hparam = dict(
        metric=None,
        obs_dim_weight=[1, 1, 1, 1, obs_vel_weight, obs_vel_weight],
        num_rollouts_per_distr=trial.suggest_int('num_rollouts_per_distr', 20,
                                                 100),
        num_workers=1,
    )
    subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr, behav_policy,
                                      **subrtn_distr_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=trial.suggest_categorical('algo_max_iter', [10]),
        num_eval_rollouts=trial.suggest_categorical('algo_num_eval_rollouts',
                                                    [5]),
        warmstart=trial.suggest_categorical('algo_warmstart', [True]),
        thold_succ_subrtn=trial.suggest_categorical('algo_thold_succ_subrtn',
                                                    [50]),
        subrtn_snapshot_mode='latest',
    )
    algo = SimOpt(study_dir,
                  env_sim,
                  env_real,
                  subrtn_policy,
                  subrtn_distr,
                  **algo_hparam,
                  logger=csv_logger)

    # Jeeeha
    algo.train(seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env_real, algo.policy, num_workers=1,
        min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
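
To launch the study, a minimal sketch following the docstring's note on `functools.partial` could look as follows (directory name, trial budget, and seed are placeholders):

import functools
import optuna

if __name__ == "__main__":
    study_dir = "hyperparams/simopt"  # placeholder output directory
    # Optuna only passes the trial object, so the remaining arguments are bound via functools.partial
    study = optuna.create_study(direction="maximize")
    study.optimize(functools.partial(train_and_eval, study_dir=study_dir, seed=0),
                   n_trials=50, n_jobs=1)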