Example #1
    # Policy
    policy_hparam = dict(
        # feats=FeatureStack(RBFFeat(num_feat_per_dim=20, bounds=env.obs_space.bounds, scale=0.8)),
        feats=FeatureStack(identity_feat, sin_feat))
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=500,  # maximum number of REPS iterations
        eps=0.2,  # bound on the KL divergence between successive policy parameter distributions
        pop_size=10 * policy.num_param,  # number of policy parameter samples per iteration
        num_init_states_per_domain=10,  # rollouts (initial states) per parameter sample
        expl_std_init=0.2,  # initial std of the exploration noise in parameter space
        expl_std_min=0.02,  # lower bound on the exploration std
        num_epoch_dual=1000,  # number of iterations for optimizing the dual function
        optim_mode="scipy",  # optimize the dual with scipy instead of torch
        lr_dual=1e-3,  # learning rate of the dual optimizer (torch mode)
        use_map=True,  # maximum a-posteriori instead of maximum likelihood parameter update
        num_workers=8,  # parallel workers for sampling rollouts
    )
    algo = REPS(ex_dir, env, policy, **algo_hparam)

    # Save the hyper-parameters
    save_dicts_to_yaml(
        dict(env=env_hparams, seed=args.seed),
        dict(policy=policy_hparam),
        dict(algo=algo_hparam, algo_name=algo.name),
        save_dir=ex_dir,
    )

    # Jeeeha
    algo.train(seed=args.seed)
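
The snippet above starts after the environment and the experiment directory have been created: env, env_hparams, ex_dir, and args are defined earlier in the script. A minimal sketch of such a preamble, assuming a pendulum simulation and the current Pyrado module layout (the environment choice and its hyper-parameters are assumptions, not part of the example):

import pyrado
from pyrado.algorithms.episodic.reps import REPS
from pyrado.environments.pysim.pendulum import PendulumSim
from pyrado.logger.experiment import save_dicts_to_yaml, setup_experiment
from pyrado.policies.features import FeatureStack, identity_feat, sin_feat
from pyrado.policies.feed_forward.linear import LinearPolicy
from pyrado.utils.argparser import get_argparser

if __name__ == "__main__":
    # Parse command line arguments (provides args.seed, among others)
    args = get_argparser().parse_args()

    # Experiment directory for logs, snapshots, and the YAML files saved below
    ex_dir = setup_experiment(PendulumSim.name, f"{REPS.name}_{LinearPolicy.name}")

    # Set the random seed for reproducibility
    pyrado.set_seed(args.seed)

    # Environment
    env_hparams = dict(dt=1 / 100.0, max_steps=800)
    env = PendulumSim(**env_hparams)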
Example #2
def test_sysidasrl_reps(ex_dir, env: SimEnv, num_eval_rollouts: int):
    pyrado.set_seed(0)

    def eval_ddp_policy(rollouts_real):
        init_states_real = np.array([ro.states[0, :] for ro in rollouts_real])
        rollouts_sim = []
        for i in range(num_eval_rollouts):
            rollouts_sim.append(
                rollout(env_sim,
                        behavior_policy,
                        eval=True,
                        reset_kwargs=dict(init_state=init_states_real[i, :])))

        # Truncate the rollouts, yielding two lists of pairwise equally long rollouts
        ros_real_tr, ros_sim_tr = algo.truncate_rollouts(rollouts_real,
                                                         rollouts_sim,
                                                         replicate=False)
        assert len(ros_real_tr) == len(ros_sim_tr)
        assert all([
            np.allclose(r.states[0, :], s.states[0, :])
            for r, s in zip(ros_real_tr, ros_sim_tr)
        ])

        # Return the average loss
        losses = [
            algo.loss_fcn(ro_r, ro_s)
            for ro_r, ro_s in zip(ros_real_tr, ros_sim_tr)
        ]
        return float(np.mean(np.asarray(losses)))

    # Environments
    env_real = deepcopy(env)
    env_real.domain_param = dict(ang_offset=-2 * np.pi / 180)

    env_sim = deepcopy(env)
    randomizer = DomainRandomizer(
        UniformDomainParam(name="ang_offset", mean=0, halfspan=1e-6), )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {0: ("ang_offset", "mean"), 1: ("ang_offset", "halfspan")}
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    assert env_real is not env_sim

    # Policies (the behavioral policy needs to be deterministic)
    behavior_policy = LinearPolicy(env_sim.spec,
                                   feats=FeatureStack(identity_feat))
    prior = DomainRandomizer(
        UniformDomainParam(name="ang_offset",
                           mean=1 * np.pi / 180,
                           halfspan=1 * np.pi / 180), )
    ddp_policy = DomainDistrParamPolicy(mapping=dp_map,
                                        trafo_mask=[False, True],
                                        prior=prior)

    # Subroutine
    subrtn_hparam = dict(
        max_iter=2,
        eps=1.0,
        pop_size=100,
        num_init_states_per_domain=1,
        expl_std_init=5e-2,
        expl_std_min=1e-4,
        num_workers=1,
    )
    subrtn = REPS(ex_dir, env_sim, ddp_policy, **subrtn_hparam)

    algo_hparam = dict(metric=None,
                       obs_dim_weight=np.ones(env_sim.obs_space.shape),
                       num_rollouts_per_distr=5,
                       num_workers=1)
    algo = SysIdViaEpisodicRL(subrtn, behavior_policy, **algo_hparam)

    rollouts_real_tst = []
    for _ in range(num_eval_rollouts):
        rollouts_real_tst.append(rollout(env_real, behavior_policy, eval=True))
    loss_pre = eval_ddp_policy(rollouts_real_tst)

    # Mimic training
    while algo.curr_iter < algo.max_iter and not algo.stopping_criterion_met():
        algo.logger.add_value(algo.iteration_key, algo.curr_iter)

        # Create fake real-world data
        rollouts_real = []
        for _ in range(num_eval_rollouts):
            rollouts_real.append(rollout(env_real, behavior_policy, eval=True))

        algo.step(snapshot_mode="latest",
                  meta_info=dict(rollouts_real=rollouts_real))

        algo.logger.record_step()
        algo._curr_iter += 1

    loss_post = eval_ddp_policy(rollouts_real_tst)
    assert loss_post <= loss_pre  # the loss does not have to improve in every single step
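
Example #2 is a pytest test: ex_dir and env arrive as fixtures and num_eval_rollouts as a test parameter. A hypothetical conftest.py that would make the test collectable (the concrete environment is an assumption; any SimEnv exposing an "ang_offset" domain parameter would do):

import pytest

from pyrado.environments.pysim.ball_on_beam import BallOnBeamSim


@pytest.fixture
def ex_dir(tmp_path):
    # Use pytest's temporary directory as the experiment directory
    return str(tmp_path)


@pytest.fixture
def env():
    # The chosen simulation must expose an "ang_offset" domain parameter
    return BallOnBeamSim(dt=0.02, max_steps=500)


@pytest.fixture(params=[5])
def num_eval_rollouts(request):
    return request.param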
Example #3
                             scale_params=True)
    ddp_policy = DomainDistrParamPolicy(prior=prior, **ddp_policy_hparam)
    subsubrtn_distr_hparam = dict(
        max_iter=5,
        eps=1.0,
        pop_size=500,
        num_rollouts=1,
        expl_std_init=5e-2,
        expl_std_min=1e-5,
        num_epoch_dual=1000,
        optim_mode='torch',
        lr_dual=5e-4,
        use_map=True,
        num_workers=num_workers,
    )
    subsubrtn_distr = REPS(ex_dir, env_sim, ddp_policy,
                           **subsubrtn_distr_hparam)
    subrtn_distr_hparam = dict(
        metric=None,
        obs_dim_weight=[1, 1, 1, 1, 10, 10],
        num_rollouts_per_distr=len(dp_map) * 10,
        num_workers=num_workers,
    )
    subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr,
                                      behavior_policy=behav_policy,
                                      **subrtn_distr_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=15,
        num_eval_rollouts=5,
        warmstart=True,
Example #4
    # Policy
    policy_hparam = dict(
        # feats=FeatureStack([RandFourierFeat(env.obs_space.flat_dim, num_feat=20, bandwidth=env.obs_space.bound_up)])
        feats=FeatureStack([const_feat, identity_feat, sign_feat, abs_feat, squared_feat, cubic_feat, ATan2Feat(1, 2),
                            MultFeat([3, 4])])
    )
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=500,
        eps=1.0,
        pop_size=20*policy.num_param,
        num_rollouts=4,
        expl_std_init=0.2,
        expl_std_min=0.02,
        use_map=True,
        optim_mode='scipy',
        num_workers=12,
    )
    algo = REPS(ex_dir, env, policy, **algo_hparam)

    # Save the hyper-parameters
    save_list_of_dicts_to_yaml([
        dict(env=env_hparams, seed=args.seed),
        dict(policy=policy_hparam),
        dict(algo=algo_hparam, algo_name=algo.name)],
        ex_dir
    )

    # Jeeeha
    algo.train(snapshot_mode='best', seed=args.seed)
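
After training, the resulting linear policy can be sanity-checked with a single simulated rollout. A small sketch, assuming the script above has finished and env and policy are still in scope:

from pyrado.sampling.rollout import rollout

# One deterministic evaluation rollout in the (nominal) simulation
ro = rollout(env, policy, eval=True)
print(f"Undiscounted return: {ro.undiscounted_return()}")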