Example #1
def test_adr(env, ex_dir, subrtn_hparam, actor_hparam, value_fcn_hparam,
             critic_hparam, adr_hparam):
    # Create the subroutine for the meta-algorithm
    actor = FNNPolicy(spec=env.spec, **actor_hparam)
    value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                          **value_fcn_hparam)
    critic = GAE(value_fcn, **critic_hparam)
    subroutine = PPO(ex_dir, env, actor, critic, **subrtn_hparam)

    # Create algorithm and train
    particle_hparam = dict(actor=actor_hparam,
                           value_fcn=value_fcn_hparam,
                           critic=critic_hparam)
    algo = ADR(ex_dir,
               env,
               subroutine,
               svpg_particle_hparam=particle_hparam,
               **adr_hparam)
    algo.train()
    assert algo.curr_iter == algo.max_iter
Example #2
def test_cuda_sampling_w_dr(default_bob, bob_pert):
    # Add randomizer
    env = DomainRandWrapperLive(default_bob, bob_pert)

    # Use a simple policy
    policy = FNNPolicy(env.spec, hidden_sizes=[8], hidden_nonlin=to.tanh, use_cuda=True)

    # Create the sampler
    sampler = ParallelSampler(env, policy, num_envs=2, min_rollouts=10)

    samples = sampler.sample()
    assert samples is not None
Example #3
def test_actor_critic(env, linear_policy, ex_dir, algo, algo_hparam,
                      value_fcn_type, use_cuda):
    # Create value function
    if value_fcn_type == 'fnn-plain':
        value_fcn = FNN(input_size=env.obs_space.flat_dim,
                        output_size=1,
                        hidden_sizes=[16, 16],
                        hidden_nonlin=to.tanh,
                        use_cuda=use_cuda)
    else:
        vf_spec = EnvSpec(env.obs_space, ValueFunctionSpace)
        if value_fcn_type == 'fnn':
            value_fcn = FNNPolicy(vf_spec,
                                  hidden_sizes=[16, 16],
                                  hidden_nonlin=to.tanh,
                                  use_cuda=use_cuda)
        else:
            value_fcn = RNNPolicy(vf_spec,
                                  hidden_size=16,
                                  num_recurrent_layers=1,
                                  use_cuda=use_cuda)

    # Create critic
    critic_hparam = dict(
        gamma=0.98,
        lamda=0.95,
        batch_size=32,
        lr=1e-3,
        standardize_adv=False,
    )
    critic = GAE(value_fcn, **critic_hparam)

    # Common hyper-parameters
    common_hparam = dict(max_iter=3, min_rollouts=3, num_sampler_envs=1)
    # Add specific hyper parameters if any
    common_hparam.update(algo_hparam)

    # Create algorithm and train
    algo = algo(ex_dir, env, linear_policy, critic, **common_hparam)
    algo.train()
    assert algo.curr_iter == algo.max_iter
Example #4
def test_param_expl_sampler(default_bob, bob_pert):
    # Add randomizer
    env = DomainRandWrapperLive(default_bob, bob_pert)

    # Use a simple policy
    policy = FNNPolicy(env.spec, hidden_sizes=[8], hidden_nonlin=to.tanh)

    # Create the sampler
    num_rollouts_per_param = 12
    sampler = ParameterExplorationSampler(
        env,
        policy,
        num_envs=1,
        num_rollouts_per_param=num_rollouts_per_param,
    )

    # Use some random parameters
    num_ps = 12
    params = to.rand(num_ps, policy.num_param)

    # Do the sampling
    samples = sampler.sample(params)

    assert num_ps == len(samples)
    for ps in samples:
        assert len(ps.rollouts) == num_rollouts_per_param

    # Compare rollouts that should be matching
    for ri in range(num_rollouts_per_param):
        # Use the first paramset as pivot
        piter = iter(samples)
        pivot = next(piter).rollouts[ri]
        # Iterate through others
        for ops in piter:
            ro = ops.rollouts[ri]

            # Compare domain params
            assert pivot.rollout_info['domain_param'] == ro.rollout_info['domain_param']
            # Compare first observation a.k.a. init state
            assert pivot[0].observation == pytest.approx(ro[0].observation)
Example #5
def test_adr_reward_generator(env):
    reference_env = env
    random_env = deepcopy(env)
    reward_generator = RewardGenerator(
        env_spec=random_env.spec,
        batch_size=100,
        reward_multiplier=1,
        logger=None
    )
    policy = FNNPolicy(reference_env.spec, hidden_sizes=[32], hidden_nonlin=to.tanh)
    dr = get_default_randomizer_omo()
    dr.randomize(num_samples=1)
    random_env.domain_param = dr.get_params(format='dict', dtype='numpy')
    reference_sampler = ParallelSampler(reference_env, policy, num_envs=4, min_steps=10000)
    random_sampler = ParallelSampler(random_env, policy, num_envs=4, min_steps=10000)

    losses = []
    for i in range(50):
        reference_traj = StepSequence.concat(reference_sampler.sample())
        random_traj = StepSequence.concat(random_sampler.sample())
        losses.append(reward_generator.train(reference_traj, random_traj, 10))
    assert losses[-1] < losses[0]
Example #6
def test_spota_ppo(env, spota_hparam, ex_dir):
    # Environment and domain randomization
    randomizer = get_default_randomizer(env)
    env = DomainRandWrapperBuffer(env, randomizer)

    # Policy and subroutines
    policy = FNNPolicy(env.spec, [16, 16], hidden_nonlin=to.tanh)
    value_fcn = FNN(input_size=env.obs_space.flat_dim,
                    output_size=1,
                    hidden_sizes=[16, 16],
                    hidden_nonlin=to.tanh)
    critic_hparam = dict(gamma=0.998,
                         lamda=0.95,
                         num_epoch=3,
                         batch_size=64,
                         lr=1e-3)
    critic_cand = GAE(value_fcn, **critic_hparam)
    critic_refs = GAE(deepcopy(value_fcn), **critic_hparam)

    subrtn_hparam_cand = dict(
        # min_rollouts=0,  # will be overwritten by SPOTA
        min_steps=0,  # will be overwritten by SPOTA
        max_iter=2,
        num_epoch=3,
        eps_clip=0.1,
        batch_size=64,
        num_sampler_envs=4,
        std_init=0.5,
        lr=1e-2)

    sr_cand = PPO(ex_dir, env, policy, critic_cand, **subrtn_hparam_cand)
    sr_refs = PPO(ex_dir, env, deepcopy(policy), critic_refs,
                  **subrtn_hparam_cand)

    # Create algorithm and train
    algo = SPOTA(ex_dir, env, sr_cand, sr_refs, **spota_hparam)
    algo.train()
Example #7
def test_snapshots_notmeta(ex_dir, env, policy, algo_class, algo_hparam):
    # Collect hyper-parameters, create algorithm, and train
    common_hparam = dict(max_iter=1, num_sampler_envs=1)
    common_hparam.update(algo_hparam)

    if issubclass(algo_class, ActorCritic):
        common_hparam.update(
            min_rollouts=3,
            critic=GAE(value_fcn=FNNPolicy(spec=EnvSpec(
                env.obs_space, ValueFunctionSpace),
                                           hidden_sizes=[16, 16],
                                           hidden_nonlin=to.tanh)))
    elif issubclass(algo_class, ParameterExploring):
        common_hparam.update(num_rollouts=1)
    else:
        raise NotImplementedError

    # Train
    algo = algo_class(ex_dir, env, policy, **common_hparam)
    algo.train()
    if isinstance(algo, ActorCritic):
        policy_posttrn_param_values = algo.policy.param_values
        critic_posttrn_value_fcn_param_values = algo.critic.value_fcn.param_values
    elif isinstance(algo, ParameterExploring):
        policy_posttrn_param_values = algo.best_policy_param

    # Save and load
    algo.save_snapshot(meta_info=None)
    algo.load_snapshot(load_dir=ex_dir, meta_info=None)
    policy_loaded = deepcopy(algo.policy)

    # Check
    assert all(policy_posttrn_param_values == policy_loaded.param_values)
    if algo_class in [A2C, PPO, PPO2]:
        critic_loaded = deepcopy(algo.critic)
        assert all(critic_posttrn_value_fcn_param_values ==
                   critic_loaded.value_fcn.param_values)
Example #8
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: Optional[int]):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env = QBallBalancerSim(dt=1/250., max_steps=1500)
    env = ActNormWrapper(env)

    # Policy
    policy = FNNPolicy(
        spec=env.spec,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_policy', [[16, 16], [32, 32], [64, 64]]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_policy', ['to_tanh', 'to_relu'])),
    )

    # Critic
    value_fcn = FNN(
        input_size=env.obs_space.flat_dim,
        output_size=1,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_critic', [[16, 16], [32, 32], [64, 64]]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])),
    )
    critic_hparam = dict(
        gamma=trial.suggest_uniform('gamma_critic', 0.99, 1.),
        lamda=trial.suggest_uniform('lamda_critic', 0.95, 1.),
        num_epoch=trial.suggest_int('num_epoch_critic', 1, 10),
        batch_size=100,
        lr=trial.suggest_loguniform('lr_critic', 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical('standardize_adv_critic', [True, False]),
        # max_grad_norm=5.,
        # lr_scheduler=scheduler.StepLR,
        # lr_scheduler_hparam=dict(step_size=10, gamma=0.9)
        # lr_scheduler=scheduler.ExponentialLR,
        # lr_scheduler_hparam=dict(gamma=0.99)
    )
    critic = GAE(value_fcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_sampler_envs=1,  # parallelize via optuna n_jobs
        max_iter=500,
        min_steps=25*env.max_steps,
        num_epoch=trial.suggest_int('num_epoch_algo', 1, 10),
        eps_clip=trial.suggest_uniform('eps_clip_algo', 0.05, 0.2),
        batch_size=100,
        std_init=0.9,
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
        # max_grad_norm=5.,
        # lr_scheduler=scheduler.StepLR,
        # lr_scheduler_hparam=dict(step_size=10, gamma=0.9)
        # lr_scheduler=scheduler.ExponentialLR,
        # lr_scheduler_hparam=dict(gamma=0.99)
    )
    algo = PPO(osp.join(ex_dir, f'trial_{trial.number}'), env, policy, critic, **algo_hparam)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelSampler(env, policy, num_envs=20, min_rollouts=min_rollouts)
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros])/min_rollouts

    return mean_ret
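
The docstring's note says that Optuna passes only the `trial` argument, so the extra arguments are bound with `functools.partial`. Below is a minimal sketch of wiring `train_and_eval` into a study; the experiment directory, seed, and study settings are illustrative assumptions, not values from the source.

import functools
import os.path as osp

import optuna

ex_dir = osp.join('experiments', 'qbb_ppo_study')  # hypothetical experiment directory
study = optuna.create_study(direction='maximize')  # the objective returns the mean undiscounted return
study.optimize(functools.partial(train_and_eval, ex_dir=ex_dir, seed=1001),
               n_trials=100, n_jobs=4)  # n_trials and n_jobs are placeholder values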
Example #9
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: Optional[int]):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env = QQubeSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        shared_hidden_sizes=trial.suggest_categorical(
            'shared_hidden_sizes_policy',
            [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]),
        shared_hidden_nonlin=fcn_from_str(
            trial.suggest_categorical('shared_hidden_nonlin_policy',
                                      ['to_tanh', 'to_relu'])),
    )
    policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam)

    # Critic
    q_fcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical(
            'hidden_sizes_critic',
            [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical('hidden_nonlin_critic',
                                      ['to_tanh', 'to_relu'])),
    )
    obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
    q_fcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace),
                        **q_fcn_hparam)
    q_fcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace),
                        **q_fcn_hparam)

    # Algorithm
    algo_hparam = dict(
        num_sampler_envs=1,  # parallelize via optuna n_jobs
        max_iter=100 * env.max_steps,
        min_steps=trial.suggest_categorical(
            'min_steps_algo', [1]),  # , 10, env.max_steps, 10*env.max_steps
        memory_size=trial.suggest_loguniform('memory_size_algo',
                                             1e2 * env.max_steps,
                                             1e4 * env.max_steps),
        tau=trial.suggest_uniform('tau_algo', 0.99, 1.),
        alpha_init=trial.suggest_uniform('alpha_init_algo', 0.1, 0.9),
        learn_alpha=trial.suggest_categorical('learn_alpha_algo',
                                              [True, False]),
        standardize_rew=trial.suggest_categorical('standardize_rew_algo',
                                                  [False]),
        gamma=trial.suggest_uniform('gamma_algo', 0.99, 1.),
        target_update_intvl=trial.suggest_categorical(
            'target_update_intvl_algo', [1, 5]),
        num_batch_updates=trial.suggest_categorical('num_batch_updates_algo',
                                                    [1, 5]),
        batch_size=trial.suggest_categorical('batch_size_algo',
                                             [128, 256, 512]),
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
    )
    csv_logger = create_csv_step_logger(
        osp.join(ex_dir, f'trial_{trial.number}'))
    algo = SAC(ex_dir,
               env,
               policy,
               q_fcn_1,
               q_fcn_2,
               **algo_hparam,
               logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelSampler(
        env, policy, num_envs=1,
        min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
Example #10
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 particle_hparam: dict,
                 max_iter: int,
                 num_particles: int,
                 temperature: float,
                 lr: float,
                 horizon: int,
                 std_init: float = 1.0,
                 min_rollouts: int = None,
                 min_steps: int = 10000,
                 num_sampler_envs: int = 4,
                 serial: bool = True,
                 logger: StepLogger = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param particle_hparam: hyper-parameters for particle template construction
        :param max_iter: number of iterations
        :param num_particles: number of distinct particles
        :param temperature: the temperature of the SVGD, which determines how jointly the training takes place
        :param lr: the learning rate for the update of the particles
        :param horizon: horizon for each particle
        :param std_init: initial standard deviation for the exploration
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_sampler_envs: number of environments for parallel sampling
        :param serial: serial mode can be switched off, which can be used to partly control the flow of SVPG from outside
        :param logger: logger for every step of the algorithm
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not isinstance(particle_hparam, dict):
            raise pyrado.TypeErr(given=particle_hparam, expected_type=dict)
        if not all([
                key in particle_hparam
                for key in ['actor', 'value_fcn', 'critic']
        ]):
            raise AttributeError

        # Call Algorithm's constructor
        super().__init__(save_dir, max_iter, policy=None, logger=logger)

        # Store the inputs
        self._env = env
        self.num_particles = num_particles
        self.horizon = horizon  # TODO @Robin: where is the horizon used?!
        self.lr = lr
        self.temperature = temperature
        self.serial = serial

        # Prepare placeholders for particles
        self.particles = [None] * num_particles
        self.expl_strats = [None] * num_particles
        self.optimizers = [None] * num_particles
        self.fixed_particles = [None] * num_particles
        self.fixed_expl_strats = [None] * num_particles
        self.samplers = [None] * num_particles
        self.count = 0
        self.updatecount = 0

        # Particle factory
        actor = FNNPolicy(spec=env.spec, **particle_hparam['actor'])
        value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                              **particle_hparam['value_fcn'])
        critic = GAE(value_fcn, **particle_hparam['critic'])
        particle = SVPGParticle(env.spec, actor, critic)

        for i in range(self.num_particles):
            self.particles[i] = deepcopy(particle)
            self.particles[i].init_param()
            self.expl_strats[i] = NormalActNoiseExplStrat(
                self.particles[i].actor, std_init)
            self.optimizers[i] = to.optim.Adam(
                self.expl_strats[i].parameters(), lr=self.lr)
            self.fixed_particles[i] = deepcopy(self.particles[i])
            self.fixed_expl_strats[i] = deepcopy(self.expl_strats[i])

            if self.serial:
                self.samplers[i] = ParallelSampler(env,
                                                   self.expl_strats[i],
                                                   num_sampler_envs,
                                                   min_rollouts=min_rollouts,
                                                   min_steps=min_steps)
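
The constructor above requires `particle_hparam` to contain the keys 'actor', 'value_fcn', and 'critic', which are forwarded to `FNNPolicy` and `GAE` when building the particle template. A minimal sketch of a call follows, assuming the constructor belongs to the SVPG algorithm class and that `ex_dir` and `env` already exist; all hyper-parameter values are illustrative.

particle_hparam = dict(
    actor=dict(hidden_sizes=[16, 16], hidden_nonlin=to.tanh),      # FNNPolicy kwargs
    value_fcn=dict(hidden_sizes=[16, 16], hidden_nonlin=to.tanh),  # FNNPolicy kwargs
    critic=dict(gamma=0.99, lamda=0.95, batch_size=64, lr=1e-3),   # GAE kwargs
)
algo = SVPG(save_dir=ex_dir,  # assumed class name; save_dir and env taken from the surrounding context
            env=env,
            particle_hparam=particle_hparam,
            max_iter=50,
            num_particles=4,
            temperature=1.,
            lr=1e-3,
            horizon=50)
algo.train()  # assuming the usual Algorithm.train() interface seen in the other examples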
Example #11
def fnn_policy(env):
    return FNNPolicy(env.spec, hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
Example #12
    # Environment
    env_hparams = dict()
    env = HopperSim(**env_hparams)
    env = ActNormWrapper(env)

    # # Simple Randomizer
    # dp_nom = HopperSim.get_nominal_domain_param()
    # randomizer = DomainRandomizer(
    #     NormalDomainParam(name='total_mass', mean=dp_nom['total_mass'], std=dp_nom['total_mass']/10, clip_lo=1e-3)
    # )
    # env = DomainRandWrapperLive(env, randomizer)

    # Policy
    policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    policy = FNNPolicy(spec=env.spec, **policy_hparam)
    # Critic
    value_fcn_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                          **value_fcn_hparam)
    critic_hparam = dict(
        gamma=0.995,
        lamda=0.95,
        num_epoch=10,
        batch_size=512,
        standardize_adv=False,
        standardizer=None,
        max_grad_norm=1.,
        lr=5e-4,
    )
    critic = GAE(value_fcn, **critic_hparam)
Example #13
    # Environment
    env_hparams = dict(physicsEngine='Bullet', dt=1 / 100., max_steps=500)
    env = BallOnPlate2DSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        shared_hidden_sizes=[32, 32],
        shared_hidden_nonlin=to.relu,
    )
    policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam)

    # Critic
    q_fcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.relu)
    obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
    q_fcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace),
                        **q_fcn_hparam)
    q_fcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace),
                        **q_fcn_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=1000 * env.max_steps,
        memory_size=1000 * env.max_steps,
        gamma=0.995,
        num_batch_updates=1,
        tau=0.99,
        alpha_init=0.2,
        learn_alpha=False,
        target_update_intvl=1,
        standardize_rew=False,
        min_steps=1,
Example #14
def cuda_fnnpol_bobspec(default_bob):
    return FNNPolicy(spec=default_bob.spec,
                     hidden_sizes=(32, 32),
                     hidden_nonlin=to.tanh,
                     use_cuda=True)
Example #15
def fnnpol_bobspec(default_bob):
    return FNNPolicy(spec=default_bob.spec,
                     hidden_sizes=(32, 32),
                     hidden_nonlin=to.tanh)