def test_adr(env, ex_dir, subrtn_hparam, actor_hparam, value_fcn_hparam, critic_hparam, adr_hparam):
    # Create the subroutine for the meta-algorithm
    actor = FNNPolicy(spec=env.spec, **actor_hparam)
    value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **value_fcn_hparam)
    critic = GAE(value_fcn, **critic_hparam)
    subroutine = PPO(ex_dir, env, actor, critic, **subrtn_hparam)

    # Create algorithm and train
    particle_hparam = dict(actor=actor_hparam, value_fcn=value_fcn_hparam, critic=critic_hparam)
    algo = ADR(ex_dir, env, subroutine, svpg_particle_hparam=particle_hparam, **adr_hparam)
    algo.train()
    assert algo.curr_iter == algo.max_iter
def test_cuda_sampling_w_dr(default_bob, bob_pert):
    # Add randomizer
    env = DomainRandWrapperLive(default_bob, bob_pert)

    # Use a simple policy
    policy = FNNPolicy(env.spec, hidden_sizes=[8], hidden_nonlin=to.tanh, use_cuda=True)

    # Create the sampler
    sampler = ParallelSampler(env, policy, num_envs=2, min_rollouts=10)

    samples = sampler.sample()
    assert samples is not None
def test_actor_critic(env, linear_policy, ex_dir, algo, algo_hparam, value_fcn_type, use_cuda):
    # Create value function
    if value_fcn_type == 'fnn-plain':
        value_fcn = FNN(input_size=env.obs_space.flat_dim,
                        output_size=1,
                        hidden_sizes=[16, 16],
                        hidden_nonlin=to.tanh,
                        use_cuda=use_cuda)
    else:
        vf_spec = EnvSpec(env.obs_space, ValueFunctionSpace)
        if value_fcn_type == 'fnn':
            value_fcn = FNNPolicy(vf_spec, hidden_sizes=[16, 16], hidden_nonlin=to.tanh, use_cuda=use_cuda)
        else:
            value_fcn = RNNPolicy(vf_spec, hidden_size=16, num_recurrent_layers=1, use_cuda=use_cuda)

    # Create critic
    critic_hparam = dict(
        gamma=0.98,
        lamda=0.95,
        batch_size=32,
        lr=1e-3,
        standardize_adv=False,
    )
    critic = GAE(value_fcn, **critic_hparam)

    # Common hyper-parameters
    common_hparam = dict(max_iter=3, min_rollouts=3, num_sampler_envs=1)
    # Add algorithm-specific hyper-parameters if any
    common_hparam.update(algo_hparam)

    # Create algorithm and train
    algo = algo(ex_dir, env, linear_policy, critic, **common_hparam)
    algo.train()
    assert algo.curr_iter == algo.max_iter
def test_param_expl_sampler(default_bob, bob_pert):
    # Add randomizer
    env = DomainRandWrapperLive(default_bob, bob_pert)

    # Use a simple policy
    policy = FNNPolicy(env.spec, hidden_sizes=[8], hidden_nonlin=to.tanh)

    # Create the sampler
    num_rollouts_per_param = 12
    sampler = ParameterExplorationSampler(
        env,
        policy,
        num_envs=1,
        num_rollouts_per_param=num_rollouts_per_param,
    )

    # Use some random parameters
    num_ps = 12
    params = to.rand(num_ps, policy.num_param)

    # Do the sampling
    samples = sampler.sample(params)
    assert num_ps == len(samples)
    for ps in samples:
        assert len(ps.rollouts) == num_rollouts_per_param

    # Compare rollouts that should be matching
    for ri in range(num_rollouts_per_param):
        # Use the first parameter set as pivot
        piter = iter(samples)
        pivot = next(piter).rollouts[ri]
        # Iterate through the others
        for ops in piter:
            ro = ops.rollouts[ri]
            # Compare domain params
            assert pivot.rollout_info['domain_param'] == ro.rollout_info['domain_param']
            # Compare first observation a.k.a. initial state
            assert pivot[0].observation == pytest.approx(ro[0].observation)
def test_adr_reward_generator(env):
    reference_env = env
    random_env = deepcopy(env)
    reward_generator = RewardGenerator(
        env_spec=random_env.spec,
        batch_size=100,
        reward_multiplier=1,
        logger=None
    )
    policy = FNNPolicy(reference_env.spec, hidden_sizes=[32], hidden_nonlin=to.tanh)
    dr = get_default_randomizer_omo()
    dr.randomize(num_samples=1)
    random_env.domain_param = dr.get_params(format='dict', dtype='numpy')
    reference_sampler = ParallelSampler(reference_env, policy, num_envs=4, min_steps=10000)
    random_sampler = ParallelSampler(random_env, policy, num_envs=4, min_steps=10000)

    losses = []
    for _ in range(50):
        reference_traj = StepSequence.concat(reference_sampler.sample())
        random_traj = StepSequence.concat(random_sampler.sample())
        losses.append(reward_generator.train(reference_traj, random_traj, 10))
    assert losses[-1] < losses[0]
def test_spota_ppo(env, spota_hparam, ex_dir):
    # Environment and domain randomization
    randomizer = get_default_randomizer(env)
    env = DomainRandWrapperBuffer(env, randomizer)

    # Policy and subroutines
    policy = FNNPolicy(env.spec, [16, 16], hidden_nonlin=to.tanh)
    value_fcn = FNN(input_size=env.obs_space.flat_dim,
                    output_size=1,
                    hidden_sizes=[16, 16],
                    hidden_nonlin=to.tanh)
    critic_hparam = dict(gamma=0.998, lamda=0.95, num_epoch=3, batch_size=64, lr=1e-3)
    critic_cand = GAE(value_fcn, **critic_hparam)
    critic_refs = GAE(deepcopy(value_fcn), **critic_hparam)

    subrtn_hparam_cand = dict(
        # min_rollouts=0,  # will be overwritten by SPOTA
        min_steps=0,  # will be overwritten by SPOTA
        max_iter=2,
        num_epoch=3,
        eps_clip=0.1,
        batch_size=64,
        num_sampler_envs=4,
        std_init=0.5,
        lr=1e-2
    )

    sr_cand = PPO(ex_dir, env, policy, critic_cand, **subrtn_hparam_cand)
    sr_refs = PPO(ex_dir, env, deepcopy(policy), critic_refs, **subrtn_hparam_cand)

    # Create algorithm and train
    algo = SPOTA(ex_dir, env, sr_cand, sr_refs, **spota_hparam)
    algo.train()
def test_snapshots_notmeta(ex_dir, env, policy, algo_class, algo_hparam):
    # Collect hyper-parameters, create algorithm, and train
    common_hparam = dict(max_iter=1, num_sampler_envs=1)
    common_hparam.update(algo_hparam)
    if issubclass(algo_class, ActorCritic):
        common_hparam.update(
            min_rollouts=3,
            critic=GAE(value_fcn=FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                                           hidden_sizes=[16, 16],
                                           hidden_nonlin=to.tanh))
        )
    elif issubclass(algo_class, ParameterExploring):
        common_hparam.update(num_rollouts=1)
    else:
        raise NotImplementedError

    # Train
    algo = algo_class(ex_dir, env, policy, **common_hparam)
    algo.train()

    if isinstance(algo, ActorCritic):
        policy_posttrn_param_values = algo.policy.param_values
        critic_posttrn_value_fcn_param_values = algo.critic.value_fcn.param_values
    elif isinstance(algo, ParameterExploring):
        policy_posttrn_param_values = algo.best_policy_param

    # Save and load
    algo.save_snapshot(meta_info=None)
    algo.load_snapshot(load_dir=ex_dir, meta_info=None)
    policy_loaded = deepcopy(algo.policy)

    # Check
    assert all(policy_posttrn_param_values == policy_loaded.param_values)
    if algo_class in [A2C, PPO, PPO2]:
        critic_loaded = deepcopy(algo.critic)
        assert all(critic_posttrn_value_fcn_param_values == critic_loaded.value_fcn.param_values)
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: [int, None]):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env = QBallBalancerSim(dt=1/250., max_steps=1500)
    env = ActNormWrapper(env)

    # Policy
    policy = FNNPolicy(
        spec=env.spec,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_policy', [[16, 16], [32, 32], [64, 64]]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_policy', ['to_tanh', 'to_relu'])),
    )

    # Critic
    value_fcn = FNN(
        input_size=env.obs_space.flat_dim,
        output_size=1,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_critic', [[16, 16], [32, 32], [64, 64]]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])),
    )
    critic_hparam = dict(
        gamma=trial.suggest_uniform('gamma_critic', 0.99, 1.),
        lamda=trial.suggest_uniform('lamda_critic', 0.95, 1.),
        num_epoch=trial.suggest_int('num_epoch_critic', 1, 10),
        batch_size=100,
        lr=trial.suggest_loguniform('lr_critic', 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical('standardize_adv_critic', [True, False]),
        # max_grad_norm=5.,
        # lr_scheduler=scheduler.StepLR,
        # lr_scheduler_hparam=dict(step_size=10, gamma=0.9)
        # lr_scheduler=scheduler.ExponentialLR,
        # lr_scheduler_hparam=dict(gamma=0.99)
    )
    critic = GAE(value_fcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_sampler_envs=1,  # parallelize via optuna n_jobs
        max_iter=500,
        min_steps=25*env.max_steps,
        num_epoch=trial.suggest_int('num_epoch_algo', 1, 10),
        eps_clip=trial.suggest_uniform('eps_clip_algo', 0.05, 0.2),
        batch_size=100,
        std_init=0.9,
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
        # max_grad_norm=5.,
        # lr_scheduler=scheduler.StepLR,
        # lr_scheduler_hparam=dict(step_size=10, gamma=0.9)
        # lr_scheduler=scheduler.ExponentialLR,
        # lr_scheduler_hparam=dict(gamma=0.99)
    )
    algo = PPO(osp.join(ex_dir, f'trial_{trial.number}'), env, policy, critic, **algo_hparam)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelSampler(env, policy, num_envs=20, min_rollouts=min_rollouts)
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros])/min_rollouts

    return mean_ret
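# The docstring above notes that Optuna calls the objective with only the `trial` argument, so the extra
# arguments are bound beforehand with `functools.partial`. Below is a minimal sketch of that wiring; the
# study name, experiment directory, trial count, and n_jobs value are illustrative assumptions, not taken
# from this script.
import functools
import os.path as osp

import optuna

if __name__ == '__main__':
    ex_dir = osp.join('data', 'hparam_search', 'qbb_ppo')  # hypothetical parent directory for all trials
    study = optuna.create_study(study_name='qbb_ppo_hparam_search', direction='maximize')
    # Each worker runs one trial at a time; the objective itself samples with num_sampler_envs=1
    study.optimize(functools.partial(train_and_eval, ex_dir=ex_dir, seed=1001), n_trials=100, n_jobs=16)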
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: [int, None]):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env = QQubeSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        shared_hidden_sizes=trial.suggest_categorical(
            'shared_hidden_sizes_policy', [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]),
        shared_hidden_nonlin=fcn_from_str(
            trial.suggest_categorical('shared_hidden_nonlin_policy', ['to_tanh', 'to_relu'])),
    )
    policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam)

    # Critic
    q_fcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical(
            'hidden_sizes_critic', [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])),
    )
    obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
    q_fcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **q_fcn_hparam)
    q_fcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **q_fcn_hparam)

    # Algorithm
    algo_hparam = dict(
        num_sampler_envs=1,  # parallelize via optuna n_jobs
        max_iter=100 * env.max_steps,
        min_steps=trial.suggest_categorical('min_steps_algo', [1]),  # , 10, env.max_steps, 10*env.max_steps
        memory_size=trial.suggest_loguniform('memory_size_algo', 1e2 * env.max_steps, 1e4 * env.max_steps),
        tau=trial.suggest_uniform('tau_algo', 0.99, 1.),
        alpha_init=trial.suggest_uniform('alpha_init_algo', 0.1, 0.9),
        learn_alpha=trial.suggest_categorical('learn_alpha_algo', [True, False]),
        standardize_rew=trial.suggest_categorical('standardize_rew_algo', [False]),
        gamma=trial.suggest_uniform('gamma_algo', 0.99, 1.),
        target_update_intvl=trial.suggest_categorical('target_update_intvl_algo', [1, 5]),
        num_batch_updates=trial.suggest_categorical('num_batch_updates_algo', [1, 5]),
        batch_size=trial.suggest_categorical('batch_size_algo', [128, 256, 512]),
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
    )
    csv_logger = create_csv_step_logger(osp.join(ex_dir, f'trial_{trial.number}'))
    algo = SAC(ex_dir, env, policy, q_fcn_1, q_fcn_2, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelSampler(env, policy, num_envs=1, min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
def __init__(self,
             save_dir: str,
             env: Env,
             particle_hparam: dict,
             max_iter: int,
             num_particles: int,
             temperature: float,
             lr: float,
             horizon: int,
             std_init: float = 1.0,
             min_rollouts: int = None,
             min_steps: int = 10000,
             num_sampler_envs: int = 4,
             serial: bool = True,
             logger: StepLogger = None):
    """
    Constructor

    :param save_dir: directory to save the snapshots, i.e. the results, in
    :param env: the environment in which the policy operates
    :param particle_hparam: hyper-parameters for particle template construction
    :param max_iter: number of iterations
    :param num_particles: number of distinct particles
    :param temperature: temperature of the SVGD update; determines how jointly the training takes place
    :param lr: the learning rate for the update of the particles
    :param horizon: horizon for each particle
    :param std_init: initial standard deviation for the exploration
    :param min_rollouts: minimum number of rollouts sampled per policy update batch
    :param min_steps: minimum number of state transitions sampled per policy update batch
    :param num_sampler_envs: number of environments for parallel sampling
    :param serial: serial mode can be switched off, which can be used to partly control the flow of SVPG from outside
    :param logger: logger for every step of the algorithm
    """
    if not isinstance(env, Env):
        raise pyrado.TypeErr(given=env, expected_type=Env)
    if not isinstance(particle_hparam, dict):
        raise pyrado.TypeErr(given=particle_hparam, expected_type=dict)
    if not all([key in particle_hparam for key in ['actor', 'value_fcn', 'critic']]):
        raise AttributeError

    # Call Algorithm's constructor
    super().__init__(save_dir, max_iter, policy=None, logger=logger)

    # Store the inputs
    self._env = env
    self.num_particles = num_particles
    self.horizon = horizon  # TODO @Robin: where is the horizon used?!
    self.lr = lr
    self.temperature = temperature
    self.serial = serial

    # Prepare placeholders for particles
    self.particles = [None] * num_particles
    self.expl_strats = [None] * num_particles
    self.optimizers = [None] * num_particles
    self.fixed_particles = [None] * num_particles
    self.fixed_expl_strats = [None] * num_particles
    self.samplers = [None] * num_particles
    self.count = 0
    self.updatecount = 0

    # Particle factory
    actor = FNNPolicy(spec=env.spec, **particle_hparam['actor'])
    value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **particle_hparam['value_fcn'])
    critic = GAE(value_fcn, **particle_hparam['critic'])
    particle = SVPGParticle(env.spec, actor, critic)

    for i in range(self.num_particles):
        self.particles[i] = deepcopy(particle)
        self.particles[i].init_param()
        self.expl_strats[i] = NormalActNoiseExplStrat(self.particles[i].actor, std_init)
        self.optimizers[i] = to.optim.Adam(self.expl_strats[i].parameters(), lr=self.lr)
        self.fixed_particles[i] = deepcopy(self.particles[i])
        self.fixed_expl_strats[i] = deepcopy(self.expl_strats[i])

        if self.serial:
            self.samplers[i] = ParallelSampler(env, self.expl_strats[i], num_sampler_envs,
                                               min_rollouts=min_rollouts, min_steps=min_steps)
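# A minimal construction sketch for this constructor (kept as a comment since only the __init__ is shown
# here). It assumes the enclosing class is SVPG, that `env` and `ex_dir` already exist, and that the usual
# Algorithm.train() entry point applies; the particle_hparam dict must provide the 'actor', 'value_fcn',
# and 'critic' keys checked above, holding FNNPolicy kwargs for the first two and GAE kwargs for the last.
# The concrete hyper-parameter values below are illustrative assumptions.
#
#     particle_hparam = dict(
#         actor=dict(hidden_sizes=[32, 32], hidden_nonlin=to.tanh),
#         value_fcn=dict(hidden_sizes=[32, 32], hidden_nonlin=to.tanh),
#         critic=dict(gamma=0.99, lamda=0.95),
#     )
#     algo = SVPG(ex_dir, env, particle_hparam, max_iter=50, num_particles=4,
#                 temperature=1., lr=1e-3, horizon=50)
#     algo.train()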
def fnn_policy(env):
    return FNNPolicy(env.spec, hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
# Environment
env_hparams = dict()
env = HopperSim(**env_hparams)
env = ActNormWrapper(env)

# # Simple Randomizer
# dp_nom = HopperSim.get_nominal_domain_param()
# randomizer = DomainRandomizer(
#     NormalDomainParam(name='total_mass', mean=dp_nom['total_mass'], std=dp_nom['total_mass']/10, clip_lo=1e-3)
# )
# env = DomainRandWrapperLive(env, randomizer)

# Policy
policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
policy = FNNPolicy(spec=env.spec, **policy_hparam)

# Critic
value_fcn_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **value_fcn_hparam)
critic_hparam = dict(
    gamma=0.995,
    lamda=0.95,
    num_epoch=10,
    batch_size=512,
    standardize_adv=False,
    standardizer=None,
    max_grad_norm=1.,
    lr=5e-4,
)
critic = GAE(value_fcn, **critic_hparam)
# Environment
env_hparams = dict(physicsEngine='Bullet', dt=1 / 100., max_steps=500)
env = BallOnPlate2DSim(**env_hparams)
env = ActNormWrapper(env)

# Policy
policy_hparam = dict(
    shared_hidden_sizes=[32, 32],
    shared_hidden_nonlin=to.relu,
)
policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam)

# Critic
q_fcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.relu)
obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
q_fcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **q_fcn_hparam)
q_fcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **q_fcn_hparam)

# Algorithm
algo_hparam = dict(
    max_iter=1000 * env.max_steps,
    memory_size=1000 * env.max_steps,
    gamma=0.995,
    num_batch_updates=1,
    tau=0.99,
    alpha_init=0.2,
    learn_alpha=False,
    target_update_intvl=1,
    standardize_rew=False,
    min_steps=1,
def cuda_fnnpol_bobspec(default_bob):
    return FNNPolicy(spec=default_bob.spec, hidden_sizes=(32, 32), hidden_nonlin=to.tanh, use_cuda=True)
def fnnpol_bobspec(default_bob):
    return FNNPolicy(spec=default_bob.spec, hidden_sizes=(32, 32), hidden_nonlin=to.tanh)