def test_parallel_sampling_deterministic_wo_min_steps(
    env: SimEnv,
    policy: Policy,
    min_rollouts: Optional[int],
    init_states: Optional[int],
    domain_params: Optional[List[dict]],
):
    env.max_steps = 20

    if init_states is not None:
        init_states = [env.spec.state_space.sample_uniform() for _ in range(init_states)]

    nums_workers = (1, 2, 4)
    all_rollouts = []
    for num_workers in nums_workers:
        # Use an exploration strategy to test if that works too (it should, as the policy gets pickled and
        # distributed anyway).
        all_rollouts.append(
            ParallelRolloutSampler(
                env,
                NormalActNoiseExplStrat(policy, std_init=1.0),
                num_workers=num_workers,
                min_rollouts=min_rollouts,
                seed=0,
            ).sample(init_states=init_states, domain_params=domain_params)
        )

    # Test that the rollouts are actually different, i.e., that the same seed is not used for all rollouts.
    for ros in all_rollouts:
        for ro_a, ro_b in [(a, b) for a in ros for b in ros if a is not b]:
            # The idle policy is deterministic and always outputs the zero action. Hence, do not check that the
            # actions are different when using the idle policy.
            if isinstance(policy, IdlePolicy):
                # The Quanser Ball Balancer is a deterministic environment (conditioned on the initial state). As the
                # idle policy is a deterministic policy, this will result in the rollouts being equivalent for each
                # initial state, so do not check for differences if the initial states were set.
                if init_states is None:
                    assert ro_a.rewards != pytest.approx(ro_b.rewards)
                    assert ro_a.observations != pytest.approx(ro_b.observations)
            else:
                assert ro_a.rewards != pytest.approx(ro_b.rewards)
                assert ro_a.observations != pytest.approx(ro_b.observations)
                assert ro_a.actions != pytest.approx(ro_b.actions)

    # Test that the rollouts for all numbers of workers are equal.
    for ros_a, ros_b in [(a, b) for a in all_rollouts for b in all_rollouts]:
        assert len(ros_a) == len(ros_b)
        for ro_a, ro_b in zip(ros_a, ros_b):
            assert ro_a.rewards == pytest.approx(ro_b.rewards)
            assert ro_a.observations == pytest.approx(ro_b.observations)
            assert ro_a.actions == pytest.approx(ro_b.actions)
def test_action_statistics(env: SimEnv, policy: Policy):
    sigma = 1.0  # with lower values like 0.1 we can observe violations of the tolerances

    # Create an action-based exploration strategy
    explstrat = NormalActNoiseExplStrat(policy, std_init=sigma)

    # Sample a deterministic rollout
    ro_policy = rollout(env, policy, eval=True, max_steps=1000, stop_on_done=False, seed=0)
    ro_policy.torch(to.get_default_dtype())

    # Run the exploration strategy on the previously sampled rollout
    if policy.is_recurrent:
        if isinstance(policy, TwoHeadedPolicy):
            act_expl, _, _ = explstrat(ro_policy.observations)
        else:
            act_expl, _ = explstrat(ro_policy.observations)
        # Get the hidden states from the deterministic rollout
        hidden_states = ro_policy.hidden_states
    else:
        if isinstance(policy, TwoHeadedPolicy):
            act_expl, _ = explstrat(ro_policy.observations)
        else:
            act_expl = explstrat(ro_policy.observations)
        hidden_states = [0.0] * ro_policy.length  # just something that does not violate the format

    ro_expl = StepSequence(
        actions=act_expl[:-1],  # truncate act due to last obs
        observations=ro_policy.observations,
        rewards=ro_policy.rewards,  # don't care but necessary
        hidden_states=hidden_states,
    )
    ro_expl.torch()

    # Compute action statistics and the ground truth
    actstats = compute_action_statistics(ro_expl, explstrat)
    gt_logprobs = Normal(loc=ro_policy.actions, scale=sigma).log_prob(ro_expl.actions)
    gt_entropy = Normal(loc=ro_policy.actions, scale=sigma).entropy()

    to.testing.assert_allclose(actstats.log_probs, gt_logprobs, rtol=1e-4, atol=1e-5)
    to.testing.assert_allclose(actstats.entropy, gt_entropy, rtol=1e-4, atol=1e-5)
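# The ground-truth check above relies on the fact that, for diagonal Gaussian action noise with a fixed
# standard deviation, the log-probability and entropy only depend on the deterministic policy action (the
# mean) and sigma. A minimal self-contained sketch of that relation in plain PyTorch; the tensor values
# below are made up for illustration and are not taken from the test suite:
import torch as to
from torch.distributions import Normal

mean_act = to.tensor([0.3, -0.1])                       # deterministic policy output, acting as the mean
sigma = 1.0
noisy_act = mean_act + sigma * to.randn_like(mean_act)  # exploration action sampled around the mean
dist = Normal(loc=mean_act, scale=sigma)
log_probs = dist.log_prob(noisy_act)                    # element-wise log N(noisy_act | mean_act, sigma^2)
entropy = dist.entropy()                                # element-wise 0.5 * log(2 * pi * e * sigma^2)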
def test_parallel_sampling_deterministic_w_min_steps(
    env: SimEnv,
    policy: Policy,
    min_rollouts: Optional[int],
    min_steps: int,
    domain_params: Optional[List[dict]],
):
    env.max_steps = 20

    nums_workers = (1, 2, 4)
    all_rollouts = []
    for num_workers in nums_workers:
        # Use an exploration strategy to test if that works too (it should, as the policy gets pickled and
        # distributed anyway).
        all_rollouts.append(
            ParallelRolloutSampler(
                env,
                NormalActNoiseExplStrat(policy, std_init=1.0),
                num_workers=num_workers,
                min_rollouts=min_rollouts,
                min_steps=min_steps * env.max_steps,
                seed=0,
            ).sample(domain_params=domain_params)
        )

    # Test that the rollouts are actually different, i.e., that the same seed is not used for all rollouts.
    for ros in all_rollouts:
        for ro_a, ro_b in [(a, b) for a in ros for b in ros if a is not b]:
            # The idle policy is deterministic and always outputs the zero action. Hence, do not check that the
            # actions are different when using the idle policy.
            if not isinstance(policy, IdlePolicy):
                assert ro_a.rewards != pytest.approx(ro_b.rewards)
                assert ro_a.observations != pytest.approx(ro_b.observations)
                assert ro_a.actions != pytest.approx(ro_b.actions)

    # Test that the rollouts for all numbers of workers are equal.
    for ros_a, ros_b in [(a, b) for a in all_rollouts for b in all_rollouts]:
        assert sum([len(ro) for ro in ros_a]) == sum([len(ro) for ro in ros_b])
        assert sum([len(ro) for ro in ros_a]) >= min_steps * env.max_steps
        assert sum([len(ro) for ro in ros_b]) >= min_steps * env.max_steps
        assert len(ros_a) == len(ros_b)
        if min_rollouts is not None:
            assert len(ros_a) >= min_rollouts
            assert len(ros_b) >= min_rollouts
        for ro_a, ro_b in zip(ros_a, ros_b):
            assert ro_a.rewards == pytest.approx(ro_b.rewards)
            assert ro_a.observations == pytest.approx(ro_b.observations)
            assert ro_a.actions == pytest.approx(ro_b.actions)
def test_noise_on_act(env, policy):
    for _ in range(100):
        # Init the exploration strategy
        act_noise_strat = NormalActNoiseExplStrat(policy, std_init=0.5, train_mean=True)

        # Set new parameters for the exploration noise
        std = to.ones(env.act_space.flat_dim) * to.rand(1)
        mean = to.rand(env.act_space.shape)
        act_noise_strat.noise.adapt(mean, std)
        assert (mean == act_noise_strat.noise.mean).all()

        # Sample a random observation from the environment
        obs = to.from_numpy(env.obs_space.sample_uniform())

        # Get a clean and a noisy action
        act = policy(obs)  # policy expects Tensors
        act_noisy = act_noise_strat(obs)  # exploration strategy expects ndarrays
        assert isinstance(act, to.Tensor)
        assert not to.equal(act, act_noisy)
def __init__(self,
             save_dir: str,
             env: Env,
             policy: Policy,
             critic: GAE,
             max_iter: int,
             min_rollouts: int = None,
             min_steps: int = None,
             num_epoch: int = 3,
             eps_clip: float = 0.1,
             batch_size: int = 64,
             std_init: float = 1.0,
             num_sampler_envs: int = 4,
             max_grad_norm: float = None,
             lr: float = 5e-4,
             lr_scheduler=None,
             lr_scheduler_hparam: [dict, None] = None,
             logger: StepLogger = None):
    """
    Constructor

    :param save_dir: directory to save the snapshots, i.e. the results, in
    :param env: the environment in which the policy operates
    :param policy: policy to be updated
    :param critic: advantage estimation function $A(s,a) = Q(s,a) - V(s)$
    :param max_iter: number of iterations (policy updates)
    :param min_rollouts: minimum number of rollouts sampled per policy update batch
    :param min_steps: minimum number of state transitions sampled per policy update batch
    :param num_epoch: number of iterations over all gathered samples during one policy update
    :param eps_clip: max/min probability ratio, see [1]
    :param batch_size: number of samples per policy update batch
    :param std_init: initial standard deviation on the actions for the exploration noise
    :param num_sampler_envs: number of environments for parallel sampling
    :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
    :param lr: (initial) learning rate for the optimizer which can be modified by the scheduler. By default,
               the learning rate is constant.
    :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
    :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created

    .. note::
        The Adam optimizer computes individual learning rates for all parameters. Thus, the learning rate
        scheduler schedules the maximum learning rate.
    """
    if not isinstance(env, Env):
        raise pyrado.TypeErr(given=env, expected_type=Env)
    assert isinstance(policy, Policy)

    # Call ActorCritic's constructor
    super().__init__(env, policy, critic, save_dir, max_iter, logger)

    # Store the inputs
    self.num_epoch = num_epoch
    self.eps_clip = eps_clip
    self.batch_size = batch_size
    self.max_grad_norm = max_grad_norm

    # Initialize
    self.log_loss = True
    self._expl_strat = NormalActNoiseExplStrat(self._policy, std_init=std_init)
    self.sampler = ParallelSampler(
        env,
        self._expl_strat,
        num_envs=num_sampler_envs,
        min_steps=min_steps,
        min_rollouts=min_rollouts
    )
    self.optim = to.optim.Adam(
        [{'params': self._expl_strat.policy.parameters()},
         {'params': self._expl_strat.noise.parameters()}],
        lr=lr,
        eps=1e-5
    )
    self._lr_scheduler = lr_scheduler
    self._lr_scheduler_hparam = lr_scheduler_hparam
    if lr_scheduler is not None:
        self._lr_scheduler = lr_scheduler(self.optim, **lr_scheduler_hparam)
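# A minimal usage sketch for the constructor above, which (given eps_clip and the ActorCritic base class)
# appears to belong to a PPO-style algorithm. The class name `PPO` and the pre-built `env`, `policy`, and
# `critic` objects are assumptions for illustration, not taken from this file:
algo = PPO(
    save_dir='/tmp/ppo_example',
    env=env,
    policy=policy,
    critic=critic,
    max_iter=100,
    min_steps=10000,
    num_epoch=5,
    eps_clip=0.1,
    batch_size=64,
    std_init=1.0,
    num_sampler_envs=4,
    lr=5e-4,
)
algo.train()  # assuming the base class provides the usual pyrado training loop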
def __init__(
    self,
    save_dir: str,
    env: Env,
    policy: Policy,
    lr: float = 5e-4,
    std_init: float = 0.15,
    min_steps: int = 1500,
    num_epochs: int = 10,
    max_iter: int = 500,
    num_teachers: int = 8,
    teacher_extra: Optional[dict] = None,
    teacher_policy: Optional[Policy] = None,
    teacher_algo: Optional[callable] = None,
    teacher_algo_hparam: Optional[dict] = None,
    randomizer: Optional[DomainRandomizer] = None,
    logger: Optional[StepLogger] = None,
    num_workers: int = 4,
):
    """
    Constructor

    :param save_dir: directory to save the snapshots, i.e. the results, in
    :param env: the environment in which the policy operates
    :param policy: policy to be updated
    :param lr: (initial) learning rate for the optimizer which can be modified by the scheduler. By default,
               the learning rate is constant.
    :param std_init: initial standard deviation on the actions for the exploration noise
    :param min_steps: minimum number of state transitions sampled per policy update batch
    :param num_epochs: number of epochs (how often we iterate over the same batch)
    :param max_iter: number of iterations (policy updates)
    :param num_teachers: number of teachers that are used for distillation
    :param teacher_extra: extra dict from PDDRTeachers algo. If provided, teachers are loaded from there
    :param teacher_policy: policy to be updated (is duplicated for each teacher)
    :param teacher_algo: algorithm class to be used for training the teachers
    :param teacher_algo_hparam: hyper-parameters to be used for teacher_algo
    :param randomizer: randomizer for sampling the teacher domain parameters; if `None`, the environment's
                       default one is used
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    :param num_workers: number of environments for parallel sampling
    """
    if not isinstance(env, Env):
        raise pyrado.TypeErr(given=env, expected_type=Env)
    if not isinstance(policy, Policy):
        raise pyrado.TypeErr(given=policy, expected_type=Policy)

    # Call Algorithm's constructor
    super().__init__(
        num_checkpoints=1, init_checkpoint=-1, save_dir=save_dir, max_iter=max_iter, policy=policy, logger=logger
    )

    # Store the inputs
    self.env_real = env
    self.min_steps = min_steps
    self.num_epochs = num_epochs
    self.num_teachers = num_teachers
    self.num_workers = num_workers

    self.teacher_policies = []
    self.teacher_envs = []
    self.teacher_expl_strats = []
    self.teacher_critics = []
    self.teacher_ex_dirs = []

    # Teachers
    if teacher_policy is not None and teacher_algo is not None and teacher_algo_hparam is not None:
        if not isinstance(teacher_policy, Policy):
            raise pyrado.TypeErr(given=teacher_policy, expected_type=Policy)
        if not issubclass(teacher_algo, Algorithm):
            raise pyrado.TypeErr(given=teacher_algo, expected_type=Algorithm)

        if randomizer is None:
            self.randomizer = create_default_randomizer(env)
        else:
            assert isinstance(randomizer, DomainRandomizer)
            self.randomizer = randomizer

        self.set_random_envs()

        # Prepare folders
        self.teacher_ex_dirs = [os.path.join(self.save_dir, f"teachers_{idx}") for idx in range(self.num_teachers)]
        for idx in range(self.num_teachers):
            os.makedirs(self.teacher_ex_dirs[idx], exist_ok=True)

        # Create teacher algos
        self.algos = [
            teacher_algo(
                save_dir=self.teacher_ex_dirs[idx],
                env=self.teacher_envs[idx],
                policy=deepcopy(teacher_policy),
                logger=None,
                **deepcopy(teacher_algo_hparam),
            )
            for idx in range(self.num_teachers)
        ]
    elif teacher_extra is not None:
        self.unpack_teachers(teacher_extra)
        assert self.num_teachers == len(self.teacher_policies)
        self.reset_checkpoint()
    else:
        self.load_teachers()
        if self.num_teachers < len(self.teacher_policies):
            print(
                f"You have loaded {len(self.teacher_policies)} teachers. Only the first {self.num_teachers} will be used!"
            )
            self.prune_teachers()
        assert self.num_teachers == len(self.teacher_policies)
        self.reset_checkpoint()

    # Student
    self._expl_strat = NormalActNoiseExplStrat(self._policy, std_init=std_init)
    self.optimizer = to.optim.Adam([{"params": self.policy.parameters()}], lr=lr)

    # Environments
    self.samplers = [
        ParallelRolloutSampler(
            self.teacher_envs[t],
            deepcopy(self._expl_strat),
            num_workers=self.num_workers,
            min_steps=self.min_steps,
        )
        for t in range(self.num_teachers)
    ]

    self.teacher_weights = np.ones(self.num_teachers)

    # Distillation loss criterion
    self.criterion = to.nn.KLDivLoss(log_target=True, reduction="batchmean")
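# Minimal usage sketch for the distillation constructor above. The class name `PDDR` as well as the
# pre-built `env`, `student_policy`, `teacher_policy`, and `critic` objects are assumptions for
# illustration. Note that `teacher_algo_hparam` must contain everything the teacher algorithm needs
# besides save_dir, env, policy, and logger, since it is unpacked directly into `teacher_algo(...)`:
algo = PDDR(
    save_dir='/tmp/pddr_example',
    env=env,
    policy=student_policy,
    teacher_policy=teacher_policy,
    teacher_algo=PPO,  # any Algorithm subclass with a compatible constructor
    teacher_algo_hparam=dict(critic=critic, max_iter=50, min_steps=10000),
    num_teachers=4,
    num_workers=4,
)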
def __init__(self,
             save_dir: str,
             env: Env,
             particle_hparam: dict,
             max_iter: int,
             num_particles: int,
             temperature: float,
             lr: float,
             horizon: int,
             std_init: float = 1.0,
             min_rollouts: int = None,
             min_steps: int = 10000,
             num_sampler_envs: int = 4,
             serial: bool = True,
             logger: StepLogger = None):
    """
    Constructor

    :param save_dir: directory to save the snapshots, i.e. the results, in
    :param env: the environment in which the policy operates
    :param particle_hparam: hyper-parameters for particle template construction
    :param max_iter: number of iterations
    :param num_particles: number of distinct particles
    :param temperature: the temperature of the SVGD update, which determines how jointly the particles are trained
    :param lr: the learning rate for the update of the particles
    :param horizon: horizon for each particle
    :param std_init: initial standard deviation for the exploration
    :param min_rollouts: minimum number of rollouts sampled per policy update batch
    :param min_steps: minimum number of state transitions sampled per policy update batch
    :param num_sampler_envs: number of environments for parallel sampling
    :param serial: serial mode can be switched off, which can be used to partly control the flow of SVPG from outside
    :param logger: logger for every step of the algorithm
    """
    if not isinstance(env, Env):
        raise pyrado.TypeErr(given=env, expected_type=Env)
    if not isinstance(particle_hparam, dict):
        raise pyrado.TypeErr(given=particle_hparam, expected_type=dict)
    if not all([key in particle_hparam for key in ['actor', 'value_fcn', 'critic']]):
        raise AttributeError

    # Call Algorithm's constructor
    super().__init__(save_dir, max_iter, policy=None, logger=logger)

    # Store the inputs
    self._env = env
    self.num_particles = num_particles
    self.horizon = horizon  # TODO @Robin: where is the horizon used?!
    self.lr = lr
    self.temperature = temperature
    self.serial = serial

    # Prepare placeholders for particles
    self.particles = [None] * num_particles
    self.expl_strats = [None] * num_particles
    self.optimizers = [None] * num_particles
    self.fixed_particles = [None] * num_particles
    self.fixed_expl_strats = [None] * num_particles
    self.samplers = [None] * num_particles
    self.count = 0
    self.updatecount = 0

    # Particle factory
    actor = FNNPolicy(spec=env.spec, **particle_hparam['actor'])
    value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **particle_hparam['value_fcn'])
    critic = GAE(value_fcn, **particle_hparam['critic'])
    particle = SVPGParticle(env.spec, actor, critic)

    for i in range(self.num_particles):
        self.particles[i] = deepcopy(particle)
        self.particles[i].init_param()
        self.expl_strats[i] = NormalActNoiseExplStrat(self.particles[i].actor, std_init)
        self.optimizers[i] = to.optim.Adam(self.expl_strats[i].parameters(), lr=self.lr)
        self.fixed_particles[i] = deepcopy(self.particles[i])
        self.fixed_expl_strats[i] = deepcopy(self.expl_strats[i])
        if self.serial:
            self.samplers[i] = ParallelSampler(
                env, self.expl_strats[i], num_sampler_envs, min_rollouts=min_rollouts, min_steps=min_steps
            )
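# The `particle_hparam` dict passed to the constructor above must provide the keys 'actor', 'value_fcn',
# and 'critic' (this is checked explicitly). A hypothetical example; the nested hyper-parameter names are
# assumptions based on typical FNNPolicy and GAE arguments, not taken from this file:
particle_hparam = dict(
    actor=dict(hidden_sizes=[32, 32], hidden_nonlin=to.tanh),      # forwarded to FNNPolicy for the actor
    value_fcn=dict(hidden_sizes=[32, 32], hidden_nonlin=to.tanh),  # forwarded to FNNPolicy for the value function
    critic=dict(gamma=0.995, lamda=0.95),                          # forwarded to GAE
)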
def __init__(self,
             save_dir: str,
             env: Env,
             policy: Policy,
             critic: GAE,
             max_iter: int,
             min_rollouts: int = None,
             min_steps: int = None,
             vfcn_coeff: float = 0.5,
             entropy_coeff: float = 1e-3,
             batch_size: int = 32,
             std_init: float = 1.0,
             max_grad_norm: float = None,
             num_workers: int = 4,
             lr: float = 5e-4,
             lr_scheduler=None,
             lr_scheduler_hparam: [dict, None] = None,
             logger: StepLogger = None):
    r"""
    Constructor

    :param save_dir: directory to save the snapshots, i.e. the results, in
    :param env: the environment in which the policy operates
    :param policy: policy to be updated
    :param critic: advantage estimation function $A(s,a) = Q(s,a) - V(s)$
    :param max_iter: number of iterations (policy updates)
    :param min_rollouts: minimum number of rollouts sampled per policy update batch
    :param min_steps: minimum number of state transitions sampled per policy update batch
    :param vfcn_coeff: weighting factor of the value function term in the combined loss, specific to PPO2
    :param entropy_coeff: weighting factor of the entropy term in the combined loss, specific to PPO2
    :param batch_size: number of samples per policy update batch
    :param std_init: initial standard deviation on the actions for the exploration noise
    :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
    :param num_workers: number of environments for parallel sampling
    :param lr: (initial) learning rate for the optimizer which can be modified by the scheduler. By default,
               the learning rate is constant.
    :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
    :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    """
    # Call ActorCritic's constructor
    super().__init__(env, policy, critic, save_dir, max_iter, logger)

    # Store the inputs
    self.min_rollouts = min_rollouts
    self.min_steps = min_steps
    self.vfcn_coeff = vfcn_coeff
    self.entropy_coeff = entropy_coeff
    self.batch_size = batch_size
    self.max_grad_norm = max_grad_norm

    # Initialize
    self._expl_strat = NormalActNoiseExplStrat(self._policy, std_init=std_init)
    self.sampler = ParallelRolloutSampler(
        env, self.expl_strat, num_workers=num_workers, min_steps=min_steps, min_rollouts=min_rollouts
    )
    self.optim = to.optim.RMSprop(
        [{'params': self._policy.parameters()},
         {'params': self.expl_strat.noise.parameters()},
         {'params': self._critic.vfcn.parameters()}],
        lr=lr,
        eps=1e-5
    )
    self._lr_scheduler = lr_scheduler
    self._lr_scheduler_hparam = lr_scheduler_hparam
    if lr_scheduler is not None:
        self._lr_scheduler = lr_scheduler(self.optim, **lr_scheduler_hparam)
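# Sketch of how vfcn_coeff and entropy_coeff typically enter a combined PPO2/A2C-style loss. The exact
# terms and sign conventions of the actual update step may differ; the tensors below are placeholders
# chosen purely for illustration:
import torch as to

policy_loss = to.tensor(0.42)     # e.g. negative (clipped) surrogate objective
value_fcn_loss = to.tensor(0.10)  # e.g. MSE between predicted values and bootstrapped returns
entropy_bonus = to.tensor(1.30)   # mean policy entropy, encouraging exploration
vfcn_coeff, entropy_coeff = 0.5, 1e-3
combined_loss = policy_loss + vfcn_coeff * value_fcn_loss - entropy_coeff * entropy_bonus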