def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings):
    """
    Takes a Policy and trainer settings and creates an Optimizer around the policy.
    The PPO optimizer has a value estimator and a loss function.
    :param policy: A TorchPolicy object that will be updated by this PPO Optimizer.
    :param trainer_settings: Trainer settings that specify the properties of the trainer.
    """
    super().__init__(policy, trainer_settings)
    reward_signal_configs = trainer_settings.reward_signals
    reward_signal_names = [key.value for key, _ in reward_signal_configs.items()]

    if policy.shared_critic:
        self._critic = policy.actor
    else:
        self._critic = ValueNetwork(
            reward_signal_names,
            policy.behavior_spec.observation_specs,
            network_settings=trainer_settings.network_settings,
        )
        self._critic.to(default_device())

    params = list(self.policy.actor.parameters()) + list(self._critic.parameters())
    self.hyperparameters: PPOSettings = cast(
        PPOSettings, trainer_settings.hyperparameters
    )
    self.decay_learning_rate = ModelUtils.DecayedValue(
        self.hyperparameters.learning_rate_schedule,
        self.hyperparameters.learning_rate,
        1e-10,
        self.trainer_settings.max_steps,
    )
    self.decay_epsilon = ModelUtils.DecayedValue(
        self.hyperparameters.learning_rate_schedule,
        self.hyperparameters.epsilon,
        0.1,
        self.trainer_settings.max_steps,
    )
    self.decay_beta = ModelUtils.DecayedValue(
        self.hyperparameters.learning_rate_schedule,
        self.hyperparameters.beta,
        1e-5,
        self.trainer_settings.max_steps,
    )

    self.optimizer = torch.optim.Adam(
        params, lr=self.trainer_settings.hyperparameters.learning_rate
    )
    self.stats_name_to_update_name = {
        "Losses/Value Loss": "value_loss",
        "Losses/Policy Loss": "policy_loss",
    }

    self.stream_names = list(self.reward_signals.keys())
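# A minimal usage sketch (not from the original file) of how the DecayedValue
# objects built above are read at update time. It assumes the policy exposes
# get_current_step() and that ModelUtils.update_learning_rate(optimizer, lr)
# exists with this signature; treat both as assumptions rather than verified API.
def _read_decayed_hyperparameters(optimizer):
    step = optimizer.policy.get_current_step()
    decay_lr = optimizer.decay_learning_rate.get_value(step)
    decay_eps = optimizer.decay_epsilon.get_value(step)  # PPO clipping epsilon
    decay_bet = optimizer.decay_beta.get_value(step)  # entropy bonus coefficient
    # Apply the scheduled learning rate to Adam; epsilon and beta are consumed
    # by the PPO loss terms rather than by torch.optim.
    ModelUtils.update_learning_rate(optimizer.optimizer, decay_lr)
    return decay_lr, decay_eps, decay_bet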
def test_decayed_value():
    test_steps = [0, 4, 9]
    # Test constant decay
    param = ModelUtils.DecayedValue(ScheduleType.CONSTANT, 1.0, 0.2, test_steps[-1])
    for _step in test_steps:
        _param = param.get_value(_step)
        assert _param == 1.0

    test_results = [1.0, 0.6444, 0.2]
    # Test linear decay
    param = ModelUtils.DecayedValue(ScheduleType.LINEAR, 1.0, 0.2, test_steps[-1])
    for _step, _result in zip(test_steps, test_results):
        _param = param.get_value(_step)
        assert _param == pytest.approx(_result, abs=0.01)

    # Test invalid
    with pytest.raises(UnityTrainerException):
        ModelUtils.DecayedValue(
            "SomeOtherSchedule", 1.0, 0.2, test_steps[-1]
        ).get_value(0)
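# Why 0.6444 is the expected value at step 4: a worked check, assuming the LINEAR
# schedule interpolates from the initial value down to the minimum over max_step.
# The helper below is a hypothetical reference formula used only to document the
# test values, not the library's implementation.
#   value(step) = initial - (initial - min) * min(step / max_step, 1.0)
#   value(4)    = 1.0 - (1.0 - 0.2) * (4 / 9) = 1.0 - 0.8 * 0.444... ~= 0.6444
def _expected_linear_decay(initial: float, minimum: float, max_step: int, step: int) -> float:
    return initial - (initial - minimum) * min(step / float(max_step), 1.0)

assert abs(_expected_linear_decay(1.0, 0.2, 9, 4) - 0.6444) < 0.01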
def __init__(
    self,
    policy: TorchPolicy,
    settings: BehavioralCloningSettings,
    policy_learning_rate: float,
    default_batch_size: int,
    default_num_epoch: int,
):
    """
    A BC trainer that can be used inline with RL.
    :param policy: The policy of the learning model
    :param settings: The settings for BehavioralCloning including LR strength, batch_size,
    num_epochs, samples_per_update and LR annealing steps.
    :param policy_learning_rate: The initial Learning Rate of the policy. Used to set an
    appropriate learning rate for the pretrainer.
    :param default_batch_size: The batch size to use if none is specified in the settings.
    :param default_num_epoch: The number of epochs to use if none is specified in the settings.
    """
    self.policy = policy
    self._anneal_steps = settings.steps
    self.current_lr = policy_learning_rate * settings.strength

    learning_rate_schedule: ScheduleType = (
        ScheduleType.LINEAR if self._anneal_steps > 0 else ScheduleType.CONSTANT
    )
    self.decay_learning_rate = ModelUtils.DecayedValue(
        learning_rate_schedule, self.current_lr, 1e-10, self._anneal_steps
    )
    params = self.policy.actor_critic.parameters()
    self.optimizer = torch.optim.Adam(params, lr=self.current_lr)
    _, self.demonstration_buffer = demo_to_buffer(
        settings.demo_path, policy.sequence_length, policy.behavior_spec
    )

    self.batch_size = (
        settings.batch_size if settings.batch_size else default_batch_size
    )
    self.num_epoch = settings.num_epoch if settings.num_epoch else default_num_epoch
    self.n_sequences = max(
        min(self.batch_size, self.demonstration_buffer.num_experiences)
        // policy.sequence_length,
        1,
    )

    self.has_updated = False
    self.use_recurrent = self.policy.use_recurrent
    self.samples_per_update = settings.samples_per_update
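# Worked example (illustrative numbers, not from any demo file) of the n_sequences
# computation above: with a batch size of 512, 10_000 recorded demonstration
# experiences, and a sequence length of 64, the module draws
# min(512, 10_000) // 64 == 8 sequences per BC batch; the max(..., 1) guard keeps
# at least one sequence when the batch or demo buffer is very small.
_example_n_sequences = max(min(512, 10_000) // 64, 1)
assert _example_n_sequences == 8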
def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings):
    super().__init__(policy, trainer_params)
    hyperparameters: SACSettings = cast(SACSettings, trainer_params.hyperparameters)
    self.tau = hyperparameters.tau
    self.init_entcoef = hyperparameters.init_entcoef

    self.policy = policy
    policy_network_settings = policy.network_settings

    self.burn_in_ratio = 0.0

    # Non-exposed SAC parameters
    self.discrete_target_entropy_scale = 0.2  # Roughly equal to e-greedy 0.05
    self.continuous_target_entropy_scale = 1.0

    self.stream_names = list(self.reward_signals.keys())
    # Use to reduce "survivor bonus" when using Curiosity or GAIL.
    self.gammas = [_val.gamma for _val in trainer_params.reward_signals.values()]
    self.use_dones_in_backup = {
        name: int(not self.reward_signals[name].ignore_done)
        for name in self.stream_names
    }
    self._action_spec = self.policy.behavior_spec.action_spec

    self.value_network = TorchSACOptimizer.PolicyValueNetwork(
        self.stream_names,
        self.policy.behavior_spec.sensor_specs,
        policy_network_settings,
        self._action_spec,
    )

    self.target_network = ValueNetwork(
        self.stream_names,
        self.policy.behavior_spec.sensor_specs,
        policy_network_settings,
    )
    ModelUtils.soft_update(self.policy.actor_critic.critic, self.target_network, 1.0)

    # We create one entropy coefficient per action, whether discrete or continuous.
    _disc_log_ent_coef = torch.nn.Parameter(
        torch.log(
            torch.as_tensor(
                [self.init_entcoef] * len(self._action_spec.discrete_branches)
            )
        ),
        requires_grad=True,
    )
    _cont_log_ent_coef = torch.nn.Parameter(
        torch.log(torch.as_tensor([self.init_entcoef])), requires_grad=True
    )
    self._log_ent_coef = TorchSACOptimizer.LogEntCoef(
        discrete=_disc_log_ent_coef, continuous=_cont_log_ent_coef
    )
    _cont_target = (
        -1
        * self.continuous_target_entropy_scale
        * np.prod(self._action_spec.continuous_size).astype(np.float32)
    )
    _disc_target = [
        self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
        for i in self._action_spec.discrete_branches
    ]
    self.target_entropy = TorchSACOptimizer.TargetEntropy(
        continuous=_cont_target, discrete=_disc_target
    )
    policy_params = list(
        self.policy.actor_critic.network_body.parameters()
    ) + list(self.policy.actor_critic.action_model.parameters())
    value_params = list(self.value_network.parameters()) + list(
        self.policy.actor_critic.critic.parameters()
    )

    logger.debug("value_vars")
    for param in value_params:
        logger.debug(param.shape)
    logger.debug("policy_vars")
    for param in policy_params:
        logger.debug(param.shape)

    self.decay_learning_rate = ModelUtils.DecayedValue(
        hyperparameters.learning_rate_schedule,
        hyperparameters.learning_rate,
        1e-10,
        self.trainer_settings.max_steps,
    )
    self.policy_optimizer = torch.optim.Adam(
        policy_params, lr=hyperparameters.learning_rate
    )
    self.value_optimizer = torch.optim.Adam(
        value_params, lr=hyperparameters.learning_rate
    )
    self.entropy_optimizer = torch.optim.Adam(
        self._log_ent_coef.parameters(), lr=hyperparameters.learning_rate
    )
    self._move_to_device(default_device())
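# A minimal sketch (not the library implementation) of the soft update that
# ModelUtils.soft_update(source, target, tau) performs. With tau=1.0, as in the
# call above, the target critic is hard-copied from the policy critic; during
# training a small tau blends the two (Polyak averaging).
import torch

def _soft_update_sketch(source: torch.nn.Module, target: torch.nn.Module, tau: float) -> None:
    # target_param <- tau * source_param + (1 - tau) * target_param
    with torch.no_grad():
        for src, tgt in zip(source.parameters(), target.parameters()):
            tgt.mul_(1.0 - tau)
            tgt.add_(tau * src)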
def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings):
    super().__init__(policy, trainer_params)
    hyperparameters: SACSettings = cast(SACSettings, trainer_params.hyperparameters)
    self.tau = hyperparameters.tau
    self.init_entcoef = hyperparameters.init_entcoef

    self.policy = policy
    self.act_size = policy.act_size
    policy_network_settings = policy.network_settings

    self.burn_in_ratio = 0.0

    # Non-exposed SAC parameters
    self.discrete_target_entropy_scale = 0.2  # Roughly equal to e-greedy 0.05
    self.continuous_target_entropy_scale = 1.0

    self.stream_names = list(self.reward_signals.keys())
    # Use to reduce "survivor bonus" when using Curiosity or GAIL.
    self.gammas = [_val.gamma for _val in trainer_params.reward_signals.values()]
    self.use_dones_in_backup = {
        name: int(not self.reward_signals[name].ignore_done)
        for name in self.stream_names
    }

    # Critics should have 1/2 of the memory of the policy
    critic_memory = policy_network_settings.memory
    if critic_memory is not None:
        critic_memory = attr.evolve(
            critic_memory, memory_size=critic_memory.memory_size // 2
        )
    value_network_settings = attr.evolve(
        policy_network_settings, memory=critic_memory
    )

    self.value_network = TorchSACOptimizer.PolicyValueNetwork(
        self.stream_names,
        self.policy.behavior_spec.observation_shapes,
        value_network_settings,
        self.policy.behavior_spec.action_type,
        self.act_size,
    )

    self.target_network = ValueNetwork(
        self.stream_names,
        self.policy.behavior_spec.observation_shapes,
        value_network_settings,
    )
    self.soft_update(self.policy.actor_critic.critic, self.target_network, 1.0)

    self._log_ent_coef = torch.nn.Parameter(
        torch.log(torch.as_tensor([self.init_entcoef] * len(self.act_size))),
        requires_grad=True,
    )
    if self.policy.use_continuous_act:
        self.target_entropy = torch.as_tensor(
            -1
            * self.continuous_target_entropy_scale
            * np.prod(self.act_size[0]).astype(np.float32)
        )
    else:
        self.target_entropy = [
            self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
            for i in self.act_size
        ]

    policy_params = list(
        self.policy.actor_critic.network_body.parameters()
    ) + list(self.policy.actor_critic.distribution.parameters())
    value_params = list(self.value_network.parameters()) + list(
        self.policy.actor_critic.critic.parameters()
    )

    logger.debug("value_vars")
    for param in value_params:
        logger.debug(param.shape)
    logger.debug("policy_vars")
    for param in policy_params:
        logger.debug(param.shape)

    self.decay_learning_rate = ModelUtils.DecayedValue(
        hyperparameters.learning_rate_schedule,
        hyperparameters.learning_rate,
        1e-10,
        self.trainer_settings.max_steps,
    )
    self.policy_optimizer = torch.optim.Adam(
        policy_params, lr=hyperparameters.learning_rate
    )
    self.value_optimizer = torch.optim.Adam(
        value_params, lr=hyperparameters.learning_rate
    )
    self.entropy_optimizer = torch.optim.Adam(
        [self._log_ent_coef], lr=hyperparameters.learning_rate
    )