def _compare_two_policies(policy1: TorchPolicy, policy2: TorchPolicy) -> None:
    """
    Make sure two policies have the same output for the same input.
    """
    policy1.actor = policy1.actor.to(default_device())
    policy2.actor = policy2.actor.to(default_device())

    decision_step, _ = mb.create_steps_from_behavior_spec(
        policy1.behavior_spec, num_agents=1
    )
    np_obs = decision_step.obs
    masks = policy1._extract_masks(decision_step)
    memories = torch.as_tensor(
        policy1.retrieve_memories(list(decision_step.agent_id))
    ).unsqueeze(0)
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]

    with torch.no_grad():
        _, log_probs1, _, _ = policy1.sample_actions(
            tensor_obs, masks=masks, memories=memories
        )
        _, log_probs2, _, _ = policy2.sample_actions(
            tensor_obs, masks=masks, memories=memories
        )
    np.testing.assert_array_equal(
        ModelUtils.to_numpy(log_probs1.all_discrete_tensor),
        ModelUtils.to_numpy(log_probs2.all_discrete_tensor),
    )
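
# Hedged usage sketch (not part of the source): the helper above is typically
# driven from a weight-transfer or checkpointing test. `create_policy_mock` and
# the get_weights/load_weights round-trip are assumed from the surrounding test
# utilities; treat them as placeholders if your version differs.
def test_weight_transfer_produces_identical_policies():
    policy1 = create_policy_mock(TrainerSettings(), use_discrete=True)
    policy2 = create_policy_mock(TrainerSettings(), use_discrete=True)
    policy2.load_weights(policy1.get_weights())  # copy parameters across
    _compare_two_policies(policy1, policy2)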
def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings):
    """
    Takes a Policy and trainer settings and creates an Optimizer around the policy.
    The PPO optimizer has a value estimator and a loss function.
    :param policy: A TorchPolicy object that will be updated by this PPO Optimizer.
    :param trainer_settings: Trainer settings that specify the properties of the trainer.
    """
    super().__init__(policy, trainer_settings)
    reward_signal_configs = trainer_settings.reward_signals
    reward_signal_names = [key.value for key, _ in reward_signal_configs.items()]

    if policy.shared_critic:
        self._critic = policy.actor
    else:
        self._critic = ValueNetwork(
            reward_signal_names,
            policy.behavior_spec.observation_specs,
            network_settings=trainer_settings.network_settings,
        )
        self._critic.to(default_device())

    params = list(self.policy.actor.parameters()) + list(self._critic.parameters())
    self.hyperparameters: PPOSettings = cast(
        PPOSettings, trainer_settings.hyperparameters
    )
    self.decay_learning_rate = ModelUtils.DecayedValue(
        self.hyperparameters.learning_rate_schedule,
        self.hyperparameters.learning_rate,
        1e-10,
        self.trainer_settings.max_steps,
    )
    self.decay_epsilon = ModelUtils.DecayedValue(
        self.hyperparameters.learning_rate_schedule,
        self.hyperparameters.epsilon,
        0.1,
        self.trainer_settings.max_steps,
    )
    self.decay_beta = ModelUtils.DecayedValue(
        self.hyperparameters.learning_rate_schedule,
        self.hyperparameters.beta,
        1e-5,
        self.trainer_settings.max_steps,
    )

    self.optimizer = torch.optim.Adam(
        params, lr=self.trainer_settings.hyperparameters.learning_rate
    )
    self.stats_name_to_update_name = {
        "Losses/Value Loss": "value_loss",
        "Losses/Policy Loss": "policy_loss",
    }

    self.stream_names = list(self.reward_signals.keys())
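
# Illustrative sketch of the decay helpers configured above, assuming the
# ModelUtils.DecayedValue API (get_value(step) interpolates from the initial
# value down to the floor by max_step). The schedule values are made up for
# the example, and the module paths assume the ml-agents release this code
# is taken from.
from mlagents.trainers.settings import ScheduleType
from mlagents.trainers.torch.utils import ModelUtils

decay_lr = ModelUtils.DecayedValue(ScheduleType.LINEAR, 3.0e-4, 1e-10, 500_000)
print(decay_lr.get_value(0))        # 3e-4 at the start of training
print(decay_lr.get_value(250_000))  # roughly half the initial value at mid-schedule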
def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None:
    super().__init__(specs, settings)
    self._ignore_done = True
    self._discriminator_network = DiscriminatorNetwork(specs, settings)
    self._discriminator_network.to(default_device())
    _, self._demo_buffer = demo_to_buffer(
        settings.demo_path, 1, specs
    )  # The 1 is supposed to be the sequence length, which is not accessible here
    params = list(self._discriminator_network.parameters())
    self.optimizer = torch.optim.Adam(params, lr=settings.learning_rate)
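
# Hedged sketch of the demonstration loading done above: demo_to_buffer parses
# a .demo file and returns the recorded BehaviorSpec plus an AgentBuffer of
# expert transitions. The file path is a placeholder, not taken from the source.
from mlagents.trainers.demo_loader import demo_to_buffer

behavior_spec, demo_buffer = demo_to_buffer("demos/ExpertPyramid.demo", 1)  # sequence length 1
print(demo_buffer.num_experiences)  # number of expert transitions loaded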
def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None:
    super().__init__(specs, settings)
    self._ignore_done = True
    self._network = CuriosityNetwork(specs, settings)
    self._network.to(default_device())
    self.optimizer = torch.optim.Adam(
        self._network.parameters(), lr=settings.learning_rate
    )
    self._has_updated_once = False
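
# Hedged usage sketch of the evaluate/update cycle the trainer runs against this
# provider. `specs` (a BehaviorSpec) and `mini_batch` (an AgentBuffer slice with
# the usual obs/action keys) are placeholders, and the class name
# CuriosityRewardProvider is assumed from the surrounding module.
curiosity_rp = CuriosityRewardProvider(specs, CuriositySettings(learning_rate=3.0e-4))
intrinsic_rewards = curiosity_rp.evaluate(mini_batch)  # one intrinsic reward per transition
update_stats = curiosity_rp.update(mini_batch)         # forward/inverse model losses for logging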
def test_set_torch_device(
    mock_set_default_tensor_type,
    device_str,
    expected_type,
    expected_index,
    expected_tensor_type,
):
    try:
        torch_settings = TorchSettings(device=device_str)
        set_torch_config(torch_settings)
        assert default_device().type == expected_type
        if expected_index is None:
            assert default_device().index is None
        else:
            assert default_device().index == expected_index
        mock_set_default_tensor_type.assert_called_once_with(expected_tensor_type)
    finally:
        # restore the defaults
        torch_settings = TorchSettings(device=None)
        set_torch_config(torch_settings)
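
# Hedged illustration of the API under test: set_torch_config installs the
# global device that default_device() reports afterwards. "cpu" keeps the
# snippet safe on machines without CUDA.
set_torch_config(TorchSettings(device="cpu"))
assert default_device().type == "cpu"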
def environment_initialized(self, run_options: RunOptions) -> None:
    self.run_options = run_options
    # Tuple of (major, minor, patch)
    vi = sys.version_info
    env_params = run_options.environment_parameters
    msg = TrainingEnvironmentInitialized(
        python_version=f"{vi[0]}.{vi[1]}.{vi[2]}",
        mlagents_version=mlagents.trainers.__version__,
        mlagents_envs_version=mlagents_envs.__version__,
        torch_version=torch_utils.torch.__version__,
        torch_device_type=torch_utils.default_device().type,
        num_envs=run_options.env_settings.num_envs,
        num_environment_parameters=len(env_params) if env_params else 0,
    )

    any_message = Any()
    any_message.Pack(msg)

    env_init_msg = OutgoingMessage()
    env_init_msg.set_raw_bytes(any_message.SerializeToString())
    super().queue_message_to_send(env_init_msg)
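
# Hedged sketch of how this side channel is wired up: it must be registered with
# the UnityEnvironment so the message queued above reaches the C# side. The class
# name TrainingAnalyticsSideChannel and the `run_options` value (a parsed
# RunOptions) are assumptions about the surrounding module, not shown here.
from mlagents_envs.environment import UnityEnvironment

channel = TrainingAnalyticsSideChannel()
env = UnityEnvironment(file_name=None, side_channels=[channel])
channel.environment_initialized(run_options)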
def __init__(
    self,
    seed: int,
    behavior_spec: BehaviorSpec,
    trainer_settings: TrainerSettings,
    tanh_squash: bool = False,
    reparameterize: bool = False,
    separate_critic: bool = True,
    condition_sigma_on_obs: bool = True,
):
    """
    Policy that uses a multilayer perceptron to map the observations to actions. Could
    also use a CNN to encode visual input prior to the MLP. Supports discrete and
    continuous actions, as well as recurrent networks.
    :param seed: Random seed.
    :param behavior_spec: Assigned BehaviorSpec object.
    :param trainer_settings: Defined training parameters.
    :param tanh_squash: Whether to use a tanh function on the continuous output,
    or a clipped output.
    :param reparameterize: Whether we are using the resampling trick to update the
    policy in continuous output.
    :param separate_critic: Whether to use a critic network separate from the actor,
    rather than a shared actor-critic body.
    :param condition_sigma_on_obs: Whether the log sigma of the continuous policy is
    conditioned on the observations rather than being a standalone learned parameter.
    """
    super().__init__(
        seed,
        behavior_spec,
        trainer_settings,
        tanh_squash,
        reparameterize,
        condition_sigma_on_obs,
    )
    self.global_step = (
        GlobalSteps()
    )  # could be much simpler if TorchPolicy is nn.Module
    self.grads = None

    reward_signal_configs = trainer_settings.reward_signals
    reward_signal_names = [key.value for key, _ in reward_signal_configs.items()]

    self.stats_name_to_update_name = {
        "Losses/Value Loss": "value_loss",
        "Losses/Policy Loss": "policy_loss",
    }
    if separate_critic:
        ac_class = SeparateActorCritic
    else:
        ac_class = SharedActorCritic
    self.actor_critic = ac_class(
        sensor_specs=self.behavior_spec.sensor_specs,
        network_settings=trainer_settings.network_settings,
        action_spec=behavior_spec.action_spec,
        stream_names=reward_signal_names,
        conditional_sigma=self.condition_sigma_on_obs,
        tanh_squash=tanh_squash,
    )
    # Save the m_size needed for export
    self._export_m_size = self.m_size
    # m_size needed for training is determined by network, not trainer settings
    self.m_size = self.actor_critic.memory_size

    self.actor_critic.to(default_device())
    self._clip_action = not tanh_squash
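
# Hedged sketch of the GlobalSteps counter created above (API assumed from
# mlagents.trainers.torch.networks): it stores the trainer step as a module
# parameter so it is saved and restored with checkpoints.
steps = GlobalSteps()
steps.increment(64)
print(steps.current_step)  # 64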
def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings):
    super().__init__(policy, trainer_params)
    hyperparameters: SACSettings = cast(SACSettings, trainer_params.hyperparameters)
    self.tau = hyperparameters.tau
    self.init_entcoef = hyperparameters.init_entcoef

    self.policy = policy
    policy_network_settings = policy.network_settings

    self.burn_in_ratio = 0.0

    # Non-exposed SAC parameters
    self.discrete_target_entropy_scale = 0.2  # Roughly equal to e-greedy 0.05
    self.continuous_target_entropy_scale = 1.0

    self.stream_names = list(self.reward_signals.keys())
    # Used to reduce "survivor bonus" when using Curiosity or GAIL.
    self.gammas = [_val.gamma for _val in trainer_params.reward_signals.values()]
    self.use_dones_in_backup = {
        name: int(not self.reward_signals[name].ignore_done)
        for name in self.stream_names
    }
    self._action_spec = self.policy.behavior_spec.action_spec

    self.value_network = TorchSACOptimizer.PolicyValueNetwork(
        self.stream_names,
        self.policy.behavior_spec.sensor_specs,
        policy_network_settings,
        self._action_spec,
    )

    self.target_network = ValueNetwork(
        self.stream_names,
        self.policy.behavior_spec.sensor_specs,
        policy_network_settings,
    )
    ModelUtils.soft_update(self.policy.actor_critic.critic, self.target_network, 1.0)

    # We create one entropy coefficient per action, whether discrete or continuous.
    _disc_log_ent_coef = torch.nn.Parameter(
        torch.log(
            torch.as_tensor(
                [self.init_entcoef] * len(self._action_spec.discrete_branches)
            )
        ),
        requires_grad=True,
    )
    _cont_log_ent_coef = torch.nn.Parameter(
        torch.log(torch.as_tensor([self.init_entcoef])), requires_grad=True
    )
    self._log_ent_coef = TorchSACOptimizer.LogEntCoef(
        discrete=_disc_log_ent_coef, continuous=_cont_log_ent_coef
    )
    _cont_target = (
        -1
        * self.continuous_target_entropy_scale
        * np.prod(self._action_spec.continuous_size).astype(np.float32)
    )
    _disc_target = [
        self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
        for i in self._action_spec.discrete_branches
    ]
    self.target_entropy = TorchSACOptimizer.TargetEntropy(
        continuous=_cont_target, discrete=_disc_target
    )
    policy_params = list(self.policy.actor_critic.network_body.parameters()) + list(
        self.policy.actor_critic.action_model.parameters()
    )
    value_params = list(self.value_network.parameters()) + list(
        self.policy.actor_critic.critic.parameters()
    )

    logger.debug("value_vars")
    for param in value_params:
        logger.debug(param.shape)
    logger.debug("policy_vars")
    for param in policy_params:
        logger.debug(param.shape)

    self.decay_learning_rate = ModelUtils.DecayedValue(
        hyperparameters.learning_rate_schedule,
        hyperparameters.learning_rate,
        1e-10,
        self.trainer_settings.max_steps,
    )
    self.policy_optimizer = torch.optim.Adam(
        policy_params, lr=hyperparameters.learning_rate
    )
    self.value_optimizer = torch.optim.Adam(
        value_params, lr=hyperparameters.learning_rate
    )
    self.entropy_optimizer = torch.optim.Adam(
        self._log_ent_coef.parameters(), lr=hyperparameters.learning_rate
    )
    self._move_to_device(default_device())
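
# Worked illustration of the target-entropy values computed above, using plain
# numpy and mirroring the constructor's formulas (the sizes are made-up example
# values, not ml-agents API calls).
import numpy as np

continuous_size = 2
discrete_branches = (3, 2)
cont_target = -1 * 1.0 * np.prod(continuous_size).astype(np.float32)
# -> -2.0: push the continuous policy toward ~1 nat of entropy per action dimension
disc_target = [0.2 * np.log(i).astype(np.float32) for i in discrete_branches]
# -> [0.2197, 0.1386]: a fraction of the max entropy log(branch_size) per branch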