def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.agents.ppo.appo.DEFAULT_CONFIG, **config)

    # Although this is a no-op, we call __init__ here to make it clear
    # that base.__init__ will use the make_model() call.
    VTraceOptimizer.__init__(self)
    LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"])

    TorchPolicyV2.__init__(
        self,
        observation_space,
        action_space,
        config,
        max_seq_len=config["model"]["max_seq_len"],
    )

    EntropyCoeffSchedule.__init__(
        self, config["entropy_coeff"], config["entropy_coeff_schedule"]
    )
    ValueNetworkMixin.__init__(self, config)
    KLCoeffMixin.__init__(self, config)

    # TODO: Don't require users to call this manually.
    self._initialize_loss_from_dummy_batch()

    # Initiate TargetNetwork ops after loss initialization.
    TargetNetworkMixin.__init__(self)
def __init__(self, observation_space, action_space, config):
    config = dict(
        ray.rllib.algorithms.marwil.marwil.MARWILConfig().to_dict(), **config
    )

    TorchPolicyV2.__init__(
        self,
        observation_space,
        action_space,
        config,
        max_seq_len=config["model"]["max_seq_len"],
    )

    ValueNetworkMixin.__init__(self, config)
    PostprocessAdvantages.__init__(self)

    # Not needed for pure BC.
    if config["beta"] != 0.0:
        # Set up a torch-var for the squared moving avg. advantage norm.
        self._moving_average_sqd_adv_norm = torch.tensor(
            [config["moving_average_sqd_adv_norm_start"]],
            dtype=torch.float32,
            requires_grad=False,
        ).to(self.device)

    # TODO: Don't require users to call this manually.
    self._initialize_loss_from_dummy_batch()
def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    setup_config(self, observation_space, action_space, config)

    TorchPolicy.__init__(
        self,
        observation_space,
        action_space,
        config,
        max_seq_len=config["model"]["max_seq_len"],
    )

    ValueNetworkMixin.__init__(self, config)
    EntropyCoeffSchedule.__init__(
        self, config["entropy_coeff"], config["entropy_coeff_schedule"]
    )
    LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"])

    # The current KL value (as python float).
    self.kl_coeff = self.config["kl_coeff"]
    # Constant target value.
    self.kl_target = self.config["kl_target"]

    # TODO: Don't require users to call this manually.
    self._initialize_loss_from_dummy_batch()
def setup_mixins(
    policy: Policy,
    obs_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: TrainerConfigDict,
) -> None:
    # Setup Value branch of our NN.
    ValueNetworkMixin.__init__(policy, config)

    # Not needed for pure BC.
    if policy.config["beta"] != 0.0:
        # Set up a torch-var for the squared moving avg. advantage norm.
        policy._moving_average_sqd_adv_norm = torch.tensor(
            [policy.config["moving_average_sqd_adv_norm_start"]],
            dtype=torch.float32,
            requires_grad=False,
        ).to(policy.device)
def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.algorithms.maml.maml.DEFAULT_CONFIG, **config)
    validate_config(config)

    TorchPolicyV2.__init__(
        self,
        observation_space,
        action_space,
        config,
        max_seq_len=config["model"]["max_seq_len"],
    )

    KLCoeffMixin.__init__(self, config)
    ValueNetworkMixin.__init__(self, config)

    # TODO: Don't require users to call this manually.
    self._initialize_loss_from_dummy_batch()
def setup_late_mixins(
    policy: Policy,
    obs_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: TrainerConfigDict,
):
    """Call all mixin classes' constructors after APPOPolicy initialization.

    Args:
        policy (Policy): The Policy object.
        obs_space (gym.spaces.Space): The Policy's observation space.
        action_space (gym.spaces.Space): The Policy's action space.
        config (TrainerConfigDict): The Policy's config.
    """
    KLCoeffMixin.__init__(policy, config)
    ValueNetworkMixin.__init__(policy, config)
    TargetNetworkMixin.__init__(policy)
def setup_mixins(
    policy: Policy,
    obs_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: TrainerConfigDict,
) -> None:
    """Call all mixin classes' constructors before PPOPolicy initialization.

    Args:
        policy (Policy): The Policy object.
        obs_space (gym.spaces.Space): The Policy's observation space.
        action_space (gym.spaces.Space): The Policy's action space.
        config (TrainerConfigDict): The Policy's config.
    """
    EntropyCoeffSchedule.__init__(
        policy, config["entropy_coeff"], config["entropy_coeff_schedule"]
    )
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
    ValueNetworkMixin.__init__(policy, config)
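# Hedged usage sketch (not part of the snippets above): hook functions like
# `setup_mixins` / `setup_late_mixins` are meant to be passed to RLlib's
# functional policy builder as its `before_loss_init` / `after_init` hooks.
# The exact kwargs and import paths below are assumptions based on the older
# `build_policy_class` API and may differ between Ray versions; the mixin
# classes and `setup_mixins` are assumed to be defined in this same module.
import ray.rllib.agents.ppo.ppo
from ray.rllib.policy.policy_template import build_policy_class

MyPPOTorchPolicy = build_policy_class(
    name="MyPPOTorchPolicy",
    framework="torch",
    get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG,
    # Mixin constructors run right before loss initialization.
    before_loss_init=setup_mixins,
    mixins=[LearningRateSchedule, EntropyCoeffSchedule, ValueNetworkMixin],
)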
def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.agents.a3c.a3c.A3CConfig().to_dict(), **config)

    TorchPolicyV2.__init__(
        self,
        observation_space,
        action_space,
        config,
        max_seq_len=config["model"]["max_seq_len"],
    )

    ValueNetworkMixin.__init__(self, config)
    LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"])
    EntropyCoeffSchedule.__init__(
        self, config["entropy_coeff"], config["entropy_coeff_schedule"]
    )

    # TODO: Don't require users to call this manually.
    self._initialize_loss_from_dummy_batch()
def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.algorithms.ppo.ppo.PPOConfig().to_dict(), **config)

    # TODO: Move into Policy API, if needed at all here. Why not move this
    #  into `PPOConfig`?
    validate_config(config)

    TorchPolicyV2.__init__(
        self,
        observation_space,
        action_space,
        config,
        max_seq_len=config["model"]["max_seq_len"],
    )

    ValueNetworkMixin.__init__(self, config)
    LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"])
    EntropyCoeffSchedule.__init__(
        self, config["entropy_coeff"], config["entropy_coeff_schedule"]
    )
    KLCoeffMixin.__init__(self, config)

    # TODO: Don't require users to call this manually.
    self._initialize_loss_from_dummy_batch()
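# Hedged usage sketch (assumptions: the __init__ above belongs to a
# TorchPolicyV2 subclass called `PPOTorchPolicy` defined in this module, and
# the classic gym API where `env.reset()` returns only the observation).
# The usual construction path goes through `PPOConfig().build()`; direct
# instantiation is shown only to illustrate the
# (observation_space, action_space, config) signature.
import gym

from ray.rllib.algorithms.ppo import PPOConfig

env = gym.make("CartPole-v1")
policy = PPOTorchPolicy(
    observation_space=env.observation_space,
    action_space=env.action_space,
    config=PPOConfig().framework("torch").to_dict(),
)
# Single-observation inference; returns (action, state_outs, extra_fetches).
action, _, _ = policy.compute_single_action(env.reset())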
def setup_mixins(policy, obs_space, action_space, config):
    ValueNetworkMixin.__init__(policy, config)
    KLCoeffMixin.__init__(policy, config)