class MlpActorCritic(BaseActorCritic):
    """MLP Actor-Critic

    Attributes:
        state_dim (int): State dimensions of the environment
        action_dim (int): Action space dimensions of the environment
        shared_layers (:obj:`tuple` or None): Accepted for API parity; not used by this class
        policy_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the policy MLP
        value_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the value MLP
        val_type (str): Value type of the critic network
        discrete (bool): True if the action space is discrete, else False
        sac (bool): True if a SAC-like network is needed, else False
        activation (str): Activation function to be used. Can be either "tanh" or "relu"
    """

    def __init__(
        self,
        state_dim: spaces.Space,
        action_dim: spaces.Space,
        shared_layers: Tuple = None,  # not used by this class
        policy_layers: Tuple = (32, 32),
        value_layers: Tuple = (32, 32),
        val_type: str = "V",
        discrete: bool = True,
        **kwargs,
    ):
        super(MlpActorCritic, self).__init__()

        self.actor = MlpPolicy(state_dim, action_dim, policy_layers, discrete, **kwargs)
        self.critic = MlpValue(state_dim, action_dim, val_type, value_layers, **kwargs)

    def get_params(self):
        actor_params = self.actor.parameters()
        critic_params = self.critic.parameters()
        return actor_params, critic_params

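# A minimal usage sketch (illustrative only, not part of the module): building
# MlpActorCritic and handing its parameter groups to separate optimizers. It assumes
# state_dim/action_dim can be passed as plain ints (as the MLP constructors suggest);
# the function name, dimensions, and learning rates below are hypothetical.
def _example_mlp_actor_critic():
    import torch.optim as optim  # local import keeps the sketch self-contained

    model = MlpActorCritic(
        state_dim=4,
        action_dim=2,
        shared_layers=None,  # accepted but unused by this class
        policy_layers=(32, 32),
        value_layers=(32, 32),
        val_type="V",
        discrete=True,
    )
    actor_params, critic_params = model.get_params()
    actor_optimizer = optim.Adam(actor_params, lr=3e-4)
    critic_optimizer = optim.Adam(critic_params, lr=1e-3)
    return actor_optimizer, critic_optimizer
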
class MlpSingleActorMultiCritic(BaseActorCritic):
    """MLP Actor-Critic with a single actor and multiple critics

    Attributes:
        state_dim (int): State dimensions of the environment
        action_dim (int): Action space dimensions of the environment
        policy_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the policy MLP
        value_layers (:obj:`list` or :obj:`tuple`): Hidden layers in each value MLP
        val_type (str): Value type of the critic network
        discrete (bool): True if the action space is discrete, else False
        num_critics (int): Number of critics in the architecture
        sac (bool): True if a SAC-like network is needed, else False
        activation (str): Activation function to be used. Can be either "tanh" or "relu"
    """

    def __init__(
        self,
        state_dim: spaces.Space,
        action_dim: spaces.Space,
        policy_layers: Tuple = (32, 32),
        value_layers: Tuple = (32, 32),
        val_type: str = "V",
        discrete: bool = True,
        num_critics: int = 2,
        **kwargs,
    ):
        super(MlpSingleActorMultiCritic, self).__init__()

        self.num_critics = num_critics

        self.actor = MlpPolicy(state_dim, action_dim, policy_layers, discrete, **kwargs)

        # Note: two Q(s, a) critics are instantiated regardless of num_critics
        self.critic1 = MlpValue(state_dim, action_dim, "Qsa", value_layers, **kwargs)
        self.critic2 = MlpValue(state_dim, action_dim, "Qsa", value_layers, **kwargs)

        self.action_scale = kwargs["action_scale"] if "action_scale" in kwargs else 1
        self.action_bias = kwargs["action_bias"] if "action_bias" in kwargs else 0

    def forward(self, x):
        q1_values = self.critic1(x).squeeze(-1)
        q2_values = self.critic2(x).squeeze(-1)
        return (q1_values, q2_values)

    def get_action(self, state: torch.Tensor, deterministic: bool = False):
        state = torch.as_tensor(state).float()

        if self.actor.sac:
            mean, log_std = self.actor(state)
            std = log_std.exp()
            distribution = Normal(mean, std)

            action_probs = distribution.rsample()
            log_probs = distribution.log_prob(action_probs)
            action_probs = torch.tanh(action_probs)

            action = action_probs * self.action_scale + self.action_bias

            # enforcing action bound (appendix of SAC paper)
            log_probs -= torch.log(
                self.action_scale * (1 - action_probs.pow(2))
                + np.finfo(np.float32).eps
            )
            log_probs = log_probs.sum(1, keepdim=True)
            mean = torch.tanh(mean) * self.action_scale + self.action_bias

            action = (action.float(), log_probs, mean)
        else:
            action = self.actor.get_action(state, deterministic=deterministic)

        return action

    def get_value(self, state: torch.Tensor, mode="first") -> torch.Tensor:
        """Get Values from the Critics

        Args:
            state (:obj:`torch.Tensor`): The state(s) being passed to the critics
            mode (str): Which value(s) should be returned. Types:
                "both" --> Both values will be returned
                "min" --> The minimum of both values will be returned
                "first" --> The value from the first critic only will be returned

        Returns:
            values (:obj:`list` or :obj:`torch.Tensor`): Value estimate(s) according to ``mode``
        """
        state = torch.as_tensor(state).float()

        if mode == "both":
            values = self.forward(state)
        elif mode == "min":
            values = self.forward(state)
            values = torch.min(*values).squeeze(-1)
        elif mode == "first":
            values = self.critic1(state)
        else:
            raise KeyError("Mode doesn't exist")

        return values

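# A minimal usage sketch (illustrative only, not part of the module): querying the
# twin critics of MlpSingleActorMultiCritic. It assumes state_dim/action_dim can be
# plain ints and that the "Qsa"-type MlpValue expects the state and action
# concatenated along the last dimension; the function name and dimensions below are
# hypothetical.
def _example_single_actor_multi_critic():
    import torch  # local import keeps the sketch self-contained

    state_dim, action_dim, batch_size = 4, 2, 8
    model = MlpSingleActorMultiCritic(
        state_dim=state_dim, action_dim=action_dim, discrete=False
    )

    state = torch.randn(batch_size, state_dim)
    action = torch.randn(batch_size, action_dim)
    state_action = torch.cat([state, action], dim=-1)

    q1, q2 = model.get_value(state_action, mode="both")    # both critic estimates
    q_min = model.get_value(state_action, mode="min")      # elementwise minimum (TD3/SAC-style target)
    q_first = model.get_value(state_action, mode="first")  # first critic only
    return q1, q2, q_min, q_first
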
class MlpSharedActorCritic(BaseActorCritic):
    """MLP Shared Actor-Critic

    Attributes:
        state_dim (int): State dimensions of the environment
        action_dim (int): Action space dimensions of the environment
        shared_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the shared MLP
        policy_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the policy MLP
        value_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the value MLP
        val_type (str): Value type of the critic network
        discrete (bool): True if the action space is discrete, else False
        sac (bool): True if a SAC-like network is needed, else False
        activation (str): Activation function to be used. Can be either "tanh" or "relu"
    """

    def __init__(
        self,
        state_dim: spaces.Space,
        action_dim: spaces.Space,
        shared_layers: Tuple = (32, 32),
        policy_layers: Tuple = (32, 32),
        value_layers: Tuple = (32, 32),
        val_type: str = "V",
        discrete: bool = True,
        **kwargs,
    ):
        super(MlpSharedActorCritic, self).__init__()

        self.shared_network = mlp([state_dim] + list(shared_layers))
        self.actor = MlpPolicy(
            shared_layers[-1], action_dim, policy_layers, discrete, **kwargs
        )
        self.critic = MlpValue(
            shared_layers[-1], action_dim, val_type, value_layers, **kwargs
        )
        self.state_dim = state_dim
        self.action_dim = action_dim

    def get_params(self):
        actor_params = list(self.shared_network.parameters()) + list(
            self.actor.parameters()
        )
        critic_params = list(self.shared_network.parameters()) + list(
            self.critic.parameters()
        )
        return actor_params, critic_params

    def get_features(self, state: torch.Tensor):
        """Extract features from the state, which are then an input to get_action and get_value

        Args:
            state (:obj:`torch.Tensor`): The state(s) being passed

        Returns:
            features (:obj:`torch.Tensor`): The feature(s) extracted from the state
        """
        features = self.shared_network(state)
        return features

    def get_action(self, state: torch.Tensor, deterministic: bool = False):
        """Get Actions from the actor

        Args:
            state (:obj:`torch.Tensor`): The state(s) being passed to the actor
            deterministic (bool): True if the action should be chosen deterministically, else False

        Returns:
            action (:obj:`torch.Tensor`): Action(s) as estimated by the actor
            distribution: The distribution from which the action was sampled
                (None if deterministic)
        """
        state = torch.as_tensor(state).float()
        shared_features = self.get_features(state)
        action_probs = self.actor(shared_features)
        action_probs = nn.Softmax(dim=-1)(action_probs)

        if deterministic:
            action = torch.argmax(action_probs, dim=-1).unsqueeze(-1).float()
            distribution = None
        else:
            distribution = Categorical(probs=action_probs)
            action = distribution.sample()

        return action, distribution

    def get_value(self, state: torch.Tensor):
        """Get Values from the Critic

        Args:
            state (:obj:`torch.Tensor`): The state(s) being passed to the critic

        Returns:
            values (:obj:`torch.Tensor`): Values as estimated by the critic
        """
        state = torch.as_tensor(state).float()

        if self.critic.val_type == "Qsa":
            # state shape = [batch_size, number of vec envs, (state_dim + action_dim)]

            # extract shared_features from just the state
            # state[:, :, :-action_dim] -> [batch_size, number of vec envs, state_dim]
            shared_features = self.shared_network(state[:, :, : -self.action_dim])

            # concatenate the actions to the extracted shared_features
            # state[:, :, -action_dim:] -> [batch_size, number of vec envs, action_dim]
            shared_features = torch.cat(
                [shared_features, state[:, :, -self.action_dim :]], dim=-1
            )

            value = self.critic(shared_features).float().squeeze(-1)
        else:
            shared_features = self.shared_network(state)
            value = self.critic(shared_features)

        return value

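# A minimal usage sketch (illustrative only, not part of the module): a shared trunk
# feeding a discrete actor and a V-type critic. It assumes state_dim/action_dim can
# be plain ints (the shared mlp is built from state_dim directly); the function name
# and dimensions below are hypothetical.
def _example_shared_actor_critic():
    import torch  # local import keeps the sketch self-contained

    model = MlpSharedActorCritic(
        state_dim=4, action_dim=2, shared_layers=(32, 32), val_type="V", discrete=True
    )

    state = torch.randn(8, 4)
    action, dist = model.get_action(state)                       # sampled from a Categorical
    greedy_action, _ = model.get_action(state, deterministic=True)
    value = model.get_value(state)                               # V(s) from the shared features
    return action, dist, greedy_action, value
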
class CNNActorCritic(BaseActorCritic):
    """CNN Actor-Critic

    :param framestack: Number of previous frames to stack together
    :param action_dim: Action dimensions of the environment
    :param policy_layers: Sizes of hidden layers in the policy MLP
    :param value_layers: Sizes of hidden layers in the value MLP
    :param val_type: Specifies type of value function: (
"V" for V(s), "Qs" for Q(s), "Qsa" for Q(s,a))
    :param discrete: True if action space is discrete, else False
    :type framestack: int
    :type action_dim: int
    :type policy_layers: tuple or list
    :type value_layers: tuple or list
    :type val_type: str
    :type discrete: bool
    """

    def __init__(
        self,
        framestack: int,
        action_dim: spaces.Space,
        policy_layers: Tuple = (256,),
        value_layers: Tuple = (256,),
        val_type: str = "V",
        discrete: bool = True,
        *args,
        **kwargs,
    ):
        super(CNNActorCritic, self).__init__()

        self.feature, output_size = cnn((framestack, 16, 32))
        self.actor = MlpPolicy(
            output_size, action_dim, policy_layers, discrete, **kwargs
        )
        self.critic = MlpValue(output_size, action_dim, val_type, value_layers)

    def get_params(self):
        actor_params = list(self.feature.parameters()) + list(self.actor.parameters())
        critic_params = list(self.feature.parameters()) + list(
            self.critic.parameters()
        )
        return actor_params, critic_params

    def get_action(
        self, state: torch.Tensor, deterministic: bool = False
    ) -> torch.Tensor:
        """Get action from the Actor based on input

        :param state: The state being passed as input to the Actor
        :param deterministic: True if the action should be chosen deterministically, else False
        :type state: Tensor
        :type deterministic: bool
        :returns: action, distribution (None if deterministic)
        """
        state = self.feature(state)
        state = state.view(state.size(0), -1)

        action_probs = self.actor(state)
        action_probs = nn.Softmax(dim=-1)(action_probs)

        if deterministic:
            action = torch.argmax(action_probs, dim=-1)
            distribution = None
        else:
            distribution = Categorical(probs=action_probs)
            action = distribution.sample()

        return action, distribution

    def get_value(self, inp: torch.Tensor) -> torch.Tensor:
        """Get value from the Critic based on input

        :param inp: Input to the Critic
        :type inp: Tensor
        :returns: value
        """
        inp = self.feature(inp)
        inp = inp.view(inp.size(0), -1)

        value = self.critic(inp).squeeze(-1)
        return value

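# A minimal usage sketch (illustrative only, not part of the module): driving
# CNNActorCritic with a batch of stacked frames. The 84x84 resolution is an
# assumption about the input size the `cnn` helper is sized for; if the helper
# expects a different resolution, the flattened feature size will not match the
# actor/critic input layers. Function name and dimensions below are hypothetical.
def _example_cnn_actor_critic():
    import torch  # local import keeps the sketch self-contained

    framestack, num_actions = 4, 6
    model = CNNActorCritic(framestack=framestack, action_dim=num_actions, discrete=True)

    frames = torch.randn(8, framestack, 84, 84)  # (batch, framestack, height, width)
    action, dist = model.get_action(frames)      # Categorical sample over num_actions
    value = model.get_value(frames)              # V(s), one value per batch element
    return action, dist, value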