# Imports assumed from the on-policy (MAPPO) repo layout; exact module paths
# may differ across forks of this codebase.
import torch
import torch.nn as nn

from onpolicy.algorithms.utils.util import init, check
from onpolicy.algorithms.utils.cnn import CNNBase
from onpolicy.algorithms.utils.mlp import MLPBase, MLPLayer
from onpolicy.algorithms.utils.mix import MIXBase
from onpolicy.algorithms.utils.rnn import RNNLayer
from onpolicy.algorithms.utils.act import ACTLayer
from onpolicy.algorithms.utils.popart import PopArt
from onpolicy.utils.util import get_shape_from_obs_space


# Lightweight actor variant: no device argument, so tensors stay on the
# default device, and the module is never moved with .to(device).
class actor(nn.Module):
    def __init__(self, args, obs_space, action_space, hidden_size=64, use_recurrent_policy=True):
        super(actor, self).__init__()
        self.hidden_size = hidden_size
        self._use_recurrent_policy = use_recurrent_policy
        self._gain = args.gain
        self._use_orthogonal = args.use_orthogonal
        self._recurrent_N = args.recurrent_N
        self.tpdv = dict(dtype=torch.float32)

        obs_shape = get_shape_from_obs_space(obs_space)
        base = CNNBase if len(obs_shape) == 3 else MLPBase
        self.base = base(args, obs_shape)

        if self._use_recurrent_policy:
            self.rnn = RNNLayer(self.hidden_size, self.hidden_size, self._recurrent_N, self._use_orthogonal)

        self.act = ACTLayer(action_space, self.hidden_size, self._use_orthogonal, self._gain)
class R_Actor(nn.Module):
    """
    Actor network class for MAPPO. Outputs actions given observations.
    :param args: (argparse.Namespace) arguments containing relevant model information.
    :param obs_space: (gym.Space) observation space.
    :param action_space: (gym.Space) action space.
    :param device: (torch.device) specifies the device to run on (cpu/gpu).
    """
    def __init__(self, args, obs_space, action_space, device=torch.device("cpu")):
        super(R_Actor, self).__init__()
        self.hidden_size = args.hidden_size
        self._gain = args.gain
        self._use_orthogonal = args.use_orthogonal
        self._use_policy_active_masks = args.use_policy_active_masks
        self._use_naive_recurrent_policy = args.use_naive_recurrent_policy
        self._use_recurrent_policy = args.use_recurrent_policy
        self._recurrent_N = args.recurrent_N
        self.tpdv = dict(dtype=torch.float32, device=device)

        obs_shape = get_shape_from_obs_space(obs_space)
        base = CNNBase if len(obs_shape) == 3 else MLPBase
        self.base = base(args, obs_shape)

        if self._use_naive_recurrent_policy or self._use_recurrent_policy:
            self.rnn = RNNLayer(self.hidden_size, self.hidden_size, self._recurrent_N, self._use_orthogonal)

        self.act = ACTLayer(action_space, self.hidden_size, self._use_orthogonal, self._gain)

        self.to(device)

    def forward(self, obs, rnn_states, masks, available_actions=None, deterministic=False):
        """
        Compute actions from the given inputs.
        :param obs: (np.ndarray / torch.Tensor) observation inputs into network.
        :param rnn_states: (np.ndarray / torch.Tensor) if RNN network, hidden states for RNN.
        :param masks: (np.ndarray / torch.Tensor) mask tensor denoting if hidden states should be reinitialized to zeros.
        :param available_actions: (np.ndarray / torch.Tensor) denotes which actions are available to agent (if None, all actions available).
        :param deterministic: (bool) whether to sample from action distribution or return the mode.

        :return actions: (torch.Tensor) actions to take.
        :return action_log_probs: (torch.Tensor) log probabilities of taken actions.
        :return rnn_states: (torch.Tensor) updated RNN hidden states.
        """
        obs = check(obs).to(**self.tpdv)
        rnn_states = check(rnn_states).to(**self.tpdv)
        masks = check(masks).to(**self.tpdv)
        if available_actions is not None:
            available_actions = check(available_actions).to(**self.tpdv)

        actor_features = self.base(obs)

        if self._use_naive_recurrent_policy or self._use_recurrent_policy:
            actor_features, rnn_states = self.rnn(actor_features, rnn_states, masks)

        actions, action_log_probs = self.act(actor_features, available_actions, deterministic)

        return actions, action_log_probs, rnn_states

    def evaluate_actions(self, obs, rnn_states, action, masks, available_actions=None, active_masks=None):
        """
        Compute log probability and entropy of given actions.
        :param obs: (torch.Tensor) observation inputs into network.
        :param action: (torch.Tensor) actions whose entropy and log probability to evaluate.
        :param rnn_states: (torch.Tensor) if RNN network, hidden states for RNN.
        :param masks: (torch.Tensor) mask tensor denoting if hidden states should be reinitialized to zeros.
        :param available_actions: (torch.Tensor) denotes which actions are available to agent (if None, all actions available).
        :param active_masks: (torch.Tensor) denotes whether an agent is active or dead.

        :return action_log_probs: (torch.Tensor) log probabilities of the input actions.
        :return dist_entropy: (torch.Tensor) action distribution entropy for the given inputs.
        """
        obs = check(obs).to(**self.tpdv)
        rnn_states = check(rnn_states).to(**self.tpdv)
        action = check(action).to(**self.tpdv)
        masks = check(masks).to(**self.tpdv)
        if available_actions is not None:
            available_actions = check(available_actions).to(**self.tpdv)
        if active_masks is not None:
            active_masks = check(active_masks).to(**self.tpdv)

        actor_features = self.base(obs)

        if self._use_naive_recurrent_policy or self._use_recurrent_policy:
            actor_features, rnn_states = self.rnn(actor_features, rnn_states, masks)

        action_log_probs, dist_entropy = self.act.evaluate_actions(
            actor_features, action, available_actions,
            active_masks=active_masks if self._use_policy_active_masks else None)

        return action_log_probs, dist_entropy
# Joint recurrent actor-critic model: get_actions / get_probs / evaluate_actions
# serve the policy, get_values serves the (optionally centralized) critic, and
# both heads share the common MLP trunk and RNN.
class R_Model(nn.Module):
    def __init__(self, args, obs_space, share_obs_space, action_space, device=torch.device("cpu"), cat_self=True):
        super(R_Model, self).__init__()
        self._gain = args.gain
        self._use_orthogonal = args.use_orthogonal
        self._use_ReLU = args.use_ReLU
        self._recurrent_N = args.recurrent_N
        self._use_naive_recurrent_policy = args.use_naive_recurrent_policy
        self._use_recurrent_policy = args.use_recurrent_policy
        self._use_centralized_V = args.use_centralized_V
        self.hidden_size = args.hidden_size
        self.device = device
        self.tpdv = dict(dtype=torch.float32, device=device)
        init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][self._use_orthogonal]

        # obs space
        obs_shape = get_shape_from_obs_space(obs_space)
        base = CNNBase if len(obs_shape) == 3 else MLPBase
        self.obs_prep = base(args, obs_shape)

        # share obs space
        if self._use_centralized_V:
            share_obs_shape = get_shape_from_obs_space(share_obs_space)
            self.share_obs_prep = base(args, share_obs_shape, cat_self)
        else:
            # Decentralized V: the critic reuses (and shares parameters with)
            # the actor's observation preprocessor.
            self.share_obs_prep = self.obs_prep

        # common layer
        self.common = MLPLayer(self.hidden_size, self.hidden_size, layer_N=0,
                               use_orthogonal=self._use_orthogonal, use_ReLU=self._use_ReLU)

        if self._use_naive_recurrent_policy or self._use_recurrent_policy:
            self.rnn = RNNLayer(self.hidden_size, self.hidden_size, self._recurrent_N, self._use_orthogonal)

        def init_(m):
            return init(m, init_method, lambda x: nn.init.constant_(x, 0))

        # value
        self.v_out = init_(nn.Linear(self.hidden_size, 1))

        # action
        self.act = ACTLayer(action_space, self.hidden_size, self._use_orthogonal, self._gain)

        self.to(self.device)

    def get_actions(self, obs, rnn_states, masks, available_actions=None, deterministic=False):
        obs = check(obs).to(**self.tpdv)
        rnn_states = check(rnn_states).to(**self.tpdv)
        masks = check(masks).to(**self.tpdv)
        if available_actions is not None:
            available_actions = check(available_actions).to(**self.tpdv)

        x = self.obs_prep(obs)

        # common
        actor_features = self.common(x)

        if self._use_naive_recurrent_policy or self._use_recurrent_policy:
            actor_features, rnn_states = self.rnn(actor_features, rnn_states, masks)

        actions, action_log_probs = self.act(actor_features, available_actions, deterministic)

        return actions, action_log_probs, rnn_states

    def get_probs(self, obs, rnn_states, masks, available_actions=None):
        obs = check(obs).to(**self.tpdv)
        rnn_states = check(rnn_states).to(**self.tpdv)
        masks = check(masks).to(**self.tpdv)
        if available_actions is not None:
            available_actions = check(available_actions).to(**self.tpdv)

        x = self.obs_prep(obs)

        # common
        actor_features = self.common(x)

        if self._use_naive_recurrent_policy or self._use_recurrent_policy:
            actor_features, rnn_states = self.rnn(actor_features, rnn_states, masks)

        action_probs = self.act.get_probs(actor_features, available_actions)

        return action_probs

    def evaluate_actions(self, obs, rnn_states, action, masks, available_actions=None, active_masks=None):
        obs = check(obs).to(**self.tpdv)
        rnn_states = check(rnn_states).to(**self.tpdv)
        action = check(action).to(**self.tpdv)
        masks = check(masks).to(**self.tpdv)
        if available_actions is not None:
            available_actions = check(available_actions).to(**self.tpdv)
        if active_masks is not None:
            active_masks = check(active_masks).to(**self.tpdv)

        x = self.obs_prep(obs)

        actor_features = self.common(x)

        if self._use_naive_recurrent_policy or self._use_recurrent_policy:
            actor_features, rnn_states = self.rnn(actor_features, rnn_states, masks)

        action_log_probs, dist_entropy = self.act.evaluate_actions(
            actor_features, action, available_actions, active_masks)

        return action_log_probs, dist_entropy

    def get_values(self, share_obs, rnn_states, masks):
        share_obs = check(share_obs).to(**self.tpdv)
        rnn_states = check(rnn_states).to(**self.tpdv)
        masks = check(masks).to(**self.tpdv)

        share_x = self.share_obs_prep(share_obs)

        critic_features = self.common(share_x)

        if self._use_naive_recurrent_policy or self._use_recurrent_policy:
            critic_features, rnn_states = self.rnn(critic_features, rnn_states, masks)

        values = self.v_out(critic_features)

        return values, rnn_states
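
# A small sketch of the parameter-sharing choice in R_Model.__init__, with
# plain nn.Linear stand-ins for CNNBase/MLPBase and assumed dimensions: when
# use_centralized_V is off, share_obs_prep and obs_prep are the same module
# object, so actor and critic preprocessing share weights and gradients.
import torch.nn as nn

obs_dim, joint_obs_dim, hidden = 18, 54, 64
use_centralized_V = False

obs_prep = nn.Linear(obs_dim, hidden)          # stand-in for the per-agent prep
share_obs_prep = (nn.Linear(joint_obs_dim, hidden)
                  if use_centralized_V else obs_prep)

# With decentralized V the two names alias one module: updating one
# updates the other, since they hold the same parameters.
assert share_obs_prep is obs_prep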
# R_Actor variant supporting Dict observations (via MIXBase), an auxiliary
# "influence" MLP over the raw observation, and an optional policy value head
# (Linear or PopArt).
class R_Actor(nn.Module):
    def __init__(self, args, obs_space, action_space, device=torch.device("cpu")):
        super(R_Actor, self).__init__()
        self.hidden_size = args.hidden_size
        self._gain = args.gain
        self._use_orthogonal = args.use_orthogonal
        self._activation_id = args.activation_id
        self._use_policy_active_masks = args.use_policy_active_masks
        self._use_naive_recurrent_policy = args.use_naive_recurrent_policy
        self._use_recurrent_policy = args.use_recurrent_policy
        self._use_influence_policy = args.use_influence_policy
        self._influence_layer_N = args.influence_layer_N
        self._use_policy_vhead = args.use_policy_vhead
        self._use_popart = args.use_popart
        self._recurrent_N = args.recurrent_N
        self.tpdv = dict(dtype=torch.float32, device=device)

        obs_shape = get_shape_from_obs_space(obs_space)

        if 'Dict' in obs_shape.__class__.__name__:
            self._mixed_obs = True
            self.base = MIXBase(args, obs_shape, cnn_layers_params=args.cnn_layers_params)
        else:
            self._mixed_obs = False
            self.base = (CNNBase(args, obs_shape) if len(obs_shape) == 3
                         else MLPBase(args, obs_shape, use_attn_internal=args.use_attn_internal, use_cat_self=True))

        input_size = self.base.output_size

        if self._use_naive_recurrent_policy or self._use_recurrent_policy:
            self.rnn = RNNLayer(input_size, self.hidden_size, self._recurrent_N, self._use_orthogonal)
            input_size = self.hidden_size

        if self._use_influence_policy:
            self.mlp = MLPLayer(obs_shape[0], self.hidden_size,
                                self._influence_layer_N, self._use_orthogonal, self._activation_id)
            input_size += self.hidden_size

        self.act = ACTLayer(action_space, input_size, self._use_orthogonal, self._gain)

        if self._use_policy_vhead:
            init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][self._use_orthogonal]

            def init_(m):
                return init(m, init_method, lambda x: nn.init.constant_(x, 0))

            if self._use_popart:
                self.v_out = init_(PopArt(input_size, 1, device=device))
            else:
                self.v_out = init_(nn.Linear(input_size, 1))

        self.to(device)

    def forward(self, obs, rnn_states, masks, available_actions=None, deterministic=False):
        if self._mixed_obs:
            for key in obs.keys():
                obs[key] = check(obs[key]).to(**self.tpdv)
        else:
            obs = check(obs).to(**self.tpdv)
        rnn_states = check(rnn_states).to(**self.tpdv)
        masks = check(masks).to(**self.tpdv)
        if available_actions is not None:
            available_actions = check(available_actions).to(**self.tpdv)

        actor_features = self.base(obs)

        if self._use_naive_recurrent_policy or self._use_recurrent_policy:
            actor_features, rnn_states = self.rnn(actor_features, rnn_states, masks)

        if self._use_influence_policy:
            mlp_obs = self.mlp(obs)
            actor_features = torch.cat([actor_features, mlp_obs], dim=1)

        actions, action_log_probs = self.act(actor_features, available_actions, deterministic)

        return actions, action_log_probs, rnn_states

    def evaluate_actions(self, obs, rnn_states, action, masks, available_actions=None, active_masks=None):
        if self._mixed_obs:
            for key in obs.keys():
                obs[key] = check(obs[key]).to(**self.tpdv)
        else:
            obs = check(obs).to(**self.tpdv)
        rnn_states = check(rnn_states).to(**self.tpdv)
        action = check(action).to(**self.tpdv)
        masks = check(masks).to(**self.tpdv)
        if available_actions is not None:
            available_actions = check(available_actions).to(**self.tpdv)
        if active_masks is not None:
            active_masks = check(active_masks).to(**self.tpdv)

        actor_features = self.base(obs)

        if self._use_naive_recurrent_policy or self._use_recurrent_policy:
            actor_features, rnn_states = self.rnn(actor_features, rnn_states, masks)

        if self._use_influence_policy:
            mlp_obs = self.mlp(obs)
            actor_features = torch.cat([actor_features, mlp_obs], dim=1)

        action_log_probs, dist_entropy = self.act.evaluate_actions(
            actor_features, action, available_actions,
            active_masks=active_masks if self._use_policy_active_masks else None)

        values = self.v_out(actor_features) if self._use_policy_vhead else None

        return action_log_probs, dist_entropy, values

    def get_policy_values(self, obs, rnn_states, masks):
        if self._mixed_obs:
            for key in obs.keys():
                obs[key] = check(obs[key]).to(**self.tpdv)
        else:
            obs = check(obs).to(**self.tpdv)
        rnn_states = check(rnn_states).to(**self.tpdv)
        masks = check(masks).to(**self.tpdv)

        actor_features = self.base(obs)

        if self._use_naive_recurrent_policy or self._use_recurrent_policy:
            actor_features, rnn_states = self.rnn(actor_features, rnn_states, masks)

        if self._use_influence_policy:
            mlp_obs = self.mlp(obs)
            actor_features = torch.cat([actor_features, mlp_obs], dim=1)

        values = self.v_out(actor_features)

        return values
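
# A minimal sketch (plain PyTorch stand-ins, assumed dimensions) of the
# influence-policy width bookkeeping above: the raw observation goes through a
# side MLP and is concatenated with the recurrent features, so the action head
# must be sized for hidden + hidden inputs.
import torch
import torch.nn as nn

obs_dim, hidden, n_actions = 18, 64, 5
side_mlp = nn.Linear(obs_dim, hidden)         # stand-in for the influence MLPLayer
head = nn.Linear(hidden + hidden, n_actions)  # stand-in for ACTLayer(input_size=128)

obs = torch.randn(8, obs_dim)
actor_features = torch.randn(8, hidden)       # output of base + RNN
features = torch.cat([actor_features, side_mlp(obs)], dim=1)  # (8, 128)
logits = head(features)                                       # (8, 5)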
# R_Actor variant with an optional value head on the policy features
# (use_policy_vhead); evaluate_actions then also returns values.
class R_Actor(nn.Module):
    def __init__(self, args, obs_space, action_space, device=torch.device("cpu")):
        super(R_Actor, self).__init__()
        self.hidden_size = args.hidden_size
        self._gain = args.gain
        self._use_orthogonal = args.use_orthogonal
        self._use_policy_active_masks = args.use_policy_active_masks
        self._use_naive_recurrent_policy = args.use_naive_recurrent_policy
        self._use_recurrent_policy = args.use_recurrent_policy
        self._use_policy_vhead = args.use_policy_vhead
        self._recurrent_N = args.recurrent_N
        self.tpdv = dict(dtype=torch.float32, device=device)

        obs_shape = get_shape_from_obs_space(obs_space)
        base = CNNBase if len(obs_shape) == 3 else MLPBase
        self.base = base(args, obs_shape)

        if self._use_naive_recurrent_policy or self._use_recurrent_policy:
            self.rnn = RNNLayer(self.hidden_size, self.hidden_size, self._recurrent_N, self._use_orthogonal)

        self.act = ACTLayer(action_space, self.hidden_size, self._use_orthogonal, self._gain)

        if self._use_policy_vhead:
            init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][self._use_orthogonal]

            def init_(m):
                return init(m, init_method, lambda x: nn.init.constant_(x, 0))

            self.v_out = init_(nn.Linear(self.hidden_size, 1))

        self.to(device)

    def forward(self, obs, rnn_states, masks, available_actions=None, deterministic=False):
        obs = check(obs).to(**self.tpdv)
        rnn_states = check(rnn_states).to(**self.tpdv)
        masks = check(masks).to(**self.tpdv)
        if available_actions is not None:
            available_actions = check(available_actions).to(**self.tpdv)

        actor_features = self.base(obs)

        if self._use_naive_recurrent_policy or self._use_recurrent_policy:
            actor_features, rnn_states = self.rnn(actor_features, rnn_states, masks)

        actions, action_log_probs = self.act(actor_features, available_actions, deterministic)

        return actions, action_log_probs, rnn_states

    def evaluate_actions(self, obs, rnn_states, action, masks, available_actions=None, active_masks=None):
        obs = check(obs).to(**self.tpdv)
        rnn_states = check(rnn_states).to(**self.tpdv)
        action = check(action).to(**self.tpdv)
        masks = check(masks).to(**self.tpdv)
        if available_actions is not None:
            available_actions = check(available_actions).to(**self.tpdv)
        if active_masks is not None:
            active_masks = check(active_masks).to(**self.tpdv)

        actor_features = self.base(obs)

        if self._use_naive_recurrent_policy or self._use_recurrent_policy:
            actor_features, rnn_states = self.rnn(actor_features, rnn_states, masks)

        action_log_probs, dist_entropy = self.act.evaluate_actions(
            actor_features, action, available_actions,
            active_masks=active_masks if self._use_policy_active_masks else None)

        values = self.v_out(actor_features) if self._use_policy_vhead else None

        return action_log_probs, dist_entropy, values

    def get_policy_values(self, obs, rnn_states, masks):
        obs = check(obs).to(**self.tpdv)
        rnn_states = check(rnn_states).to(**self.tpdv)
        masks = check(masks).to(**self.tpdv)

        actor_features = self.base(obs)

        if self._use_naive_recurrent_policy or self._use_recurrent_policy:
            actor_features, rnn_states = self.rnn(actor_features, rnn_states, masks)

        values = self.v_out(actor_features)

        return values
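
# A standalone sketch (plain PyTorch; names and sizes are stand-ins) of the
# init pattern used for v_out above, and of the optional value output in
# evaluate_actions. The repo's init(...) helper applies a weight init plus a
# bias init; this inlines the same effect for a single linear value head.
import torch
import torch.nn as nn

use_orthogonal, use_policy_vhead, hidden = True, True, 64

init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][use_orthogonal]
v_out = nn.Linear(hidden, 1)
init_method(v_out.weight)
nn.init.constant_(v_out.bias, 0)

actor_features = torch.randn(8, hidden)
# evaluate_actions returns None in the values slot when the head is disabled.
values = v_out(actor_features) if use_policy_vhead else None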