def __init__(self, args, obs_space, share_obs_space, action_space,
             device=torch.device("cpu"), cat_self=True):
    """Assemble the sub-modules of ``R_Model`` and move them to ``device``.

    Args:
        args: Configuration object. This constructor reads ``gain``,
            ``use_orthogonal``, ``use_ReLU``, ``recurrent_N``,
            ``use_naive_recurrent_policy``, ``use_recurrent_policy``,
            ``use_centralized_V`` and ``hidden_size`` from it, and also
            passes it on to the observation encoder(s).
        obs_space: Observation space; its shape selects the encoder class.
        share_obs_space: Shared observation space. Only consulted when
            ``args.use_centralized_V`` is truthy.
        action_space: Forwarded to ``ACTLayer``.
        device: Device the module is moved to; also stored in ``self.tpdv``.
        cat_self: Passed as the third positional argument of the shared
            observation encoder.
    """
    super(R_Model, self).__init__()

    # Configuration copied from ``args``.
    self._gain = args.gain
    self._use_orthogonal = args.use_orthogonal
    self._use_ReLU = args.use_ReLU
    self._recurrent_N = args.recurrent_N
    self._use_naive_recurrent_policy = args.use_naive_recurrent_policy
    self._use_recurrent_policy = args.use_recurrent_policy
    self._use_centralized_V = args.use_centralized_V
    self.hidden_size = args.hidden_size
    self.device = device
    self.tpdv = dict(dtype=torch.float32, device=device)

    # The flag indexes the pair: falsy -> xavier_uniform_, truthy -> orthogonal_.
    weight_inits = (nn.init.xavier_uniform_, nn.init.orthogonal_)
    init_method = weight_inits[self._use_orthogonal]

    def _zero_bias(bias):
        return nn.init.constant_(bias, 0)

    # Observation encoder: a rank-3 shape selects CNNBase, anything else MLPBase.
    obs_shape = get_shape_from_obs_space(obs_space)
    if len(obs_shape) == 3:
        base = CNNBase
    else:
        base = MLPBase
    self.obs_prep = base(args, obs_shape)

    # Shared-observation encoder. Without a centralized V the very same
    # encoder object is reused. The encoder class is the one chosen from
    # ``obs_shape`` above.
    if not self._use_centralized_V:
        self.share_obs_prep = self.obs_prep
    else:
        share_obs_shape = get_shape_from_obs_space(share_obs_space)
        self.share_obs_prep = base(args, share_obs_shape, cat_self)

    # Common layer.
    self.common = MLPLayer(
        self.hidden_size,
        self.hidden_size,
        layer_N=0,
        use_orthogonal=self._use_orthogonal,
        use_ReLU=self._use_ReLU,
    )

    # Optional recurrent layer, enabled by either recurrent flag.
    wants_rnn = self._use_naive_recurrent_policy or self._use_recurrent_policy
    if wants_rnn:
        self.rnn = RNNLayer(
            self.hidden_size,
            self.hidden_size,
            self._recurrent_N,
            self._use_orthogonal,
        )

    # Value output: a single linear unit with zero-initialised bias.
    value_head = nn.Linear(self.hidden_size, 1)
    self.v_out = init(value_head, init_method, _zero_bias)

    # Action output.
    self.act = ACTLayer(
        action_space,
        self.hidden_size,
        self._use_orthogonal,
        self._gain,
    )

    self.to(device)
def __init__(self, args, num_agents, obs_space, cent_obs_space, act_space):
    """Allocate the NumPy arrays that hold one rollout for all agents.

    Arrays whose first dimension is ``episode_length + 1`` have one more
    slot than the per-step arrays (actions, log-probs, rewards), whose
    first dimension is ``episode_length``.

    Args:
        args: Configuration object. Reads ``episode_length``,
            ``n_rollout_threads``, ``hidden_size``, ``recurrent_N``,
            ``gamma``, ``gae_lambda``, ``use_gae``, ``use_popart``,
            ``use_valuenorm`` and ``use_proper_time_limits``.
        num_agents: Size of the agent dimension of every array.
        obs_space: Per-agent observation space.
        cent_obs_space: Shared (centralized) observation space.
        act_space: Action space. ``available_actions`` is only allocated
            when its class is named ``Discrete``.
    """
    self.episode_length = args.episode_length
    self.n_rollout_threads = args.n_rollout_threads
    self.hidden_size = args.hidden_size
    self.recurrent_N = args.recurrent_N
    self.gamma = args.gamma
    self.gae_lambda = args.gae_lambda
    self._use_gae = args.use_gae
    self._use_popart = args.use_popart
    self._use_valuenorm = args.use_valuenorm
    self._use_proper_time_limits = args.use_proper_time_limits

    obs_shape = get_shape_from_obs_space(obs_space)
    share_obs_shape = get_shape_from_obs_space(cent_obs_space)

    # When the last entry of a shape is itself a list, only the first entry
    # is kept. isinstance() is used instead of an exact type comparison so
    # that list subclasses are treated the same way.
    if isinstance(obs_shape[-1], list):
        obs_shape = obs_shape[:1]
    if isinstance(share_obs_shape[-1], list):
        share_obs_shape = share_obs_shape[:1]

    # Leading dimensions shared by the arrays below.
    per_step = (self.episode_length, self.n_rollout_threads, num_agents)
    steps_plus_one = (self.episode_length + 1, self.n_rollout_threads, num_agents)

    self.share_obs = np.zeros((*steps_plus_one, *share_obs_shape), dtype=np.float32)
    self.obs = np.zeros((*steps_plus_one, *obs_shape), dtype=np.float32)

    self.rnn_states = np.zeros(
        (*steps_plus_one, self.recurrent_N, self.hidden_size),
        dtype=np.float32,
    )
    self.rnn_states_critic = np.zeros_like(self.rnn_states)

    self.value_preds = np.zeros((*steps_plus_one, 1), dtype=np.float32)
    self.returns = np.zeros_like(self.value_preds)

    # The space is identified by class name, so no gym import is needed here.
    if act_space.__class__.__name__ == 'Discrete':
        self.available_actions = np.ones(
            (*steps_plus_one, act_space.n), dtype=np.float32
        )
    else:
        self.available_actions = None

    # ``act_shape`` is used directly as the size of the last dimension.
    act_shape = get_shape_from_act_space(act_space)

    self.actions = np.zeros((*per_step, act_shape), dtype=np.float32)
    self.action_log_probs = np.zeros((*per_step, act_shape), dtype=np.float32)
    self.rewards = np.zeros((*per_step, 1), dtype=np.float32)

    # All mask arrays start filled with ones.
    self.masks = np.ones((*steps_plus_one, 1), dtype=np.float32)
    self.bad_masks = np.ones_like(self.masks)
    self.active_masks = np.ones_like(self.masks)

    self.step = 0
def __init__(self, args, obs_space, action_space, device=torch.device("cpu")):
    """Build the ``R_Actor`` sub-modules and move them to ``device``.

    Args:
        args: Configuration object. Reads ``hidden_size``, ``gain``,
            ``use_orthogonal``, ``use_policy_active_masks``,
            ``use_naive_recurrent_policy``, ``use_recurrent_policy`` and
            ``recurrent_N``; it is also passed to the observation encoder.
        obs_space: Observation space; its shape selects the encoder class.
        action_space: Forwarded to ``ACTLayer``.
        device: Device the module is moved to; also stored in ``self.tpdv``.
    """
    super(R_Actor, self).__init__()

    # Configuration copied from ``args``.
    self.hidden_size = args.hidden_size
    self._gain = args.gain
    self._use_orthogonal = args.use_orthogonal
    self._use_policy_active_masks = args.use_policy_active_masks
    self._use_naive_recurrent_policy = args.use_naive_recurrent_policy
    self._use_recurrent_policy = args.use_recurrent_policy
    self._recurrent_N = args.recurrent_N
    self.tpdv = dict(dtype=torch.float32, device=device)

    # A rank-3 observation shape selects CNNBase, anything else MLPBase.
    obs_shape = get_shape_from_obs_space(obs_space)
    if len(obs_shape) == 3:
        encoder_cls = CNNBase
    else:
        encoder_cls = MLPBase
    self.base = encoder_cls(args, obs_shape)

    # Optional recurrent layer, enabled by either recurrent flag.
    wants_rnn = self._use_naive_recurrent_policy or self._use_recurrent_policy
    if wants_rnn:
        self.rnn = RNNLayer(
            self.hidden_size,
            self.hidden_size,
            self._recurrent_N,
            self._use_orthogonal,
        )

    self.act = ACTLayer(
        action_space,
        self.hidden_size,
        self._use_orthogonal,
        self._gain,
    )

    self.to(device)
def __init__(self, args, cent_obs_space, device=torch.device("cpu")):
    """Build the ``R_Critic`` sub-modules and move them to ``device``.

    Args:
        args: Configuration object. Reads ``hidden_size``,
            ``use_orthogonal``, ``use_naive_recurrent_policy``,
            ``use_recurrent_policy``, ``recurrent_N`` and ``use_popart``;
            it is also passed to the observation encoder.
        cent_obs_space: Centralized observation space; its shape selects
            the encoder class.
        device: Device the module is moved to; also stored in ``self.tpdv``
            and passed to ``PopArt`` when that head is used.
    """
    super(R_Critic, self).__init__()

    # Configuration copied from ``args``.
    self.hidden_size = args.hidden_size
    self._use_orthogonal = args.use_orthogonal
    self._use_naive_recurrent_policy = args.use_naive_recurrent_policy
    self._use_recurrent_policy = args.use_recurrent_policy
    self._recurrent_N = args.recurrent_N
    self._use_popart = args.use_popart
    self.tpdv = dict(dtype=torch.float32, device=device)

    # The flag indexes the pair: falsy -> xavier_uniform_, truthy -> orthogonal_.
    weight_inits = (nn.init.xavier_uniform_, nn.init.orthogonal_)
    init_method = weight_inits[self._use_orthogonal]

    def _zero_bias(bias):
        return nn.init.constant_(bias, 0)

    # A rank-3 observation shape selects CNNBase, anything else MLPBase.
    cent_obs_shape = get_shape_from_obs_space(cent_obs_space)
    if len(cent_obs_shape) == 3:
        encoder_cls = CNNBase
    else:
        encoder_cls = MLPBase
    self.base = encoder_cls(args, cent_obs_shape)

    # Optional recurrent layer, enabled by either recurrent flag.
    wants_rnn = self._use_naive_recurrent_policy or self._use_recurrent_policy
    if wants_rnn:
        self.rnn = RNNLayer(
            self.hidden_size,
            self.hidden_size,
            self._recurrent_N,
            self._use_orthogonal,
        )

    # Single-output value head: PopArt when requested, else a plain Linear.
    if self._use_popart:
        value_head = PopArt(self.hidden_size, 1, device=device)
    else:
        value_head = nn.Linear(self.hidden_size, 1)
    self.v_out = init(value_head, init_method, _zero_bias)

    self.to(device)
def __init__(self, args, obs_space, action_space, hidden_size, use_recurrent_policy=False):
    """Describe a dueling network in ``self.config`` and build it.

    Args:
        args: Accepted but not read by this constructor.
        obs_space: Observation space; the product of its shape becomes the
            network's ``"in"`` size.
        action_space: Action space; its ``n`` becomes the ``"out"`` size.
        hidden_size: Indexable collection of layer sizes. The whole
            collection configures the base module, and element ``1`` alone
            configures both the value and the advantage streams.
        use_recurrent_policy: Accepted but not read by this constructor.
    """
    super(actor, self).__init__()

    self.obs_space = obs_space
    self.action_space = action_space
    obs_shape = get_shape_from_obs_space(obs_space)
    self.tpdv = dict(dtype=torch.float32)

    # Sections of the model description, assembled below in key order.
    base_section = {"layers": hidden_size}
    value_section = {"layers": [hidden_size[1]]}
    advantage_section = {"layers": [hidden_size[1]]}
    flat_obs_dim = int(np.prod(obs_shape))
    num_actions = self.action_space.n

    model_section = {
        "type": "DuelingNetwork",
        "base_module": base_section,
        "value": value_section,
        "advantage": advantage_section,
        "in": flat_obs_dim,
        "out": num_actions,
    }
    self.config = {"model": model_section}

    self.value_net = model_factory(self.config["model"])
def __init__(self, args, obs_space, action_space, hidden_size=64, use_recurrent_policy=True):
    """Build the encoder, optional recurrent layer and action layer.

    Unlike the device-aware constructors, this one takes no ``device``
    argument and does not move the module.

    Args:
        args: Configuration object. Reads ``gain``, ``use_orthogonal`` and
            ``recurrent_N``; it is also passed to the observation encoder.
        obs_space: Observation space; its shape selects the encoder class.
        action_space: Forwarded to ``ACTLayer``.
        hidden_size: Width used for the recurrent and action layers.
        use_recurrent_policy: When truthy, an ``RNNLayer`` is created.
    """
    super(actor, self).__init__()

    self.hidden_size = hidden_size
    self._use_recurrent_policy = use_recurrent_policy
    self._gain = args.gain
    self._use_orthogonal = args.use_orthogonal
    self._recurrent_N = args.recurrent_N
    self.tpdv = dict(dtype=torch.float32)

    # A rank-3 observation shape selects CNNBase, anything else MLPBase.
    obs_shape = get_shape_from_obs_space(obs_space)
    if len(obs_shape) == 3:
        encoder_cls = CNNBase
    else:
        encoder_cls = MLPBase
    self.base = encoder_cls(args, obs_shape)

    if self._use_recurrent_policy:
        self.rnn = RNNLayer(
            self.hidden_size,
            self.hidden_size,
            self._recurrent_N,
            self._use_orthogonal,
        )

    self.act = ACTLayer(
        action_space,
        self.hidden_size,
        self._use_orthogonal,
        self._gain,
    )
def __init__(self, args, cent_obs_space, action_space, device=torch.device("cpu")):
    """Build the ``R_Q_Head`` sub-modules and move them to ``device``.

    Args:
        args: Configuration object. Reads ``hidden_size``, ``gain``,
            ``use_orthogonal``, ``use_naive_recurrent_policy``,
            ``use_recurrent_policy``, ``recurrent_N`` and ``env_name``
            (plus ``state_shape`` when ``env_name`` is ``'GRFootball'``);
            it is also passed to the observation encoder.
        cent_obs_space: Centralized observation space. Ignored when
            ``args.env_name == 'GRFootball'``.
        action_space: Action space; its class name decides which output
            layer(s) are created.
        device: Device the module is moved to; also stored in ``self.tpdv``.
    """
    super(R_Q_Head, self).__init__()
    self.hidden_size = args.hidden_size
    self._gain = args.gain
    self._use_orthogonal = args.use_orthogonal
    self._use_naive_recurrent_policy = args.use_naive_recurrent_policy
    self._use_recurrent_policy = args.use_recurrent_policy
    self._recurrent_N = args.recurrent_N
    self.tpdv = dict(dtype=torch.float32, device=device)

    # The flag indexes the pair: falsy -> xavier_uniform_, truthy -> orthogonal_.
    init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][self._use_orthogonal]

    # For GRFootball the shape comes from args instead of the space. A
    # separate local is used so the parameter is not rebound to a shape.
    if args.env_name == 'GRFootball':
        cent_obs_shape = [args.state_shape]
    else:
        cent_obs_shape = get_shape_from_obs_space(cent_obs_space)

    # A rank-3 shape selects CNNBase, anything else MLPBase.
    base = CNNBase if len(cent_obs_shape) == 3 else MLPBase
    self.base = base(args, cent_obs_shape)

    if self._use_naive_recurrent_policy or self._use_recurrent_policy:
        self.rnn = RNNLayer(self.hidden_size, self.hidden_size,
                            self._recurrent_N, self._use_orthogonal)

    def init_(m):
        return init(m, init_method, lambda x: nn.init.constant_(x, 0))

    # Unlike a scalar value head, this layer maps hidden_size -> hidden_size.
    self.v_out = init_(nn.Linear(self.hidden_size, self.hidden_size))

    # Always define the flag: it used to exist only for MultiDiscrete
    # spaces, so reading it for any other space raised AttributeError.
    self.multi_discrete = False

    space_name = action_space.__class__.__name__
    if space_name == "Discrete":
        self.action_out = nn.Linear(self.hidden_size, action_space.n)
    elif space_name in ("Box", "MultiBinary"):
        self.action_out = nn.Linear(self.hidden_size, action_space.shape[0])
    elif space_name == "MultiDiscrete":
        self.multi_discrete = True
        # One linear output per sub-action, sized by that sub-action's range.
        action_dims = action_space.high - action_space.low + 1
        self.action_outs = nn.ModuleList(
            [nn.Linear(self.hidden_size, action_dim) for action_dim in action_dims]
        )
    # NOTE(review): any other space type leaves the module without an
    # action output layer, as before; confirm whether that should raise.

    self.to(device)
def __init__(self, args, obs_space, action_space, device=torch.device("cpu")):
    """Build the ``R_Actor`` sub-modules and move them to ``device``.

    Args:
        args: Configuration object. Reads ``hidden_size``, ``gain``,
            ``use_orthogonal``, ``activation_id``,
            ``use_policy_active_masks``, ``use_naive_recurrent_policy``,
            ``use_recurrent_policy``, ``use_influence_policy``,
            ``influence_layer_N``, ``use_policy_vhead``, ``recurrent_N``
            and, if present, ``use_popart``. Depending on the observation
            type it also reads ``cnn_layers_params`` or
            ``use_attn_internal``.
        obs_space: Observation space; the class name and rank of its shape
            select the encoder.
        action_space: Forwarded to ``ACTLayer``.
        device: Device the module is moved to; also stored in ``self.tpdv``
            and passed to ``PopArt`` when that head is used.
    """
    super(R_Actor, self).__init__()
    self.hidden_size = args.hidden_size
    self._gain = args.gain
    self._use_orthogonal = args.use_orthogonal
    self._activation_id = args.activation_id
    self._use_policy_active_masks = args.use_policy_active_masks
    self._use_naive_recurrent_policy = args.use_naive_recurrent_policy
    self._use_recurrent_policy = args.use_recurrent_policy
    self._use_influence_policy = args.use_influence_policy
    self._influence_layer_N = args.influence_layer_N
    self._use_policy_vhead = args.use_policy_vhead
    # Previously this attribute was read below but never assigned, so
    # enabling the policy value head raised AttributeError. The getattr
    # default keeps configs without the flag working (plain Linear head).
    self._use_popart = getattr(args, "use_popart", False)
    self._recurrent_N = args.recurrent_N
    self.tpdv = dict(dtype=torch.float32, device=device)

    obs_shape = get_shape_from_obs_space(obs_space)

    # A shape object whose class name contains 'Dict' selects MIXBase;
    # otherwise a rank-3 shape selects CNNBase and anything else MLPBase.
    if 'Dict' in obs_shape.__class__.__name__:
        self._mixed_obs = True
        self.base = MIXBase(args, obs_shape, cnn_layers_params=args.cnn_layers_params)
    else:
        self._mixed_obs = False
        if len(obs_shape) == 3:
            self.base = CNNBase(args, obs_shape)
        else:
            self.base = MLPBase(args, obs_shape,
                                use_attn_internal=args.use_attn_internal,
                                use_cat_self=True)

    # Running width of the features fed to the output layers.
    input_size = self.base.output_size

    if self._use_naive_recurrent_policy or self._use_recurrent_policy:
        self.rnn = RNNLayer(input_size, self.hidden_size,
                            self._recurrent_N, self._use_orthogonal)
        input_size = self.hidden_size

    if self._use_influence_policy:
        # The influence MLP takes obs_shape[0] inputs and adds hidden_size
        # to the running width.
        self.mlp = MLPLayer(obs_shape[0], self.hidden_size,
                            self._influence_layer_N, self._use_orthogonal,
                            self._activation_id)
        input_size += self.hidden_size

    self.act = ACTLayer(action_space, input_size, self._use_orthogonal, self._gain)

    if self._use_policy_vhead:
        # The flag indexes the pair: falsy -> xavier_uniform_, truthy -> orthogonal_.
        init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][self._use_orthogonal]

        def init_(m):
            return init(m, init_method, lambda x: nn.init.constant_(x, 0))

        if self._use_popart:
            self.v_out = init_(PopArt(input_size, 1, device=device))
        else:
            self.v_out = init_(nn.Linear(input_size, 1))

    self.to(device)
def __init__(self, args, obs_space, action_space, device=torch.device("cpu")):
    """Build the ``R_Actor`` sub-modules and move them to ``device``.

    Args:
        args: Configuration object. Reads ``hidden_size``, ``gain``,
            ``use_orthogonal``, ``use_policy_active_masks``,
            ``use_naive_recurrent_policy``, ``use_recurrent_policy``,
            ``use_policy_vhead`` and ``recurrent_N``; it is also passed to
            the observation encoder.
        obs_space: Observation space; its shape selects the encoder class.
        action_space: Forwarded to ``ACTLayer``.
        device: Device the module is moved to; also stored in ``self.tpdv``.
    """
    super(R_Actor, self).__init__()

    # Configuration copied from ``args``.
    self.hidden_size = args.hidden_size
    self._gain = args.gain
    self._use_orthogonal = args.use_orthogonal
    self._use_policy_active_masks = args.use_policy_active_masks
    self._use_naive_recurrent_policy = args.use_naive_recurrent_policy
    self._use_recurrent_policy = args.use_recurrent_policy
    self._use_policy_vhead = args.use_policy_vhead
    self._recurrent_N = args.recurrent_N
    self.tpdv = dict(dtype=torch.float32, device=device)

    # A rank-3 observation shape selects CNNBase, anything else MLPBase.
    obs_shape = get_shape_from_obs_space(obs_space)
    if len(obs_shape) == 3:
        encoder_cls = CNNBase
    else:
        encoder_cls = MLPBase
    self.base = encoder_cls(args, obs_shape)

    # Optional recurrent layer, enabled by either recurrent flag.
    wants_rnn = self._use_naive_recurrent_policy or self._use_recurrent_policy
    if wants_rnn:
        self.rnn = RNNLayer(
            self.hidden_size,
            self.hidden_size,
            self._recurrent_N,
            self._use_orthogonal,
        )

    self.act = ACTLayer(
        action_space,
        self.hidden_size,
        self._use_orthogonal,
        self._gain,
    )

    # Optional single-output value head attached to the policy.
    if self._use_policy_vhead:
        # The flag indexes the pair: falsy -> xavier_uniform_, truthy -> orthogonal_.
        weight_inits = (nn.init.xavier_uniform_, nn.init.orthogonal_)
        init_method = weight_inits[self._use_orthogonal]

        def _zero_bias(bias):
            return nn.init.constant_(bias, 0)

        value_head = nn.Linear(self.hidden_size, 1)
        self.v_out = init(value_head, init_method, _zero_bias)

    self.to(device)