class BsuiteRnnUnshared1Rnn(nn.Module):
    """Special case: separate (unshared) MLP body and RNN for each head.

    Pipeline per head: flattened obs -> MLP body -> RNN (input concatenated
    with previous one-hot action and scalar reward) -> linear pi / v head.
    The two recurrent hidden states are carried together in a DualRnnState.
    """

    def __init__(self,
                 input_shape: Tuple,
                 output_size: int,
                 rnn_type: str = 'gru',
                 rnn_size: int = 256,
                 hidden_sizes: [List, Tuple] = None,
                 baselines_init: bool = True,
                 layer_norm: bool = False
                 ):
        super().__init__()
        self._obs_dim = 2
        self.rnn_is_lstm = rnn_type != 'gru'
        input_size = int(np.prod(input_shape))
        rnn_class = get_rnn_class(rnn_type, layer_norm)
        # Separate bodies and RNNs for policy and value streams.
        self.body_pi = MlpModel(input_size, hidden_sizes, None, nn.ReLU, None)
        self.body_v = MlpModel(input_size, hidden_sizes, None, nn.ReLU, None)
        # +output_size +1 accounts for concatenated prev action (one-hot) and reward.
        self.rnn_pi = rnn_class(self.body_pi.output_size + output_size + 1, rnn_size)
        self.rnn_v = rnn_class(self.body_v.output_size + output_size + 1, rnn_size)
        # Need to activate after the recurrent layer, hence leading ReLU.
        self.pi = nn.Sequential(nn.ReLU(), nn.Linear(rnn_size, output_size), nn.Softmax(-1))
        self.v = nn.Sequential(nn.ReLU(), nn.Linear(rnn_size, 1))
        if baselines_init:
            self.body_pi.apply(apply_init); self.body_v.apply(apply_init)
            self.rnn_pi.apply(apply_init); self.rnn_v.apply(apply_init)
            # BUG FIX: gain must be passed by keyword; positionally it would be
            # bound to apply_init's module argument (matches the
            # partial(apply_init, gain=...) convention used elsewhere in file).
            self.pi.apply(partial(apply_init, gain=O_INIT_VALUES['pi']))
            self.v.apply(partial(apply_init, gain=O_INIT_VALUES['v']))
        self.body_pi, self.body_v, self.pi, self.v = tscr(self.body_pi), tscr(self.body_v), tscr(self.pi), tscr(self.v)

    def forward(self, observation, prev_action, prev_reward, init_rnn_state):
        """Return (pi, v, next_rnn_state); leading dims restored to input's."""
        lead_dim, T, B, _ = infer_leading_dims(observation, self._obs_dim)
        if init_rnn_state is not None:
            # DualRnnState -> per-head states (RnnState_pi, RnnState_v).
            init_rnn_pi, init_rnn_v = tuple(init_rnn_state)
            if self.rnn_is_lstm:
                # LSTM states are (h, c) tuples.
                init_rnn_pi, init_rnn_v = tuple(init_rnn_pi), tuple(init_rnn_v)
        else:
            init_rnn_pi, init_rnn_v = None, None
        o_flat = observation.view(T * B, -1)
        b_pi, b_v = self.body_pi(o_flat), self.body_v(o_flat)
        rnn_input_pi = torch.cat([
            b_pi.view(T, B, -1),
            prev_action.view(T, B, -1),
            prev_reward.view(T, B, 1),
        ], dim=2)
        rnn_input_v = torch.cat([
            b_v.view(T, B, -1),
            prev_action.view(T, B, -1),
            prev_reward.view(T, B, 1),
        ], dim=2)
        rnn_pi, next_rnn_state_pi = self.rnn_pi(rnn_input_pi, init_rnn_pi)
        # BUG FIX: value stream must run through its own RNN (was self.rnn_pi,
        # which silently reused the policy RNN and left self.rnn_v untrained).
        rnn_v, next_rnn_state_v = self.rnn_v(rnn_input_v, init_rnn_v)
        rnn_pi = rnn_pi.view(T * B, -1); rnn_v = rnn_v.view(T * B, -1)
        pi, v = self.pi(rnn_pi), self.v(rnn_v).squeeze(-1)
        pi, v = restore_leading_dims((pi, v), lead_dim, T, B)
        if self.rnn_is_lstm:
            next_rnn_state = DualRnnState(RnnState(*next_rnn_state_pi), RnnState(*next_rnn_state_v))
        else:
            next_rnn_state = DualRnnState(next_rnn_state_pi, next_rnn_state_v)
        return pi, v, next_rnn_state
class POMDPOcRnnShared1Model(nn.Module):
    """Option-critic POMDP model: one-hot preprocessor -> shared MLP body ->
    shared RNN -> shared-preprocessor option-critic head.

    ``prev_action`` / ``prev_reward`` / ``prev_option`` are boolean masks; any
    True entry enables feeding that signal (one-hot action/option, scalar
    reward) into the RNN input.
    """

    def __init__(self,
                 input_classes: int,
                 output_size: int,
                 option_size: int,
                 hidden_sizes: [List, Tuple, None] = None,
                 rnn_type: str = 'gru',
                 rnn_size: int = 256,
                 baselines_init: bool = True,
                 layer_norm: bool = False,
                 use_interest: bool = False,  # IOC sigmoid interest functions
                 use_diversity: bool = False,  # TDEOC q entropy output
                 use_attention: bool = False,
                 prev_action: np.ndarray = np.ones(5, dtype=bool),
                 prev_reward: np.ndarray = np.ones(5, dtype=bool),
                 prev_option: np.ndarray = np.zeros(5, dtype=bool)
                 ):
        super().__init__()
        self._obs_ndim = 0
        self.rnn_is_lstm = rnn_type != 'gru'
        self.preprocessor = tscr(OneHotLayer(input_classes))
        rnn_class = get_rnn_class(rnn_type, layer_norm)
        self.body = MlpModel(input_classes, hidden_sizes, None, nn.ReLU, None)
        # NOTE(review): the np.ndarray defaults are shared across instances
        # (mutable default); they are only read here via .any(), so benign.
        self.p_a, self.p_o, self.p_r = prev_action.any().item(), prev_option.any().item(), prev_reward.any().item()
        # RNN input: body features (+ one-hot action)(+ one-hot option)(+ reward).
        rnn_input_size = self.body.output_size + (output_size * self.p_a) + (option_size * self.p_o) + self.p_r
        self.rnn = rnn_class(rnn_input_size, rnn_size)  # Concat action, reward
        self.oc = tscr(OptionCriticHead_SharedPreprocessor(
            input_size=rnn_size,
            output_size=output_size,
            option_size=option_size,
            intra_option_policy='discrete',
            use_interest=use_interest,
            use_diversity=use_diversity,
            use_attention=use_attention,
            baselines_init=baselines_init))
        if baselines_init:
            self.rnn.apply(partial(apply_init, gain=O_INIT_VALUES['lstm']))
            self.body.apply(apply_init)
        self.body = tscr(self.body)

    def forward(self, observation, prev_action, prev_reward, prev_option, init_rnn_state):
        """Return (pi, beta, q, pi_omega, next_rnn_state).

        Note q_ent from the head is computed but intentionally not returned,
        preserving the existing caller-facing signature.
        """
        lead_dim, T, B, _ = infer_leading_dims(observation, self._obs_ndim)
        if init_rnn_state is not None and self.rnn_is_lstm:
            init_rnn_state = tuple(init_rnn_state)  # namedarraytuple -> tuple (h, c)
        o = self.preprocessor(observation.view(T * B))
        features = self.body(o)
        inp_list = ([features.view(T, B, -1)]
                    + ([prev_action.view(T, B, -1)] if self.p_a else [])
                    + ([prev_reward.view(T, B, 1)] if self.p_r else [])
                    + ([prev_option.view(T, B, -1)] if self.p_o else []))
        rnn_input = torch.cat(inp_list, dim=2)
        rnn_out, next_rnn_state = self.rnn(rnn_input, init_rnn_state)
        rnn_out = rnn_out.view(T * B, -1)
        pi, beta, q, pi_omega, q_ent = self.oc(rnn_out)
        pi, beta, q, pi_omega, q_ent = restore_leading_dims((pi, beta, q, pi_omega, q_ent), lead_dim, T, B)
        if self.rnn_is_lstm:
            # BUG FIX: unpack the LSTM's (h, c) into the RnnState fields (was
            # RnnState(next_rnn_state), passing the whole tuple as one field);
            # matches the RnnState(*...) usage in BsuiteRnnUnshared1Rnn.
            next_rnn_state = RnnState(*next_rnn_state)
        return pi, beta, q, pi_omega, next_rnn_state
class POMDPRnnShared0Rnn(nn.Module):
    """POMDP model with the RNN before the body: one-hot obs (optionally
    concatenated with prev action/reward) -> RNN -> MLP body -> pi / v heads.
    """

    def __init__(self,
                 input_classes: int,
                 output_size: int,
                 rnn_type: str = 'gru',
                 rnn_size: int = 256,
                 hidden_sizes: [List, Tuple] = None,
                 baselines_init: bool = True,
                 layer_norm: bool = False,
                 prev_action: int = 2,
                 prev_reward: int = 2,
                 ):
        super().__init__()
        self._obs_dim = 0
        self.rnn_is_lstm = rnn_type != 'gru'
        self.preprocessor = tscr(OneHotLayer(input_classes))
        rnn_class = get_rnn_class(rnn_type, layer_norm)
        rnn_input_size = input_classes
        if prev_action: rnn_input_size += output_size  # Use previous action as input
        if prev_reward: rnn_input_size += 1  # Use previous reward as input
        self.rnn = rnn_class(rnn_input_size, rnn_size)  # Concat action, reward
        self.body = MlpModel(rnn_size, hidden_sizes, None, nn.ReLU, None)
        self.pi = nn.Sequential(nn.Linear(self.body.output_size, output_size), nn.Softmax(-1))
        self.v = nn.Linear(self.body.output_size, 1)
        if baselines_init:
            self.rnn.apply(apply_init); self.body.apply(apply_init)
            self.pi.apply(partial(apply_init, gain=O_INIT_VALUES['pi']))
            self.v.apply(partial(apply_init, gain=O_INIT_VALUES['v']))
        self.body, self.pi, self.v = tscr(self.body), tscr(self.pi), tscr(self.v)
        self.p_a = prev_action > 0
        self.p_r = prev_reward > 0

    def forward(self, observation, prev_action, prev_reward, init_rnn_state):
        """Return (pi, v, next_rnn_state); leading dims restored to input's."""
        lead_dim, T, B, _ = infer_leading_dims(observation, self._obs_dim)
        if init_rnn_state is not None and self.rnn_is_lstm:
            init_rnn_state = tuple(init_rnn_state)  # namedarraytuple -> tuple (h, c)
        oh = self.preprocessor(observation)  # Leave in TxB format for lstm
        inp_list = ([oh.view(T, B, -1)]
                    + ([prev_action.view(T, B, -1)] if self.p_a else [])
                    + ([prev_reward.view(T, B, 1)] if self.p_r else []))
        rnn_input = torch.cat(inp_list, dim=2)
        rnn_out, next_rnn_state = self.rnn(rnn_input, init_rnn_state)
        rnn_out = rnn_out.view(T * B, -1)
        rnn_out = self.body(rnn_out)
        pi, v = self.pi(rnn_out), self.v(rnn_out).squeeze(-1)
        pi, v = restore_leading_dims((pi, v), lead_dim, T, B)
        if self.rnn_is_lstm:
            # BUG FIX: unpack the LSTM's (h, c) into the RnnState fields (was
            # RnnState(next_rnn_state), passing the whole tuple as one field);
            # matches the RnnState(*...) usage in BsuiteRnnUnshared1Rnn.
            next_rnn_state = RnnState(*next_rnn_state)
        return pi, v, next_rnn_state
class CartpoleFfModel(torch.nn.Module):
    """Feedforward policy/value model for CartPole.

    Shared MLP body feeding a linear softmax policy head and a linear value
    head. ``basis`` and ``out`` are accepted for interface compatibility but
    unused here.
    """

    def __init__(
            self,
            image_shape,
            output_size,
            fc_sizes=[64, 64],
            basis=None,
            gain_type="xavier",
            out=None,
    ):
        super().__init__()
        input_size = image_shape[0]
        # Shared body, then one linear head per output.
        self.head = MlpModel(input_size, fc_sizes)
        self.pi = torch.nn.Linear(fc_sizes[-1], output_size)
        self.value = torch.nn.Linear(fc_sizes[-1], 1)
        if gain_type == "xavier":
            for module in (self.head, self.pi, self.value):
                module.apply(weight_init)

    def forward(self, in_state, prev_action, prev_reward):
        """Feedforward layers process as [T*B,H]; leading dims ([T,B], [B],
        or []) are restored on the outputs to match the input."""
        x = in_state.type(torch.float)  # Expect torch.uint8 inputs
        lead_dim, T, B, _ = infer_leading_dims(x, 1)
        features = self.head(x.view(T * B, -1))
        pi = F.softmax(self.pi(features), dim=-1)
        v = self.value(features).squeeze(-1)
        pi, v = restore_leading_dims((pi, v), lead_dim, T, B)
        return pi, v
class BsuiteRnnShared1Rnn(nn.Module):
    """Bsuite model with shared body and RNN: flattened obs -> MLP body ->
    RNN (input concatenated with prev one-hot action and scalar reward) ->
    separate pi / v heads.
    """

    def __init__(self,
                 input_shape: Tuple,
                 output_size: int,
                 rnn_type: str = 'gru',
                 rnn_size: int = 256,
                 hidden_sizes: [List, Tuple] = None,
                 baselines_init: bool = True,
                 layer_norm: bool = False
                 ):
        super().__init__()
        self._obs_dim = 2
        self.rnn_is_lstm = rnn_type != 'gru'
        input_size = int(np.prod(input_shape))
        rnn_class = get_rnn_class(rnn_type, layer_norm)
        self.body = MlpModel(input_size, hidden_sizes, None, nn.ReLU, None)
        self.rnn = rnn_class(self.body.output_size + output_size + 1, rnn_size)  # Concat action, reward
        # Need to activate after the recurrent layer, hence leading ReLU.
        self.pi = nn.Sequential(nn.ReLU(), nn.Linear(rnn_size, output_size), nn.Softmax(-1))
        self.v = nn.Sequential(nn.ReLU(), nn.Linear(rnn_size, 1))
        if baselines_init:
            self.rnn.apply(apply_init); self.body.apply(apply_init)
            self.pi.apply(partial(apply_init, gain=O_INIT_VALUES['pi']))
            self.v.apply(partial(apply_init, gain=O_INIT_VALUES['v']))
        self.body, self.pi, self.v = tscr(self.body), tscr(self.pi), tscr(self.v)

    def forward(self, observation, prev_action, prev_reward, init_rnn_state):
        """Return (pi, v, next_rnn_state); leading dims restored to input's."""
        lead_dim, T, B, _ = infer_leading_dims(observation, self._obs_dim)
        if init_rnn_state is not None and self.rnn_is_lstm:
            init_rnn_state = tuple(init_rnn_state)  # namedarraytuple -> tuple (h, c)
        features = self.body(observation.view(T * B, -1))
        rnn_input = torch.cat([
            features.view(T, B, -1),
            prev_action.view(T, B, -1),  # Assumed onehot.
            prev_reward.view(T, B, 1),
        ], dim=2)
        rnn_out, next_rnn_state = self.rnn(rnn_input, init_rnn_state)
        rnn_out = rnn_out.view(T * B, -1)
        pi, v = self.pi(rnn_out), self.v(rnn_out).squeeze(-1)
        pi, v = restore_leading_dims((pi, v), lead_dim, T, B)
        if self.rnn_is_lstm:
            # BUG FIX: unpack the LSTM's (h, c) into the RnnState fields (was
            # RnnState(next_rnn_state), passing the whole tuple as one field);
            # matches the RnnState(*...) usage in BsuiteRnnUnshared1Rnn.
            next_rnn_state = RnnState(*next_rnn_state)
        return pi, v, next_rnn_state