def __init__(self, input_shape: int, output_size: int, option_size: int, rnn_type: str = 'lstm', rnn_size: int = 128, hidden_sizes: [List, Tuple, None] = None, inits: [(float, float, float), None] = (np.sqrt(2), 1., 0.01), hidden_nonlinearity=torch.nn.Tanh,  # Module form.
             use_interest=False,  # IOC sigmoid interest functions
             use_diversity=False,  # TDEOC q entropy output
             use_attention=False,
             ):
    """Recurrent option-critic model over one-hot discrete observations.

    One-hot preprocessor -> RNN over [obs, prev_action, prev_reward] ->
    option-critic heads, each with its own (possibly empty) MLP trunk.

    NOTE(review): `inits` is indexed unconditionally below (inits[:-1],
    inits[1], inits[2]), so despite the annotation it must not be None.
    NOTE(review): `input_shape` is used both as a class count for
    OneHotLayer and as an int in arithmetic — presumably a class count;
    confirm against callers.
    """
    super().__init__()
    self._obs_ndim = 0
    self.preprocessor = tscr(OneHotLayer(input_shape))
    self.rnn_type = rnn_type
    # Anything other than 'gru' falls back to LSTM (default).
    recurrent_cls = nn.LSTM if rnn_type != 'gru' else nn.GRU
    # RNN input: one-hot obs + one-hot previous action + scalar previous reward.
    self.rnn = recurrent_cls(input_shape + output_size + 1, rnn_size)
    # Headless MLP factory — each head builds its own trunk from this.
    make_trunk = partial(MlpModel,
                         hidden_sizes=hidden_sizes,
                         output_size=None,
                         nonlinearity=hidden_nonlinearity,
                         inits=inits[:-1])
    # Separate MLP processors for each head.
    self.model = tscr(OptionCriticHead_IndependentPreprocessor(
        input_size=rnn_size,
        input_module_class=make_trunk,
        output_size=output_size,
        option_size=option_size,
        intra_option_policy='discrete',
        use_interest=use_interest,
        use_diversity=use_diversity,
        use_attention=use_attention,
        baselines_init=True,
        orthogonal_init_base=inits[1],
        orthogonal_init_pol=inits[2],
    ))
def __init__(self, input_shape: Tuple, output_size: int, rnn_type: str = 'gru', rnn_size: int = 256, hidden_sizes: [List, Tuple] = None, baselines_init: bool = True, layer_norm: bool = False):
    """Actor-critic with separate pi/v streams: MLP body -> RNN -> linear head.

    Each stream gets its own body MLP and RNN; the RNN input concatenates the
    body features with the previous action (one-hot) and previous reward.

    :param input_shape: observation shape; flattened to a vector.
    :param output_size: number of discrete actions.
    :param rnn_type: 'gru' or anything else for LSTM (see get_rnn_class).
    :param rnn_size: hidden size of each recurrent core.
    :param hidden_sizes: MLP hidden layer sizes (None => no hidden layers).
    :param baselines_init: apply orthogonal (baselines-style) init.
    :param layer_norm: request layer-norm RNN variant from get_rnn_class.
    """
    super().__init__()
    self._obs_dim = 2
    self.rnn_is_lstm = rnn_type != 'gru'
    input_size = int(np.prod(input_shape))
    rnn_class = get_rnn_class(rnn_type, layer_norm)
    self.body_pi = MlpModel(input_size, hidden_sizes, None, nn.ReLU, None)
    self.body_v = MlpModel(input_size, hidden_sizes, None, nn.ReLU, None)
    self.rnn_pi = rnn_class(self.body_pi.output_size + output_size + 1, rnn_size)  # Concat action, reward
    self.rnn_v = rnn_class(self.body_v.output_size + output_size + 1, rnn_size)
    self.pi = nn.Sequential(nn.ReLU(), nn.Linear(rnn_size, output_size), nn.Softmax(-1))  # Need to activate after lstm
    self.v = nn.Sequential(nn.ReLU(), nn.Linear(rnn_size, 1))
    if baselines_init:
        self.body_pi.apply(apply_init)
        self.body_v.apply(apply_init)
        self.rnn_pi.apply(apply_init)
        self.rnn_v.apply(apply_init)
        # FIX: pass the gain by keyword. The previous positional form
        # partial(apply_init, O_INIT_VALUES['pi']) bound the gain to
        # apply_init's first (module) parameter, so Module.apply would hand
        # the actual module in as the second argument. Sibling models in
        # this file use gain= by keyword.
        self.pi.apply(partial(apply_init, gain=O_INIT_VALUES['pi']))
        self.v.apply(partial(apply_init, gain=O_INIT_VALUES['v']))
    self.body_pi, self.body_v, self.pi, self.v = tscr(self.body_pi), tscr(self.body_v), tscr(self.pi), tscr(self.v)
def __init__(self, input_classes: int, output_size: int, rnn_type: str = 'gru', rnn_size: int = 256, hidden_sizes: [List, Tuple] = None, baselines_init: bool = True, layer_norm: bool = False, prev_action: int = 2, prev_reward: int = 2,
             ):
    """Actor-critic over one-hot observations: RNN first, then shared MLP body.

    The RNN consumes the one-hot observation, optionally concatenated with the
    previous one-hot action and previous reward scalar; pi and v heads share
    the post-RNN MLP body.
    """
    super().__init__()
    self._obs_dim = 0
    self.rnn_is_lstm = rnn_type != 'gru'
    self.preprocessor = tscr(OneHotLayer(input_classes))
    core_cls = get_rnn_class(rnn_type, layer_norm)
    # Optionally append previous action (one-hot) and previous reward to input.
    core_in = input_classes \
        + (output_size if prev_action else 0) \
        + (1 if prev_reward else 0)
    self.rnn = core_cls(core_in, rnn_size)  # Concat action, reward
    self.body = MlpModel(rnn_size, hidden_sizes, None, nn.ReLU, None)
    self.pi = nn.Sequential(nn.Linear(self.body.output_size, output_size), nn.Softmax(-1))
    self.v = nn.Linear(self.body.output_size, 1)
    if baselines_init:
        self.rnn.apply(apply_init)
        self.body.apply(apply_init)
        self.pi.apply(partial(apply_init, gain=O_INIT_VALUES['pi']))
        self.v.apply(partial(apply_init, gain=O_INIT_VALUES['v']))
    self.body = tscr(self.body)
    self.pi = tscr(self.pi)
    self.v = tscr(self.v)
    # Remember whether the previous action/reward are fed to the RNN.
    self.p_a = prev_action > 0
    self.p_r = prev_reward > 0
def __init__(self, input_classes: int, output_size: int, rnn_type: str = 'gru', rnn_size: int = 256, hidden_sizes: [List, Tuple] = None, baselines_init: bool = True, layer_norm: bool = False, prev_action: int = 3, prev_reward: int = 3,
             ):
    """Two-stream actor-critic over one-hot observations: MLP body -> RNN per stream.

    `prev_action`/`prev_reward` are bitmask-style codes selecting which stream
    receives the previous action/reward as extra RNN input: 1 => pi stream,
    2 => v stream, 3 => both, 0 => neither.

    :param input_classes: number of discrete observation classes (one-hot dim).
    :param output_size: number of discrete actions.
    :param rnn_type: 'gru' or anything else for LSTM (see get_rnn_class).
    :param rnn_size: hidden size of each recurrent core.
    :param hidden_sizes: MLP hidden layer sizes (None => no hidden layers).
    :param baselines_init: apply orthogonal (baselines-style) init.
    :param layer_norm: request layer-norm RNN variant from get_rnn_class.
    """
    super().__init__()
    self._obs_dim = 0
    self.rnn_is_lstm = rnn_type != 'gru'
    self.preprocessor = tscr(OneHotLayer(input_classes))
    rnn_class = get_rnn_class(rnn_type, layer_norm)
    self.body_pi = MlpModel(input_classes, hidden_sizes, None, nn.ReLU, None)
    self.body_v = MlpModel(input_classes, hidden_sizes, None, nn.ReLU, None)
    rnn_input_size_pi = self.body_pi.output_size + (prev_action in [1, 3]) * output_size + (prev_reward in [1, 3])
    rnn_input_size_v = self.body_v.output_size + (prev_action in [2, 3]) * output_size + (prev_reward in [2, 3])
    self.rnn_pi = rnn_class(rnn_input_size_pi, rnn_size)  # Concat action, reward
    self.rnn_v = rnn_class(rnn_input_size_v, rnn_size)
    self.pi = nn.Sequential(nn.ReLU(), nn.Linear(rnn_size, output_size), nn.Softmax(-1))  # Need to activate after lstm
    self.v = nn.Sequential(nn.ReLU(), nn.Linear(rnn_size, 1))
    if baselines_init:
        self.body_pi.apply(apply_init)
        self.body_v.apply(apply_init)
        self.rnn_pi.apply(apply_init)
        self.rnn_v.apply(apply_init)
        # FIX: pass the gain by keyword. The previous positional form
        # partial(apply_init, O_INIT_VALUES['pi']) bound the gain to
        # apply_init's first (module) parameter, so Module.apply would hand
        # the actual module in as the second argument. Sibling models in
        # this file use gain= by keyword.
        self.pi.apply(partial(apply_init, gain=O_INIT_VALUES['pi']))
        self.v.apply(partial(apply_init, gain=O_INIT_VALUES['v']))
    self.body_pi, self.body_v, self.pi, self.v = tscr(self.body_pi), tscr(self.body_v), tscr(self.pi), tscr(self.v)
    self.p_a = prev_action
    self.p_r = prev_reward
def __init__(self, input_shape: Tuple, output_size: int, hidden_sizes: [List, Tuple, None] = None, nonlinearity: nn.Module = nn.ReLU
             ):
    """Feed-forward actor-critic: shared MLP trunk with linear pi and v heads.

    Observations are 2-D (all bsuite obs are 2-D, even (1, 1)) and flattened
    into the trunk.
    """
    super().__init__()
    self._obs_ndim = 2  # All bsuite obs are 2 (even (1,1))
    flat_size = input_shape[0] * input_shape[1]
    self.preprocessor = MlpModel(flat_size, hidden_sizes, None, nonlinearity)
    feat_dim = self.preprocessor.output_size
    self.v = tscr(nn.Linear(feat_dim, 1))
    self.pi = tscr(nn.Sequential(
        nn.Linear(feat_dim, output_size),
        nn.Softmax(-1),
    ))
def __init__(self, input_classes: int, output_size: int, option_size: int, hidden_sizes: [List, Tuple, None] = None, inits: [(float, float, float), None] = (np.sqrt(2), 1., 0.01), shared_processor: bool = True, hidden_nonlinearity=torch.nn.ReLU,  # Module form.
             use_interest=False,  # IOC sigmoid interest functions
             use_diversity=False,  # TDEOC q entropy output
             use_attention=False,
             ):
    """Feed-forward option-critic over one-hot observations.

    With `shared_processor`, a single MLP trunk feeds all option-critic heads;
    otherwise each head owns its own trunk (though with diversity enabled, the
    q-entropy and q heads share one).

    NOTE(review): `inits[:-1]` is taken unconditionally, so despite the
    annotation `inits` must not be None. With `shared_processor`,
    `hidden_sizes[-1]` requires a non-empty `hidden_sizes`.
    """
    super().__init__()
    self._obs_ndim = 0
    self.preprocessor = tscr(OneHotLayer(input_classes))
    # Headless MLP factory (possibly with no hidden layers at all).
    make_trunk = partial(MlpModel,
                         hidden_sizes=hidden_sizes,
                         output_size=None,
                         nonlinearity=hidden_nonlinearity,
                         inits=inits[:-1])
    if shared_processor:
        # One trunk shared by every head.
        shared_head = OptionCriticHead_SharedPreprocessor(
            input_size=hidden_sizes[-1],
            output_size=output_size,
            option_size=option_size,
            intra_option_policy='discrete',
            use_interest=use_interest,
            use_attention=use_attention,
            use_diversity=use_diversity,
            baselines_init=True,
        )
        self.model = tscr(nn.Sequential(make_trunk(input_classes), shared_head))
    else:
        # A trunk per head.
        self.model = tscr(OptionCriticHead_IndependentPreprocessor(
            input_size=input_classes,
            input_module_class=make_trunk,
            output_size=output_size,
            option_size=option_size,
            intra_option_policy='discrete',
            use_interest=use_interest,
            use_diversity=use_diversity,
            use_attention=use_attention,
            baselines_init=True,
        ))
def __init__(self, input_shape: Tuple, output_size: int, rnn_type: str = 'gru', rnn_size: int = 256, hidden_sizes: [List, Tuple] = None, baselines_init: bool = True, layer_norm: bool = False):
    """Actor-critic: shared RNN over flattened obs, then separate MLP heads.

    The RNN input concatenates the flattened observation with the previous
    one-hot action and previous reward scalar; pi and v each get their own
    post-RNN MLP (initialized inside MlpModel via the inits tuples).
    """
    super().__init__()
    self._obs_dim = 2
    self.rnn_is_lstm = rnn_type != 'gru'
    flat_size = int(np.prod(input_shape))
    core_cls = get_rnn_class(rnn_type, layer_norm)
    self.rnn = core_cls(flat_size + output_size + 1, rnn_size)  # Concat action, reward
    if baselines_init:
        head_inits_pi = (O_INIT_VALUES['base'], O_INIT_VALUES['pi'])
        head_inits_v = (O_INIT_VALUES['base'], O_INIT_VALUES['v'])
    else:
        head_inits_pi = None
        head_inits_v = None
    self.pi = nn.Sequential(
        MlpModel(rnn_size, hidden_sizes, output_size, nn.ReLU, head_inits_pi),
        nn.Softmax(-1),
    )
    self.v = nn.Sequential(MlpModel(rnn_size, hidden_sizes, 1, nn.ReLU, head_inits_v))
    if baselines_init:
        self.rnn.apply(apply_init)
    self.pi = tscr(self.pi)
    self.v = tscr(self.v)
def __init__(self, input_classes: int, output_size: int, option_size: int, hidden_sizes: [List, Tuple, None] = None, rnn_type: str = 'gru', rnn_size: int = 256, baselines_init: bool = True, layer_norm: bool = False,
             use_interest: bool = False,  # IOC sigmoid interest functions
             use_diversity: bool = False,  # TDEOC q entropy output
             use_attention: bool = False,
             prev_action: np.ndarray = None,
             prev_reward: np.ndarray = None,
             prev_option: np.ndarray = None
             ):
    """Recurrent option-critic: one-hot obs -> MLP body -> RNN -> shared OC heads.

    `prev_action`/`prev_reward`/`prev_option` are boolean masks; if any entry
    is set, the corresponding signal (one-hot action / reward scalar / one-hot
    option) is concatenated onto the RNN input.

    FIX: the np.ndarray defaults (np.ones/np.zeros) were mutable default
    arguments, created once at function-definition time and shared across
    every call; replaced with None sentinels. Effective defaults are
    unchanged: action and reward on, option off.
    """
    super().__init__()
    if prev_action is None:
        prev_action = np.ones(5, dtype=bool)
    if prev_reward is None:
        prev_reward = np.ones(5, dtype=bool)
    if prev_option is None:
        prev_option = np.zeros(5, dtype=bool)
    self._obs_ndim = 0
    self.rnn_is_lstm = rnn_type != 'gru'
    self.preprocessor = tscr(OneHotLayer(input_classes))
    rnn_class = get_rnn_class(rnn_type, layer_norm)
    self.body = MlpModel(input_classes, hidden_sizes, None, nn.ReLU, None)
    # Collapse the masks to plain bools: feed signal if any entry is set.
    self.p_a, self.p_o, self.p_r = prev_action.any().item(), prev_option.any().item(), prev_reward.any().item()
    rnn_input_size = self.body.output_size + (output_size * self.p_a) + (option_size * self.p_o) + self.p_r
    self.rnn = rnn_class(rnn_input_size, rnn_size)  # Concat action, reward
    self.oc = tscr(OptionCriticHead_SharedPreprocessor(
        input_size=rnn_size,
        output_size=output_size,
        option_size=option_size,
        intra_option_policy='discrete',
        use_interest=use_interest,
        use_diversity=use_diversity,
        use_attention=use_attention,
        baselines_init=baselines_init))
    if baselines_init:
        self.rnn.apply(partial(apply_init, gain=O_INIT_VALUES['lstm']))
        self.body.apply(apply_init)
    self.body = tscr(self.body)
def __init__(self, input_classes: int, output_size: int, option_size: int, hidden_sizes: [List, Tuple, None] = None, rnn_type: str = 'gru', rnn_size: int = 256, baselines_init: bool = True, layer_norm: bool = False,
             use_interest: bool = False,  # IOC sigmoid interest functions
             use_diversity: bool = False,  # TDEOC q entropy output
             use_attention: bool = False,
             prev_action: np.ndarray = None,
             prev_reward: np.ndarray = None,
             prev_option: np.ndarray = None
             ):
    """Recurrent option-critic with an independent MLP+RNN per head.

    The boolean masks `prev_action`/`prev_reward`/`prev_option` are stored and
    forwarded to the head module, which decides per-head which previous
    signals to feed its RNN.

    FIX: the np.ndarray defaults (np.ones/np.zeros) were mutable default
    arguments, created once at function-definition time and shared across
    every call (and here the arrays are retained on self and passed to the
    head module, so aliasing was real); replaced with None sentinels.
    Effective defaults are unchanged: action and reward on, option off.
    """
    super().__init__()
    if prev_action is None:
        prev_action = np.ones(5, dtype=bool)
    if prev_reward is None:
        prev_reward = np.ones(5, dtype=bool)
    if prev_option is None:
        prev_option = np.zeros(5, dtype=bool)
    self._obs_ndim = 0
    self.rnn_is_lstm = rnn_type != 'gru'
    self.preprocessor = tscr(OneHotLayer(input_classes))
    rnn_class = get_rnn_class(rnn_type, layer_norm)
    self.p_a, self.p_o, self.p_r = prev_action, prev_option, prev_reward
    body_mlp_class = partial(MlpModel, hidden_sizes=hidden_sizes, output_size=None, nonlinearity=nn.ReLU, inits=None)
    self.oc = OptionCriticHead_IndependentPreprocessorWithRNN(
        input_size=input_classes,
        input_module_class=body_mlp_class,
        rnn_module_class=rnn_class,
        output_size=output_size,
        option_size=option_size,
        rnn_size=rnn_size,
        intra_option_policy='discrete',
        use_interest=use_interest,
        use_diversity=use_diversity,
        use_attention=use_attention,
        baselines_init=baselines_init,
        prev_action=prev_action,
        prev_reward=prev_reward,
        prev_option=prev_option
    )
def __init__(self, input_classes: int, output_size: int, hidden_sizes: [List, Tuple, None] = None, inits: [(float, float, float), None] = (np.sqrt(2), 1., 0.01), nonlinearity: nn.Module = nn.ReLU, shared_processor: bool = False
             ):
    """Feed-forward actor-critic over one-hot observations.

    With `shared_processor`, one MLP trunk feeds linear pi and v heads;
    otherwise pi and v each get a full MLP from the one-hot input.

    Init-gain selection (preserved from original): the shared trunk and the
    v-MLP use inits[:-1] (base, base'); the pi-MLP uses inits[0::2]
    (base, pol); shared heads use inits[1].
    NOTE(review): with `shared_processor`, `hidden_sizes[-1]` requires a
    non-empty `hidden_sizes`.
    """
    super().__init__()
    self._obs_ndim = 0
    trunk_inits = inits[:-1] if inits is not None else inits
    if shared_processor:
        feat = hidden_sizes[-1]

        def _head(layer):
            # Orthogonal-init the head layer only when gains were supplied.
            return layer_init(layer, inits[1]) if inits else layer

        self.preprocessor = tscr(nn.Sequential(
            OneHotLayer(input_classes),
            MlpModel(input_classes, hidden_sizes, None, nonlinearity, trunk_inits),
        ))
        self.v = tscr(_head(nn.Linear(feat, 1)))
        self.pi = tscr(nn.Sequential(_head(nn.Linear(feat, output_size)), nn.Softmax(-1)))
    else:
        pi_inits = inits[0::2] if inits is not None else inits
        self.preprocessor = tscr(OneHotLayer(input_classes))
        self.v = tscr(MlpModel(input_classes, hidden_sizes, 1, nonlinearity, trunk_inits))
        self.pi = tscr(nn.Sequential(
            MlpModel(input_classes, hidden_sizes, output_size, nonlinearity, pi_inits),
            nn.Softmax(-1),
        ))