Example 1
class BsuiteRnnUnshared1Rnn(nn.Module):
    """Special case, rnn after processing for each head

    Going to handle the hidden state by adding an extra dimension
    """
    def __init__(self,
                 input_shape: Tuple,
                 output_size: int,
                 rnn_type: str = 'gru',
                 rnn_size: int = 256,
                 hidden_sizes: Optional[Union[List, Tuple]] = None,
                 baselines_init: bool = True,
                 layer_norm: bool = False
                 ):
        super().__init__()
        self._obs_dim = 2
        self.rnn_is_lstm = rnn_type != 'gru'
        input_size = int(np.prod(input_shape))
        rnn_class = get_rnn_class(rnn_type, layer_norm)
        self.body_pi = MlpModel(input_size, hidden_sizes, None, nn.ReLU, None)
        self.body_v = MlpModel(input_size, hidden_sizes, None, nn.ReLU, None)
        self.rnn_pi = rnn_class(self.body_pi.output_size + output_size + 1, rnn_size)  # Concat action, reward
        self.rnn_v = rnn_class(self.body_v.output_size + output_size + 1, rnn_size)
        self.pi = nn.Sequential(nn.ReLU(), nn.Linear(rnn_size, output_size), nn.Softmax(-1))  # Need to activate after lstm
        self.v = nn.Sequential(nn.ReLU(), nn.Linear(rnn_size, 1))
        if baselines_init:
            self.body_pi.apply(apply_init)
            self.body_v.apply(apply_init)
            self.rnn_pi.apply(apply_init)
            self.rnn_v.apply(apply_init)
            self.pi.apply(partial(apply_init, gain=O_INIT_VALUES['pi']))
            self.v.apply(partial(apply_init, gain=O_INIT_VALUES['v']))
        self.body_pi, self.body_v = tscr(self.body_pi), tscr(self.body_v)
        self.pi, self.v = tscr(self.pi), tscr(self.v)

    def forward(self, observation, prev_action, prev_reward, init_rnn_state):
        lead_dim, T, B, _ = infer_leading_dims(observation, self._obs_dim)
        if init_rnn_state is not None:
            if self.rnn_is_lstm:
                init_rnn_pi, init_rnn_v = tuple(init_rnn_state)  # DualRnnState -> RnnState_pi, RnnState_v
                init_rnn_pi, init_rnn_v = tuple(init_rnn_pi), tuple(init_rnn_v)
            else:
                init_rnn_pi, init_rnn_v = tuple(init_rnn_state)  # DualRnnState -> h, h
        else:
            init_rnn_pi, init_rnn_v = None, None
        o_flat = observation.view(T*B, -1)
        b_pi, b_v = self.body_pi(o_flat), self.body_v(o_flat)
        rnn_input_pi = torch.cat([
            b_pi.view(T, B, -1),
            prev_action.view(T, B, -1),  # Assumed onehot.
            prev_reward.view(T, B, 1),
            ], dim=2)
        rnn_input_v = torch.cat([
            b_v.view(T, B, -1),
            prev_action.view(T, B, -1),
            prev_reward.view(T, B, 1),
            ], dim=2)
        rnn_pi, next_rnn_state_pi = self.rnn_pi(rnn_input_pi, init_rnn_pi)
        rnn_v, next_rnn_state_v = self.rnn_v(rnn_input_v, init_rnn_v)
        rnn_pi = rnn_pi.view(T*B, -1)
        rnn_v = rnn_v.view(T*B, -1)
        pi, v = self.pi(rnn_pi), self.v(rnn_v).squeeze(-1)
        pi, v = restore_leading_dims((pi, v), lead_dim, T, B)
        if self.rnn_is_lstm:
            next_rnn_state = DualRnnState(RnnState(*next_rnn_state_pi), RnnState(*next_rnn_state_v))
        else:
            next_rnn_state = DualRnnState(next_rnn_state_pi, next_rnn_state_v)
        return pi, v, next_rnn_state
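
A minimal smoke test, assuming the surrounding project helpers (MlpModel, get_rnn_class, tscr, apply_init, O_INIT_VALUES, RnnState, DualRnnState) are importable; all sizes below are hypothetical.

import torch

T, B, obs_shape, n_act = 4, 2, (3, 2), 3  # Rollout length, batch size, obs shape, actions.
model = BsuiteRnnUnshared1Rnn(input_shape=obs_shape, output_size=n_act,
                              rnn_type='gru', hidden_sizes=[64])
obs = torch.rand(T, B, *obs_shape)
prev_action = torch.zeros(T, B, n_act)  # One-hot previous actions.
prev_reward = torch.zeros(T, B)
pi, v, state = model(obs, prev_action, prev_reward, init_rnn_state=None)
# pi: (T, B, n_act) action probabilities; v: (T, B) values; state: DualRnnState.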
Example 2
class POMDPOcRnnShared1Model(nn.Module):
    def __init__(self,
                 input_classes: int,
                 output_size: int,
                 option_size: int,
                 hidden_sizes: Optional[Union[List, Tuple]] = None,
                 rnn_type: str = 'gru',
                 rnn_size: int = 256,
                 baselines_init: bool = True,
                 layer_norm: bool = False,
                 use_interest: bool = False,  # IOC sigmoid interest functions
                 use_diversity: bool = False,  # TDEOC q entropy output
                 use_attention: bool = False,
                 prev_action: np.ndarray = np.ones(5, dtype=bool),
                 prev_reward: np.ndarray = np.ones(5, dtype=bool),
                 prev_option: np.ndarray = np.zeros(5, dtype=bool)
                 ):
        super().__init__()
        self._obs_ndim = 0
        self.rnn_is_lstm = rnn_type != 'gru'
        self.preprocessor = tscr(OneHotLayer(input_classes))
        rnn_class = get_rnn_class(rnn_type, layer_norm)
        self.body = MlpModel(input_classes, hidden_sizes, None, nn.ReLU, None)
        self.p_a, self.p_o, self.p_r = prev_action.any().item(), prev_option.any().item(), prev_reward.any().item()
        rnn_input_size = self.body.output_size + (output_size * self.p_a) + (option_size * self.p_o) + self.p_r
        self.rnn = rnn_class(rnn_input_size, rnn_size)  # Concat (optional) action, reward, option.
        self.oc = tscr(OptionCriticHead_SharedPreprocessor(
            input_size=rnn_size,
            output_size=output_size,
            option_size=option_size,
            intra_option_policy='discrete',
            use_interest=use_interest,
            use_diversity=use_diversity,
            use_attention=use_attention,
            baselines_init=baselines_init))
        if baselines_init:
            self.rnn.apply(partial(apply_init, gain=O_INIT_VALUES['lstm']))
            self.body.apply(apply_init)
        self.body = tscr(self.body)

    def forward(self, observation, prev_action, prev_reward, prev_option, init_rnn_state):
        lead_dim, T, B, _ = infer_leading_dims(observation, self._obs_ndim)
        if init_rnn_state is not None and self.rnn_is_lstm:
            init_rnn_state = tuple(init_rnn_state)  # namedarraytuple -> tuple (h, c)
        o = self.preprocessor(observation.view(T * B))
        features = self.body(o)
        inp_list = ([features.view(T, B, -1)]
                    + ([prev_action.view(T, B, -1)] if self.p_a else [])
                    + ([prev_reward.view(T, B, 1)] if self.p_r else [])
                    + ([prev_option.view(T, B, -1)] if self.p_o else []))
        rnn_input = torch.cat(inp_list, dim=2)
        rnn_out, next_rnn_state = self.rnn(rnn_input, init_rnn_state)
        rnn_out = rnn_out.view(T*B, -1)
        pi, beta, q, pi_omega, q_ent = self.oc(rnn_out)
        pi, beta, q, pi_omega, q_ent = restore_leading_dims((pi, beta, q, pi_omega, q_ent), lead_dim, T, B)
        if self.rnn_is_lstm:
            next_rnn_state = RnnState(*next_rnn_state)  # tuple (h, c) -> namedarraytuple
        return pi, beta, q, pi_omega, next_rnn_state
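
A hypothetical forward pass through the option-critic model; the prev_* flag arrays gate which extra inputs reach the RNN (here the previous option is disabled, so the prev_option tensor is ignored). Helpers are assumed importable as above.

import numpy as np
import torch

T, B, n_obs, n_act, n_opt = 5, 3, 8, 4, 2  # Hypothetical sizes.
model = POMDPOcRnnShared1Model(input_classes=n_obs, output_size=n_act,
                               option_size=n_opt,
                               prev_option=np.zeros(5, dtype=bool))
obs = torch.randint(n_obs, (T, B))
prev_a = torch.zeros(T, B, n_act)  # One-hot previous actions.
prev_r = torch.zeros(T, B)
prev_o = torch.zeros(T, B, n_opt)  # Ignored: all prev_option flags are False.
pi, beta, q, pi_omega, state = model(obs, prev_a, prev_r, prev_o, None)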
Example 3
class POMDPRnnShared0Rnn(nn.Module):
    def __init__(self,
                 input_classes: int,
                 output_size: int,
                 rnn_type: str = 'gru',
                 rnn_size: int = 256,
                 hidden_sizes: Optional[Union[List, Tuple]] = None,
                 baselines_init: bool = True,
                 layer_norm: bool = False,
                 prev_action: int = 2,
                 prev_reward: int = 2,
                 ):
        super().__init__()
        self._obs_dim = 0
        self.rnn_is_lstm = rnn_type != 'gru'
        self.preprocessor = tscr(OneHotLayer(input_classes))
        rnn_class = get_rnn_class(rnn_type, layer_norm)
        rnn_input_size = input_classes
        if prev_action: rnn_input_size += output_size  # Use previous action as input
        if prev_reward: rnn_input_size += 1  # Use previous reward as input
        self.rnn = rnn_class(rnn_input_size, rnn_size)  # Concat action, reward
        self.body = MlpModel(rnn_size, hidden_sizes, None, nn.ReLU, None)
        self.pi = nn.Sequential(nn.Linear(self.body.output_size, output_size), nn.Softmax(-1))
        self.v = nn.Linear(self.body.output_size, 1)
        if baselines_init:
            self.rnn.apply(apply_init)
            self.body.apply(apply_init)
            self.pi.apply(partial(apply_init, gain=O_INIT_VALUES['pi']))
            self.v.apply(partial(apply_init, gain=O_INIT_VALUES['v']))
        self.body, self.pi, self.v = tscr(self.body), tscr(self.pi), tscr(self.v)
        self.p_a = prev_action > 0
        self.p_r = prev_reward > 0

    def forward(self, observation, prev_action, prev_reward, init_rnn_state):
        lead_dim, T, B, _ = infer_leading_dims(observation, self._obs_dim)
        if init_rnn_state is not None and self.rnn_is_lstm:
            init_rnn_state = tuple(init_rnn_state)  # namedarraytuple -> tuple (h, c)
        oh = self.preprocessor(observation)  # Leave in TxB format for lstm
        inp_list = ([oh.view(T, B, -1)]
                    + ([prev_action.view(T, B, -1)] if self.p_a else [])
                    + ([prev_reward.view(T, B, 1)] if self.p_r else []))
        rnn_input = torch.cat(inp_list, dim=2)
        rnn_out, next_rnn_state = self.rnn(rnn_input, init_rnn_state)
        rnn_out = rnn_out.view(T*B, -1)
        rnn_out = self.body(rnn_out)
        pi, v = self.pi(rnn_out), self.v(rnn_out).squeeze(-1)
        pi, v = restore_leading_dims((pi, v), lead_dim, T, B)
        if self.rnn_is_lstm:
            next_rnn_state = RnnState(*next_rnn_state)  # tuple (h, c) -> namedarraytuple
        return pi, v, next_rnn_state
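
A quick shape check for this RNN-first variant, under the same assumptions about the surrounding helpers:

import torch

T, B, n_obs, n_act = 4, 2, 6, 3  # Hypothetical sizes.
model = POMDPRnnShared0Rnn(input_classes=n_obs, output_size=n_act, hidden_sizes=[32])
obs = torch.randint(n_obs, (T, B))
pi, v, state = model(obs, torch.zeros(T, B, n_act), torch.zeros(T, B), None)
# pi: (T, B, n_act); v: (T, B); state: next GRU hidden state.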
Example 4
class CartpoleFfModel(torch.nn.Module):

    def __init__(
            self,
            image_shape,
            output_size,
            fc_sizes=[64, 64],
            basis=None,  # Unused here.
            gain_type="xavier",
            out=None,  # Unused here.
            ):
        super().__init__()

        input_size = image_shape[0]
        # Main body
        self.head = MlpModel(input_size, fc_sizes)
        # Policy output
        self.pi = torch.nn.Linear(fc_sizes[-1], output_size)
        # Value output
        self.value = torch.nn.Linear(fc_sizes[-1], 1)

        if gain_type == "xavier":
            self.head.apply(weight_init)
            self.pi.apply(weight_init)
            self.value.apply(weight_init)


    def forward(self, in_state, prev_action, prev_reward):
        """Feedforward layers process as [T*B,H]. Return same leading dims as
        input, can be [T,B], [B], or []."""
        state = in_state.type(torch.float)  # Expect torch.uint8 inputs
        # Infer (presence of) leading dimensions: [T,B], [B], or [].
        lead_dim, T, B, state_shape = infer_leading_dims(state, 1)

        base = self.head(state.view(T * B, -1))
        pi = F.softmax(self.pi(base), dim=-1)
        v = self.value(base).squeeze(-1)

        # Restore leading dimensions: [T,B], [B], or [], as input.
        pi, v = restore_leading_dims((pi, v), lead_dim, T, B)
        return pi, v
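
The feedforward model accepts any leading-dim layout that infer_leading_dims handles; a hypothetical check with [B,H] and [T,B,H] inputs (prev_action and prev_reward are unused here, and weight_init is assumed importable):

import torch

model = CartpoleFfModel(image_shape=(4,), output_size=2)  # Hypothetical Cartpole sizes.
pi, v = model(torch.rand(8, 4), None, None)     # [B, H] input.
print(pi.shape, v.shape)                        # (8, 2), (8,)
pi, v = model(torch.rand(5, 8, 4), None, None)  # [T, B, H] input.
print(pi.shape, v.shape)                        # (5, 8, 2), (5, 8)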
Example 5
class BsuiteRnnShared1Rnn(nn.Module):
    def __init__(self,
                 input_shape: Tuple,
                 output_size: int,
                 rnn_type: str = 'gru',
                 rnn_size: int = 256,
                 hidden_sizes: Optional[Union[List, Tuple]] = None,
                 baselines_init: bool = True,
                 layer_norm: bool = False
                 ):
        super().__init__()
        self._obs_dim = 2
        self.rnn_is_lstm = rnn_type != 'gru'
        input_size = int(np.prod(input_shape))
        rnn_class = get_rnn_class(rnn_type, layer_norm)
        self.body = MlpModel(input_size, hidden_sizes, None, nn.ReLU, None)
        self.rnn = rnn_class(self.body.output_size + output_size + 1, rnn_size)  # Concat action, reward
        self.pi = nn.Sequential(nn.ReLU(), nn.Linear(rnn_size, output_size), nn.Softmax(-1))
        self.v = nn.Sequential(nn.ReLU(), nn.Linear(rnn_size, 1))
        if baselines_init:
            self.rnn.apply(apply_init)
            self.body.apply(apply_init)
            self.pi.apply(partial(apply_init, gain=O_INIT_VALUES['pi']))
            self.v.apply(partial(apply_init, gain=O_INIT_VALUES['v']))
        self.body, self.pi, self.v = tscr(self.body), tscr(self.pi), tscr(self.v)

    def forward(self, observation, prev_action, prev_reward, init_rnn_state):
        lead_dim, T, B, _ = infer_leading_dims(observation, self._obs_dim)
        if init_rnn_state is not None and self.rnn_is_lstm:
            init_rnn_state = tuple(init_rnn_state)  # namedarraytuple -> tuple (h, c)
        features = self.body(observation.view(T*B, -1))
        rnn_input = torch.cat([
            features.view(T, B, -1),
            prev_action.view(T, B, -1),  # Assumed onehot.
            prev_reward.view(T, B, 1),
            ], dim=2)
        rnn_out, next_rnn_state = self.rnn(rnn_input, init_rnn_state)
        rnn_out = rnn_out.view(T*B, -1)
        pi, v = self.pi(rnn_out), self.v(rnn_out).squeeze(-1)
        pi, v = restore_leading_dims((pi, v), lead_dim, T, B)
        if self.rnn_is_lstm:
            next_rnn_state = RnnState(*next_rnn_state)  # tuple (h, c) -> namedarraytuple
        return pi, v, next_rnn_state
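
Carrying the recurrent state across segments, assuming RnnState is a namedarraytuple over (h, c); this exercises the LSTM branch of the state handling. Sizes are hypothetical.

import torch

T, B, obs_shape, n_act = 4, 2, (3, 2), 3
model = BsuiteRnnShared1Rnn(input_shape=obs_shape, output_size=n_act,
                            rnn_type='lstm', hidden_sizes=[64])
obs = torch.rand(T, B, *obs_shape)
prev_a, prev_r = torch.zeros(T, B, n_act), torch.zeros(T, B)
pi, v, state = model(obs, prev_a, prev_r, None)   # First segment: no initial state.
pi, v, state = model(obs, prev_a, prev_r, state)  # Next segment: carry (h, c) forward.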