Example #1
    def __init__(self,
                 width: int,
                 input_size: int,
                 hidden_size: int,
                 n_heads: int,
                 n_layers: int,
                 n_highway: int,
                 use_position: bool = False,
                 use_relative_position: bool = False,
                 dropout: float = 0.0):
        super(SelfAttentiveLBLBiLMV3, self).__init__()
        self.use_position = use_position
        self.use_relative_position_weights = use_relative_position
        self.n_layers = n_layers
        self.n_highway = n_highway
        self.n_heads = n_heads
        self.input_size = input_size
        self.width = width
        self.hidden_size = hidden_size

        forward_attns, backward_attns = [], []
        forward_blocks, backward_blocks = [], []

        # Per-layer attention and Highway modules, one set for each direction.
        for _ in range(n_layers):
            if self.use_relative_position_weights:
                # Attention with learned relative-position embeddings over a
                # window of width + 1 positions.
                forward_attn = MultiHeadedAttentionWithRelativePositionEmbeddings(
                    n_heads,
                    hidden_size,
                    width=width + 1,
                    left_to_right=True,
                    dropout=dropout)
                backward_attn = MultiHeadedAttentionWithRelativePositionEmbeddings(
                    n_heads,
                    hidden_size,
                    width=width + 1,
                    left_to_right=False,
                    dropout=dropout)
            else:
                forward_attn = MultiHeadedAttention(n_heads, hidden_size,
                                                    dropout)
                backward_attn = MultiHeadedAttention(n_heads, hidden_size,
                                                     dropout)

            forward_attns.append(forward_attn)
            backward_attns.append(backward_attn)
            forward_blocks.append(Highway(hidden_size, n_highway))
            backward_blocks.append(Highway(hidden_size, n_highway))

        self.forward_attns = torch.nn.ModuleList(forward_attns)
        self.backward_attns = torch.nn.ModuleList(backward_attns)

        self.forward_blocks = torch.nn.ModuleList(forward_blocks)
        self.backward_blocks = torch.nn.ModuleList(backward_blocks)

        if self.use_position:
            self.position = PositionalEncoding(hidden_size)
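
A minimal instantiation sketch (not from the source): the argument values below are
illustrative, and SelfAttentiveLBLBiLMV3 together with its submodules is assumed to be
importable from the surrounding module.

encoder = SelfAttentiveLBLBiLMV3(width=4,
                                 input_size=512,
                                 hidden_size=512,
                                 n_heads=8,
                                 n_layers=2,
                                 n_highway=1,
                                 use_position=True,
                                 use_relative_position=True,
                                 dropout=0.1)  # illustrative hyperparameters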
Example #2
    def __init__(
        self,
        decoding_dim: int,
        target_embedding_dim: int,
        feedforward_hidden_dim: int,
        num_layers: int,
        num_attention_heads: int,
        use_positional_encoding: bool = True,
        positional_encoding_max_steps: int = 5000,
        dropout_prob: float = 0.1,
        residual_dropout_prob: float = 0.2,
        attention_dropout_prob: float = 0.1,
    ) -> None:

        super().__init__(decoding_dim=decoding_dim,
                         target_embedding_dim=target_embedding_dim,
                         decodes_parallel=True)

        # Attention module; deep copies of it are used for the two attention
        # sub-layers of each decoder layer.
        attn = MultiHeadedAttention(num_attention_heads, decoding_dim,
                                    attention_dropout_prob)
        feed_forward = PositionwiseFeedForward(decoding_dim,
                                               feedforward_hidden_dim,
                                               dropout_prob)
        # Embedding scale factor: sqrt of the model dimension, per the
        # Transformer convention.
        self._embed_scale = math.sqrt(decoding_dim)
        self._positional_embedder = (
            PositionalEncoding(decoding_dim, positional_encoding_max_steps)
            if use_positional_encoding else None)
        self._dropout = nn.Dropout(dropout_prob)
        # Decoder stack of num_layers layers built from a single DecoderLayer
        # template.
        self._self_attention = Decoder(
            DecoderLayer(decoding_dim, deepcopy(attn), deepcopy(attn),
                         feed_forward, residual_dropout_prob), num_layers)
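
A minimal instantiation sketch (not from the source). The enclosing class name is not shown
in the snippet, so TransformerDecoderNet below is a hypothetical stand-in; all argument
values are illustrative.

decoder_net = TransformerDecoderNet(  # hypothetical class name
    decoding_dim=512,
    target_embedding_dim=512,
    feedforward_hidden_dim=2048,
    num_layers=6,
    num_attention_heads=8,
    use_positional_encoding=True,
    dropout_prob=0.1,
)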
Example #3
    def __init__(
        self,
        num_layers: int,
        decoding_dim: int,
        target_embedding_dim: int,
        feedforward_hidden_dim: int,
        num_attention_heads: int,
        combiner: TransformerCombiner,
        num_sources: int,
        use_positional_encoding: bool = True,
        positional_encoding_max_steps: int = 5000,
        dropout_prob: float = 0.1,
        residual_dropout_prob: float = 0.2,
        attention_dropout_prob: float = 0.2,
    ) -> None:
        super().__init__(decoding_dim,
                         target_embedding_dim,
                         decodes_parallel=True)

        self._decoding_dim = decoding_dim
        self._embed_scale = math.sqrt(decoding_dim)

        self._positional_embedder = (PositionalEncoding(
            input_dim=decoding_dim, max_len=positional_encoding_max_steps)
                                     if use_positional_encoding else None)
        self._dropout = nn.Dropout(dropout_prob)

        generic_attn = MultiHeadedAttention(num_attention_heads, decoding_dim,
                                            attention_dropout_prob)
        # Wraps the generic attention so each layer can attend over num_sources
        # inputs and merge them with the provided combiner.
        combined_attn = AttentionCombiner(num_sources, generic_attn, combiner)
        feed_forward = PositionwiseFeedForward(decoding_dim,
                                               feedforward_hidden_dim,
                                               dropout_prob)

        layer = DecoderLayer(size=decoding_dim,
                             self_attn=deepcopy(generic_attn),
                             src_attn=deepcopy(combined_attn),
                             feed_forward=feed_forward,
                             dropout=residual_dropout_prob)

        # num_layers copies of the decoder layer, followed by a final layer norm.
        self._self_attention_layers = _clones(layer, num_layers)
        self.norm = nn.LayerNorm(layer.size)
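
A minimal instantiation sketch (not from the source). As in the previous example, the
enclosing class name is not shown, so MultiSourceTransformerDecoderNet is a hypothetical
stand-in; the combiner is assumed to be a TransformerCombiner built elsewhere.

decoder_net = MultiSourceTransformerDecoderNet(  # hypothetical class name
    num_layers=6,
    decoding_dim=512,
    target_embedding_dim=512,
    feedforward_hidden_dim=2048,
    num_attention_heads=8,
    combiner=my_combiner,  # assumed: a TransformerCombiner instance
    num_sources=2,
)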
Example #4
    def __init__(self,
                 width: int,
                 input_size: int,
                 hidden_size: int,
                 n_heads: int,
                 n_layers: int,
                 n_highway: int,
                 use_position: bool = False,
                 use_relative_position: bool = False,
                 dropout: float = 0.0):
        super(SelfAttentiveLBLBiLM, self).__init__()
        self.use_position = use_position
        self.use_relative_position_weights = use_relative_position
        self.n_layers = n_layers
        self.n_highway = n_highway
        self.n_heads = n_heads
        self.input_size = input_size
        self.width = width
        self.hidden_size = hidden_size

        forward_attns, backward_attns = [], []
        forward_paddings, backward_paddings = [], []
        forward_blocks, backward_blocks = [], []
        forward_weights, backward_weights = [], []

        # Per-layer attention modules, one for each direction.
        for _ in range(n_layers):
            forward_attns.append(
                MultiHeadedAttention(n_heads, hidden_size, dropout))
            backward_attns.append(
                MultiHeadedAttention(n_heads, hidden_size, dropout))

            # Learnable padding parameters of shape (width, hidden_size),
            # initialised with standard deviation 1 / sqrt(hidden_size).
            forward_paddings.append(
                torch.nn.Parameter(
                    torch.randn(width, hidden_size) / np.sqrt(hidden_size)))
            backward_paddings.append(
                torch.nn.Parameter(
                    torch.randn(width, hidden_size) / np.sqrt(hidden_size)))

            forward_blocks.append(Highway(hidden_size, n_highway))
            backward_blocks.append(Highway(hidden_size, n_highway))

            if self.use_relative_position_weights:
                # One learnable scalar per relative position in the
                # (width + 1)-wide window, for each direction.
                forward_weights.append(
                    torch.nn.Parameter(torch.randn(width + 1)))
                backward_weights.append(
                    torch.nn.Parameter(torch.randn(width + 1)))

        self.forward_attns = torch.nn.ModuleList(forward_attns)
        self.backward_attns = torch.nn.ModuleList(backward_attns)

        self.forward_paddings = torch.nn.ParameterList(forward_paddings)
        self.backward_paddings = torch.nn.ParameterList(backward_paddings)

        self.forward_blocks = torch.nn.ModuleList(forward_blocks)
        self.backward_blocks = torch.nn.ModuleList(backward_blocks)

        if self.use_relative_position_weights:
            self.forward_weights = torch.nn.ParameterList(forward_weights)
            self.backward_weights = torch.nn.ParameterList(backward_weights)

        if self.use_position:
            self.position = PositionalEncoding(hidden_size)
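
A minimal instantiation sketch (not from the source): the argument values are illustrative,
and SelfAttentiveLBLBiLM together with its submodules is assumed to be importable from the
surrounding module.

encoder = SelfAttentiveLBLBiLM(width=4,
                               input_size=512,
                               hidden_size=512,
                               n_heads=8,
                               n_layers=2,
                               n_highway=1,
                               use_position=True,
                               use_relative_position=True,
                               dropout=0.1)  # illustrative hyperparameters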