Example #1
    def __init__(self,
                 arch_flag,
                 syntax_embedding_size,
                 d_model=512,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=2048,
                 dropout: float = 0.1,
                 activation="relu"):
        super(TransformerZ, self).__init__()

        self.arch_flag = arch_flag

        encoder_layer = TransformerEncoderLayer(d_model, nhead,
                                                dim_feedforward, dropout)
        encoder_norm = LayerNorm(d_model)
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
                                          encoder_norm)

        decoder_layer = TransformerDecoderLayer(d_model, nhead,
                                                dim_feedforward, dropout)
        decoder_norm = LayerNorm(d_model)
        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers,
                                          decoder_norm)

        if self.arch_flag == "ENC_DEC":
            self.fc_enc_dec = nn.Linear(syntax_embedding_size + d_model,
                                        d_model)

        self._reset_parameters()

        self.d_model = d_model
        self.nhead = nhead
Example #2
    def __init__(self, config):
        """
        :param input_hidden: first-hidden layer input embed-dim
        :type input_hidden: int
        :param intermediate_hidden: layer-(hidden)-layer middle point weight
        :type intermediate_hidden: int
        :param dropout_rate: dropout rate, defaults to None
        :type dropout_rate: float, optional
        """
        # paper specifies skip connections, layer normalization, and orthogonal initialization

        super().__init__()
        # 3,679,744 x2 params
        self.linear_1 = nn.Linear(config.feed_forward2_hidden,
                                  config.feed_forward2_hidden)
        self.linear_2 = nn.Linear(config.feed_forward2_hidden,
                                  config.feed_forward2_hidden)
        # self.linear_3 = nn.Linear(
        #     config.feed_forward2_hidden, config.feed_forward2_hidden
        # )
        self.norm1 = LayerNorm(config.feed_forward2_hidden)
        self.norm2 = LayerNorm(config.feed_forward2_hidden)
        # self.norm3 = LayerNorm(config.feed_forward2_hidden)
        self.final = nn.Linear(config.feed_forward2_hidden,
                               config.num_embed_hidden)
        self.orthogonal_initialization()  # torch implementation works out of the box
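The `orthogonal_initialization` call above is defined elsewhere in that project; a minimal sketch of what such a method might look like, assuming it only needs to cover the two square skip-connected linears (the method body below is an assumption, not the project's actual code):

    def orthogonal_initialization(self):
        # Hypothetical sketch: apply torch's built-in orthogonal init to the
        # weight matrices of linear_1 and linear_2 (biases left at their defaults).
        for layer in (self.linear_1, self.linear_2):
            nn.init.orthogonal_(layer.weight)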
Example #3
    def __init__(self, max_span_size, hidden_size, num_heads, query_encoder,
                 passage_encoder, passage_selection):
        super(SupportingTokenIdentification, self).__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.query_encoder = query_encoder
        self.passage_encoder = passage_encoder
        self.max_span_size = max_span_size

        self.passage_selection = passage_selection

        self.interaction = Interaction(hidden_size)
        self.query_blocks = nn.ModuleList(
            [TransformerBlock(num_heads, 5 * hidden_size, hidden_size)] + [
                TransformerBlock(num_heads, hidden_size, hidden_size)
                for i in range(1)
            ])
        self.passage_blocks = nn.ModuleList(
            [TransformerBlock(num_heads, 5 * hidden_size, hidden_size)] + [
                TransformerBlock(num_heads, hidden_size, hidden_size)
                for i in range(2)
            ])
        self.norm1 = LayerNorm(hidden_size)
        self.norm2 = LayerNorm(hidden_size)
        self.scorer = nn.Linear(hidden_size, 1)
Example #4
    def __init__(self, input_channels=8, output_channels=8, input_height=32):
        super(attention_block, self).__init__()

        reduce_dim = output_channels // 2
        self.reduce_dim = reduce_dim

        # When the block changes the number of channels, the input must be projected so the residual connection matches
        if input_channels == output_channels:
            self.residual = False
        else:
            self.reduce_residual = nn.Conv2d(input_channels, output_channels, kernel_size=1)
            self.residual = True

        self.reduce1 = nn.Conv2d(input_channels, reduce_dim, kernel_size=1)  # 1x1
        self.conv1 = depthwise_separable_conv_bn(reduce_dim, reduce_dim)

        self.el = nn.ELU()
        self.reduce2 = nn.Conv2d(reduce_dim, reduce_dim * 3, kernel_size=1)
        self.tanh1 = nn.Tanh()
        self.tanh2 = nn.Tanh()
        self.sig1 = nn.Sigmoid()

        self.reduce3 = nn.Conv2d(reduce_dim, output_channels, kernel_size=1)
        self.conv2 = depthwise_separable_conv_bn(output_channels, output_channels)
        self.el2 = nn.ELU()
        self.MISH = Mish()

        w = reduce_dim
        # elementwise_affine=True adds a learnable per-element scale and bias; False applies plain normalization with no learnable parameters
        self.ln_1 = LayerNorm(w, elementwise_affine=True)
        self.ln_2 = LayerNorm(w, elementwise_affine=False)
        self.ln_3 = LayerNorm(w, elementwise_affine=False)
        self.ln_4 = LayerNorm(w, elementwise_affine=False)
        self.ln_5 = LayerNorm(w, elementwise_affine=False)
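As a quick standalone check of what `elementwise_affine` controls in `torch.nn.LayerNorm` (not part of the example above):

import torch
from torch.nn import LayerNorm

ln_learned = LayerNorm(8, elementwise_affine=True)   # learnable per-element weight and bias
ln_fixed = LayerNorm(8, elementwise_affine=False)    # pure normalization, no parameters

x = torch.randn(4, 8)
print(ln_learned.weight.shape, ln_learned.bias.shape)   # torch.Size([8]) torch.Size([8])
print(ln_fixed.weight is None, ln_fixed.bias is None)   # True True
print(ln_learned(x).shape, ln_fixed(x).shape)           # both torch.Size([4, 8])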
Example #5
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 use_gate=False):
        # Reordering of operations (identity map reordering) as done in https://arxiv.org/pdf/1910.06764.pdf
        # d_model: dimension of the embedding for each input
        super(StableTransformerLayer, self).__init__()

        self.use_gate = use_gate
        self.gate_mha = GRUGate(d_model)
        self.gate_mlp = GRUGate(d_model)
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)

        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        self.activation = F.relu
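A minimal sketch of the forward pass this layer implies, assuming `GRUGate(d_model)` is called as `gate(residual_input, sublayer_output)`; the pre-norm ordering follows the linked GTrXL paper, and the exact gating placement is an assumption:

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        # Pre-norm + self-attention, then GRU gating (or a plain residual).
        src2 = self.norm1(src)
        attn_out, _ = self.self_attn(src2, src2, src2, attn_mask=src_mask,
                                     key_padding_mask=src_key_padding_mask)
        attn_out = self.dropout1(attn_out)
        src = self.gate_mha(src, attn_out) if self.use_gate else src + attn_out

        # Pre-norm + position-wise feed-forward, gated the same way.
        src2 = self.norm2(src)
        ff = self.dropout2(self.linear2(self.dropout(self.activation(self.linear1(src2)))))
        src = self.gate_mlp(src, ff) if self.use_gate else src + ff
        return src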
Example #6
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu"):
        super(TransformerDecoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = MultiheadAttention(d_model,
                                                 nhead,
                                                 dropout=dropout)
        # Implementation of Feedforward model
        if activation == "glu":
            self.linear1 = Linear(d_model, 2 * dim_feedforward)
        else:
            self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.norm3 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)
        self.dropout3 = Dropout(dropout)

        self.activation = _get_activation_fn(activation)
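In the "glu" branch above, linear1 outputs 2 * dim_feedforward features that a gated linear unit halves back to dim_feedforward, so linear2 keeps its usual shape. A hedged sketch of just the feed-forward sub-block (the `use_glu` flag is an assumption standing in for the activation check):

    def _feed_forward(self, x, use_glu):
        # F is torch.nn.functional, as in the surrounding examples.
        if use_glu:
            h = F.glu(self.linear1(x), dim=-1)    # 2*dim_feedforward -> dim_feedforward
        else:
            h = self.activation(self.linear1(x))  # elementwise activation, dim_feedforward
        return self.linear2(self.dropout(h))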
Example #7
    def __init__(self,
                 d_model=512,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=2048,
                 dropout: float = 0.1,
                 activation="relu"):
        super(TransformerZ, self).__init__()

        encoder_layer = TransformerEncoderLayer(d_model, nhead,
                                                dim_feedforward, dropout)
        encoder_norm = LayerNorm(d_model)
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
                                          encoder_norm)

        decoder_layer = TransformerDecoderLayer(d_model, nhead,
                                                dim_feedforward, dropout)
        decoder_norm = LayerNorm(d_model)
        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers,
                                          decoder_norm)

        self._reset_parameters()

        self.d_model = d_model
        self.nhead = nhead
Example #8
    def __init__(self,
                 d_model,
                 n_cat_embeddings,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu"):
        super().__init__()

        self.self_attn = MultiheadAttention(d_model,
                                            n_cat_embeddings,
                                            nhead,
                                            dropout=dropout)

        # Implementation of Feedforward model
        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        self.activation = _get_activation_fn(activation)
Example #9
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 d_global2=None):
        super(TransformerEncoderLayerImproved, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)

        if d_global2 is not None:
            self.linear_global2 = Linear(d_global2, d_model)

        # Implementation of Feedforward model
        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2_2 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        self.activation = _get_activation_fn(activation)
Example #10
    def __init__(self, num_features=22, nhead=3, dim_feedforward=2048, dropout=0.1, activation = "relu", 
                 use_LayerNorm = True, init_resweight = 0, resweight_trainable = True):
        super(ReZeroEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(num_features, nhead, dropout=dropout)
        
        # Define the residual weight for ReZero
        self.resweight = torch.nn.Parameter(torch.Tensor([init_resweight]), requires_grad = resweight_trainable)

        # Implementation of Feedforward model
        self.linear1 = Linear(num_features, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, num_features)
        self.use_LayerNorm = use_LayerNorm
        if self.use_LayerNorm:
            self.norm1 = LayerNorm(num_features)
            self.norm2 = LayerNorm(num_features)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        if activation == "relu":
            self.activation = F.relu
        elif activation == "gelu":
            self.activation = F.gelu
        elif activation == "tanh":
            self.activation = torch.tanh
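A minimal sketch of how the ReZero residual weight is typically applied in forward (each sub-layer output is scaled by the near-zero `resweight` before the residual add); whether the optional LayerNorms sit before or after each sub-layer varies between implementations, so the placement here is an assumption:

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        attn_out, _ = self.self_attn(src, src, src, attn_mask=src_mask,
                                     key_padding_mask=src_key_padding_mask)
        src = src + self.resweight * self.dropout1(attn_out)
        if self.use_LayerNorm:
            src = self.norm1(src)

        ff = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.resweight * self.dropout2(ff)
        if self.use_LayerNorm:
            src = self.norm2(src)
        return src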
Example #11
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=256,
                 dropout=0,
                 activation="relu"):

        from torch.nn.modules.activation import MultiheadAttention
        from torch.nn.modules.normalization import LayerNorm
        from torch.nn.modules.dropout import Dropout
        from torch.nn.modules.rnn import LSTM
        from torch.nn.modules.linear import Linear

        super(DPTNetBlock, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        # self.linear1 = Linear(d_model, dim_feedforward)
        self.rnn = LSTM(d_model, d_model * 2, 1, bidirectional=True)
        self.dropout = Dropout(dropout)
        # self.linear2 = Linear(dim_feedforward, d_model)
        self.linear2 = Linear(d_model * 2 * 2, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        self.activation = _get_activation_fn(activation)
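Here the position-wise feed-forward is replaced by a bidirectional LSTM (DPTNet style). A hedged sketch of the forward pass, assuming sequence-first input as MultiheadAttention expects; note the bidirectional LSTM output is 2 * (d_model * 2) = 4 * d_model wide, which is exactly what linear2 projects back to d_model:

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        # Self-attention sub-block with residual + LayerNorm.
        attn_out, _ = self.self_attn(src, src, src, attn_mask=src_mask,
                                     key_padding_mask=src_key_padding_mask)
        src = self.norm1(src + self.dropout1(attn_out))

        # LSTM "feed-forward": (seq, batch, d_model) -> (seq, batch, 4*d_model) -> d_model.
        rnn_out, _ = self.rnn(src)
        src = self.norm2(src + self.dropout2(self.linear2(self.dropout(rnn_out))))
        return src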
Example #12
    def __init__(self, d_model, nhead, num_encoder_layers, num_decoder_layers,
                 dim_feedforward, dropout, activation, src_vocab_size, tgt_vocab_size):
        super(TransformerModel, self).__init__()
        self.pos_encoder = PositionalEncoding(
            d_model=d_model, dropout=0.1)  # , max_len=100)
        encoder_layer = TransformerEncoderLayer(
                        d_model, nhead, dim_feedforward, dropout, activation)
        encoder_norm = LayerNorm(d_model)
        self.encoder = TransformerEncoder(
            encoder_layer, num_encoder_layers, encoder_norm)
        decoder_layer = TransformerDecoderLayer(
            d_model, nhead, dim_feedforward, dropout, activation)
        decoder_norm = LayerNorm(d_model)
        self.decoder = TransformerDecoder(
            decoder_layer, num_decoder_layers, decoder_norm)

        self.d_model = d_model
        self.nhead = nhead
        self.linear = Linear(d_model, tgt_vocab_size)
        self.transformer = Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward,
                                       dropout=dropout, activation=activation)
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)

        self._reset_parameters()
Example #13
    def __init__(self,
                 d_model: int = 512,
                 nhead: int = 8,
                 num_encoder_layers: int = 6,
                 num_decoder_layers: int = 6,
                 dim_feedforward: int = 2048,
                 dropout: float = 0.1,
                 activation: str = "relu",
                 custom_encoder: Optional[Any] = None,
                 custom_decoder: Optional[Any] = None) -> None:
        super(TransformerZ, self).__init__()

        encoder_layer = TransformerEncoderLayer(d_model, nhead,
                                                dim_feedforward, dropout,
                                                activation)
        encoder_norm = LayerNorm(d_model)
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
                                          encoder_norm)

        decoder_layer = TransformerDecoderLayer(d_model, nhead,
                                                dim_feedforward, dropout,
                                                activation)
        decoder_norm = LayerNorm(d_model)
        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers,
                                          decoder_norm)

        self._reset_parameters()

        self.d_model = d_model
        self.nhead = nhead
Example #14
    def __init__(self,
                 d_model=512,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=2048,
                 dropout=0.1,
                 custom_encoder=None,
                 custom_decoder=None):
        super(Transformer, self).__init__()

        if custom_encoder is not None:
            self.encoder = custom_encoder
        else:
            encoder_layer = TransformerEncoderLayer(d_model, nhead,
                                                    dim_feedforward, dropout)
            encoder_norm = LayerNorm(d_model)
            self.encoder = TransformerEncoder(encoder_layer,
                                              num_encoder_layers, encoder_norm)

        if custom_decoder is not None:
            self.decoder = custom_decoder
        else:
            decoder_layer = TransformerDecoderLayer(d_model, nhead,
                                                    dim_feedforward, dropout)
            decoder_norm = LayerNorm(d_model)
            self.decoder = TransformerDecoder(decoder_layer,
                                              num_decoder_layers, decoder_norm)

        self._reset_parameters()

        self.d_model = d_model
        self.nhead = nhead
Example #15
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu"):
        super(TransformerEncoderLayer, self).__init__()
        # global countz
        # countz += 1
        # self.count = countz
        # print("enc", countz)
        self.self_attn = MultiheadAttentionZSelf(d_model,
                                                 nhead,
                                                 dropout=dropout,
                                                 name="EncoderSelfAttn")
        # Implementation of Feedforward model
        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        self.activation = _get_activation_fn(activation)
Example #16
    def __init__(self,
                 d_model: int,
                 nhead: int,
                 dim_feedforward: int = 2048,
                 dropout=0.1) -> None:
        super(TransformerDecoderLayerCustom3,
              self).__init__(d_model, nhead, dim_feedforward, dropout)
        self.self_attn = MultiheadAttentionCustom2(d_model,
                                                   nhead,
                                                   dropout=dropout)
        self.multihead_attn = MultiheadAttentionCustom2(d_model,
                                                        nhead,
                                                        dropout=dropout)

        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.norm3 = LayerNorm(d_model)

        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)
        self.dropout3 = Dropout(dropout)
Example #17
    def __init__(self,
                 embed_dim,
                 dropout=0.1,
                 dim_feedforward=128,
                 cycles=3,
                 passthrough_mode=False,
                 q_k_sim='dot'):
        '''
        Hierarchical attention puts the keys of long sequences into slots/buckets
        to improve sparsity and reduce time-space complexity from O(n^2) to O(n log n).
        This is achieved in two passes: first we populate the slots with representative
        samples of the tokens below them. Then, when computing token-level attention,
        the queries are compared to the slots first, and the derived attention scores
        weigh the tokens and lower-level attention scores under that slot.
        '''
        super().__init__()
        self.embed_dim = embed_dim
        self.cycles = cycles
        self.passthrough_mode = passthrough_mode
        self.q_k_sim = q_k_sim

        self.scaling = float(embed_dim)**-0.5

        self.slot_Wq = Linear(embed_dim, embed_dim, bias=False)
        self.slot_Wk = Linear(embed_dim, embed_dim, bias=False)
        self.slot_Wv = Linear(embed_dim, embed_dim, bias=False)

        self.Wq = Linear(embed_dim, embed_dim, bias=False)
        self.Wk = Linear(embed_dim, embed_dim, bias=False)
        self.Wv = Linear(embed_dim, embed_dim, bias=False)

        self.linear1 = Linear(embed_dim, dim_feedforward)
        self.linear2 = Linear(dim_feedforward, embed_dim)

        if passthrough_mode:
            dropout = 0
            self.slot_Wq.weight.data = torch.eye(embed_dim, embed_dim)
            self.slot_Wk.weight.data = torch.eye(embed_dim, embed_dim)
            self.slot_Wv.weight.data = torch.eye(embed_dim, embed_dim)

            self.Wq.weight.data = torch.eye(embed_dim, embed_dim)
            self.Wk.weight.data = torch.eye(embed_dim, embed_dim)
            self.Wv.weight.data = torch.eye(embed_dim, embed_dim)

            self.linear1.weight.data = torch.eye(dim_feedforward, embed_dim)
            self.linear2.weight.data = torch.eye(embed_dim, dim_feedforward)
            self.linear1.bias.data = torch.zeros((dim_feedforward, ))
            self.linear2.bias.data = torch.zeros((embed_dim, ))

            self.norm1 = lambda x: x
            self.norm2 = lambda x: x

            self.scaling = 1.0
        else:
            self.norm1 = LayerNorm(embed_dim)
            self.norm2 = LayerNorm(embed_dim)

        self.dropout = Dropout(dropout)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)
Example #18
    def __init__(self, d_model=512, d_ff=2048, n_heads=8, dropout=0.1):
        super().__init__()
        self.attn_head = MultiHeadAttention(d_model, n_heads, dropout)
        self.layer_norm1 = LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.position_wise_feed_forward = PositionwiseFeedForward(d_model,
                                                                  d_ff,
                                                                  dropout=0.1)
        self.layer_norm2 = LayerNorm(d_model)
Example #19
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.0):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttentionZ(d_model, nhead)

        self.linear1 = Linear(d_model, dim_feedforward)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)

        self.activation = F.relu  #get_activation_fn(activation)
Example #20
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)
Example #21
    def __init__(self, d_model=768, n_head=12, dropout=0.1):
        super(TransformerBlock, self).__init__()
        self.attn = Attention(d_model=768,
                              n_head=12,
                              d_head=64,
                              n_ctx=1024,
                              bias=True,
                              scale=False)
        self.feedforward = FeedForward(dropout=0.1, d_model=768, nx=768 * 4)
        self.ln_1 = LayerNorm(d_model)
        self.ln_2 = LayerNorm(d_model)
Example #22
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, er_len=None):
        super(TransformerEncoderLayerRPR, self).__init__()
        self.self_attn = MultiheadAttentionRPR(d_model, nhead, dropout=dropout, er_len=er_len)
        # Implementation of Feedforward model
        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)
Example #23
    def __init__(self, num_heads, input_hidden_size, output_hidden_size, activation=None):
        super(TransformerBlock, self).__init__()
        self.output_hidden_size=output_hidden_size
        self.self_attn = nn.MultiheadAttention(input_hidden_size, num_heads, dropout=0.1)
        self.norm1 = LayerNorm(input_hidden_size)
        self.norm2 = LayerNorm(input_hidden_size)
        self.linear1 = nn.Linear(input_hidden_size, output_hidden_size)
        self.linear2 = nn.Linear(output_hidden_size, output_hidden_size)

        if activation is None:
            self.activation= F.relu
        else:
            self.activation=activation
Example #24
class ConveRTEncoderLayer(nn.Module):
    """Single Transformer block which is same architecture with Attention is All You Need"""
    def __init__(self, config: ConveRTModelConfig):
        """ initialize single encoder layer (Transformer Block)

        single encoder layer is consisted with under layers.

        1. single-head self-attention
        2. fead-forward-1 layer

        :param config: model config
        :type config: ConveRTModelConfig
        """
        super().__init__()
        # TODO: relative position self attention
        self.self_attention = SelfAttention(config)
        self.norm1 = LayerNorm(config.num_embed_hidden)
        self.dropout1 = nn.Dropout(config.dropout_rate)

        self.fead_forward = ConveRTInnerFeedForward(
            config.num_embed_hidden, config.feed_forward1_hidden,
            config.dropout_rate)
        self.norm2 = LayerNorm(config.num_embed_hidden)
        self.dropout2 = nn.Dropout(config.dropout_rate)

    def forward(self,
                embed_output: torch.Tensor,
                attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """calculating single Transformer block with under procedure.

        1. single-self attention (EMBED_DIM -> ATTEN_PROJ -> EMBED_DIM)
        2. first noramlization + residual connection
        3. fead-forward-1 layer (EMBED_DIM -> FFD-1-DIM -> EMBED_DIM)
        4. second normalization + residual connection

        :param embed_output: sub-word, positional embedding sum output
        :type embed_output: torch.Tensor
        :param attention_mask: 1.0 for token position, 0.0 for padding position, defaults to None
        :type attention_mask: Optional[torch.Tensor], optional
        :return: Transformer block forward output
        :rtype: torch.Tensor
        """
        self_attn_output = self.self_attention.forward(
            embed_output, attention_mask=attention_mask)
        self_attn_output = self.dropout1(self_attn_output)
        norm1_output = self.norm1.forward(self_attn_output + embed_output)

        feed_forward_output = self.fead_forward.forward(norm1_output)
        feed_forward_output = self.dropout2(feed_forward_output)
        norm2_output = self.norm2.forward(feed_forward_output + norm1_output)
        return norm2_output
Example #25
    def __init__(self, d_model, nhead, min_dist, max_dist, dim_feedforward=2048, dropout=0.1, activation="relu"):
        super(TransformerEncoderLayerWithRelativePositionalEncoding, self).__init__()
        self.self_attn = MultiheadAttentionRelativePositionalEncoding(d_model, nhead, dropout=dropout, min_dist=min_dist,max_dist=max_dist)
        # Implementation of Feedforward model
        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        self.activation = _get_activation_fn(activation)
Example #26
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        self.activation = F.relu  #get_activation_fn(activation)
Example #27
    def __init__(self, n_layers, nclasses=5, model_dim=128, input_dim=3):
        super(cnn_attention_ocr, self).__init__()

        self.classes = nclasses + 1
        self.input_dim = input_dim
        self.n_layers = n_layers
        self.atn_blocks_0 = attention_block(19, model_dim)
        # Whenever we reduce the spatial size we are allowed to increase the channel dimension
        self.atn_blocks_1 = attention_block(model_dim, model_dim)
        self.atn_blocks_2 = attention_block(model_dim, model_dim)
        self.mp1 = MaxPool2d((2, 2))
        self.atn_blocks_3 = attention_block(model_dim,
                                            model_dim * 4,
                                            input_height=16)
        self.mp2 = MaxPool2d((2, 1))

        #For now we do 8 layers only
        if n_layers > 4:

            self.atn_blocks_4 = attention_block(model_dim * 4,
                                                model_dim * 8,
                                                input_height=16)

            atn_blocks = nn.ModuleList([
                attention_block(model_dim * 8, model_dim * 8, input_height=8)
                for i in range(n_layers - 5)
            ])
            self.layers = nn.Sequential(*atn_blocks)
        # Additional blocks beyond 8 layers
        if n_layers > 8:

            atn_blocks_16 = nn.ModuleList([
                attention_block(model_dim * 8, model_dim * 8, input_height=8)
                for i in range(n_layers - 8)
            ])
            self.layers_16 = nn.Sequential(*atn_blocks_16)

        self.conv1 = depthwise_separable_conv_bn(16, 16, 13, 6)

        self.reduce1 = nn.Conv2d(3, 16, kernel_size=1)
        self.reduce2 = nn.Conv2d(model_dim * 8, self.classes, kernel_size=1)

        self.drop1 = nn.Dropout2d(0.75)
        self.drop2 = nn.Dropout2d(0.5)

        self.ln_3 = LayerNorm(self.classes)
        self.ln_1 = LayerNorm(3).cuda()
        self.ln_4 = LayerNorm(16).cuda()
        self.soft_norm = nn.Softmax2d()
Example #28
    def __init__(self,
                 src_dim,
                 dest_dim,
                 edge_dim,
                 hidden_size,
                 nhead=4,
                 position_encoding=True):
        super().__init__()
        self.src_dim = src_dim
        self.dest_dim = dest_dim
        self.edge_dim = edge_dim
        self.hidden_size = hidden_size
        self.nhead = nhead
        src_layers = []
        src_layers.append(nn.Linear(src_dim + edge_dim, hidden_size))
        src_layers.append(GeLU())
        self.src_pre_layer = nn.Sequential(*src_layers)

        dest_layers = []
        dest_layers.append(nn.Linear(dest_dim, hidden_size))
        dest_layers.append(GeLU())
        self.dest_pre_layer = nn.Sequential(*dest_layers)

        self.att = MultiheadAttention(embed_dim=hidden_size, num_heads=nhead)
        self.att_dropout = Dropout(0.1)
        self.att_norm = LayerNorm(hidden_size)

        self.zero_padding_template = torch.zeros((1, src_dim),
                                                 dtype=torch.float)
Example #29
    def __init__(self, config):
        super(Transformer, self).__init__()
        self.config = config     

        self.input_dim = config["input_dim"]
        self.d_model = config["d_model"]
        self.nhead = config["nhead"]
        self.dim_feedforward = config["dim_feedforward"]
        self.num_layers = config["num_layers"]
        self.dropout_rate = config["dropout_rate"]
        self.activation = config["activation"]
        self.subconf = config["sub"]
        if self.subconf["type"] == "ConvV1":
            self.sub = Conv2dSubsample(self.input_dim, self.d_model) 
        elif self.subconf["type"] == "ConvV2":
            self.sub = Conv2dSubsampleV2(self.input_dim, self.d_model, self.subconf["layer_num"]) 
        elif self.subconf["type"] == "Stack":
            self.context_width = config["context_width"]
            self.subsample = config["subsample"]
            self.sub = Conv1dSubsample(self.input_dim, self.d_model, self.context_width, self.subsample)
        
        self.scale = self.d_model ** 0.5

        self.pe = modules.PositionalEncoding(self.d_model)
        self.dropout = nn.Dropout(self.dropout_rate)
        encoder_norm = LayerNorm(self.d_model)
        encoder_layer = transformer.TransformerEncoderLayer(d_model=self.d_model, 
                nhead=self.nhead, dim_feedforward=self.dim_feedforward, 
                dropout=self.dropout_rate, activation=self.activation)
        self.transformer_encoder = transformer.TransformerEncoder(encoder_layer, self.num_layers, encoder_norm)
Example #30
    def __init__(self, n_head, d_model, d_head, d_inner, dropout,use_gate,use_stable_version,
                 **kwargs):
        super(RelPartialLearnableDecoderLayer, self).__init__()

        self.use_gate = use_gate
        self.use_stable_version = use_stable_version
        self.gate_mha = GRUGate(d_model)
        self.gate_mlp = GRUGate(d_model)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)

        self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model,
                            d_head, dropout, **kwargs)
        self.pos_ff = PositionwiseFF(d_model, d_inner, dropout)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)