Example #1
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu"):
        super(TransformerDecoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = MultiheadAttention(d_model,
                                                 nhead,
                                                 dropout=dropout)
        # Implementation of Feedforward model
        if activation == "glu":
            self.linear1 = Linear(d_model, 2 * dim_feedforward)
        else:
            self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.norm3 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)
        self.dropout3 = Dropout(dropout)

        self.activation = _get_activation_fn(activation)
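For orientation, a constructor like the one above is usually paired with a post-norm forward pass: masked self-attention, encoder-decoder attention, then the feed-forward block, each followed by dropout, a residual connection, and LayerNorm. The sketch below is an assumption modeled on the stock PyTorch decoder layer, not the author's original code; attention masks are omitted and the plain (non-GLU) feed-forward branch is shown.

    def forward(self, tgt, memory):
        # Masked self-attention block
        tgt2 = self.self_attn(tgt, tgt, tgt)[0]
        tgt = self.norm1(tgt + self.dropout1(tgt2))
        # Encoder-decoder (cross) attention block
        tgt2 = self.multihead_attn(tgt, memory, memory)[0]
        tgt = self.norm2(tgt + self.dropout2(tgt2))
        # Position-wise feed-forward block
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = self.norm3(tgt + self.dropout3(tgt2))
        return tgt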
Example #2
    def __init__(self,
                 d_model: int,
                 num_heads,
                 feedforward_dimension: int = 2048,
                 dropout: float = 0.1):
        super(TransformerDecoderLayer, self).__init__()
        # Masked Multi-Head Self-Attention
        self.masked_self_attention = MultiheadAttention(d_model,
                                                        num_heads,
                                                        dropout=dropout)
        self.dropout_a1 = Dropout(dropout)

        # Normalization after Self-Attention
        self.norm1 = LayerNorm(d_model)

        # Encoder-Decoder Attention
        self.self_attention = MultiheadAttention(d_model,
                                                 num_heads,
                                                 dropout=dropout)
        self.dropout_a2 = Dropout(dropout)

        # Normalization after Attention
        self.norm2 = LayerNorm(d_model)

        # Position-Wise Feed Forward NN
        self.linear1 = Linear(d_model, feedforward_dimension)
        self.relu = ReLU()
        self.dropout1 = Dropout(dropout)
        self.linear2 = Linear(feedforward_dimension, d_model)
        self.dropout2 = Dropout(dropout)

        # Normalization after PW-FFNN
        self.norm3 = LayerNorm(d_model)
Example #3
    def __init__(self,
                 d_model: int = 512,
                 nhead: int = 8,
                 dim_feedforward: int = 2048,
                 dropout: float = 0.1,
                 activation: str = "relu"):

        super(TransformerDecoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model,
                                            nhead,
                                            dropout=dropout,
                                            add_bias_kv=True,
                                            add_zero_attn=True)
        self.multihead_attn = MultiheadAttention(d_model,
                                                 nhead,
                                                 dropout=dropout,
                                                 add_bias_kv=True,
                                                 add_zero_attn=True)

        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
Example #4
    def __init__(self,
                 model_type: str = 'sep',
                 input_size: int = 3500,
                 d_model: int = 512,
                 d_embedding: int = 256,
                 n_head: int = 8,
                 dim_feedforward: int = 2048,
                 num_encoder_layer: int = 10,
                 dropout: float = 0.3):

        super(Transformer, self).__init__()

        self.model_type = model_type
        self.dropout = nn.Dropout(dropout)
        if model_type == 'sep':
            n_classes = 1
        elif model_type == 'total':
            n_classes = 2
        else:
            raise NameError(f'{model_type} is not defined')

        # Image embedding part
        self.src_input_linear = nn.Embedding(input_size, d_embedding)
        self.src_input_norm = nn.LayerNorm(d_embedding, eps=1e-12)
        self.src_input_linear2 = nn.Linear(d_embedding, d_model)

        # Transformer Encoder part
        self_attn = MultiheadAttention(d_model, n_head, dropout=dropout)
        self.encoders = nn.ModuleList([
            TransformerEncoderLayer(d_model, self_attn, dim_feedforward, dropout=dropout) \
                for i in range(num_encoder_layer)])

        # Transformer Encoder part 2: Separate model type (sep)
        self_attn2 = MultiheadAttention(d_model, n_head, dropout=dropout)
        self.encoders2 = nn.ModuleList([
            TransformerEncoderLayer(d_model, self_attn2, dim_feedforward, dropout=dropout) \
                for i in range(num_encoder_layer)])

        # Target linear part (Not averaging)
        self.trg_output_linear = nn.Linear(d_model, d_embedding)
        self.trg_output_norm = nn.LayerNorm(d_embedding, eps=1e-12)
        self.trg_output_linear2 = nn.Linear(d_embedding, n_classes)

        if model_type == 'sep':
            self.trg_output_linear_sep = nn.Linear(d_model, d_embedding)
            self.trg_output_norm_sep = nn.LayerNorm(d_embedding, eps=1e-12)
            self.trg_output_linear2_sep = nn.Linear(d_embedding, n_classes)

        # Initialization
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.kaiming_uniform_(p)
Example #5
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super(TransformerDecoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.norm3 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)
        self.dropout3 = Dropout(dropout)
Example #6
    def __init__(self,
                 src_dim,
                 dest_dim,
                 edge_dim,
                 hidden_size,
                 nhead=4,
                 position_encoding=True):
        super().__init__()
        self.src_dim = src_dim
        self.dest_dim = dest_dim
        self.edge_dim = edge_dim
        self.hidden_size = hidden_size
        self.nhead = nhead
        src_layers = []
        src_layers.append(nn.Linear(src_dim + edge_dim, hidden_size))
        src_layers.append(GeLU())
        self.src_pre_layer = nn.Sequential(*src_layers)

        dest_layers = []
        dest_layers.append(nn.Linear(dest_dim, hidden_size))
        dest_layers.append(GeLU())
        self.dest_pre_layer = nn.Sequential(*dest_layers)

        self.att = MultiheadAttention(embed_dim=hidden_size, num_heads=nhead)
        self.att_dropout = Dropout(0.1)
        self.att_norm = LayerNorm(hidden_size)

        self.zero_padding_template = torch.zeros((1, src_dim),
                                                 dtype=torch.float)
Example #7
    def __init__(self, num_features=22, nhead=3, dim_feedforward=2048, dropout=0.1, activation="relu",
                 use_LayerNorm=True, init_resweight=0, resweight_trainable=True):
        super(ReZeroEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(num_features, nhead, dropout=dropout)

        # Define the Residual Weight for ReZero
        self.resweight = torch.nn.Parameter(torch.Tensor([init_resweight]), requires_grad=resweight_trainable)

        # Implementation of Feedforward model
        self.linear1 = Linear(num_features, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, num_features)
        self.use_LayerNorm = use_LayerNorm
        if self.use_LayerNorm:
            self.norm1 = LayerNorm(num_features)
            self.norm2 = LayerNorm(num_features)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        if activation == "relu":
            self.activation = F.relu
        elif activation == "gelu":
            self.activation = F.gelu
        elif activation == "tanh":
            self.activation = torch.tanh
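The piece that makes this a ReZero layer is the learned resweight scaling each sublayer output before the residual addition. A minimal forward sketch under that assumption (masks and the optional LayerNorm path omitted; this is not the author's original forward):

    def forward(self, src):
        # Self-attention sublayer, scaled by the learned ReZero weight
        src2 = self.self_attn(src, src, src)[0]
        src = src + self.resweight * self.dropout1(src2)
        # Feed-forward sublayer, scaled the same way
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.resweight * self.dropout2(src2)
        return src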
Example #8
    def __init__(self, pad_idx=0, bos_idx=1, eos_idx=2, max_len=300, d_model=512, d_embedding=256, n_head=8, 
                 dim_feedforward=2048, n_layers=10, dropout=0.1, device=None):
        super(littleBert, self).__init__()

        self.pad_idx = pad_idx
        self.bos_idx = bos_idx
        self.eos_idx = eos_idx
        self.max_len = max_len

        self.dropout = nn.Dropout(dropout)
        self.device = device

        # Source embedding part
        self.src_embedding = CustomEmbedding(d_embedding, d_model, device=self.device, pad_idx=self.pad_idx)

        # Transformer
        self_attn = MultiheadAttention(d_model, n_head, dropout=dropout)
        self.encoders = nn.ModuleList([
            TransformerEncoderLayer(d_model, self_attn, dim_feedforward,
                activation='gelu', dropout=dropout) for i in range(n_layers)])

        # Output Linear Part
        self.src_output_linear = nn.Linear(d_model, d_embedding)
        self.src_output_concatlinear = nn.Linear((d_embedding + d_embedding), d_embedding)
        self.src_output_bilinear = nn.Bilinear(d_embedding, d_embedding, d_embedding)
        self.src_output_linear2 = nn.Linear(d_embedding, 1)
Example #9
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=256,
                 dropout=0,
                 activation="relu"):

        from torch.nn.modules.activation import MultiheadAttention
        from torch.nn.modules.normalization import LayerNorm
        from torch.nn.modules.dropout import Dropout
        from torch.nn.modules.rnn import LSTM
        from torch.nn.modules.linear import Linear

        super(DPTNetBlock, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        # self.linear1 = Linear(d_model, dim_feedforward)
        self.rnn = LSTM(d_model, d_model * 2, 1, bidirectional=True)
        self.dropout = Dropout(dropout)
        # self.linear2 = Linear(dim_feedforward, d_model)
        self.linear2 = Linear(d_model * 2 * 2, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        self.activation = _get_activation_fn(activation)
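In this block the usual first feed-forward linear is replaced by a bidirectional LSTM whose (seq_len, batch, 4 * d_model) output is projected back to d_model by linear2. A rough forward sketch under that assumption (masks omitted; not the author's original code):

    def forward(self, src):
        # Self-attention sublayer
        src2 = self.self_attn(src, src, src)[0]
        src = self.norm1(src + self.dropout1(src2))
        # Recurrent "feed-forward" sublayer: bidirectional LSTM output has 2 * (2 * d_model) features
        src2 = self.linear2(self.dropout(self.activation(self.rnn(src)[0])))
        src = self.norm2(src + self.dropout2(src2))
        return src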
Example #10
    def __init__(self,
                 number_time_series: int,
                 seq_len=10,
                 output_seq_len=None,
                 d_model=128,
                 num_heads=8,
                 dropout=0.1,
                 output_dim=1,
                 final_layer=False):

        super().__init__()
        self.dense_shape = torch.nn.Linear(number_time_series, d_model)
        self.pe = SimplePositionalEncoding(d_model)
        self.multi_attn = MultiheadAttention(embed_dim=d_model,
                                             num_heads=num_heads,
                                             dropout=dropout)
        self.final_layer = torch.nn.Linear(d_model, output_dim)
        self.length_data = seq_len
        self.forecast_length = output_seq_len
        self.sigmoid = None
        self.output_dim = output_dim
        if self.forecast_length:
            self.last_layer = torch.nn.Linear(seq_len, output_seq_len)
        if final_layer:
            self.sigmoid = activation_dict[final_layer]()
Example #11
    def _run_multihead(self, q, k, v, **kwargs):
        original_layer = MultiheadAttention(self.EMBED_SIZE, **kwargs)
        dp_layer = DPMultiheadAttention(self.EMBED_SIZE, **kwargs)
        dp_layer.load_state_dict(original_layer.state_dict())

        self._reset_seeds()
        original_y, original_attn_weights = original_layer(q, k, v)

        self._reset_seeds()
        dp_y, dp_attn_weights = dp_layer(q, k, v)

        self.assertTrue(torch.allclose(original_y, dp_y, atol=10e-4, rtol=10e-2))
        self.assertTrue(
            torch.allclose(
                original_attn_weights, dp_attn_weights, atol=10e-4, rtol=10e-2
            )
        )
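A hypothetical call to this helper (not from the original test file): sequence-first tensors whose last dimension matches self.EMBED_SIZE, with the head count passed through **kwargs. The method name, batch size, sequence lengths, and num_heads below are illustrative assumptions, and self.EMBED_SIZE must be divisible by num_heads.

    def test_multihead_attention(self):
        # Illustrative shapes: (seq_len, batch, EMBED_SIZE)
        q = torch.randn(4, 2, self.EMBED_SIZE)
        k = torch.randn(6, 2, self.EMBED_SIZE)
        v = torch.randn(6, 2, self.EMBED_SIZE)
        self._run_multihead(q, k, v, num_heads=4)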
Example #12
    def __init__(self,
                 d_model: int,
                 nhead: int,
                 dim_feedforward: int = 2048,
                 dropout=0.1) -> None:
        super(TransformerDecoderLayerCustom, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = MultiheadAttention(d_model,
                                                 nhead,
                                                 dropout=dropout)

        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.norm3 = LayerNorm(d_model)

        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)
        self.dropout3 = Dropout(dropout)
Example #13
    def __init__(self, embed_dim, hidden_dim, num_embeddings, num_max_positions, num_heads, num_layers, dropout, causal):
        super().__init__()
        self.causal = causal
        self.tokens_embeddings = nn.Embedding(num_embeddings, embed_dim)
        self.position_embeddings = nn.Embedding(num_max_positions, embed_dim)
        self.dropout = nn.Dropout(dropout)

        self.attentions, self.feed_forwards = nn.ModuleList(), nn.ModuleList()
        self.layer_norms_1, self.layer_norms_2 = nn.ModuleList(), nn.ModuleList()
        for _ in range(num_layers):
            self.attentions.append(MultiheadAttention(embed_dim, num_heads, dropout=dropout))
            self.feed_forwards.append(nn.Sequential(nn.Linear(embed_dim, hidden_dim),
                                                    nn.ReLU(),
                                                    nn.Linear(hidden_dim, embed_dim)))
            self.layer_norms_1.append(nn.LayerNorm(embed_dim, eps=1e-12))
            self.layer_norms_2.append(nn.LayerNorm(embed_dim, eps=1e-12))
Example #14
    def __init__(self, n_classes, d_model=512, d_embedding=256, n_head=8, dim_feedforward=2048,
            num_encoder_layer=10, num_decoder_layer=10, img_size=224, patch_size=16,
            dropout=0.3):
    
        super(Trans_GAN, self).__init__()

        self.dropout = nn.Dropout(dropout)

        # Image embedding part
        self.patch_embedding = PatchEmbedding(in_channels=3, patch_size=patch_size,
            d_model=d_model, d_embedding=d_embedding, img_size=img_size)

        # Transformer Encoder part
        self_attn = MultiheadAttention(d_model, n_head, dropout=dropout)
        self.encoders = nn.ModuleList([
            TransformerEncoderLayer(d_model, self_attn, dim_feedforward, dropout=dropout) \
                for i in range(num_encoder_layer)])
Example #15
    def __init__(self,
                 vocab_num,
                 pad_idx=0,
                 bos_idx=1,
                 eos_idx=2,
                 max_len=300,
                 d_model=512,
                 d_embedding=256,
                 n_head=8,
                 dim_feedforward=2048,
                 dropout=0.1,
                 embedding_dropout=0.1,
                 n_layers=8,
                 device=None):

        super(Transformer, self).__init__()

        #
        self.pad_idx = pad_idx
        self.bos_idx = bos_idx
        self.eos_idx = eos_idx
        self.max_len = max_len
        self.dropout = nn.Dropout(dropout)

        self.transformer_embedding = TransformerEmbedding(
            vocab_num,
            d_model,
            d_embedding,
            pad_idx=self.pad_idx,
            max_len=self.max_len,
            embedding_dropout=embedding_dropout)

        # Output model
        self.output_linear = nn.Linear(d_model, d_embedding, bias=False)
        self.output_norm = nn.LayerNorm(d_embedding)
        self.output_linear2 = nn.Linear(d_embedding, 3, bias=True)

        # Transformer model
        self_attn = MultiheadAttention(d_model, n_head, dropout=dropout)
        self.encoders = nn.ModuleList([
            TransformerEncoderLayer(d_model,
                                    self_attn,
                                    dim_feedforward,
                                    activation='gelu',
                                    dropout=dropout) for i in range(n_layers)
        ])
Example #16
    def __init__(
        self,
        embed_dim,
        n_heads,
        dim_ff,
        dropout=0.0,
        activation="relu",
        norm="gLN",
    ):
        super(PreLNTransformerLayer, self).__init__()

        self.mha = MultiheadAttention(embed_dim, n_heads, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.linear1 = nn.Linear(embed_dim, dim_ff)
        self.linear2 = nn.Linear(dim_ff, embed_dim)
        self.activation = activations.get(activation)()
        self.norm_mha = norms.get(norm)(embed_dim)
        self.norm_ff = norms.get(norm)(embed_dim)
Example #17
    def __init__(self, d_model, nhead, bidirectional=True, dropout=0, activation="relu"):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        # self.linear1 = Linear(d_model, dim_feedforward)
        self.gru = GRU(d_model, d_model*2, 1, bidirectional=bidirectional)
        self.dropout = Dropout(dropout)
        # self.linear2 = Linear(dim_feedforward, d_model)
        if bidirectional:
            self.linear2 = Linear(d_model*2*2, d_model)
        else:
            self.linear2 = Linear(d_model*2, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        self.activation = _get_activation_fn(activation)
Example #18
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu"):
        super(TransformerDecoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        self.slot_attn = Hierarchical_Attention(d_model, cycles=1)
        # Implementation of Feedforward model
        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.norm3 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)
        self.dropout3 = Dropout(dropout)
Example #19
    def __init__(
        self,
        embed_dim,
        n_heads,
        dim_ff,
        dropout=0.0,
        activation="relu",
        bidirectional=True,
        norm="gLN",
    ):
        super(ImprovedTransformedLayer, self).__init__()

        self.mha = MultiheadAttention(embed_dim, n_heads, dropout=dropout)
        self.recurrent = nn.LSTM(embed_dim, dim_ff, bidirectional=bidirectional)
        self.dropout = nn.Dropout(dropout)
        ff_inner_dim = 2 * dim_ff if bidirectional else dim_ff
        self.linear = nn.Linear(ff_inner_dim, embed_dim)
        self.activation = activations.get(activation)()
        self.norm_mha = norms.get(norm)(embed_dim)
        self.norm_ff = norms.get(norm)(embed_dim)
Example #20
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 layer_norm_eps=1e-5):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model, eps=layer_norm_eps)
        self.norm2 = LayerNorm(d_model, eps=layer_norm_eps)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        self.activation = _get_activation_fn(activation)
Example #21
    def __init__(self,
                 vocab_num,
                 pad_idx=0,
                 bos_idx=1,
                 eos_idx=2,
                 max_len=100,
                 d_model=512,
                 d_embedding=256,
                 n_head=8,
                 dim_feedforward=2048,
                 n_layer=10,
                 dropout=0.1):
        super(littleBert, self).__init__()

        self.pad_idx = pad_idx
        self.bos_idx = bos_idx
        self.eos_idx = eos_idx
        self.max_len = max_len

        self.dropout = nn.Dropout(dropout)

        # Source embedding part
        self.src_embedding = TransformerEmbedding(vocab_num,
                                                  d_model,
                                                  d_embedding,
                                                  pad_idx=self.pad_idx,
                                                  max_len=self.max_len)

        self.src_output_linear = nn.Linear(d_model, d_embedding)
        self.src_output_linear2 = nn.Linear(d_embedding, vocab_num)

        # Transformer
        self_attn = MultiheadAttention(d_model, n_head, dropout=dropout)
        self.encoders = nn.ModuleList([
            TransformerEncoderLayer(d_model,
                                    self_attn,
                                    dim_feedforward,
                                    activation='gelu',
                                    dropout=dropout)
            for i in range(n_layer)
        ])
Example #22
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation='relu'):
        super().__init__()

        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)
        self.resweight = nn.Parameter(torch.Tensor([0]))

        if activation == "relu":
            self.activation = F.relu
        elif activation == "gelu":
            self.activation = F.gelu
Example #23
    def __init__(self,
                 d_model,
                 nhead,
                 hidden_size,
                 dim_feedforward,
                 dropout,
                 activation="relu"):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)

        # Implementation of improved part
        self.lstm = LSTM(d_model, hidden_size, 1, bidirectional=True)
        self.dropout = Dropout(dropout)
        self.linear = Linear(hidden_size * 2, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        self.activation = _get_activation_fn(activation)
Example #24
    def __init__(self, vocab_num, pad_idx=0, bos_idx=1, eos_idx=2, max_len=300, d_model=512, d_embedding=256, n_head=8, 
                 dim_feedforward=2048, n_layer=10, dropout=0.1):
        super().__init__()
        self.logger = logging.getLogger(__class__.__qualname__)
        self.pad_idx = pad_idx
        self.bos_idx = bos_idx
        self.eos_idx = eos_idx
        self.max_len = max_len

        self.dropout = nn.Dropout(dropout)

        # Source embedding part
        self.src_embedding = CustomEmbedding(vocab_num, d_embedding, d_model, pad_idx=self.pad_idx, max_len=self.max_len)

        self.src_output_linear = nn.Linear(d_model, d_embedding)
        self.src_output_linear2 = nn.Linear(d_embedding, vocab_num)

        # Transformer
        self_attn = MultiheadAttention(d_model, n_head, dropout=dropout)
        self.encoders = nn.ModuleList([
            TransformerEncoderLayer(d_model, self_attn, dim_feedforward,
                activation='gelu', dropout=dropout) for i in range(n_layer)])
Example #25
    def __init__(self, n_classes, d_model=512, d_embedding=256, n_head=8, dim_feedforward=2048,
            num_encoder_layer=10, num_decoder_layer=10, img_size=224, patch_size=16,
            dropout=0.3):
    
        super(Vision_Transformer, self).__init__()

        self.dropout = nn.Dropout(dropout)

        # Image embedding part
        self.patch_embedding = PatchEmbedding(in_channels=3, patch_size=patch_size,
            d_model=d_model, d_embedding=d_embedding, img_size=img_size)

        # Transformer Encoder part
        self_attn = MultiheadAttention(d_model, n_head, dropout=dropout)
        self.encoders = nn.ModuleList([
            TransformerEncoderLayer(d_model, self_attn, dim_feedforward, dropout=dropout) \
                for i in range(num_encoder_layer)])

        # Target linear part (Not averaging)
        self.trg_output_linear = nn.Linear(d_model, d_embedding)
        self.trg_output_norm = nn.LayerNorm(d_embedding, eps=1e-12)
        self.trg_output_linear2 = nn.Linear(d_embedding, n_classes)
Example #26
    def __init__(self,
                 d_model,
                 nhead2,
                 dim_feedforward=2,
                 dropout=0.1,
                 activation="relu",
                 column_num=None):
        super(CAAN_Layer, self).__init__()

        self.d_model = d_model
        self.self_attn = MultiheadAttention(d_model, nhead2, dropout=dropout)
        self.period = 12
        self.column_num = column_num
        self.dim_feedforward = dim_feedforward
        self.num_hidden_node1 = self.d_model * self.period  # 1 year
        self.num_hidden_node2 = self.d_model * self.period // 2  # 6 months
        self.linear_w1 = Linear(self.d_model * self.period,
                                self.num_hidden_node1)
        self.linear_w2 = Linear(self.d_model * self.period,
                                self.num_hidden_node2)
        self.linear_w3 = Linear(self.num_hidden_node2, 1)
        self.tanh = torch.tanh
        self.relu = torch.relu
        self.dropout = Dropout(dropout)
Example #27
    def __init__(self,
                 src_vocab_num,
                 trg_vocab_num,
                 pad_idx=0,
                 bos_idx=1,
                 eos_idx=2,
                 d_model=512,
                 d_embedding=256,
                 n_head=8,
                 dim_feedforward=2048,
                 num_common_layer=10,
                 num_encoder_layer=10,
                 num_decoder_layer=10,
                 src_max_len=100,
                 trg_max_len=100,
                 trg_emb_prj_weight_sharing=False,
                 emb_src_trg_weight_sharing=True,
                 dropout=0.1,
                 embedding_dropout=0.1,
                 parallel=False):

        super(Transformer, self).__init__()

        # Hyper-parameter setting
        self.pad_idx = pad_idx
        self.bos_idx = bos_idx
        self.eos_idx = eos_idx
        self.src_max_len = src_max_len
        self.trg_max_len = trg_max_len

        # Parallel Transformer setting
        self.parallel = parallel
        if self.parallel:
            assert num_encoder_layer == num_decoder_layer
            self.num_common_layer = num_common_layer
            self.num_encoder_nonparallel = num_encoder_layer - num_common_layer

        # Dropout setting
        self.dropout = nn.Dropout(dropout)

        # Source embedding part
        self.src_embedding = TransformerEmbedding(src_vocab_num,
                                                  d_model,
                                                  d_embedding,
                                                  pad_idx=self.pad_idx,
                                                  max_len=self.src_max_len,
                                                  dropout=embedding_dropout)

        # Target embedding part
        self.trg_embedding = TransformerEmbedding(trg_vocab_num,
                                                  d_model,
                                                  d_embedding,
                                                  pad_idx=self.pad_idx,
                                                  max_len=self.trg_max_len,
                                                  dropout=embedding_dropout)

        # Transformer Encoder part
        self_attn = MultiheadAttention(d_model, n_head, dropout=dropout)
        self.encoders = nn.ModuleList([
            TransformerEncoderLayer(d_model, self_attn, dim_feedforward, dropout=dropout) \
                for i in range(num_encoder_layer)])

        # Transformer Decoder part
        self_attn = MultiheadAttention(d_model, n_head, dropout=dropout)
        decoder_mask_attn = MultiheadAttention(d_model,
                                               n_head,
                                               dropout=dropout)
        self.decoders = nn.ModuleList([
            TransformerDecoderLayer(d_model,
                                    self_attn,
                                    decoder_mask_attn,
                                    dim_feedforward,
                                    dropout=dropout)
            for i in range(num_decoder_layer)
        ])

        # Target linear part
        self.trg_output_linear = nn.Linear(d_model, d_embedding)
        self.trg_output_norm = nn.LayerNorm(d_embedding, eps=1e-12)
        self.trg_output_linear2 = nn.Linear(d_embedding, trg_vocab_num)

        # Weight sharing
        self.x_logit_scale = 1.
        if trg_emb_prj_weight_sharing:
            # Share the weight between target word embedding & last dense layer
            self.trg_output_linear2.weight = self.trg_embedding.token.weight
            self.x_logit_scale = (d_model**-0.5)

        if emb_src_trg_weight_sharing:
            self.src_embedding.token.weight = self.trg_embedding.token.weight
Example #28
    def __init__(self, dim_model, h, prob_dropout):
        super(MultiHeadAttentionLayer, self).__init__()
        self.attention = MultiheadAttention(dim_model, h, dropout=prob_dropout)
Example #29
# Neural-network initialization methods
# Normal-distribution (Xavier normal) initialization
w = torch.empty(3, 5)
nn.init.xavier_normal_(w, gain=nn.init.calculate_gain("relu"))
# Uniform-distribution initialization
# nn.init.xavier_uniform_(w)
# Initialize to a constant
# nn.init.constant_(w, 0.)
# Multi-head attention mechanism
from torch.nn.modules.activation import MultiheadAttention

query = torch.randn(11, 20, 40)
key = torch.randn(6, 20, 40)
value = torch.randn(6, 20, 40)
attn = MultiheadAttention(embed_dim=40, num_heads=4)
print(attn)
for name, param in attn.named_parameters():
    print(name, "++", param.shape)
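Calling the module with the tensors defined above returns the attended output and the head-averaged attention weights. The call below is a sketch added for illustration (PyTorch's default sequence-first (seq, batch, embed) layout), not part of the original snippet:

attn_output, attn_weights = attn(query, key, value)
print(attn_output.shape)   # torch.Size([11, 20, 40])  (tgt_len, batch, embed_dim)
print(attn_weights.shape)  # torch.Size([20, 11, 6])   (batch, tgt_len, src_len), averaged over heads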

# 切片
x_embedding = torch.randn(3, 4)
print(x_embedding)
indicies = torch.LongTensor([0, 2])
# 进行切片,根据dim和indicies 获取相关数据
print(torch.index_select(x_embedding, 0, indicies))
print(torch.index_select(x_embedding, 1, indicies))

# PyTorch normalization layers: BatchNorm, LayerNorm, InstanceNorm, GroupNorm
# Normalization layers only normalize, so the input and output dimensions are unchanged
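A quick check of that shape-preservation point (a minimal sketch added here, not part of the original page; it reuses the torch/nn imports from above):

layer_norm = nn.LayerNorm(40)
x = torch.randn(6, 20, 40)
print(layer_norm(x).shape)  # torch.Size([6, 20, 40]) -- same shape as the input, normalized over the last dim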
Example #30
    def __init__(self, d_model, n_heads, dropout):
        # Initialize the base nn.Module before registering submodules
        super().__init__()
        self.main_layer = MultiheadAttention(d_model, n_heads, dropout=dropout)