def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"):
    super(TransformerDecoderLayer, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    # Implementation of Feedforward model
    if activation == "glu":
        self.linear1 = Linear(d_model, 2 * dim_feedforward)
    else:
        self.linear1 = Linear(d_model, dim_feedforward)
    self.dropout = Dropout(dropout)
    self.linear2 = Linear(dim_feedforward, d_model)
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.norm3 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    self.dropout3 = Dropout(dropout)
    self.activation = _get_activation_fn(activation)
def __init__(self, d_model: int, num_heads, feedforward_dimension: int = 2048, dropout: float = 0.1):
    super(TransformerDecoderLayer, self).__init__()
    # Masked Multi-Head Self-Attention
    self.masked_self_attention = MultiheadAttention(d_model, num_heads, dropout=dropout)
    self.dropout_a1 = Dropout(dropout)
    # Normalization after Self-Attention
    self.norm1 = LayerNorm(d_model)
    # Encoder-Decoder Attention
    self.self_attention = MultiheadAttention(d_model, num_heads, dropout=dropout)
    self.dropout_a2 = Dropout(dropout)
    # Normalization after Attention
    self.norm2 = LayerNorm(d_model)
    # Position-Wise Feed Forward NN
    self.linear1 = Linear(d_model, feedforward_dimension)
    self.relu = ReLU()
    self.dropout1 = Dropout(dropout)
    self.linear2 = Linear(feedforward_dimension, d_model)
    self.dropout2 = Dropout(dropout)
    # Normalization after PW-FFNN
    self.norm3 = LayerNorm(d_model)
def __init__(self, d_model: int = 512, nhead: int = 8, dim_feedforward: int = 2048,
             dropout: float = 0.1, activation: str = "relu"):
    super(TransformerDecoderLayer, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout,
                                        add_bias_kv=True, add_zero_attn=True)
    self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout,
                                             add_bias_kv=True, add_zero_attn=True)
    self.linear1 = nn.Linear(d_model, dim_feedforward)
    self.dropout = nn.Dropout(dropout)
    self.linear2 = nn.Linear(dim_feedforward, d_model)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.norm3 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)
    self.dropout3 = nn.Dropout(dropout)
    self.activation = _get_activation_fn(activation)
def __init__(self, model_type: str = 'sep', input_size: int = 3500, d_model: int = 512,
             d_embedding: int = 256, n_head: int = 8, dim_feedforward: int = 2048,
             num_encoder_layer: int = 10, dropout: float = 0.3):
    super(Transformer, self).__init__()
    self.model_type = model_type
    self.dropout = nn.Dropout(dropout)
    if model_type == 'sep':
        n_classes = 1
    elif model_type == 'total':
        n_classes = 2
    else:
        raise NameError(f'{model_type} is not defined')
    # Image embedding part
    self.src_input_linear = nn.Embedding(input_size, d_embedding)
    self.src_input_norm = nn.LayerNorm(d_embedding, eps=1e-12)
    self.src_input_linear2 = nn.Linear(d_embedding, d_model)
    # Transformer Encoder part
    self_attn = MultiheadAttention(d_model, n_head, dropout=dropout)
    self.encoders = nn.ModuleList([
        TransformerEncoderLayer(d_model, self_attn, dim_feedforward, dropout=dropout)
        for i in range(num_encoder_layer)])
    # Transformer Encoder part 2: separate model type ('sep')
    self_attn2 = MultiheadAttention(d_model, n_head, dropout=dropout)
    self.encoders2 = nn.ModuleList([
        TransformerEncoderLayer(d_model, self_attn2, dim_feedforward, dropout=dropout)
        for i in range(num_encoder_layer)])
    # Target linear part (not averaging)
    self.trg_output_linear = nn.Linear(d_model, d_embedding)
    self.trg_output_norm = nn.LayerNorm(d_embedding, eps=1e-12)
    self.trg_output_linear2 = nn.Linear(d_embedding, n_classes)
    if model_type == 'sep':
        self.trg_output_linear_sep = nn.Linear(d_model, d_embedding)
        self.trg_output_norm_sep = nn.LayerNorm(d_embedding, eps=1e-12)
        self.trg_output_linear2_sep = nn.Linear(d_embedding, n_classes)
    # Initialization
    for p in self.parameters():
        if p.dim() > 1:
            nn.init.kaiming_uniform_(p)
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
    super(TransformerDecoderLayer, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    # Implementation of Feedforward model
    self.linear1 = Linear(d_model, dim_feedforward)
    self.dropout = Dropout(dropout)
    self.linear2 = Linear(dim_feedforward, d_model)
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.norm3 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    self.dropout3 = Dropout(dropout)
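
# A hedged sketch of the post-norm decoder forward pass these submodules imply
# (mirrors torch.nn.TransformerDecoderLayer; the original forward() is not shown,
# and ReLU is assumed since this variant has no configurable activation):
import torch.nn.functional as F

def decoder_layer_forward(layer, tgt, memory, tgt_mask=None, memory_mask=None):
    # Masked self-attention over the target sequence
    tgt2 = layer.self_attn(tgt, tgt, tgt, attn_mask=tgt_mask)[0]
    tgt = layer.norm1(tgt + layer.dropout1(tgt2))
    # Encoder-decoder (cross) attention over the encoder memory
    tgt2 = layer.multihead_attn(tgt, memory, memory, attn_mask=memory_mask)[0]
    tgt = layer.norm2(tgt + layer.dropout2(tgt2))
    # Position-wise feed-forward network
    tgt2 = layer.linear2(layer.dropout(F.relu(layer.linear1(tgt))))
    tgt = layer.norm3(tgt + layer.dropout3(tgt2))
    return tgt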
def __init__(self, src_dim, dest_dim, edge_dim, hidden_size, nhead=4, position_encoding=True):
    super().__init__()
    self.src_dim = src_dim
    self.dest_dim = dest_dim
    self.edge_dim = edge_dim
    self.hidden_size = hidden_size
    self.nhead = nhead
    src_layers = []
    src_layers.append(nn.Linear(src_dim + edge_dim, hidden_size))
    src_layers.append(GeLU())
    self.src_pre_layer = nn.Sequential(*src_layers)
    dest_layers = []
    dest_layers.append(nn.Linear(dest_dim, hidden_size))
    dest_layers.append(GeLU())
    self.dest_pre_layer = nn.Sequential(*dest_layers)
    self.att = MultiheadAttention(embed_dim=hidden_size, num_heads=nhead)
    self.att_dropout = Dropout(0.1)
    self.att_norm = LayerNorm(hidden_size)
    self.zero_padding_template = torch.zeros((1, src_dim), dtype=torch.float)
def __init__(self, num_features=22, nhead=3, dim_feedforward=2048, dropout=0.1,
             activation="relu", use_LayerNorm=True, init_resweight=0,
             resweight_trainable=True):
    super(ReZeroEncoderLayer, self).__init__()
    self.self_attn = MultiheadAttention(num_features, nhead, dropout=dropout)
    # Define the residual weight for ReZero
    self.resweight = torch.nn.Parameter(torch.Tensor([init_resweight]),
                                        requires_grad=resweight_trainable)
    # Implementation of Feedforward model
    self.linear1 = Linear(num_features, dim_feedforward)
    self.dropout = Dropout(dropout)
    self.linear2 = Linear(dim_feedforward, num_features)
    self.use_LayerNorm = use_LayerNorm
    if self.use_LayerNorm:
        self.norm1 = LayerNorm(num_features)
        self.norm2 = LayerNorm(num_features)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    if activation == "relu":
        self.activation = F.relu
    elif activation == "gelu":
        self.activation = F.gelu
    elif activation == "tanh":
        self.activation = torch.tanh
def __init__(self, pad_idx=0, bos_idx=1, eos_idx=2, max_len=300, d_model=512,
             d_embedding=256, n_head=8, dim_feedforward=2048, n_layers=10,
             dropout=0.1, device=None):
    super(littleBert, self).__init__()
    self.pad_idx = pad_idx
    self.bos_idx = bos_idx
    self.eos_idx = eos_idx
    self.max_len = max_len
    self.dropout = nn.Dropout(dropout)
    self.device = device
    # Source embedding part
    self.src_embedding = CustomEmbedding(d_embedding, d_model, device=self.device,
                                         pad_idx=self.pad_idx)
    # Transformer
    self_attn = MultiheadAttention(d_model, n_head, dropout=dropout)
    self.encoders = nn.ModuleList([
        TransformerEncoderLayer(d_model, self_attn, dim_feedforward,
                                activation='gelu', dropout=dropout)
        for i in range(n_layers)])
    # Output Linear Part
    self.src_output_linear = nn.Linear(d_model, d_embedding)
    self.src_output_concatlinear = nn.Linear(d_embedding + d_embedding, d_embedding)
    self.src_output_bilinear = nn.Bilinear(d_embedding, d_embedding, d_embedding)
    self.src_output_linear2 = nn.Linear(d_embedding, 1)
def __init__(self, d_model, nhead, dim_feedforward=256, dropout=0, activation="relu"):
    from torch.nn.modules.activation import MultiheadAttention
    from torch.nn.modules.normalization import LayerNorm
    from torch.nn.modules.dropout import Dropout
    from torch.nn.modules.rnn import LSTM
    from torch.nn.modules.linear import Linear
    super(DPTNetBlock, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    # Implementation of Feedforward model
    # self.linear1 = Linear(d_model, dim_feedforward)
    self.rnn = LSTM(d_model, d_model * 2, 1, bidirectional=True)
    self.dropout = Dropout(dropout)
    # self.linear2 = Linear(dim_feedforward, d_model)
    self.linear2 = Linear(d_model * 2 * 2, d_model)
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    self.activation = _get_activation_fn(activation)
def __init__(self, number_time_series: int, seq_len=10, output_seq_len=None,
             d_model=128, num_heads=8, dropout=0.1, output_dim=1, final_layer=False):
    super().__init__()
    self.dense_shape = torch.nn.Linear(number_time_series, d_model)
    self.pe = SimplePositionalEncoding(d_model)
    self.multi_attn = MultiheadAttention(embed_dim=d_model, num_heads=num_heads,
                                         dropout=dropout)
    self.final_layer = torch.nn.Linear(d_model, output_dim)
    self.length_data = seq_len
    self.forecast_length = output_seq_len
    self.sigmoid = None
    self.output_dim = output_dim
    if self.forecast_length:
        self.last_layer = torch.nn.Linear(seq_len, output_seq_len)
    if final_layer:
        self.sigmoid = activation_dict[final_layer]()
def _run_multihead(self, q, k, v, **kwargs):
    original_layer = MultiheadAttention(self.EMBED_SIZE, **kwargs)
    dp_layer = DPMultiheadAttention(self.EMBED_SIZE, **kwargs)
    dp_layer.load_state_dict(original_layer.state_dict())

    self._reset_seeds()
    original_y, original_attn_weights = original_layer(q, k, v)

    self._reset_seeds()
    dp_y, dp_attn_weights = dp_layer(q, k, v)

    self.assertTrue(torch.allclose(original_y, dp_y, atol=10e-4, rtol=10e-2))
    self.assertTrue(
        torch.allclose(original_attn_weights, dp_attn_weights, atol=10e-4, rtol=10e-2)
    )
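
# A minimal standalone sketch of the same parity check using only stock modules
# and hypothetical sizes: copying the state dict makes the two layers produce
# matching outputs for identical inputs.
import torch
from torch.nn import MultiheadAttention

embed_dim, num_heads, seq_len, batch = 16, 4, 5, 2  # hypothetical sizes
layer_a = MultiheadAttention(embed_dim, num_heads)
layer_b = MultiheadAttention(embed_dim, num_heads)
layer_b.load_state_dict(layer_a.state_dict())  # share weights so outputs should match

q = torch.randn(seq_len, batch, embed_dim)
k = torch.randn(seq_len, batch, embed_dim)
v = torch.randn(seq_len, batch, embed_dim)
out_a, weights_a = layer_a(q, k, v)
out_b, weights_b = layer_b(q, k, v)
assert torch.allclose(out_a, out_b, atol=1e-4, rtol=1e-2)
assert torch.allclose(weights_a, weights_b, atol=1e-4, rtol=1e-2)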
def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048, dropout=0.1) -> None:
    super(TransformerDecoderLayerCustom, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    self.linear1 = Linear(d_model, dim_feedforward)
    self.dropout = Dropout(dropout)
    self.linear2 = Linear(dim_feedforward, d_model)
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.norm3 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    self.dropout3 = Dropout(dropout)
def __init__(self, embed_dim, hidden_dim, num_embeddings, num_max_positions,
             num_heads, num_layers, dropout, causal):
    super().__init__()
    self.causal = causal
    self.tokens_embeddings = nn.Embedding(num_embeddings, embed_dim)
    self.position_embeddings = nn.Embedding(num_max_positions, embed_dim)
    self.dropout = nn.Dropout(dropout)

    self.attentions, self.feed_forwards = nn.ModuleList(), nn.ModuleList()
    self.layer_norms_1, self.layer_norms_2 = nn.ModuleList(), nn.ModuleList()
    for _ in range(num_layers):
        self.attentions.append(MultiheadAttention(embed_dim, num_heads, dropout=dropout))
        self.feed_forwards.append(nn.Sequential(nn.Linear(embed_dim, hidden_dim),
                                                nn.ReLU(),
                                                nn.Linear(hidden_dim, embed_dim)))
        self.layer_norms_1.append(nn.LayerNorm(embed_dim, eps=1e-12))
        self.layer_norms_2.append(nn.LayerNorm(embed_dim, eps=1e-12))
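
# A hedged sketch of one pre-norm block's forward pass implied by the lists above
# (the original forward() is not shown; the causal-mask construction is an assumption):
import torch

def block_forward(model, h, layer_idx, padding_mask=None):
    attn = model.attentions[layer_idx]
    feed_forward = model.feed_forwards[layer_idx]
    ln1, ln2 = model.layer_norms_1[layer_idx], model.layer_norms_2[layer_idx]
    attn_mask = None
    if model.causal:
        seq_len = h.size(0)
        attn_mask = torch.full((seq_len, seq_len), float("-inf"), device=h.device)
        attn_mask = torch.triu(attn_mask, diagonal=1)  # block attention to future positions
    x = ln1(h)
    a, _ = attn(x, x, x, attn_mask=attn_mask, key_padding_mask=padding_mask)
    h = h + model.dropout(a)
    x = ln2(h)
    h = h + model.dropout(feed_forward(x))
    return h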
def __init__(self, n_classes, d_model=512, d_embedding=256, n_head=8,
             dim_feedforward=2048, num_encoder_layer=10, num_decoder_layer=10,
             img_size=224, patch_size=16, dropout=0.3):
    super(Trans_GAN, self).__init__()
    self.dropout = nn.Dropout(dropout)
    # Image embedding part
    self.patch_embedding = PatchEmbedding(in_channels=3, patch_size=patch_size,
                                          d_model=d_model, d_embedding=d_embedding,
                                          img_size=img_size)
    # Transformer Encoder part
    self_attn = MultiheadAttention(d_model, n_head, dropout=dropout)
    self.encoders = nn.ModuleList([
        TransformerEncoderLayer(d_model, self_attn, dim_feedforward, dropout=dropout)
        for i in range(num_encoder_layer)])
def __init__(self, vocab_num, pad_idx=0, bos_idx=1, eos_idx=2, max_len=300,
             d_model=512, d_embedding=256, n_head=8, dim_feedforward=2048,
             dropout=0.1, embedding_dropout=0.1, n_layers=8, device=None):
    super(Transformer, self).__init__()
    self.pad_idx = pad_idx  # was commented out, but self.pad_idx is used below
    self.bos_idx = bos_idx
    self.eos_idx = eos_idx
    self.max_len = max_len
    self.dropout = nn.Dropout(dropout)

    self.transformer_embedding = TransformerEmbedding(
        vocab_num, d_model, d_embedding, pad_idx=self.pad_idx,
        max_len=self.max_len, embedding_dropout=embedding_dropout)

    # Output model
    self.output_linear = nn.Linear(d_model, d_embedding, bias=False)
    self.output_norm = nn.LayerNorm(d_embedding)
    self.output_linear2 = nn.Linear(d_embedding, 3, bias=True)

    # Transformer model
    self_attn = MultiheadAttention(d_model, n_head, dropout=dropout)
    self.encoders = nn.ModuleList([
        TransformerEncoderLayer(d_model, self_attn, dim_feedforward,
                                activation='gelu', dropout=dropout)
        for i in range(n_layers)])
def __init__( self, embed_dim, n_heads, dim_ff, dropout=0.0, activation="relu", norm="gLN", ): super(PreLNTransformerLayer, self).__init__() self.mha = MultiheadAttention(embed_dim, n_heads, dropout=dropout) self.dropout = nn.Dropout(dropout) self.linear1 = nn.Linear(embed_dim, dim_ff) self.linear2 = nn.Linear(dim_ff, embed_dim) self.activation = activations.get(activation)() self.norm_mha = norms.get(norm)(embed_dim) self.norm_ff = norms.get(norm)(embed_dim)
def __init__(self, d_model, nhead, bidirectional=True, dropout=0, activation="relu"):
    super(TransformerEncoderLayer, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    # Implementation of Feedforward model
    # self.linear1 = Linear(d_model, dim_feedforward)
    self.gru = GRU(d_model, d_model * 2, 1, bidirectional=bidirectional)
    self.dropout = Dropout(dropout)
    # self.linear2 = Linear(dim_feedforward, d_model)
    if bidirectional:
        self.linear2 = Linear(d_model * 2 * 2, d_model)
    else:
        self.linear2 = Linear(d_model * 2, d_model)
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    self.activation = _get_activation_fn(activation)
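
# A hedged sketch of the forward pass these submodules suggest: the feed-forward
# sublayer of a standard post-norm encoder layer is replaced by a (bi)directional
# GRU whose output is projected back to d_model (the original forward() is not shown):
def gru_encoder_layer_forward(layer, src, src_mask=None, src_key_padding_mask=None):
    src2 = layer.self_attn(src, src, src, attn_mask=src_mask,
                           key_padding_mask=src_key_padding_mask)[0]
    src = layer.norm1(src + layer.dropout1(src2))
    rnn_out, _ = layer.gru(src)  # (seq_len, batch, 4 * d_model) when bidirectional
    src2 = layer.linear2(layer.dropout(layer.activation(rnn_out)))
    src = layer.norm2(src + layer.dropout2(src2))
    return src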
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"): super(TransformerDecoderLayer, self).__init__() self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout) self.slot_attn = Hierarchical_Attention(d_model, cycles=1) # Implementation of Feedforward model self.linear1 = Linear(d_model, dim_feedforward) self.dropout = Dropout(dropout) self.linear2 = Linear(dim_feedforward, d_model) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) self.norm3 = LayerNorm(d_model) self.dropout1 = Dropout(dropout) self.dropout2 = Dropout(dropout) self.dropout3 = Dropout(dropout)
def __init__( self, embed_dim, n_heads, dim_ff, dropout=0.0, activation="relu", bidirectional=True, norm="gLN", ): super(ImprovedTransformedLayer, self).__init__() self.mha = MultiheadAttention(embed_dim, n_heads, dropout=dropout) self.recurrent = nn.LSTM(embed_dim, dim_ff, bidirectional=bidirectional) self.dropout = nn.Dropout(dropout) ff_inner_dim = 2 * dim_ff if bidirectional else dim_ff self.linear = nn.Linear(ff_inner_dim, embed_dim) self.activation = activations.get(activation)() self.norm_mha = norms.get(norm)(embed_dim) self.norm_ff = norms.get(norm)(embed_dim)
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", layer_norm_eps=1e-5): super(TransformerEncoderLayer, self).__init__() self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout) # Implementation of Feedforward model self.linear1 = Linear(d_model, dim_feedforward) self.dropout = Dropout(dropout) self.linear2 = Linear(dim_feedforward, d_model) self.norm1 = LayerNorm(d_model, eps=layer_norm_eps) self.norm2 = LayerNorm(d_model, eps=layer_norm_eps) self.dropout1 = Dropout(dropout) self.dropout2 = Dropout(dropout) self.activation = _get_activation_fn(activation)
def __init__(self, vocab_num, pad_idx=0, bos_idx=1, eos_idx=2, max_len=100,
             d_model=512, d_embedding=256, n_head=8, dim_feedforward=2048,
             n_layer=10, dropout=0.1):
    super(littleBert, self).__init__()
    self.pad_idx = pad_idx
    self.bos_idx = bos_idx
    self.eos_idx = eos_idx
    self.max_len = max_len
    self.dropout = nn.Dropout(dropout)

    # Source embedding part
    self.src_embedding = TransformerEmbedding(vocab_num, d_model, d_embedding,
                                              pad_idx=self.pad_idx, max_len=self.max_len)
    self.src_output_linear = nn.Linear(d_model, d_embedding)
    self.src_output_linear2 = nn.Linear(d_embedding, vocab_num)  # was src_vocab_num, which is not defined in this signature

    # Transformer
    self_attn = MultiheadAttention(d_model, n_head, dropout=dropout)
    self.encoders = nn.ModuleList([
        TransformerEncoderLayer(d_model, self_attn, dim_feedforward,
                                activation='gelu', dropout=dropout)
        for i in range(n_layer)])  # was num_encoder_layer, which is not defined in this signature
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation='relu'):
    super().__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    # Implementation of Feedforward model
    self.linear1 = Linear(d_model, dim_feedforward)
    self.dropout = Dropout(dropout)
    self.linear2 = Linear(dim_feedforward, d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    self.resweight = nn.Parameter(torch.Tensor([0]))
    if activation == "relu":
        self.activation = F.relu
    elif activation == "gelu":
        self.activation = F.gelu
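
# A hedged sketch of the ReZero residual update implied by `resweight`
# (the original forward() is not shown): each residual branch is scaled by the
# learned scalar, which starts at zero, and no LayerNorm is applied.
def rezero_layer_forward(layer, src, src_mask=None, src_key_padding_mask=None):
    src2 = layer.self_attn(src, src, src, attn_mask=src_mask,
                           key_padding_mask=src_key_padding_mask)[0]
    src = src + layer.resweight * layer.dropout1(src2)
    src2 = layer.linear2(layer.dropout(layer.activation(layer.linear1(src))))
    src = src + layer.resweight * layer.dropout2(src2)
    return src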
def __init__(self, d_model, nhead, hidden_size, dim_feedforward, dropout, activation="relu"):
    super(TransformerEncoderLayer, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    # Implementation of improved part
    self.lstm = LSTM(d_model, hidden_size, 1, bidirectional=True)
    self.dropout = Dropout(dropout)
    self.linear = Linear(hidden_size * 2, d_model)
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    self.activation = _get_activation_fn(activation)
def __init__(self, vocab_num, pad_idx=0, bos_idx=1, eos_idx=2, max_len=300,
             d_model=512, d_embedding=256, n_head=8, dim_feedforward=2048,
             n_layer=10, dropout=0.1):
    super().__init__()
    self.logger = logging.getLogger(__class__.__qualname__)
    self.pad_idx = pad_idx
    self.bos_idx = bos_idx
    self.eos_idx = eos_idx
    self.max_len = max_len
    self.dropout = nn.Dropout(dropout)

    # Source embedding part
    self.src_embedding = CustomEmbedding(vocab_num, d_embedding, d_model,
                                         pad_idx=self.pad_idx, max_len=self.max_len)
    self.src_output_linear = nn.Linear(d_model, d_embedding)
    self.src_output_linear2 = nn.Linear(d_embedding, vocab_num)

    # Transformer
    self_attn = MultiheadAttention(d_model, n_head, dropout=dropout)
    self.encoders = nn.ModuleList([
        TransformerEncoderLayer(d_model, self_attn, dim_feedforward,
                                activation='gelu', dropout=dropout)
        for i in range(n_layer)])
def __init__(self, n_classes, d_model=512, d_embedding=256, n_head=8,
             dim_feedforward=2048, num_encoder_layer=10, num_decoder_layer=10,
             img_size=224, patch_size=16, dropout=0.3):
    super(Vision_Transformer, self).__init__()
    self.dropout = nn.Dropout(dropout)
    # Image embedding part
    self.patch_embedding = PatchEmbedding(in_channels=3, patch_size=patch_size,
                                          d_model=d_model, d_embedding=d_embedding,
                                          img_size=img_size)
    # Transformer Encoder part
    self_attn = MultiheadAttention(d_model, n_head, dropout=dropout)
    self.encoders = nn.ModuleList([
        TransformerEncoderLayer(d_model, self_attn, dim_feedforward, dropout=dropout)
        for i in range(num_encoder_layer)])
    # Target linear part (not averaging)
    self.trg_output_linear = nn.Linear(d_model, d_embedding)
    self.trg_output_norm = nn.LayerNorm(d_embedding, eps=1e-12)
    self.trg_output_linear2 = nn.Linear(d_embedding, n_classes)
def __init__(self, d_model, nhead2, dim_feedforward=2, dropout=0.1,
             activation="relu", column_num=None):
    super(CAAN_Layer, self).__init__()
    self.d_model = d_model
    self.self_attn = MultiheadAttention(d_model, nhead2, dropout=dropout)
    self.period = 12
    self.column_num = column_num
    self.dim_feedforward = dim_feedforward
    self.num_hidden_node1 = self.d_model * self.period       # 1 year
    self.num_hidden_node2 = self.d_model * self.period // 2  # 6 months
    self.linear_w1 = Linear(self.d_model * self.period, self.num_hidden_node1)
    self.linear_w2 = Linear(self.d_model * self.period, self.num_hidden_node2)
    self.linear_w3 = Linear(self.num_hidden_node2, 1)
    self.tanh = torch.tanh
    self.relu = torch.relu
    self.dropout = Dropout(dropout)
def __init__(self, src_vocab_num, trg_vocab_num, pad_idx=0, bos_idx=1, eos_idx=2,
             d_model=512, d_embedding=256, n_head=8, dim_feedforward=2048,
             num_common_layer=10, num_encoder_layer=10, num_decoder_layer=10,
             src_max_len=100, trg_max_len=100, trg_emb_prj_weight_sharing=False,
             emb_src_trg_weight_sharing=True, dropout=0.1, embedding_dropout=0.1,
             parallel=False):
    super(Transformer, self).__init__()

    # Hyper-parameter setting
    self.pad_idx = pad_idx
    self.bos_idx = bos_idx
    self.eos_idx = eos_idx
    self.src_max_len = src_max_len
    self.trg_max_len = trg_max_len

    # Parallel Transformer setting
    self.parallel = parallel
    if self.parallel:
        # The original asserted num_encoder_layer == num_encoder_layer (a no-op);
        # the intended check is presumably that encoder and decoder depths match.
        assert num_encoder_layer == num_decoder_layer
        self.num_common_layer = num_common_layer
        self.num_encoder_nonparallel = num_encoder_layer - num_common_layer

    # Dropout setting
    self.dropout = nn.Dropout(dropout)

    # Source embedding part
    self.src_embedding = TransformerEmbedding(src_vocab_num, d_model, d_embedding,
                                              pad_idx=self.pad_idx, max_len=self.src_max_len,
                                              dropout=embedding_dropout)
    # Target embedding part
    self.trg_embedding = TransformerEmbedding(trg_vocab_num, d_model, d_embedding,
                                              pad_idx=self.pad_idx, max_len=self.trg_max_len,
                                              dropout=embedding_dropout)

    # Transformer Encoder part
    self_attn = MultiheadAttention(d_model, n_head, dropout=dropout)
    self.encoders = nn.ModuleList([
        TransformerEncoderLayer(d_model, self_attn, dim_feedforward, dropout=dropout)
        for i in range(num_encoder_layer)])

    # Transformer Decoder part
    self_attn = MultiheadAttention(d_model, n_head, dropout=dropout)
    decoder_mask_attn = MultiheadAttention(d_model, n_head, dropout=dropout)
    self.decoders = nn.ModuleList([
        TransformerDecoderLayer(d_model, self_attn, decoder_mask_attn,
                                dim_feedforward, dropout=dropout)
        for i in range(num_decoder_layer)])

    # Target linear part
    self.trg_output_linear = nn.Linear(d_model, d_embedding)
    self.trg_output_norm = nn.LayerNorm(d_embedding, eps=1e-12)
    self.trg_output_linear2 = nn.Linear(d_embedding, trg_vocab_num)

    # Weight sharing
    self.x_logit_scale = 1.
    if trg_emb_prj_weight_sharing:
        # Share the weight between target word embedding & last dense layer
        self.trg_output_linear2.weight = self.trg_embedding.token.weight
        self.x_logit_scale = d_model ** -0.5
    if emb_src_trg_weight_sharing:
        self.src_embedding.token.weight = self.trg_embedding.token.weight
def __init__(self, dim_model, h, prob_dropout):
    super(MultiHeadAttentionLayer, self).__init__()
    self.attention = MultiheadAttention(dim_model, h, prob_dropout)
# Neural network weight initialization methods
# Normal (Gaussian) Xavier initialization
w = torch.empty(3, 5)
nn.init.xavier_normal_(w, gain=nn.init.calculate_gain("relu"))
# Uniform Xavier initialization
# nn.init.xavier_uniform_(w)
# Constant initialization
# nn.init.constant_(w)

# Multi-head attention mechanism
from torch.nn.modules.activation import MultiheadAttention

query = torch.randn(11, 20, 40)
key = torch.randn(6, 20, 40)
value = torch.randn(6, 20, 40)
attn = MultiheadAttention(embed_dim=40, num_heads=4)
print(attn)
for param in attn.named_parameters():
    # print(param, param.size())
    print(param[0], "++", param[1].shape)

# Index selection
x_embedding = torch.randn(3, 4)
print(x_embedding)
indicies = torch.LongTensor([0, 2])
# Select data along the given dim using the indices
print(torch.index_select(x_embedding, 0, indicies))
print(torch.index_select(x_embedding, 1, indicies))

# PyTorch normalization layers: BatchNorm, LayerNorm, InstanceNorm, GroupNorm
# These layers only normalize, so the input and output dimensions are unchanged
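
# A minimal usage sketch following on from the snippet above: with the default
# input layout, query/key/value are (seq_len, batch, embed_dim), and the query
# length (11) may differ from the key/value length (6).
attn_output, attn_weights = attn(query, key, value)
print(attn_output.shape)   # torch.Size([11, 20, 40])
print(attn_weights.shape)  # torch.Size([20, 11, 6]), attention weights averaged over heads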
def __init__(self, d_model, n_heads, dropout):
    super().__init__()  # required before registering submodules on an nn.Module
    self.main_layer = MultiheadAttention(d_model, n_heads, dropout=dropout)