Example #1
    def __init__(self, params):
        super(MultiHeadAttention, self).__init__()
        assert params.hidden_dim % params.n_head == 0
        self.attentions = nn.ModuleList([SelfAttention(params)
                                         for _ in range(params.n_head)])
        self.o_w = nn.Linear(params.hidden_dim, params.hidden_dim, bias=False)
        init_weight(self.o_w)
        self.dropout = nn.Dropout(params.dropout)
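Only the constructor is shown; a minimal sketch of the matching forward pass, assuming each SelfAttention module takes (query, key, value, mask) and returns a (batch, seq_len, attention_dim) tensor, could look like this:

    def forward(self, query, key, value, mask=None):
        # run every head independently and concatenate along the feature axis
        heads = [attention(query, key, value, mask) for attention in self.attentions]
        concatenated = torch.cat(heads, dim=-1)  # (batch, seq_len, hidden_dim)
        # project the concatenated heads back to hidden_dim and apply dropout
        return self.dropout(self.o_w(concatenated))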
Example #2
    def __init__(self, params):
        super(PositionWiseFeedForward, self).__init__()
        # nn.Conv1d expects input of shape (batch size, # of channels, length),
        # so the hidden dimension is treated as the channel axis
        """
        Mixes the self-attention outputs produced by each head of Multi-Head
        Attention evenly, without favoring any single head.
        The linear transformation is applied identically at every position, but
        each layer uses different parameters; this is expressed as two
        convolutions with kernel size 1.
        """
        self.conv1 = nn.Conv1d(params.hidden_dim,
                               params.feed_forward_dim,
                               kernel_size=1)
        self.conv2 = nn.Conv1d(params.feed_forward_dim,
                               params.hidden_dim,
                               kernel_size=1)
        init_weight(self.conv1)
        init_weight(self.conv2)
        self.dropout = nn.Dropout(params.dropout)
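Because nn.Conv1d operates over the channel axis, the forward pass has to move hidden_dim into that position. A minimal sketch, assuming the input is (batch, seq_len, hidden_dim) and that dropout follows the first activation (the exact placement is an assumption):

    def forward(self, x):
        # nn.Conv1d expects (batch, channels, length): move hidden_dim to the channel axis
        x = x.permute(0, 2, 1)
        x = self.dropout(torch.relu(self.conv1(x)))  # position-wise expansion + non-linearity
        x = self.conv2(x)                            # project back down to hidden_dim
        return x.permute(0, 2, 1)                    # restore (batch, seq_len, hidden_dim)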
Example #3
    def __init__(self, params):
        super(MultiHeadAttention, self).__init__()
        assert params.hidden_dim % params.n_head == 0
        """
        self.attentions : declares self-attention repeated n_head times
        """
        self.attentions = nn.ModuleList(
            [SelfAttention(params) for _ in range(params.n_head)])
        """
        self.o_w : declares and initializes the output weight matrix
        """
        self.o_w = nn.Linear(params.hidden_dim, params.hidden_dim, bias=False)
        init_weight(self.o_w)
        """
        self.dropout : declares Dropout
        """
        self.dropout = nn.Dropout(params.dropout)
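init_weight is called throughout these examples but never defined. A plausible minimal implementation, assuming Xavier-uniform initialization (the scheme actually used by the original code is not shown):

def init_weight(layer):
    # hypothetical helper: initialize a Linear/Conv1d layer's weights with
    # Xavier-uniform and zero its bias when one exists
    nn.init.xavier_uniform_(layer.weight)
    if layer.bias is not None:
        nn.init.zeros_(layer.bias)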
Example #4
    def __init__(self, params):
        super(SelfAttention, self).__init__()
        self.hidden_dim = params.hidden_dim
        self.attention_dim = params.hidden_dim // params.n_head

        self.q_w = nn.Linear(self.hidden_dim, self.attention_dim, bias=False)
        self.k_w = nn.Linear(self.hidden_dim, self.attention_dim, bias=False)
        self.v_w = nn.Linear(self.hidden_dim, self.attention_dim, bias=False)
        init_weight(self.q_w)
        init_weight(self.k_w)
        init_weight(self.v_w)

        self.dropout = nn.Dropout(params.dropout)
        self.scale_factor = torch.sqrt(torch.FloatTensor([self.attention_dim])).to(params.device)
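The constructor above sets up everything needed for scaled dot-product attention in a single head. A minimal sketch of the corresponding forward pass (the mask handling and whether the attention weights are also returned are assumptions):

    def forward(self, query, key, value, mask=None):
        # project the inputs into this head's attention space
        q, k, v = self.q_w(query), self.k_w(key), self.v_w(value)
        # scaled dot-product scores: (batch, query_len, key_len)
        scores = torch.bmm(q, k.transpose(1, 2)) / self.scale_factor
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e10)
        weights = self.dropout(torch.softmax(scores, dim=-1))
        # weighted sum of the values: (batch, query_len, attention_dim)
        return torch.bmm(weights, v)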