from typing import Optional

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


def __init__(self,
             d_model: int,
             q: int,
             v: int,
             h: int,
             attention_size: Optional[int] = None,
             window_size: Optional[int] = 168,
             padding: Optional[int] = 168 // 4,
             **kwargs):
    """Initialize the Multi Head Block."""
    super().__init__(d_model, q, v, h, attention_size, **kwargs)

    self._window_size = window_size
    self._padding = padding
    self._q = q
    self._v = v

    # Step size for the moving window
    self._step = self._window_size - 2 * self._padding

    # Score mask for decoder
    self._future_mask = nn.Parameter(
        torch.triu(torch.ones((self._window_size, self._window_size)), diagonal=1).bool(),
        requires_grad=False)

    if self._attention_size is not None:
        self._attention_mask = nn.Parameter(
            generate_local_map_mask(self._window_size, self._attention_size),
            requires_grad=False)
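# `generate_local_map_mask` is referenced above but not defined in this section.
# A minimal sketch, assuming it returns a boolean (size, size) mask that is True
# wherever two positions are more than `attention_size` steps apart (those scores
# are then masked to -inf before the softmax); the signature is inferred from the
# call sites in this section:
def generate_local_map_mask(size, attention_size, device='cpu'):
    """Sketch only: True where |i - j| > attention_size."""
    i, j = torch.meshgrid(torch.arange(size), torch.arange(size), indexing='ij')
    return ((i - j).abs() > attention_size).to(device)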
def __init__(self,
             d_model: int,
             q: int,
             v: int,
             h: int,
             attention_size: Optional[int] = None,
             chunk_size: Optional[int] = 168,
             **kwargs):
    """Initialize the Multi Head Block."""
    super().__init__(d_model, q, v, h, attention_size, **kwargs)

    self._chunk_size = chunk_size

    # Score mask for decoder
    self._future_mask = nn.Parameter(
        torch.triu(torch.ones((self._chunk_size, self._chunk_size)), diagonal=1).bool(),
        requires_grad=False)

    if self._attention_size is not None:
        self._attention_mask = nn.Parameter(
            generate_local_map_mask(self._chunk_size, self._attention_size),
            requires_grad=False)
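# For intuition: the future mask built above is strictly upper-triangular, so row
# i can only attend to positions j <= i. A standalone sanity check (the helper
# name `_demo_future_mask` and the size 4 are illustrative, not from the source):
def _demo_future_mask():
    mask = torch.triu(torch.ones((4, 4)), diagonal=1).bool()
    # tensor([[False,  True,  True,  True],
    #         [False, False,  True,  True],
    #         [False, False, False,  True],
    #         [False, False, False, False]])
    return mask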
def forward(self,
            query: torch.Tensor,
            key: torch.Tensor,
            value: torch.Tensor,
            mask: Optional[str] = None) -> torch.Tensor:
    """Propagate the input forward through the MHB.

    For each head we compute the query, key and value matrices, followed
    by the Scaled Dot-Product. The results are concatenated and returned
    with shape (batch_size, K, d_model).

    Parameters
    ----------
    query:
        Input tensor with shape (batch_size, K, d_model) used to compute queries.
    key:
        Input tensor with shape (batch_size, K, d_model) used to compute keys.
    value:
        Input tensor with shape (batch_size, K, d_model) used to compute values.
    mask:
        Mask to apply on scores before computing attention.
        One of ``'subsequent'``, None. Default is None.

    Returns
    -------
        Self attention tensor with shape (batch_size, K, d_model).
    """
    K = query.shape[1]  # seq_len

    ###############################################
    # Conv MHA
    # Conv1d expects (N, channels, length), so transpose before and after:
    # (batch_size, seq_len, d_model) -> (N, channels, length) -> (batch_size, seq_len, d_model)
    # self.conv1d is assumed to be defined by the enclosing class's __init__
    # (not shown in this section).
    query = self.conv1d(query.transpose(1, 2)).transpose(1, 2)
    key = self.conv1d(key.transpose(1, 2)).transpose(1, 2)
    ###############################################

    # Compute Q, K and V, concatenate heads on batch dimension
    queries = torch.cat(self._W_q(query).chunk(self._h, dim=-1), dim=0)
    keys = torch.cat(self._W_k(key).chunk(self._h, dim=-1), dim=0)
    values = torch.cat(self._W_v(value).chunk(self._h, dim=-1), dim=0)

    # Scaled Dot-Product; scores are scaled by sqrt(d_k), the per-head key
    # dimension (standard Transformer scaling, rather than the sequence length K)
    self._scores = torch.bmm(queries, keys.transpose(1, 2)) / np.sqrt(queries.shape[-1])

    # Compute local map mask
    if self._attention_size is not None:
        attention_mask = generate_local_map_mask(K, self._attention_size, self._scores.device)
        self._scores = self._scores.masked_fill(attention_mask, float('-inf'))

    # Compute future (causal) mask for the decoder
    if mask == "subsequent":
        future_mask = torch.triu(torch.ones((K, K)), diagonal=1).bool()
        future_mask = future_mask.to(self._scores.device)
        self._scores = self._scores.masked_fill(future_mask, float('-inf'))

    # Apply softmax
    self._scores = F.softmax(self._scores, dim=-1)

    attention = torch.bmm(self._scores, values)

    # Concatenate the heads
    attention_heads = torch.cat(attention.chunk(self._h, dim=0), dim=-1)

    # Apply linear transformation W^O
    self_attention = self._W_o(attention_heads)

    return self_attention
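# The chunk/cat pair in forward() folds the h heads into the batch dimension so
# a single torch.bmm covers every head at once, then unfolds them afterwards. A
# standalone round-trip check of that trick (the helper name and the shapes are
# illustrative, not from the source):
def _demo_head_folding():
    batch_size, K, h, q = 2, 8, 4, 16
    x = torch.randn(batch_size, K, h * q)  # all heads side by side, as after W_q

    # Fold: (batch_size, K, h*q) -> (h*batch_size, K, q)
    folded = torch.cat(x.chunk(h, dim=-1), dim=0)
    assert folded.shape == (h * batch_size, K, q)

    # Unfold: (h*batch_size, K, q) -> (batch_size, K, h*q), recovering the layout
    unfolded = torch.cat(folded.chunk(h, dim=0), dim=-1)
    assert torch.equal(x, unfolded)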