def forward(self, x: Tensor, encoder_padding_mask: Optional[Tensor]):
    # self-attention block
    residual = x
    x, _ = self.self_attn(query=x, key=x, value=x,
                          mask_future_timesteps=False,
                          key_padding_mask=encoder_padding_mask,
                          incremental_state=None,
                          need_weights=False,
                          static_kv=False)
    if self.training:
        if self.platform == "npu":
            x, _, _ = torch.dropoutV2(x, self.seed, p=self.prob)
        elif self.platform == "gpu":
            x = self.dropout(x)
    x = residual + x
    x = self.ln1(x)

    # position-wise feed-forward block
    residual = x
    x = F.threshold(self.fc1(x), 0.0, 0.0)  # ReLU
    if self.training:
        if self.platform == "npu":
            x, _, _ = torch.dropoutV2(x, self.seed, p=self.relu_prob)
        elif self.platform == "gpu":
            x = self.relu_dropout(x)
    x = self.fc2(x)
    if self.training:
        if self.platform == "npu":
            x, _, _ = torch.dropoutV2(x, self.seed, p=self.prob)
        elif self.platform == "gpu":
            x = self.dropout(x)
    x = residual + x
    x = self.ln2(x)
    return x
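# The platform-specific dropout branching above is repeated in every forward
# pass of these modules. A hypothetical helper (not part of the original code)
# could centralize it; this is a sketch assuming the same attributes used
# above (self.training, self.platform, self.seed) and the Ascend-specific
# torch.dropoutV2 call shown in the original snippets,
# e.g. x = self._platform_dropout(x, self.prob, self.dropout).
def _platform_dropout(self, x, p, dropout_module):
    """Apply dropout with the platform-appropriate kernel during training."""
    if not self.training:
        return x
    if self.platform == "npu":
        # fused NPU dropout; only the output tensor is needed here
        x, _, _ = torch.dropoutV2(x, self.seed, p=p)
    elif self.platform == "gpu":
        x = dropout_module(x)
    return x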
def forward(self, src_tokens, src_lengths):
    x = self.embed_scale * self.embed_tokens(src_tokens)
    if self.embed_positions is not None:
        x += self.embed_positions(src_tokens)
    if self.training:
        if self.platform == "npu":
            x, _, _ = torch.dropoutV2(x, self.seed, p=self.prob)
        elif self.platform == "gpu":
            x = self.dropout(x)

    # B: batch size; T: sequence length; C: embedding dim (512)
    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # compute padding mask
    encoder_padding_mask = src_tokens.eq(self.padding_idx)
    if not encoder_padding_mask.any():
        _encoder_padding_mask = None
    else:
        _encoder_padding_mask = encoder_padding_mask

    # encoder layers
    for layer in self.layers:
        x = layer(x, _encoder_padding_mask)

    return x, encoder_padding_mask  # x: T x B x C, encoder_padding_mask: B x T
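# Minimal usage sketch for the encoder forward above (hypothetical tensors and
# encoder instance; padding_idx assumed to be 1). Token ids equal to
# padding_idx become True entries in encoder_padding_mask.
src_tokens = torch.tensor([[4, 5, 6, 1],
                           [7, 8, 1, 1]])          # B x T
src_lengths = torch.tensor([3, 2])
encoder_out, encoder_padding_mask = encoder(src_tokens, src_lengths)
# encoder_out: T x B x C; encoder_padding_mask: B x T, True at padded positions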
def forward(self, src):
    embedded = self.embedding(src)
    if self.training:
        if self.platform == "npu":
            embedded, _, _ = torch.dropoutV2(embedded, self.seed, p=self.prob)
        elif self.platform == "gpu":
            embedded = self.dropout(embedded)
    outputs, hidden = self.rnn(embedded)
    return hidden
def forward(self, prev_output_tokens: Tensor, encoder_out: Tensor,
            encoder_padding_mask: Tensor,
            incremental_state: Optional[Dict[str, Dict[str, Tensor]]] = None):
    positions = self.embed_positions(
        prev_output_tokens,
        incremental_state=incremental_state,
    ) if self.embed_positions is not None else None

    if incremental_state is not None:
        prev_output_tokens = prev_output_tokens[:, -1:]
        if positions is not None:
            positions = positions[:, -1:]

    # embed tokens and positions
    x = self.embed_scale * self.embed_tokens(prev_output_tokens)
    if positions is not None:
        x += positions
    if self.training:
        if self.platform == "npu":
            x, _, _ = torch.dropoutV2(x, self.seed, p=self.prob)
        elif self.platform == "gpu":
            x = self.dropout(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)
    attn = None

    # decoder layers
    for layer in self.layers:
        x, attn = layer(
            x,
            encoder_out,
            encoder_padding_mask if encoder_padding_mask.any() else None,
            incremental_state,
        )

    # T x B x C -> B x T x C
    x = x.transpose(0, 1)
    x = F.linear(x, self.embed_out)
    return x, attn
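# Greedy incremental decoding sketch (hypothetical decoder instance, start
# token, and limits; encoder_out / encoder_padding_mask come from the encoder
# forward above). Because the decoder slices prev_output_tokens[:, -1:] itself
# when incremental_state is given, the full prefix can be passed each step
# while cached keys/values are reused.
incremental_state = {}
tokens = torch.full((bsz, 1), bos_idx, dtype=torch.long)   # bsz, bos_idx assumed
for _ in range(max_len):                                   # max_len assumed
    logits, _ = decoder(tokens, encoder_out, encoder_padding_mask,
                        incremental_state)
    next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
    tokens = torch.cat([tokens, next_token], dim=1)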
def forward(self, input, hidden, context):
    input = input.unsqueeze(0)
    embedded = self.embedding(input)
    if self.training:
        if self.platform == "npu":
            embedded, _, _ = torch.dropoutV2(embedded, self.seed, p=self.prob)
        elif self.platform == "gpu":
            embedded = self.dropout(embedded)
    emb_con = torch.cat((embedded, context), dim=2)
    output, hidden = self.rnn(emb_con, hidden)
    output = torch.cat((embedded.squeeze(0), hidden.squeeze(0),
                        context.squeeze(0)), dim=1)
    prediction = self.fc_out(output)
    return prediction, hidden
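# Single-step usage sketch for the recurrent encoder/decoder pair above
# (hypothetical instances and tensors; src and trg assumed to be T x B token
# ids). The encoder's final hidden state serves both as the initial decoder
# hidden state and as the fixed context vector concatenated to every input.
context = encoder(src)                 # 1 x B x hidden_dim
hidden = context
input_token = trg[0, :]                # B
prediction, hidden = decoder(input_token, hidden, context)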
def forward(self, x: Tensor, encoder_out: Tensor,
            encoder_padding_mask: Optional[Tensor],
            incremental_state: Optional[Dict[str, Dict[str, Tensor]]]):
    # self-attention block
    residual = x
    x, _ = self.self_attn(query=x, key=x, value=x,
                          mask_future_timesteps=True,
                          key_padding_mask=None,
                          incremental_state=incremental_state,
                          need_weights=False,
                          static_kv=False)
    if self.training:
        if self.platform == "npu":
            x, _, _ = torch.dropoutV2(x, self.seed, p=self.prob)
        elif self.platform == "gpu":
            x = self.dropout(x)
    x = residual + x
    x = self.self_attn_layer_norm(x)

    # encoder-decoder attention block
    attn = None
    if self.encoder_attn is not None:
        residual = x
        x, attn = self.encoder_attn(
            query=x,
            key=encoder_out,
            value=encoder_out,
            key_padding_mask=encoder_padding_mask,
            incremental_state=incremental_state,
            static_kv=True,
            mask_future_timesteps=False,
            need_weights=(not self.training and self.need_attn),
        )
        if self.training:
            if self.platform == "npu":
                x, _, _ = torch.dropoutV2(x, self.seed, p=self.prob)
            elif self.platform == "gpu":
                x = self.dropout(x)
        x = residual + x
        x = self.encoder_attn_layer_norm(x)

    # position-wise feed-forward block
    residual = x
    x = F.threshold(self.fc1(x), 0.0, 0.0)  # ReLU
    if self.training:
        if self.platform == "npu":
            x, _, _ = torch.dropoutV2(x, self.seed, p=self.relu_prob)
        elif self.platform == "gpu":
            x = self.relu_dropout(x)
    x = self.fc2(x)
    if self.training:
        if self.platform == "npu":
            x, _, _ = torch.dropoutV2(x, self.seed, p=self.prob)
        elif self.platform == "gpu":
            x = self.dropout(x)
    x = residual + x
    x = self.layer_norm(x)
    return x, attn
def forward(self, query: Tensor, key: Tensor, value: Tensor,
            mask_future_timesteps: bool,
            key_padding_mask: Optional[Tensor],
            incremental_state: Optional[Dict[str, Dict[str, Tensor]]],
            need_weights: bool, static_kv: bool):
    """Input shape: Time x Batch x Channel

    Self-attention can be implemented by passing in the same arguments for
    query, key and value. Future timesteps can be masked with the
    `mask_future_timesteps` argument. Padding elements can be excluded from
    the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
    batch x src_len, where padding elements are indicated by 1s.
    """
    qkv_same, kv_same = self._fast_same_check(query, key, value)

    tgt_len, bsz, embed_dim = query.size()
    assert embed_dim == self.embed_dim
    assert list(query.size()) == [tgt_len, bsz, embed_dim]
    assert key.size() == value.size()

    k = v = query.new_empty(0)
    if incremental_state is not None:
        saved_state = self._get_input_buffer(incremental_state)
    else:
        saved_state = None

    # project query/key/value, reusing cached keys/values for static kv
    if qkv_same:
        q, k, v = self_attn_linears(query, self.q_proj_weight,
                                    self.k_proj_weight, self.v_proj_weight,
                                    self.scaling_cpu, self.scaling_device)
    elif kv_same:
        q = query_linear(query, self.q_proj_weight, self.scaling_cpu,
                         self.scaling_device)
        if not (saved_state is not None and 'prev_key' in saved_state and static_kv):
            k, v = key_value_linears(key, self.k_proj_weight, self.v_proj_weight)
    else:
        q = torch.addmm(query.view(query.size(0) * query.size(1), query.size(2)),
                        query.view(query.size(0) * query.size(1), query.size(2)),
                        self.q_proj_weight,
                        beta=0.0, alpha=self.scaling)
        if not (saved_state is not None and 'prev_key' in saved_state and static_kv):
            k = F.linear(key, self.k_proj_weight, self.in_proj_bias_k)
            v = F.linear(value, self.v_proj_weight, self.in_proj_bias_v)

    if saved_state is not None:
        if 'prev_key' in saved_state:
            k = torch.cat((saved_state['prev_key'], k), dim=0)
        if 'prev_value' in saved_state:
            v = torch.cat((saved_state['prev_value'], v), dim=0)
        saved_state['prev_key'] = k
        saved_state['prev_value'] = v
        self._set_input_buffer(incremental_state, saved_state)

    src_len = k.size(0)

    if key_padding_mask is not None:
        assert key_padding_mask.size(0) == bsz
        assert key_padding_mask.size(1) == src_len

    # split heads: T x B x C -> (B * num_heads) x T x head_dim
    q = q.contiguous().view(tgt_len, bsz * self.num_heads,
                            self.head_dim).clone().transpose(0, 1).contiguous()
    k = k.contiguous().view(src_len, bsz * self.num_heads,
                            self.head_dim).clone().transpose(0, 1).contiguous()
    v = v.contiguous().view(src_len, bsz * self.num_heads,
                            self.head_dim).clone().transpose(0, 1).contiguous()

    attn_weights = strided_bmm1(q, k.transpose(1, 2))
    assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

    # only apply masking at training time (when incremental state is None)
    if mask_future_timesteps and incremental_state is None:
        assert query.size() == key.size(), \
            'mask_future_timesteps only applies to self-attention'
        attn_weights += self.buffered_mask(attn_weights).unsqueeze(0)
    if key_padding_mask is not None:
        # don't attend to padding symbols
        attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
        attn_weights = attn_weights.float().masked_fill(
            key_padding_mask.unsqueeze(1).unsqueeze(2),
            torch.finfo(torch.float32).min,
        ).type_as(attn_weights)
        attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
    attn_weights = F.softmax(attn_weights, dim=-1)
    if self.training:
        if self.platform == "npu":
            attn_weights, _, _ = torch.dropoutV2(attn_weights, self.seed, p=self.prob)
        elif self.platform == "gpu":
            attn_weights = self.dropout(attn_weights)

    attn = strided_bmm2(attn_weights, v)
    assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
    attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
    # output projection
    attn = self.out_proj(attn)

    if need_weights:
        # average attention weights over heads
        attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
        attn_weights = attn_weights.sum(dim=1) / self.num_heads
    else:
        # can't be None because of TorchScript typing
        attn_weights = attn_weights.new_empty(0)

    return attn, attn_weights
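# Usage sketch for the attention forward above (hypothetical module instance
# and shapes). Inputs follow the documented Time x Batch x Channel layout and
# key_padding_mask is B x src_len with True/1 at padded source positions.
tgt_len, src_len, bsz, embed_dim = 5, 7, 2, 512
query = torch.randn(tgt_len, bsz, embed_dim)
key = value = torch.randn(src_len, bsz, embed_dim)
key_padding_mask = torch.zeros(bsz, src_len, dtype=torch.bool)
key_padding_mask[:, -2:] = True        # last two source positions are padding
attn_out, attn_weights = attention(
    query, key, value,
    mask_future_timesteps=False,
    key_padding_mask=key_padding_mask,
    incremental_state=None,
    need_weights=True,
    static_kv=False,
)
# attn_out: tgt_len x bsz x embed_dim
# attn_weights: bsz x tgt_len x src_len (averaged over heads when need_weights)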