Code example #1
    def forward(self, x: Tensor, encoder_padding_mask: Optional[Tensor]):
        residual = x

        x, _ = self.self_attn(query=x,
                              key=x,
                              value=x,
                              mask_future_timesteps=False,
                              key_padding_mask=encoder_padding_mask,
                              incremental_state=None,
                              need_weights=False,
                              static_kv=False)
        if self.training:
            if self.platform == "npu":
                x, _, _ = torch.dropoutV2(x, self.seed, p=self.prob)
            elif self.platform == "gpu":
                x = self.dropout(x)
        x = residual + x
        x = self.ln1(x)

        residual = x
        x = F.threshold(self.fc1(x), 0.0, 0.0)  # threshold(x, 0, 0) == ReLU
        if self.training:
            if self.platform == "npu":
                x, _, _ = torch.dropoutV2(x, self.seed, p=self.relu_prob)
            elif self.platform == "gpu":
                x = self.relu_dropout(x)
        x = self.fc2(x)
        if self.training:
            if self.platform == "npu":
                x, _, _ = torch.dropoutV2(x, self.seed, p=self.relu_prob)
            elif self.platform == "gpu":
                x = self.relu_dropout(x)
        x = residual + x
        x = self.ln2(x)
        return x
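
A note on the pattern that repeats in every snippet on this page: during training, dropout is dispatched by platform. On Ascend NPU the code calls torch.dropoutV2 (available only in the NPU-adapted PyTorch build; it returns the output, the dropout mask, and the updated seed), while on GPU it uses a regular nn.Dropout module. A minimal sketch of that switch, assuming a plain `platform` string and using stock F.dropout for the GPU path; the dropoutV2 branch is guarded because it does not exist in stock PyTorch:

import torch
import torch.nn.functional as F

def platform_dropout(x: torch.Tensor, platform: str, p: float,
                     seed: int, training: bool) -> torch.Tensor:
    # Dropout is the identity at evaluation time, matching the
    # `if self.training:` guard in the snippets above.
    if not training:
        return x
    if platform == "npu" and hasattr(torch, "dropoutV2"):
        # Ascend-adapted PyTorch: returns (output, mask, new_seed).
        x, _, _ = torch.dropoutV2(x, seed, p=p)
        return x
    # Stock PyTorch path, equivalent to nn.Dropout(p) in training mode.
    return F.dropout(x, p=p, training=True)

x = torch.randn(4, 2, 8)                       # T x B x C toy tensor
out = platform_dropout(x, "gpu", p=0.1, seed=0, training=True)

Since both branches are skipped outside training, inference results do not depend on the platform-specific dropout kernel.
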
Code example #2
    def forward(self, src_tokens, src_lengths):
        x = self.embed_scale * self.embed_tokens(src_tokens)
        if self.embed_positions is not None:
            x += self.embed_positions(src_tokens)
        if self.training:
            if self.platform == "npu":
                x, _, _ = torch.dropoutV2(x, self.seed, p=self.prob)
            elif self.platform == "gpu":
                x = self.dropout(x)

        # B:batch size ; T: seq length ; C: embedding dim 512
        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # compute padding mask
        encoder_padding_mask = src_tokens.eq(self.padding_idx)
        if not encoder_padding_mask.any():
            _encoder_padding_mask = None
        else:
            _encoder_padding_mask = encoder_padding_mask

        # encoder layers
        for layer in self.layers:
            x = layer(x, _encoder_padding_mask)

        return x, encoder_padding_mask  # x.shape == T x B x C, encoder_padding_mask.shape == B x T
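
The encoder builds its key padding mask directly from the token IDs: positions equal to padding_idx become True and are later excluded from attention, and when no position is padded the layers receive None so the masking work can be skipped. A minimal, self-contained sketch with made-up token IDs and an assumed padding_idx of 1:

import torch

padding_idx = 1
src_tokens = torch.tensor([[5, 6, 7, 1, 1],
                           [8, 9, 2, 3, 4]])       # B x T, padded with 1s
encoder_padding_mask = src_tokens.eq(padding_idx)  # B x T bool, True at pads
mask_or_none = encoder_padding_mask if encoder_padding_mask.any() else None
print(encoder_padding_mask)
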
Code example #3
File: model.py  Project: luweizheng/nn-bench
    def forward(self, src):
        embedded = self.embedding(src)

        if self.training:
            if self.platform == "npu":
                embedded, _, _ = torch.dropoutV2(embedded,
                                                 self.seed,
                                                 p=self.prob)
            elif self.platform == "gpu":
                embedded = self.dropout(embedded)

        outputs, hidden = self.rnn(embedded)

        return hidden
Code example #4
    def forward(self,
                prev_output_tokens: Tensor,
                encoder_out: Tensor,
                encoder_padding_mask: Tensor,
                incremental_state: Optional[Dict[str, Dict[str,
                                                           Tensor]]] = None):
        positions = self.embed_positions(
            prev_output_tokens,
            incremental_state=incremental_state,
        ) if self.embed_positions is not None else None

        if incremental_state is not None:
            prev_output_tokens = prev_output_tokens[:, -1:]
            if positions is not None:
                positions = positions[:, -1:]

        # embed tokens and positions
        x = self.embed_scale * self.embed_tokens(prev_output_tokens)
        if positions is not None:
            x += positions
        if self.training:
            if self.platform == "npu":
                x, _, _ = torch.dropoutV2(x, self.seed, p=self.prob)
            elif self.platform == "gpu":
                x = self.dropout(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)
        attn = None

        # decoder layers
        for layer in self.layers:
            x, attn = layer(
                x,
                encoder_out,
                encoder_padding_mask if encoder_padding_mask.any() else None,
                incremental_state,
            )

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)
        x = F.linear(x, self.embed_out)

        return x, attn
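
When incremental_state is not None the decoder is doing step-by-step generation: only the newest target token is embedded, and the keys/values for earlier steps are reused from the cache kept inside the attention modules (see the prev_key / prev_value handling in code example #7 below). A minimal sketch of the slicing, with hypothetical token IDs:

import torch

prev_output_tokens = torch.tensor([[2, 14, 37, 9]])  # B x T, prefix generated so far
incremental_state = {}                               # non-None signals step-wise decoding
if incremental_state is not None:
    prev_output_tokens = prev_output_tokens[:, -1:]  # keep only the newest token
print(prev_output_tokens)                            # tensor([[9]])
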
Code example #5
File: model.py  Project: luweizheng/nn-bench
    def forward(self, input, hidden, context):
        input = input.unsqueeze(0)
        embedded = self.embedding(input)

        if self.training:
            if self.platform == "npu":
                embedded, _, _ = torch.dropoutV2(embedded,
                                                 self.seed,
                                                 p=self.prob)
            elif self.platform == "gpu":
                embedded = self.dropout(embedded)

        emb_con = torch.cat((embedded, context), dim=2)
        output, hidden = self.rnn(emb_con, hidden)
        output = torch.cat(
            (embedded.squeeze(0), hidden.squeeze(0), context.squeeze(0)),
            dim=1)
        prediction = self.fc_out(output)

        return prediction, hidden
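
This single-step decoder concatenates the embedded input token with a fixed context vector along the feature dimension before the RNN, then concatenates the embedding, the new hidden state, and the context again before the output projection. A minimal shape-only sketch with hypothetical sizes:

import torch

batch, emb_dim, hid_dim = 3, 8, 16
embedded = torch.randn(1, batch, emb_dim)   # one time step: 1 x B x E
context = torch.randn(1, batch, hid_dim)    # encoder summary: 1 x B x H
hidden = torch.randn(1, batch, hid_dim)     # current decoder state: 1 x B x H

emb_con = torch.cat((embedded, context), dim=2)               # 1 x B x (E + H)
output = torch.cat((embedded.squeeze(0), hidden.squeeze(0),
                    context.squeeze(0)), dim=1)               # B x (E + 2H)
print(emb_con.shape, output.shape)
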
Code example #6
    def forward(self, x: Tensor, encoder_out: Tensor,
                encoder_padding_mask: Optional[Tensor],
                incremental_state: Optional[Dict[str, Dict[str, Tensor]]]):
        residual = x

        x, _ = self.self_attn(query=x,
                              key=x,
                              value=x,
                              mask_future_timesteps=True,
                              key_padding_mask=None,
                              incremental_state=incremental_state,
                              need_weights=False,
                              static_kv=False)

        if self.training:
            if self.platform == "npu":
                x, _, _ = torch.dropoutV2(x, self.seed, p=self.prob)
            elif self.platform == "gpu":
                x = self.dropout(x)
        x = residual + x
        x = self.self_attn_layer_norm(x)

        attn = None
        if self.encoder_attn is not None:
            residual = x

            x, attn = self.encoder_attn(
                query=x,
                key=encoder_out,
                value=encoder_out,
                key_padding_mask=encoder_padding_mask,
                incremental_state=incremental_state,
                static_kv=True,
                mask_future_timesteps=False,
                need_weights=(not self.training and self.need_attn),
            )
            if self.training:
                if self.platform == "npu":
                    x, _, _ = torch.dropoutV2(x, self.seed, p=self.prob)
                elif self.platform == "gpu":
                    x = self.dropout(x)
            x = residual + x

            x = self.encoder_attn_layer_norm(x)

        residual = x
        x = F.threshold(self.fc1(x), 0.0, 0.0)
        if self.training:
            if self.platform == "npu":
                x, _, _ = torch.dropoutV2(x, self.seed, p=self.relu_prob)
            elif self.platform == "gpu":
                x = self.relu_dropout(x)
        x = self.fc2(x)
        if self.training:
            if self.platform == "npu":
                x, _, _ = torch.dropoutV2(x, self.seed, p=self.prob)
            elif self.platform == "gpu":
                x = self.dropout(x)
        x = residual + x
        x = self.layer_norm(x)
        return x, attn
Code example #7
    def forward(self, query: Tensor, key: Tensor, value: Tensor,
                mask_future_timesteps: bool,
                key_padding_mask: Optional[Tensor],
                incremental_state: Optional[Dict[str, Dict[str, Tensor]]],
                need_weights: bool, static_kv: bool):
        """Input shape: Time x Batch x Channel

        Self-attention can be implemented by passing in the same arguments for
        query, key and value. Future timesteps can be masked with the
        `mask_future_timesteps` argument. Padding elements can be excluded from
        the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
        batch x src_len, where padding elements are indicated by 1s.
        """
        qkv_same, kv_same = self._fast_same_check(query, key, value)

        tgt_len, bsz, embed_dim = query.size()
        assert embed_dim == self.embed_dim
        assert list(query.size()) == [tgt_len, bsz, embed_dim]
        assert key.size() == value.size()

        k = v = query.new_empty(0)
        if incremental_state is not None:
            saved_state = self._get_input_buffer(incremental_state)
        else:
            saved_state = None

        if qkv_same:
            q, k, v = self_attn_linears(query, self.q_proj_weight,
                                        self.k_proj_weight, self.v_proj_weight,
                                        self.scaling_cpu, self.scaling_device)
        elif kv_same:
            q = query_linear(query, self.q_proj_weight, self.scaling_cpu,
                             self.scaling_device)
            if not (saved_state is not None and 'prev_key' in saved_state
                    and static_kv):
                k, v = key_value_linears(key, self.k_proj_weight,
                                         self.v_proj_weight)
        else:
            q = torch.addmm(
                query.view(query.size(0) * query.size(1), query.size(2)),
                query.view(query.size(0) * query.size(1), query.size(2)),
                self.q_proj_weight,
                beta=0.0,
                alpha=self.scaling)
            if not (saved_state is not None and 'prev_key' in saved_state
                    and static_kv):
                k = F.linear(key, self.k_proj_weight, self.in_proj_bias_k)
                v = F.linear(value, self.v_proj_weight, self.in_proj_bias_v)

        if saved_state is not None:
            if 'prev_key' in saved_state:
                k = torch.cat((saved_state['prev_key'], k), dim=0)
            if 'prev_value' in saved_state:
                v = torch.cat((saved_state['prev_value'], v), dim=0)
            saved_state['prev_key'] = k
            saved_state['prev_value'] = v
            self._set_input_buffer(incremental_state, saved_state)

        src_len = k.size(0)

        if key_padding_mask is not None:
            assert key_padding_mask.size(0) == bsz
            assert key_padding_mask.size(1) == src_len

        q = q.contiguous().view(tgt_len, bsz * self.num_heads,
                                self.head_dim).clone().transpose(
                                    0, 1).contiguous()
        k = k.contiguous().view(src_len, bsz * self.num_heads,
                                self.head_dim).clone().transpose(
                                    0, 1).contiguous()
        v = v.contiguous().view(src_len, bsz * self.num_heads,
                                self.head_dim).clone().transpose(
                                    0, 1).contiguous()

        attn_weights = strided_bmm1(q, k.transpose(1, 2))

        assert list(
            attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

        # only apply future-timestep masking when processing the full sequence
        # (i.e. incremental_state is None)
        if mask_future_timesteps and incremental_state is None:
            assert query.size() == key.size(), \
                'mask_future_timesteps only applies to self-attention'
            attn_weights += self.buffered_mask(attn_weights).unsqueeze(0)
        if key_padding_mask is not None:
            # don't attend to padding symbols
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len,
                                             src_len)

            attn_weights = attn_weights.float().masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2),
                torch.finfo(torch.float32).min,
            ).type_as(attn_weights)

            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len,
                                             src_len)

        attn_weights = F.softmax(attn_weights, dim=-1)
        if self.training:
            if self.platform == "npu":
                attn_weights, _, _ = torch.dropoutV2(attn_weights,
                                                     self.seed,
                                                     p=self.prob)
            elif self.platform == "gpu":
                attn_weights = self.dropout(attn_weights)

        attn = strided_bmm2(attn_weights, v)
        assert list(
            attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
        attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)

        # linear
        attn = self.out_proj(attn)

        if need_weights:
            # average attention weights over heads
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len,
                                             src_len)
            attn_weights = attn_weights.sum(dim=1) / self.num_heads
        else:
            attn_weights = attn_weights.new_empty(
                0)  # can't be None because of TorchScript typing constraints

        return attn, attn_weights
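
The docstring's two masking mechanisms both act on the (bsz * num_heads) x tgt_len x src_len score tensor before the softmax: an upper-triangular mask when mask_future_timesteps is set, and the per-batch key_padding_mask broadcast over heads and query positions. A minimal sketch with plain PyTorch ops and made-up sizes, mirroring the masked_fill pattern above:

import torch
import torch.nn.functional as F

bsz, num_heads, tgt_len, src_len = 2, 4, 5, 5
scores = torch.randn(bsz * num_heads, tgt_len, src_len)

# mask_future_timesteps: position i may not attend to positions j > i
future_mask = torch.triu(torch.full((tgt_len, src_len), float('-inf')), diagonal=1)
scores = scores + future_mask.unsqueeze(0)

# key_padding_mask: True marks padded source positions (B x src_len)
key_padding_mask = torch.zeros(bsz, src_len, dtype=torch.bool)
key_padding_mask[0, -2:] = True                 # pretend sentence 0 has two pads
scores = scores.view(bsz, num_heads, tgt_len, src_len)
scores = scores.masked_fill(key_padding_mask.unsqueeze(1).unsqueeze(2),
                            torch.finfo(scores.dtype).min)
scores = scores.view(bsz * num_heads, tgt_len, src_len)

attn_weights = F.softmax(scores, dim=-1)        # padded / future positions get ~0 weight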