Example #1
    def test_convert_padding_direction(self):
        t1 = (torch.tensor([
            [4.5, 2.3, 1.2, 0.0],
            [6.7, 9.8, 0.0, 0.0],
            [7.7, 5.4, 6.2, 8.0],
            [1.5, 0.0, 0.0, 0.0],
        ]).unsqueeze(-1).expand(-1, -1, 10))
        t2 = (torch.tensor([
            [0.0, 4.5, 2.3, 1.2],
            [0.0, 0.0, 6.7, 9.8],
            [7.7, 5.4, 6.2, 8.0],
            [0.0, 0.0, 0.0, 1.5],
        ]).unsqueeze(-1).expand(-1, -1, 10))
        seq_len = torch.tensor([3, 2, 4, 1]).int()

        t1_to_t2 = utils.convert_padding_direction(
            t1,
            seq_len,
            right_to_left=True,
        )
        self.assertTensorEqual(t1_to_t2, t2)

        t2_to_t1 = utils.convert_padding_direction(
            t2,
            seq_len,
            left_to_right=True,
        )
        self.assertTensorEqual(t2_to_t1, t1)
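
The test above establishes the contract: given per-sequence lengths, `convert_padding_direction` moves each row's padding from one side to the other while keeping the valid frames in order. Below is a minimal sketch of that behaviour, inferred from the test; `convert_padding_direction_sketch` is a hypothetical helper, and the real `utils`/`speech_utils` implementation may differ in details such as early returns when no conversion is needed.

    import torch

    def convert_padding_direction_sketch(x, seq_len, right_to_left=False, left_to_right=False):
        # x: B x T (x ...), seq_len: B. Rotate each row by its pad count so the
        # pad frames wrap around to the opposite side; valid frames keep their order.
        assert right_to_left ^ left_to_right
        bsz, max_len = x.size(0), x.size(1)
        steps = torch.arange(max_len, device=x.device).unsqueeze(0).expand(bsz, max_len)
        num_pads = (max_len - seq_len.to(x.device).long()).unsqueeze(1)
        if right_to_left:
            index = torch.remainder(steps - num_pads, max_len)  # trailing pads -> front
        else:
            index = torch.remainder(steps + num_pads, max_len)  # leading pads -> back
        for _ in range(x.dim() - 2):
            index = index.unsqueeze(-1)
        return x.gather(1, index.expand_as(x))

    # e.g. convert_padding_direction_sketch(t1, seq_len, right_to_left=True) reproduces t2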
Example #2
    def forward(self, src_tokens, src_lengths: Tensor, **unused):
        if self.left_pad:
            # nn.utils.rnn.pack_padded_sequence requires right-padding;
            # convert left-padding to right-padding
            src_tokens = speech_utils.convert_padding_direction(
                src_tokens,
                src_lengths,
                left_to_right=True,
            )

        if self.conv_layers_before is not None:
            x, src_lengths, padding_mask = self.conv_layers_before(
                src_tokens, src_lengths)
        else:
            x, padding_mask = src_tokens, \
                ~speech_utils.sequence_mask(src_lengths, src_tokens.size(1))

        bsz, seqlen = x.size(0), x.size(1)

        x = F.dropout(x, p=self.dropout_in, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        state_size = (2 if self.bidirectional else 1), bsz, self.hidden_size
        h0, c0 = x.new_zeros(*state_size), x.new_zeros(*state_size)

        for i in range(len(self.lstm)):
            if self.residual and i > 0:  # residual connection starts from the 2nd layer
                prev_x = x
            # pack embedded source tokens into a PackedSequence
            # lengths must be a CPU tensor for pack_padded_sequence
            packed_x = nn.utils.rnn.pack_padded_sequence(x, src_lengths.cpu())

            # apply LSTM
            packed_outs, (_, _) = self.lstm[i](packed_x, (h0, c0))

            # unpack outputs and apply dropout
            x, _ = nn.utils.rnn.pad_packed_sequence(
                packed_outs, padding_value=self.padding_value * 1.0)
            if i < len(self.lstm) - 1:  # not applying dropout for the last layer
                x = F.dropout(x, p=self.dropout_out, training=self.training)
            x = x + prev_x if self.residual and i > 0 else x
        assert list(x.size()) == [seqlen, bsz, self.output_units]

        encoder_padding_mask = padding_mask.t()

        return EncoderOut(
            encoder_out=x,  # T x B x C
            encoder_padding_mask=encoder_padding_mask
            if encoder_padding_mask.any() else None,  # T x B
            encoder_embedding=None,
            encoder_states=None,
            src_tokens=None,
            src_lengths=src_lengths,  # B
        )
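
In the `else` branch above, the padding mask is obtained by negating `speech_utils.sequence_mask(src_lengths, src_tokens.size(1))`, so the library's mask is True at valid time steps and the negation marks padded ones. A minimal sketch of such a mask (hypothetical helper, not the library's exact code):

    import torch

    def sequence_mask_sketch(lengths, max_len=None):
        # lengths: B. Returns a B x max_len bool mask, True where t < lengths[b].
        if max_len is None:
            max_len = int(lengths.max())
        steps = torch.arange(max_len, device=lengths.device)
        return steps.unsqueeze(0) < lengths.unsqueeze(1)

    lengths = torch.tensor([3, 2, 4, 1])
    padding_mask = ~sequence_mask_sketch(lengths, max_len=4)  # True at padded positions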
Example #3
    def forward(
        self,
        src_tokens: Tensor,
        src_lengths: Tensor,
        enforce_sorted: bool = True,
        **unused,
    ):
        """
        Args:
            src_tokens (LongTensor): tokens in the source language of
                shape `(batch, src_len)`
            src_lengths (LongTensor): lengths of each source sentence of
                shape `(batch)`
            enforce_sorted (bool, optional): if True, `src_tokens` is
                expected to contain sequences sorted by length in a
                decreasing order. If False, this condition is not
                required. Default: True.
        """
        if self.left_pad:
            # nn.utils.rnn.pack_padded_sequence requires right-padding;
            # convert left-padding to right-padding
            src_tokens = speech_utils.convert_padding_direction(
                src_tokens,
                src_lengths,
                left_to_right=True,
            )

        if self.pre_encoder is not None:
            x, src_lengths, padding_mask = self.pre_encoder(src_tokens, src_lengths)
        else:
            x, padding_mask = (
                src_tokens,
                ~speech_utils.sequence_mask(src_lengths, src_tokens.size(1)),
            )

        bsz, seqlen = x.size(0), x.size(1)

        x = self.dropout_in_module(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        if self.multilayer_rnn_as_single_module:
            state_size = (
                (2 if self.bidirectional else 1) * self.num_layers,
                bsz,
                self.hidden_size,
            )
            h0, c0 = x.new_zeros(*state_size), x.new_zeros(*state_size)

            # pack embedded source tokens into a PackedSequence
            packed_x = nn.utils.rnn.pack_padded_sequence(
                x,
                (
                    src_lengths.cpu()
                    if not self.src_bucketed
                    else src_lengths.new_full(
                        src_lengths.size(), x.size(0), device="cpu"
                    )
                ),
                enforce_sorted=enforce_sorted,
            )
            # apply LSTM
            packed_outs, (_, _) = self.lstm(packed_x, (h0, c0))
            # unpack outputs
            x, _ = nn.utils.rnn.pad_packed_sequence(
                packed_outs, padding_value=self.padding_value * 1.0
            )
        else:  # for back-compatibility
            state_size = (2 if self.bidirectional else 1), bsz, self.hidden_size
            h0, c0 = x.new_zeros(*state_size), x.new_zeros(*state_size)

            for i in range(len(self.lstm)):
                if (
                    self.residual and i > 0
                ):  # residual connection starts from the 2nd layer
                    prev_x = x
                # pack embedded source tokens into a PackedSequence
                packed_x = nn.utils.rnn.pack_padded_sequence(
                    x,
                    (
                        src_lengths.cpu()
                        if not self.src_bucketed
                        else src_lengths.new_full(
                            src_lengths.size(), x.size(0), device="cpu"
                        )
                    ),
                    enforce_sorted=enforce_sorted,
                )

                # apply LSTM
                packed_outs, (_, _) = self.lstm[i](packed_x, (h0, c0))

                # unpack outputs and apply dropout
                x, _ = nn.utils.rnn.pad_packed_sequence(
                    packed_outs, padding_value=self.padding_value * 1.0
                )
                if i < len(self.lstm) - 1:  # not applying dropout for the last layer
                    x = self.dropout_out_module(x)
                x = x + prev_x if self.residual and i > 0 else x
        assert list(x.size()) == [seqlen, bsz, self.output_units]

        encoder_padding_mask = padding_mask.t()

        # The PyTorch Mobile lite interpreter does not support returning a NamedTuple
        # from `forward`, so we use a dictionary instead.
        # TorchScript does not support mixed values so the values are all lists.
        # The empty list is equivalent to None.
        return {
            "encoder_out": [x],  # T x B x C
            "encoder_padding_mask": [encoder_padding_mask]
            if encoder_padding_mask.any()
            else [],  # T x B
            "encoder_embedding": [],
            "encoder_states": [],
            "src_tokens": [],
            "src_lengths": [src_lengths],  # B
        }
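
The packing logic above is where variable-length handling actually happens: `pack_padded_sequence` flattens the right-padded batch according to the (CPU) lengths, the LSTM runs only over valid frames, and `pad_packed_sequence` restores a `T x B x C` tensor filled with `self.padding_value`. With `enforce_sorted=False` the batch does not have to be sorted by length. A standalone round-trip sketch with a plain `torch.nn.LSTM` and toy sizes:

    import torch
    import torch.nn as nn

    bsz, seqlen, feat, hidden = 4, 5, 8, 16
    x = torch.randn(seqlen, bsz, feat)          # T x B x C, right-padded
    lengths = torch.tensor([5, 3, 4, 2])        # not sorted by length

    packed = nn.utils.rnn.pack_padded_sequence(
        x, lengths.cpu(), enforce_sorted=False  # sorting/unsorting handled internally
    )
    lstm = nn.LSTM(feat, hidden, bidirectional=True)
    packed_out, _ = lstm(packed)
    out, out_lengths = nn.utils.rnn.pad_packed_sequence(packed_out, padding_value=0.0)

    assert out.shape == (seqlen, bsz, 2 * hidden)    # T x B x (num_directions * hidden)
    assert out_lengths.tolist() == lengths.tolist()  # original batch order restored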
Example #4
    def forward(
        self,
        src_tokens: Tensor,
        src_lengths: Tensor,
        enforce_sorted: bool = True,
        **unused,
    ):
        """
        Args:
            src_tokens (LongTensor): tokens in the source language of
                shape `(batch, src_len)`
            src_lengths (LongTensor): lengths of each source sentence of
                shape `(batch)`
            enforce_sorted (bool, optional): if True, `src_tokens` is
                expected to contain sequences sorted by length in a
                decreasing order. If False, this condition is not
                required. Default: True.
        """
        if self.left_pad:
            # nn.utils.rnn.pack_padded_sequence requires right-padding;
            # convert left-padding to right-padding
            src_tokens = speech_utils.convert_padding_direction(
                src_tokens,
                src_lengths,
                left_to_right=True,
            )

        if self.conv_layers_before is not None:
            x, src_lengths, padding_mask = self.conv_layers_before(src_tokens, src_lengths)
        else:
            x, padding_mask = src_tokens, \
                ~speech_utils.sequence_mask(src_lengths, src_tokens.size(1))

        bsz, seqlen = x.size(0), x.size(1)

        x = self.dropout_in_module(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        state_size = (2 if self.bidirectional else 1), bsz, self.hidden_size
        h0, c0 = x.new_zeros(*state_size), x.new_zeros(*state_size)

        for i in range(len(self.lstm)):
            if self.residual and i > 0:  # residual connection starts from the 2nd layer
                prev_x = x
            # pack embedded source tokens into a PackedSequence
            packed_x = nn.utils.rnn.pack_padded_sequence(
                x,
                (
                    src_lengths.cpu() if not self.src_bucketed else
                    src_lengths.new_full(src_lengths.size(), x.size(0), device="cpu")
                ),
                enforce_sorted=enforce_sorted
            )

            # apply LSTM
            packed_outs, (_, _) = self.lstm[i](packed_x, (h0, c0))

            # unpack outputs and apply dropout
            x, _ = nn.utils.rnn.pad_packed_sequence(
                packed_outs, padding_value=self.padding_value * 1.0
            )
            if i < len(self.lstm) - 1:  # not applying dropout for the last layer
                x = self.dropout_out_module(x)
            x = x + prev_x if self.residual and i > 0 else x
        assert list(x.size()) == [seqlen, bsz, self.output_units]

        encoder_padding_mask = padding_mask.t()

        return EncoderOut(
            encoder_out=x,  # T x B x C
            encoder_padding_mask=encoder_padding_mask if encoder_padding_mask.any() else None,  # T x B
            encoder_embedding=None,
            encoder_states=None,
            src_tokens=None,
            src_lengths=src_lengths,  # B
        )
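
One shape detail shared by these encoders: `state_size` gets a leading factor of 2 when `bidirectional` is set, and the `self.output_units` checked in the assert must then equal `2 * hidden_size`, because PyTorch concatenates the forward and backward directions on the feature dimension. A quick check with a plain `nn.LSTM` (toy sizes, not the encoder's own configuration):

    import torch
    import torch.nn as nn

    bsz, seqlen, feat, hidden = 3, 7, 8, 16
    lstm = nn.LSTM(feat, hidden, num_layers=1, bidirectional=True)

    x = torch.randn(seqlen, bsz, feat)             # T x B x C
    state_size = (2, bsz, hidden)                  # (num_layers * num_directions, B, H)
    h0, c0 = x.new_zeros(*state_size), x.new_zeros(*state_size)

    out, (hn, cn) = lstm(x, (h0, c0))
    assert out.shape == (seqlen, bsz, 2 * hidden)  # directions concatenated on the last dim
    assert hn.shape == state_size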