Example #1
0
def noise_estimation_loss(model,
                          x0: paddle.Tensor,
                          t: paddle.Tensor,
                          e: paddle.Tensor,
                          b: paddle.Tensor,
                          keepdim=False):
    """Squared error between the injected noise and the model's prediction.

    The clean input ``x0`` is mixed with the noise ``e`` using a coefficient
    derived from ``b``, the model is evaluated on the mixture, and the squared
    difference between ``e`` and the model output is summed over axes 1-3.

    Parameters:
        model: callable invoked as ``model(x, t_float)`` where ``x`` is the
            noised input and ``t_float`` is ``t`` cast to float32.
        x0(paddle.Tensor): the clean input. Presumably 4-D (the loss is
            reduced over axes 1, 2 and 3) -- confirm against the caller.
        t(paddle.Tensor): indices selecting, per sample, an entry along
            axis 0 of the cumulative product of ``1 - b``.
        e(paddle.Tensor): the noise mixed into ``x0``; also the regression
            target for the model output.
        b(paddle.Tensor): per-step coefficients; ``1 - b`` is accumulated
            with a cumulative product along axis 0.
        keepdim(bool): if True, return one loss value per sample; otherwise
            return the mean of those values over axis 0.
            The default value is False.

    Returns:
        The per-sample summed squared error if ``keepdim`` is True, else its
        mean over axis 0.
    """
    # Cumulative product of (1 - b), gathered at the indices in t and
    # reshaped to (-1, 1, 1, 1) so it broadcasts against x0 and e.
    cumulative = (1 - b).cumprod(0)
    a = cumulative.index_select(t, 0).reshape((-1, 1, 1, 1))

    signal_part = x0 * a.sqrt()
    noise_part = e * (1.0 - a).sqrt()
    predicted = model(signal_part + noise_part, t.astype('float32'))

    per_sample = (e - predicted).square().sum((1, 2, 3))
    return per_sample if keepdim else per_sample.mean(0)
Example #2
0
    def _get_feat_extract_output_lengths(self, input_lengths: paddle.Tensor):
        """
        Computes the output length of the convolutional layers

        Every (kernel_size, stride) pair taken from ``self.config.conv_kernel``
        and ``self.config.conv_stride`` is applied in order to
        ``input_lengths``; the final lengths are returned cast to int64.
        """
        lengths = input_lengths
        layer_specs = zip(self.config.conv_kernel, self.config.conv_stride)
        for width, step in layer_specs:
            # Output length of a 1D convolution with the given kernel width
            # and stride: floor((L_in - kernel_size) / stride) + 1. No padding
            # or dilation term is applied here.
            lengths = (lengths - width) // step + 1

        return lengths.astype('int64')
Example #3
0
def melspectrogram(x: Tensor,
                   sr: int = 22050,
                   n_fft: int = 2048,
                   hop_length: Optional[int] = None,
                   win_length: Optional[int] = None,
                   window: str = 'hann',
                   center: bool = True,
                   pad_mode: str = 'reflect',
                   power: float = 2.0,
                   n_mels: int = 128,
                   f_min: float = 0.0,
                   f_max: Optional[float] = None,
                   htk: bool = True,
                   norm: Union[str, float] = 'slaney',
                   dtype: str = 'float64',
                   to_db: bool = False,
                   **kwargs) -> Tensor:
    """Compute the mel spectrogram of a signal, typically an audio waveform.
        The mel spectrogram is also known as the filterbank or fbank feature in
        the audio community. It is obtained by multiplying the spectrogram of the
        signal with a mel filter bank matrix.

        Parameters:
            x(Tensor): the input signal, see the Shape section below.
            sr(int): the audio sample rate.
                The default value is 22050.
            n_fft(int): the number of frequency components of the discrete Fourier transform.
                The default value is 2048.
            hop_length(int|None): the hop length of the short-time FFT. If None, it is set to win_length//4.
                The default value is None.
            win_length(int|None): the window length of the short-time FFT. If None, it is set to n_fft.
                The default value is None.
            window(str): the name of the window function applied to the signal before the Fourier transform.
                The following window names are supported: 'hamming','hann','kaiser','gaussian',
                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
                The default value is 'hann'.
            center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
                If False, frame t begins at x[t * hop_length].
                The default value is True.
            pad_mode(str): the mode used to pad the signal if necessary. The supported modes are 'reflect'
                and 'constant'.
                The default value is 'reflect'.
            power(float): the power of the complex norm.
                The default value is 2.0.
            n_mels(int): the number of mel bins; common choices are 32, 40, 64, 80, 128.
                The default value is 128.
            f_min(float): the lower cut-off frequency, below which the filter response is zero. Tip:
                set f_min slightly higher than 0.
                The default value is 0.
            f_max(float|None): the upper cut-off frequency, above which the filter response is zero.
                If None, it is set to half of the sample rate, i.e., sr//2. Tip: set it slightly
                smaller than half of the sample rate.
                The default value is None.
            htk(bool): whether to use the HTK formula when computing the fbank matrix.
                The default value is True.
            norm(str|float): the normalization type used when computing the fbank matrix. Slaney-style is
                used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization.
            dtype(str): the data type passed to both the spectrogram and the fbank matrix computation.
                Use float64 (default) to increase numerical accuracy. The spectrogram is cast to the
                dtype of the fbank matrix before the two are multiplied.
            to_db(bool): whether to convert the result to db scale.
                The default value is False.
            kwargs: the keyword arguments passed to power_to_db if to_db is True.

        Shape:
            - x: 1-D tensor with shape (signal_length,) or 2-D tensor with shape (N, signal_length).
            - output: tensor with shape (N, n_mels, frame_number).
            The batch size N is set to 1 if the input signal x is a 1-D tensor.

        Notes:
            1. The melspectrogram function relies on spectrogram and compute_fbank_matrix.
            2. The melspectrogram function does not convert magnitude to db by default.

        Examples:

            .. code-block:: python

                import paddle
                import paddleaudio.functional as F
                x = F.melspectrogram(paddle.randn((8, 16000,)))
                print(x.shape)
                # [8, 128, 32]

    """

    spec = spectrogram(x,
                       n_fft=n_fft,
                       hop_length=hop_length,
                       win_length=win_length,
                       window=window,
                       center=center,
                       pad_mode=pad_mode,
                       power=power,
                       dtype=dtype)

    # Fall back to half the sample rate when no upper cut-off is given.
    upper_freq = sr // 2 if f_max is None else f_max

    # A leading axis is added to the filter bank so that the matmul below
    # broadcasts it against the spectrogram.
    mel_basis = compute_fbank_matrix(sr=sr,
                                     n_fft=n_fft,
                                     n_mels=n_mels,
                                     f_min=f_min,
                                     f_max=upper_freq,
                                     htk=htk,
                                     norm=norm,
                                     dtype=dtype).unsqueeze(0)

    # Bring the spectrogram to the filter bank's dtype before multiplying.
    mel_feature = paddle.matmul(mel_basis, spec.astype(mel_basis.dtype))

    if not to_db:
        return mel_feature
    return power_to_db(mel_feature, **kwargs)