Esempio n. 1
0
    def __init__(self,
                 filter_length=1024,
                 hop_length=256, win_length=1024,
                 n_mel_channels=80,
                 sampling_rate=16000,
                 mel_fmin=0.0,
                 mel_fmax=8000.0):
        """
        PyTorch layer which calculates mel spectrograms.

        Args:
            filter_length (int): the number of fft components
            hop_length (int): the window slide over the waveform,
                with suggested value 12.5ms
            win_length (int): the size of the window to be applied, suggested
                value is 50ms
            n_mel_channels (int): the number of mel bands/bins to be generated
            sampling_rate (int): the sampling rate of the audio waveform, given
                data of the same sr suggested value can be found via sox
            mel_fmin (float): lowest frequency of the mel filterbank, passed
                to librosa as ``fmin``
            mel_fmax (float): highest frequency of the mel filterbank, passed
                to librosa as ``fmax``
        """
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        # librosa_mel_fn is presumably an alias of librosa.filters.mel (the
        # import is not visible here). Its parameters are keyword-only from
        # librosa 0.10 on, so a positional call raises TypeError there; the
        # keyword form is accepted by older releases as well.
        mel_basis = librosa_mel_fn(
            sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels,
            fmin=mel_fmin, fmax=mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        # Registered as a buffer so it is part of the module state without
        # being a trainable parameter.
        self.register_buffer('mel_basis', mel_basis)
Esempio n. 2
0
 def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
              n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
              mel_fmax=8000.0):
     """Set up the STFT front-end and the mel filterbank buffer.

     Args:
         filter_length: forwarded to ``STFT`` and used as ``n_fft`` for the
             mel filterbank.
         hop_length: forwarded to ``STFT``.
         win_length: forwarded to ``STFT``.
         n_mel_channels: number of mel bands in the filterbank.
         sampling_rate: sampling rate used to build the filterbank.
         mel_fmin: lower frequency bound of the filterbank.
         mel_fmax: upper frequency bound of the filterbank.
     """
     super(TacotronSTFT, self).__init__()
     self.n_mel_channels = n_mel_channels
     self.sampling_rate = sampling_rate
     self.stft_fn = STFT(filter_length, hop_length, win_length)
     # librosa_mel_fn is presumably an alias of librosa.filters.mel (import
     # not visible here). librosa >= 0.10 only accepts keyword arguments for
     # it, so the positional form raises TypeError; keywords work on older
     # releases too.
     filterbank = librosa_mel_fn(
         sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels,
         fmin=mel_fmin, fmax=mel_fmax)
     # Stored as a float32 buffer: module state, but not a trainable parameter.
     self.register_buffer('mel_basis', torch.from_numpy(filterbank).float())
Esempio n. 3
0
 def __init__(self,
              filter_length=1024,
              hop_length=256,
              win_length=1024,
              n_mel_channels=80,
              sampling_rate=22050,
              mel_fmin=50.0,
              mel_fmax=7600.0):
     """Set up the STFT front-end plus forward and inverse mel filterbanks.

     Args:
         filter_length: forwarded to ``STFT`` and used as ``n_fft`` for the
             mel filterbank.
         hop_length: forwarded to ``STFT``.
         win_length: forwarded to ``STFT``.
         n_mel_channels: number of mel bands in the filterbank.
         sampling_rate: sampling rate used to build the filterbank.
         mel_fmin: lower frequency bound of the filterbank.
         mel_fmax: upper frequency bound of the filterbank.
     """
     super(TacotronSTFT, self).__init__()
     self.n_mel_channels = n_mel_channels
     self.sampling_rate = sampling_rate
     self.stft_fn = STFT(filter_length, hop_length, win_length)
     # Keyword arguments are required: librosa >= 0.10 made the parameters of
     # librosa.filters.mel keyword-only, so the positional form raises
     # TypeError there. The keyword form also works on older releases.
     mel_basis = librosa.filters.mel(sr=sampling_rate, n_fft=filter_length,
                                     n_mels=n_mel_channels, fmin=mel_fmin,
                                     fmax=mel_fmax)
     # Local import kept as in the original; the file-level imports are not
     # visible from this block.
     import numpy as np
     # Pseudo-inverse of the filterbank, used to map mel bands back to
     # linear-frequency bins.
     inv_mel_basis = np.linalg.pinv(mel_basis)
     # Both matrices are stored as float32 buffers (module state, not
     # trainable parameters).
     self.register_buffer('mel_basis', torch.from_numpy(mel_basis).float())
     self.register_buffer('inv_mel_basis',
                          torch.from_numpy(inv_mel_basis).float())
Esempio n. 4
0
class TacotronSTFT(torch.nn.Module):
    """Module that turns batches of waveforms into compressed mel spectrograms."""

    def __init__(self,
                 filter_length=1024,
                 hop_length=256,
                 win_length=1024,
                 n_mel_channels=80,
                 sampling_rate=24000,
                 mel_fmin=0.0,
                 mel_fmax=12000.0):
        """Set up the STFT front-end and the mel filterbank buffer.

        Args:
            filter_length: forwarded to ``STFT`` and used as ``n_fft`` for
                the mel filterbank.
            hop_length: forwarded to ``STFT``.
            win_length: forwarded to ``STFT``.
            n_mel_channels: number of mel bands in the filterbank.
            sampling_rate: sampling rate used to build the filterbank.
            mel_fmin: lower frequency bound of the filterbank.
            mel_fmax: upper frequency bound of the filterbank.
        """
        super().__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        # librosa_mel_fn is presumably an alias of librosa.filters.mel (import
        # not visible here). librosa >= 0.10 only accepts keyword arguments
        # for it, so the positional form raises TypeError; keywords work on
        # older releases too.
        mel_basis = librosa_mel_fn(sr=sampling_rate, n_fft=filter_length,
                                   n_mels=n_mel_channels, fmin=mel_fmin,
                                   fmax=mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        # Buffer: module state, but not a trainable parameter.
        self.register_buffer('mel_basis', mel_basis)

    def spectral_normalize(self, magnitudes):
        """Return ``dynamic_range_compression`` applied to ``magnitudes``."""
        return dynamic_range_compression(magnitudes)

    def spectral_de_normalize(self, magnitudes):
        """Return ``dynamic_range_decompression`` applied to ``magnitudes``."""
        return dynamic_range_decompression(magnitudes)

    def mel_spectrogram(self, y):
        """Computes mel-spectrograms from a batch of waves

        If any sample lies outside [-1, 1], a peak-normalized copy of ``y``
        is used (and a message is printed); the caller's tensor is left
        untouched.

        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        """
        # Largest absolute sample; equals max(|min(y)|, max(y)).
        peak = torch.max(torch.abs(y.data))
        if peak > 1:
            # Out-of-place division: the original in-place `y.data /= ...`
            # silently rescaled the tensor owned by the caller.
            y = y.data / peak
            print(f"Normalized: {torch.min(y.data)} ... {torch.max(y.data)}")

        assert (torch.min(y.data) >=
                -1), f"Min value of audio tensor: {torch.min(y.data)} < -1"
        assert (torch.max(y.data) <=
                1), f"Max value of audio tensor: {torch.max(y.data)} > 1"

        magnitudes, _phases = self.stft_fn.transform(y)
        # Detach so the mel features carry no autograd history.
        magnitudes = magnitudes.detach()
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        return self.spectral_normalize(mel_output)
Esempio n. 5
0
class TacotronSTFT(torch.nn.Module):
    """Module that turns batches of waveforms into compressed mel spectrograms."""

    def __init__(self,
                 filter_length=2048,
                 hop_length=275,
                 win_length=1100,
                 n_mel_channels=80,
                 sampling_rate=22050,
                 mel_fmin=125.0,
                 mel_fmax=7600.0):
        """Set up the STFT front-end and the mel filterbank buffer.

        Args:
            filter_length: forwarded to ``STFT`` and used as ``n_fft`` for
                the mel filterbank.
            hop_length: forwarded to ``STFT``.
            win_length: forwarded to ``STFT``.
            n_mel_channels: number of mel bands in the filterbank.
            sampling_rate: sampling rate used to build the filterbank.
            mel_fmin: lower frequency bound of the filterbank.
            mel_fmax: upper frequency bound of the filterbank.
        """
        super().__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        # Keyword arguments are required: librosa >= 0.10 made the parameters
        # of librosa.filters.mel keyword-only, so the positional form raises
        # TypeError there. The keyword form also works on older releases.
        mel_basis = librosa.filters.mel(sr=sampling_rate, n_fft=filter_length,
                                        n_mels=n_mel_channels, fmin=mel_fmin,
                                        fmax=mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        # Buffer: module state, but not a trainable parameter.
        self.register_buffer('mel_basis', mel_basis)

    def spectral_normalize(self, magnitudes):
        """Return ``dynamic_range_compression`` applied to ``magnitudes``."""
        return dynamic_range_compression(magnitudes)

    def spectral_de_normalize(self, magnitudes):
        """Return ``dynamic_range_decompression`` applied to ``magnitudes``."""
        return dynamic_range_decompression(magnitudes)

    def mel_spectrogram(self, y):
        """Computes mel-spectrograms from a batch of waves
        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)

        RAISES
        ------
        AssertionError: if any sample of ``y`` lies outside [-1, 1]
        """
        assert torch.min(y.data) >= -1, "audio tensor has samples below -1"
        assert torch.max(y.data) <= 1, "audio tensor has samples above 1"

        magnitudes, _phases = self.stft_fn.transform(y)
        # Detach so the mel features carry no autograd history.
        magnitudes = magnitudes.detach()
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        return self.spectral_normalize(mel_output)
Esempio n. 6
0
class TacotronSTFT(torch.nn.Module):
    """Module converting waveforms to mel spectrograms and mels back to magnitudes."""

    def __init__(self,
                 filter_length=1024,
                 hop_length=256,
                 win_length=1024,
                 n_mel_channels=80,
                 sampling_rate=22050,
                 mel_fmin=50.0,
                 mel_fmax=7600.0):
        """Set up the STFT front-end plus forward and inverse mel filterbanks.

        Args:
            filter_length: forwarded to ``STFT`` and used as ``n_fft`` for
                the mel filterbank.
            hop_length: forwarded to ``STFT``.
            win_length: forwarded to ``STFT``.
            n_mel_channels: number of mel bands in the filterbank.
            sampling_rate: sampling rate used to build the filterbank.
            mel_fmin: lower frequency bound of the filterbank.
            mel_fmax: upper frequency bound of the filterbank.
        """
        super().__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        # Keyword arguments are required: librosa >= 0.10 made the parameters
        # of librosa.filters.mel keyword-only, so the positional form raises
        # TypeError there. The keyword form also works on older releases.
        mel_basis = librosa.filters.mel(sr=sampling_rate, n_fft=filter_length,
                                        n_mels=n_mel_channels, fmin=mel_fmin,
                                        fmax=mel_fmax)
        # Local import kept as in the original; the file-level imports are
        # not visible from this block.
        import numpy as np
        # Pseudo-inverse of the filterbank, used by inv_mel_spectrogram to map
        # mel bands back to linear-frequency bins.
        inv_mel_basis = np.linalg.pinv(mel_basis)
        # Both matrices are float32 buffers (module state, not parameters).
        self.register_buffer('mel_basis', torch.from_numpy(mel_basis).float())
        self.register_buffer('inv_mel_basis',
                             torch.from_numpy(inv_mel_basis).float())

    def spectral_normalize(self, magnitudes):
        """Return ``dynamic_range_compression`` applied to ``magnitudes``."""
        return dynamic_range_compression(magnitudes)

    def spectral_de_normalize(self, magnitudes):
        """Return ``dynamic_range_decompression`` applied to ``magnitudes``."""
        return dynamic_range_decompression(magnitudes)

    def mel_spectrogram(self, y):
        """Computes mel-spectrograms from a batch of waves

        No range check is performed on ``y`` in this variant.

        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        """
        magnitudes, _phases = self.stft_fn.transform(y)
        # Detach so the mel features carry no autograd history.
        magnitudes = magnitudes.detach()
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        return self.spectral_normalize(mel_output)

    def inv_mel_spectrogram(self, mel):
        """Approximates linear magnitudes from a batch of mel-spectrograms

        The mel input is de-normalized, projected through the pseudo-inverse
        of the mel filterbank, and floored at 1e-10.

        PARAMS
        ------
        mel: tensor of normalized mel-spectrograms, as produced by
            ``mel_spectrogram``; presumably (B, n_mel_channels, T)

        RETURNS
        -------
        magnitudes: detached tensor with one row per row of
            ``inv_mel_basis``, every value >= 1e-10
        """
        mel = self.spectral_de_normalize(mel.float())
        magnitudes = torch.matmul(self.inv_mel_basis, mel.detach())
        # The pseudo-inverse can yield negative or zero values; floor them.
        magnitudes = torch.clamp(magnitudes, min=1e-10)
        return magnitudes.detach()
Esempio n. 7
0
class TacotronSTFT(torch.nn.Module):
    """Module that computes linear and mel spectrograms from waveforms."""

    def __init__(self,
                 filter_length=1024,
                 hop_length=256, win_length=1024,
                 n_mel_channels=80,
                 sampling_rate=16000,
                 mel_fmin=0.0,
                 mel_fmax=8000.0):
        """
        PyTorch layer which calculates mel spectrograms.

        Args:
            filter_length (int): the number of fft components
            hop_length (int): the window slide over the waveform,
                with suggested value 12.5ms
            win_length (int): the size of the window to be applied, suggested
                value is 50ms
            n_mel_channels (int): the number of mel bands/bins to be generated
            sampling_rate (int): the sampling rate of the audio waveform, given
                data of the same sr suggested value can be found via sox
            mel_fmin (float): lowest frequency of the mel filterbank, passed
                to librosa as ``fmin``
            mel_fmax (float): highest frequency of the mel filterbank, passed
                to librosa as ``fmax``
        """
        super().__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        # librosa_mel_fn is presumably an alias of librosa.filters.mel (import
        # not visible here). librosa >= 0.10 only accepts keyword arguments
        # for it, so the positional form raises TypeError; keywords work on
        # older releases too.
        mel_basis = librosa_mel_fn(
            sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels,
            fmin=mel_fmin, fmax=mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        # Buffer: module state, but not a trainable parameter.
        self.register_buffer('mel_basis', mel_basis)

    def spectral_normalize(self, magnitudes):
        """Return ``dynamic_range_compression`` applied to ``magnitudes``."""
        return dynamic_range_compression(magnitudes)

    def spectral_de_normalize(self, magnitudes):
        """Return ``dynamic_range_decompression`` applied to ``magnitudes``."""
        return dynamic_range_decompression(magnitudes)

    def spectrogram(self, y):
        """
        Computes spectrogram from batch of waves

        Args:
            y (torch.FloatTensor) (B, T) in range [-1, 1]
        Returns:
            magnitudes (torch.FloatTensor) (B, n_spec_channels, T)
        Raises:
            AssertionError: if any sample of ``y`` lies outside [-1, 1]
        """
        assert torch.min(y.data) >= -1
        assert torch.max(y.data) <= 1

        magnitudes, _phases = self.stft_fn.transform(y)
        return magnitudes

    def mel_spectrogram(self, y):
        """Computes mel-spectrograms from a batch of waves
        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)

        RAISES
        ------
        AssertionError: if any sample of ``y`` lies outside [-1, 1]
        """
        # Range validation and the STFT live in spectrogram(); reuse them
        # instead of duplicating the code. Detach so the mel features carry
        # no autograd history.
        magnitudes = self.spectrogram(y).detach()
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        return self.spectral_normalize(mel_output)