def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
             n_mel_channels=80, sampling_rate=16000, mel_fmin=0.0,
             mel_fmax=8000.0):
    """PyTorch layer which calculates mel spectrograms.

    Builds the STFT front end and registers the mel filterbank as a
    module buffer named ``mel_basis``.

    Args:
        filter_length (int): the number of FFT components.
        hop_length (int): the window slide over the waveform; the
            suggested value corresponds to 12.5 ms.
        win_length (int): the size of the window to be applied; the
            suggested value corresponds to 50 ms.
        n_mel_channels (int): the number of mel bands/bins to generate.
        sampling_rate (int): the sampling rate of the audio waveform.
            All data is expected to share this rate; the rate of a
            given file can be found via sox.
        mel_fmin (float): lower frequency bound handed to the mel
            filterbank builder; please refer to the librosa
            documentation.
        mel_fmax (float): upper frequency bound handed to the mel
            filterbank builder; please refer to the librosa
            documentation.
    """
    super(TacotronSTFT, self).__init__()
    self.n_mel_channels = n_mel_channels
    self.sampling_rate = sampling_rate
    self.stft_fn = STFT(filter_length, hop_length, win_length)

    # Pass everything by keyword: librosa >= 0.10 made the parameters of
    # librosa.filters.mel keyword-only, so the positional form raises
    # TypeError there, while the keyword form works on old and new versions.
    # NOTE(review): assumes librosa_mel_fn is librosa.filters.mel -- confirm
    # against this file's imports.
    mel_basis = librosa_mel_fn(
        sr=sampling_rate,
        n_fft=filter_length,
        n_mels=n_mel_channels,
        fmin=mel_fmin,
        fmax=mel_fmax,
    )
    mel_basis = torch.from_numpy(mel_basis).float()
    # A buffer (not a parameter): it follows the module across devices and
    # into the state dict without being trained.
    self.register_buffer('mel_basis', mel_basis)
def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
             n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
             mel_fmax=8000.0):
    """Create the STFT front end and the mel filterbank buffer.

    Args:
        filter_length (int): FFT size, forwarded to ``STFT`` and to the
            mel filterbank builder.
        hop_length (int): hop between successive frames, forwarded to
            ``STFT``.
        win_length (int): analysis window length, forwarded to ``STFT``.
        n_mel_channels (int): number of mel bands in the filterbank.
        sampling_rate (int): sampling rate of the audio the layer will
            be fed.
        mel_fmin (float): lower frequency bound of the mel filterbank.
        mel_fmax (float): upper frequency bound of the mel filterbank.
    """
    super(TacotronSTFT, self).__init__()
    self.n_mel_channels = n_mel_channels
    self.sampling_rate = sampling_rate
    self.stft_fn = STFT(filter_length, hop_length, win_length)

    # Keyword arguments keep this call valid on librosa >= 0.10, where the
    # parameters of librosa.filters.mel became keyword-only; older releases
    # accept the same names.
    # NOTE(review): assumes librosa_mel_fn is librosa.filters.mel -- confirm
    # against this file's imports.
    filterbank = librosa_mel_fn(
        sr=sampling_rate,
        n_fft=filter_length,
        n_mels=n_mel_channels,
        fmin=mel_fmin,
        fmax=mel_fmax,
    )
    # Stored as a float32 buffer so it is saved with, and moved alongside,
    # the module without being a trainable parameter.
    self.register_buffer('mel_basis', torch.from_numpy(filterbank).float())
def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
             n_mel_channels=80, sampling_rate=22050, mel_fmin=50.0,
             mel_fmax=7600.0):
    """Create the STFT front end plus forward and inverse mel filterbanks.

    Two buffers are registered: ``mel_basis`` (the librosa mel
    filterbank) and ``inv_mel_basis`` (its Moore-Penrose pseudo-inverse).

    Args:
        filter_length (int): FFT size, forwarded to ``STFT`` and to the
            mel filterbank builder.
        hop_length (int): hop between successive frames, forwarded to
            ``STFT``.
        win_length (int): analysis window length, forwarded to ``STFT``.
        n_mel_channels (int): number of mel bands in the filterbank.
        sampling_rate (int): sampling rate of the audio the layer will
            be fed.
        mel_fmin (float): lower frequency bound of the mel filterbank.
        mel_fmax (float): upper frequency bound of the mel filterbank.
    """
    # Kept function-local, as in the original, because the top-of-file
    # imports are not guaranteed to provide numpy.
    import numpy as np

    super(TacotronSTFT, self).__init__()
    self.n_mel_channels = n_mel_channels
    self.sampling_rate = sampling_rate
    self.stft_fn = STFT(filter_length, hop_length, win_length)

    # librosa >= 0.10 made these parameters keyword-only; the positional
    # form raises TypeError there. Keywords work on older releases too.
    mel_basis = librosa.filters.mel(
        sr=sampling_rate,
        n_fft=filter_length,
        n_mels=n_mel_channels,
        fmin=mel_fmin,
        fmax=mel_fmax,
    )
    # The pseudo-inverse is computed on the numpy array, before the
    # conversion to torch, exactly as the forward basis is produced.
    inv_mel_basis = np.linalg.pinv(mel_basis)

    self.register_buffer('mel_basis', torch.from_numpy(mel_basis).float())
    self.register_buffer('inv_mel_basis',
                         torch.from_numpy(inv_mel_basis).float())
class TacotronSTFT(torch.nn.Module):
    """STFT front end that turns waveforms into compressed mel-spectrograms."""

    def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
                 n_mel_channels=80, sampling_rate=24000, mel_fmin=0.0,
                 mel_fmax=12000.0):
        """Create the STFT transform and the mel filterbank buffer.

        Args:
            filter_length (int): FFT size, forwarded to ``STFT`` and to
                the mel filterbank builder.
            hop_length (int): hop between successive frames.
            win_length (int): analysis window length.
            n_mel_channels (int): number of mel bands in the filterbank.
            sampling_rate (int): sampling rate of the input audio.
            mel_fmin (float): lower frequency bound of the filterbank.
            mel_fmax (float): upper frequency bound of the filterbank.
        """
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)

        # Keyword arguments keep this call valid on librosa >= 0.10, where
        # the parameters of librosa.filters.mel became keyword-only.
        # NOTE(review): assumes librosa_mel_fn is librosa.filters.mel --
        # confirm against this file's imports.
        mel_basis = librosa_mel_fn(
            sr=sampling_rate,
            n_fft=filter_length,
            n_mels=n_mel_channels,
            fmin=mel_fmin,
            fmax=mel_fmax,
        )
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)

    def spectral_normalize(self, magnitudes):
        """Apply ``dynamic_range_compression`` to ``magnitudes``."""
        return dynamic_range_compression(magnitudes)

    def spectral_de_normalize(self, magnitudes):
        """Apply ``dynamic_range_decompression`` to ``magnitudes``."""
        return dynamic_range_decompression(magnitudes)

    def mel_spectrogram(self, y):
        """Computes mel-spectrograms from a batch of waves

        If any sample lies outside [-1, 1] the batch is rescaled by its
        peak absolute value first. NOTE: that rescaling is done in place
        on ``y.data``, so the caller's tensor is modified.

        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        """
        if (torch.max(y.data) > 1) or (torch.min(y.data) < -1):
            # Divide by the larger of |min| and max so that every sample
            # ends up inside [-1, 1].
            y.data /= torch.max(torch.min(y.data).abs(), torch.max(y.data))
            print(f"Normalized: {torch.min(y.data)} ... {torch.max(y.data)}")

        assert (torch.min(y.data) >= -1), f"Min value of audio tensor: {torch.min(y.data)} < -1"
        assert (torch.max(y.data) <= 1), f"Max value of audio tensor: {torch.max(y.data)} > 1"

        magnitudes, phases = self.stft_fn.transform(y)
        # .data drops the autograd history of the STFT output.
        magnitudes = magnitudes.data
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        return self.spectral_normalize(mel_output)
class TacotronSTFT(torch.nn.Module):
    """STFT front end that turns waveforms into compressed mel-spectrograms."""

    def __init__(self, filter_length=2048, hop_length=275, win_length=1100,
                 n_mel_channels=80, sampling_rate=22050, mel_fmin=125.0,
                 mel_fmax=7600.0):
        """Create the STFT transform and the mel filterbank buffer.

        Args:
            filter_length (int): FFT size, forwarded to ``STFT`` and to
                the mel filterbank builder.
            hop_length (int): hop between successive frames.
            win_length (int): analysis window length.
            n_mel_channels (int): number of mel bands in the filterbank.
            sampling_rate (int): sampling rate of the input audio.
            mel_fmin (float): lower frequency bound of the filterbank.
            mel_fmax (float): upper frequency bound of the filterbank.
        """
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)

        # librosa >= 0.10 made these parameters keyword-only; the
        # positional form raises TypeError there. Keywords also work on
        # older releases.
        mel_basis = librosa.filters.mel(
            sr=sampling_rate,
            n_fft=filter_length,
            n_mels=n_mel_channels,
            fmin=mel_fmin,
            fmax=mel_fmax,
        )
        self.register_buffer('mel_basis',
                             torch.from_numpy(mel_basis).float())

    def spectral_normalize(self, magnitudes):
        """Apply ``dynamic_range_compression`` to ``magnitudes``."""
        return dynamic_range_compression(magnitudes)

    def spectral_de_normalize(self, magnitudes):
        """Apply ``dynamic_range_decompression`` to ``magnitudes``."""
        return dynamic_range_decompression(magnitudes)

    def mel_spectrogram(self, y):
        """Computes mel-spectrograms from a batch of waves

        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)

        RAISES
        ------
        AssertionError: if any sample of ``y`` lies outside [-1, 1]
        """
        assert (torch.min(y.data) >= -1)
        assert (torch.max(y.data) <= 1)

        magnitudes, phases = self.stft_fn.transform(y)
        # .data drops the autograd history of the STFT output.
        magnitudes = magnitudes.data
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        return self.spectral_normalize(mel_output)
class TacotronSTFT(torch.nn.Module):
    """Mel-spectrogram layer with an approximate mel-to-linear inverse."""

    def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
                 n_mel_channels=80, sampling_rate=22050, mel_fmin=50.0,
                 mel_fmax=7600.0):
        """Create the STFT transform and forward/inverse mel filterbanks.

        Two buffers are registered: ``mel_basis`` (the librosa mel
        filterbank) and ``inv_mel_basis`` (its Moore-Penrose
        pseudo-inverse).

        Args:
            filter_length (int): FFT size, forwarded to ``STFT`` and to
                the mel filterbank builder.
            hop_length (int): hop between successive frames.
            win_length (int): analysis window length.
            n_mel_channels (int): number of mel bands in the filterbank.
            sampling_rate (int): sampling rate of the input audio.
            mel_fmin (float): lower frequency bound of the filterbank.
            mel_fmax (float): upper frequency bound of the filterbank.
        """
        # Kept function-local, as in the original, because the top-of-file
        # imports are not guaranteed to provide numpy.
        import numpy as np

        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)

        # librosa >= 0.10 made these parameters keyword-only; the
        # positional form raises TypeError there. Keywords also work on
        # older releases.
        mel_basis = librosa.filters.mel(
            sr=sampling_rate,
            n_fft=filter_length,
            n_mels=n_mel_channels,
            fmin=mel_fmin,
            fmax=mel_fmax,
        )
        inv_mel_basis = np.linalg.pinv(mel_basis)

        self.register_buffer('mel_basis',
                             torch.from_numpy(mel_basis).float())
        self.register_buffer('inv_mel_basis',
                             torch.from_numpy(inv_mel_basis).float())

    def spectral_normalize(self, magnitudes):
        """Apply ``dynamic_range_compression`` to ``magnitudes``."""
        return dynamic_range_compression(magnitudes)

    def spectral_de_normalize(self, magnitudes):
        """Apply ``dynamic_range_decompression`` to ``magnitudes``."""
        return dynamic_range_decompression(magnitudes)

    def mel_spectrogram(self, y):
        """Computes mel-spectrograms from a batch of waves

        NOTE: the [-1, 1] range of ``y`` is not checked by this method.

        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        """
        magnitudes, phases = self.stft_fn.transform(y)
        # .data drops the autograd history of the STFT output.
        magnitudes = magnitudes.data
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        return self.spectral_normalize(mel_output)

    def inv_mel_spectrogram(self, mel):
        """Approximately map compressed mel-spectrograms back to magnitudes

        The input is cast to float, de-normalized with
        ``spectral_de_normalize``, multiplied by the pseudo-inverse mel
        basis, and finally floored at 1e-10 so that no value in the
        result is smaller than that.

        PARAMS
        ------
        mel: tensor of compressed mel-spectrograms, e.g. as produced by
            ``mel_spectrogram`` (presumably (B, n_mel_channels, T) --
            TODO confirm against callers)

        RETURNS
        -------
        magnitudes: tensor of estimated linear-frequency magnitudes,
            detached from the autograd graph
        """
        mel = self.spectral_de_normalize(mel.float())
        magnitudes = torch.matmul(self.inv_mel_basis, mel.data)
        # The pseudo-inverse can yield tiny or negative values; clip them
        # to a small positive floor.
        floor = torch.full_like(magnitudes, 1e-10)
        magnitudes = torch.max(floor, magnitudes)
        return magnitudes.data
class TacotronSTFT(torch.nn.Module):
    """PyTorch layer which calculates (mel) spectrograms from waveforms."""

    def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
                 n_mel_channels=80, sampling_rate=16000, mel_fmin=0.0,
                 mel_fmax=8000.0):
        """PyTorch layer which calculates mel spectrograms.

        Args:
            filter_length (int): the number of FFT components.
            hop_length (int): the window slide over the waveform; the
                suggested value corresponds to 12.5 ms.
            win_length (int): the size of the window to be applied; the
                suggested value corresponds to 50 ms.
            n_mel_channels (int): the number of mel bands/bins to
                generate.
            sampling_rate (int): the sampling rate of the audio
                waveform. All data is expected to share this rate; the
                rate of a given file can be found via sox.
            mel_fmin (float): lower frequency bound handed to the mel
                filterbank builder; please refer to the librosa
                documentation.
            mel_fmax (float): upper frequency bound handed to the mel
                filterbank builder; please refer to the librosa
                documentation.
        """
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)

        # Pass everything by keyword: librosa >= 0.10 made the parameters
        # of librosa.filters.mel keyword-only, so the positional form
        # raises TypeError there.
        # NOTE(review): assumes librosa_mel_fn is librosa.filters.mel --
        # confirm against this file's imports.
        mel_basis = librosa_mel_fn(
            sr=sampling_rate,
            n_fft=filter_length,
            n_mels=n_mel_channels,
            fmin=mel_fmin,
            fmax=mel_fmax,
        )
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)

    def spectral_normalize(self, magnitudes):
        """Apply ``dynamic_range_compression`` to ``magnitudes``."""
        return dynamic_range_compression(magnitudes)

    def spectral_de_normalize(self, magnitudes):
        """Apply ``dynamic_range_decompression`` to ``magnitudes``."""
        return dynamic_range_decompression(magnitudes)

    def spectrogram(self, y):
        """Computes spectrogram from batch of waves

        Args:
            y (torch.FloatTensor) (B, T) in range [-1, 1]

        Returns:
            magnitudes (torch.FloatTensor) (B, n_spec_channels, T)

        Raises:
            AssertionError: if any sample of ``y`` lies outside [-1, 1]
        """
        assert (torch.min(y.data) >= -1)
        assert (torch.max(y.data) <= 1)

        # The phase output of the transform is not needed here.
        magnitudes, phases = self.stft_fn.transform(y)
        return magnitudes

    def mel_spectrogram(self, y):
        """Computes mel-spectrograms from a batch of waves

        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)

        RAISES
        ------
        AssertionError: if any sample of ``y`` lies outside [-1, 1]
        """
        # Reuse spectrogram() for the range checks and the STFT; .data
        # drops the autograd history of the STFT output.
        magnitudes = self.spectrogram(y).data
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        return self.spectral_normalize(mel_output)