def decibel_statistics(wav, sampling_rate, n_fft=1024, n_mels=80, hop_length=None,
                       win_length=None):
    """
    Calculate (min, max) values for the decibel values of both the linear scale magnitude
    spectrogram and a mel scale magnitude spectrogram.

    The decibel values are returned as produced by `magnitude_to_decibel`; no normalization
    is applied to them.

    Arguments:
        wav (np.ndarray):
            Audio time series.
            The shape is expected to be shape=(n,).

        sampling_rate (int):
            Sampling rate used in the calculation of `wav`.
            It is also used to derive the upper mel frequency bound (`sampling_rate // 2`).

        n_fft (int):
            FFT size forwarded to both spectrogram helpers.
            Default is 1024.

        n_mels (int):
            Value forwarded as `n_mels` to `mel_scale_spectrogram`.
            Default is 80.

        hop_length (int, optional):
            Hop length forwarded to both spectrogram helpers.
            Defaults to `n_fft // 4` when omitted.

        win_length (int, optional):
            Window length forwarded to both spectrogram helpers.
            Defaults to `n_fft` when omitted.

    Returns:
        np.ndarray:
            Min and max values of the decibel representations.
            Calculation: np.array[min(linear_db), max(linear_db), min(mel_db), max(mel_db)]
    """
    # Resolve the derived defaults here so that callers overriding only `n_fft` still get a
    # consistent hop and window size.
    if hop_length is None:
        hop_length = n_fft // 4
    if win_length is None:
        win_length = n_fft

    # Get the linear scale spectrogram.
    linear_spec = linear_scale_spectrogram(wav,
                                           n_fft=n_fft,
                                           hop_length=hop_length,
                                           win_length=win_length)

    # Get the mel scale spectrogram over the full band [0, sampling_rate // 2].
    # NOTE(review): `power=1` presumably selects a magnitude (not power) spectrogram; confirm
    # against `mel_scale_spectrogram`.
    mel_spec = mel_scale_spectrogram(wav,
                                     n_fft=n_fft,
                                     sampling_rate=sampling_rate,
                                     n_mels=n_mels,
                                     fmin=0,
                                     fmax=sampling_rate // 2,
                                     hop_length=hop_length,
                                     win_length=win_length,
                                     power=1)

    # Convert the linear spectrogram into decibel representation.
    linear_mag_db = magnitude_to_decibel(np.abs(linear_spec))

    # Convert the mel spectrogram into decibel representation.
    mel_mag_db = magnitude_to_decibel(np.abs(mel_spec))

    return np.array([
        np.min(linear_mag_db),
        np.max(linear_mag_db),
        np.min(mel_mag_db),
        np.max(mel_mag_db)
    ])
def load_audio(file_path):
    """
    Load a wav file and turn it into normalized decibel mel and linear spectrograms.

    Arguments:
        file_path (bytes):
            Path of the audio file. `.decode()` is called on it before it is handed to
            `load_wav`, so an encoded (bytes-like) path is expected.

    Returns:
        (np.ndarray, np.ndarray):
            Tuple `(mel_mag_db, linear_mag_db)`, both cast to `np.float32`.
            Before the optional reduction padding is applied the shapes are
            (T_spec, n_mels) and (T_spec, 1 + n_fft // 2) respectively.
    """
    target_rate = model_params.sampling_rate

    # Analysis window length and window hop, both converted from milliseconds to samples.
    window_samples = ms_to_samples(model_params.win_len, target_rate)
    hop_samples = ms_to_samples(model_params.win_hop, target_rate)

    # Read the waveform from disk.
    wav, sr = load_wav(file_path.decode())

    # TODO: Determine a better silence reference level for the CMU_ARCTIC dataset (See: #9).
    # Strip leading and trailing silence so the network does not have to learn some random
    # initial silence delay after which it is allowed to speak.
    wav, _trim_interval = librosa.effects.trim(wav)

    # Both spectrograms are transposed to be time-major, i.e. (T_spec, 1 + n_fft // 2) and
    # (T_spec, n_mels), so that e.g. dense layers are applied to each frame automatically.
    linear_frames = linear_scale_spectrogram(
        wav, model_params.n_fft, hop_samples, window_samples
    ).T
    mel_frames = mel_scale_spectrogram(
        wav,
        model_params.n_fft,
        sr,
        model_params.n_mels,
        model_params.mel_fmin,
        model_params.mel_fmax,
        hop_samples,
        window_samples,
        1
    ).T

    # Linear spectrogram: magnitude -> decibel -> normalized decibel.
    # => linear_db.shape = (T_spec, 1 + n_fft // 2)
    linear_db = normalize_decibel(
        magnitude_to_decibel(np.abs(linear_frames)),
        CMUDatasetHelper.linear_ref_db,
        CMUDatasetHelper.linear_mag_max_db
    )

    # Mel spectrogram: magnitude -> decibel -> normalized decibel.
    # => mel_db.shape = (T_spec, n_mels)
    mel_db = normalize_decibel(
        magnitude_to_decibel(np.abs(mel_frames)),
        CMUDatasetHelper.mel_mag_ref_db,
        CMUDatasetHelper.mel_mag_max_db
    )

    # Tacotron reduction factor: only pad when more than one frame is grouped per step.
    reduction = model_params.reduction
    if reduction > 1:
        mel_db, linear_db = DatasetHelper.apply_reduction_padding(mel_db, linear_db, reduction)

    mel_out = np.array(mel_db).astype(np.float32)
    linear_out = np.array(linear_db).astype(np.float32)
    return mel_out, linear_out
def plot_liner_mel_spec_comparasion():
    """
    Render a linear scale and a mel scale decibel spectrogram of one fixed utterance.

    The wav file is read from a hard-coded path and both spectrograms are cropped to the
    frames between `0.20 * sr / hop_len` and `1.85 * sr / hop_len`. The resulting figures are
    written to `/tmp/linear_spectrogram_raw_mag_db.pdf` and
    `/tmp/mel_spectrogram_raw_mag_db.pdf`.

    Returns:
        None
    """
    source_wav = '/thesis/datasets/blizzard_nancy/wav/RURAL-02198.wav'
    fft_size = 1024

    wav, sr = load_wav(source_wav)

    # Window length (50 ms) and window hop (12.5 ms) expressed in samples.
    win_len = ms_to_samples(50.0, sampling_rate=sr)
    hop_len = ms_to_samples(12.5, sampling_rate=sr)

    # Spectrograms are transposed so that the first axis can be sliced by frame below.
    linear_spec = linear_scale_spectrogram(wav, fft_size, hop_len, win_len).T
    mel_spec = mel_scale_spectrogram(wav,
                                     n_fft=fft_size,
                                     sampling_rate=sr,
                                     n_mels=80,
                                     fmin=0,
                                     fmax=sr // 2,
                                     hop_length=hop_len,
                                     win_length=win_len,
                                     power=1).T

    # Convert both magnitude spectrograms into their decibel representation.
    linear_db = magnitude_to_decibel(np.abs(linear_spec))
    mel_db = magnitude_to_decibel(np.abs(mel_spec))

    # Global matplotlib settings: serif LaTeX text rendering.
    rc('font', family='serif', serif=['Computer Modern'], size=13)
    rc('text', usetex=True)

    # Frame range shared by both plots.
    first_frame = int((0.20 * sr) / hop_len)
    last_frame = int((1.85 * sr) / hop_len)

    # Figure dimensions; the divisions by 2.54 presumably convert centimeters to inches.
    base_width = 14.0 / 2.54
    fig_height = 7.7 / 2.54
    upper_freq = sr // 2.0

    def _format_linear_tick(x, pos):
        # Tick label is the tick value divided by 1000, without decimals.
        return '{:.0f}'.format(x / 1000.0)

    def _format_mel_tick(x, pos):
        # Non-zero values below 1000 keep one decimal; everything else is shown as an
        # integer (floored for values of 1000 and above).
        if x == 0.0:
            return '{:.0f}'.format(x / 1000.0)
        if x < 1000:
            return '{:.1f}'.format(x / 1000.0)
        return '{:.0f}'.format(math.floor(x / 1000.0))

    # Linear scale spectrogram.
    linear_db = linear_db[first_frame:last_frame, :]
    fig = plot_spectrogram(linear_db.T, sr, hop_len, 0.0, upper_freq, 'linear',
                           figsize=((1.0 / 1.35) * base_width, fig_height),
                           _formater=ticker.FuncFormatter(_format_linear_tick))
    fig.savefig("/tmp/linear_spectrogram_raw_mag_db.pdf", bbox_inches='tight')

    # Mel scale spectrogram.
    mel_db = mel_db[first_frame:last_frame, :]
    fig = plot_spectrogram(mel_db.T, sr, hop_len, 0.0, upper_freq, 'mel',
                           figsize=((1.025 / 1.35) * base_width, fig_height),
                           _formater=ticker.FuncFormatter(_format_mel_tick))
    fig.savefig("/tmp/mel_spectrogram_raw_mag_db.pdf", bbox_inches='tight')