def collect_decibel_statistics(path_listing):
    """
    Calculate the average (min, max) decibel values of both the linear scale magnitude
    spectrograms and the mel scale magnitude spectrograms of a list of wav files.

    Arguments:
        path_listing (list):
            List of wav file paths. Must contain at least one path.

    Returns:
        np.ndarray:
            Average min and max values of the decibel representations.
            Calculation: (avg(linear_min_db), avg(linear_max_db),
                          avg(mel_min_db), avg(mel_max_db)).

    Raises:
        ValueError:
            If `path_listing` is empty. Averaging over zero files is undefined and would
            otherwise silently produce an all-NaN result.
    """
    n_files = len(path_listing)
    if n_files == 0:
        raise ValueError(
            'Cannot collect decibel statistics: `path_listing` is empty.')

    # Accumulator layout: (min_linear, max_linear, min_mel, max_mel)
    stats = np.zeros(4)

    # Accumulate the per-file min and max values.
    for path in path_listing:
        wav, sampling_rate = load_wav(path)
        stats += decibel_statistics(wav, sampling_rate)

    # Turn the accumulated sums into averages.
    stats /= n_files

    return stats
def load_audio(file_path):
    """
    Load a wav file and derive its normalized mel and linear scale decibel spectrograms.

    Arguments:
        file_path (bytes):
            Path of the wav file. It is decoded with `.decode()` before being loaded.

    Returns:
        (np.ndarray, np.ndarray):
            Tuple `(mel_mag_db, linear_mag_db)`, both cast to `np.float32`.
            Before the optional reduction padding the spectrograms are laid out as
            (T_spec, n_mels) and (T_spec, 1 + n_fft // 2) respectively.
    """
    # Window length and window hop, converted from the model parameters into audio samples.
    frame_len = ms_to_samples(model_params.win_len, model_params.sampling_rate)
    frame_hop = ms_to_samples(model_params.win_hop, model_params.sampling_rate)

    # Load the actual audio file.
    wav, sr = load_wav(file_path.decode())

    # TODO: Determine a better silence reference level for the CMU_ARCTIC dataset (See: #9).
    # Strip silence at both ends of the wav so the network does not have to learn some
    # random initial silence delay after which it is allowed to speak.
    wav, _ = librosa.effects.trim(wav)

    # Both spectrograms are transposed so that frames come first; dense layers, for example,
    # are then applied to each frame automatically.
    linear_spec = linear_scale_spectrogram(wav,
                                           model_params.n_fft,
                                           frame_hop,
                                           frame_len).T

    mel_spec = mel_scale_spectrogram(wav,
                                     model_params.n_fft,
                                     sr,
                                     model_params.n_mels,
                                     model_params.mel_fmin,
                                     model_params.mel_fmax,
                                     frame_hop,
                                     frame_len,
                                     1).T

    # Linear spectrogram: magnitude -> decibel -> normalized decibel.
    # => linear_db.shape = (T_spec, 1 + n_fft // 2)
    linear_db = normalize_decibel(magnitude_to_decibel(np.abs(linear_spec)),
                                  CMUDatasetHelper.linear_ref_db,
                                  CMUDatasetHelper.linear_mag_max_db)

    # Mel spectrogram: magnitude -> decibel -> normalized decibel.
    # => mel_db.shape = (T_spec, n_mels)
    mel_db = normalize_decibel(magnitude_to_decibel(np.abs(mel_spec)),
                               CMUDatasetHelper.mel_mag_ref_db,
                               CMUDatasetHelper.mel_mag_max_db)

    # Tacotron reduction factor: only pad when a reduction is actually configured.
    if model_params.reduction > 1:
        mel_db, linear_db = DatasetHelper.apply_reduction_padding(
            mel_db, linear_db, model_params.reduction)

    return tuple(np.array(spec).astype(np.float32) for spec in (mel_db, linear_db))
def collect_duration_statistics(dataset_name, path_listing):
    """
    Print duration statistics for a list of wav files and plot a histogram of the durations.

    The sum, average, minimum and maximum duration are printed to stdout. The histogram is
    shown interactively and then written to "/tmp/durations.pdf"; the individual durations
    are written to "/tmp/durations.csv".

    Arguments:
        dataset_name (str):
            Name of the dataset. Currently only referenced by the disabled plot title.

        path_listing (list):
            List of wav file paths. Must contain at least one path.

    Raises:
        ValueError:
            If `path_listing` is empty, since no statistics can be derived from zero files.
    """
    if len(path_listing) == 0:
        raise ValueError(
            'Cannot collect duration statistics: `path_listing` is empty.')

    print("Collecting duration statistics for {} files ...".format(
        len(path_listing)))

    # Duration in seconds of every audio file.
    durations = [get_duration(*load_wav(path)) for path in path_listing]

    durations_sum = sum(durations)
    durations_avg = durations_sum / len(durations)
    durations_min = min(durations)
    durations_max = max(durations)

    print("durations_sum: {} sec.".format(durations_sum))
    print("durations_avg: {} sec.".format(durations_avg))
    print("durations_min: {} sec.".format(durations_min))
    print("durations_max: {} sec.".format(durations_max))

    from matplotlib import rc

    rc('font', **{'family': 'serif',
                  'serif': ['Computer Modern'],
                  'size': 13})
    rc('text', usetex=True)

    # Create a histogram of the individual file durations.
    # The `normed` keyword was removed from `hist` in Matplotlib 3.1. The histogram shows raw
    # counts, which is the default, so no replacement keyword is required.
    fig = plt.figure(figsize=(1.5 * 14.0 / 2.54, 7.7 / 2.54), dpi=100)
    plt.hist(durations, bins=100, color="#6C8EBF")
    plt.grid(linestyle='dashed')
    plt.xlim([0, 21])
    # plt.title('"{}" file duration distribution'.format(dataset_name))
    plt.xlabel("Duration (seconds)")
    plt.ylabel("Count")

    plt.show()

    # DEBUG: Dump plot into a pdf file.
    fig.savefig("/tmp/durations.pdf", bbox_inches='tight')

    # DEBUG: Dump statistics into a csv file.
    np.savetxt("/tmp/durations.csv", durations, delimiter=",", fmt='%s', header="duration")
def collect_reconstruction_error(path_listing, n_iters):
    """
    Average the Griffin-Lim reconstruction error over a list of wav files.

    For every file the linear scale magnitude spectrogram is computed and handed to
    `griffin_lim_v2`; the second value it returns is collected as the file's mean-squared
    error. The average over all files is printed and returned.

    Arguments:
        path_listing (list):
            List of wav file paths.

        n_iters (int):
            Number of iterations passed to `griffin_lim_v2` as `n_iter`.

    Returns:
        Average of the collected per-file mean-squared errors.
    """
    # Fixed analysis settings: FFT size, window length (ms) and window stride (ms).
    fft_size = 2048
    window_ms = 50.0
    stride_ms = 12.5

    def _file_mse(wav_path):
        # Reconstruct a single file and report its mean-squared error.
        wav, sampling_rate = load_wav(wav_path)

        window_samples = ms_to_samples(window_ms, sampling_rate=sampling_rate)
        stride_samples = ms_to_samples(stride_ms, sampling_rate=sampling_rate)

        magnitude = np.abs(linear_scale_spectrogram(wav,
                                                    win_length=window_samples,
                                                    hop_length=stride_samples,
                                                    n_fft=fft_size))

        _, mse = griffin_lim_v2(spectrogram=magnitude,
                                win_length=window_samples,
                                hop_length=stride_samples,
                                n_fft=fft_size,
                                n_iter=n_iters)
        return mse

    print("Collecting reconstruction statistics for {} files ...".format(
        len(path_listing)))

    # Collect the mean-squared error of every file.
    mse_errors = [_file_mse(path) for path in path_listing]

    total_mse = sum(mse_errors) / len(mse_errors)
    print('Dataset MSE with {} iterations: {}'.format(n_iters, total_mse))

    return total_mse
def plot_liner_mel_spec_comparasion():
    """
    Plot the linear and the mel scale decibel spectrogram of one hard-coded wav file.

    Both spectrograms are cropped to the frames between 0.20 and 1.85 seconds, plotted with
    `plot_spectrogram` and written to "/tmp/linear_spectrogram_raw_mag_db.pdf" and
    "/tmp/mel_spectrogram_raw_mag_db.pdf".
    """
    wav_path = '/thesis/datasets/blizzard_nancy/wav/RURAL-02198.wav'
    n_fft = 1024
    ms_win_len = 50.0
    ms_win_hop = 12.5

    wav, sr = load_wav(wav_path)
    win_len = ms_to_samples(ms_win_len, sampling_rate=sr)
    hop_len = ms_to_samples(ms_win_hop, sampling_rate=sr)

    # Spectrograms are transposed so that the first axis indexes frames.
    linear_spec = linear_scale_spectrogram(wav, n_fft, hop_len, win_len).T
    mel_spec = mel_scale_spectrogram(wav,
                                     n_fft=n_fft,
                                     sampling_rate=sr,
                                     n_mels=80,
                                     fmin=0,
                                     fmax=sr // 2,
                                     hop_length=hop_len,
                                     win_length=win_len,
                                     power=1).T

    # Convert both spectrograms into their decibel representation.
    linear_mag_db = magnitude_to_decibel(np.abs(linear_spec))
    mel_mag_db = magnitude_to_decibel(np.abs(mel_spec))

    rc('font', **{'family': 'serif',
                  'serif': ['Computer Modern'],
                  'size': 13})
    rc('text', usetex=True)

    # Frame range shared by both plots: 0.20 s up to 1.85 s of audio.
    frames = slice(int((0.20 * sr) / hop_len), int((1.85 * sr) / hop_len))

    # Figure geometry shared by both plots.
    base_width = 14.0 / 2.54
    fig_height = 7.7 / 2.54

    def _linear_tick_label(x, pos):
        # Tick value divided by 1000 and shown without decimals.
        return '{:.0f}'.format(x / 1000.0)

    def _mel_tick_label(x, pos):
        # Same scaling as for the linear plot, but ticks below 1000 keep one decimal and
        # larger ticks are floored instead of rounded.
        scaled = x / 1000.0
        if x == 0.0:
            return '{:.0f}'.format(scaled)
        if x < 1000:
            return '{:.1f}'.format(scaled)
        return '{:.0f}'.format(math.floor(scaled))

    # Linear scale spectrogram.
    linear_mag_db = linear_mag_db[frames, :]
    fig = plot_spectrogram(linear_mag_db.T, sr, hop_len, 0.0, sr // 2.0, 'linear',
                           figsize=((1.0 / 1.35) * base_width, fig_height),
                           _formater=ticker.FuncFormatter(_linear_tick_label))
    fig.savefig("/tmp/linear_spectrogram_raw_mag_db.pdf", bbox_inches='tight')

    # Mel scale spectrogram.
    mel_mag_db = mel_mag_db[frames, :]
    fig = plot_spectrogram(mel_mag_db.T, sr, hop_len, 0.0, sr // 2.0, 'mel',
                           figsize=((1.025 / 1.35) * base_width, fig_height),
                           _formater=ticker.FuncFormatter(_mel_tick_label))
    fig.savefig("/tmp/mel_spectrogram_raw_mag_db.pdf", bbox_inches='tight')