Example #1
0
def collect_decibel_statistics(path_listing):
    """
    Average the per-file decibel statistics over a list of wav files.

    Every file is loaded with `load_wav` and handed to `decibel_statistics`.
    The per-file results are summed element-wise and the sum is divided by
    the number of files. Note that an empty listing therefore divides the
    zero-initialized sum by zero.

    Arguments:
        path_listing (list):
            List of wav file paths.

    Returns:
        np.ndarray:
            The four averaged values, in the order
            (avg(linear_min_db), avg(linear_max_db), avg(mel_min_db), avg(mel_max_db)).
    """
    # Running element-wise sum, laid out as (min_linear, max_linear, min_mel, max_mel).
    totals = np.zeros(4)

    for wav_path in path_listing:
        samples, rate = load_wav(wav_path)
        totals += decibel_statistics(samples, rate)

    # Turn the accumulated sums into averages.
    totals /= len(path_listing)

    return totals
Example #2
0
    def load_audio(file_path):
        """
        Load a wav file and derive its normalized decibel mel and linear spectrograms.

        Arguments:
            file_path (bytes):
                Path of the wav file. It is passed through `.decode()` before loading,
                so a bytes-like path is expected.

        Returns:
            tuple:
                `(mel_mag_db, linear_mag_db)`, both converted to `np.float32` arrays.
                Before the optional reduction padding the shapes are (T_spec, n_mels) and
                (T_spec, 1 + n_fft // 2) respectively.
        """
        configured_rate = model_params.sampling_rate
        # Window length and window hop, converted into audio samples.
        frame_len = ms_to_samples(model_params.win_len, configured_rate)
        frame_hop = ms_to_samples(model_params.win_hop, configured_rate)

        # Read the audio data from disk.
        signal, signal_rate = load_wav(file_path.decode())

        # TODO: Determine a better silence reference level for the CMU_ARCTIC dataset (See: #9).
        # Strip leading and trailing silence so the network does not have to learn an arbitrary
        # initial silence delay before it is allowed to speak.
        signal, _ = librosa.effects.trim(signal)

        # Both spectrograms are transposed so that time is the leading axis; dense layers, for
        # example, are then applied to each frame automatically.
        # => linear_frames.shape = (T_spec, 1 + n_fft // 2)
        linear_frames = linear_scale_spectrogram(
            signal, model_params.n_fft, frame_hop, frame_len).T

        # => mel_frames.shape = (T_spec, n_mels)
        mel_frames = mel_scale_spectrogram(
            signal, model_params.n_fft, signal_rate,
            model_params.n_mels,
            model_params.mel_fmin,
            model_params.mel_fmax,
            frame_hop, frame_len, 1).T

        # Linear spectrogram: magnitude -> decibel -> normalized decibel.
        linear_db = normalize_decibel(
            magnitude_to_decibel(np.abs(linear_frames)),
            CMUDatasetHelper.linear_ref_db,
            CMUDatasetHelper.linear_mag_max_db)

        # Mel spectrogram: magnitude -> decibel -> normalized decibel.
        mel_db = normalize_decibel(
            magnitude_to_decibel(np.abs(mel_frames)),
            CMUDatasetHelper.mel_mag_ref_db,
            CMUDatasetHelper.mel_mag_max_db)

        # Apply the Tacotron reduction factor padding when a reduction is configured.
        if model_params.reduction > 1:
            mel_db, linear_db = DatasetHelper.apply_reduction_padding(
                mel_db, linear_db, model_params.reduction)

        return (np.array(mel_db).astype(np.float32),
                np.array(linear_db).astype(np.float32))
Example #3
0
def collect_duration_statistics(dataset_name, path_listing):
    """
    Print duration statistics for a list of wav files and plot their distribution.

    The sum, average, minimum and maximum duration are printed, a histogram of the
    individual durations is shown and written to "/tmp/durations.pdf", and the raw
    durations are dumped to "/tmp/durations.csv".

    Arguments:
        dataset_name (str):
            Name of the dataset. Currently unused; it is only referenced by the
            disabled plot title below.

        path_listing (list):
            List of wav file paths. If it is empty, a notice is printed and the
            function returns without plotting or writing any file.
    """
    print("Collecting duration statistics for {} files ...".format(
        len(path_listing)))

    durations = []
    for path in path_listing:
        # Load the audio file.
        wav, sampling_rate = load_wav(path)
        # Collect the duration in seconds.
        durations.append(get_duration(wav, sampling_rate))

    # Without any file there is nothing to average (division by zero) and
    # min() / max() are undefined, so stop early instead of crashing.
    if not durations:
        print("No files given, skipping duration statistics.")
        return

    durations_sum = sum(durations)
    durations_avg = durations_sum / len(durations)
    durations_min = min(durations)
    durations_max = max(durations)

    print("durations_sum: {} sec.".format(durations_sum))
    print("durations_avg: {} sec.".format(durations_avg))
    print("durations_min: {} sec.".format(durations_min))
    print("durations_max: {} sec.".format(durations_max))

    from matplotlib import rc
    rc('font', **{'family': 'serif', 'serif': ['Computer Modern'], 'size': 13})
    rc('text', usetex=True)

    # Create a histogram of the individual file durations.
    fig = plt.figure(figsize=(1.5 * 14.0 / 2.54, 7.7 / 2.54), dpi=100)
    # The former `normed=False` argument is omitted on purpose: the keyword was removed in
    # Matplotlib 3.1 and `False` (plain counts) is the default behavior anyway.
    plt.hist(durations, bins=100, color="#6C8EBF")
    plt.grid(linestyle='dashed')
    plt.xlim([0, 21])
    # plt.title('"{}" file duration distribution'.format(dataset_name))
    plt.xlabel("Duration (seconds)")
    plt.ylabel("Count")
    plt.show()

    # DEBUG: Dump plot into a pdf file.
    fig.savefig("/tmp/durations.pdf", bbox_inches='tight')

    # DEBUG: Dump statistics into a csv file.
    np.savetxt("/tmp/durations.csv",
               durations,
               delimiter=",",
               fmt='%s',
               header="duration")
Example #4
0
def collect_reconstruction_error(path_listing, n_iters):
    """
    Compute the mean reconstruction error of `griffin_lim_v2` over a list of wav files.

    For every file the linear scale magnitude spectrogram is computed (n_fft of 2048,
    50 ms window, 12.5 ms hop) and passed to `griffin_lim_v2`; the error value it
    returns is collected. The collected errors are averaged, printed and returned.

    Arguments:
        path_listing (list):
            List of wav file paths.

        n_iters (int):
            Iteration count forwarded to `griffin_lim_v2` as `n_iter`.

    Returns:
        The sum of the per-file errors divided by the number of files.
    """
    fft_size = 2048
    # Window length and window stride, both in ms.
    window_ms = 50.0
    hop_ms = 12.5

    def _file_mse(wav_path):
        # Reconstruction error of a single wav file.
        samples, rate = load_wav(wav_path)

        window = ms_to_samples(window_ms, sampling_rate=rate)
        hop = ms_to_samples(hop_ms, sampling_rate=rate)

        magnitude = np.abs(
            linear_scale_spectrogram(samples,
                                     win_length=window,
                                     hop_length=hop,
                                     n_fft=fft_size))

        _, error = griffin_lim_v2(spectrogram=magnitude,
                                  win_length=window,
                                  hop_length=hop,
                                  n_fft=fft_size,
                                  n_iter=n_iters)
        return error

    print("Collecting reconstruction statistics for {} files ...".format(
        len(path_listing)))

    # Collect the mean-squared error of every file.
    mse_errors = [_file_mse(wav_path) for wav_path in path_listing]

    total_mse = sum(mse_errors) / len(mse_errors)
    print('Dataset MSE with {} iterations: {}'.format(n_iters, total_mse))

    return total_mse
Example #5
0
def plot_liner_mel_spec_comparasion():
    """
    Plot the linear and the mel scale decibel spectrogram of one hard-coded wav file.

    Both spectrograms are cut to the same frame range (derived from 0.20 s and 1.85 s),
    plotted with `plot_spectrogram` and saved to
    "/tmp/linear_spectrogram_raw_mag_db.pdf" and "/tmp/mel_spectrogram_raw_mag_db.pdf".
    """
    wav_path = '/thesis/datasets/blizzard_nancy/wav/RURAL-02198.wav'
    n_fft = 1024

    wav, sr = load_wav(wav_path)
    # Window length (50 ms) and window hop (12.5 ms) in samples.
    win_len = ms_to_samples(50.0, sampling_rate=sr)
    hop_len = ms_to_samples(12.5, sampling_rate=sr)

    # Both spectrograms are transposed so that the frame axis comes first.
    linear_spec = linear_scale_spectrogram(wav, n_fft, hop_len, win_len).T
    mel_spec = mel_scale_spectrogram(wav,
                                     n_fft=n_fft,
                                     sampling_rate=sr,
                                     n_mels=80,
                                     fmin=0,
                                     fmax=sr // 2,
                                     hop_length=hop_len,
                                     win_length=win_len,
                                     power=1).T

    # Convert both magnitude spectrograms into their decibel representation.
    linear_mag_db = magnitude_to_decibel(np.abs(linear_spec))
    mel_mag_db = magnitude_to_decibel(np.abs(mel_spec))

    rc('font', **{'family': 'serif', 'serif': ['Computer Modern'], 'size': 13})
    rc('text', usetex=True)

    def _linear_tick(value, pos):
        # Tick label: the value divided by 1000, without decimals.
        return '{:.0f}'.format(value / 1000.0)

    def _mel_tick(value, pos):
        # Same scaling as the linear labels, but non-zero values below 1000 keep one
        # decimal and larger values are floored to an integer.
        if value == 0.0:
            return '{:.0f}'.format(value / 1000.0)
        if value < 1000:
            return '{:.1f}'.format(value / 1000.0)
        return '{:.0f}'.format(math.floor(value / 1000.0))

    y_formater = ticker.FuncFormatter(_linear_tick)

    # Frame range shared by both plots.
    first_frame = int((0.20 * sr) / hop_len)
    last_frame = int((1.85 * sr) / hop_len)

    # Figure dimensions; the widths are scaled per plot below.
    base_width = 14.0 / 2.54
    fig_height = 7.7 / 2.54

    linear_mag_db = linear_mag_db[first_frame:last_frame, :]
    fig = plot_spectrogram(linear_mag_db.T,
                           sr,
                           hop_len,
                           0.0,
                           sr // 2.0,
                           'linear',
                           figsize=((1.0 / 1.35) * base_width, fig_height),
                           _formater=y_formater)
    fig.savefig("/tmp/linear_spectrogram_raw_mag_db.pdf", bbox_inches='tight')

    y_formater = ticker.FuncFormatter(_mel_tick)

    mel_mag_db = mel_mag_db[first_frame:last_frame, :]
    fig = plot_spectrogram(mel_mag_db.T,
                           sr,
                           hop_len,
                           0.0,
                           sr // 2.0,
                           'mel',
                           figsize=((1.025 / 1.35) * base_width, fig_height),
                           _formater=y_formater)
    fig.savefig("/tmp/mel_spectrogram_raw_mag_db.pdf", bbox_inches='tight')