Exemple #1
0
def decibel_statistics(wav, sampling_rate):
    """
    Report the decibel range of a waveform's linear scale and mel scale
    magnitude spectrograms.

    Both spectrograms are computed with a fixed analysis setup
    (n_fft=1024, hop_length=n_fft // 4, win_length=n_fft); the mel
    spectrogram uses 80 mel bands between 0 and `sampling_rate // 2`.

    Arguments:
        wav (np.ndarray):
            Audio time series.
            The shape is expected to be shape=(n,).

        sampling_rate (int):
            Sampling rate of `wav`.

    Returns:
        np.ndarray:
            Four values in the order
            [min(linear_db), max(linear_db), min(mel_db), max(mel_db)].
    """
    fft_size = 1024

    # Analysis settings shared by both spectrogram computations.
    stft_params = {
        'n_fft': fft_size,
        'hop_length': fft_size // 4,
        'win_length': fft_size,
    }

    # Linear scale spectrogram first, mel scale spectrogram second.
    linear = linear_scale_spectrogram(wav, **stft_params)
    mel = mel_scale_spectrogram(wav,
                                sampling_rate=sampling_rate,
                                n_mels=80,
                                fmin=0,
                                fmax=sampling_rate // 2,
                                power=1,
                                **stft_params)

    # Take the magnitudes and express them in decibel.
    linear_db = magnitude_to_decibel(np.abs(linear))
    mel_db = magnitude_to_decibel(np.abs(mel))

    # Collect (min, max) per representation, linear before mel.
    return np.array([
        extreme(decibels)
        for decibels in (linear_db, mel_db)
        for extreme in (np.min, np.max)
    ])
    def load_audio(file_path):
        """
        Load one audio file and derive normalized decibel mel and linear
        magnitude spectrograms from it.

        Arguments:
            file_path (bytes):
                Encoded path of the audio file; it is decoded with `.decode()`
                before being handed to `load_wav`.

        Returns:
            (np.ndarray, np.ndarray):
                The tuple `(mel_mag_db, linear_mag_db)`, both cast to `np.float32`.
                Before any reduction padding the frames are on the first axis:
                (T_spec, n_mels) and (T_spec, 1 + n_fft // 2).
        """
        # Window length and window hop, converted to audio samples.
        samples_per_window = ms_to_samples(model_params.win_len,
                                           model_params.sampling_rate)
        samples_per_hop = ms_to_samples(model_params.win_hop,
                                        model_params.sampling_rate)

        # Read the audio file from disk.
        audio, sample_rate = load_wav(file_path.decode())

        # TODO: Determine a better silence reference level for the CMU_ARCTIC dataset (See: #9).
        # Leading and trailing silence is trimmed so the network does not have to learn some
        # random initial silence delay after which it is allowed to speak.
        audio, _ = librosa.effects.trim(audio)

        # Both spectrograms are transposed so that time is the first axis, i.e.
        # (T_spec, 1 + n_fft // 2) and (T_spec, n_mels); dense layers for example are then
        # applied to each frame automatically.
        linear_frames = linear_scale_spectrogram(
            audio, model_params.n_fft, samples_per_hop, samples_per_window).T
        mel_frames = mel_scale_spectrogram(
            audio, model_params.n_fft, sample_rate, model_params.n_mels,
            model_params.mel_fmin, model_params.mel_fmax,
            samples_per_hop, samples_per_window, 1).T

        # Magnitude -> decibel -> normalized decibel for the linear spectrogram.
        # => linear_features.shape = (T_spec, 1 + n_fft // 2)
        linear_features = normalize_decibel(
            magnitude_to_decibel(np.abs(linear_frames)),
            CMUDatasetHelper.linear_ref_db,
            CMUDatasetHelper.linear_mag_max_db)

        # Magnitude -> decibel -> normalized decibel for the mel spectrogram.
        # => mel_features.shape = (T_spec, n_mels)
        mel_features = normalize_decibel(
            magnitude_to_decibel(np.abs(mel_frames)),
            CMUDatasetHelper.mel_mag_ref_db,
            CMUDatasetHelper.mel_mag_max_db)

        # Tacotron reduction factor: only pad when more than one frame is grouped.
        if model_params.reduction > 1:
            mel_features, linear_features = DatasetHelper.apply_reduction_padding(
                mel_features, linear_features, model_params.reduction)

        # Mel features first, linear features second, both as float32 arrays.
        return tuple(
            np.array(features).astype(np.float32)
            for features in (mel_features, linear_features)
        )
Exemple #3
0
def plot_liner_mel_spec_comparasion():
    """
    Plot the raw decibel linear scale and mel scale magnitude spectrograms of one
    hard-coded utterance and save each figure as a PDF below `/tmp`.

    Only a window of frames is plotted: from frame `int(0.20 * sr / hop_len)` up to
    (excluding) frame `int(1.85 * sr / hop_len)`. The same window is used for both plots.
    Y-axis tick labels show the tick value divided by 1000.
    """
    source_wav = '/thesis/datasets/blizzard_nancy/wav/RURAL-02198.wav'
    fft_size = 1024

    wav, sr = load_wav(source_wav)

    # 50 ms windows with a 12.5 ms hop, expressed in samples.
    win_len = ms_to_samples(50.0, sampling_rate=sr)
    hop_len = ms_to_samples(12.5, sampling_rate=sr)

    # Both spectrograms are transposed so frames lie on the first axis.
    linear_frames = linear_scale_spectrogram(wav, fft_size, hop_len, win_len).T
    mel_frames = mel_scale_spectrogram(wav,
                                       n_fft=fft_size,
                                       sampling_rate=sr,
                                       n_mels=80,
                                       fmin=0,
                                       fmax=sr // 2,
                                       hop_length=hop_len,
                                       win_length=win_len,
                                       power=1).T

    # Magnitudes in decibel representation, linear first and mel second.
    linear_db = magnitude_to_decibel(np.abs(linear_frames))
    mel_db = magnitude_to_decibel(np.abs(mel_frames))

    # Typeset the figures with LaTeX using a serif font.
    rc('font', **{'family': 'serif', 'serif': ['Computer Modern'], 'size': 13})
    rc('text', usetex=True)

    # Frame window shared by both plots.
    frame_window = slice(int((0.20 * sr) / hop_len), int((1.85 * sr) / hop_len))

    # Figure geometry; the operand grouping is kept so the float results do not change.
    base_width = 14.0 / 2.54
    fig_height = 7.7 / 2.54

    def _linear_tick_label(x, pos):
        # Tick value divided by 1000, without decimals.
        return '{:.0f}'.format(x / 1000.0)

    linear_fig = plot_spectrogram(linear_db[frame_window, :].T,
                                  sr,
                                  hop_len,
                                  0.0,
                                  sr // 2.0,
                                  'linear',
                                  figsize=((1.0 / 1.35) * base_width, fig_height),
                                  _formater=ticker.FuncFormatter(_linear_tick_label))
    linear_fig.savefig("/tmp/linear_spectrogram_raw_mag_db.pdf", bbox_inches='tight')

    def _mel_tick_label(x, pos):
        # Ticks below 1000 (other than zero) keep one decimal; everything else is shown
        # as a whole number, with values of 1000 and above floored.
        if x == 0.0:
            return '{:.0f}'.format(x / 1000.0)
        if x < 1000:
            return '{:.1f}'.format(x / 1000.0)
        return '{:.0f}'.format(math.floor(x / 1000.0))

    mel_fig = plot_spectrogram(mel_db[frame_window, :].T,
                               sr,
                               hop_len,
                               0.0,
                               sr // 2.0,
                               'mel',
                               figsize=((1.025 / 1.35) * base_width, fig_height),
                               _formater=ticker.FuncFormatter(_mel_tick_label))
    mel_fig.savefig("/tmp/mel_spectrogram_raw_mag_db.pdf", bbox_inches='tight')