Example #1
def test_vqt(
    y_cqt,
    sr_cqt,
    hop_length,
    fmin,
    n_bins,
    gamma,
    bins_per_octave,
    tuning,
    filter_scale,
    norm,
    res_type,
    sparsity,
):

    C = librosa.vqt(
        y=y_cqt,
        sr=sr_cqt,
        hop_length=hop_length,
        fmin=fmin,
        n_bins=n_bins,
        gamma=gamma,
        bins_per_octave=bins_per_octave,
        tuning=tuning,
        filter_scale=filter_scale,
        norm=norm,
        sparsity=sparsity,
        res_type=res_type,
    )

    # type is complex
    assert np.iscomplexobj(C)

    # number of bins is correct
    assert C.shape[0] == n_bins
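The test above receives its inputs from pytest fixtures that are not shown here. A minimal standalone sketch of the same checks, assuming a synthetic 110 Hz tone and illustrative parameter values (these are assumptions, not the fixture values from librosa's test suite):

import numpy as np
import librosa

sr = 22050
y = librosa.tone(110, sr=sr, duration=2.0)   # synthetic test signal (assumption)

C = librosa.vqt(
    y=y,
    sr=sr,
    hop_length=512,
    fmin=librosa.note_to_hz("C1"),
    n_bins=84,
    gamma=0,            # gamma=0 reduces the VQT to a CQT
    bins_per_octave=12,
)

assert np.iscomplexobj(C)    # VQT output is complex-valued
assert C.shape[0] == 84      # one row per requested frequency bin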
Example #2
def AMT_Framing(filename_):
    # Audio Processing
    #filename = 'Guns N Roses-Sweet Child O Mine Intro.wav'
    filename = "{}".format(filename_)
    x, fs = librosa.load(filename, sr=None, mono=True, duration=DURATION)
    V = librosa.vqt(x,
                    sr=fs,
                    hop_length=hop_length,
                    fmin=fmin,
                    n_bins=n_bins,
                    gamma=20,
                    bins_per_octave=bins_per_octave,
                    tuning=tuning,
                    filter_scale=filter_scale,
                    norm=norm,
                    sparsity=0.01,
                    window='hann',
                    scale=scale,
                    pad_mode=pad_mode,
                    res_type=res_type,
                    dtype=dtype)
    # Map the VQT magnitude spectrogram onto the mel scale
    V_mel = np.abs(V)
    logFrame = librosa.amplitude_to_db(V_mel)  # dB-scaled magnitudes
    mels = librosa.feature.melspectrogram(S=V_mel,
                                          sr=fs,
                                          n_mels=n_mels,
                                          n_fft=hop_length * 2,
                                          hop_length=hop_length)

    np_array_list = []
    np_array_list.append(mels)

    frame_windows_list = []
    numSlices_list = []
    Y_numSlices = 625

    for i in range(len(np_array_list)):
        VQT_result = np_array_list[i]
        # Zero-pad the time axis so a full window can be centered on every
        # frame (this assumes an odd WINDOW_SIZE)
        paddedX = np.zeros(
            (VQT_result.shape[0], VQT_result.shape[1] + WINDOW_SIZE - 1),
            dtype=float)
        pad_amount = WINDOW_SIZE // 2
        paddedX[:, pad_amount:-pad_amount] = VQT_result
        # One WINDOW_SIZE-wide slice per original frame
        frame_windows = np.array([
            paddedX[:, j:j + WINDOW_SIZE] for j in range(VQT_result.shape[1])
        ])
        #frame_windows = np.expand_dims(frame_windows, axis=3)
        numSlices = min(frame_windows.shape[0], Y_numSlices)
        numSlices_list.append(numSlices)
        frame_windows_list.append(frame_windows[:numSlices])

    audio_frames = np.concatenate(frame_windows_list, axis=0)
    #audio_frames = frame_windows_list
    #storingData(audio_frames)
    #print("Windows shape: ",audio_frames.shape)
    return audio_frames
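AMT_Framing relies on module-level constants that are defined elsewhere in the project. A hedged sketch of values that would at least be mutually consistent (the constraint on n_bins comes from the melspectrogram call above, which multiplies S by a mel filter bank built for n_fft = hop_length * 2); all of these are assumptions, not the original configuration:

import librosa

# Assumed configuration; the original project may use different values.
DURATION = 12
WINDOW_SIZE = 9                 # odd, so the centered padding above lines up
hop_length = 256
n_bins = hop_length + 1         # must equal 1 + n_fft // 2 for the mel mapping above
bins_per_octave = 36
fmin = librosa.note_to_hz('A0')
tuning = 0.0
filter_scale = 1
norm = 1
scale = True
pad_mode = 'constant'
res_type = 'soxr_hq'            # or 'kaiser_best' on older librosa versions
dtype = None
n_mels = 229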
Example #3
def AMT(filename_):
    # Audio Processing
    # Loading the Audios
    # Path Configuration
    #path = os.getcwd() + '/' + filename_
    filename = "{}".format(filename_)
    x, fs = librosa.load(filename, sr=None, mono=True, duration=12)
    # VQT Computation
    V = librosa.vqt(x, sr=fs, hop_length=hop_length, fmin=fmin, n_bins=n_bins,
                    gamma=20, bins_per_octave=bins_per_octave, tuning=tuning,
                    filter_scale=filter_scale, norm=norm, sparsity=0.01,
                    window='hann', scale=scale, pad_mode=pad_mode,
                    res_type=res_type, dtype=dtype)

    V_mel = np.abs(V)
    # Convert to the mel scale, then display and save the mel spectrogram for prediction
    melspec(V_mel, filename)
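The melspec helper is defined elsewhere in the project. A hedged sketch of what such a display-and-save helper might look like; the function body, figure size, and output naming are assumptions, not the original implementation:

import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt

def melspec(V_mel, filename):
    # Convert magnitudes to dB and draw them on a mel-frequency axis
    S_db = librosa.amplitude_to_db(V_mel, ref=np.max)
    fig, ax = plt.subplots(figsize=(10, 4))
    img = librosa.display.specshow(S_db, x_axis='time', y_axis='mel', ax=ax)
    fig.colorbar(img, ax=ax, format='%+2.0f dB')
    fig.savefig(filename + '_melspec.png')   # assumed output naming
    plt.close(fig)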
Example #4
def test_vqt(
    y_cqt_110,
    sr_cqt,
    hop_length,
    fmin,
    n_bins,
    gamma,
    bins_per_octave,
    tuning,
    filter_scale,
    norm,
    res_type,
    sparsity,
):

    C = librosa.vqt(
        y=y_cqt_110,
        sr=sr_cqt,
        hop_length=hop_length,
        fmin=fmin,
        n_bins=n_bins,
        gamma=gamma,
        bins_per_octave=bins_per_octave,
        tuning=tuning,
        filter_scale=filter_scale,
        norm=norm,
        sparsity=sparsity,
        res_type=res_type,
    )

    # type is complex
    assert np.iscomplexobj(C)

    # number of bins is correct
    assert C.shape[0] == n_bins

    if fmin is None:
        fmin = librosa.note_to_hz("C1")

    # check for peaks if 110 is within range
    if 110 <= fmin * 2 ** (n_bins / bins_per_octave):
        peaks = np.argmax(np.abs(C), axis=0)

        # This is our most common peak index in the CQT spectrum
        # we use the mode here over frames to sidestep transient effects
        # at the beginning and end of the CQT
        common_peak = scipy.stats.mode(peaks)[0][0]

        # Convert peak index to frequency
        peak_frequency = fmin * 2 ** (common_peak / bins_per_octave)

        # The detected peak should land within one bin of the 110 Hz test tone
        assert np.abs(np.log2(peak_frequency) - np.log2(110.0)) <= 1.0 / bins_per_octave
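The fixtures (y_cqt_110, sr_cqt, and the parameter fixtures) come from librosa's test suite and are not shown here. A hedged sketch of what the signal fixtures plausibly provide; the sampling rate and duration are assumptions:

import pytest
import librosa

@pytest.fixture
def sr_cqt():
    # Sampling rate used by the CQT/VQT tests (assumed value)
    return 11025

@pytest.fixture
def y_cqt_110(sr_cqt):
    # A pure 110 Hz tone, so the VQT should peak at the corresponding bin
    return librosa.tone(110, sr=sr_cqt, duration=1.0)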
Example #5
    def VQT_from_file(audio_file, bins_per_octave=60, n_octaves=8, gamma=20):
        y, fs = librosa.load(audio_file, sr=25600)

        vqt = librosa.vqt(y,
                          sr=fs,
                          hop_length=256,
                          fmin=SpectrogramUtil.FMIN,
                          n_bins=bins_per_octave * n_octaves,
                          bins_per_octave=bins_per_octave,
                          gamma=gamma)

        # amplitude_to_db with ref=np.max yields values in [-80, 0] dB;
        # rescale them to the range [0, 1]
        log_vqt = librosa.amplitude_to_db(np.abs(vqt), ref=np.max) / 80.0 + 1.0

        return log_vqt
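VQT_from_file is defined inside a SpectrogramUtil class whose FMIN constant is not shown. A hedged sketch of the surrounding context and a typical call; the FMIN value and the file name are assumptions:

import librosa

class SpectrogramUtil:
    # Assumed lowest center frequency of the VQT filter bank
    FMIN = librosa.note_to_hz('C1')

    # VQT_from_file (shown above) would be defined here as a @staticmethod

# Typical call: 60 bins per octave over 8 octaves -> 480 log-frequency bins, scaled to roughly [0, 1]
# log_vqt = SpectrogramUtil.VQT_from_file('recording.wav', bins_per_octave=60, n_octaves=8, gamma=20)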
Example #6
    def process_mel_spectrogram(self, n_mels, CQT=False, VQT=False):
        if CQT:
            C = np.abs(librosa.cqt(self.audio_waveform, sr=self.sample_rate, n_bins=112))
            CQT = librosa.amplitude_to_db(C, ref=np.max)
            self.mel_spectrogram = np.flipud(CQT)
        elif VQT:
            V = np.abs(librosa.vqt(self.audio_waveform, sr=self.sample_rate, n_bins=112))
            VQT = librosa.amplitude_to_db(V, ref=np.max)
            self.mel_spectrogram = np.flipud(VQT)
        else:
            mel_spectrogram = np.flipud(librosa.feature.melspectrogram(y=self.audio_waveform, sr=self.sample_rate, n_mels=n_mels))
            log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
            self.mel_spectrogram = log_mel_spectrogram

        midi_time = self.midi_note_array.shape[1]*self.tempo/self.PPQ/1000000
        audio_time = self.mel_spectrogram.shape[1]*512/self.sample_rate

        # Fixing audio and midi discrepancy
        time_difference = audio_time - midi_time
        time_difference_sample_ticks = round(time_difference*self.sample_rate/512)
        print(time_difference_sample_ticks)
        if time_difference_sample_ticks > 0:
            self.mel_spectrogram = self.mel_spectrogram[:, 0:-time_difference_sample_ticks]
        self.song_total_sample_ticks = self.mel_spectrogram.shape[1]
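The VQT branch above amounts to a dB-scaled, vertically flipped 112-bin variable-Q transform. A standalone sketch of that transformation outside the class, using librosa's bundled example clip as a stand-in signal (the file and the 44.1 kHz sample rate are assumptions):

import numpy as np
import librosa

# Resampling to 44.1 kHz keeps the top of the 112-bin basis below Nyquist
y, sr = librosa.load(librosa.example('trumpet'), sr=44100)
V = np.abs(librosa.vqt(y, sr=sr, n_bins=112))   # 112 bins, 12 per octave by default
V_db = librosa.amplitude_to_db(V, ref=np.max)   # magnitudes in dB, peak at 0 dB
features = np.flipud(V_db)                      # flip so imshow draws low frequencies at the bottom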
Example #7
ax.set_ylabel('log Hz', size=15)
ax.set_xlabel('Time', size=15)

plt.show()

mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
print(mel_spectrogram)
log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)

C = np.abs(librosa.cqt(y, sr=sr, n_bins=112))
CQT = librosa.amplitude_to_db(C, ref=np.max)
print(CQT)

V = np.abs(librosa.vqt(y, sr=sr, n_bins=112))
VQT = librosa.amplitude_to_db(V, ref=np.max)
print(VQT)

X = librosa.stft(y)
Xdb = librosa.amplitude_to_db(abs(X))
fig, axs = plt.subplots(1, 3, figsize=(15, 20))
axs[0].imshow(np.flipud(VQT), aspect='auto', interpolation='nearest')
axs[1].imshow(np.flipud(log_mel_spectrogram),
              aspect='auto',
              interpolation='nearest')
axs[2].imshow(np.flipud(CQT), aspect='auto', interpolation='nearest')
plt.show()

# fig, axs = plt.subplots()
# plt.grid(b=None)
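The raw imshow calls above drop the frequency and time axis labels; librosa.display.specshow can render the same dB-scaled arrays with labeled axes. A brief alternative sketch, reusing the variables computed above (the axis choices are suggestions, not part of the original script):

import librosa.display
import matplotlib.pyplot as plt

fig, axs = plt.subplots(1, 3, figsize=(15, 5))
librosa.display.specshow(VQT, sr=sr, x_axis='time', y_axis='cqt_hz', ax=axs[0])
axs[0].set_title('VQT (dB)')
librosa.display.specshow(log_mel_spectrogram, sr=sr, x_axis='time', y_axis='mel', ax=axs[1])
axs[1].set_title('Mel spectrogram (dB)')
librosa.display.specshow(CQT, sr=sr, x_axis='time', y_axis='cqt_hz', ax=axs[2])
axs[2].set_title('CQT (dB)')
plt.show()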
Example #8
def librosa_hvqt(audio, harmonics, sample_rate, hop_length, fmin, n_bins,
                 bins_per_octave, gamma):
    """
    Compute an HVQT using Librosa.

    Parameters
    ----------
    audio : ndarray (N)
      Audio to transform,
      N - number of samples
    harmonics : list of ints
      Specific harmonics to stack across the harmonic dimension
    sample_rate : int or float
      Number of samples per second of audio
    hop_length : int
      Number of samples between frames
    fmin : float
      Lowest center frequency in basis
    n_bins : int
      Number of basis functions in the filterbank
    bins_per_octave : int
      Number of basis functions per octave
    gamma : float
      Bandwidth offset to smoothly vary Q-factor

    Returns
    ----------
    hvqt : ndarray (H x F x T)
      Harmonic Variable-Q Transform (HVQT) for the provided audio,
      H - number of harmonics
      F - number of bins
      T - number of time steps (frames)
    """

    # Initialize a list to hold the harmonic-wise transforms
    hvqt = list()

    # Initialize a list to hold the number of frames for each transform
    frames = list()

    # Loop through harmonics
    for h in range(len(harmonics)):
        # Compute the true minimum center frequency for this harmonic
        h_fmin = harmonics[h] * fmin

        # Compute the VQT for this harmonic
        vqt = librosa.vqt(audio,
                          sr=sample_rate,
                          hop_length=hop_length,
                          fmin=h_fmin,
                          n_bins=n_bins,
                          gamma=gamma,
                          bins_per_octave=bins_per_octave)

        # Keep track of the number of frames produced
        frames.append(vqt.shape[-1])

        # Add the VQT to the collection
        hvqt.append(np.expand_dims(vqt, axis=0))

    # Determine the maximum number of frames that can be concatenated
    max_frames = min(frames)

    # Perform any trimming and concatenate
    hvqt = np.concatenate([vqt[..., :max_frames] for vqt in hvqt])

    # Take the magnitude
    hvqt = np.abs(hvqt)

    return hvqt
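A hedged usage sketch with illustrative values; the harmonic set and transform parameters are common choices in harmonic-CQT work, not values taken from the original project, and the example clip is just a stand-in signal:

import librosa

# Illustrative parameters (assumptions)
sample_rate = 22050
hop_length = 256
fmin = librosa.note_to_hz('C1')
harmonics = [0.5, 1, 2, 3, 4, 5]     # one sub-harmonic plus the first five harmonics
bins_per_octave = 36
n_bins = bins_per_octave * 6
gamma = 20

audio, _ = librosa.load(librosa.example('trumpet'), sr=sample_rate)
hvqt = librosa_hvqt(audio, harmonics, sample_rate, hop_length, fmin,
                    n_bins, bins_per_octave, gamma)
print(hvqt.shape)   # (len(harmonics), n_bins, n_frames)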