Example no. 1
 def __init__(self,
              audio_file,
              mono=True,
              hop_length=512,
              sample_rate=44100,
              normalize_gain=False,
              verbose=False):
     """[summary]
     
     Arguments:
         audio_file {[type]} -- [description]
     
     Keyword Arguments:
         mono {bool} -- [description] (default: {True})
         hop_length {int} -- [description] (default: {512})
         sample_rate {int} -- [description] (default: {44100})
         normalize_gain {bool} -- [description] (default: {False})
         verbose {bool} -- [description] (default: {False})
     """
     self.hop_length = hop_length
     self.fs = sample_rate
     self.audio_file = audio_file
     if normalize_gain:
         self.audio_vector = estd.EasyLoader(filename=audio_file,
                                             sampleRate=self.fs,
                                             replayGain=-9)()
     elif mono:
         self.audio_vector = estd.MonoLoader(filename=audio_file,
                                             sampleRate=self.fs)()
     if verbose:
         print(
             "== Audio vector of %s loaded with shape %s and sample rate %s =="
             % (audio_file, self.audio_vector.shape, self.fs))
Example no. 2
def transcribe_from_paths(audio_path, topath, sr=22050, cuda=False):
    """
    `cuda` currently has no effect
    """
    audio = esst.EasyLoader(filename=audio_path, sampleRate=sr)()
    mat = transcribe(audio, sr, cuda=cuda)
    mat2midipath(mat, topath)
Example no. 3
 def read_audio(self, audio_file):
     self.set_audio_file(audio_file)
     if self.normalize_gain:
         self.audio_vector = estd.EasyLoader(filename=audio_file,
                                             sampleRate=self.fs,
                                             replayGain=-9)()
     elif self.mono:
         self.audio_vector = estd.MonoLoader(filename=audio_file,
                                             sampleRate=self.fs)()
Example no. 4
def load_audio_file(file_path, sample_rate=44100):
    """
    Load audio file using essentia's EasyLoader class
    :param file_path: audio file path
    :param sample_rate: audio sample rate
    :return: audio data (numpy.ndarray of float32)
    """
    audio_file = estd.EasyLoader(filename=file_path, sampleRate=sample_rate)
    audio = audio_file.compute()
    return audio
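
A minimal usage sketch for the helper above (the file name is a placeholder; assumes `import essentia.standard as estd`):

# load a local file, resampled to the default 44100 Hz
audio = load_audio_file("loop.wav")
print(audio.shape, audio.dtype)  # 1-D float32 vector
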
Example no. 5
 def __init__(self, audio_file, mono=True, hop_length=512, sample_rate=44100, normalize_gain=False, verbose=False):
     """"""
     self.hop_length = hop_length
     self.fs = sample_rate
     self.audio_file = audio_file
     if normalize_gain:
         self.audio_vector = estd.EasyLoader(filename=audio_file, sampleRate=self.fs, replayGain=-9)()
     elif mono:
         self.audio_vector = estd.MonoLoader(filename=audio_file, sampleRate=self.fs)()
     if verbose:
         print ("== Audio vector of %s loaded with shape %s and sample rate %s ==" % (audio_file, self.audio_vector.shape, self.fs))
Example no. 6
def analyze_misc(filename, segment_duration=20):

    # Compute replay gain and duration on the entire file, then load the
    # segment that is centered in time with replaygain applied
    audio = es.MonoLoader(filename=filename)()
    replaygain = es.ReplayGain()(audio)

    segment_start = (len(audio) / 44100 - segment_duration) / 2
    segment_end = segment_start + segment_duration

    if segment_start < 0 or segment_end > len(audio) / 44100:
        raise ValueError(
            'Segment duration is larger than the input audio duration')

    loader = es.EasyLoader(filename=filename,
                           replayGain=replaygain,
                           startTime=segment_start,
                           endTime=segment_end)

    windowing = es.Windowing(type='blackmanharris62')
    spectrum = es.Spectrum()
    powerspectrum = es.PowerSpectrum()
    centroid = es.Centroid()
    zcr = es.ZeroCrossingRate()
    rms = es.RMS()
    hfc = es.HFC()
    pool = essentia.Pool()

    audio = loader()
    for frame in es.FrameGenerator(audio, frameSize=2048, hopSize=1024):
        frame_spectrum = spectrum(windowing(frame))
        pool.add('rms', rms(frame))
        pool.add('rms_spectrum', rms(frame_spectrum))
        pool.add('hfc', hfc(frame_spectrum))
        pool.add('spectral_centroid', centroid(frame_spectrum))
        pool.add('zcr', zcr(frame))

    audio_st, sr, _, _, _, _ = es.AudioLoader(filename=filename)()
    # Ugly hack because we don't have a StereoResample
    left, right = es.StereoDemuxer()(audio_st)
    resampler = es.Resample(inputSampleRate=sr, outputSampleRate=44100)
    left = resampler(left)
    right = resampler(right)
    audio_st = es.StereoMuxer()(left, right)
    audio_st = es.StereoTrimmer(startTime=segment_start,
                                endTime=segment_end)(audio_st)
    ebu_momentary, _, _, _ = es.LoudnessEBUR128(hopSize=1024 / 44100,
                                                startAtZero=True)(audio_st)
    pool.set('ebu_momentary', ebu_momentary)

    return pool
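
The returned Pool can be summarized per descriptor with Essentia's PoolAggregator; a minimal sketch (`filename` is a placeholder path):

pool = analyze_misc(filename)
aggregated = es.PoolAggregator(defaultStats=['mean', 'stdev'])(pool)
print(aggregated['spectral_centroid.mean'], aggregated['rms.stdev'])
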
Example no. 7
 def __init__(self,
              audio_file,
              mono=True,
              sample_rate=44100,
              normalize_gain=False):
     """"""
     self.fs = sample_rate
     if normalize_gain:
         self.audio_vector = estd.EasyLoader(filename=audio_file,
                                             sampleRate=self.fs,
                                             replayGain=-9)()
     else:
         self.audio_vector = estd.MonoLoader(filename=audio_file,
                                             sampleRate=self.fs)()
     print "== Audio vector of %s loaded with shape %s and sample rate %s ==" % (
         audio_file, self.audio_vector.shape, self.fs)
     return
Example no. 8
def analyze_loops(db_file="../loopermanscrapy/loops.db"):

    loop_data = {}

    conn = sqlite3.connect(db_file)
    cur = conn.cursor()

    sample_rate = 44100
    # `loops` (an iterable of loop file names) is assumed to be defined
    # elsewhere in the original script
    for loop in loops:
        cur.execute("SELECT BPM FROM LOOPS WHERE OUTPUT_FILE LIKE ?",
                    (loop.rstrip(),))
        bpm = cur.fetchone()[0]

        audio = estd.EasyLoader(filename="./drum_loops/" + loop.rstrip(),
                                sampleRate=sample_rate)()

        beat_duration = (60.0 * sample_rate) / int(bpm)
        L = [beat_duration * n for n in range(1, 128)]
        thr_lambda = 0.5 * beat_duration
        la = audio.shape[0]
        delta = min([abs(l - la) for l in L])
        if delta > thr_lambda:
            ann_confidence = 0.0
        else:
            ann_confidence = (1.0 - float(delta) / thr_lambda)

        duration = la / sample_rate

        # `bpm_percival` and `confidence` are assumed to come from a BPM
        # estimation step elsewhere in the original script
        loop_data[loop.rstrip()] = {
            "bpm_annotated": bpm,
            "duration": duration,
            "annotated_confidence": ann_confidence,
            "bpm_percival": bpm_percival,
            "confidence": confidence
        }

    conn.close()
    with open("loop_data.json", 'w') as fp:
        json.dump(loop_data, fp)
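
For intuition, the annotated confidence falls linearly from 1 to 0 as the loop length moves away from a whole number of beats; a worked sketch with made-up numbers:

beat = 60.0 * 44100 / 120          # 22050 samples per beat at 120 BPM
la = 85000                         # hypothetical loop length in samples
delta = abs(4 * beat - la)         # 3200 samples away from 4 beats (88200)
print(1.0 - delta / (0.5 * beat))  # ~0.71; 0.0 once delta exceeds half a beat
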
Example no. 9
def load_audio_files_in_dir(path):

    vsts = []
    original = []
    paths = []
    for root, dirs, files in os.walk(path):
        FOUND_VST = False
        vst = []
        for file in files:
            if file.endswith(AUDIO_EXTS) and 'scales' not in file:
                if 'target' in file:
                    audios = original
                else:
                    FOUND_VST = True
                    audios = vst
                audios.append(
                    esst.EasyLoader(filename=os.path.join(root, file),
                                    sampleRate=SR)())
        if FOUND_VST:
            vsts.append(vst)
            paths.append(root)
    return vsts, original, paths
Example no. 10
def load_audio_excerpts(path=AUDIO_PATH, num_features=9):
    """
    Extracts `num_features+1` MFCC coeffcients from each audio and discards the
    first coefficients (tied to energy).
    """

    targets = np.zeros((3, 5, num_features))
    out = np.zeros((3, 5, 4, num_features))
    for file in tqdm(os.listdir(path)):
        if file.endswith(excerpt_search.FORMAT):
            audio = esst.EasyLoader(filename=os.path.join(path, file),
                                    sampleRate=SR)()
            if audio.shape[0] % 2 == 1:
                audio = audio[:-1]
            spectrum = esst.Spectrum(size=audio.shape[0])(audio)
            _bands, features = esst.MFCC(inputSize=spectrum.shape[0],
                                         sampleRate=SR,
                                         numberCoefficients=num_features + 1)(spectrum)
            splits = file.replace('.flac', '').split('_')
            question = int(splits[0][1])
            _fill_out_targets(out[question], targets[question], features[1:],
                              splits, 'target')
    return out - targets[..., np.newaxis, :]
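
The function returns the element-wise difference between each extracted MFCC vector and the corresponding target; a usage sketch under the default settings:

diffs = load_audio_excerpts()
print(diffs.shape)  # (3, 5, 4, 9) with num_features=9
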
Example no. 11
def transcribe_from_paths(audio_path,
                          data,
                          velocity_model,
                          midi_score_path=None,
                          tofile='out.mid'):
    """
    Load a midi and an audio file and call `transcribe`. If `tofile` is not
    empty, it will also write a new MIDI file with the provided path.
    The output midi file will contain only one track with piano (program 0)
    """
    import essentia.standard as esst
    audio = esst.EasyLoader(filename=audio_path, sampleRate=SR)()
    if midi_score_path:
        score = midipath2mat(midi_score_path)
    else:
        score = None
    new_score, _, _, _ = transcribe(audio,
                                    data,
                                    score=score,
                                    velocity_model=velocity_model)

    # writing to midi
    mat2midipath(new_score, tofile)
    return new_score
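
A hypothetical call, where the paths are placeholders and `data` and `velocity_model` stand for the template data and model built elsewhere in the project:

new_score = transcribe_from_paths('performance.wav', data, velocity_model,
                                  midi_score_path='score.mid',
                                  tofile='out.mid')
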
Example no. 12
import sys

import numpy as np
import essentia.standard as es

wavFiles = sys.argv[2:]

# the mbid corresponding to the title of the wav file is used to name the
# resulting pitch-track file
mbid_all = {
    "D_X-1": "61dd663f-77b8-423b-8ce9-4c40fee8b014",
    "D_X-2": "a00666b5-07ef-40c8-8117-78b871569354",
    "D_E-1": "f6339039-e01a-4a70-82ff-001c64fecddd",
    "J_X-1": "bc86b730-b7bb-46ba-a771-2d0e6304b459",
    "J_X-2": "a9896439-46ce-4a37-a6ed-3a5c998bd6cf",
    "J_E-1": "b19d1921-9119-4a57-8c78-540ca09f09b9",
    "J_E-2": "720e93d6-21b5-4026-8c50-5c415cca53ec",
    "LD_X-1": "5593350d-4a2d-4288-b3c6-b57b7f3c8dd4",
    "LD_X-2": "afd9da3d-418b-4dfc-85d4-6d5829b9c7e2",
    "LD_E-1": "f581a2b2-ff94-4e03-91f8-d278b01d6fc4",
    "LD_E-2": "67069b89-fe41-4509-9aba-5cc33d9c8e53",
    "LS_X-1": "cdbf88f4-7a55-412e-b4b2-351f4c8aab49",
    "LS_X-2": "4a0fc23c-9c62-4b2e-a38a-124796c6ac5d",
    "LS_E-1": "999917df-2282-49d4-ad95-e45d176fba64",
    "LS_E-2": "26054c12-3578-4d64-a251-f8f694fbc41c",
    "XS_X-1": "e9794b6f-a797-4d7f-842a-a0c73639c2a5",
    "XS_X-2": "206d833f-b194-40cc-92d1-7a632fbef603",
    "XS_E-1": "aec9929b-69b4-4ca5-b998-f4d54ac426cb",
    "XS_E-2": "01661eb4-114d-4671-af82-14b8fead56e1",
}
#print pitchDir
hopSize = 128
frameSize = 2048
sampleRate = 44100
guessUnvoiced = False
for f in wavFiles:
    if ".wav" not in f.lower(): continue
    
    #mbid = f.split("/")[-1][:-4]
    #if exists(pitchDir+"/"+mbid+".txt"): continue
    #find the mbid to be used to name the resulting pitch track
    name = f.split("_")[0] + "_" + f.split("_")[1]
    print(name)
    mbid = mbid_all[name]
    print(mbid)
    
    loader = es.EasyLoader(filename=f, sampleRate=44100)
    equalLoudness = es.EqualLoudness(sampleRate=44100)
    audio = loader()
    audioDL = equalLoudness(audio)
    pitchPolyphonic = es.PredominantMelody(binResolution=1,
                                           guessUnvoiced=guessUnvoiced,
                                           hopSize=hopSize,
                                           minFrequency=100,
                                           maxFrequency=1200,
                                           voicingTolerance=1.2)
    res = pitchPolyphonic(audioDL)
    t = np.linspace(0, len(res[0]) * hopSize / 44100, len(res[0]))
    data = np.column_stack((t, res[0], res[1]))
    np.savetxt(mbid + ".txt", data, delimiter="\t")
Example no. 13
def analyze_hp(filename, segment_duration=20):

    lowlevelFrameSize = 2048
    lowlevelHopSize = 1024
    tonalFrameSize = 4096
    tonalHopSize = 1024

    # Compute replay gain and duration on the entire file, then load the
    # segment that is centered in time with replaygain applied
    audio = es.MonoLoader(filename=filename)()
    replaygain = es.ReplayGain()(audio)

    segment_start = (len(audio) / 44100 - segment_duration) / 2
    segment_end = segment_start + segment_duration

    if segment_start < 0 or segment_end > len(audio) / 44100:
        raise ValueError(
            'Segment duration is larger than the input audio duration')

    loader = es.EasyLoader(filename=filename,
                           replayGain=replaygain,
                           startTime=segment_start,
                           endTime=segment_end)
    window = es.Windowing(type='blackmanharris62')
    fft = es.FFT()

    stft = []

    audio = loader()
    for frame in es.FrameGenerator(audio,
                                   frameSize=lowlevelFrameSize,
                                   hopSize=lowlevelHopSize):
        stft.append(fft(window(frame)))

    # Librosa requires bins x frames format
    stft = np.array(stft).T

    D_harmonic, D_percussive = librosa.decompose.hpss(stft, margin=8)
    D_percussive_magnitude, _ = librosa.magphase(D_percussive)
    D_harmonic_magnitude, _ = librosa.magphase(D_harmonic)

    # Convert back to Essentia format (frames x bins)
    spectrum_harmonic = D_harmonic_magnitude.T
    spectrum_percussive = D_percussive_magnitude.T

    # Processing for Mel bands
    melbands = es.MelBands(numberBands=96,
                           lowFrequencyBound=0,
                           highFrequencyBound=11025)

    # Normalize Mel bands: log10(1+x*10000)
    norm = es.UnaryOperator(type='identity', shift=1, scale=10000)
    log10 = es.UnaryOperator(type='log10')

    p = essentia.Pool()

    for spectrum_frame in spectrum_harmonic:
        p.add('melbands_harmonic', log10(norm(melbands(spectrum_frame))))

    for spectrum_frame in spectrum_percussive:
        p.add('melbands_percussive', log10(norm(melbands(spectrum_frame))))

    return p
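
Each descriptor accumulates one vector per frame, so the pool entries come back as frames x bands matrices; a minimal sketch (`filename` is a placeholder):

p = analyze_hp(filename)
print(p['melbands_harmonic'].shape)    # (n_frames, 96)
print(p['melbands_percussive'].shape)  # (n_frames, 96)
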
Example no. 14
def create_excerpt(audio_path, time, name):
    """
    Given an audio path and excerpt times, transcribe the audio and create new
    MIDI and audio files for the given excerpts. `name` is the file name
    without extension and transcription number.
    """

    full_audio = esst.EasyLoader(filename=audio_path, sampleRate=SR)()
    start_audio, _ = find_start_stop(full_audio, sample_rate=SR, seconds=True)
    original = midipath2mat(audio_path[:-4] + '.mid')

    # compute score path
    score_path = './my_scores/' + os.path.basename(audio_path)[:-8] + '.mid'
    score = midipath2mat(score_path)

    # transcribe
    data = pickle.load(open(TEMPLATE_PATH, 'rb'))
    transcription_0, _, _, _ = proposed.transcribe(full_audio,
                                                   data,
                                                   score=score)

    transcription_1 = magenta_transcription.transcribe(full_audio, SR)

    # transcription_2, _, _, _ = proposed.transcribe(full_audio,
    #                                                data,
    #                                                score=None)

    # choose another interpretation
    performance = '01'
    if audio_path[-6:-4] == '01':
        performance = '02'
    other = midipath2mat(audio_path[:-6] + performance + '.mid')

    # segment all the scores and audios
    full_audio = esst.EasyLoader(filename=audio_path, sampleRate=OUT_SR)()
    original_audio = full_audio[round(time[0][0] * OUT_SR):round(time[0][1] *
                                                                 OUT_SR)]
    other_time = remap_original_in_other(original, other, time[0])
    original = segment_mat(original, time[0][0], time[0][1], start_audio)
    other = segment_mat(other, other_time[0], other_time[1], start_audio)
    transcription_0 = segment_mat(transcription_0, time[0][0], time[0][1],
                                  start_audio)
    transcription_1 = segment_mat(transcription_1, time[0][0], time[0][1],
                                  start_audio)
    # transcription_2 = segment_mat(transcription_2, time[0][0], time[0][1],
    #                               start_audio)

    # write scores to `to_be_synthesized` and audios to `excerpts`
    if not os.path.exists('to_be_synthesized'):
        os.mkdir('to_be_synthesized')
    midi_path = os.path.join('to_be_synthesized', name)
    mat2midipath(original, midi_path + 'orig.mid')
    mat2midipath(other, midi_path + 'other.mid')
    mat2midipath(transcription_0, midi_path + 'proposed.mid')
    mat2midipath(transcription_1, midi_path + 'magenta.mid')
    # mat2midipath(transcription_2, midi_path + 'vienna.mid')

    if not os.path.exists('audio'):
        os.mkdir('audio')
    audio_path = os.path.join('audio', name) + 'target.' + FORMAT

    # write audio
    if os.path.exists(audio_path):
        os.remove(audio_path)
    esst.MonoWriter(filename=audio_path,
                    sampleRate=OUT_SR,
                    format=FORMAT,
                    bitrate=256)(original_audio)
Example no. 15
def main():
    import essentia.standard as esst
    spec = esst.SpectrumCQ(numberBins=BINS, sampleRate=SR, windowType='hann')

    print("Loading midi")
    notes = pm.PrettyMIDI(midi_file=SCALE_PATH[0]).instruments[0].notes
    print("Loading audio")
    audio = esst.EasyLoader(filename=SCALE_PATH[1], sampleRate=SR)()

    # template = np.zeros((FRAME_SIZE // 2 + 1, 128, BASIS))
    template = np.zeros((BINS, 128, BASIS))
    counter = np.zeros((128, BASIS))

    maxpitch = 0
    minpitch = 128

    for i in trange(len(notes)):
        note = notes[i]
        if maxpitch < note.pitch:
            maxpitch = note.pitch
        if minpitch > note.pitch:
            minpitch = note.pitch

        # start and end frame
        start = int(np.round((note.start) * SR))
        end = int(np.round((note.end) * SR))
        ENDED = False

        spd = np.zeros((BINS, BASIS))
        frames = esst.FrameGenerator(audio[start:end],
                                     frameSize=FRAME_SIZE,
                                     hopSize=HOP_SIZE)
        # attack
        for a in range(ATTACK):
            try:
                frame = next(frames)
            except StopIteration:
                print("Error: notes timing not correct")
                print(f"note: {start}, {end}, {len(audio)}")
                sys.exit(99)
            spd[:, 0] += spec(frame)
        counter[note.pitch, 0] += ATTACK

        # other basis except the last one
        for b in range(1, BASIS-1):
            if not ENDED:
                for a in range(BASIS_L):
                    try:
                        frame = next(frames)
                    except StopIteration:
                        # note is shorter than the number of basis
                        ENDED = True
                        break
                    spd[:, b] += spec(frame)
                    counter[note.pitch, b] += 1

        # last basis
        if not ENDED:
            for frame in frames:
                spd[:, BASIS-1] += spec(frame)
                counter[note.pitch, BASIS-1] += 1
        template[:, note.pitch, :] += spd

    idx = np.nonzero(counter)
    template[:, idx[0], idx[1]] /= counter[idx]

    # collapsing basis and pitch dimension
    template = template.reshape((-1, 128 * BASIS), order='C')

    # plot template
    fig = go.Figure(data=go.Heatmap(z=template))
    fig.show()

    # saving template
    pickle.dump((template, minpitch, maxpitch), open(TEMPLATE_PATH, 'wb'))
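
The pickled template can be restored the same way it was saved; a sketch assuming the same TEMPLATE_PATH:

with open(TEMPLATE_PATH, 'rb') as f:
    template, minpitch, maxpitch = pickle.load(f)
print(template.shape)  # (BINS, 128 * BASIS) after the reshape above
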
Example no. 16
def transcribe_from_paths(audio_path, topath, sr=44100, cuda=False):
    audio = esst.EasyLoader(filename=audio_path, sampleRate=sr)()
    mat = transcribe(audio, sr, cuda=cuda)
    mat2midipath(mat, topath)
Example no. 17
def plot_segmentation(audio, onset, sustain, release=None, offset=None):
    plt.figure()
    plt.vlines(onset, 0, 1, label='attack_init', color='g')
    plt.vlines(sustain, 0, 1, label='sustain_init', color='b')
    plt.vlines(release, 0, 1, label='sustain_end', color='r')
    plt.vlines(offset, 0, 1, label='offset', color='g')
    time = np.linspace(0, len(audio) / 512, num=len(audio))
    plt.plot(time, audio)
    plt.legend()
    plt.show()


# INIT PARAMETERS
# path to the audio file
file_path = settings.DATA_PATH + '/47_58_2.wav'
fs = 44100
hopSize = 512
frameSize = 2048
rms_onset_threshold = 1E-5
mel_onset_threshold = 80
flux_onset_threshold = 0.1
onset_threshold = 0.1
max_attack_time = seconds2samples(0.5, fs)

audio_file = estd.EasyLoader(filename=file_path, sampleRate=fs)
audio = audio_file.compute()

onset, sustain = segment(audio, hopSize, frameSize, rms_onset_threshold,
                         mel_onset_threshold, flux_onset_threshold,
                         onset_threshold)

plot_segmentation(audio, onset, sustain)