def combineandsplitsongs(path_to_audio, path_to_voice, path_of_output, path_to_output1, path_to_output2):
    song, sr = librosa.load(path_to_audio)
    voice, sr = librosa.load(path_to_voice)
    # overlay the (shorter) voice track onto the start of the song
    song[0:len(voice)] += voice
    librosa.output.write_wav(path_of_output, song, sr)
    split1len = splitty(song, path_to_output1, path_to_output2, sr)
    return split1len
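# Usage sketch (not from the original source): the file names here are
# hypothetical, and splitty() is assumed to be defined elsewhere in this module.
split1len = combineandsplitsongs('song.wav', 'voice.wav',
                                 'mix.wav', 'part1.wav', 'part2.wav')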
def B3():
    fs = 44100
    w = 2048
    h = 1024
    path = '../audio/train/vio/'

    vio_64 = librosa.load(path + 'vio_64.wav', fs)[0][0:61000]
    vio_88 = librosa.load(path + 'vio_88.wav', fs)[0][0:61000]
    cla_64 = librosa.load('../audio/train/cla/cla_64.wav', fs)[0][0:61000]
   
    S_1 = NMF.extractTemplate(vio_64)
    S_2 = NMF.extractTemplate(vio_88)
    S_3 = NMF.extractTemplate(cla_64)
 
    librosa.display.specshow(S_1, y_axis='cqt_note', x_axis='frames', n_yticks=180)
    plt.axis([0, 2, 0, 100])
    plt.show()

    librosa.display.specshow(S_2, y_axis='cqt_note', x_axis='frames', n_yticks=10)
    #plt.axis([0, 2, 0, 100])
    plt.show()
    
    librosa.display.specshow(S_3, y_axis='cqt_note', x_axis='frames', n_yticks=180)
    plt.axis([0, 2, 0, 100])
    plt.show()

    S_1 = librosa.core.istft(S_1)
    librosa.display.waveplot(S_1, x_axis='time')
    plt.show()
    S_2 = librosa.core.istft(S_2)
    librosa.display.waveplot(S_2, x_axis='time')
    plt.show()
def run_nmf_mfcc_and_pickle_mir_evals(paths):
    """
    Run NMF + MFCC clustering on a mixture, score the estimated sources
    against the true foreground/background with BSS-eval, and pickle the
    results.

    :param paths: tuple of (mix_path, (foreground_name, background_name))
    :return: None; results are written to a pickle file
    """
    mix_path, (fore_name, back_name) = paths
    file_name = os.path.split(mix_path)[1]
    pickle_name = splitext(file_name)[0]
    pickle_output_path = os.path.join(output_folder, pickle_name)

    if os.path.exists(pickle_output_path + ".pick"):
        print(pickle_output_path + " exists! Skipping...")
        return

    back_path = os.path.join(background_folder, back_name)
    true_back = librosa.load(back_path)[0]
    fore_path = os.path.join(foreground_folder, fore_name)
    true_fore = librosa.load(fore_path)[0]

    estimated_sources, mfcc_clusters = do_nmf_and_clustering(mix_path, n_clusters)
    # transpose?
    min_len = np.min((len(true_fore), len(true_back), estimated_sources.shape[1]))
    true_back = true_back[:min_len]
    true_fore = true_fore[:min_len]
    true_srcs = np.vstack([true_back, true_fore])
    estimated_sources = estimated_sources[:, :min_len]
    sdr_dict = run_bss_eval(true_srcs, estimated_sources)

    pickle_dict = {"file_name": file_name, "mfcc_clusters": mfcc_clusters, "sdr_dict": sdr_dict}
    pickle.dump(pickle_dict, open(pickle_output_path + ".pick", "wb"))
    print("pickled {}".format(pickle_name))
def B1():
    fs = 44100
    path = '../audio/validation/'
    a = librosa.load(path + '01_vio.wav', fs)[0]
    b = librosa.load(path + '01_cla.wav', fs)[0]
    c = librosa.load(path + '01_mix.wav', fs)[0]
    n = np.random.randn(len(a)) 
    print(a.shape)
    #print(evalSDR(np.array([c, c])/2, np.array([a, b]))) 
    # correct one
    print("SDR [c;c]/2, [a;b]")
    print(evalSDR(np.array([a, b]), np.array([c, c])/2))
    print("SDR [a;b], [a;b]")
    print(evalSDR(np.array([a, b]), np.array([a, b])))
    print("SDR [b;a], [a;b]")
    print(evalSDR(np.array([a, b]), np.array([b, a])))
    print("SDR [2a;2b], [a;b]")
    print(evalSDR(np.array([a, b]), np.array([2*a, 2*b])))
    print("SDR (a+0.01*n), a")
    print(evalSDR(a, (a + 0.01*n)))
    print("SDR (a+0.1*n), a")
    print(evalSDR(a, (a + 0.1*n)))
    print("SDR (a+n), a")
    print(evalSDR(a, (a + n)))
    print("SDR (a+0.01*b), a")
    print(evalSDR(a, (a + 0.01*b)))
    print("SDR (a+0.1*b), a")
    print(evalSDR(a, (a + 0.1*b)))
    print("SDR (a+b), a")
    print(evalSDR(a, (a + b)))
Example #5
def mashability(song1, song2):
    """
	Returns how well song1 transitions into song2 using cosine matrix similarity
	and FFT semitone bin approximation matrices
	"""
    # If the tempo differs by more than thirty then we should never make that transition
    if abs(song1.bpm - song2.bpm) > 30:
        return 1
    sample_length = MIX_LENGTH  # beats per sample
    beats1 = song1.AudioFile.analysis.beats[song1.mix_out : song1.mix_out + sample_length]
    beats2 = song2.AudioFile.analysis.beats[song2.mix_in : song2.mix_in + sample_length]
    data1 = audio.getpieces(song1.AudioFile, beats1)
    data2 = audio.getpieces(song2.AudioFile, beats2)
    data1.encode("temp1.mp3")
    data2.encode("temp2.mp3")
    y1, sr1 = librosa.load("temp1.mp3")
    y2, sr2 = librosa.load("temp2.mp3")
    S1 = np.abs(librosa.stft(y1, n_fft=4096))
    chroma1 = librosa.feature.chroma_stft(S=S1, sr=sr1)
    S2 = np.abs(librosa.stft(y2, n_fft=4096))
    chroma2 = librosa.feature.chroma_stft(S=S2, sr=sr2)
    # im = librosa.display.specshow(chroma1,x_axis = "time",y_axis = "chroma")
    # im2 = librosa.display.specshow(chroma2,x_axis = "time",y_axis = "chroma")
    # plt.show()
    orthogonal_arr = []
    for i in range(min(chroma1.shape[1], chroma2.shape[1])):
        orthogonal_arr.append(dst.cosine(chroma1[:, i], chroma2[:, i]))
    return sum(orthogonal_arr) / len(orthogonal_arr)
Example #6
 def results(self):
 
     #Loading audio files
     #Extract MFCC features and use dtw to compare the distance between two MFCCs
     y1, sr1 = librosa.load('output1.wav') 
     y2, sr2 = librosa.load('output2.wav') 
     
     mfcc1 = librosa.feature.mfcc(y1,sr1)   #Computing MFCC values
     mfcc2 = librosa.feature.mfcc(y2,sr2)
         
     dist, cost, path = dtw(mfcc1.T, mfcc2.T)
     
     #Set a threshold for our game's ranking system
     if dist <= 40:
         self.textEdit_2.setText("You did a great job! ^^")
     elif dist <= 50:
         self.textEdit_2.setText("You did good.")
     elif dist <= 60:
         self.textEdit_2.setText("You're fine.")
     else:
         self.textEdit_2.setText("You are poor at this game... TT")
         
     
     self.rank.append(dist)
     self.textEdit_3.setText(str(self.rank[self.count]))
     self.outputRank += "Player " + str(self.count) + " got " + str(self.rank[self.count]) + "\n\n"      
     self.count = self.count + 1
Example #7
def alignment_helper(file_list, target_path):
    """Downsample and perform cross-correlation on files relative
    to a target file to test if they are correctly aligned.

    Parameters
    ----------
    file_list : list
        List of files (i.e. stem_files, raw_files)
    target_path : str
        Filepath to compare files in file_list to.

    Returns
    -------
    status : bool
        True if the cross_correlation values are within a threshold, demonstrating
        that the files are correctly aligned.
    """
    sr = 1000
    output_handle = tempfile.NamedTemporaryFile(suffix='.wav')  
    output_path = output_handle.name

    # with pysox, effects must be added before build() for them to apply
    if len(file_list) > 1: 
        file_sum = sox.Combiner()
        file_sum.rate(sr, 'm')
        file_sum.build(
            file_list, output_path, 'mix'
        ) 
    else:
        file_sum = sox.Transformer()
        file_sum.rate(sr, 'm')
        file_sum.build(file_list[0], output_path)

    target_handle = tempfile.NamedTemporaryFile(suffix='.wav')
    target_handle_path = target_handle.name
    target_sum = sox.Transformer()
    target_sum.rate(sr, 'm')
    target_sum.build(target_path, target_handle_path)

    dur = get_length(target_path)
    offset = (dur/44100.0) / 2.0
    y_files, sr = librosa.load(output_path, sr=sr, offset = offset, duration=30.0)
    y_target, sr = librosa.load(target_handle_path, sr=sr, offset = offset, duration=30.0)

    correlation = np.correlate(y_files, y_target, 'full')

    N = len(y_target)
    a = np.arange(1, N + 1)
    b = np.arange(1, N)[::-1]
    c = np.concatenate((a, b)).astype(float)

    correlation = np.abs(correlation) / c
    center = N
    corr_index = np.argmax(correlation)

    return np.abs(corr_index - center) <= 5
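# Why the correlation is divided by c (illustrative sketch, not original code):
# np.correlate(..., 'full') sums over a varying number of overlapping samples
# per lag, so each lag is rescaled by its overlap count before taking argmax.
a = np.ones(4)
overlap = np.concatenate((np.arange(1, 5), np.arange(3, 0, -1))).astype(float)
print(np.correlate(a, a, 'full'))            # [1. 2. 3. 4. 3. 2. 1.]
print(np.correlate(a, a, 'full') / overlap)  # all ones once normalized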
Example #8
def loadAudio(mixedAudioPath,stemsPathList, sr = 44100):
    ''' Function to load mix and stem audio for a song.
    
    Input: 
        - mixedAudioPath (str): file path for mixed audio
        - stemsPathList (list): list of file path for stems
        - sr (int): sample rate 
        
    Returns:
        - mixAudio (np.Array): (length of audio,)
        - stemsAudio (np.Array): (length of audio, num stems)

    '''

    mixAudio, sr = librosa.load(mixedAudioPath, sr)
    mixAudio = np.array(mixAudio).T

    # load stems

    stems = []

    for path in stemsPathList:

        stems.append(librosa.load(path, sr)[0])

    stemsAudio = np.array(stems).T
    
    return mixAudio, stemsAudio
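# Usage sketch (hypothetical file names): the stems come back stacked along
# the second axis, one column per stem.
mixAudio, stemsAudio = loadAudio('mix.wav', ['vocals.wav', 'drums.wav'])
print(mixAudio.shape)    # (n_samples,)
print(stemsAudio.shape)  # (n_samples, 2)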
Example #9
 def run(self):
     # Initialize signals
     self.percentDoneSignal.emit(0)
     percent_scale = 1000.0 / 5
     self.doneSignal.emit(0)
     self.statusSignal.emit("")
     # Load in audio data
     self.statusSignal.emit("Loading {}".format(os.path.split(self.mix_file)[1]))
     mix, self.fs = librosa.load(self.mix_file, sr=None)
     self.percentDoneSignal.emit(1 * percent_scale)
     self.statusSignal.emit("Loading {}".format(os.path.split(self.source_file)[1]))
     source, self.fs = librosa.load(self.source_file, sr=self.fs)
     self.percentDoneSignal.emit(2 * percent_scale)
     # Fix any gross timing offset
     self.statusSignal.emit("Aligning...")
     mix, source = estimate.align(mix, source, self.fs)
     self.percentDoneSignal.emit(3 * percent_scale)
     self.statusSignal.emit("Subtracting...")
     source = estimate.reverse_channel(mix, source)
     mix, source = estimate.pad(mix, source)
     self.percentDoneSignal.emit(4 * percent_scale)
     self.statusSignal.emit("Enhancing...")
     self.subtracted = estimate.wiener_enhance(mix - source, source, self.wiener_threshold)
     self.percentDoneSignal.emit(5 * percent_scale)
     self.doneSignal.emit(1)
Example #10
def read_audio(current_file, sample_rate=None, mono=True):
    """Read audio file

    Parameters
    ----------
    current_file : dict
        Dictionary given by pyannote.database.
    sample_rate: int, optional
        Target sampling rate. Defaults to using native sampling rate.
    mono : bool, optional
        Convert multi-channel to mono. Defaults to True.

    Returns
    -------
    y : (n_samples, n_channels) np.array
        Audio samples.
    sample_rate : int
        Sampling rate.

    Notes
    -----
    In case `current_file` contains a `channel` key, data of this (1-indexed)
    channel will be returned.

    """

    # sphere files
    if current_file['audio'][-4:] == '.sph':

        # dump sphere file to a temporary wav file
        # and load it from here...
        from sphfile import SPHFile
        sph = SPHFile(current_file['audio'])
        with tempfile.NamedTemporaryFile() as f:
            sph.write_wav(f.name)
            y, sample_rate = librosa.load(f.name, sr=sample_rate, mono=False)

    # all other files
    else:
        y, sample_rate = librosa.load(current_file['audio'],
                                      sr=sample_rate,
                                      mono=False)

    # reshape mono files to (1, n) [was (n, )]
    if y.ndim == 1:
        y = y.reshape(1, -1)

    # extract specific channel if requested
    channel = current_file.get('channel', None)
    if channel is not None:
        y = y[channel - 1, :]

    # convert to mono
    if mono:
        y = librosa.to_mono(y)

    return y.T, sample_rate
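# Usage sketch (hypothetical file): pyannote.database normally supplies this
# dict; 'channel' is 1-indexed, as noted in the docstring.
current_file = {'audio': 'meeting.wav', 'channel': 1}
y, sample_rate = read_audio(current_file, sample_rate=16000, mono=True)
print(y.shape, sample_rate)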
Example #11
def test_ndarray_to_file():
    apply_audio_effects(mono, outfile)
    y, sr = lr.load(outfile, sr=None)
    lr.output.write_wav('test_ndarray_to_file_mono.wav', y, sr)
    assert lr.util.valid_audio(y)

    apply_audio_effects(stereo, outfile)
    y, sr = lr.load(outfile, sr=None, mono=False)
    lr.output.write_wav('test_ndarray_to_file_stereo.wav', y, sr)
    assert lr.util.valid_audio(y, mono=False)
Example #12
def test_segment_load():
    """
    Test loading a segment. Check size accuracy
    """

    sample_len = 2003
    y, sr = librosa.load('data/test1_44100.wav', sr=None, mono=False, offset=0., duration=sample_len/44100.)
    assert y.shape[1] == sample_len

    y, sr = librosa.load('data/test1_44100.wav', sr=None, mono=False, offset=2048/44100., duration=1.0)
    assert y.shape[1] == 44100
Example #13
def remove_umms(ns):
    if ns.output is None:
        ns.output = '{0}-umdone{1}'.format(*os.path.splitext(ns.input))
    x, sr = librosa.load(ns.input, mono=True, sr=None)
    bounds = segment.boundaries(x, sr, window_length=ns.window_length, 
                                threshold=ns.noise_threshold)
    mfccs, distances, categories = umdone.io.load(ns.train)
    matches = discover.match(x, sr, bounds, mfccs, distances, categories) 
    del x, sr, bounds, mfccs, distances, categories
    # read back in to preserve mono/stereo and levels on output
    x, sr = librosa.load(ns.input, mono=False, sr=None)
    y = segment.remove_slices(x.T, matches)
    librosa.output.write_wav(ns.output, y.T, sr, norm=False)
Example #14
    def plot(self):
        ''' plot sound wave '''
    
#        fs, x = wavread('output1.wav')
        y1, sr1 = librosa.load('output1.wav') 
        y2, sr2 = librosa.load('output2.wav') 
        
        self.figure.set_size_inches(4.5, 2.0)
        #plt.axis('off')
        ax1 = self.figure.add_subplot(2,1,1)
        ax1.axis('off')
        
        ax1.plot(y1)
        self.canvas.draw()
Example #15
def all_repet_params(fg_input_directory, fg_file_name_base, bg_input_directory, bg_file_name_base, output_directory, sample_rate):
    '''
    Creates all combinations of foreground and background files and runs a large series of REPET parameters on them

    Parameters
    ----------
    fg_input_directory : str
        input directory 
    fg_file_name_base : str
        the base string for a file name
    bg_input_directory : str
        input directory 
    bg_file_name_base : str
        the base string for a file name
    output_directory : str
        where the results will be stored
    sample_rate : int
        sample rate in number of samples per second
    '''
    window_sizes = [256, 512, 1024, 2048, 4096, 8192, 16384]
    window_types = [nussl.WindowType.HAMMING, nussl.WindowType.RECTANGULAR, nussl.WindowType.HANN, nussl.WindowType.BLACKMAN]
    
    for i in range(0, 5):
        for j in range(1, 5):
            fg_file_name = fg_file_name_base + ('%02d.wav' % i)
            bg_file_name = bg_file_name_base + ('%02d.wav' % j)
            fg, sr = librosa.load(os.path.join(fg_input_directory, fg_file_name), sr=sample_rate)
            bg, sr = librosa.load(os.path.join(bg_input_directory, bg_file_name), sr=sample_rate)

            bg = bg[:fg.shape[0]]
            mix = fg + bg

            # create the directory for the output files
            # new_directory = os.path.join(output_directory, 'fg-%02d-bg-%02d' % (i, j))

            # if not os.path.exists(new_directory):
            #     os.makedirs(new_directory)
            
            for window_size in window_sizes:
                for window_type in window_types:
                    bg_simple, fg_simple, bg_complex, fg_complex = run_repet(mix, window_size=window_size, window_type=window_type)

                    fg_simple_result = mir_eval.separation.bss_eval_sources(fg, fg_simple.audio_data)
                    bg_simple_result = mir_eval.separation.bss_eval_sources(bg, bg_simple.audio_data)

                    fg_complex_result = mir_eval.separation.bss_eval_sources(fg, fg_complex.audio_data)
                    bg_complex_result = mir_eval.separation.bss_eval_sources(bg, bg_complex.audio_data)

                    print('window_size:', window_size, 'window_type:', window_type,
                          'simple_bg_sdr:', bg_simple_result[0][0], 'simple_fg_sdr:', fg_simple_result[0][0],
                          'complex_bg_sdr:', bg_complex_result[0][0], 'complex_fg_sdr:', fg_complex_result[0][0])
Example #16
def all_nearest_neighbor(fg_input_directory, fg_file_name_base, bg_input_directory, bg_file_name_base, output_directory, sample_rate):
    '''
    Creates all combinations of foreground and background files, computes the
    beat spectrum for a range of REPET parameters, and stores candidate
    periods for nearest-neighbor lookup

    Parameters
    ----------
    fg_input_directory : str
        input directory 
    fg_file_name_base : str
        the base string for a file name
    bg_input_directory : str
        input directory 
    bg_file_name_base : str
        the base string for a file name
    output_directory : str
        where the results will be stored
    sample_rate : int
        sample rate in number of samples per second
    '''
    window_sizes = [256, 512, 1024, 2048, 4096, 8192]
    window_types = [nussl.WindowType.HAMMING, nussl.WindowType.RECTANGULAR, nussl.WindowType.HANN, nussl.WindowType.BLACKMAN]
    # window_sizes = [256]
    # window_types = [nussl.WindowType.HAMMING]
    # period = [i for i in range(0, 1000)]
    for i in range(0, 1):
        for j in range(1, 2):
            fg_file_name = fg_file_name_base + ('%02d.wav' % i)
            bg_file_name = bg_file_name_base + ('%02d.wav' % j)
            fg, sr = librosa.load(os.path.join(fg_input_directory, fg_file_name), sr=sample_rate)
            bg, sr = librosa.load(os.path.join(bg_input_directory, bg_file_name), sr=sample_rate)

            bg = bg[:fg.shape[0]]
            mix = fg + bg
            
            for window_size in window_sizes:
                for window_type in window_types:

                    bs, suggested_period = compute_beat_spectrum_and_suggested_period(mix, window_size=window_size, window_type=window_type)
                    period_min = suggested_period / 8
                    period_max = suggested_period

                    sd = beat_spectrum_std(bs)
                    tempo, beats = librosa.beat.beat_track(mix)

                    periods = [suggested_period / 8, suggested_period / 7, suggested_period / 6, suggested_period / 5, suggested_period / 4, suggested_period / 3, suggested_period / 2, suggested_period]
                    for period in periods:
                        
                        values = (window_size, window_type, period, sd, tempo, fg_file_name, bg_file_name, period_min, period_max, suggested_period)

                        print(values)
                        insert_nearest_neighbors(values)
def estimateValidSet(path, vio_W, cla_W, score_inf=None):
    for i in range(0, 5): 
        p = path + '0'
        x = str(i + 1)
        print('round : ' + x)

        valid_v = librosa.load(p + x + '_vio.wav', 44100)[0]
        valid_c = librosa.load(p + x + '_cla.wav', 44100)[0]
        valid_m = librosa.load(p + x + '_mix.wav', 44100)[0]
        sc = None

        if score_inf is not None:
            sc = score_inf[i]

        estimate(valid_m, vio_W, cla_W, valid_v, valid_c, x, sc)
Example #18
def get_features(collection='drum_samples_train',
                 features=('zero_crossing_rate', 'spectral_centroid'),
                 scaler=None):
    if collection == 'drum_samples_train':
        kick_filepaths, snare_filepaths = download_samples('drum_samples_train')
        kick_signals = [
            librosa.load(p)[0] for p in kick_filepaths
        ]
        snare_signals = [
            librosa.load(p)[0] for p in snare_filepaths
        ]

        kick_features = numpy.array([extract_features(x, features) for x in kick_signals])
        snare_features = numpy.array([extract_features(x, features) for x in snare_signals])
        feature_table = numpy.vstack((kick_features, snare_features))
        if scaler is None:
            scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(-1, 1))
            scaler.fit(feature_table)
        training_features = scaler.transform(feature_table)

        kick_labels = numpy.zeros(10)
        snare_labels = numpy.ones(10)
        training_labels = numpy.concatenate((kick_labels, snare_labels))

        return training_features, training_labels, scaler

    elif collection == 'drum_samples_test':
        kick_filepaths, snare_filepaths = download_samples('drum_samples_test')
        kick_signals = [
            librosa.load(p)[0] for p in kick_filepaths
        ]
        snare_signals = [
            librosa.load(p)[0] for p in snare_filepaths
        ]

        kick_features = numpy.array([extract_features(x, features) for x in kick_signals])
        snare_features = numpy.array([extract_features(x, features) for x in snare_signals])
        feature_table = numpy.vstack((kick_features, snare_features))
        if scaler is None:
            scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(-1, 1))
            scaler.fit(feature_table)
        test_features = scaler.transform(feature_table)

        kick_labels = numpy.zeros(30)
        snare_labels = numpy.ones(30)
        labels = numpy.concatenate((kick_labels, snare_labels))

        return test_features, labels, scaler
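# Usage sketch (not original code): fit the scaler on the training collection,
# then pass it back in so the test features are scaled identically.
train_X, train_y, scaler = get_features('drum_samples_train')
test_X, test_y, _ = get_features('drum_samples_test', scaler=scaler)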
Example #19
def test_resample_mono():

    def __test(y, sr_in, sr_out, res_type, fix):

        y2 = librosa.resample(y, sr_in, sr_out,
                              res_type=res_type,
                              fix=fix)

        # First, check that the audio is valid
        librosa.util.valid_audio(y2, mono=True)

        # If it's a no-op, make sure the signal is untouched
        if sr_out == sr_in:
            assert np.allclose(y, y2)

        # Check buffer contiguity
        assert y2.flags['C_CONTIGUOUS']

        # Check that we're within one sample of the target length
        target_length = y.shape[-1] * sr_out // sr_in
        assert np.abs(y2.shape[-1] - target_length) <= 1

    for infile in ['data/test1_44100.wav',
                   'data/test1_22050.wav',
                   'data/test2_8000.wav']:
        y, sr_in = librosa.load(infile, sr=None, duration=5)

        for sr_out in [8000, 22050]:
            for res_type in ['kaiser_best', 'kaiser_fast', 'scipy']:
                for fix in [False, True]:
                    yield (__test, y, sr_in, sr_out, res_type, fix)
Example #20
def test_tonnetz():
    y, sr = librosa.load(librosa.util.example_audio_file())
    tonnetz_chroma = np.load(os.path.join('tests', "data", "feature-tonnetz-chroma.npy"))
    tonnetz_msaf = np.load(os.path.join('tests', "data", "feature-tonnetz-msaf.npy"))

    # Use cqt chroma
    def __audio():
        tonnetz = librosa.feature.tonnetz(y=y, sr=sr)
        assert tonnetz.shape[0] == 6

    # Use pre-computed chroma
    def __stft():
        tonnetz = librosa.feature.tonnetz(chroma=tonnetz_chroma)
        assert tonnetz.shape[1] == tonnetz_chroma.shape[1]
        assert tonnetz.shape[0] == 6
        assert np.allclose(tonnetz_msaf, tonnetz)

    def __cqt():
        # Use high resolution cqt chroma
        chroma_cqt = librosa.feature.chroma_cqt(y=y, sr=sr, n_chroma=24)
        tonnetz = librosa.feature.tonnetz(chroma=chroma_cqt)
        assert tonnetz.shape[1] == chroma_cqt.shape[1]
        assert tonnetz.shape[0] == 6
        # Using stft chroma won't generally match cqt chroma
        # skip the equivalence check

    # Call the function with not enough parameters
    yield pytest.mark.xfail(librosa.feature.tonnetz, raises=librosa.ParameterError)
    yield __audio
    yield __stft
    yield __cqt
Example #21
def test_ifgram_matches_stft():

    y, sr = librosa.load('data/test1_22050.wav')

    def __test(n_fft, hop_length, win_length, center, norm, dtype):
        D_stft = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                              win_length=win_length, center=center,
                              dtype=dtype)

        _, D_ifgram = librosa.ifgram(y, sr, n_fft=n_fft,
                                     hop_length=hop_length,
                                     win_length=win_length, center=center,
                                     norm=norm, dtype=dtype)

        if norm:
            # STFT doesn't do window normalization;
            # let's just ignore the relative scale to make this easy
            D_stft = librosa.util.normalize(D_stft, axis=0)
            D_ifgram = librosa.util.normalize(D_ifgram, axis=0)

        assert np.allclose(D_stft, D_ifgram)

    for n_fft in [1024, 2048]:
        for hop_length in [None, n_fft // 2, n_fft // 4]:
            for win_length in [None, n_fft // 2, n_fft // 4]:
                for center in [False, True]:
                    for norm in [False, True]:
                        for dtype in [np.complex64, np.complex128]:
                            yield (__test, n_fft, hop_length, win_length,
                                   center, norm, dtype)
Example #22
    def __test(res_type):
        y_native, sr = librosa.load(librosa.util.example_audio_file(),
                                    sr=None,
                                    offset=offset,
                                    duration=duration,
                                    res_type=res_type)

        y2 = librosa.resample(y_native, sr, sr_target, res_type=res_type)

        y, _ = librosa.load(librosa.util.example_audio_file(),
                            sr=sr_target,
                            offset=offset,
                            duration=duration,
                            res_type=res_type)

        assert np.allclose(y2, y)
Example #23
def save_background(input_path, output_path, sample_rate, length=0, number_of_repeating_segments=0):
    '''
    Stitch together copies of a wave file to reach a specified length

    Parameters
    ----------
    input_path : str
        the path of the input file
    output_path : str
        the path of the output file
    sample_rate : int
        sample rate to load the input at
    length : int
        length in seconds of the output file
    number_of_repeating_segments : int
        alternatively, the number of times the input should be repeated
    '''
    print('loading bg file...')
    bg, sample_rate = librosa.load(input_path, sr=sample_rate)
    print('stitching bg file...')
    if length > 0:
        bg_length = bg.shape[0] / sample_rate
        number_of_segments = int(np.ceil(length / bg_length))
    elif number_of_repeating_segments > 0:
        number_of_segments = number_of_repeating_segments
    else:
        print('a length or number of repeating segments must be specified')
        return
        return
    
    result = bg
    for i in range(0, number_of_segments):
        result = np.concatenate((bg, result))

    print('writing bg...')
    wavwrite(output_path, result, sample_rate)
Example #24
    def __test_consistency(frame_length, hop_length, center):
        y, sr = librosa.load(__EXAMPLE_FILE, sr=None)

        # Ensure audio is divisible into frame size.
        y = librosa.util.fix_length(y, y.size - y.size % frame_length)
        assert y.size % frame_length == 0

        # STFT magnitudes with a constant windowing function and no centering.
        S = librosa.magphase(librosa.stft(y,
                                          n_fft=frame_length,
                                          hop_length=hop_length,
                                          window=np.ones,
                                          center=center))[0]

        # Try both RMS methods.
        rms1 = librosa.feature.rms(S=S, frame_length=frame_length,
                                   hop_length=hop_length)
        rms2 = librosa.feature.rms(y=y, frame_length=frame_length,
                                   hop_length=hop_length, center=center)

        assert rms1.shape == rms2.shape
        # Normalize envelopes.
        rms1 /= rms1.max()
        rms2 /= rms2.max()

        # Ensure results are similar.
        np.testing.assert_allclose(rms1, rms2, rtol=5e-2)
Example #25
def main():
    args = get_arguments()

    log.basicConfig(format='[%(asctime)s] [%(levelname)s] %(message)s',
                    level=LOG_LEVEL)
    log.info("Start of '{}'.".format(__file__))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
        log.info("Created output directory '{}'.".format(args.output_dir))

    for f in find_files(args.input_dir, '*.mid', path=False):
        midi_old = pretty_midi.PrettyMIDI(
            os.path.join(args.input_dir, f))
        midi_new = trim_midi_file(args.sec_from, args.sec_to, midi_old)
        midi_new.write(
            os.path.join(args.output_dir, f))
        log.info("Processed file '{}'".format(f))

    for f in find_files(args.input_dir, '*.wav', path=False):
        audio_old, sr = librosa.load(os.path.join(args.input_dir, f))
        audio_new = trim_audio_file(args.sec_from, args.sec_to, audio_old, sr)
        librosa.output.write_wav(
            os.path.join(args.output_dir, f), audio_new, sr)
        log.info("Processed file '{}'".format(f))

    log.info("End of '{}'.".format(__file__))
Example #26
    def analyze(self, songname,songpath, setName):
        # Step one. Read in an audio file, extract all features for the song, 
        # extract phrase boundaries, and store features into small seg object
        y, sr = librosa.load(songpath)
        # Separate harmonics and percussives into two waveforms
        y_harmonic, y_percussive = librosa.effects.hpss(y)
        # Beat track on the percussive signal
        tempo, beat_frames = librosa.beat.beat_track(y=y_percussive, sr=sr)
        # Step two. Get phrase boundary
        # Extract Phrase info TO-DO: automatically
        phraseLabName = '../dataset/Annotations/myphraselab/%s/%s.txt' % (setName, songname)
        phraseBound = [float(l) for l in open(phraseLabName).read().splitlines()]

        for v in range(0,len(phraseBound)-1):
            # start time, and end time in second
            st = int(phraseBound[v] * 1. * sr)
            ed = int(phraseBound[v+1] * 1. * sr)
            currSeg = Seg(songname, st, ed, y[st:ed], y_harmonic[st:ed], y_percussive[st:ed], sr)
            currSeg.get_seg_feature()
            currSeg.idx = self.countSeg
            self.countSeg += 1
            self.segments.append(currSeg)
        self.mfcc_size = currSeg.mfcc_size            
        self.chroma_size = currSeg.chroma_size
        self.rms_size = 1
  def __init__(self, folder, transform=None, classes=CLASSES,
               silence_percentage=0.1, sample_rate=16000):
    all_classes = [d for d in os.listdir(folder) if
                   os.path.isdir(os.path.join(folder, d)) and not d.startswith(
                     '_')]
    for c in classes[2:]:
      assert c in all_classes

    class_to_idx = {classes[i]: i for i in range(len(classes))}
    for c in all_classes:
      if c not in class_to_idx:
        print ("Class ", c, "assigned as unknown")
        class_to_idx[c] = 0
    data = []
    for c in all_classes:
      d = os.path.join(folder, c)
      target = class_to_idx[c]
      for f in os.listdir(d):
        path = os.path.join(d, f)
        samples, sample_rate = librosa.load(path, sr=sample_rate)
        audio = {'samples': samples, 'sample_rate': sample_rate}
        data.append((audio, target))

    # add silence
    target = class_to_idx['silence']
    samples = np.zeros(sample_rate, dtype=np.float32)
    silence = {'samples': samples, 'sample_rate': sample_rate}
    data += [(silence, target)] * int(len(data) * silence_percentage)

    self.classes = classes
    self.data = data
    self.transform = transform
Example #28
def test_piptrack_properties():

    def __test(S, n_fft, hop_length, fmin, fmax, threshold):

        pitches, mags = librosa.core.piptrack(S=S,
                                              n_fft=n_fft,
                                              hop_length=hop_length,
                                              fmin=fmin,
                                              fmax=fmax,
                                              threshold=threshold)

        # Shape tests
        eq_(S.shape, pitches.shape)
        eq_(S.shape, mags.shape)

        # Make sure all magnitudes are positive
        assert np.all(mags >= 0)

        # Check the frequency estimates for bins with non-zero magnitude
        idx = (mags > 0)
        assert np.all(pitches[idx] >= fmin)
        assert np.all(pitches[idx] <= fmax)

        # And everywhere else, pitch should be 0
        assert np.all(pitches[~idx] == 0)

    y, sr = librosa.load('data/test1_22050.wav')

    for n_fft in [2048, 4096]:
        for hop_length in [None, n_fft // 4, n_fft // 2]:
            S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))
            for fmin in [0, 100]:
                for fmax in [4000, 8000, sr // 2]:
                    for threshold in [0.1, 0.2, 0.5]:
                        yield __test, S, n_fft, hop_length, fmin, fmax, threshold
Example #29
def build_output(times, quantized_times, labels, kit, file_path, inputLength, quantized=False):
    # check for empty arrays
    if not times or not labels:
        return False
    labels = [label[0] for label in labels]
    # replace beatbox with drums
    drums = []
    label_to_kit = {}
    for label in labels:
        if label in label_to_kit:
            drum = label_to_kit[label]
        else:
            drum, ssr = librosa.load('../kits/'+kit+'/'+label+'.wav', sr=None)
            label_to_kit[label] = drum
        drums.append(drum)

    # reconstruct signal from replaced sounds
    if quantized:
        result = reconstructor.replace(quantized_times, drums, ssr, inputLength)
    else:
        result = reconstructor.replace(times, drums, ssr, inputLength)

    # write output signal to .wav
    librosa.output.write_wav(file_path[:-4]+'-out.wav', result, ssr)
    return result, ssr
Example #30
def process_one_file(mp3_filename, skip=True):
    '''
    Load in an mp3, get the features, and write the features out

    :parameters:
        - mp3_filename : str
            Path to an mp3 file
        - skip : bool
            Whether to skip files when the h5 already exists
    '''
    # h5 files go in the 'h5' dir instead of 'mp3'
    output_filename = mp3_filename.replace('mp3', 'h5')
    # Skip files already created
    if skip and os.path.exists(output_filename):
        return
    try:
        # Load audio and compute CQT
        audio_data, _ = librosa.load(
            mp3_filename, sr=feature_extraction.AUDIO_FS)
        cqt = feature_extraction.audio_cqt(audio_data)
        # Create subdirectories if they don't exist
        if not os.path.exists(os.path.split(output_filename)[0]):
            os.makedirs(os.path.split(output_filename)[0])
        # Save CQT
        deepdish.io.save(output_filename, {'gram': cqt})
    except Exception as e:
        print "Error processing {}: {}".format(
            mp3_filename, traceback.format_exc(e))
def compute_STFT_data_from_file_list(wavfile_list,
                                     fs=16000,
                                     wlen_sec=0.032,
                                     hop_percent=0.5,
                                     zp_percent=0,
                                     trim=False,
                                     top_db=60,
                                     out_file=None):
    """
    Compute short-term Fourier transform (STFT) power and phase spectrograms from a list of wav files, 
    and save them to a pickle file.
    
    Parameters
    ----------
    
    wavfile_list                List of wav files
    fs                          Sampling rate
    wlen_sec                    STFT window length in seconds
    hop_percent                 Hop size as a percentage of the window length
    zp_percent                  Zero-padding size as a percentage of the window length
    trim                        Boolean indicating if leading and trailing silences should be trimmed
    top_db                      The threshold (in decibels) below reference to consider as silence (see librosa doc)
    out_file                    Path to the pickle file for saving the data
    
    Returns
    -------
    
    data                        A list of dictionaries, the length of the list is the same as 'wavfile_list' 
                                Each dictionary has the following fields:
                                        'file': The wav file name
                                        'power_spectrogram': The power spectrogram
                                        'phase_spectrogram': The phase spectrogram
    
    Examples
    --------
    
    fs = 16e3 # Sampling rate
    wlen_sec = 64e-3 # STFT window length in seconds
    hop_percent = 0.25  # hop size as a percentage of the window length
    trim=False
    data_folder = '/local_scratch/sileglai/datasets/clean_speech/TIMIT/TEST'
    test_file_list = librosa.util.find_files(data_folder, ext='wav') 
    data = compute_STFT_data_from_file_list(test_file_list, fs=fs, wlen_sec=wlen_sec,
                                            hop_percent=hop_percent, trim=trim, zp_percent=0,
                                            out_file='test_compute_data.pckl')
    
    """

    # STFT parameters
    wlen = wlen_sec * fs  # window length in samples
    wlen = int(np.power(2, np.ceil(np.log2(wlen))))  # next power of 2
    hop = int(hop_percent * wlen)  # hop size
    nfft = int(wlen + zp_percent * wlen)  # number of points of the discrete Fourier transform
    win = np.sin(np.arange(.5, wlen - .5 + 1) / wlen * np.pi)
    # sine analysis window

    fs_orig = librosa.load(wavfile_list[0], sr=None)[1]  # Get sampling rate

    data = [None] * len(
        wavfile_list)  # Create an empty list that will contain dictionaries

    for n, wavfile in enumerate(wavfile_list):

        path, file_name = os.path.split(wavfile)

        if fs == fs_orig:
            x = librosa.load(wavfile,
                             sr=None)[0]  # Load wav file without resampling
        else:
            print('resampling while loading with librosa')
            x = librosa.load(wavfile,
                             sr=fs)[0]  # Load wav file with resampling

        if trim:
            x = librosa.effects.trim(
                x, top_db=top_db)[0]  # Trim leading and trailing silences

        T_orig = len(x)
        x_pad = librosa.util.fix_length(
            x, T_orig +
            wlen // 2)  # Padding for perfect reconstruction (see librosa doc)

        X = librosa.stft(x_pad,
                         n_fft=nfft,
                         hop_length=hop,
                         win_length=wlen,
                         window=win)  # STFT
        X_abs_2 = np.abs(X)**2  # Power spectrogram
        X_angle = np.angle(X)

        data[n] = {
            'file': file_name,
            'power_spectrogram': X_abs_2,
            'phase_spectrogram': X_angle
        }

    with open(out_file, 'wb') as f:
        pickle.dump([data, fs, wlen_sec, hop_percent, trim], f)

    return data
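# Reconstruction sketch (an addition, not original code; assumes the wlen, hop
# and win values computed above): the saved power and phase spectrograms are
# enough to resynthesize each signal with librosa's inverse STFT.
X_rec = np.sqrt(data[0]['power_spectrogram']) * np.exp(1j * data[0]['phase_spectrogram'])
x_rec = librosa.istft(X_rec, hop_length=hop, win_length=wlen, window=win)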
def compute_STFT_data_from_file_list_TIMIT(wavfile_list,
                                           fs=16000,
                                           wlen_sec=0.032,
                                           hop_percent=0.5,
                                           zp_percent=0,
                                           trim=False,
                                           verbose=False,
                                           out_file=None):
    """
    Same as 'compute_STFT_data_from_file_list' function except that specific fields related to TIMIT are added to the returned and saved dictionaries.
    """

    # STFT parameters
    wlen = wlen_sec * fs  # window length in samples
    wlen = int(np.power(2, np.ceil(np.log2(wlen))))  # next power of 2
    hop = int(hop_percent * wlen)  # hop size
    nfft = int(wlen + zp_percent * wlen)  # number of points of the discrete Fourier transform
    win = np.sin(np.arange(.5, wlen - .5 + 1) / wlen * np.pi)
    # sine analysis window

    fs_orig = librosa.load(wavfile_list[0], sr=None)[1]  # Get sampling rate

    data = [None] * len(
        wavfile_list)  # Create an empty list that will contain dictionaries

    for n, wavfile in enumerate(wavfile_list):

        path, file_name = os.path.split(wavfile)
        path, speaker = os.path.split(path)
        path, dialect = os.path.split(path)
        path, set_type = os.path.split(path)

        if verbose:
            print('processing %s/%s/%s/%s\n' %
                  (set_type, dialect, speaker, file_name))

        if fs == fs_orig:
            x = librosa.load(wavfile,
                             sr=None)[0]  # Load wav file without resampling
        else:
            print('resampling while loading with librosa')
            x = librosa.load(wavfile,
                             sr=fs)[0]  # Load wav file with resampling

        if trim:
            with open(
                    os.path.join(path, set_type, dialect, speaker,
                                 file_name[:-4] + '.PHN'), 'r') as f:
                first_line = f.readline()  # Read the first line
                for last_line in f:  # Loop through the whole file reading it all
                    pass

            if '#' not in first_line or '#' not in last_line:
                raise NameError(
                    'The first or last lines of the .phn file should contain #'
                )

            ind_beg = int(first_line.split(' ')[1])
            ind_end = int(last_line.split(' ')[0])
            x = x[ind_beg:ind_end]

        T_orig = len(x)
        x_pad = librosa.util.fix_length(
            x, T_orig +
            wlen // 2)  # Padding for perfect reconstruction (see librosa doc)

        X = librosa.stft(x_pad,
                         n_fft=nfft,
                         hop_length=hop,
                         win_length=wlen,
                         window=win)  # STFT
        X_abs_2 = np.abs(X)**2  # Power spectrogram
        X_angle = np.angle(X)

        data[n] = {
            'set': set_type,
            'dialect': dialect,
            'speaker': speaker,
            'file': file_name,
            'power_spectrogram': X_abs_2,
            'phase_spectrogram': X_angle
        }

    with open(out_file, 'wb') as f:
        pickle.dump([data, fs, wlen_sec, hop_percent, trim], f)

    return data
Example #33
# In[12]:


##################################################### download a video from YouTube
from pytube import YouTube

yt = YouTube('https://www.youtube.com/watch?v=vPaBI_IQoJk&ab_channel=FuerzaPopular')
stream = yt.streams.get_by_itag('251')
stream.download()


# In[10]:


import librosa
data, fs = librosa.load('pasos/keiko.wav')
print(data)
print("####################################################")
print(fs)


# In[2]:


########################################################## segment the audio
from pydub import AudioSegment
t1 = 3 * 1000 #Works in milliseconds
t2 = 6 * 1000
newAudio = AudioSegment.from_wav("pasos/keiko.wav")
newAudio = newAudio[t1:t2]
newAudio.export('pasos/newSong.wav', format="wav") #Exports to a wav file in the current path.
Example #34
def read_audio_spectrum(filename):
    signal, fs = librosa.load(filename)
    S = librosa.stft(signal, N_FFT)
    final = np.log1p(np.abs(S[:, :430]))
    return final, fs
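# Inversion sketch (an addition, not original code): np.expm1 undoes the log1p
# above; the discarded phase is re-estimated with a Griffin-Lim-style loop.
def spectrum_to_audio(mag, n_iter=50):
    angles = np.exp(2j * np.pi * np.random.rand(*mag.shape))
    for _ in range(n_iter):
        audio = librosa.istft(np.expm1(mag) * angles)
        angles = np.exp(1j * np.angle(librosa.stft(audio, N_FFT)))
    return librosa.istft(np.expm1(mag) * angles)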
Example #35
def load_wav(filename, sample_rate):
	audio, sr = librosa.load(filename, sr=sample_rate, mono=True)
	audio = audio.flatten()
	return audio
        conn = sqlite3.connect(database)
        c = conn.cursor()
        
        
        print("Database will be saved as: {}".format(database))
        print("Noisegroup of collected MFCCs: {}".format(noisegroup))
        print("Noise wavefile: {}".format(environment_noise))
        print("Number of MFCCs to be extracted: {}".format(num_mfcc))
        
        check_variables = input("\nIMPORTANT!!!!\nAre the items listed above correct? (Y or N): ")
        if 'y' in check_variables.lower():

            #load environment noise to be added to training data
            if environment_noise: 
                try:
                    env_noise = librosa.load(environment_noise)[0]
                except FileNotFoundError as fnf:
                    print("\nCannot find {} in cwd.\n".format(environment_noise))
                    raise fnf
            else:
                env_noise = None

            columns = list((range(0,num_mfcc)))
            column_type = []
            for i in columns:
                column_type.append('"'+str(i)+'" REAL')


            c.execute(''' CREATE TABLE IF NOT EXISTS mfcc_40(%s,filename  TEXT, noisegroup TEXT, noiselevel REAL, dataset INT, speaker INT, intensity INT, statement INT, repetition INT, speaker_sex INT, label INT) ''' % ", ".join(column_type))
            conn.commit()
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 19 08:49:04 2021

@author: CS
"""

# Resample the audio files to a uniform 44.1 kHz sampling rate

import librosa
import numpy as np
import soundfile as sf
import os

file_path = 'D:/Project/DCASE_test/Data/Data_ShipsEar/'
sr_output = 44100
out_path = 'D:/Project/DCASE_test/Data/test/'

file_list = os.listdir(file_path)
for file in file_list:
    wav_path = file_path + file
    data, sr = librosa.load(wav_path, sr=None)

    data_output = librosa.resample(data.astype(np.float32), sr, sr_output)

    out_name = out_path + file
    sf.write(out_name, data_output, sr_output)
    https://colab.research.google.com/drive/1LZ9BX53fhzO6o-zbbmju_Mi6HV1yu7GK
"""

import librosa
import matplotlib.pyplot as plt
import librosa.display
#part1 - preprocessing the signal
#---------------------------------------------------------------------------
#An audio clip (Music-Jazz) was played for 30 secs and recorded.
#More than 1.5 million samples were collected at a sampling frequency of 48000.
#So 48000 samples are collected in one second, which means the total length of
#the signal in the time domain is approximately 32 secs. For analysis we need
#only about 50,000 samples, so we need to downsample the signal by
#1516541/50000 == 30.33, or approximately 31.
x, sr = librosa.load('/content/jazz-mp3.mp3')
print(sr)
print(len(x))
#length comes out to be 696663, far greater than 50,000. We see that sr is 22050, meaning that in 1 second
#22050 samples are collected; this shows that the music ran for 696663/22050 secs == 31.6 secs, which is true.
#to reduce the no of samples we have to change the sampling rate.
#To find the new sampling rate
audio_time = len(x) // sr
new_sr = 50000 // audio_time
print(new_sr)
#new_sr comes out to 1612 (50000 // 31)
#Load the signal again but this with the new sr
x, sr = librosa.load('/content/jazz-mp3.mp3', sr=new_sr)
print(len(x))
#the number of samples is now approximately 50,000.
#plotting the graph
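# Alternative sketch (an addition): instead of re-reading the file at a new
# rate, the first signal could be resampled in memory to the same effect.
x_orig, sr_orig = librosa.load('/content/jazz-mp3.mp3')
x_small = librosa.resample(x_orig, sr_orig, new_sr)
print(len(x_small))  # roughly 50,000 samples, matching the reloaded version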
Example #39
def load_wav(path) :
    return librosa.load(path, sr=hp.sample_rate)[0]
Example #40
import os
import json
from argparse import Namespace

import numpy as np
import tensorflow as tf
import librosa

from wavenet.wavenet import Wavenet
from wavenet.parallel_wavenet import ParallelWavenet  # assumed import path for ParallelWavenet
from auxilaries import mel_extractor

os.environ['CUDA_VISIBLE_DEVICES'] = ''
with open('../config_jsons/wavenet_mol.json', 'rt') as F:
    te_configs = json.load(F)
te_hparams = Namespace(**te_configs)
teacher_wavenet = Wavenet(te_hparams)

with open('../config_jsons/parallel_wavenet.json', 'rt') as F:
    configs = json.load(F)
hparams = Namespace(**configs)
parallel_wavenet = ParallelWavenet(hparams, teacher=teacher_wavenet)

seq_len = 7680
wav_val, _ = librosa.load('test_data/test.wav', sr=16000)
batch_size = 4
wav_val = wav_val[:batch_size * seq_len].reshape([batch_size, seq_len])

wav_shape = [batch_size, seq_len]
mel_shape = [batch_size, 39, 80]
mel_val = np.zeros(mel_shape)
for i in range(batch_size):
    mel_val[i] = mel_extractor.melspectrogram(wav_val[i])

mel_ph = tf.placeholder(tf.float32, mel_shape, name='mel_ph')
wav_ph = tf.placeholder(tf.float32, wav_shape, name='wav_ph')
inputs = {'mel': mel_ph, 'wav': wav_ph}

tf.set_random_seed(12345)
pff_dict = parallel_wavenet.feed_forward(inputs)
Example #41
"""

import glob
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import specgram
from librosa.display import waveplot
plt.close('all')

sound_file_paths = "a039_10_20_forest_path.wav"  #"a038_30_40_home.wav"

parent_dir = 'small_data/'

X, sr = librosa.load(os.path.join(parent_dir, sound_file_paths))

S = librosa.feature.melspectrogram(X, sr=sr, n_mels=128)

# Convert to log scale (dB). We'll use the peak power as reference.
log_S = librosa.logamplitude(S, ref_power=np.max)

# Make a new figure
plt.figure()
librosa.display.waveplot(
    np.array(X),
    sr=22050,
)

plt.figure()
librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel')
Example #42
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import tensorflow
import sklearn

from keras.layers import Dense, Conv1D, MaxPool1D,\
    Input, BatchNormalization, Activation
from keras.models import Sequential, Model

# load data
# y1, sr1 = librosa.load(
#     'c:/nmb/nmb_data/F1_high.wav'
# ) # female speaker

y2, sr2 = librosa.load('c:/nmb/nmb_data/M2_low.wav')  # male speaker


# helper functions (noise generation, normalization)
def noising(data, noise_factor):
    noise = np.random.randn(len(data))
    augmented_data = data + noise_factor * noise
    augmented_data = augmented_data.astype(type(data[0]))
    return augmented_data


def normalize(data, axis=0):
    return sklearn.preprocessing.minmax_scale(data, axis=axis)


# mix the original audio with synthesized noise
Example #43
def gen_chroma_stft(file_path):
    y, sr = librosa.load(get_binary_data_from_gcs(file_path), sr=None)
    return librosa.feature.chroma_stft(y, sr)
import librosa

filename = "C:\\Users\\mwang\\sheet_music_project\\music\\fur_elise\\fur_elise.mp3"
# y, sr = librosa.load(filename)
# filename = librosa.util.example_audio_file()
print(filename)
y, sr = librosa.load(filename)
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
print('Estimated tempo: {:.2f} beats per minute'.format(tempo))
for o, a in myOpts:
    if o == '-i':
        json_file = a

with open(json_file) as file:
    cp = json.load(file)

try:
    x_train = np.load("x_train.npy")
except IOError:
    # Build training set
    audio_files = get_audiofiles(cp["audio_folder"])
    y_train = np.array([])
    x_train = np.array([])
    for file in audio_files:
        audio_samples, sample_rate = librosa.load(file)
        audio_samples = librosa.resample(audio_samples, sample_rate,
                                         cp["sample_rate"])
        window_size = int(cp["short_term"] * cp["sample_rate"])
        step_size = int(cp["step_size"] * cp["sample_rate"])
        print("Window_size: ", window_size, "Step_size: ", step_size)
        no_of_samples = int(
            (audio_samples.shape[0] - window_size) / step_size) - 1
        # dt = time between each feature vector, at the resampled rate
        dt = step_size / cp["sample_rate"]
        print("Extracting features from ", file, "# samples: ",
              audio_samples.shape, " sr: ", cp["sample_rate"], " dt: ", dt,
              "# features: ", no_of_samples)

        for i in range(no_of_samples):
            y = audio_samples[(i * step_size):(i * step_size + window_size)]
Example #46
def loadAudio(audioPath, offset=0.0, duration=None):
    y, sr = librosa.load(audioPath, offset=offset, duration=duration)
    return y, sr
Example #47
"""
Abstract : Librosa is a package which serves to study audio files
with the help of this package we'll load a function that will read in the path to an audio file, 
using two functions call .amplitude and .stft with the help of matplotlib package 
to mash up frequency and amplitude into a resulting array ploted graph
(NOTE : The default sampling rate used by Librosa is 22050, but I used 44100 due to the sample's lenght)
"""
import librosa
import numpy as np
import matplotlib.pyplot as plt

# Wav file path

fichier = (r"C:\Users\Adam\Desktop\audio.wav")
y, sr = librosa.load(fichier, sr=44100)

# Size of the fft

n_fft = 2048
S = librosa.stft(y, n_fft=n_fft, hop_length=n_fft // 2)
"""
hop_length : The number of samples between successive frames
n_fft and hop length determine frequency in function of time resolution converted to dB
"""
D = librosa.amplitude_to_db(np.abs(S), ref=np.max)

# Calculate average over file

D_AVG = np.mean(D, axis=1)
plt.bar(np.arange(D_AVG.shape[0]), D_AVG)
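# Labeling sketch (an addition): the bar positions above are raw FFT bin
# indices; librosa.fft_frequencies maps them to hertz for a readable axis.
freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
plt.bar(freqs, D_AVG, width=freqs[1] - freqs[0])
plt.xlabel('Frequency (Hz)')
plt.ylabel('Amplitude (dB)')
plt.show()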
"""
 def get_mag(url):
     y, _ = librosa.load(url, sr=sample_rate)
     complex_spec = librosa.core.stft(y, n_fft=n_fft, win_length=n_fft, hop_length=hop_length)
     mag_spec = np.abs(complex_spec)
     if use_log:  mag_spec = db_func(mag_spec)
     return mag_spec
Example #49
def extract_features(dataset='train'):
    f = open(data_path + dataset + '_list.txt', 'r')

    i = 0
    for file_name in f:
        # progress check

        # load audio file
        file_name = file_name.rstrip('\n')
        file_path = data_path + file_name

        y, sr = librosa.load(file_path, sr=22050)

        S = librosa.core.stft(y, n_fft=1024, hop_length=512, win_length=1024)

        D_harmonic, D_percussive = librosa.decompose.hpss(S)

        #Hmag = librosa.amplitude_to_db(D_harmonic)
        #Pmag = librosa.amplitude_to_db(D_percussive)
        D_H = np.abs(D_harmonic)**2
        D_P = np.abs(D_percussive)**2
        # mel spectrogram (512 --> 40)
        mel_basis = librosa.filters.mel(sr, 1024, n_mels=40)
        mel_H = np.dot(mel_basis, D_H)
        mel_P = np.dot(mel_basis, D_P)
        #log compression
        log_mel_H = librosa.power_to_db(mel_H)
        log_mel_P = librosa.power_to_db(mel_P)

        # mfcc (DCT)
        mfcc_H = librosa.feature.mfcc(S=log_mel_H, n_mfcc=13)
        mfcc_H_delta = librosa.feature.delta(mfcc_H)
        mfcc_H_delta2 = librosa.feature.delta(mfcc_H, order=2)
        mfcc_H = np.concatenate((mfcc_H, mfcc_H_delta, mfcc_H_delta2), axis=0)
        mfcc_H = mfcc_H.astype(np.float32)

        mfcc_P = librosa.feature.mfcc(S=log_mel_P, n_mfcc=13)
        mfcc_P_delta = librosa.feature.delta(mfcc_P)
        mfcc_P_delta2 = librosa.feature.delta(mfcc_P, order=2)
        mfcc_P = np.concatenate((mfcc_P, mfcc_P_delta, mfcc_P_delta2), axis=0)
        mfcc_P = mfcc_P.astype(np.float32)
        # to save the memory (64 to 32 bits)
        file_name = file_name.replace('.wav', '.npy')
        save_file_H = mfcc_path + 'Harmonic/' + file_name
        save_file_P = mfcc_path + 'Percussive/' + file_name

        if not os.path.exists(os.path.dirname(save_file_H)):
            os.makedirs(os.path.dirname(save_file_H))
        if not os.path.exists(os.path.dirname(save_file_P)):
            os.makedirs(os.path.dirname(save_file_P))
        np.save(save_file_H, mfcc_H)
        np.save(save_file_P, mfcc_P)

        rmse = librosa.feature.rmse(S=S)[0]
        save_file_r = rmse_path + file_name
        if not os.path.exists(os.path.dirname(save_file_r)):
            os.makedirs(os.path.dirname(save_file_r))
        np.save(save_file_r, rmse)

        i = i + 1
        if not (i % 10):
            print(i)

    f.close()
def load_file(input_filename, mono=True, sr=22050):
    # if mono is True, returns samples of shape (n, )
    # else returns samples of shape (2, n)
    # sample rate is the number of samples per second: sr=None keeps the file's native rate; librosa's default is 22050
    x, sr = librosa.load(input_filename, mono=mono, sr=sr)
    return x, sr
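# Usage sketch (hypothetical file): shapes for the two mono settings.
x_mono, sr = load_file('clip.wav', mono=True)     # x_mono.shape == (n,)
x_stereo, sr = load_file('clip.wav', mono=False)  # x_stereo.shape == (2, n)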
Example #51
words, _ = zip(*count_pairs)
words_size = len(words)
print('vocabulary size:', words_size)
 
word_num_map = dict(zip(words, range(len(words))))
to_num = lambda word: word_num_map.get(word, len(words))
labels_vector = [ list(map(to_num, label)) for label in labels]
#print(wavs_file[0], labels_vector[0])
#wav/train/A11/A11_0.WAV -> [479, 0, 7, 0, 138, 268, 0, 222, 0, 714, 0, 23, 261, 0, 28, 1191, 0, 1, 0, 442, 199, 0, 72, 38, 0, 1, 0, 463, 0, 1184, 0, 269, 7, 0, 479, 0, 70, 0, 816, 254, 0, 675, 1707, 0, 1255, 136, 0, 2020, 91]
#print(words[479])  # '绿' ('green')
label_max_len = np.max([len(label) for label in labels_vector])
print('character count of the longest sentence:', label_max_len)
 
wav_max_len = 0  # 673
for wav in wav_files:
    wav, sr = librosa.load(wav, mono=True)
    mfcc = np.transpose(librosa.feature.mfcc(wav, sr), [1,0])
    if len(mfcc) > wav_max_len:
        wav_max_len = len(mfcc)
print("最长的语音:", wav_max_len)
 
batch_size = 16
n_batch = len(wav_files) // batch_size
 
# fetch one batch
pointer = 0
def get_next_batches(batch_size):
    global pointer
    batches_wavs = []
    batches_labels = []
    for i in range(batch_size):
Example #52
def getBeats(audiosrc):
    y, sr = librosa.load(audiosrc)
    tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
    ts = librosa.frames_to_time(beats, sr=sr)
    return(ts)
Example #53
def make_spectrum(filename=None,
                  y=None,
                  is_slice=False,
                  feature_type='logmag',
                  mode=None,
                  FRAMELENGTH=None,
                  SHIFT=None,
                  _max=None,
                  _min=None):
    '''
    Return:
    Sxx   = [F, T] (is_slice==False) or [T//FRAMELENGTH, F, FRAMELENGTH] (is_slice==True)
    phase = [F, T] (is_slice==False) or [T//FRAMELENGTH, F, FRAMELENGTH] (is_slice==True)
    y     = y

    '''
    if y is None:
        y, sr = librosa.load(filename, sr=16000)
        if sr != 16000:
            raise ValueError('Sampling rate is expected to be 16kHz!')
        if y.dtype == 'int16':
            y = np.float32(y / 32767.)
        elif y.dtype != 'float32':
            y = np.float32(y)

    ### Normalize waveform
    # y = y / np.max(abs(y)) / 2.

    D = librosa.stft(y,
                     center=False,
                     n_fft=hp.n_fft,
                     hop_length=hp.hop_length,
                     win_length=hp.n_fft,
                     window=scipy.signal.hamming)
    utt_len = D.shape[-1]
    phase = np.exp(1j * np.angle(D))
    D = np.abs(D)

    ### Feature type
    if feature_type == 'logmag':
        Sxx = np.log1p(D)
    elif feature_type == 'lps':
        Sxx = np.log10(D**2)
    else:
        Sxx = D

    if mode == 'mean_std':
        mean = np.mean(Sxx, axis=1).reshape(((hp.n_fft // 2) + 1, 1))
        std = np.std(Sxx, axis=1).reshape(((hp.n_fft // 2) + 1, 1)) + 1e-12
        Sxx = (Sxx - mean) / std
    elif mode == 'minmax':
        Sxx = 2 * (Sxx - _min) / (_max - _min) - 1

    # print("noisy_spec.shape before slice_pad:",Sxx.shape)
    if is_slice:
        Sxx = slice_pad(Sxx, SHIFT, seg_size=FRAMELENGTH, pad_value=0)
    # print("noisy_spec.shape after slice_pad:",Sxx.shape)

    return Sxx, phase, y
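# Hedged inverse sketch (not part of the original module): resynthesize a
# waveform from a 'logmag' spectrum and its phase, assuming mode=None and
# is_slice=False. np.expm1 undoes the np.log1p above; hp.n_fft and
# hp.hop_length are the same config values make_spectrum uses.
def spectrum_to_wav(Sxx, phase):
    D = np.expm1(Sxx) * phase  # back to a complex STFT matrix
    return librosa.istft(D,
                         center=False,
                         hop_length=hp.hop_length,
                         win_length=hp.n_fft,
                         window=scipy.signal.hamming)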
Example #54
# coding: utf-8
from __future__ import unicode_literals
import numpy as np
import librosa
import os

audio_file_name = "./piano/piano_1.mp3"
# read the audio file
y, sr = librosa.load(audio_file_name)
# extract MFCCs
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_fft=2048, n_mfcc=20)
# roughly 20 x 43 x (length in seconds); about 43 frames are extracted per second
# check the shape
print(mfcc.shape)
# extract the MFCC deltas
mfcc_delta = librosa.feature.delta(mfcc)
print(mfcc_delta.shape)
# concatenate them
mfcc_and_delta = np.concatenate((mfcc, mfcc_delta), axis=0)
print(mfcc_and_delta.shape)
# transpose the axes
mfcc_and_delta = mfcc_and_delta.T
print(mfcc_and_delta.shape)
# the final shape should be (43 * length, 40)


overall_length = mfcc_and_delta.shape[0]  # 43 * seconds
# slice into 10-second windows (10 * 43 = 430 frames)
current_time = 0
window_length = 10*43
X = []
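# Hedged sketch of the slicing loop this snippet sets up (the original is cut
# off here): step through the feature matrix in non-overlapping 10-second
# windows; dropping the final partial window is an assumption.
while current_time + window_length <= overall_length:
    X.append(mfcc_and_delta[current_time:current_time + window_length])
    current_time += window_length
print(len(X))  # number of complete 10-second segments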
Example #55
def load_sound_files(file_paths):
    raw_sounds = []
    for fp in file_paths:
        X, sr = librosa.load(fp)
        raw_sounds.append(X)
    return raw_sounds
Example #56
import math
import random

import librosa

num_of_test_samples = 300
batch_size = 64
no_epochs = 50
input_shape = 64

# num_of_samples (clips per digit class) is used below but never defined in
# this snippet; it presumably comes from earlier in the original script.
data_dir = 'data/'

dataset = []

### PREPARING THE DATASET ###

for i in range(10):
    directory = data_dir + str(i) + "/" + str(i)

    for j in range(1, num_of_samples + 1):
        y, sr = librosa.load(directory + " (" + str(j) + ").wav")
        no_samples = len(y)
        #spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, hop_length= math.floor(no_samples/128.))
        spectrogram = librosa.feature.melspectrogram(
            y=y,
            sr=sr,
            n_mels=input_shape,
            hop_length=math.floor(no_samples / input_shape))
        dataset.append((spectrogram, i))

random.shuffle(dataset)

data_train = dataset[:(num_of_samples - num_of_test_samples) * 10]
data_test = dataset[(num_of_samples - num_of_test_samples) * 10:]

X_train, y_train = zip(*data_train)
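# Hedged continuation sketch (the original snippet stops above): stack the
# tuples into arrays for training. Trimming every spectrogram to exactly
# input_shape frames is an assumption, since the hop_length rounding above
# leaves each file with one frame more or less.
X_train = np.array([s[:, :input_shape] for s in X_train])
y_train = np.array(y_train)
X_test, y_test = zip(*data_test)
X_test = np.array([s[:, :input_shape] for s in X_test])
y_test = np.array(y_test)
print(X_train.shape, X_test.shape)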
def read_wav(path, sr, duration=None, mono=True):
    wav, sr = librosa.load(path=path, sr=sr, mono=mono, duration=duration)
    return wav
import os
import sys

import numpy as np
# import matplotlib as mpl
# mpl.use('Agg')
import librosa as l
# from matplotlib import pyplot

# specify path to audio files, directory to contain numpy arrays
audio_path = sys.argv[1]
out_path = sys.argv[2]

for audiofile in os.listdir(audio_path):

    if audiofile.endswith(".wav"):

        # Set 'y' to the audio time series, 'sr' to the sample rate
        y, sr = l.load(os.path.join(audio_path, audiofile))

        # Compute the chromagram (librosa.feature.chromagram was renamed to
        # chroma_stft in librosa 0.4)
        C = l.feature.chroma_stft(y=y, sr=sr, n_fft=4096, hop_length=64)

        # Save the chromagram to file
        np.save(os.path.join(out_path, audiofile[:-4]), C)

        # The following can be uncommented to save a visual figure of the chromagram

        # # Make a new figure
        # pyplot.figure(figsize=(12,4))

        # # Display the chromagram: the energy in each chromatic pitch class as a function of time
        # # To make sure that the colors span the full range of chroma values, set vmin and vmax
        # l.display.specshow(C, sr=sr, hop_length=64, x_axis='time', y_axis='chroma', vmin=0, vmax=1)
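# Hedged usage sketch: reload one of the saved chromagrams ('some_track' is a
# hypothetical name matching a .wav processed above).
C = np.load(os.path.join(out_path, "some_track.npy"))
print(C.shape)  # (12, n_frames)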
tb = TensorBoard(log_dir='C:/nmb/nmb_data/graph',histogram_freq=0, write_graph=True, write_images=True)
history = model.fit(x_train, y_train, epochs=300, batch_size=16, validation_split=0.2, callbacks=[es, tb, lr, mc])

# evaluate and predict
model.load_weights('C:/nmb/nmb_data/h5/model_DNN_mels.h5')

result = model.evaluate(x_test, y_test, batch_size=16)
print("loss : ", result[0])
print("acc : ", result[1])
print("f1_score ", result[2])

pred_pathAudio = 'C:/nmb/nmb_data/pred_voice/'
files = librosa.util.find_files(pred_pathAudio, ext=['wav'])
files = np.asarray(files)
for file in files:   
    y, sr = librosa.load(file, sr=22050) 
    mels = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=128, n_fft=512)
    pred_mels = librosa.amplitude_to_db(mels, ref=np.max)
    pred_mels = pred_mels.reshape(1, pred_mels.shape[0], pred_mels.shape[1])
    y_pred = model.predict(pred_mels)
    # print(y_pred)
    y_pred_label = np.argmax(y_pred)
    # print(y_pred_label)
    if y_pred_label == 0:
        print(file, (y_pred[0][0]) * 100, '% probability that the voice is female.')
    else:
        print(file, (y_pred[0][1]) * 100, '% probability that the voice is male.')

end_now = datetime.datetime.now()
time = end_now - start_now
print("time >> " , time)    # time >>  0:00:33.975135
Example #60
def get_features(audio_paths,
                 track_id=None,
                 param=param_default,
                 source_sr=None,
                 pass_random=True,
                 offset=0.0):

    feature_list = []

    # If input type is already audio array just pass it on
    if isinstance(audio_paths, np.ndarray):
        y = audio_paths

        # Potentially resample:
        if source_sr is not None and param['SAMPLING_RATE'] != source_sr:
            y = librosa.resample(y,
                                 orig_sr=source_sr,
                                 target_sr=param['SAMPLING_RATE'])
            if param['SAMPLING_RATE'] > source_sr:
                print('Warning: Tried to increase sampling rate.')

    # Otherwise load audio
    else:
        # Set correct path to file
        if isinstance(audio_paths, str):
            audio_path = audio_paths
        elif isinstance(audio_paths, dict) and track_id is not None:
            audio_path = audio_paths[track_id]
        else:
            raise Exception('Incompatible parameters given to get_features.')

        # Potentially load offset and duration
        duration = None
        if param['single_slice_audio']:
            duration = param['sample_sec']
        # if 'offset' in param.keys():
        #     offset = param['offset']

        # Load track
        try:
            y, _ = librosa.load(audio_path,
                                sr=param['SAMPLING_RATE'],
                                duration=duration,
                                offset=offset)
        except Exception:
            # TODO: This is another quick hack and not ideal for many reasons (e.g. fixed feature size)...
            if pass_random:
                print(
                    f'Cannot load audio file {audio_path}. Passing random features instead'
                )
                return np.random.random((128, 126))
            else:
                print(f'Cannot load audio file {audio_path}.')
                raise

    # Calculate spectrum
    y_stft_full = get_spectrum(y, param['N_FFT'], param['HOP_LENGTH'])

    # If HPSS, split spectrum here and perform feature extraction on both parts
    if 'USE_HPSS' in param.keys() and param['USE_HPSS']:
        y_h, y_p = librosa.decompose.hpss(y_stft_full)
        y_list = [y_h, y_p]
    else:
        y_list = [y_stft_full]

    for y_stft in y_list:
        # Get melspectrogram of track, as well as deltas
        melspec = get_melspec(y_stft, n_mels=param['MELSPEC_BANDS'])
        if param['USE_DELTA']:
            delta = librosa.feature.delta(melspec)
        if param['USE_DELTADELTA']:
            delta_delta = librosa.feature.delta(melspec, order=2)

        # Get MFCC
        if param['USE_MFCC'] or param['USE_MFCC_DELTA'] or param[
                'USE_MFCC_DELTADELTA']:
            mfcc = get_mfcc(melspec, n_mfcc=param['N_MFCC'])
            if param['USE_MFCC_DELTA']:
                mfcc_delta = librosa.feature.delta(mfcc)
            if param['USE_MFCC_DELTADELTA']:
                mfcc_delta_delta = librosa.feature.delta(mfcc, order=2)

        # Get Fluctogram
        if param['USE_FLUCT']:
            fluct, spec_contrac, spec_flat = get_fluctogram(
                y_stft, sr=param['SAMPLING_RATE'], n_fft=param['N_FFT'])

            if 'MASK_FLUCT' in param.keys() and param['MASK_FLUCT']:
                fluct = mask_fluctogram(fluct, spec_contrac, spec_flat, param)

        # Concatenate
        if param['USE_SPEC']:
            # Convert melspec from -80 to 0dB to range [0,1]
            spec = (melspec + 80.0) / 80.0
            feature_list.append(spec)
        if param['USE_DELTA']:
            feature_list.append(delta)
        if param['USE_DELTADELTA']:
            feature_list.append(delta_delta)
        if param['USE_MFCC']:
            feature_list.append(mfcc)
        if param['USE_MFCC_DELTA']:
            feature_list.append(mfcc_delta)
        if param['USE_MFCC_DELTADELTA']:
            feature_list.append(mfcc_delta_delta)
        if param['USE_FLUCT']:
            feature_list.append(fluct)
        if 'USE_SC' in param.keys() and param['USE_SC']:
            feature_list.append(spec_contrac)
        if 'USE_SF' in param.keys() and param['USE_SF']:
            feature_list.append(spec_flat)

    features = np.concatenate(feature_list)

    return features
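# Hedged usage sketch ('some_track.wav' is a hypothetical path): extract the
# stacked feature matrix for a single file with the module's default
# parameters (param_default is the default defined above).
feats = get_features('some_track.wav', param=param_default)
print(feats.shape)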