def generate_mfcc_lists(train_aud, test_aud, number_mfcc=20):
    aud = { 'train_aud': train_aud, 'test_aud': test_aud}
    for key in aud.keys():
        files = aud[key]
        # initialize general variables
        mfcc = []
        trim_len = []
        pretrim_len = []

        # conduct loop over set of audio files
        for file in files:
            data, rate = librosa.load(file)

            # Trim leading and trailing silence
            pretrim_len.append(round(librosa.get_duration(data, rate),1))
            data, index = librosa.effects.trim(data, top_db=15)
            trim_len.append(round(librosa.get_duration(data, rate),1))

            mfcc.append(librosa.feature.mfcc(data, rate, n_mfcc=number_mfcc))

        # Scale features
        for i,x in enumerate(mfcc): 
            mfcc[i] = sklearn.preprocessing.scale(mfcc[i], axis=1)

        # assign result to corresponding variables
        if (key == 'train_aud'):
            x_train = mfcc
            train_len = trim_len
        elif (key == 'test_aud'):
            x_test = mfcc
            test_len = trim_len
    
    return x_train, x_test, train_len, test_len
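A minimal usage sketch for the helper above (not part of the original snippet); the glob patterns are placeholders, and the function itself needs librosa and sklearn available in the calling module.

import glob
import librosa
import sklearn.preprocessing

# Hypothetical file lists; adjust the patterns to your own data layout
train_files = sorted(glob.glob('audio/train/*.wav'))
test_files = sorted(glob.glob('audio/test/*.wav'))

x_train, x_test, train_len, test_len = generate_mfcc_lists(train_files, test_files, number_mfcc=20)
print(len(x_train), 'train MFCC matrices,', len(x_test), 'test MFCC matrices')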
Example #2
    def __test(infile, n_steps, bins_per_octave):
        y, sr = librosa.load(infile, duration=4.0)
        ys = librosa.effects.pitch_shift(y, sr, n_steps,
                                         bins_per_octave=bins_per_octave)

        orig_duration = librosa.get_duration(y, sr=sr)
        new_duration = librosa.get_duration(ys, sr=sr)

        # We don't have to be too precise here, since this goes through an STFT
        eq_(orig_duration, new_duration)
Example #3
    def __test(infile, rate):
        y, sr = librosa.load(infile, duration=4.0)
        ys = librosa.effects.time_stretch(y, rate)

        orig_duration = librosa.get_duration(y, sr=sr)
        new_duration = librosa.get_duration(ys, sr=sr)

        # We don't have to be too precise here, since this goes through an STFT
        assert np.allclose(orig_duration, rate * new_duration,
                           rtol=1e-2, atol=1e-3)
Example #4
def test_get_duration_filename():

    filename = 'data/test2_8000.wav'
    true_duration = 30.197625

    duration_fn = librosa.get_duration(filename=filename)
    y, sr = librosa.load(filename, sr=None)
    duration_y = librosa.get_duration(y=y, sr=sr)

    assert np.allclose(duration_fn, true_duration)
    assert np.allclose(duration_fn, duration_y)
Example #5
 def __init__(self, source, duration = None, sample_rate = None, offset = 0.0):
     if isinstance(source, str):
         self.signal, self.sample_rate = librosa.load(source, duration=duration, offset=offset)
         if duration is None:
             self.duration = librosa.get_duration(self.signal, sr=self.sample_rate)
         else:
             self.duration = duration
     else:
         self.signal = source
         self.sample_rate = sample_rate
         self.duration = librosa.get_duration(self.signal, sr=sample_rate)
     self.pre_process()
Example #6
    def __test(y, top_db, ref, trim_duration):
        yt, idx = librosa.effects.trim(y, top_db=top_db,
                                       ref=ref)

        # Test for index position
        fidx = [slice(None)] * y.ndim
        fidx[-1] = slice(*idx.tolist())
        assert np.allclose(yt, y[tuple(fidx)])

        # Verify logamp
        rms = librosa.feature.rmse(y=librosa.to_mono(yt), center=False)
        logamp = librosa.power_to_db(rms**2, ref=ref, top_db=None)
        assert np.all(logamp > - top_db)

        # Verify logamp
        rms_all = librosa.feature.rmse(y=librosa.to_mono(y)).squeeze()
        logamp_all = librosa.power_to_db(rms_all**2, ref=ref,
                                         top_db=None)

        start = int(librosa.samples_to_frames(idx[0]))
        stop = int(librosa.samples_to_frames(idx[1]))
        assert np.all(logamp_all[:start] <= - top_db)
        assert np.all(logamp_all[stop:] <= - top_db)

        # Verify duration
        duration = librosa.get_duration(yt)
        assert np.allclose(duration, trim_duration, atol=1e-1), duration
Example #7
def libroRMS(filepath, kRatio):
    y, sr = librosa.load(filepath) # Load the waveform as y, sr is sample rate
    clipLength = librosa.get_duration(y=y, sr=sr)
    kValue = int(clipLength/kRatio +1) #sets up relative ratio of samples

    ### get the RMS of the audio sample ###
    data = librosa.feature.rmse(y=y, hop_length=2048)
    boundaries = librosa.segment.agglomerative(data, k=kValue) # Agglomeration
    boundary_times = librosa.frames_to_time(boundaries, hop_length=2048) # ~.1s
    intervals = np.hstack([boundary_times[:-1, np.newaxis], boundary_times[1:, np.newaxis]])
    get_rms = librosa.feature.sync(data, boundaries, aggregate=np.max)

    nkValue = kValue-1 #because, for some reason, the intervals above leave out the last one
    fixedN = np.delete(get_rms, nkValue, axis=1)
    npsTurn = np.concatenate((intervals, fixedN.T), axis=1)

    #transform from np array to regular list
    flatnps = npsTurn.tolist()
    slice_value = int(kValue//3)
    rmsOut1 = sorted(flatnps, key = lambda x: int(x[2]), reverse=True)
    #rmsOut2 = slice(rmsOut1[0: slice_value])
    rmsOut2 = rmsOut1[0 : slice_value]
    rmsOut3 = sorted(rmsOut2, key = lambda x: int(x[0]))

    return rmsOut3
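A hedged usage sketch for libroRMS; the path and kRatio value are placeholders, and the function relies on the older librosa.feature.rmse / librosa.feature.sync API, so it assumes a pre-0.7 librosa.

# Each returned row is [segment_start_s, segment_end_s, peak_RMS], sorted by start time
loud_segments = libroRMS('audio/example.wav', kRatio=5)
for start, end, rms in loud_segments:
    print('%.2f-%.2f s  RMS %.4f' % (start, end, rms))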
Example #8
    def __test(jam_in, audio_file):

        jam = muda.load_jam_audio(jam_in, audio_file)

        assert hasattr(jam.sandbox, 'muda')

        eq_(jam.file_metadata.duration,
            librosa.get_duration(**jam.sandbox.muda._audio))
Example #9
File: time.py Project: EQ4/muda
    def states(self, jam):
        '''Get the state information from the jam'''

        state = dict()
        mudabox = jam.sandbox.muda
        state['duration'] = librosa.get_duration(y=mudabox._audio['y'],
                                                 sr=mudabox._audio['sr'])
        yield state
Example #10
def test_load_jam_audio(jam_loader, audio_file, validate, strict, fmt):

    jam = muda.load_jam_audio(jam_loader, audio_file,
                              validate=validate, strict=strict, fmt=fmt)

    assert hasattr(jam.sandbox, 'muda')

    duration = librosa.get_duration(**jam.sandbox.muda._audio)
    assert jam.file_metadata.duration == duration
Example #11
File: audio.py Project: EQ4/amen
 def __init__(self, file_path, convert_to_mono=False, sample_rate=22050):
     """
     Opens a file path, loads it with librosa.
     """
     self.file_path = file_path
     y, sr = librosa.load(file_path, mono=convert_to_mono, sr=sample_rate)
     self.sample_rate = float(sr)
     self.raw_samples = y
     self.num_channels = y.ndim
     self.duration = librosa.get_duration(y=y, sr=sr)
Example #12
File: ir.py Project: EQ4/muda
    def states(self, jam):
        '''Iterate the impulse response states'''

        state = dict()
        mudabox = jam.sandbox.muda
        state['duration'] = librosa.get_duration(y=mudabox._audio['y'],
                                                 sr=mudabox._audio['sr'])

        for i in range(len(self.ir_)):
            state['index'] = i
            yield state
Example #13
    def __test_spec(filename, sr, duration, n_fft, hop_length, center):
        y, sr = librosa.load(filename, sr=sr, duration=duration)

        S = librosa.stft(y, n_fft=n_fft, hop_length=hop_length, center=center)

        duration_est = librosa.get_duration(S=S, sr=sr, n_fft=n_fft,
                                            hop_length=hop_length,
                                            center=center)

        # We lose a little accuracy in framing without centering, so it's
        # not as precise as time-domain duration
        assert np.allclose(duration_est, duration, rtol=1e-1, atol=1e-2)
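The loose tolerance above reflects a simple relation between spectrogram frames and duration; here is a self-contained sketch of that back-of-envelope check, independent of the test harness and using made-up parameters.

import librosa

sr = 22050
hop_length = 512
y = librosa.tone(440.0, sr=sr, duration=3.0)            # 3-second test tone
S = librosa.stft(y, n_fft=2048, hop_length=hop_length)  # centered frames
# With centered frames, each STFT column advances by hop_length samples,
# so the implied duration is roughly n_frames * hop_length / sr.
print(S.shape[1] * hop_length / sr, librosa.get_duration(y=y, sr=sr))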
Example #14
File: core.py Project: bmcfee/muda
def load_jam_audio(jam_in, audio_file,
                   validate=True,
                   strict=True,
                   fmt='auto',
                   **kwargs):
    '''Load a jam and pack it with audio.

    Parameters
    ----------
    jam_in : str, file descriptor, or jams.JAMS
        JAMS filename, open file-descriptor, or object to load.
        See ``jams.load`` for acceptable formats.

    audio_file : str
        Audio filename to load

    validate : bool
    strict : bool
    fmt : str
        Parameters to `jams.load`

    kwargs : additional keyword arguments
        See `librosa.load`

    Returns
    -------
    jam : jams.JAMS
        A jams object with audio data in the top-level sandbox

    Notes
    -----
    This operation can modify the `file_metadata.duration` field of `jam_in`:
    If it is not currently set, it will be populated with the duration of the
    audio file.

    See Also
    --------
    jams.load
    librosa.core.load
    '''

    if isinstance(jam_in, jams.JAMS):
        jam = jam_in
    else:
        jam = jams.load(jam_in, validate=validate, strict=strict, fmt=fmt)

    y, sr = librosa.load(audio_file, **kwargs)

    if jam.file_metadata.duration is None:
        jam.file_metadata.duration = librosa.get_duration(y=y, sr=sr)

    return jam_pack(jam, _audio=dict(y=y, sr=sr))
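A hedged usage sketch for load_jam_audio; the JAMS and audio paths below are placeholders.

import muda

jam = muda.load_jam_audio('annotations/track01.jams', 'audio/track01.ogg')
print(jam.file_metadata.duration)       # populated from the audio if it was unset
audio = jam.sandbox.muda._audio         # dict holding the decoded samples and rate
print(audio['sr'], len(audio['y']))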
Example #15
def analyze(filename=None, y=None, sr=None):
    '''Analyze a recording for all tasks.

    Parameters
    ----------
    filename : str, optional
        Path to audio file

    y : np.ndarray, optional
    sr : number > 0, optional
        Audio buffer and sampling rate

    .. note:: At least one of `filename` or `y, sr` must be provided.

    Returns
    -------
    jam : jams.JAMS
        a JAMS object containing all estimated annotations

    Examples
    --------
    >>> from crema.analyze import analyze
    >>> import librosa
    >>> jam = analyze(filename=librosa.util.example_audio_file())
    >>> jam
    <JAMS(file_metadata=<FileMetadata(...)>,
          annotations=[1 annotation],
          sandbox=<Sandbox(...)>)>
    >>> # Get the chord estimates
    >>> chords = jam.annotations['chord', 0]
    >>> chords.to_dataframe().head(5)
           time  duration  value  confidence
    0  0.000000  0.092880  E:maj    0.336977
    1  0.092880  0.464399    E:7    0.324255
    2  0.557279  1.021678  E:min    0.448759
    3  1.578957  2.693515  E:maj    0.501462
    4  4.272472  1.486077  E:min    0.287264
    '''

    _load_models()

    jam = jams.JAMS()
    # populate file metadata

    jam.file_metadata.duration = librosa.get_duration(y=y, sr=sr,
                                                      filename=filename)

    for model in __MODELS__:
        jam.annotations.append(model.predict(filename=filename, y=y, sr=sr))

    return jam
Example #16
def beat_track(infile, outfile):

    # Load the audio file
    y, sr = librosa.load(infile)

    # Compute the track duration
    track_duration = librosa.get_duration(y=y, sr=sr)

    # Extract tempo and beat estimates
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)

    # Convert beat frames to time
    beat_times = librosa.frames_to_time(beat_frames, sr=sr)

    # Construct a new JAMS object and annotation records
    jam = jams.JAMS()

    # Store the track duration
    jam.file_metadata.duration = track_duration

    beat_a = jams.Annotation(namespace='beat')
    beat_a.annotation_metadata = jams.AnnotationMetadata(data_source='librosa beat tracker')

    # Add beat timings to the annotation record.
    # The beat namespace does not require value or confidence fields,
    # so we can leave those blank.
    for t in beat_times:
        beat_a.append(time=t, duration=0.0)

    # Store the new annotation in the jam
    jam.annotations.append(beat_a)

    # Add tempo estimation to the annotation.
    tempo_a = jams.Annotation(namespace='tempo', time=0, duration=track_duration)
    tempo_a.annotation_metadata = jams.AnnotationMetadata(data_source='librosa tempo estimator')

    # The tempo estimate is global, so it should start at time=0 and cover the full
    # track duration.
    # If we had a likelihood score on the estimation, it could be stored in 
    # `confidence`.  Since we have no competing estimates, we'll set it to 1.0.
    tempo_a.append(time=0.0,
                   duration=track_duration,
                   value=tempo,
                   confidence=1.0)

    # Store the new annotation in the jam
    jam.annotations.append(tempo_a)

    # Save to disk
    jam.save(outfile)
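A possible invocation of beat_track above, with placeholder paths; the resulting JAMS file can be reloaded and inspected with the jams API shown earlier.

beat_track('audio/track01.ogg', 'annotations/track01_beats.jams')

jam = jams.load('annotations/track01_beats.jams')
print(jam.annotations['beat', 0].to_dataframe().head())
print(jam.annotations['tempo', 0].to_dataframe())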
Example #17
    def __init__(self, file_path=None, raw_samples=None, convert_to_mono=False,
                 sample_rate=44100, analysis_sample_rate=22050):
        """
        Audio constructor.
        Opens a file path, loads the audio with librosa, and prepares the features

        Parameters
        ----------

        file_path: string
            path to the audio file to load

        raw_samples: np.array
            samples to use for audio output

        convert_to_mono: boolean
            (optional) converts the file to mono on loading

        sample_rate: number > 0 [scalar]
            (optional) sample rate to pass to librosa.


        Returns
        ------
        An Audio object
        """

        if file_path:
            y, sr = librosa.load(file_path, mono=convert_to_mono, sr=sample_rate)
        elif raw_samples is not None:
            # This assumes that we're passing in raw_samples
            # directly from another Audio's raw_samples.
            y = raw_samples
            sr = sample_rate

        self.file_path = file_path
        self.sample_rate = float(sr)
        self.analysis_sample_rate = float(analysis_sample_rate)
        self.num_channels = y.ndim
        self.duration = librosa.get_duration(y=y, sr=sr)

        self.analysis_samples = librosa.resample(librosa.to_mono(y),
                                                 sr, self.analysis_sample_rate,
                                                 res_type='kaiser_best')
        self.raw_samples = np.atleast_2d(y)

        self.zero_indexes = self._create_zero_indexes()
        self.features = self._create_features()
        self.timings = self._create_timings()
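A minimal usage sketch, assuming the class above is available as Audio and the path points to a real file.

audio = Audio(file_path='audio/track01.ogg', convert_to_mono=True)
print(audio.duration, audio.sample_rate, audio.num_channels)
print(audio.analysis_samples.shape)     # mono samples resampled to analysis_sample_rate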
Example #18
def __test_time(jam_orig, jam_new, rate):

    # Test the track length
    ap_(librosa.get_duration(**jam_orig.sandbox.muda['_audio']),
        rate * librosa.get_duration(**jam_new.sandbox.muda['_audio']))

    # Test the metadata
    ap_(jam_orig.file_metadata.duration,
        rate * jam_new.file_metadata.duration)

    # Test each annotation
    for ann_orig, ann_new in zip(jam_orig.annotations, jam_new.annotations):
        # JAMS 0.2.1 support
        if hasattr(ann_orig, 'time'):
            ap_(ann_orig.time, rate * ann_new.time)
            ap_(ann_orig.duration, rate * ann_new.duration)

        ap_(ann_orig.data.time.values.astype(float),
            rate * ann_new.data.time.values.astype(float))
        ap_(ann_orig.data.duration.values.astype(float),
            rate * ann_new.data.duration.values.astype(float))

        if ann_orig.namespace == 'tempo':
            ap_(rate * ann_orig.data.value, ann_new.data.value)
Example #19
File: time.py Project: EQ4/muda
    def states(self, jam):
        '''Set the state for the transformation object'''

        state = dict()

        mudabox = jam.sandbox.muda

        state['track_duration'] = librosa.get_duration(y=mudabox._audio['y'],
                                                       sr=mudabox._audio['sr'])

        offsets = np.arange(start=0,
                            stop=(state['track_duration'] - self.min_duration),
                            step=self.stride)

        for t in offsets:
            state['offset'] = t
            yield state
Example #20
def smc_file_metadata(infile):
    '''Construct a metadata object from an SMC wav file'''

    match = re.match(r'.*(?P<index>SMC_\d+)\.wav$', infile)

    if not match:
        raise RuntimeError('Could not index filename {:s}'.format(infile))

    # Get the duration of the track
    y, sr = librosa.load(infile, sr=None)
    duration = librosa.get_duration(y=y, sr=sr)

    # Format duration as time
    metadata = jams.FileMetadata(title=match.group('index'),
                                 duration=duration)

    return metadata
Example #21
def test_save():

    jam = muda.load_jam_audio('data/fixture.jams',
                              'data/fixture.wav')

    _, jamfile = tempfile.mkstemp(suffix='.jams')
    _, audfile = tempfile.mkstemp(suffix='.wav')

    muda.save(audfile, jamfile, jam)

    jam2 = muda.load_jam_audio(jamfile, audfile)
    jam2_raw = jams.load(jamfile)

    os.unlink(audfile)
    os.unlink(jamfile)

    assert hasattr(jam2.sandbox, 'muda')
    assert '_audio' in jam2.sandbox.muda
    assert '_audio' not in jam2_raw.sandbox.muda

    eq_(jam2.file_metadata.duration,
        librosa.get_duration(**jam2.sandbox.muda['_audio']))
Example #22
def test_save(jam_in, audio_file, strict, fmt):

    jam = muda.load_jam_audio(jam_in, audio_file)

    _, jamfile = tempfile.mkstemp(suffix='.jams')
    _, audfile = tempfile.mkstemp(suffix='.wav')

    muda.save(audfile, jamfile, jam, strict=strict, fmt=fmt)

    jam2 = muda.load_jam_audio(jamfile, audfile, fmt=fmt)
    jam2_raw = jams.load(jamfile, fmt=fmt)

    os.unlink(audfile)
    os.unlink(jamfile)

    assert hasattr(jam2.sandbox, 'muda')
    assert '_audio' in jam2.sandbox.muda
    assert '_audio' not in jam2_raw.sandbox.muda

    duration = librosa.get_duration(**jam2.sandbox.muda['_audio'])

    assert jam2.file_metadata.duration == duration
Example #23
        d_text = d_file.read()
    g_file = open('mood.txt')
    with g_file:
        g_text = g_file.read()
    al_file = open('album.txt')
    with al_file:
        al_text = al_file.read()

    d, g, al = eval(d_text), eval(g_text), eval(al_text)
    c, c1, c2 = len(d), len(g), len(al)

    for filename in os.listdir(f'H:/hindisongs/{file}'):
        singer1, singer2, singer3 = 0, 0, 0
        try:
            audio = f'H:/hindisongs/{file}/{filename}'
            audio_duration = librosa.get_duration(filename=audio)
            duration, offset = audio_duration // 2, audio_duration // 3
            y, sr = librosa.load(audio, duration=duration, offset=offset)

            rmse = librosa.feature.rms(y=y)
            chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr))
            zcr = np.mean(librosa.feature.zero_crossing_rate(y=y))
            spec_cent = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
            spec_bw = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
            rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
            mel = np.mean(librosa.feature.melspectrogram(y, sr=sr))
            mfcc = librosa.feature.mfcc(y=y, sr=sr)
            percussion = librosa.effects.percussive(y, margin=8)

            harmonic = librosa.effects.harmonic(y, margin=8)
            chromagram = librosa.feature.chroma_cqt(y=harmonic, sr=sr)
Example #24
def get_wav_duration(file_path):
    return librosa.get_duration(filename=file_path)
Example #25
    j+=1

s=0
while(s<j):
    song_segment[s].export("song_segment_"+str(s),format="wav")
    s+=1
"""
##################  Audio test / training set  ###########################

# When you load the data, it gives you two objects: a numpy array of
# an audio file and the corresponding sampling rate by which it
# was extracted. Now, to represent this as a waveform
# (which it originally is), use the following code
data, sampling_rate = librosa.load('E:\\Urban_Sound_challenge_data\Train\Train\\2022.wav')  # read the file
plt.figure(figsize=(12, 4))
x_length = librosa.get_duration(data,sampling_rate)
print(x_length)
librosa.display.waveplot(data,sr=sampling_rate)
plt.show()
train = read_csv('E:\\Urban_Sound_challenge_data\Train\\train.csv')
def parser(row):
   # function to load files and extract features
   file_name = os.path.join(os.path.abspath('E:\\Urban_Sound_challenge_data\Train\\'), 'Train', str(row.ID) + '.wav')
   print(file_name)
   # handle exception to check if there isn't a file which is corrupted
   try:
      # here kaiser_fast is a technique used for faster extraction
      X, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
      # we extract mfcc feature from data
      mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=30).T,axis=0)
   except Exception as e:
Example #26
path = input("Path>")
if not path:
    path = "test.mp3"
y, sr = librosa.load(path)
print(1)
y_harmonic, y_percussive = librosa.effects.hpss(y)
print(2)
tempo, beat_frames = librosa.beat.beat_track(y=y_percussive, sr=sr)
print(3)
# with open("temp.pkl","wb") as f:
#     pickle.dump([y,sr,y_harmonic,y_percussive,tempo,beat_frames],f)
# with open("temp.pkl", "rb") as f:
#     y, sr, y_harmonic, y_percussive, tempo, beat_frames = pickle.load(f)
# Should the fade-in/fade-out sections overlap?
# Should each segment start and end at the same loudness?
time = librosa.get_duration(y=y, sr=sr)
total_beat = librosa.time_to_frames(time)
# print((time,total_beat,beat_frames))
print(sr)
begin = 0
parts = []
last = 0
for i in beat_frames:
    end = int((len(y) * i) / total_beat)  # +(sr)//120
    # print(i)
    # sum=0
    # for i in y[begin:end]:
    #     sum+=i
    # Taking the mean is meaningless!!!!!!!!!!!!!!!!
    # Eyeball it
    # Simple zero-crossing detection
Example #27
def main(args):
    
    # load parameters and data path
    music_type = args.music_type
    onset_threshold = args.onset_threshold
    segment_threshold = args.segment_threshold
    input_type = 'melspectrogram'
    
    if music_type == 'synth':
        dataset_name = 'pedal-times_test.npz'
        npz_path = os.path.join(DIR_PEDAL_METADATA, dataset_name)
    elif music_type == 'real':
        npz_dir = os.path.join(DIR_REAL_DATA, 'reference') 
        dataset_name = 'pedal-times_realaudio.npz'
        npz_path = os.path.join(npz_dir, dataset_name)
    else:
        print("Error: Please set the music_type to either synth or real!")
    tracks = np.load(npz_path)
    filenames = tracks['filename']
    pedal_offset_gt_tracks = tracks['pedal_offset']
    pedal_onset_gt_tracks = tracks['pedal_onset']
    
    # load convnet models
    onset_exp_name = 'sub-onset_cnnkernel-melspectrogram_l4c13'
    onset_model = keras.models.load_model(os.path.join(DIR_SAVE_MODEL,"{}_best_model.h5".format(onset_exp_name)),
                                          custom_objects={'Melspectrogram':Melspectrogram})
    onset_model.load_weights(os.path.join(DIR_SAVE_MODEL,"{}_best_weights.h5".format(onset_exp_name)))    
    segment_exp_name = 'sub-segment_cnnkernel-melspectrogram_multift'
    segment_model = keras.models.load_model(os.path.join(DIR_SAVE_MODEL,"{}_best_model.h5".format(segment_exp_name)),
                                            custom_objects={'Melspectrogram':Melspectrogram})
    segment_model.load_weights(os.path.join(DIR_SAVE_MODEL,"{}_best_weights.h5".format(segment_exp_name)))    


    # initialise performance measurement
    filename_records = []
    accuracys = []
    precisions = []
    recalls = []
    fscores = []
    support0s = []
    support1s = []
    fp_rates = []
    fn_rates = []
    # append to lists
    filename_records = []
    support0s = []
    support1s = []
    acc01_frms = []
    p1_frms = []
    r1_frms = []
    f1_frms = []
    fp_rates = []
    fn_rates = []
    # boundary matrixs
    boundary_wins = []
    p1_sbrs = []
    r1_sbrs = []
    f1_sbrs = []
    r2e_deviation1s = []
    e2r_deviation1s = []
    p01_sbrs = []
    r01_sbrs = []
    f01_sbrs = []
    r2e_deviation01s = []
    e2r_deviation01s = []
    # structural matrixs
    p_pairwises = []
    r_pairwises = []
    f_pairwises = []
    nce_overs = []
    nce_unders = []
    nce_fs = []
    rand_indexs = []
    adjrand_indexs = []
    mutual_infos = []
    adjmutual_infos = []
    normmutual_infos = []
    
    # do detection piece by piece
    for filename_idx, filename in enumerate(filenames):  
        
        # load ground truth of the current piece 
        pedal_offset_gt = np.array(pedal_offset_gt_tracks[filename_idx])
        pedal_onset_gt = np.array(pedal_onset_gt_tracks[filename_idx])
        
        # load audio data of the current piece
        if music_type == 'synth':
            paudio_path = os.path.join(DIR_RENDERED, '{}-p.wav'.format(filename))
        elif music_type == 'real':
            paudio_dir = os.path.join(DIR_REAL_DATA, '{}'.format(filename)) 
            paudio_path = os.path.join(paudio_dir, '{}.wav'.format(filename))
        paudio, sr = librosa.load(paudio_path, sr=SR) 
        print("{}...".format(filename))
        
        # detect pedal onset if threshold is greater than 0
        if onset_threshold>0:
            len_onset_shape = int(SR * (TRIM_SECOND_BEFORE + TRIM_SECOND_AFTER))
            onsethop_length = HOP_LENGTH
            onsethop_duration = onsethop_length/SR
            n_ponset = int(np.ceil((len(paudio)-len_onset_shape)/onsethop_length))
            gen_ponset = data_gen(paudio, n_ponset, len_onset_shape, 'onset', hop_length=onsethop_length)
            pred_ponset = onset_model.predict_generator(gen_ponset, n_ponset // batch_size)
            # filter to reduce fragmentation
            pred_ponset_filter = medfilt(pred_ponset[:,1],15)
            # the corresponding time in second each frame represents
            frmtime_ponset = np.arange(n_ponset)*onsethop_duration+TRIM_SECOND_BEFORE
            # represent as frame wise binary results
            pred_ponset_todetect = np.copy(pred_ponset_filter)
            pred_ponset_todetect[pred_ponset_todetect<onset_threshold]=0
            pred_ponset_todetect[pred_ponset_todetect>=onset_threshold]=1
        
        # detect pedalled segment
        len_segment_shape = int(SR * MIN_SRC)
        seghop_length = HOP_LENGTH*10
        seghop_duration = seghop_length/SR
        n_psegment = int(np.ceil((len(paudio)-len_segment_shape)/seghop_length))
        gen_psegment = data_gen(paudio, n_psegment, len_segment_shape, 'segment', hop_length=seghop_length)
        pred_psegment = segment_model.predict_generator(gen_psegment, n_psegment // batch_size)
        # filter to reduce fragmentation
        pred_psegment_filter = medfilt(pred_psegment[:,1],3)
        # the corresponding time in second each frame represents
        frmtime_psegment = np.arange(n_psegment)*seghop_duration+MIN_SRC/2
        # remove the predicted value before the note onset
        paudio_firstonsettime = librosa.frames_to_time(librosa.onset.onset_detect(y=paudio, sr=SR), sr=SR)[0]
        n_segment_tozero=0
        for t in frmtime_psegment:
            if t < paudio_firstonsettime:
                n_segment_tozero+=1
            else:
                break        
        pred_psegment_filter[:n_segment_tozero] = 0
        # represent as frame wise binary results
        pred_psegment_todetect = np.copy(pred_psegment_filter)
        pred_psegment_todetect[pred_psegment_todetect<segment_threshold]=0
        pred_psegment_todetect[pred_psegment_todetect>=segment_threshold]=1

        # decide the initial indexes of pedal segment boundary
        onseg_initidxs = []
        offseg_initidxs = []
        for idx, v in enumerate(pred_psegment_todetect):
            if idx>0 and idx<len(pred_psegment_todetect)-1:
                if pred_psegment_todetect[idx-1]==0 and v==1 and pred_psegment_todetect[idx+1]==1:
                    onseg_initidxs.append(idx-1)
                elif pred_psegment_todetect[idx-1]==1 and v==1 and pred_psegment_todetect[idx+1]==0:
                    offseg_initidxs.append(idx+1)

        if offseg_initidxs[0] <= onseg_initidxs[0]:
            del offseg_initidxs[0]
        if onseg_initidxs[-1] >= offseg_initidxs[-1]:
            del onseg_initidxs[-1]

        if (len(onseg_initidxs) != len(offseg_initidxs)) or not len(pedal_offset_gt) or not len(pedal_onset_gt):
            print(" skip!")
        else:
            onseg_idxs = []
            offseg_idxs = []
            for idx in range(len(onseg_initidxs)):
                if onseg_initidxs[idx] < offseg_initidxs[idx]:
                    onseg_idxs.append(onseg_initidxs[idx])
                    offseg_idxs.append(offseg_initidxs[idx])

            if not len(onseg_idxs) or not len(offseg_idxs):
                print("  no detection!")  

            else:
                if onset_threshold>0:
                    # decide the boundary times in seconds with pedal onset candidates
                    onseg_times = []
                    offseg_times = []
                    for idx, onseg_idx in enumerate(onseg_idxs):
                        onponset_idx = onseg_idx*10-5
                        if any(pred_ponset_todetect[onponset_idx-5:onponset_idx+5]):
                            offseg_idx = offseg_idxs[idx]
                            offseg_times.append(frmtime_psegment[offseg_idx])
                            onseg_times.append(frmtime_psegment[onseg_idx])
                else:
                    onseg_times = frmtime_psegment[onseg_idxs] 
                    offseg_times = frmtime_psegment[offseg_idxs]
                    
                segintervals_est = np.stack((np.asarray(onseg_times),np.asarray(offseg_times)), axis=-1)

                # set the ground truth and estimation results frame by frame
                paudio_duration = librosa.get_duration(y=paudio, sr=SR)
                n_frames = int(np.ceil(paudio_duration/seghop_duration))
                segframes_gt = np.zeros(n_frames)
                segframes_est = np.zeros(n_frames)

                pedal_offset_gt = np.array(tracks['pedal_offset'][filename_idx])
                pedal_onset_gt = np.array(tracks['pedal_onset'][filename_idx])
                longpseg_idx = np.where((pedal_offset_gt-pedal_onset_gt)>seghop_duration)[0]
                longseg_onset_gt = pedal_onset_gt[longpseg_idx]
                longseg_offset_gt = pedal_offset_gt[longpseg_idx]
                segintervals_gt = np.stack((longseg_onset_gt,longseg_offset_gt), axis=-1)

                for idx, onset_t in enumerate(longseg_onset_gt):
                    offset_t = longseg_offset_gt[idx]
                    onset_frm = int(onset_t//seghop_duration)
                    offset_frm = int(offset_t//seghop_duration)
                    segframes_gt[onset_frm:offset_frm] = 1

                for idx, onset_t in enumerate(onseg_times):
                    offset_t = offseg_times[idx]
                    onset_frm = int(onset_t//seghop_duration)
                    offset_frm = int(offset_t//seghop_duration)
                    segframes_est[onset_frm:offset_frm] = 1 

                # set the ground truth and estimation results as interval format
                segintervals1_gt, segintervals01_gt, labels_gt = intervals1tointervals01(segintervals_gt, paudio_duration)
                segintervals1_est, segintervals01_est, labels_est = intervals1tointervals01(segintervals_est, paudio_duration)

                # Metrics for frame-wise label 'p'
                acc01_frm = accuracy_score(segframes_gt,segframes_est)
                p1_frm, r1_frm, f1_frm, support = precision_recall_fscore_support(segframes_gt,segframes_est)
                tn, fp, fn, tp = confusion_matrix(segframes_gt,segframes_est).ravel()
                fp_rate = fp/(fp+tn)
                fn_rate = fn/(fn+tp)

                # performance matrix based on boundary annotation of 'p'
                # window depends on duration of a beat
                onset_env = librosa.onset.onset_strength(paudio, sr=SR)
                tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=SR)[0]
                beat_insecond = 60/tempo
                p1_sbr,r1_sbr,f1_sbr = mir_eval.segment.detection(segintervals1_gt, segintervals1_est, window=beat_insecond)
                r2e_deviation1, e2r_deviation1 = mir_eval.segment.deviation(segintervals1_gt, segintervals1_est)
                # performance matrix based on boundary annotation of both 'p' and 'np' 
                p01_sbr,r01_sbr,f01_sbr = mir_eval.segment.detection(segintervals01_gt, segintervals01_est, window=beat_insecond)

                # performance matrix based on structural annotation
                scores = mir_eval.segment.evaluate(segintervals01_gt, labels_gt, segintervals01_est, labels_est)
                r2e_deviation01, e2r_deviation01 = [scores['Ref-to-est deviation'], scores['Est-to-ref deviation']]
                p_pairwise, r_pairwise, f_pairwise = [scores['Pairwise Precision'], scores['Pairwise Recall'], 
                                                      scores['Pairwise F-measure']]
                rand_index, adjrand_index = [scores['Rand Index'], scores['Adjusted Rand Index']]
                mutual_info, adjmutual_info, normmutual_info = [scores['Mutual Information'], scores['Adjusted Mutual Information'], 
                                                                scores['Normalized Mutual Information']]
                nce_over, nce_under, nce_f = [scores['NCE Over'], scores['NCE Under'], scores['NCE F-measure']]

                # append to lists
                filename_records.append(filename)
                support0s.append(support[0])
                support1s.append(support[1])
                acc01_frms.append(acc01_frm)
                p1_frms.append(p1_frm[1])
                r1_frms.append(r1_frm[1])
                f1_frms.append(f1_frm[1])
                fp_rates.append(fp_rate)
                fn_rates.append(fn_rate)
                # boundary matrixs
                boundary_wins.append(beat_insecond)
                p1_sbrs.append(p1_sbr)
                r1_sbrs.append(r1_sbr)
                f1_sbrs.append(f1_sbr)
                r2e_deviation1s.append(r2e_deviation1)
                e2r_deviation1s.append(e2r_deviation1)
                p01_sbrs.append(p01_sbr)
                r01_sbrs.append(r01_sbr)
                f01_sbrs.append(f01_sbr)
                r2e_deviation01s.append(r2e_deviation01)
                e2r_deviation01s.append(e2r_deviation01)
                # structural matrixs
                p_pairwises.append(p_pairwise)
                r_pairwises.append(r_pairwise)
                f_pairwises.append(f_pairwise)
                nce_overs.append(nce_over)
                nce_unders.append(nce_under)
                nce_fs.append(nce_f)
                rand_indexs.append(rand_index)
                adjrand_indexs.append(adjrand_index)
                mutual_infos.append(mutual_info)
                adjmutual_infos.append(adjmutual_info)
                normmutual_infos.append(normmutual_info)
                print("  done!")

    rows = zip(*[filename_records, support0s, support1s, acc01_frms, p1_frms, r1_frms, f1_frms, fp_rates, fn_rates, 
                 boundary_wins, p1_sbrs, r1_sbrs, f1_sbrs, r2e_deviation1s, e2r_deviation1s,
                 p01_sbrs, r01_sbrs, f01_sbrs, r2e_deviation01s, e2r_deviation01s,
                 p_pairwises, r_pairwises, f_pairwises, nce_overs, nce_unders, nce_fs, rand_indexs, 
                 adjrand_indexs, mutual_infos, adjmutual_infos, normmutual_infos])
    column_names =  ['filename_record', 'support0', 'support1', 'acc01_frm', 'p1_frm', 'r1_frm', 'f1_frm', 'fp_rate', 'fn_rate', 
                     'boundary_win', 'p1_sbr', 'r1_sbr', 'f1_sbr', 'r2e_deviation1', 'e2r_deviation1',
                     'p01_sbr', 'r01_sbr', 'f01_sbr', 'r2e_deviation01', 'e2r_deviation01',
                     'p_pairwise', 'r_pairwise', 'f_pairwise', 'nce_over', 'nce_under', 'nce_f', 'rand_index', 
                     'adjrand_index', 'mutual_info', 'adjmutual_info', 'normmutual_info']
    df = pd.DataFrame(rows, columns = column_names)

    if music_type == 'synth':
        df.to_csv('psegment-testresult_onset{}_seg{}.csv'.format(int(onset_threshold*100),int(segment_threshold*100)))
    elif music_type == 'real':
        df.to_csv('psegment-testresult-realaudio_onset{}_seg{}.csv'.format(int(onset_threshold*100),int(segment_threshold*100)))
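The ground-truth/estimation comparison above hinges on turning (onset, offset) second pairs into frame-wise binary masks at seghop_duration resolution; below is a standalone sketch of that conversion with made-up values, not the project's data.

import numpy as np

seghop_duration = 0.232                          # seconds per segment frame (made up)
piece_duration = 10.0                            # total piece length in seconds (made up)
intervals = np.array([[1.2, 3.4], [5.0, 7.5]])   # pedal-on (onset, offset) times

n_frames = int(np.ceil(piece_duration / seghop_duration))
frames = np.zeros(n_frames)
for onset_t, offset_t in intervals:
    frames[int(onset_t // seghop_duration):int(offset_t // seghop_duration)] = 1
print(int(frames.sum()), 'of', n_frames, 'frames labelled as pedalled')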
Example #28
def harmonic_index(
        sourcefile,
        offset=0.0,
        duration=120.0,
        key=None,
        output_dir=None,
        n_fft=4096,
        hop_length=1024,
        pitch_median=5,  # how many frames for running medians?
        high_pass_f=40.0,
        low_pass_f=4000.0,
        debug=False,
        cached=True,
        n_peaks=16,
        **kwargs):
    """
    Index spectral peaks
    """
    if debug:
        from librosa.display import specshow
        import matplotlib.pyplot as plt
    # args that will make a difference to content,
    # apart from the sourcefile itself
    argset = dict(
        analysis="harmonic_index",
        # sourcefile=sourcefile,
        offset=offset,
        duration=duration,
        n_fft=n_fft,
        hop_length=hop_length,
        high_pass_f=high_pass_f,
        low_pass_f=low_pass_f,
        pitch_median=pitch_median,
        n_peaks=n_peaks,
    )
    sourcefile = Path(sourcefile).resolve()
    if output_dir is None:
        output_dir = sourcefile.parent
    output_dir = Path(output_dir)

    if key is None:
        key = str(sourcefile.stem) + "___" + sfio.safeish_hash(argset)

    metadatafile = (output_dir / key).with_suffix(".json")
    if cached and metadatafile.exists():
        return json.load(metadatafile.open("r"))

    metadata = dict(key=key, metadatafile=str(metadatafile), **argset)
    y, sr = sfio.load(str(sourcefile),
                      sr=None,
                      mono=True,
                      offset=offset,
                      duration=duration)

    if high_pass_f is not None:
        y = basicfilter.high_passed(y, sr, high_pass_f)

    dur = librosa.get_duration(y=y, sr=sr)

    metadata["dur"] = dur
    metadata["sr"] = sr
    # convert to spectral frames
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length)
    y_rms = librosa.feature.rmse(S=D)
    # Separate into harmonic and percussive. I think this preserves phase?
    H, P = librosa.decompose.hpss(D)
    # Resynthesize the harmonic component as waveforms
    y_harmonic = librosa.istft(H)
    harmonicfile = str(output_dir / key) + ".harmonic.wav"
    sfio.save(harmonicfile, y_harmonic, sr=sr, norm=True)
    metadata["harmonicfile"] = harmonicfile

    # Now, power spectrogram
    H_mag, H_phase = librosa.magphase(H)

    H_peak_f, H_peak_mag = librosa.piptrack(S=H_mag,
                                            sr=sr,
                                            fmin=high_pass_f,
                                            fmax=low_pass_f)

    # First we smooth to use inter-bin information
    H_peak_f = median_filter(H_peak_f, size=(1, pitch_median))
    H_peak_mag = median_filter(H_peak_mag, size=(1, pitch_median))

    H_peak_power = np.real(H_peak_mag**2)
    H_rms = librosa.feature.rmse(S=H_peak_mag)

    if debug:
        plt.figure()
        specshow(librosa.logamplitude(H_peak_f, ref_power=np.max),
                 y_axis='log',
                 sr=sr)
        plt.title('Peak Freqs')
        plt.figure()
        specshow(librosa.logamplitude(H_peak_power, ref_power=np.max),
                 y_axis='log',
                 sr=sr)
        plt.title('Peak amps')
        plt.figure()

    # Now we pack down to the biggest few peaks:
    H_peak_f, H_peak_power = compress_peaks(H_peak_f, H_peak_power, n_peaks)

    if debug:
        plt.figure()
        specshow(librosa.logamplitude(H_peak_f, ref_power=np.max),
                 y_axis='log',
                 sr=sr)
        plt.title('Peak Freqs packed')
        plt.figure()
        specshow(librosa.logamplitude(H_peak_power, ref_power=np.max),
                 y_axis='log',
                 sr=sr)
        plt.title('Peak amps packed')
        # plt.figure()
        # plt.scatter(
        #     librosa.logamplitude(H_peak_power, ref_power=np.max),
        #     y_axis='log',
        #     sr=sr)
        # plt.title('Compressed')

    return dict(
        metadata=metadata,
        peak_f=H_peak_f,
        peak_power=H_peak_power,
        rms=y_rms,
        harm_rms=H_rms,
    )
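A hedged call of harmonic_index; it assumes the project's sfio/basicfilter/compress_peaks helpers are importable, that the placeholder path exists, and that the function writes a *.harmonic.wav next to the source as shown above.

result = harmonic_index('audio/example.wav', offset=10.0, duration=30.0)
print(result['metadata']['dur'], result['metadata']['harmonicfile'])
print(result['peak_f'].shape, result['peak_power'].shape)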
Example #29

SRC_PATH = 'I:\\dataset-esc\\ESC-10-oggtowav\\'
DST_PATH = 'I:\\dataset-esc\\ESC-10-oggtowavtrimmed\\'


folder_list = os.listdir(SRC_PATH)
folder_list.sort()


for i in range(0, len(folder_list)):
    files = os.listdir(SRC_PATH + folder_list[i])
    files.sort()
    for name in sorted(files):

        if fnmatch(name, "*.wav"):
            y, sr = librosa.load(SRC_PATH + folder_list[i] + '\\' + name)

            # trim the zeros
            yt, index = librosa.effects.trim(y)

            # create a class folder if it does not exist
            if not os.path.exists(DST_PATH + folder_list[i]):
                os.makedirs(DST_PATH + folder_list[i])

            # save the trimmed file
            librosa.output.write_wav(DST_PATH + folder_list[i] + '\\' + name, yt, sr)

            # print the original duration and the new one
            print(librosa.get_duration(y), librosa.get_duration(yt), max(y), max(yt))
Example #30
samples.sort()

spk_name = 'enbible'
lang = 'en_us'
n_skip = 0
total_dur = 0

fw = open(os.path.join(output_path, 'metadata.csv'), 'w', encoding='utf-8')

i = 0
for l in samples:
    filename, script, _ = l.split('\t')
    wav_file = os.path.join(in_path, filename + '.wav')
    if not os.path.exists(wav_file):
        print("Missing", wav_file)
        continue
    dur = librosa.get_duration(filename=wav_file)
    if not 1 <= dur <= 20 or any([c.isdigit() for c in script]):
        print(filename, script, dur)
        n_skip += 1
        continue
    total_dur += dur
    shutil.copy(wav_file,
                os.path.join(wav_output_path, '%s_%010d.wav' % (spk_name, i)))
    fw.write('|'.join(['%s_%010d' % (spk_name, i), script, spk_name, lang]) +
             '\n')
    i += 1

print("%d samples, %d skipped" % (len(samples) - n_skip, n_skip))
print("Total duration: %.2f h, %.2f min" %
      (total_dur / 60 / 60, total_dur / 60))
Example #31
else:
    etap_2 = '-'
    print('There will be no second stage')

# Record the date and time
date_time = str(datetime.now())

date_spliter = date_time.split(' ')
time_spliter = date_spliter[1].split('.')

date = date_spliter[0]
time = time_spliter[0]

# Get the id of the recognition result and the duration of the audio
unique_id = id(result)
duration = librosa.get_duration(filename=pat_h)

# "project" table
test = 'Testing'
description_1 = '-'

# Get the host name and IP. "server" table
hostname = socket.gethostname()
ip_address = socket.gethostbyname(hostname)
description_2 = '-'

# Create a log file and record the info in it
logging.basicConfig(level=logging.DEBUG, filename="tinkoff.log")
logging.debug(
    'Logging: %s', {
        'date': date,
Example #32
def gather_data(filename):
    '''
    Formats the analysis of a sound file into a single easy-to-use dictionary!
    
    filename: the path to the sound file to be analyzed
    
    returns: a dictionary with lots of juicy info!
    {
        "beats"       : a list of times at which a beat event occurs
        "framerate"   : the size of a frame in seconds
        "numframes"   : the total number of frames
        "frequencies" : a list of frequency spectrums (256 bins) at each frame
        "elevations"  : a list of the relative pitch heights of each frame 
    }
    '''
    print "Gathering song data..."

    # get our song
    y, sr = librosa.load(filename)

    # separate the foreground and background
    y_harmonic, y_percussive = librosa.effects.hpss(y)

    # compute the frequency spectrum
    S = librosa.feature.melspectrogram(y, sr=sr, n_fft=2048, hop_length=64, n_mels=316)

    # Convert to log scale (dB) using the peak power as a reference
    log_S = librosa.logamplitude(S, ref_power=np.max)

    # format the frequencies into a list of 256 amplitudes at each frame
    frequencies = [[int(f[t]+80) for f in log_S[30:-30]] for t in range(len(log_S[0])) if t%30==0]

    # repeat each value 256 times
    frequencies = [[f for f in frame for _ in range(256)] for frame in frequencies]

    # get the framerate of the frequencies
    dur = librosa.get_duration(y)
    numframes = len(frequencies)
    framerate = dur / numframes

    # calculate the times of each beat event
    tempo, beat_frames = librosa.beat.beat_track(y=y_percussive, sr=sr, hop_length=64)
    beats = [0] + [librosa.frames_to_time(b, sr=sr, hop_length=64) for b in beat_frames] + [dur]

    # get the elevation at each frame
    elevations = [ elevation(freqs) for freqs in frequencies ]
    
    # set the list of amplitudes for each semitone
    chromagram = librosa.feature.chromagram(y, sr)
    amplitudes = []
    for i in range(len(chromagram)):
        amplitudes.append(0)
        for t in chromagram[i]:
            amplitudes[i] += t

    # get the key of the whole audio
    base_key = key_finder(amplitudes)
    print(base_key)
    
    # get the base colors for the sphere
    amplitude_sum = 0.0
    for amplitude in amplitudes:
        amplitude_sum += amplitude
    print(major_score(amplitudes, base_key, amplitude_sum))
    print(minor_score(amplitudes, base_key, amplitude_sum))
    base_red, base_green, base_blue = mood_finder(major_score(
        amplitudes, base_key, amplitude_sum) < minor_score(
        amplitudes, base_key, amplitude_sum), tempo)

    base_colors = [ base_red, base_green, base_blue ]

    return {
        "beats": beats,
        "framerate": framerate,
        "numframes": numframes,
        "frequencies": frequencies,
        "elevations": elevations,
        "base_colors": base_colors,
    }
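A hedged usage sketch for gather_data above; the path is a placeholder, and the snippet relies on old librosa APIs (logamplitude, chromagram) plus the project's elevation/key_finder/mood_finder helpers, so it assumes a matching environment.

song = gather_data('audio/track01.mp3')
print(song['numframes'], 'frames,', round(song['framerate'], 4), 's per frame')
print('first beat at', song['beats'][1], 's, base colors', song['base_colors'])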
Example #33
def audio_length(filename, sr=22050):
    # sr: sample rate of the headerless float32 samples stored in the file
    with open(filename, 'rb') as file:
        y = np.frombuffer(file.read(), dtype=np.float32)
    return librosa.get_duration(y=y, sr=sr)
Example #34
    def create_video(self):
        selected_audio = filedialog.askopenfilename(parent=self,
                                                    initialdir='Resources/',
                                                    title='Select Audio File')
        if not selected_audio:
            return
        else:
            print(f'Selected Audio File: {selected_audio}')
            duration_s = librosa.get_duration(filename=selected_audio)
            duration_m = duration_s / 60
            print(f'Duration in Seconds: {duration_s}')
            print(f'Duration in Seconds: {duration_m}')

        selected_visual = filedialog.askopenfilename(
            parent=self,
            initialdir='Resources/',
            title='Select The Image For The Video')
        selected_visual_type = filetype.guess(selected_visual)
        selected_visual_mime = selected_visual_type.mime
        print(selected_visual_mime)
        if not selected_visual:
            return
        else:
            print(f'Selected Image File: {selected_visual}')

        width = 1920
        height = 1080
        vid_size = width, height
        # Flag for Custom Dimensions
        custom_dims = False
        response = messagebox.askquestion(
            'Use Custom Dimensions?',
            'Do you want to specify custom Dimensions? Defaults to 1920x1080')
        if response == 'yes':
            ui_width = simpledialog.askinteger(
                title='Specify Width', prompt='Specify the Video Width')
            ui_height = simpledialog.askinteger(
                title='Specify Height', prompt='Specify the Video Height')
            if ui_width and ui_height:
                custom_dims = True
            else:
                print('Missing Dimension Input. Defaulting to 1920x1080')

        if selected_audio and selected_visual:
            audio_path = os.path.basename(str(selected_audio))
            audio = AudioFileClip(selected_audio, fps=44100)
            visual_clip = None

            visual_clip = cast_to_clip(selected_visual, selected_visual_mime,
                                       duration_s)

            if custom_dims:
                visual_clip = resize(visual_clip, (ui_width, ui_height))

            response = messagebox.askquestion(
                'Add Watermark', 'Do you want to add a Watermark?')
            watermark_clip = None
            if response == 'yes':
                watermark = filedialog.askopenfilename(
                    parent=self,
                    initialdir='Resources/',
                    title='Select The Image For The Watermark')
                watermark_type = filetype.guess(watermark)
                watermark_mime = watermark_type.mime
                if watermark:
                    print(f'Adding Watermark: {watermark}')
                    factor_table = {'50%': 2, '33%': 3, '25%': 4, '20%': 5}
                    factor = 5
                    factor = simpledialog.askstring(
                        'Specify Scale of Watermark',
                        'Specify Scale of Watermark from [50%, 33%, 25%, 20%]')
                    if isinstance(factor,
                                  str) and factor in factor_table.keys():
                        factor = factor_table[factor]
                        print(f'Watermark Scale set to [{factor}]')

                    if 'image' in watermark_mime:
                        watermark_clip = (ImageClip(watermark).set_duration(
                            duration_s).resize(
                                (width / factor, height / factor)).set_pos(
                                    ('right', 'bottom')))
                    elif 'gif' in watermark_mime or 'video' in watermark_mime:
                        watermark_clip = (VideoFileClip(watermark).resize(
                            (width / factor, height / factor)).set_pos(
                                ('right', 'bottom')))
                        watermark_clip = watermark_clip.fx(vfx.loop,
                                                           duration=duration_s)

            if visual_clip:
                clip = visual_clip.set_audio(audio).set_duration(duration_s)
                if watermark_clip:
                    clip = CompositeVideoClip([clip, watermark_clip])
                clip.write_videofile('Exports/Test.mp4', fps=30)
Example #35
def getTimeLenSec(file0):
    y, sr = librosa.load(file0, sr=None)
    timeLen = librosa.get_duration(y, sr)

    return timeLen
Example #36
import librosa
import os
import config as hp
from tqdm import tqdm

if __name__ == "__main__":
    files = [f for f in os.listdir(hp.wav_folder) if f.endswith('.wav')]
    files = [os.path.join(hp.wav_folder, f) for f in files]
    print('num_files: ' + str(len(files)))

    total_duration = 0

    for file in tqdm(files):
        y, sr = librosa.load(file)
        duration = librosa.get_duration(y, sr)
        total_duration += duration

    print('total_duration: ' + str(total_duration))
Example #37
    def generate_vectors(self):
        '''Generates noise and class vectors as inputs for each frame'''

        PULSE_SMOOTH = 0.75
        MOTION_SMOOTH = 0.75
        classes = self.classes
        class_shuffle_seconds = self.class_shuffle_seconds or [0]
        class_shuffle_strength = round(self.class_shuffle_strength * 12)
        fps = self.fps
        class_smooth_frames = self.class_smooth_seconds * fps
        motion_react = self.motion_react * 20 / fps

        # Get number of noise vectors to initialize (based on speed_fpm)
        num_init_noise = round(
            librosa.get_duration(self.wav, self.sr) / 60 * self.speed_fpm)

        # If num_init_noise < 2, simply initialize the same
        # noise vector for all frames
        if num_init_noise < 2:

            noise = [self.truncation * \
                     truncnorm.rvs(-2, 2,
                                   size = (self.batch_size, self.input_shape)) \
                              .astype(np.float32)[0]] * \
                    len(self.spec_norm_class)

        # Otherwise, initialize num_init_noise different vectors, and generate
        # linear interpolations between these vectors
        else:

            # Initialize vectors
            init_noise = [self.truncation * \
                          truncnorm.rvs(-2, 2,
                                        size=(self.batch_size, self.input_shape)) \
                                   .astype(np.float32)[0]\
                          for i in range(num_init_noise)]

            # Compute number of steps between each pair of vectors
            steps = int(
                np.floor(len(self.spec_norm_class)) / len(init_noise) - 1)

            # Interpolate
            noise = full_frame_interpolation(init_noise, steps,
                                             len(self.spec_norm_class))

        # Initialize lists of Pulse, Motion, and Class vectors
        pulse_noise = []
        motion_noise = []
        self.class_vecs = []

        # Initialize "base" vectors based on Pulse/Motion Reactivity values
        pulse_base = np.array([self.pulse_react] * self.input_shape)
        motion_base = np.array([motion_react] * self.input_shape)

        # Randomly initialize "update directions" of noise vectors
        self.motion_signs = np.array([random.choice([1,-1]) \
                                      for n in range(self.input_shape)])

        # Randomly initialize factors based on motion_randomness
        rand_factors = np.array([random.choice([1,1-self.motion_randomness]) \
                                 for n in range(self.input_shape)])

        for i in range(len(self.spec_norm_class)):

            # UPDATE NOISE #

            # Re-initialize randomness factors every 4 seconds
            if i % round(fps * 4) == 0:
                rand_factors = np.array([random.choice([1, 1-self.motion_randomness]) \
                                     for n in range(self.input_shape)])

            # Generate incremental update vectors for Pulse and Motion
            pulse_noise_add = pulse_base * self.spec_norm_pulse[i]
            motion_noise_add = motion_base * self.spec_norm_motion[i] * \
                               self.motion_signs * rand_factors

            # Smooth each update vector using a weighted average of
            # itself and the previous vector
            if i > 0:
                pulse_noise_add = pulse_noise[i-1]*PULSE_SMOOTH + \
                                  pulse_noise_add*(1 - PULSE_SMOOTH)
                motion_noise_add = motion_noise[i-1]*MOTION_SMOOTH + \
                                   motion_noise_add*(1 - MOTION_SMOOTH)

            # Append Pulse and Motion update vectors to respective lists
            pulse_noise.append(pulse_noise_add)
            motion_noise.append(motion_noise_add)

            # Update current noise vector by adding current Pulse vector and
            # a cumulative sum of Motion vectors
            noise[i] = noise[i] + pulse_noise_add + sum(motion_noise[:i + 1])
            self.noise = noise
            self.current_noise = noise[i]

            # Update directions
            self.motion_signs = self.update_motion_signs()

            # UPDATE CLASSES #

            # If current frame is a shuffle frame, shuffle classes accordingly
            if self.is_shuffle_frame(i):
                self.classes = self.classes[class_shuffle_strength:] + \
                               self.classes[:class_shuffle_strength]

            # Generate class update vector and append to list
            class_vec_add = self.generate_class_vec(frame=i)
            self.class_vecs.append(class_vec_add)

        # Smoothen class vectors by obtaining the mean vector per
        # class_smooth_frames frames, and interpolating between these vectors
        if class_smooth_frames > 1:

            # Obtain mean vectors
            class_frames_interp = [np.mean(self.class_vecs[i:i + class_smooth_frames],
                                           axis = 0) \
                                  for i in range(0, len(self.class_vecs),
                                                 class_smooth_frames)]
            # Interpolate
            self.class_vecs = full_frame_interpolation(class_frames_interp,
                                                       class_smooth_frames,
                                                       len(self.class_vecs))
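full_frame_interpolation is a project-specific helper; below is a generic, illustrative sketch of the kind of linear interpolation between consecutive noise vectors it is used for here (names and shapes are illustrative, not the project's API).

import numpy as np

def lerp_vectors(vecs, steps_between):
    '''Linearly interpolate between consecutive vectors (illustration only).'''
    out = []
    for a, b in zip(vecs[:-1], vecs[1:]):
        for t in np.linspace(0.0, 1.0, steps_between, endpoint=False):
            out.append((1 - t) * a + t * b)
    out.append(vecs[-1])
    return out

frames = lerp_vectors([np.zeros(512), np.ones(512)], steps_between=30)
print(len(frames), frames[15][:3])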
Example #38
#model_cutoff = 200

render_duration = 1000.0
render_size = 1920
extension = 'png'
model_cutoff = 200

beats_per_frame = 4
sigma_weight = 1 / 2.5
exageration_weight = 0.10
exageration_sigma = 1 / 5.0
fps = 30

f_wav = "sound/secret_crates.wav"
WAV, sr = librosa.load(f_wav, duration=render_duration)
total_seconds = librosa.get_duration(WAV, sr)

save_dest = "results/interpolation_matching_beats"
os.system(f'mkdir -p {save_dest}')

f_beats = f_wav + '_beats.npy'
f_onset = f_wav + '_onset.npy'
beats = np.load(f_beats)
onsets = np.load(f_onset)

sess = create_session()
t_size = tf.placeholder_with_default(200, [])
t_image = cppn(t_size)
train_vars = sess.graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

Example #39
#MARTINV
#FRANCISCOS
#ALANG
#FERNANDOO
#CESARA
#MARIOV
#KEIKOF
#VERONICAM
#LOCUTORA1
#LOCUTORA2

nombre_politico = "_JULIOG"
step = 1  # Interval in seconds of the audio
file_name = 'audio0010'  # name of the file
file_name_full = "pasos/" + file_name + ".wav"  # Create a "pasos" folder inside the project folder
len_audio = int(librosa.get_duration(
    filename=file_name_full))  # Length of the audio in seconds.
print("The audio has a duration of ", len_audio, " seconds")
espaciado = "_______________________________"
audio_vacio = []

# Open the wav file and convert it to mono.
sound = AudioSegment.from_wav(file_name_full)
sound = sound.set_channels(1)
file_name_full_mono = "pasos/" + file_name + "_mono.wav"
sound.export(file_name_full_mono, format="wav")

data1, rate1 = sf.read(file_name_full_mono)  # Load the audio
meter1 = pyln.Meter(rate1)
loudness1 = meter1.integrated_loudness(data1)
print(loudness1)
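Once the integrated loudness has been measured, pyloudnorm can also bring the clip to a target loudness. A short sketch follows; the -23.0 LUFS target and the output file name are arbitrary choices for illustration, not part of this script.

import pyloudnorm as pyln
import soundfile as sf

target_lufs = -23.0  # assumed target; pick whatever the project requires
normalized = pyln.normalize.loudness(data1, loudness1, target_lufs)
sf.write("pasos/" + file_name + "_mono_norm.wav", normalized, rate1)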
Example #40
0
    def add_background(cls,
                       file_name,
                       noise_path,
                       out_dir,
                       len_noise_to_add=5.0):
        '''
        Takes an absolute file path and the path to a
        directory that contains noise to overlay onto the
        given sound file (wind, rain, etc.).

        Overlays a randomly chosen noise snippet onto the
        recording, writes the result to out_dir, and returns
        the full path of the new audio file. The generated
        file name encodes elements such as the noise source
        and where in the recording the noise was inserted.

        :param file_name: absolute path to sound file
        :type file_name: str
        :param noise_path: absolute path to directory
            with noise files
        :type noise_path: str
        :param out_dir: destination directory of new audio file
        :type out_dir: str
        :param len_noise_to_add: how much of a noise snippet
            to overlay (seconds)
        :type len_noise_to_add: float
        :return: full path of new audio file
        :rtype: str
        '''

        len_noise_to_add = float(len_noise_to_add)
        backgrounds = os.listdir(noise_path)

        # Pick a random noise file:
        background_name = backgrounds[random.randint(0, len(backgrounds) - 1)]

        cls.log.info(f"Adding {background_name} to {file_name}.")

        # We will be working with 1 second as the smallest unit of time.
        # Load both wav files and determine the length of each.
        noise, noise_sr = SoundProcessor.load_audio(
            os.path.join(noise_path,
                         background_name))  # type(noise) = np.ndarray
        orig_recording, orig_sr = SoundProcessor.load_audio(file_name)

        new_sr = math.gcd(noise_sr, orig_sr)
        if noise_sr != orig_sr:
            # Resample both noise and orig records so that they have same sample rate
            cls.log.info(f"Resampling: {background_name} and {file_name}")
            noise = librosa.resample(noise, noise_sr, new_sr)
            orig_recording = librosa.resample(orig_recording, orig_sr, new_sr)
            # input("ready?")

        noise_duration = librosa.get_duration(noise, noise_sr)
        if noise_duration < len_noise_to_add:
            cls.log.info(
                f"Duration:{noise_duration} < len_noise_to_add:{len_noise_to_add}. Will only add {noise_duration}s of noise"
            )
            samples_per_segment = len(noise)
        elif noise_duration >= len_noise_to_add:  # randomly choose noise segment
            samples_per_segment = int(
                new_sr * len_noise_to_add
            )  # number of samples in len_noise_to_add seconds
            # Place noise randomly:
            subsegment_start = random.randint(0,
                                              len(noise) - samples_per_segment)
            noise = noise[subsegment_start:subsegment_start +
                          samples_per_segment]
        cls.log.info(
            f"len(noise) after random segment: {len(noise)}; noise duration: {len(noise)/new_sr}"
        )

        orig_duration = librosa.core.get_duration(orig_recording, orig_sr)
        # if orig_recording is shorter than the noise we want to add, just add 5% noise
        if orig_duration < len_noise_to_add:
            cls.log.info(
                f"Recording: {file_name} was shorter than len_noise_to_add. Adding 5% of recording len worth of noise."
            )
            new_noise_len = orig_duration * 0.05
            noise = noise[:int(new_noise_len * new_sr)]
        noise_start_loc = random.randint(
            0,
            len(orig_recording) - samples_per_segment)
        cls.log.info(
            f"Inserting noise starting at {noise_start_loc/new_sr} seconds.")
        # split original into three parts: before_noise, during_noise, after_noise
        before_noise = orig_recording[:noise_start_loc]
        during_noise = orig_recording[noise_start_loc:noise_start_loc +
                                      samples_per_segment]
        after_noise = orig_recording[noise_start_loc + samples_per_segment:]

        assert len(during_noise) == len(noise)

        segment_with_noise = during_noise + Utils.noise_multiplier(
            orig_recording, noise) * noise
        first_half = np.concatenate((before_noise, segment_with_noise))
        new_sample = np.concatenate(
            (first_half, after_noise))  # full recording with the noisy segment spliced in
        new_duration = librosa.get_duration(new_sample, float(new_sr))

        assert new_duration == orig_duration
        # File name w/o extension:
        sample_file_stem = Path(file_name).stem
        noise_file_stem = Path(background_name).stem
        noise_dur = str(int(noise_start_loc / new_sr * 1000))
        file_name = f"{sample_file_stem}-{noise_file_stem}_bgd{noise_dur}ms.wav"

        # Ensure that the fname doesn't exist:
        uniq_fname = Utils.unique_fname(out_dir, file_name)
        out_path = os.path.join(out_dir, uniq_fname)

        soundfile.write(out_path, new_sample, new_sr)
        return out_path
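A possible call site for this method, assuming it is defined as a classmethod on the SoundProcessor class it references internally; all paths below are placeholders.

# Hypothetical usage; file and directory paths are placeholders
augmented_path = SoundProcessor.add_background(
    "/data/recordings/sample001.wav",   # recording to augment
    "/data/noise/",                     # directory of noise clips (wind, rain, ...)
    "/data/augmented/",
    len_noise_to_add=5.0)
print(f"Augmented file written to {augmented_path}")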
Example #41
0
train_sets = ['test']

for train_set in train_sets:
  #train_csv = base_dir + train_set + '.csv'
  train_audio_dir = base_dir + train_set + '/'
  
  output_train_csv = ourput_dir + train_set + '.csv'

  with open(output_train_csv, 'w') as ofp:
    # Write the CSV header once, before iterating over the audio files
    ofp.write('duration,sampling_rate,waveform_length,fname\n')
    for audio_file in tqdm(os.listdir(train_audio_dir)):
      audio_file_path = train_audio_dir + audio_file
      wave_form, sampling_rate = librosa.load(audio_file_path)
      duration = librosa.get_duration(y=wave_form, sr=sampling_rate)
      output_line = str(duration) + ',' + str(sampling_rate) + ',' + str(len(wave_form)) + ',' + audio_file + '\n'
      ofp.write(output_line)

Example #42
0
ref_recording, sr = music_parser.readMusicFile(f'assets/{ref_track}')
test_recording, sr = music_parser.readMusicFile(f'assets/{test_track}')

# Importing the metadata from the JSON-file
meta_data = JSON_Classifier()
meta_data.readJSON('assets/testdata.json')

# Splitting the audio file in segments, according to the metadata
segment_list = music_parser.splitReferenceRecording(
    meta_data.segments,
    sr,
    ref_recording,
)

# Feature Extraction/Definition
ref_length = librosa.get_duration(ref_recording, sr=sr)
test_length = librosa.get_duration(test_recording, sr=sr)
frame_length = 9600
hopsize = 4800
window = 'hann'

# Creating each chromagram
ref_chromagram = music_parser.compute_chromagrams(segment_list,
                                                  sr,
                                                  norm=None,
                                                  hop_length=hopsize,
                                                  n_fft=frame_length,
                                                  window=window,
                                                  tuning=0)
test_chromagram = music_parser.compute_one_chromagram(test_recording,
                                                      sr,
Example #43
0
def save_trimwav(wavpath, putpath, top_db=20):
    y, sr = lib.load(wavpath)
    yt, index = lib.effects.trim(y, top_db=top_db)
    print(lib.get_duration(y), lib.get_duration(yt))
    lib.output.write_wav(putpath, yt, sr)
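librosa.output.write_wav was deprecated and removed in newer librosa releases (0.8+), so on current versions the same trim-and-save routine could be written with soundfile instead; a sketch:

import librosa as lib
import soundfile as sf

def save_trimwav_sf(wavpath, putpath, top_db=20):
    y, sr = lib.load(wavpath)
    yt, index = lib.effects.trim(y, top_db=top_db)
    print(lib.get_duration(y=y, sr=sr), lib.get_duration(y=yt, sr=sr))
    sf.write(putpath, yt, sr)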
Example #44
0
import matplotlib.pyplot as plt
import librosa
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# Utility
import os
import glob
import numpy as np
from tqdm import tqdm
import itertools

dataset = []
for folder in ["dataset/set_a/**", "dataset/set_b/**"]:
    for filename in glob.iglob(folder):
        if os.path.exists(filename):
            label = os.path.basename(filename).split("_")[0]
            # skip audio smaller than 4 secs
            if librosa.get_duration(filename=filename) >= 4:
                if label not in ["Aunlabelledtest", "Bunlabelledtest"]:
                    dataset.append({"filename": filename, "label": label})
dataset = pd.DataFrame(dataset)
dataset = shuffle(dataset, random_state=42)

print(dataset.info())

plt.figure(figsize=(12, 6))
dataset.label.value_counts().plot(kind='bar', title="Dataset distribution")
plt.show()

train, test = train_test_split(dataset, test_size=0.2, random_state=42)

print("Train: %i" % len(train))
print("Test: %i" % len(test))
Example #45
0
def generate_json(wavfile, DT_ID, song_db):
    indv = wavfile.parent.parent.stem
    dt = datetime.strptime(wavfile.stem, "%Y-%m-%d_%H-%M-%S-%f")
    datestring = dt.strftime("%Y-%m-%d")

    row=song_db[
                (song_db.SubjectName == indv)
                & (song_db.recording_date == datestring)
                & (song_db.recording_time == dt.time())
            ].iloc[0]

    # make json dictionary
    json_dict = {}
    for key in dict(row).keys():
        if type(row[key]) == pd._libs.tslibs.timestamps.Timestamp:
            json_dict[key] = row[key].strftime("%Y-%m-%d_%H-%M-%S")
        elif type(row[key]) == dtt:
            json_dict[key] = row[key].strftime("%H:%M:%S")
        elif type(row[key]) == pd._libs.tslibs.nattype.NaTType:
            continue
        else:
            json_dict[key] = row[key]


    json_dict["species"] = "Toxostoma redivivum"
    json_dict["common_name"] = "California thrasher"
    json_dict["datetime"] = datestring

    sr = get_samplerate(wavfile.as_posix())
    wav_duration = librosa.get_duration(filename=wavfile.as_posix())

    # rate and length
    json_dict["samplerate_hz"] = sr
    json_dict["length_s"] = wav_duration
    json_dict["wav_loc"] = wavfile.as_posix()

    tg = wavfile.parent.parent / "TextGrids" / (wavfile.stem + ".TextGrid")

    textgrid = tgio.openTextgrid(fnFullPath=tg)

    tierlist = textgrid.tierDict[textgrid.tierNameList[0]].entryList
    start_times = [i.start for i in tierlist]
    end_times = [i.end for i in tierlist]
    labels = [i.label for i in tierlist]

    json_dict["indvs"] = {
        indv: {
            "syllables": {
                "start_times": NoIndent(start_times),
                "end_times": NoIndent(end_times),
                "labels": NoIndent(labels),
            }
        }
    }

    # generate json
    json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)


    json_out = (
        DATA_DIR / "processed" / DATASET_ID / DT_ID / "JSON" / (wavfile.stem + ".JSON")
    )

    # save json
    ensure_dir(json_out.as_posix())
    print(json_txt, file=open(json_out.as_posix(), "w"))
Example #46
0
def extract_ground_truth(diagnostics_group):
    """
    Extract ground-truth information from one or more MIDI files about a single
    MIDI file based on the results in one or more diagnostics files and return
    a JAMS object with all of the annotations compiled.

    Parameters
    ----------
    - diagnostics_group : list of dict
        List of dicts of diagnostics, each about a successful alignment of a
        different MIDI file to a single audio file.
    """
    # Construct the JAMS object
    jam = jams.JAMS()
    # Load in the first diagnostics (doesn't matter which, as they all
    # should correspond to the same audio file)
    diagnostics = diagnostics_group[0]
    # Load in the audio file to get its duration for the JAMS file
    audio, fs = librosa.load(
        diagnostics['audio_filename'], feature_extraction.AUDIO_FS)
    jam.file_metadata.duration = librosa.get_duration(y=audio, sr=fs)
    # Also store metadata about the audio file, retrieved from the MSD
    jam.file_metadata.identifiers = {'track_id': diagnostics['audio_id']}
    jam.file_metadata.artist = MSD_LIST[diagnostics['audio_id']]['artist']
    jam.file_metadata.title = MSD_LIST[diagnostics['audio_id']]['title']

    # Iterate over the diagnostics files supplied
    for diagnostics in diagnostics_group:

        # Create annotation metadata object, shared across annotations
        commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip()
        commit_url = "http://github.com/craffel/midi-dataset/tree/" + commit
        annotator = {'midi_md5': diagnostics['midi_md5'],
                     'commit_url': commit_url,
                     'confidence': diagnostics['score']}
        annotation_metadata = jams.AnnotationMetadata(
            curator=jams.Curator('Colin Raffel', '*****@*****.**'),
            version='0.0.1b', corpus='Million Song Dataset MIDI Matches',
            annotator=annotator,
            annotation_tools=(
                'MIDI files were matched and aligned to audio files using the '
                'code at http://github.com/craffel/midi-dataset.  Information '
                'was extracted from MIDI files using pretty_midi '
                'https://github.com/craffel/pretty-midi.'),
            annotation_rules=(
                'Beat locations and key change times were linearly '
                'interpolated according to an audio-to-MIDI alignment.'),
            validation=(
                'Only MIDI files with alignment confidence scores >= .5 were '
                'considered "correct".  The confidence score can be used as a '
                'rough guide to the potential correctness of the annotation.'),
            data_source='Inferred from a MIDI file.')

        # Load the extracted features
        midi_features = deepdish.io.load(diagnostics['midi_features_filename'])
        audio_features = deepdish.io.load(
            diagnostics['audio_features_filename'])
        # Load in the original MIDI file
        midi_object = pretty_midi.PrettyMIDI(diagnostics['midi_filename'])
        # Compute the times of the frames (will be used for interpolation)
        midi_frame_times = feature_extraction.frame_times(
            midi_features['gram'])[diagnostics['aligned_midi_indices']]
        audio_frame_times = feature_extraction.frame_times(
            audio_features['gram'])[diagnostics['aligned_audio_indices']]

        # Get the interpolated beat locations and add them to the JAM
        adjusted_beats = interpolate_times(
            midi_object.get_beats(), midi_frame_times, audio_frame_times)
        # Create annotation record for the beats
        beat_a = jams.Annotation(namespace='beat')
        beat_a.annotation_metadata = annotation_metadata
        # Add beat timings to the annotation record
        for t in adjusted_beats:
            beat_a.append(time=t, duration=0.0)
        # Add beat annotation record to the JAMS file
        jam.annotations.append(beat_a)

        # Get key signature times and their string names
        key_change_times = [c.time for c in midi_object.key_signature_changes]
        key_names = [pretty_midi.key_number_to_key_name(c.key_number)
                     for c in midi_object.key_signature_changes]
        # JAMS requires that the key name be supplied in the form e.g.
        # "C:major" but pretty_midi returns things in the format "C Major",
        # so the following code converts to JAMS format
        key_names = [name.replace(' ', ':').replace('M', 'm')
                     for name in key_names]
        # Compute interpolated event times
        adjusted_key_change_times, adjusted_key_names = interpolate_times(
            key_change_times, midi_frame_times, audio_frame_times, key_names,
            True)
        # Create JAMS annotation for the key changes
        if len(adjusted_key_change_times) > 0:
            key_a = jams.Annotation(namespace='key_mode')
            key_a.annotation_metadata = annotation_metadata
            # We only have key start times from the MIDI file, but JAMS wants
            # durations too, so create a list of "end times"
            end_times = np.append(adjusted_key_change_times[1:],
                                  jam.file_metadata.duration)
            # Add key labels into the JAMS file
            for start, end, key in zip(adjusted_key_change_times, end_times,
                                       adjusted_key_names):
                key_a.append(time=start, duration=end - start, value=key)
            jam.annotations.append(key_a)

    return jam
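Once assembled, the returned JAMS object can be serialized with its save method; a minimal usage sketch (the output directory and naming scheme are illustrative):

# Illustrative only: compile annotations for one group and write them to disk
jam = extract_ground_truth(diagnostics_group)
jam.save('annotations/{}.jams'.format(diagnostics_group[0]['audio_id']))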
Example #47
0
    embed = encoder.embed_utterance(preprocessed_wav)
    print("Created the embedding")

    texts = [in_text]
    embeds = [embed]
    #embeds = [[0] * 256]

    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    spec = specs[0]
    print("Created the mel spectrogram")

    griffin_lim = False
    if not griffin_lim:
        generated_wav = vocoder.infer_waveform(spec)
        generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                               mode="constant")
        generated_wav = encoder.preprocess_wav(generated_wav)
    else:
        generated_wav = Synthesizer.griffin_lim(spec)

    write = True
    if write:
        sf.write(out_path, generated_wav.astype(np.float32),
                 round(synthesizer.sample_rate / 1.0))
        print("Audio file has been written.")

    audio_length = librosa.get_duration(generated_wav, sr=14545)
    sd.play(generated_wav.astype(np.float32),
            round(synthesizer.sample_rate / 1.0))
    time.sleep(audio_length)
    print("Done")
Example #48
0
    def __test_audio(filename, mono, sr, duration):
        y, sr = librosa.load(filename, sr=sr, mono=mono, duration=duration)

        duration_est = librosa.get_duration(y=y, sr=sr)

        assert np.allclose(duration_est, duration, rtol=1e-3, atol=1e-5)
Example #49
0
    def predict(self, audio_data, pedal_gt, batch_size=4):
        pedal_onset_gt, pedal_offset_gt = pedal_gt
        # ======= Get prediction of all onsets =======
        len_onset_shape = int(
            SAMPLING_RATE * (TRIM_SECOND_BEFORE + TRIM_SECOND_AFTER))
        onsethop_length = HOP_LENGTH
        onsethop_duration = onsethop_length / SAMPLING_RATE
        n_onset = int(
            np.ceil((len(audio_data) - len_onset_shape) / onsethop_length))
        print("Audio duration: {}s".format(
            librosa.get_duration(y=audio_data, sr=SAMPLING_RATE)))
        print("n_onset: {}".format(n_onset))

        data_full_song = FullSongDataset(
            audio_data, n_onset, len_onset_shape, "onset", onsethop_length)
        loader = DataLoader(data_full_song, batch_size)
        pred_onset, _ = models.run_on_dataset(
            self.onset_model, loader, device=self.device)
        pred_onset = pred_onset.squeeze()
        # print("Onset Prediction:\n{}".format(pred_onset))

        pred_onset_filter = medfilt(pred_onset, 15)
        frmtime_onset = np.arange(n_onset) * \
            onsethop_duration + TRIM_SECOND_BEFORE
        print("Filtered Onset Prediction:\n{}\nMax: {}, Min: {}\n".format(
            pred_onset_filter, np.max(pred_onset_filter), np.min(pred_onset_filter)))

        # ======= Get prediction of all segments =======
        len_segment_shape = int(SAMPLING_RATE * MIN_SRC)
        seghop_length = HOP_LENGTH * 10
        seghop_duration = seghop_length / SAMPLING_RATE
        n_segment = int(
            np.ceil((len(audio_data) - len_segment_shape) / seghop_length))
        print("n_segment: {}".format(n_segment))

        data_full_song.type_excerpt, data_full_song.n_elem = "segment", n_segment
        pred_segment, _ = models.run_on_dataset(
            self.segment_model, loader, device=self.device)
        pred_segment = pred_segment.squeeze()
        # print("Segment Prediction:\n{}".format(pred_segment))

        pred_segment_filter = medfilt(pred_segment, 3)

        frmtime_segment = np.arange(n_segment) * seghop_duration + MIN_SRC / 2
        audio_firstonsettime = librosa.frames_to_time(
            librosa.onset.onset_detect(y=audio_data, sr=SAMPLING_RATE), sr=SAMPLING_RATE)[0]
        n_segment_tozero = 0
        for t in frmtime_segment:
            if t < audio_firstonsettime:
                n_segment_tozero += 1
            else:
                break
        print("length frmtime_segment: {}".format(len(frmtime_segment)))
        print("n_segment_tozero: {}".format(n_segment_tozero))
        pred_segment_filter[:n_segment_tozero] = 0
        print("Filtered Segment Prediction:\n{}".format(pred_segment))

        # ======= Fuse Prediction =======
        self.onset_threshold = np.median(pred_onset_filter)
        self.segment_threshold = np.median(pred_segment_filter)

        pred_onset_todetect = np.copy(pred_onset_filter)
        # print(pred_onset_todetect)
        pred_onset_todetect[pred_onset_todetect < self.onset_threshold] = 0
        pred_onset_todetect[pred_onset_todetect >= self.onset_threshold] = 1

        pred_segment_todetect = np.copy(pred_segment_filter)
        pred_segment_todetect[pred_segment_todetect <
                              self.segment_threshold] = 0
        pred_segment_todetect[pred_segment_todetect >=
                              self.segment_threshold] = 1

        # print(pred_segment_todetect.any())
        # print(pred_onset_todetect.any())

        # decide the initial indexes of pedal segment boundary
        onseg_initidxs = []
        offseg_initidxs = []
        for idx, v in enumerate(pred_segment_todetect):
            if idx > 0 and idx < len(pred_segment_todetect) - 1:
                if pred_segment_todetect[idx - 1] == 0 and v == 1 and pred_segment_todetect[idx + 1] == 1:
                    onseg_initidxs.append(idx - 1)
                elif pred_segment_todetect[idx - 1] == 1 and v == 1 and pred_segment_todetect[idx + 1] == 0:
                    offseg_initidxs.append(idx + 1)

        print("onseg_initidxs: {}\n{}".format(
            len(onseg_initidxs), onseg_initidxs))
        print("offseg_initidxs: {}\n{}".format(
            len(offseg_initidxs), offseg_initidxs))

        if offseg_initidxs[0] <= onseg_initidxs[0]:
            del offseg_initidxs[0]
        if onseg_initidxs[-1] >= offseg_initidxs[-1]:
            del onseg_initidxs[-1]

        if (len(onseg_initidxs) != len(offseg_initidxs)) or not len(pedal_offset_gt) or not len(pedal_onset_gt):
            print(" skip!")
        else:
            onseg_idxs = []
            offseg_idxs = []
            for idx in range(len(onseg_initidxs)):
                if onseg_initidxs[idx] < offseg_initidxs[idx]:
                    onseg_idxs.append(onseg_initidxs[idx])
                    offseg_idxs.append(offseg_initidxs[idx])

            if not len(onseg_idxs) or not len(offseg_idxs):
                print("no detection!")
            else:
                # decide the boundary times in seconds, combining the effect of pedal onset
                onseg_times = []
                offseg_times = []
                for idx, onseg_idx in enumerate(onseg_idxs):
                    onponset_idx = onseg_idx * 10 - 5
                    if any(pred_onset_todetect[onponset_idx - 5: onponset_idx + 5]):
                        offseg_idx = offseg_idxs[idx]
                        offseg_times.append(frmtime_segment[offseg_idx])
                        onseg_times.append(frmtime_segment[onseg_idx])
                segintervals_est = np.stack(
                    (np.asarray(onseg_times), np.asarray(offseg_times)), axis=-1)

                # set the ground truth and estimation results frame by frame
                audio_duration = librosa.get_duration(
                    y=audio_data, sr=SAMPLING_RATE)
                n_frames = int(np.ceil(audio_duration / seghop_duration))
                segframes_gt = np.zeros(n_frames)
                segframes_est = np.zeros(n_frames)

                longpseg_idx = np.where(
                    (pedal_offset_gt-pedal_onset_gt) > seghop_duration)[0]
                longseg_onset_gt = pedal_onset_gt[longpseg_idx]
                longseg_offset_gt = pedal_offset_gt[longpseg_idx]
                segintervals_gt = np.stack(
                    (longseg_onset_gt, longseg_offset_gt), axis=-1)

                for idx, onset_t in enumerate(longseg_onset_gt):
                    offset_t = longseg_offset_gt[idx]
                    onset_frm = int(onset_t // seghop_duration)
                    offset_frm = int(offset_t // seghop_duration)
                    segframes_gt[onset_frm:offset_frm] = 1

                for idx, onset_t in enumerate(onseg_times):
                    offset_t = offseg_times[idx]
                    onset_frm = int(onset_t // seghop_duration)
                    offset_frm = int(offset_t // seghop_duration)
                    segframes_est[onset_frm: offset_frm] = 1

                # set the ground truth and estimation results as interval format
                segintervals1_gt, segintervals01_gt, labels_gt = intervals1tointervals01(
                    segintervals_gt, audio_duration)
                segintervals1_est, segintervals01_est, labels_est = intervals1tointervals01(
                    segintervals_est, audio_duration)

                # Plot and return results only when a detection was produced
                # (n_frames and segframes_* are defined in that branch)
                frmtimes = np.arange(n_frames) * seghop_duration
                # left, right = [150, 170]
                plt.figure(figsize=(15, 5))
                librosa.display.waveplot(audio_data, SAMPLING_RATE, alpha=0.8)
                plt.fill_between(frmtimes, 0, 0.5, where=segframes_gt >
                                 0, facecolor='green', alpha=0.7, label='ground truth')
                plt.fill_between(frmtimes, -0.5, 0, where=segframes_est >
                                 0, facecolor='orange', alpha=0.7, label='estimation')
                # plt.title("Pedal segment detection of {}".format(filename))
                plt.legend()
                # plt.xlim([left,right])
                # plt.show()
                plt.savefig("test")

                return segframes_est
Example #50
0
def test_get_duration_fail():
    librosa.get_duration(y=None, S=None, filename=None)
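With no y, S, or filename supplied, librosa.get_duration raises a ParameterError, so under pytest this check would typically be phrased as follows (a sketch, not the original test harness):

import pytest
import librosa
from librosa.util.exceptions import ParameterError

def test_get_duration_fail():
    # get_duration needs at least one of y, S, or filename
    with pytest.raises(ParameterError):
        librosa.get_duration(y=None, S=None, filename=None)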
Example #51
0
def main(wav_path,
         text_path=None,
         rttm_path=None,
         uem_path=None,
         ctm_path=None,
         manifest_filepath=None,
         add_duration=False):
    if os.path.exists(manifest_filepath):
        os.remove(manifest_filepath)

    wav_pathlist = read_file(wav_path)
    wav_pathdict = get_dict_from_wavlist(wav_pathlist)
    len_wavs = len(wav_pathlist)
    uniqids = sorted(wav_pathdict.keys())

    text_pathdict = get_path_dict(text_path, uniqids, len_wavs)
    rttm_pathdict = get_path_dict(rttm_path, uniqids, len_wavs)
    uem_pathdict = get_path_dict(uem_path, uniqids, len_wavs)
    ctm_pathdict = get_path_dict(ctm_path, uniqids, len_wavs)

    lines = []
    for uid in uniqids:
        wav, text, rttm, uem, ctm = (
            wav_pathdict[uid],
            text_pathdict[uid],
            rttm_pathdict[uid],
            uem_pathdict[uid],
            ctm_pathdict[uid],
        )

        audio_line = wav.strip()
        if rttm is not None:
            rttm = rttm.strip()
            labels = rttm_to_labels(rttm)
            num_speakers = len(
                Counter([l.split()[-1] for l in labels]))
        else:
            num_speakers = None

        if uem is not None:
            uem = uem.strip()

        if text is not None:
            text = open(text.strip()).readlines()[0].strip()
        else:
            text = "-"

        if ctm is not None:
            ctm = ctm.strip()

        duration = None
        if add_duration:
            y, sr = librosa.load(audio_line, sr=None)
            duration = librosa.get_duration(y=y, sr=sr)
        meta = [{
            "audio_filepath": audio_line,
            "offset": 0,
            "duration": duration,
            "label": "infer",
            "text": text,
            "num_speakers": num_speakers,
            "rttm_filepath": rttm,
            "uem_filepath": uem,
            "ctm_filepath": ctm,
        }]
        lines.extend(meta)

    write_file(manifest_filepath, lines, range(len(lines)))
Example #52
0
    def create_animated_video(self):
        # Load in Audio file
        selected_audio = filedialog.askopenfilename(parent=self,
                                                    initialdir='Resources/',
                                                    title='Select Audio File')
        if not selected_audio:
            return
        else:
            print(f'Selected Audio File: {selected_audio}')
            duration_s = librosa.get_duration(filename=selected_audio)
            duration_m = duration_s / 60
            print(f'Duration in Seconds: {duration_s}')
            print(f'Duration in Minutes: {duration_m}')

        # Load in Image/Video to use as Background
        selected_background = filedialog.askopenfilename(
            parent=self,
            initialdir='Resources/',
            title='Select The Background For The Video')
        selected_background_type = filetype.guess(selected_background)
        selected_background_mime = selected_background_type.mime
        print(selected_background_mime)
        if not selected_background:
            return
        else:
            print(f'Selected Background File: {selected_background}')

        # Load in Image to be Animated
        selected_animation = filedialog.askopenfilename(
            parent=self,
            initialdir='Resources/',
            title='Select The Asset to Animate. Transparent PNGs work best')
        selected_animation_type = filetype.guess(selected_animation)
        selected_animation_mime = selected_animation_type.mime
        print(selected_background_mime)
        if not selected_animation:
            return
        else:
            print(f'Selected File for Animation: {selected_animation}')
            image = Image.open(selected_animation)
            animation_width, animation_height = image.size

        width = 1920
        height = 1080
        vid_size = width, height
        # Flag for Custom Dimensions
        custom_dims = False
        response = messagebox.askquestion(
            'Use Custom Dimensions?',
            'Do you want to specify custom Dimensions? Defaults to 1920x1080')
        if response == 'yes':
            ui_width = simpledialog.askinteger(
                title='Specify Width', prompt='Specify the Video Width')
            ui_height = simpledialog.askinteger(
                title='Specify Height', prompt='Specify the Video Height')
            if ui_width and ui_height:
                custom_dims = True
            else:
                print('Missing Dimension Input. Defaulting to 1920x1080')

        if selected_audio and selected_background and selected_animation:
            audio_path = os.path.basename(str(selected_audio))
            audio = AudioFileClip(selected_audio, fps=44100)
            bg_visual_clip = None
            animation_clip = None

            # Determine Type and Cast to Moviepy Clip
            bg_visual_clip = cast_to_clip(selected_background,
                                          selected_background_mime, duration_s)
            if custom_dims:
                bg_visual_clip = resize(bg_visual_clip, (ui_width, ui_height))

            if selected_animation:
                factor_table = {'50%': 2, '33%': 3, '25%': 4, '20%': 5}
                factor = 5
                factor = simpledialog.askstring(
                    'Specify Scale of Animated Visual',
                    'Specify Scale of Animated Visual from [50%, 33%, 25%, 20%]'
                )
                if isinstance(factor, str) and factor in factor_table.keys():
                    factor = factor_table[factor]
                    print(f'Animation scale set to [{factor}]')

                if 'image' in selected_animation_mime:
                    animation_clip = (ImageClip(
                        selected_animation).set_duration(duration_s).resize(
                            (animation_width / factor,
                             animation_height / factor)).set_pos(
                                 ('center', 'center')))
                elif 'gif' in selected_animation_mime or 'video' in selected_animation_mime:
                    animation_clip = (VideoFileClip(selected_animation).resize(
                        (animation_width / factor,
                         animation_height / factor)).set_pos(
                             ('center', 'center')))
                    animation_clip = animation_clip.fx(vfx.loop,
                                                       duration=duration_s)

                animation_clip = animation_clip.fx(
                    vfx.rotate,
                    lambda t: 90 * t,  # rotate 90 degrees per second of clip time
                    expand=False).set_duration(duration_s)

                if bg_visual_clip and animation_clip:
                    bg_clip = bg_visual_clip.set_audio(audio).set_duration(
                        duration_s)
                    clip = CompositeVideoClip([bg_clip, animation_clip])
                    clip.write_videofile('Exports/Test.mp4', fps=30)
Example #53
0
    print('Total passed: ' + str(timedelta(seconds=f - s)))

songname = sys.argv[1].split('.')
if songname[-1] == 'mp3':
    from pydub import AudioSegment
    song = AudioSegment.from_mp3('.'.join(songname))
    songname[-1] = "wav"
    songname = '.'.join(songname)
    song.export(songname, format = "wav")
else:
    songname = '.'.join(songname)

print('Start reading file')
# read file
src, samplerate = load(songname)
dur = get_duration(y=src, sr=samplerate)

# set time
stime = time()

# get chromagram
print('get chromagram')
chromagram = chroma_stft(y=src, sr=samplerate, hop_length=512 * 8)

printDt(stime, time())

# count correlation
print('count correlation')
correlation = np.corrcoef(
    np.cov(np.transpose(chromagram)))
Example #54
0
    def __test_audio(filename, mono, sr, duration):
        y, sr = librosa.load(filename, sr=sr, mono=mono, duration=duration)

        duration_est = librosa.get_duration(y=y, sr=sr)

        assert np.allclose(duration_est, duration, rtol=1e-3, atol=1e-5)
Example #55
0
    meta_index[os.path.join(in_path, 'train', m['audio_filepath'])] = m

speakers.sort()
samples = []
n_skipped = 0
n_spk_skipped = 0
n_samples_spk = []
total_dur = 0
for spk_dir in tqdm.tqdm(speakers):
    spk_name = os.path.split(spk_dir)[-1]
    spk_name = 'LSRU' + spk_name
    i = 0
    spk_s = 0
    base_wav_files = sorted(
        glob.glob(os.path.join(spk_dir, '**', '*.wav'), recursive=True))
    durations = [librosa.get_duration(filename=w) for w in base_wav_files]
    scores = [meta_index[w]['score'] for w in base_wav_files]
    wav_files = [(w, d, s)
                 for w, d, s in zip(base_wav_files, durations, scores)
                 if 1 <= d <= 20 and s >= -1]
    n_skipped += len(base_wav_files) - len(wav_files)
    if len(wav_files) < 100:
        n_skipped += len(wav_files)
        n_spk_skipped += 1
        continue

    for wav_file, dur, score in wav_files:
        filename = os.path.split(wav_file)[-1]
        script = meta_index[wav_file]['text_no_preprocessing']
        if any([c in "1234567890" for c in script]):
            n_skipped += 1
Example #56
0
    def predict(self, wav_file_path):
        '''
        Function which generates local predictions using wavefile
        '''

        # Creates local directory to save 2 second clips
        local_dir = "./fastai_dir/"
        if not os.path.exists(local_dir):
            os.makedirs(local_dir)

        # infer clip length
        max_length = get_duration(filename=wav_file_path)
        # the inferred length is then overridden with a fixed 60 second window
        max_length = 60
        # Generating 2 sec proposal with 1 sec hop length
        twoSecList = []
        for i in range(int(floor(max_length) - 1)):
            twoSecList.append([i, i + 2])

        # Creating a proposal dictionary
        two_sec_dict = {}
        two_sec_dict[Path(wav_file_path).name] = twoSecList

        # local directory
        extract_segments(str(Path(wav_file_path).parent), two_sec_dict,
                         local_dir, "")

        # Defining the Audio config needed to create on-the-fly mel spectrograms
        config = AudioConfig(
            standardize=False,
            sg_cfg=SpectrogramConfig(
                f_min=0.0,  # Minimum frequency to Display
                f_max=10000,  # Maximum Frequency to Display
                hop_length=256,
                n_fft=2560,  # Number of Samples for Fourier
                n_mels=256,  # Mel bins
                pad=0,
                to_db_scale=True,  # Converting to dB scale
                top_db=100,  # Top decibel sound
                win_length=None,
                n_mfcc=20))
        config.duration = 4000  # 4 sec padding or snip
        config.resample_to = 20000  # Every sample at 20000 frequency

        # Creating a Audio DataLoader
        test_data_folder = Path(local_dir)
        tfms = None
        test = AudioList.from_folder(test_data_folder,
                                     config=config).split_none().label_empty()
        testdb = test.transform(tfms).databunch(bs=32)

        # Scoring each 2 sec clip
        predictions = []
        pathList = []
        for item in testdb.x:
            predictions.append(self.model.predict(item)[2][1])
            pathList.append(str(item.path))

        # clean folder
        shutil.rmtree(local_dir)

        # Aggregating predictions

        # Creating a DataFrame
        prediction = pd.DataFrame({'FilePath': pathList, 'pred': predictions})

        # Converting prediction to float
        prediction['pred'] = prediction.pred.astype(float)

        # Extracting filename
        prediction['FileName'] = prediction.FilePath.apply(
            lambda x: x.split('/')[6].split("-")[0])

        # Extracting Starting time from file name
        prediction['startTime'] = prediction.FileName.apply(
            lambda x: int(x.split('__')[1].split('.')[0].split('_')[0]))

        # Sorting the file based on startTime
        prediction = prediction.sort_values(['startTime'
                                             ]).reset_index(drop=True)

        # Rolling Window (to average at per second level)
        submission = pd.DataFrame({
            'pred':
            list(prediction.rolling(2)['pred'].mean().values)
        }).reset_index().rename(columns={'index': 'StartTime'})

        # Updating first row
        submission.loc[0, 'pred'] = prediction.pred[0]

        # Adding lastrow
        lastLine = pd.DataFrame({
            'StartTime': [submission.StartTime.max() + 1],
            'pred': [prediction.pred[prediction.shape[0] - 1]]
        })
        submission = submission.append(lastLine, ignore_index=True)

        # initialize output JSON
        result_json = {}
        result_json["local_predictions"] = list(
            (submission['pred'] > 0.5).astype(int))
        result_json["local_confidences"] = list(submission['pred'])
        result_json["global_predictions"] = int(
            sum(result_json["local_predictions"]) >
            self.global_aggregation_percentile_threshold)
        result_json["global_confidence"] = submission.loc[(
            submission['pred'] > 0.5), 'pred'].mean()

        return result_json
Example #57
0
import numpy as np
import matplotlib.pyplot as plt
import librosa
import matplotlib.patches as mpatches

with open("test_text_files/Female_1a_Amp_4096.txt") as f:
    rms_vals = [line.rstrip('\n') for line in f]

rms_vals = str(rms_vals).strip('[').strip(']').strip('[').strip(']').strip("'").strip('[').strip(']')
new_array = rms_vals.split(", ")
new_array = [float(i) for i in new_array]

print("RMS: " + str(new_array))

ta, rate = librosa.load("Female_1a.wav", sr=44100)
dur = librosa.get_duration(ta, sr=44100)

#make time stamps
times = np.linspace(0, dur, len(new_array))

# ONSET TEST SCRIPT

# collect the third field of each label line (this is where silent values appear)
with open("test_text_files/Female_1a_Labels.txt", 'r+') as f:
    third_char = [line.split()[2] for line in f]  # third whitespace-separated field
    
# collect the first field of each label line (the onset start time)
with open("test_text_files/Female_1a_Labels.txt", 'r+') as f:
    first_char = [line.split()[0] for line in f]

#find indices of SIL