def generate_mfcc_lists(train_aud, test_aud, number_mfcc=20):
    aud = {'train_aud': train_aud, 'test_aud': test_aud}
    for key in aud.keys():
        files = aud[key]
        # initialize general variables
        mfcc = []
        trim_len = []
        pretrim_len = []
        # conduct loop over set of audio files
        for file in files:
            data, rate = librosa.load(file)
            # Trim leading and trailing silence
            pretrim_len.append(round(librosa.get_duration(data, rate), 1))
            data, index = librosa.effects.trim(data, top_db=15)
            trim_len.append(round(librosa.get_duration(data, rate), 1))
            mfcc.append(librosa.feature.mfcc(data, rate, n_mfcc=number_mfcc))
        # Scale features
        for i, x in enumerate(mfcc):
            mfcc[i] = sklearn.preprocessing.scale(mfcc[i], axis=1)
        # assign result to corresponding variables
        if key == 'train_aud':
            x_train = mfcc
            train_len = trim_len
        elif key == 'test_aud':
            x_test = mfcc
            test_len = trim_len
    return x_train, x_test, train_len, test_len
def __test(infile, n_steps, bins_per_octave):
    y, sr = librosa.load(infile, duration=4.0)
    ys = librosa.effects.pitch_shift(y, sr, n_steps, bins_per_octave=bins_per_octave)

    orig_duration = librosa.get_duration(y, sr=sr)
    new_duration = librosa.get_duration(ys, sr=sr)

    # We don't have to be too precise here, since this goes through an STFT
    eq_(orig_duration, new_duration)
def __test(infile, rate):
    y, sr = librosa.load(infile, duration=4.0)
    ys = librosa.effects.time_stretch(y, rate)

    orig_duration = librosa.get_duration(y, sr=sr)
    new_duration = librosa.get_duration(ys, sr=sr)

    # We don't have to be too precise here, since this goes through an STFT
    assert np.allclose(orig_duration, rate * new_duration,
                       rtol=1e-2, atol=1e-3)
def test_get_duration_filename():
    filename = 'data/test2_8000.wav'
    true_duration = 30.197625

    duration_fn = librosa.get_duration(filename=filename)
    y, sr = librosa.load(filename, sr=None)
    duration_y = librosa.get_duration(y=y, sr=sr)

    assert np.allclose(duration_fn, true_duration)
    assert np.allclose(duration_fn, duration_y)
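# Added note (not from the original examples): librosa has renamed the keyword used for
# file-based duration queries across releases (newer versions prefer path= over filename=),
# so a small version-tolerant helper can be convenient. This is only a sketch;
# `audio_path` is a hypothetical argument name.
import librosa

def duration_from_file(audio_path):
    """Return the duration of an audio file in seconds."""
    try:
        # older releases accept filename=
        return librosa.get_duration(filename=audio_path)
    except TypeError:
        # fall back to decoding the file and measuring the samples
        y, sr = librosa.load(audio_path, sr=None)
        return librosa.get_duration(y=y, sr=sr)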
def __init__(self, source, duration=None, sample_rate=None, offset=0.0):
    if isinstance(source, basestring):
        self.signal, self.sample_rate = librosa.load(source, duration=duration, offset=offset)
        if duration is None:
            self.duration = librosa.get_duration(self.signal, sr=self.sample_rate)
        else:
            self.duration = duration
    else:
        self.signal = source
        self.sample_rate = sample_rate
        self.duration = librosa.get_duration(self.signal, sr=sample_rate)
    self.pre_process()
def __test(y, top_db, ref, trim_duration):
    yt, idx = librosa.effects.trim(y, top_db=top_db, ref=ref)

    # Test for index position
    fidx = [slice(None)] * y.ndim
    fidx[-1] = slice(*idx.tolist())
    assert np.allclose(yt, y[tuple(fidx)])

    # Verify logamp
    rms = librosa.feature.rmse(y=librosa.to_mono(yt), center=False)
    logamp = librosa.power_to_db(rms**2, ref=ref, top_db=None)
    assert np.all(logamp > -top_db)

    # Verify logamp
    rms_all = librosa.feature.rmse(y=librosa.to_mono(y)).squeeze()
    logamp_all = librosa.power_to_db(rms_all**2, ref=ref, top_db=None)

    start = int(librosa.samples_to_frames(idx[0]))
    stop = int(librosa.samples_to_frames(idx[1]))
    assert np.all(logamp_all[:start] <= -top_db)
    assert np.all(logamp_all[stop:] <= -top_db)

    # Verify duration
    duration = librosa.get_duration(yt)
    assert np.allclose(duration, trim_duration, atol=1e-1), duration
def libroRMS(filepath, kRatio):
    y, sr = librosa.load(filepath)  # Load the waveform as y, sr is sample rate
    clipLength = librosa.get_duration(y=y, sr=sr)
    kValue = int(clipLength/kRatio + 1)  # sets up relative ratio of samples

    ### get the RMS of the audio sample ###
    data = librosa.feature.rmse(y=y, hop_length=2048)
    boundaries = librosa.segment.agglomerative(data, k=kValue)  # Agglomeration
    boundary_times = librosa.frames_to_time(boundaries, hop_length=2048)  # ~.1s
    intervals = np.hstack([boundary_times[:-1, np.newaxis], boundary_times[1:, np.newaxis]])
    get_rms = librosa.feature.sync(data, boundaries, aggregate=np.max)

    nkValue = kValue - 1  # because, for some reason, the intervals above leave out the last one
    fixedN = np.delete(get_rms, nkValue, axis=1)
    npsTurn = np.concatenate((intervals, fixedN.T), axis=1)

    # transform from np array to regular list
    flatnps = npsTurn.tolist()
    slice_value = int(kValue//3)

    rmsOut1 = sorted(flatnps, key=lambda x: int(x[2]), reverse=True)
    #rmsOut2 = slice(rmsOut1[0: slice_value])
    rmsOut2 = rmsOut1[0:slice_value]
    rmsOut3 = sorted(rmsOut2, key=lambda x: int(x[0]))
    return rmsOut3
def __test(jam_in, audio_file):
    jam = muda.load_jam_audio(jam_in, audio_file)

    assert hasattr(jam.sandbox, 'muda')

    eq_(jam.file_metadata.duration,
        librosa.get_duration(**jam.sandbox.muda._audio))
def states(self, jam):
    '''Get the state information from the jam'''
    state = dict()
    mudabox = jam.sandbox.muda
    state['duration'] = librosa.get_duration(y=mudabox._audio['y'],
                                             sr=mudabox._audio['sr'])
    yield state
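# Added sketch (assumption, mirroring the muda test examples in this collection): the
# _audio sandbox is just a dict holding 'y' and 'sr', so get_duration can equivalently be
# called by unpacking the dict directly.
import numpy as np
import librosa

_audio = {'y': np.zeros(22050, dtype=np.float32), 'sr': 22050}  # one second of silence
print(librosa.get_duration(y=_audio['y'], sr=_audio['sr']))  # 1.0
print(librosa.get_duration(**_audio))                        # same value via dict unpacking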
def test_load_jam_audio(jam_loader, audio_file, validate, strict, fmt):

    jam = muda.load_jam_audio(jam_loader, audio_file,
                              validate=validate, strict=strict, fmt=fmt)

    assert hasattr(jam.sandbox, 'muda')

    duration = librosa.get_duration(**jam.sandbox.muda._audio)
    assert jam.file_metadata.duration == duration
def __init__(self, file_path, convert_to_mono=False, sample_rate=22050):
    """
    Opens a file path, loads it with librosa.
    """
    self.file_path = file_path

    y, sr = librosa.load(file_path, mono=convert_to_mono, sr=sample_rate)

    self.sample_rate = float(sr)
    self.raw_samples = y
    self.num_channels = y.ndim
    self.duration = librosa.get_duration(y=y, sr=sr)
def states(self, jam):
    '''Iterate the impulse response states'''
    state = dict()
    mudabox = jam.sandbox.muda
    state['duration'] = librosa.get_duration(y=mudabox._audio['y'],
                                             sr=mudabox._audio['sr'])
    for i in range(len(self.ir_)):
        state['index'] = i
        yield state
def __test_spec(filename, sr, duration, n_fft, hop_length, center):
    y, sr = librosa.load(filename, sr=sr, duration=duration)

    S = librosa.stft(y, n_fft=n_fft, hop_length=hop_length, center=center)

    duration_est = librosa.get_duration(S=S, sr=sr, n_fft=n_fft,
                                        hop_length=hop_length,
                                        center=center)

    # We lose a little accuracy in framing without centering, so it's
    # not as precise as time-domain duration
    assert np.allclose(duration_est, duration, rtol=1e-1, atol=1e-2)
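# Added sketch (assumption, not part of the test above): for a centered STFT, the
# spectrogram-based duration is n_frames * hop_length / sr, which is what the S= form
# of get_duration computes; it roughly matches the time-domain duration.
# 'audio.wav' is a placeholder path.
import librosa

y, sr = librosa.load('audio.wav', sr=None)
S = librosa.stft(y, n_fft=2048, hop_length=512, center=True)
approx = S.shape[1] * 512 / sr
exact = librosa.get_duration(S=S, sr=sr, n_fft=2048, hop_length=512, center=True)
print(approx, exact, librosa.get_duration(y=y, sr=sr))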
def load_jam_audio(jam_in, audio_file,
                   validate=True, strict=True, fmt='auto',
                   **kwargs):
    '''Load a jam and pack it with audio.

    Parameters
    ----------
    jam_in : str, file descriptor, or jams.JAMS
        JAMS filename, open file-descriptor, or object to load.
        See ``jams.load`` for acceptable formats.

    audio_file : str
        Audio filename to load

    validate : bool
    strict : bool
    fmt : str
        Parameters to `jams.load`

    kwargs : additional keyword arguments
        See `librosa.load`

    Returns
    -------
    jam : jams.JAMS
        A jams object with audio data in the top-level sandbox

    Notes
    -----
    This operation can modify the `file_metadata.duration` field of `jam_in`:
    If it is not currently set, it will be populated with the duration of the
    audio file.

    See Also
    --------
    jams.load
    librosa.core.load
    '''
    if isinstance(jam_in, jams.JAMS):
        jam = jam_in
    else:
        jam = jams.load(jam_in, validate=validate, strict=strict, fmt=fmt)

    y, sr = librosa.load(audio_file, **kwargs)

    if jam.file_metadata.duration is None:
        jam.file_metadata.duration = librosa.get_duration(y=y, sr=sr)

    return jam_pack(jam, _audio=dict(y=y, sr=sr))
def analyze(filename=None, y=None, sr=None):
    '''Analyze a recording for all tasks.

    Parameters
    ----------
    filename : str, optional
        Path to audio file

    y : np.ndarray, optional
    sr : number > 0, optional
        Audio buffer and sampling rate

    .. note:: At least one of `filename` or `y, sr` must be provided.

    Returns
    -------
    jam : jams.JAMS
        a JAMS object containing all estimated annotations

    Examples
    --------
    >>> from crema.analyze import analyze
    >>> import librosa
    >>> jam = analyze(filename=librosa.util.example_audio_file())
    >>> jam
    <JAMS(file_metadata=<FileMetadata(...)>,
          annotations=[1 annotation],
          sandbox=<Sandbox(...)>)>
    >>> # Get the chord estimates
    >>> chords = jam.annotations['chord', 0]
    >>> chords.to_dataframe().head(5)
           time  duration  value  confidence
    0  0.000000  0.092880  E:maj    0.336977
    1  0.092880  0.464399    E:7    0.324255
    2  0.557279  1.021678  E:min    0.448759
    3  1.578957  2.693515  E:maj    0.501462
    4  4.272472  1.486077  E:min    0.287264
    '''
    _load_models()

    jam = jams.JAMS()
    # populate file metadata
    jam.file_metadata.duration = librosa.get_duration(y=y, sr=sr, filename=filename)

    for model in __MODELS__:
        jam.annotations.append(model.predict(filename=filename, y=y, sr=sr))

    return jam
def beat_track(infile, outfile):
    # Load the audio file
    y, sr = librosa.load(infile)

    # Compute the track duration
    track_duration = librosa.get_duration(y=y, sr=sr)

    # Extract tempo and beat estimates
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)

    # Convert beat frames to time
    beat_times = librosa.frames_to_time(beat_frames, sr=sr)

    # Construct a new JAMS object and annotation records
    jam = jams.JAMS()

    # Store the track duration
    jam.file_metadata.duration = track_duration

    beat_a = jams.Annotation(namespace='beat')
    beat_a.annotation_metadata = jams.AnnotationMetadata(data_source='librosa beat tracker')

    # Add beat timings to the annotation record.
    # The beat namespace does not require value or confidence fields,
    # so we can leave those blank.
    for t in beat_times:
        beat_a.append(time=t, duration=0.0)

    # Store the new annotation in the jam
    jam.annotations.append(beat_a)

    # Add tempo estimation to the annotation.
    # The tempo estimate is global, so it should start at time=0 and cover the full
    # track duration.
    # If we had a likelihood score on the estimation, it could be stored in
    # `confidence`.  Since we have no competing estimates, we'll set it to 1.0.
    tempo_a = jams.Annotation(namespace='tempo', time=0, duration=track_duration)
    tempo_a.annotation_metadata = jams.AnnotationMetadata(data_source='librosa tempo estimator')
    tempo_a.append(time=0.0, duration=track_duration, value=tempo, confidence=1.0)

    # Store the new annotation in the jam
    jam.annotations.append(tempo_a)

    # Save to disk
    jam.save(outfile)
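# Added usage sketch for the function above (file names are placeholders, not from the
# original example): run the beat tracker on a file and inspect the annotations it saved.
import jams

beat_track('song.wav', 'song_beats.jams')
jam = jams.load('song_beats.jams')
print(jam.file_metadata.duration)
for ann in jam.annotations:
    print(ann.namespace, len(ann.data))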
def __init__(self, file_path=None, raw_samples=None, convert_to_mono=False,
             sample_rate=44100, analysis_sample_rate=22050):
    """
    Audio constructor.
    Opens a file path, loads the audio with librosa, and prepares the features

    Parameters
    ----------
    file_path: string
        path to the audio file to load

    raw_samples: np.array
        samples to use for audio output

    convert_to_mono: boolean (optional)
        converts the file to mono on loading

    sample_rate: number > 0 [scalar] (optional)
        sample rate to pass to librosa.

    Returns
    ------
    An Audio object
    """
    if file_path:
        y, sr = librosa.load(file_path, mono=convert_to_mono, sr=sample_rate)
    elif raw_samples is not None:
        # This assumes that we're passing in raw_samples
        # directly from another Audio's raw_samples.
        y = raw_samples
        sr = sample_rate

    self.file_path = file_path
    self.sample_rate = float(sr)
    self.analysis_sample_rate = float(analysis_sample_rate)
    self.num_channels = y.ndim
    self.duration = librosa.get_duration(y=y, sr=sr)

    self.analysis_samples = librosa.resample(librosa.to_mono(y),
                                             sr, self.analysis_sample_rate,
                                             res_type='kaiser_best')
    self.raw_samples = np.atleast_2d(y)

    self.zero_indexes = self._create_zero_indexes()
    self.features = self._create_features()
    self.timings = self._create_timings()
def __test_time(jam_orig, jam_new, rate):
    # Test the track length
    ap_(librosa.get_duration(**jam_orig.sandbox.muda['_audio']),
        rate * librosa.get_duration(**jam_new.sandbox.muda['_audio']))

    # Test the metadata
    ap_(jam_orig.file_metadata.duration,
        rate * jam_new.file_metadata.duration)

    # Test each annotation
    for ann_orig, ann_new in zip(jam_orig.annotations, jam_new.annotations):
        # JAMS 0.2.1 support
        if hasattr(ann_orig, 'time'):
            ap_(ann_orig.time, rate * ann_new.time)
            ap_(ann_orig.duration, rate * ann_new.duration)

        ap_(ann_orig.data.time.values.astype(float),
            rate * ann_new.data.time.values.astype(float))
        ap_(ann_orig.data.duration.values.astype(float),
            rate * ann_new.data.duration.values.astype(float))

        if ann_orig.namespace == 'tempo':
            ap_(rate * ann_orig.data.value, ann_new.data.value)
def states(self, jam):
    '''Set the state for the transformation object'''
    state = dict()
    mudabox = jam.sandbox.muda
    state['track_duration'] = librosa.get_duration(y=mudabox._audio['y'],
                                                   sr=mudabox._audio['sr'])

    offsets = np.arange(start=0,
                        stop=(state['track_duration'] - self.min_duration),
                        step=self.stride)

    for t in offsets:
        state['offset'] = t
        yield state
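# Added worked example (the values are hypothetical, not from the original code): with a
# 10-second track, a minimum duration of 2 s and a stride of 3 s, the generator above
# yields offsets at 0, 3 and 6 seconds.
import numpy as np

track_duration = 10.0
min_duration = 2.0
stride = 3.0
offsets = np.arange(start=0, stop=track_duration - min_duration, step=stride)
print(offsets)  # [0. 3. 6.]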
def smc_file_metadata(infile):
    '''Construct a metadata object from an SMC wav file'''
    match = re.match(r'.*(?P<index>SMC_\d+).wav$', infile)
    if not match:
        raise RuntimeError('Could not index filename {:s}'.format(infile))

    # Get the duration of the track
    y, sr = librosa.load(infile, sr=None)
    duration = librosa.get_duration(y=y, sr=sr)

    # Format duration as time
    metadata = jams.FileMetadata(title=match.group('index'),
                                 duration=duration)

    return metadata
def test_save():
    jam = muda.load_jam_audio('data/fixture.jams', 'data/fixture.wav')

    _, jamfile = tempfile.mkstemp(suffix='.jams')
    _, audfile = tempfile.mkstemp(suffix='.wav')

    muda.save(audfile, jamfile, jam)

    jam2 = muda.load_jam_audio(jamfile, audfile)
    jam2_raw = jams.load(jamfile)

    os.unlink(audfile)
    os.unlink(jamfile)

    assert hasattr(jam2.sandbox, 'muda')
    assert '_audio' in jam2.sandbox.muda
    assert '_audio' not in jam2_raw.sandbox.muda

    eq_(jam2.file_metadata.duration,
        librosa.get_duration(**jam2.sandbox.muda['_audio']))
def test_save(jam_in, audio_file, strict, fmt):
    jam = muda.load_jam_audio(jam_in, audio_file)

    _, jamfile = tempfile.mkstemp(suffix='.jams')
    _, audfile = tempfile.mkstemp(suffix='.wav')

    muda.save(audfile, jamfile, jam, strict=strict, fmt=fmt)

    jam2 = muda.load_jam_audio(jamfile, audfile, fmt=fmt)
    jam2_raw = jams.load(jamfile, fmt=fmt)

    os.unlink(audfile)
    os.unlink(jamfile)

    assert hasattr(jam2.sandbox, 'muda')
    assert '_audio' in jam2.sandbox.muda
    assert '_audio' not in jam2_raw.sandbox.muda

    duration = librosa.get_duration(**jam2.sandbox.muda['_audio'])
    assert jam2.file_metadata.duration == duration
d_text = d_file.read()
g_file = open('mood.txt')
with g_file:
    g_text = g_file.read()
al_file = open('album.txt')
with al_file:
    al_text = al_file.read()

d, g, al = eval(d_text), eval(g_text), eval(al_text)
c, c1, c2 = len(d), len(g), len(al)

for filename in os.listdir(f'H:/hindisongs/{file}'):
    singer1, singer2, singer3 = 0, 0, 0
    try:
        audio = f'H:/hindisongs/{file}/{filename}'
        audio_duration = librosa.get_duration(filename=audio)
        duration, offset = audio_duration // 2, audio_duration // 3
        y, sr = librosa.load(audio, duration=duration, offset=offset)
        rmse = librosa.feature.rms(y=y)
        chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr))
        zcr = np.mean(librosa.feature.zero_crossing_rate(y=y))
        spec_cent = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
        spec_bw = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
        rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
        mel = np.mean(librosa.feature.melspectrogram(y, sr=sr))
        mfcc = librosa.feature.mfcc(y=y, sr=sr)
        percussion = librosa.effects.percussive(y, margin=8)
        harmonic = librosa.effects.harmonic(y, margin=8)
        chromagram = librosa.feature.chroma_cqt(y=harmonic, sr=sr)
def get_wav_duration(file_path):
    return librosa.get_duration(filename=file_path)
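# Added usage sketch ('clip.wav' is a placeholder path): the wrapper returns the duration
# in fractional seconds without keeping the decoded samples around.
print(get_wav_duration('clip.wav'))  # e.g. 3.52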
    j += 1
    s = 0
    while s < j:
        song_segment[s].export("song_segment_" + str(s), format="wav")
        s += 1
"""

# ################## Audio test / training set ###########################
# When you load the data, it gives you two objects; a numpy array of
# an audio file and the corresponding sampling rate by which it
# was extracted.  Now to represent this as a waveform
# (which it originally is), use the following code
data, sampling_rate = librosa.load('E:\\Urban_Sound_challenge_data\Train\Train\\2022.wav')  # read the file

plt.figure(figsize=(12, 4))
x_length = librosa.get_duration(data, sampling_rate)
print(x_length)
librosa.display.waveplot(data, sr=sampling_rate)
plt.show()

train = read_csv('E:\\Urban_Sound_challenge_data\Train\\train.csv')

def parser(row):
    # function to load files and extract features
    file_name = os.path.join(os.path.abspath('E:\\Urban_Sound_challenge_data\Train\\'),
                             'Train', str(row.ID) + '.wav')
    print(file_name)
    # handle exception to check if there isn't a file which is corrupted
    try:
        # here kaiser_fast is a technique used for faster extraction
        X, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        # we extract mfcc feature from data
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=30).T, axis=0)
    except Exception as e:
path = input("Path>")
if not path:
    path = "test.mp3"
y, sr = librosa.load(path)
print(1)
y_harmonic, y_percussive = librosa.effects.hpss(y)
print(2)
tempo, beat_frames = librosa.beat.beat_track(y=y_percussive, sr=sr)
print(3)

# with open("temp.pkl","wb") as f:
#     pickle.dump([y,sr,y_harmonic,y_percussive,tempo,beat_frames],f)
# with open("temp.pkl", "rb") as f:
#     y, sr, y_harmonic, y_percussive, tempo, beat_frames = pickle.load(f)

# Should the fade-in/fade-out parts overlap?
# Should each segment start and end at the same loudness?
time = librosa.get_duration(y=y, sr=sr)
total_beat = librosa.time_to_frames(time)
# print((time, total_beat, beat_frames))
print(sr)

begin = 0
parts = []
last = 0
for i in beat_frames:
    end = int((len(y) * i) / total_beat)  # +(sr)//120
    # print(i)
    # sum = 0
    # for i in y[begin:end]:
    #     sum += i
    # Taking the mean is meaningless!!!!!!!!!!!!!!!!
    # (just eyeballing it)
    # simple zero-crossing detection
def main(args): # load parameters and data path music_type = args.music_type onset_threshold = args.onset_threshold segment_threshold = args.segment_threshold input_type = 'melspectrogram' if music_type == 'synth': dataset_name = 'pedal-times_test.npz' npz_path = os.path.join(DIR_PEDAL_METADATA, dataset_name) elif music_type == 'real': npz_dir = os.path.join(DIR_REAL_DATA, 'reference') dataset_name = 'pedal-times_realaudio.npz' npz_path = os.path.join(npz_dir, dataset_name) else: print("Error: Please set the music_type to either synth or real!") tracks = np.load(npz_path) filenames = tracks['filename'] pedal_offset_gt_tracks = tracks['pedal_offset'] pedal_onset_gt_tracks = tracks['pedal_onset'] # load convnet models onset_exp_name = 'sub-onset_cnnkernel-melspectrogram_l4c13' onset_model = keras.models.load_model(os.path.join(DIR_SAVE_MODEL,"{}_best_model.h5".format(onset_exp_name)), custom_objects={'Melspectrogram':Melspectrogram}) onset_model.load_weights(os.path.join(DIR_SAVE_MODEL,"{}_best_weights.h5".format(onset_exp_name))) segment_exp_name = 'sub-segment_cnnkernel-melspectrogram_multift' segment_model = keras.models.load_model(os.path.join(DIR_SAVE_MODEL,"{}_best_model.h5".format(segment_exp_name)), custom_objects={'Melspectrogram':Melspectrogram}) segment_model.load_weights(os.path.join(DIR_SAVE_MODEL,"{}_best_weights.h5".format(segment_exp_name))) # initialise performance measurement filename_records = [] accuracys = [] precisions = [] recalls = [] fscores = [] support0s = [] support1s = [] fp_rates = [] fn_rates = [] # append to lists filename_records = [] support0s = [] support1s = [] acc01_frms = [] p1_frms = [] r1_frms = [] f1_frms = [] fp_rates = [] fn_rates = [] # boundary matrixs boundary_wins = [] p1_sbrs = [] r1_sbrs = [] f1_sbrs = [] r2e_deviation1s = [] e2r_deviation1s = [] p01_sbrs = [] r01_sbrs = [] f01_sbrs = [] r2e_deviation01s = [] e2r_deviation01s = [] # structural matrixs p_pairwises = [] r_pairwises = [] f_pairwises = [] nce_overs = [] nce_unders = [] nce_fs = [] rand_indexs = [] adjrand_indexs = [] mutual_infos = [] adjmutual_infos = [] normmutual_infos = [] # do detection piece by piece for filename_idx, filename in enumerate(filenames): # load ground truth of the current piece pedal_offset_gt = np.array(pedal_offset_gt_tracks[filename_idx]) pedal_onset_gt = np.array(pedal_onset_gt_tracks[filename_idx]) # load audio data of the current piece if music_type == 'synth': paudio_path = os.path.join(DIR_RENDERED, '{}-p.wav'.format(filename)) elif music_type == 'real': paudio_dir = os.path.join(DIR_REAL_DATA, '{}'.format(filename)) paudio_path = os.path.join(paudio_dir, '{}.wav'.format(filename)) paudio, sr = librosa.load(paudio_path, sr=SR) print("{}...".format(filename)) # detect pedal onset if threshold is greater than 0 if onset_threshold>0: len_onset_shape = int(SR * (TRIM_SECOND_BEFORE + TRIM_SECOND_AFTER)) onsethop_length = HOP_LENGTH onsethop_duration = onsethop_length/SR n_ponset = int(np.ceil((len(paudio)-len_onset_shape)/onsethop_length)) gen_ponset = data_gen(paudio, n_ponset, len_onset_shape, 'onset', hop_length=onsethop_length) pred_ponset = onset_model.predict_generator(gen_ponset, n_ponset // batch_size) # filter to reduce fragmentation pred_ponset_filter = medfilt(pred_ponset[:,1],15) # the corresponding time in second each frame represents frmtime_ponset = np.arange(n_ponset)*onsethop_duration+TRIM_SECOND_BEFORE # represent as frame wise binary results pred_ponset_todetect = np.copy(pred_ponset_filter) 
pred_ponset_todetect[pred_ponset_todetect<onset_threshold]=0 pred_ponset_todetect[pred_ponset_todetect>=onset_threshold]=1 # detect pedalled segment len_segment_shape = int(SR * MIN_SRC) seghop_length = HOP_LENGTH*10 seghop_duration = seghop_length/SR n_psegment = int(np.ceil((len(paudio)-len_segment_shape)/seghop_length)) gen_psegment = data_gen(paudio, n_psegment, len_segment_shape, 'segment', hop_length=seghop_length) pred_psegment = segment_model.predict_generator(gen_psegment, n_psegment // batch_size) # filter to reduce fragmentation pred_psegment_filter = medfilt(pred_psegment[:,1],3) # the corresponding time in second each frame represents frmtime_psegment = np.arange(n_psegment)*seghop_duration+MIN_SRC/2 # remove the predicted value before the note onset paudio_firstonsettime = librosa.frames_to_time(librosa.onset.onset_detect(y=paudio, sr=SR), sr=SR)[0] n_segment_tozero=0 for t in frmtime_psegment: if t < paudio_firstonsettime: n_segment_tozero+=1 else: break pred_psegment_filter[:n_segment_tozero] = 0 # represent as frame wise binary results pred_psegment_todetect = np.copy(pred_psegment_filter) pred_psegment_todetect[pred_psegment_todetect<segment_threshold]=0 pred_psegment_todetect[pred_psegment_todetect>=segment_threshold]=1 # decide the initial indexes of pedal segment boundary onseg_initidxs = [] offseg_initidxs = [] for idx, v in enumerate(pred_psegment_todetect): if idx>0 and idx<len(pred_psegment_todetect)-1: if pred_psegment_todetect[idx-1]==0 and v==1 and pred_psegment_todetect[idx+1]==1: onseg_initidxs.append(idx-1) elif pred_psegment_todetect[idx-1]==1 and v==1 and pred_psegment_todetect[idx+1]==0: offseg_initidxs.append(idx+1) if offseg_initidxs[0] <= onseg_initidxs[0]: del offseg_initidxs[0] if onseg_initidxs[-1] >= offseg_initidxs[-1]: del onseg_initidxs[-1] if (len(onseg_initidxs) != len(offseg_initidxs)) or not len(pedal_offset_gt) or not len(pedal_onset_gt): print(" skip!") else: onseg_idxs = [] offseg_idxs = [] for idx in range(len(onseg_initidxs)): if onseg_initidxs[idx] < offseg_initidxs[idx]: onseg_idxs.append(onseg_initidxs[idx]) offseg_idxs.append(offseg_initidxs[idx]) if not len(onseg_idxs) or not len(offseg_idxs): print(" no detection!") else: if onset_threshold>0: # decide the boundary times in seconds with pedal onset candidates onseg_times = [] offseg_times = [] for idx, onseg_idx in enumerate(onseg_idxs): onponset_idx = onseg_idx*10-5 if any(pred_ponset_todetect[onponset_idx-5:onponset_idx+5]): offseg_idx = offseg_idxs[idx] offseg_times.append(frmtime_psegment[offseg_idx]) onseg_times.append(frmtime_psegment[onseg_idx]) else: onseg_times = frmtime_psegment[onseg_idxs] offseg_times = frmtime_psegment[offseg_idxs] segintervals_est = np.stack((np.asarray(onseg_times),np.asarray(offseg_times)), axis=-1) # set the ground truth and estimation results frame by frame paudio_duration = librosa.get_duration(y=paudio, sr=SR) n_frames = int(np.ceil(paudio_duration/seghop_duration)) segframes_gt = np.zeros(n_frames) segframes_est = np.zeros(n_frames) pedal_offset_gt = np.array(tracks['pedal_offset'][filename_idx]) pedal_onset_gt = np.array(tracks['pedal_onset'][filename_idx]) longpseg_idx = np.where((pedal_offset_gt-pedal_onset_gt)>seghop_duration)[0] longseg_onset_gt = pedal_onset_gt[longpseg_idx] longseg_offset_gt = pedal_offset_gt[longpseg_idx] segintervals_gt = np.stack((longseg_onset_gt,longseg_offset_gt), axis=-1) for idx, onset_t in enumerate(longseg_onset_gt): offset_t = longseg_offset_gt[idx] onset_frm = int(onset_t//seghop_duration) offset_frm = 
int(offset_t//seghop_duration) segframes_gt[onset_frm:offset_frm] = 1 for idx, onset_t in enumerate(onseg_times): offset_t = offseg_times[idx] onset_frm = int(onset_t//seghop_duration) offset_frm = int(offset_t//seghop_duration) segframes_est[onset_frm:offset_frm] = 1 # set the ground truth and estimation results as interval format segintervals1_gt, segintervals01_gt, labels_gt = intervals1tointervals01(segintervals_gt, paudio_duration) segintervals1_est, segintervals01_est, labels_est = intervals1tointervals01(segintervals_est, paudio_duration) # Metrics for frame-wise label 'p' acc01_frm = accuracy_score(segframes_gt,segframes_est) p1_frm, r1_frm, f1_frm, support = precision_recall_fscore_support(segframes_gt,segframes_est) tn, fp, fn, tp = confusion_matrix(segframes_gt,segframes_est).ravel() fp_rate = fp/(fp+tn) fn_rate = fn/(fn+tp) # performance matrix based on boundary annotation of 'p' # window depends on duration of a beat onset_env = librosa.onset.onset_strength(paudio, sr=SR) tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=SR)[0] beat_insecond = 60/tempo p1_sbr,r1_sbr,f1_sbr = mir_eval.segment.detection(segintervals1_gt, segintervals1_est, window=beat_insecond) r2e_deviation1, e2r_deviation1 = mir_eval.segment.deviation(segintervals1_gt, segintervals1_est) # performance matrix based on boundary annotation of both 'p' and 'np' p01_sbr,r01_sbr,f01_sbr = mir_eval.segment.detection(segintervals01_gt, segintervals01_est, window=beat_insecond) # performance matrix based on structural annotation scores = mir_eval.segment.evaluate(segintervals01_gt, labels_gt, segintervals01_est, labels_est) r2e_deviation01, e2r_deviation01 = [scores['Ref-to-est deviation'], scores['Est-to-ref deviation']] p_pairwise, r_pairwise, f_pairwise = [scores['Pairwise Precision'], scores['Pairwise Recall'], scores['Pairwise F-measure']] rand_index, adjrand_index = [scores['Rand Index'], scores['Adjusted Rand Index']] mutual_info, adjmutual_info, normmutual_info = [scores['Mutual Information'], scores['Adjusted Mutual Information'], scores['Normalized Mutual Information']] nce_over, nce_under, nce_f = [scores['NCE Over'], scores['NCE Under'], scores['NCE F-measure']] # append to lists filename_records.append(filename) support0s.append(support[0]) support1s.append(support[1]) acc01_frms.append(acc01_frm) p1_frms.append(p1_frm[1]) r1_frms.append(r1_frm[1]) f1_frms.append(f1_frm[1]) fp_rates.append(fp_rate) fn_rates.append(fn_rate) # boundary matrixs boundary_wins.append(beat_insecond) p1_sbrs.append(p1_sbr) r1_sbrs.append(r1_sbr) f1_sbrs.append(f1_sbr) r2e_deviation1s.append(r2e_deviation1) e2r_deviation1s.append(e2r_deviation1) p01_sbrs.append(p01_sbr) r01_sbrs.append(r01_sbr) f01_sbrs.append(f01_sbr) r2e_deviation01s.append(r2e_deviation01) e2r_deviation01s.append(e2r_deviation01) # structural matrixs p_pairwises.append(p_pairwise) r_pairwises.append(r_pairwise) f_pairwises.append(f_pairwise) nce_overs.append(nce_over) nce_unders.append(nce_under) nce_fs.append(nce_f) rand_indexs.append(rand_index) adjrand_indexs.append(adjrand_index) mutual_infos.append(mutual_info) adjmutual_infos.append(adjmutual_info) normmutual_infos.append(normmutual_info) print(" done!") rows = zip(*[filename_records, support0s, support1s, acc01_frms, p1_frms, r1_frms, f1_frms, fp_rates, fn_rates, boundary_wins, p1_sbrs, r1_sbrs, f1_sbrs, r2e_deviation1s, e2r_deviation1s, p01_sbrs, r01_sbrs, f01_sbrs, r2e_deviation01s, e2r_deviation01s, p_pairwises, r_pairwises, f_pairwises, nce_overs, nce_unders, nce_fs, rand_indexs, 
adjrand_indexs, mutual_infos, adjmutual_infos, normmutual_infos]) column_names = ['filename_record', 'support0', 'support1', 'acc01_frm', 'p1_frm', 'r1_frm', 'f1_frm', 'fp_rate', 'fn_rate', 'boundary_win', 'p1_sbr', 'r1_sbr', 'f1_sbr', 'r2e_deviation1', 'e2r_deviation1', 'p01_sbr', 'r01_sbr', 'f01_sbr', 'r2e_deviation01', 'e2r_deviation01', 'p_pairwise', 'r_pairwise', 'f_pairwise', 'nce_over', 'nce_under', 'nce_f', 'rand_index', 'adjrand_index', 'mutual_info', 'adjmutual_info', 'normmutual_info'] df = pd.DataFrame(rows, columns = column_names) if music_type == 'synth': df.to_csv('psegment-testresult_onset{}_seg{}.csv'.format(int(onset_threshold*100),int(segment_threshold*100))) elif music_type == 'real': df.to_csv('psegment-testresult-realaudio_onset{}_seg{}.csv'.format(int(onset_threshold*100),int(segment_threshold*100)))
def harmonic_index( sourcefile, offset=0.0, duration=120.0, key=None, output_dir=None, n_fft=4096, hop_length=1024, pitch_median=5, # how many frames for running medians? high_pass_f=40.0, low_pass_f=4000.0, debug=False, cached=True, n_peaks=16, **kwargs): """ Index spectral peaks """ if debug: from librosa.display import specshow import matplotlib.pyplot as plt # args that will make a difference to content, # apart from the sourcefile itself argset = dict( analysis="harmonic_index", # sourcefile=sourcefile, offset=offset, duration=duration, n_fft=n_fft, hop_length=hop_length, high_pass_f=high_pass_f, low_pass_f=low_pass_f, pitch_median=pitch_median, n_peaks=n_peaks, ) sourcefile = Path(sourcefile).resolve() if output_dir is None: output_dir = sourcefile.parent output_dir = Path(output_dir) if key is None: key = str(sourcefile.stem) + "___" + sfio.safeish_hash(argset) metadatafile = (output_dir / key).with_suffix(".json") if cached and metadatafile.exists(): return json.load(metadatafile.open("r")) metadata = dict(key=key, metadatafile=str(metadatafile), **argset) y, sr = sfio.load(str(sourcefile), sr=None, mono=True, offset=offset, duration=duration) if high_pass_f is not None: y = basicfilter.high_passed(y, sr, high_pass_f) dur = librosa.get_duration(y=y, sr=sr) metadata["dur"] = dur metadata["sr"] = sr # convert to spectral frames D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length) y_rms = librosa.feature.rmse(S=D) # Separate into harmonic and percussive. I think this preserves phase? H, P = librosa.decompose.hpss(D) # Resynthesize the harmonic component as waveforms y_harmonic = librosa.istft(H) harmonicfile = str(output_dir / key) + ".harmonic.wav" sfio.save(harmonicfile, y_harmonic, sr=sr, norm=True) metadata["harmonicfile"] = harmonicfile # Now, power spectrogram H_mag, H_phase = librosa.magphase(H) H_peak_f, H_peak_mag = librosa.piptrack(S=H_mag, sr=sr, fmin=high_pass_f, fmax=low_pass_f) # First we smooth to use inter-bin information H_peak_f = median_filter(H_peak_f, size=(1, pitch_median)) H_peak_mag = median_filter(H_peak_mag, size=(1, pitch_median)) H_peak_power = np.real(H_peak_mag**2) H_rms = librosa.feature.rmse(S=H_peak_mag) if debug: plt.figure() specshow(librosa.logamplitude(H_peak_f, ref_power=np.max), y_axis='log', sr=sr) plt.title('Peak Freqs') plt.figure() specshow(librosa.logamplitude(H_peak_power, ref_power=np.max), y_axis='log', sr=sr) plt.title('Peak amps') plt.figure() # Now we pack down to the biggest few peaks: H_peak_f, H_peak_power = compress_peaks(H_peak_f, H_peak_power, n_peaks) if debug: plt.figure() specshow(librosa.logamplitude(H_peak_f, ref_power=np.max), y_axis='log', sr=sr) plt.title('Peak Freqs packed') plt.figure() specshow(librosa.logamplitude(H_peak_power, ref_power=np.max), y_axis='log', sr=sr) plt.title('Peak amps packed') # plt.figure() # plt.scatter( # librosa.logamplitude(H_peak_power, ref_power=np.max), # y_axis='log', # sr=sr) # plt.title('Compressed') return dict( metadata=metadata, peak_f=H_peak_f, peak_power=H_peak_power, rms=y_rms, harm_rms=H_rms, )
SRC_PATH = 'I:\\dataset-esc\\ESC-10-oggtowav\\'
DST_PATH = 'I:\\dataset-esc\\ESC-10-oggtowavtrimmed\\'

folder_list = os.listdir(SRC_PATH)
folder_list.sort()

for i in range(0, len(folder_list)):
    files = os.listdir(SRC_PATH + folder_list[i])
    files.sort()
    for name in sorted(files):
        if fnmatch(name, "*.wav"):
            y, sr = librosa.load(SRC_PATH + folder_list[i] + '\\' + name)
            # trim the zeros
            yt, index = librosa.effects.trim(y)
            # create a class folder if it does not exist
            if not os.path.exists(DST_PATH + folder_list[i]):
                os.makedirs(DST_PATH + folder_list[i])
            # save the trimmed file
            librosa.output.write_wav(DST_PATH + folder_list[i] + '\\' + name, yt, sr)
            # print the original duration and the new one
            print(librosa.get_duration(y), librosa.get_duration(yt), max(y), max(yt))
samples.sort()
spk_name = 'enbible'
lang = 'en_us'
n_skip = 0
total_dur = 0

fw = open(os.path.join(output_path, 'metadata.csv'), 'w', encoding='utf-8')
i = 0
for l in samples:
    filename, script, _ = l.split('\t')
    wav_file = os.path.join(in_path, filename + '.wav')
    if not os.path.exists(wav_file):
        print("Missing", wav_file)
        continue
    dur = librosa.get_duration(filename=wav_file)
    if not 1 <= dur <= 20 or any([c.isdigit() for c in script]):
        print(filename, script, dur)
        n_skip += 1
        continue
    total_dur += dur
    shutil.copy(wav_file, os.path.join(wav_output_path, '%s_%010d.wav' % (spk_name, i)))
    fw.write('|'.join(['%s_%010d' % (spk_name, i), script, spk_name, lang]) + '\n')
    i += 1

print("%d samples, %d skipped" % (len(samples) - n_skip, n_skip))
print("Total duration: %.2f h, %.2f min" % (total_dur / 60 / 60, total_dur / 60))
else:
    etap_2 = '-'
    print('There will be no second stage')

# Record the date and time
date_time = str(datetime.now())
date_spliter = date_time.split(' ')
time_spliter = date_spliter[1].split('.')
date = date_spliter[0]
time = time_spliter[0]

# Get the id of the recognition result and the duration of the audio
unique_id = id(result)
duration = librosa.get_duration(filename=pat_h)

# "project" table
test = 'Testing'
description_1 = '-'

# Get the host name and IP address; "server" table
hostname = socket.gethostname()
ip_address = socket.gethostbyname(hostname)
description_2 = '-'

# Create a log file and write the info into it
logging.basicConfig(level=logging.DEBUG, filename="tinkoff.log")
logging.debug(
    'Logging: %s',
    {
        'date': date,
def gather_data(filename): ''' Formats the analysis of a sound file into a single easy-to-use dictionary! filename: the path to the sound file to be analyzed returns: a dictionary with lots of juicy info! { "beats" : a list of times at which a beat event occurs "framerate" : the size of a frame in seconds "numframes" : the total number of frames "frequencies" : a list of frequency spectrums (256 bins) at each frame "elevations" : a list of the relative pitch heights of each frame } ''' print "Gathering song data..." # get our song y, sr = librosa.load(filename) # separate the foreground and background y_harmonic, y_percussive = librosa.effects.hpss(y) # compute the frequency spectrum S = librosa.feature.melspectrogram(y, sr=sr, n_fft=2048, hop_length=64, n_mels=316) # Convert to log scale (dB) using the peak power as a reference log_S = librosa.logamplitude(S, ref_power=np.max) # format the frequencies into a list of 256 amplitudes at each frame frequencies = [[int(f[t]+80) for f in log_S[30:-30]] for t in xrange(len(log_S[0])) if t%30==0] # repeat each value 256 times frequencies = [[f for f in frame for _ in xrange(256)] for frame in frequencies] # get the framerate of the frequencies dur = librosa.get_duration(y) numframes = len(frequencies) framerate = dur / numframes # calculate the times of each beat event tempo, beat_frames = librosa.beat.beat_track(y=y_percussive, sr=sr, hop_length=64) beats = [0] + [librosa.frames_to_time(b, sr=sr, hop_length=64) for b in beat_frames] + [dur] # get the elevation at each frame elevations = [ elevation(freqs) for freqs in frequencies ] # set the list of amplitudes for each semitone chromagram = librosa.feature.chromagram(y, sr) amplitudes = [] for i in range(len(chromagram)): amplitudes.append(0) for t in chromagram[i]: amplitudes[i] += t # get the key of the whole audio base_key = key_finder(amplitudes) print base_key # get the base colors for the sphere amplitude_sum = 0.0 for amplitude in amplitudes: amplitude_sum += amplitude print major_score(amplitudes, base_key, amplitude_sum) print minor_score(amplitudes, base_key, amplitude_sum) base_red, base_green, base_blue = mood_finder(major_score( amplitudes, base_key, amplitude_sum) < minor_score( amplitudes, base_key, amplitude_sum), tempo) base_colors = [ base_red, base_green, base_blue ] return { "beats": beats, "framerate": framerate, "numframes": numframes, "frequencies": frequencies, "elevations": elevations, "base_colors": base_colors, }
def audio_length(filename):
    with open(filename, 'rb') as file:
        y = np.frombuffer(file.read(), dtype=np.float32)
    # Note: `sr` is not defined inside this function; the original code relies on a
    # module-level sample rate, since a raw float32 buffer carries no rate information.
    return librosa.get_duration(y, sr)
def create_video(self): selected_audio = filedialog.askopenfilename(parent=self, initialdir='Resources/', title='Select Audio File') if not selected_audio: return else: print(f'Selected Audio File: {selected_audio}') duration_s = librosa.get_duration(filename=selected_audio) duration_m = duration_s / 60 print(f'Duration in Seconds: {duration_s}') print(f'Duration in Seconds: {duration_m}') selected_visual = filedialog.askopenfilename( parent=self, initialdir='Resources/', title='Select The Image For The Video') selected_visual_type = filetype.guess(selected_visual) selected_visual_mime = selected_visual_type.mime print(selected_visual_mime) if not selected_visual: return else: print(f'Selected Image File: {selected_visual}') width = 1920 height = 1080 vid_size = width, height # Flag for Custom Dimensions custom_dims = False response = messagebox.askquestion( 'Use Custom Dimensions?', 'Do you want to specify custom Dimensions? Defaults to 1920x1080') if response == 'yes': ui_width = simpledialog.askinteger( title='Specify Width', prompt='Specify the Video Width') ui_height = simpledialog.askinteger( title='Specify Height', prompt='Specify the Video Height') if ui_width and ui_height: custom_dims = True else: print('Missing Dimension Input. Defaulting to 1920x1080') if selected_audio and selected_visual: audio_path = os.path.basename(str(selected_audio)) audio = AudioFileClip(selected_audio, fps=44100) visual_clip = None visual_clip = cast_to_clip(selected_visual, selected_visual_mime, duration_s) if custom_dims: visual_clip = resize(visual_clip, (ui_width, ui_height)) response = messagebox.askquestion( 'Add Watermark', 'Do you want to add a Watermark?') watermark_clip = None if response == 'yes': watermark = filedialog.askopenfilename( parent=self, initialdir='Resources/', title='Select The Image For The Watermark') watermark_type = filetype.guess(watermark) watermark_mime = watermark_type.mime if watermark: print(f'Adding Watermark: {watermark}') factor_table = {'50%': 2, '33%': 3, '25%': 4, '20%': 5} factor = 5 factor = simpledialog.askstring( 'Specify Scale of Watermark', 'Specify Scale of Watermark from [50%, 33%, 25%, 20%]') if isinstance(factor, str) and factor in factor_table.keys(): factor = factor_table[factor] print(f'Watermark Scale set to [{factor}]') if 'image' in watermark_mime: watermark_clip = (ImageClip(watermark).set_duration( duration_s).resize( (width / factor, height / factor)).set_pos( ('right', 'bottom'))) elif 'gif' in watermark_mime or 'video' in watermark_mime: watermark_clip = (VideoFileClip(watermark).resize( (width / factor, height / factor)).set_pos( ('right', 'bottom'))) watermark_clip = watermark_clip.fx(vfx.loop, duration=duration_s) if visual_clip: clip = visual_clip.set_audio(audio).set_duration(duration_s) if watermark_clip: clip = CompositeVideoClip([clip, watermark_clip]) clip.write_videofile('Exports/Test.mp4', fps=30)
def getTimeLenSec(file0):
    y, sr = librosa.load(file0, sr=None)
    timeLen = librosa.get_duration(y, sr)
    return timeLen
import librosa
import os
import config as hp
from tqdm import tqdm

if __name__ == "__main__":
    files = [f for f in os.listdir(hp.wav_folder) if f.endswith('.wav')]
    files = [os.path.join(hp.wav_folder, f) for f in files]
    print('num_files: ' + str(len(files)))

    total_duration = 0
    for file in tqdm(files):
        y, sr = librosa.load(file)
        duration = librosa.get_duration(y, sr)
        total_duration += duration

    print('total_duration: ' + str(total_duration))
def generate_vectors(self): '''Generates noise and class vectors as inputs for each frame''' PULSE_SMOOTH = 0.75 MOTION_SMOOTH = 0.75 classes = self.classes class_shuffle_seconds = self.class_shuffle_seconds or [0] class_shuffle_strength = round(self.class_shuffle_strength * 12) fps = self.fps class_smooth_frames = self.class_smooth_seconds * fps motion_react = self.motion_react * 20 / fps # Get number of noise vectors to initialize (based on speed_fpm) num_init_noise = round( librosa.get_duration(self.wav, self.sr) / 60 * self.speed_fpm) # If num_init_noise < 2, simply initialize the same # noise vector for all frames if num_init_noise < 2: noise = [self.truncation * \ truncnorm.rvs(-2, 2, size = (self.batch_size, self.input_shape)) \ .astype(np.float32)[0]] * \ len(self.spec_norm_class) # Otherwise, initialize num_init_noise different vectors, and generate # linear interpolations between these vectors else: # Initialize vectors init_noise = [self.truncation * \ truncnorm.rvs(-2, 2, size=(self.batch_size, self.input_shape)) \ .astype(np.float32)[0]\ for i in range(num_init_noise)] # Compute number of steps between each pair of vectors steps = int( np.floor(len(self.spec_norm_class)) / len(init_noise) - 1) # Interpolate noise = full_frame_interpolation(init_noise, steps, len(self.spec_norm_class)) # Initialize lists of Pulse, Motion, and Class vectors pulse_noise = [] motion_noise = [] self.class_vecs = [] # Initialize "base" vectors based on Pulse/Motion Reactivity values pulse_base = np.array([self.pulse_react] * self.input_shape) motion_base = np.array([motion_react] * self.input_shape) # Randomly initialize "update directions" of noise vectors self.motion_signs = np.array([random.choice([1,-1]) \ for n in range(self.input_shape)]) # Randomly initialize factors based on motion_randomness rand_factors = np.array([random.choice([1,1-self.motion_randomness]) \ for n in range(self.input_shape)]) for i in range(len(self.spec_norm_class)): # UPDATE NOISE # # Re-initialize randomness factors every 4 seconds if i % round(fps * 4) == 0: rand_factors = np.array([random.choice([1, 1-self.motion_randomness]) \ for n in range(self.input_shape)]) # Generate incremental update vectors for Pulse and Motion pulse_noise_add = pulse_base * self.spec_norm_pulse[i] motion_noise_add = motion_base * self.spec_norm_motion[i] * \ self.motion_signs * rand_factors # Smooth each update vector using a weighted average of # itself and the previous vector if i > 0: pulse_noise_add = pulse_noise[i-1]*PULSE_SMOOTH + \ pulse_noise_add*(1 - PULSE_SMOOTH) motion_noise_add = motion_noise[i-1]*MOTION_SMOOTH + \ motion_noise_add*(1 - MOTION_SMOOTH) # Append Pulse and Motion update vectors to respective lists pulse_noise.append(pulse_noise_add) motion_noise.append(motion_noise_add) # Update current noise vector by adding current Pulse vector and # a cumulative sum of Motion vectors noise[i] = noise[i] + pulse_noise_add + sum(motion_noise[:i + 1]) self.noise = noise self.current_noise = noise[i] # Update directions self.motion_signs = self.update_motion_signs() # UPDATE CLASSES # # If current frame is a shuffle frame, shuffle classes accordingly if self.is_shuffle_frame(i): self.classes = self.classes[class_shuffle_strength:] + \ self.classes[:class_shuffle_strength] # Generate class update vector and append to list class_vec_add = self.generate_class_vec(frame=i) self.class_vecs.append(class_vec_add) # Smoothen class vectors by obtaining the mean vector per # class_smooth_frames frames, and interpolating between these vectors 
if class_smooth_frames > 1:
    # Obtain mean vectors
    class_frames_interp = [np.mean(self.class_vecs[i:i + class_smooth_frames], axis=0)
                           for i in range(0, len(self.class_vecs), class_smooth_frames)]
    # Interpolate
    self.class_vecs = full_frame_interpolation(class_frames_interp,
                                               class_smooth_frames,
                                               len(self.class_vecs))
#model_cutoff = 200
render_duration = 1000.0
render_size = 1920
extension = 'png'

model_cutoff = 200
beats_per_frame = 4
sigma_weight = 1 / 2.5
exageration_weight = 0.10
exageration_sigma = 1 / 5.0
fps = 30

f_wav = "sound/secret_crates.wav"
WAV, sr = librosa.load(f_wav, duration=render_duration)
total_seconds = librosa.get_duration(WAV, sr)

save_dest = "results/interpolation_matching_beats"
os.system(f'mkdir -p {save_dest}')

f_beats = f_wav + '_beats.npy'
f_onset = f_wav + '_onset.npy'
beats = np.load(f_beats)
onsets = np.load(f_onset)

sess = create_session()
t_size = tf.placeholder_with_default(200, [])
t_image = cppn(t_size)
train_vars = sess.graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
# MARTINV
# FRANCISCOS
# ALANG
# FERNANDOO
# CESARA
# MARIOV
# KEIKOF
# VERONICAM
# LOCUTORA1
# LOCUTORA2
nombre_politico = "_JULIOG"
step = 1  # Interval of the audio, in seconds
file_name = 'audio0010'  # name of the file
file_name_full = "pasos/" + file_name + ".wav"  # Create a "pasos" folder inside the project folder

len_audio = int(librosa.get_duration(filename=file_name_full))  # Length of the audio in seconds
print("The audio has a duration of ", len_audio, " seconds")
espaciado = "_______________________________"
audio_vacio = []

# open the wav file and convert it to mono
sound = AudioSegment.from_wav(file_name_full)
sound = sound.set_channels(1)
file_name_full_mono = "pasos/" + file_name + "_mono.wav"
sound.export(file_name_full_mono, format="wav")

data1, rate1 = sf.read(file_name_full_mono)  # Load the audio
meter1 = pyln.Meter(rate1)
loudness1 = meter1.integrated_loudness(data1)
print(loudness1)
def add_background(cls, file_name, noise_path, out_dir, len_noise_to_add=5.0): ''' Takes an absolute file path, and the path to a directory that contains noise to overlay onto the given sound file (wind, rain, etc.). Returns a numpy structure corresponding to the original audio with the noise overlaid, plus the sample rate of the new sample. A file name is suggested for the sample. It is composed of elements such as the nature and duration of the noise. Client may choose to ignore or use. :param file_name: absolute path to sound file :type file_name: str :param noise_path: absolute path to directory with noise files :type noise_path: str :param out_dir: destination directory of new audio file :type out_dir: str :param len_noise_to_add: how much of a noise snippet to overlay (seconds) :type len_noise_to_add: float :return: full path of new audio file :rtype: str ''' len_noise_to_add = float(len_noise_to_add) backgrounds = os.listdir(noise_path) # Pick a random noise file: background_name = backgrounds[random.randint(0, len(backgrounds) - 1)] cls.log.info(f"Adding {background_name} to {file_name}.") # We will be working with 1 second as the smallest unit of time # load all of both wav files and determine the length of each noise, noise_sr = SoundProcessor.load_audio( os.path.join(noise_path, background_name)) # type(noise) = np.ndarray orig_recording, orig_sr = SoundProcessor.load_audio(file_name) new_sr = math.gcd(noise_sr, orig_sr) if noise_sr != orig_sr: # Resample both noise and orig records so that they have same sample rate cls.log.info(f"Resampling: {background_name} and {file_name}") noise = librosa.resample(noise, noise_sr, new_sr) orig_recording = librosa.resample(orig_recording, orig_sr, new_sr) # input("ready?") noise_duration = librosa.get_duration(noise, noise_sr) if noise_duration < len_noise_to_add: cls.log.info( f"Duration:{noise_duration} < len_noise_to_add:{len_noise_to_add}. Will only add {noise_duration}s of noise" ) samples_per_segment = len(noise) elif noise_duration >= len_noise_to_add: # randomly choose noise segment samples_per_segment = int( new_sr * len_noise_to_add ) # this is the number of samples per 5 seconds # Place noise randomly: subsegment_start = random.randint(0, len(noise) - samples_per_segment) noise = noise[subsegment_start:subsegment_start + samples_per_segment] cls.log.info( f"len(noise) after random segment: {len(noise)}; noise duration: {len(noise)/new_sr}" ) orig_duration = librosa.core.get_duration(orig_recording, orig_sr) # if orig_recording is shorter than the noise we want to add, just add 5% noise if orig_duration < len_noise_to_add: cls.log.info( f"Recording: {file_name} was shorter than len_noise_to_add. Adding 5% of recording len worth of noise." 
) new_noise_len = orig_duration * 0.05 noise = noise[:int(new_noise_len * new_sr)] noise_start_loc = random.randint( 0, len(orig_recording) - samples_per_segment) cls.log.info( f"Inserting noise starting at {noise_start_loc/new_sr} seconds.") # split original into three parts: before_noise, during_noise, after_noise before_noise = orig_recording[:noise_start_loc] during_noise = orig_recording[noise_start_loc:noise_start_loc + samples_per_segment] after_noise = orig_recording[noise_start_loc + samples_per_segment:] assert len(during_noise) == len(noise) segment_with_noise = during_noise + Utils.noise_multiplier( orig_recording, noise) * noise first_half = np.concatenate((before_noise, segment_with_noise)) new_sample = np.concatenate( (first_half, after_noise)) # what i think it should be new_duration = librosa.get_duration(new_sample, float(new_sr)) assert new_duration == orig_duration # File name w/o extension: sample_file_stem = Path(file_name).stem noise_file_stem = Path(background_name).stem noise_dur = str(int(noise_start_loc / new_sr * 1000)) file_name = f"{sample_file_stem}-{noise_file_stem}_bgd{noise_dur}ms.wav" # Ensure that the fname doesn't exist: uniq_fname = Utils.unique_fname(out_dir, file_name) out_path = os.path.join(out_dir, uniq_fname) soundfile.write(out_path, new_sample, new_sr) return out_path
train_sets = ['test']
for train_set in train_sets:
    #train_csv = base_dir + train_set + '.csv'
    train_audio_dir = base_dir + train_set + '/'
    output_train_csv = ourput_dir + train_set + '.csv'
    with open(output_train_csv, 'w') as ofp:
        #for audio_file in os.listdir(train_audio_dir):
        #if True:
        header = True
        ctr = 0
        #for line in tqdm(open(train_csv)):
        for audio_file in tqdm(os.listdir(train_audio_dir)):
            ctr += 1
            if ctr > 10:
                #break
                pass
            if header:
                ofp.write('duration,sampling_rate,waveform_length,' + 'fname' + '\n')
                header = False
                continue
            #audio_file = line.split(',')[0]
            audio_file_path = train_audio_dir + audio_file
            wave_form, sampling_rate = librosa.load(audio_file_path)
            duration = librosa.get_duration(y=wave_form, sr=sampling_rate)
            output_line = str(duration) + ',' + str(sampling_rate) + ',' + str(len(wave_form)) + ',' + audio_file + '\n'
            ofp.write(output_line)
ref_recording, sr = music_parser.readMusicFile(f'assets/{ref_track}')
test_recording, sr = music_parser.readMusicFile(f'assets/{test_track}')

# Importing the metadata from the JSON-file
meta_data = JSON_Classifier()
meta_data.readJSON('assets/testdata.json')

# Splitting the audio file in segments, according to the metadata
segment_list = music_parser.splitReferenceRecording(meta_data.segments, sr, ref_recording)

# Feature Extraction/Definition
ref_length = librosa.get_duration(ref_recording, sr=sr)
test_length = librosa.get_duration(test_recording, sr=sr)
frame_length = 9600
hopsize = 4800
window = 'hann'

# Creating each chromagram
ref_chromagram = music_parser.compute_chromagrams(segment_list, sr,
                                                  norm=None,
                                                  hop_length=hopsize,
                                                  n_fft=frame_length,
                                                  window=window,
                                                  tuning=0)
test_chromagram = music_parser.compute_one_chromagram(test_recording, sr,
def save_trimwav(wavpath, putpath, top_db=20):
    y, sr = lib.load(wavpath)
    yt, index = lib.effects.trim(y, top_db=top_db)
    print(lib.get_duration(y), lib.get_duration(yt))
    lib.output.write_wav(putpath, yt, sr)
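# Added usage sketch (paths are placeholders): trim leading/trailing audio below the
# 30 dB threshold and write the result; the printed pair is the duration before and
# after trimming.
save_trimwav('raw/take1.wav', 'trimmed/take1.wav', top_db=30)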
import matplotlib.pyplot as plt

# Utility
import os
import glob
import numpy as np
from tqdm import tqdm
import itertools

dataset = []
for folder in ["dataset/set_a/**", "dataset/set_b/**"]:
    for filename in glob.iglob(folder):
        if os.path.exists(filename):
            label = os.path.basename(filename).split("_")[0]
            # skip audio smaller than 4 secs
            if librosa.get_duration(filename=filename) >= 4:
                if label not in ["Aunlabelledtest", "Bunlabelledtest"]:
                    dataset.append({"filename": filename, "label": label})

dataset = pd.DataFrame(dataset)
dataset = shuffle(dataset, random_state=42)
print(dataset.info())

plt.figure(figsize=(12, 6))
dataset.label.value_counts().plot(kind='bar', title="Dataset distribution")
plt.show()

train, test = train_test_split(dataset, test_size=0.2, random_state=42)
print("Train: %i" % len(train))
print("Test: %i" % len(test))
def generate_json(wavfile, DT_ID, song_db):
    indv = wavfile.parent.parent.stem
    dt = datetime.strptime(wavfile.stem, "%Y-%m-%d_%H-%M-%S-%f")
    datestring = dt.strftime("%Y-%m-%d")
    row = song_db[
        (song_db.SubjectName == indv)
        & (song_db.recording_date == datestring)
        & (song_db.recording_time == dt.time())
    ].iloc[0]

    # make json dictionary
    json_dict = {}
    for key in dict(row).keys():
        if type(row[key]) == pd._libs.tslibs.timestamps.Timestamp:
            json_dict[key] = row[key].strftime("%Y-%m-%d_%H-%M-%S")
        elif type(row[key]) == dtt:
            json_dict[key] = row[key].strftime("%H:%M:%S")
        elif type(row[key]) == pd._libs.tslibs.nattype.NaTType:
            continue
        else:
            json_dict[key] = row[key]

    json_dict["species"] = "Toxostoma redivivum"
    json_dict["common_name"] = "California thrasher"
    json_dict["datetime"] = datestring

    sr = get_samplerate(wavfile.as_posix())
    wav_duration = librosa.get_duration(filename=wavfile.as_posix())

    # rate and length
    json_dict["samplerate_hz"] = sr
    json_dict["length_s"] = wav_duration
    json_dict["wav_loc"] = wavfile.as_posix()

    tg = wavfile.parent.parent / "TextGrids" / (wavfile.stem + ".TextGrid")
    textgrid = tgio.openTextgrid(fnFullPath=tg)
    tierlist = textgrid.tierDict[textgrid.tierNameList[0]].entryList
    start_times = [i.start for i in tierlist]
    end_times = [i.end for i in tierlist]
    labels = [i.label for i in tierlist]

    json_dict["indvs"] = {
        indv: {
            "syllables": {
                "start_times": NoIndent(start_times),
                "end_times": NoIndent(end_times),
                "labels": NoIndent(labels),
            }
        }
    }

    # generate json
    json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)
    json_out = (
        DATA_DIR / "processed" / DATASET_ID / DT_ID / "JSON" / (wavfile.stem + ".JSON")
    )

    # save json
    ensure_dir(json_out.as_posix())
    print(json_txt, file=open(json_out.as_posix(), "w"))
def extract_ground_truth(diagnostics_group): """ Extract ground-truth information from one or more MIDI files about a single MIDI file based on the results in one or more diagnostics files and return a JAMS object with all of the annotations compiled. Parameters ---------- - diagnostics_group : list of dict List of dicts of diagnostics, each about a successful alignment of a different MIDI file to a single audio file. """ # Construct the JAMS object jam = jams.JAMS() # Load in the first diagnostics (doesn't matter which as they all # should correspond the same audio file) diagnostics = diagnostics_group[0] # Load in the audio file to get its duration for the JAMS file audio, fs = librosa.load( diagnostics['audio_filename'], feature_extraction.AUDIO_FS) jam.file_metadata.duration = librosa.get_duration(y=audio, sr=fs) # Also store metadata about the audio file, retrieved from the MSD jam.file_metadata.identifiers = {'track_id': diagnostics['audio_id']} jam.file_metadata.artist = MSD_LIST[diagnostics['audio_id']]['artist'] jam.file_metadata.title = MSD_LIST[diagnostics['audio_id']]['title'] # Iterate over the diagnostics files supplied for diagnostics in diagnostics_group: # Create annotation metadata object, shared across annotations commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip() commit_url = "http://github.com/craffel/midi-dataset/tree/" + commit annotator = {'midi_md5': diagnostics['midi_md5'], 'commit_url': commit_url, 'confidence': diagnostics['score']} annotation_metadata = jams.AnnotationMetadata( curator=jams.Curator('Colin Raffel', '*****@*****.**'), version='0.0.1b', corpus='Million Song Dataset MIDI Matches', annotator=annotator, annotation_tools=( 'MIDI files were matched and aligned to audio files using the ' 'code at http://github.com/craffel/midi-dataset. Information ' 'was extracted from MIDI files using pretty_midi ' 'https://github.com/craffel/pretty-midi.'), annotation_rules=( 'Beat locations and key change times were linearly ' 'interpolated according to an audio-to-MIDI alignment.'), validation=( 'Only MIDI files with alignment confidence scores >= .5 were ' 'considered "correct". 
The confidence score can be used as a ' 'rough guide to the potential correctness of the annotation.'), data_source='Inferred from a MIDI file.') # Load the extracted features midi_features = deepdish.io.load(diagnostics['midi_features_filename']) audio_features = deepdish.io.load( diagnostics['audio_features_filename']) # Load in the original MIDI file midi_object = pretty_midi.PrettyMIDI(diagnostics['midi_filename']) # Compute the times of the frames (will be used for interpolation) midi_frame_times = feature_extraction.frame_times( midi_features['gram'])[diagnostics['aligned_midi_indices']] audio_frame_times = feature_extraction.frame_times( audio_features['gram'])[diagnostics['aligned_audio_indices']] # Get the interpolated beat locations and add them to the JAM adjusted_beats = interpolate_times( midi_object.get_beats(), midi_frame_times, audio_frame_times) # Create annotation record for the beats beat_a = jams.Annotation(namespace='beat') beat_a.annotation_metadata = annotation_metadata # Add beat timings to the annotation record for t in adjusted_beats: beat_a.append(time=t, duration=0.0) # Add beat annotation record to the JAMS file jam.annotations.append(beat_a) # Get key signature times and their string names key_change_times = [c.time for c in midi_object.key_signature_changes] key_names = [pretty_midi.key_number_to_key_name(c.key_number) for c in midi_object.key_signature_changes] # JAMS requires that the key name be supplied in the form e.g. # "C:major" but pretty_midi returns things in the format "C Major", # so the following code converts to JAMS format key_names = [name.replace(' ', ':').replace('M', 'm') for name in key_names] # Compute interpolated event times adjusted_key_change_times, adjusted_key_names = interpolate_times( key_change_times, midi_frame_times, audio_frame_times, key_names, True) # Create JAMS annotation for the key changes if len(adjusted_key_change_times) > 0: key_a = jams.Annotation(namespace='key_mode') key_a.annotation_metadata = annotation_metadata # We only have key start times from the MIDI file, but JAMS wants # durations too, so create a list of "end times" end_times = np.append(adjusted_key_change_times[1:], jam.file_metadata.duration) # Add key labels into the JAMS file for start, end, key in zip(adjusted_key_change_times, end_times, adjusted_key_names): key_a.append(time=start, duration=end - start, value=key) jam.annotations.append(key_a) return jam
embed = encoder.embed_utterance(preprocessed_wav)
print("Created the embedding")

texts = [in_text]
embeds = [embed]
# embeds = [[0] * 256]
specs = synthesizer.synthesize_spectrograms(texts, embeds)
spec = specs[0]
print("Created the mel spectrogram")

griffin_lim = False
if not griffin_lim:
    generated_wav = vocoder.infer_waveform(spec)
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                           mode="constant")
    generated_wav = encoder.preprocess_wav(generated_wav)
else:
    generated_wav = Synthesizer.griffin_lim(spec)

write = True
if write:
    sf.write(out_path, generated_wav.astype(np.float32),
             round(synthesizer.sample_rate / 1.0))
    print("Audio file has been written.")

# Measure the playback length with the same sample rate used for writing and
# playback (the original hard-coded sr=14545, which makes the sleep below
# mismatch the actual audio length).
audio_length = librosa.get_duration(generated_wav, sr=synthesizer.sample_rate)
sd.play(generated_wav.astype(np.float32), round(synthesizer.sample_rate / 1.0))
time.sleep(audio_length)
print("Done")
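Instead of sleeping for a separately computed duration, sounddevice can block until playback finishes. A minimal sketch of that alternative, assuming the same generated_wav and synthesizer objects as in the snippet above:

# Alternative sketch: let sounddevice block until playback ends rather than
# computing the duration and sleeping.  Assumes generated_wav and
# synthesizer.sample_rate from the snippet above.
sd.play(generated_wav.astype(np.float32), synthesizer.sample_rate)
sd.wait()  # blocks until the audio has finished playing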
def __test_audio(filename, mono, sr, duration):
    y, sr = librosa.load(filename, sr=sr, mono=mono, duration=duration)
    duration_est = librosa.get_duration(y=y, sr=sr)
    assert np.allclose(duration_est, duration, rtol=1e-3, atol=1e-5)
def predict(self, audio_data, pedal_gt, batch_size=4):
    pedal_onset_gt, pedal_offset_gt = pedal_gt

    # ======= Get prediction of all onsets =======
    len_onset_shape = int(
        SAMPLING_RATE * (TRIM_SECOND_BEFORE + TRIM_SECOND_AFTER))
    onsethop_length = HOP_LENGTH
    onsethop_duration = onsethop_length / SAMPLING_RATE
    n_onset = int(
        np.ceil((len(audio_data) - len_onset_shape) / onsethop_length))
    print("Audio duration: {}s".format(
        librosa.get_duration(y=audio_data, sr=SAMPLING_RATE)))
    print("n_onset: {}".format(n_onset))
    data_full_song = FullSongDataset(
        audio_data, n_onset, len_onset_shape, "onset", onsethop_length)
    loader = DataLoader(data_full_song, batch_size)
    pred_onset, _ = models.run_on_dataset(
        self.onset_model, loader, device=self.device)
    pred_onset = pred_onset.squeeze()
    # print("Onset Prediction:\n{}".format(pred_onset))
    pred_onset_filter = medfilt(pred_onset, 15)
    frmtime_onset = np.arange(n_onset) * onsethop_duration + TRIM_SECOND_BEFORE
    print("Filtered Onset Prediction:\n{}\nMax: {}, Min: {}\n".format(
        pred_onset_filter, np.max(pred_onset_filter),
        np.min(pred_onset_filter)))

    # ======= Get prediction of all segments =======
    len_segment_shape = int(SAMPLING_RATE * MIN_SRC)
    seghop_length = HOP_LENGTH * 10
    seghop_duration = seghop_length / SAMPLING_RATE
    n_segment = int(
        np.ceil((len(audio_data) - len_segment_shape) / seghop_length))
    print("n_segment: {}".format(n_segment))
    data_full_song.type_excerpt, data_full_song.n_elem = "segment", n_segment
    pred_segment, _ = models.run_on_dataset(
        self.segment_model, loader, device=self.device)
    pred_segment = pred_segment.squeeze()
    # print("Segment Prediction:\n{}".format(pred_segment))
    pred_segment_filter = medfilt(pred_segment, 3)
    frmtime_segment = np.arange(n_segment) * seghop_duration + MIN_SRC / 2
    # Use the method's audio_data argument here (the original referenced an
    # undefined `audio` variable).
    audio_firstonsettime = librosa.frames_to_time(
        librosa.onset.onset_detect(y=audio_data, sr=SAMPLING_RATE),
        sr=SAMPLING_RATE)[0]
    n_segment_tozero = 0
    for t in frmtime_segment:
        if t < audio_firstonsettime:
            n_segment_tozero += 1
        else:
            break
    print("length frmtime_segment: {}".format(len(frmtime_segment)))
    print("n_segment_tozero: {}".format(n_segment_tozero))
    pred_segment_filter[:n_segment_tozero] = 0
    print("Filtered Segment Prediction:\n{}".format(pred_segment_filter))

    # ======= Fuse Prediction =======
    self.onset_threshold = np.median(pred_onset_filter)
    self.segment_threshold = np.median(pred_segment_filter)
    pred_onset_todetect = np.copy(pred_onset_filter)
    # print(pred_onset_todetect)
    pred_onset_todetect[pred_onset_todetect < self.onset_threshold] = 0
    pred_onset_todetect[pred_onset_todetect >= self.onset_threshold] = 1
    pred_segment_todetect = np.copy(pred_segment_filter)
    pred_segment_todetect[pred_segment_todetect < self.segment_threshold] = 0
    pred_segment_todetect[pred_segment_todetect >= self.segment_threshold] = 1
    # print(pred_segment_todetect.any())
    # print(pred_onset_todetect.any())

    # decide the initial indexes of pedal segment boundary
    onseg_initidxs = []
    offseg_initidxs = []
    for idx, v in enumerate(pred_segment_todetect):
        if idx > 0 and idx < len(pred_segment_todetect) - 1:
            if pred_segment_todetect[idx - 1] == 0 and v == 1 and \
                    pred_segment_todetect[idx + 1] == 1:
                onseg_initidxs.append(idx - 1)
            elif pred_segment_todetect[idx - 1] == 1 and v == 1 and \
                    pred_segment_todetect[idx + 1] == 0:
                offseg_initidxs.append(idx + 1)
    print("onseg_initidxs: {}\n{}".format(
        len(onseg_initidxs), onseg_initidxs))
    print("offseg_initidxs: {}\n{}".format(
        len(offseg_initidxs), offseg_initidxs))
    if offseg_initidxs[0] <= onseg_initidxs[0]:
        del offseg_initidxs[0]
    if onseg_initidxs[-1] >= offseg_initidxs[-1]:
        del onseg_initidxs[-1]

    if (len(onseg_initidxs) != len(offseg_initidxs)) or \
            not len(pedal_offset_gt) or not len(pedal_onset_gt):
        print(" skip!")
    else:
        onseg_idxs = []
        offseg_idxs = []
        for idx in range(len(onseg_initidxs)):
            if onseg_initidxs[idx] < offseg_initidxs[idx]:
                onseg_idxs.append(onseg_initidxs[idx])
                offseg_idxs.append(offseg_initidxs[idx])
        if not len(onseg_idxs) or not len(offseg_idxs):
            print("no detection!")
        else:
            # decide the boundary times in seconds, combining the effect of
            # pedal onset
            onseg_times = []
            offseg_times = []
            for idx, onseg_idx in enumerate(onseg_idxs):
                onponset_idx = onseg_idx * 10 - 5
                if any(pred_onset_todetect[onponset_idx - 5:
                                           onponset_idx + 5]):
                    offseg_idx = offseg_idxs[idx]
                    offseg_times.append(frmtime_segment[offseg_idx])
                    onseg_times.append(frmtime_segment[onseg_idx])
            segintervals_est = np.stack(
                (np.asarray(onseg_times), np.asarray(offseg_times)), axis=-1)

            # set the ground truth and estimation results frame by frame
            audio_duration = librosa.get_duration(
                y=audio_data, sr=SAMPLING_RATE)
            n_frames = int(np.ceil(audio_duration / seghop_duration))
            segframes_gt = np.zeros(n_frames)
            segframes_est = np.zeros(n_frames)
            longpseg_idx = np.where(
                (pedal_offset_gt - pedal_onset_gt) > seghop_duration)[0]
            longseg_onset_gt = pedal_onset_gt[longpseg_idx]
            longseg_offset_gt = pedal_offset_gt[longpseg_idx]
            segintervals_gt = np.stack(
                (longseg_onset_gt, longseg_offset_gt), axis=-1)
            for idx, onset_t in enumerate(longseg_onset_gt):
                offset_t = longseg_offset_gt[idx]
                onset_frm = int(onset_t // seghop_duration)
                offset_frm = int(offset_t // seghop_duration)
                segframes_gt[onset_frm:offset_frm] = 1
            for idx, onset_t in enumerate(onseg_times):
                offset_t = offseg_times[idx]
                onset_frm = int(onset_t // seghop_duration)
                offset_frm = int(offset_t // seghop_duration)
                segframes_est[onset_frm:offset_frm] = 1

            # set the ground truth and estimation results as interval format
            segintervals1_gt, segintervals01_gt, labels_gt = \
                intervals1tointervals01(segintervals_gt, audio_duration)
            segintervals1_est, segintervals01_est, labels_est = \
                intervals1tointervals01(segintervals_est, audio_duration)

            frmtimes = np.arange(n_frames) * seghop_duration
            # left, right = [150, 170]
            plt.figure(figsize=(15, 5))
            librosa.display.waveplot(audio_data, SAMPLING_RATE, alpha=0.8)
            plt.fill_between(frmtimes, 0, 0.5, where=segframes_gt > 0,
                             facecolor='green', alpha=0.7,
                             label='ground truth')
            plt.fill_between(frmtimes, -0.5, 0, where=segframes_est > 0,
                             facecolor='orange', alpha=0.7,
                             label='estimation')
            # plt.title("Pedal segment detection of {}".format(filename))
            plt.legend()
            # plt.xlim([left,right])
            # plt.show()
            plt.savefig("test")

    return segframes_est
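The fusion step above binarizes both prediction curves against their medians. A small hedged sketch of that thresholding on a toy array (the values are made up):

# Sketch of the median-threshold binarization used in the fusion step.
import numpy as np

pred = np.array([0.1, 0.4, 0.6, 0.9, 0.3])
threshold = np.median(pred)           # 0.4
binary = (pred >= threshold).astype(float)
print(binary)                         # [0. 1. 1. 1. 0.]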
def test_get_duration_fail():
    librosa.get_duration(y=None, S=None, filename=None)
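As written, this test only passes if the surrounding framework (e.g. a decorator not shown here) expects the call to raise. A hedged sketch of the same check written with an explicit pytest guard, assuming librosa raises ParameterError when no input is supplied:

# Sketch: explicit failure expectation with pytest; assumes ParameterError is
# the exception librosa raises when y, S, and filename are all None.
import pytest
import librosa


def test_get_duration_fail_explicit():
    with pytest.raises(librosa.ParameterError):
        librosa.get_duration(y=None, S=None, filename=None)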
def main(wav_path, text_path=None, rttm_path=None, uem_path=None,
         ctm_path=None, manifest_filepath=None, add_duration=False):
    if os.path.exists(manifest_filepath):
        os.remove(manifest_filepath)

    wav_pathlist = read_file(wav_path)
    wav_pathdict = get_dict_from_wavlist(wav_pathlist)
    len_wavs = len(wav_pathlist)
    uniqids = sorted(wav_pathdict.keys())

    text_pathdict = get_path_dict(text_path, uniqids, len_wavs)
    rttm_pathdict = get_path_dict(rttm_path, uniqids, len_wavs)
    uem_pathdict = get_path_dict(uem_path, uniqids, len_wavs)
    ctm_pathdict = get_path_dict(ctm_path, uniqids, len_wavs)

    lines = []
    for uid in uniqids:
        wav, text, rttm, uem, ctm = (
            wav_pathdict[uid],
            text_pathdict[uid],
            rttm_pathdict[uid],
            uem_pathdict[uid],
            ctm_pathdict[uid],
        )

        audio_line = wav.strip()
        if rttm is not None:
            rttm = rttm.strip()
            labels = rttm_to_labels(rttm)
            num_speakers = len(Counter([l.split()[-1] for l in labels]).keys())
        else:
            num_speakers = None

        if uem is not None:
            uem = uem.strip()

        if text is not None:
            text = open(text.strip()).readlines()[0].strip()
        else:
            text = "-"

        if ctm is not None:
            ctm = ctm.strip()

        duration = None
        if add_duration:
            # Load the audio at its native sample rate and measure its length
            # (the original called librosa.get_duration here, which does not
            # return a (signal, sample rate) pair).
            y, sr = librosa.load(audio_line, sr=None)
            duration = librosa.get_duration(y=y, sr=sr)

        meta = [{
            "audio_filepath": audio_line,
            "offset": 0,
            "duration": duration,
            "label": "infer",
            "text": text,
            "num_speakers": num_speakers,
            "rttm_filepath": rttm,
            "uem_filepath": uem,
            "ctm_filepath": ctm,
        }]
        lines.extend(meta)

    write_file(manifest_filepath, lines, range(len(lines)))
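When only the duration is needed for the manifest, librosa can also read it straight from the file without keeping the samples around. A minimal sketch ('some_audio.wav' is a placeholder path):

# Sketch: duration directly from a file path, without retaining the signal.
import librosa

duration = librosa.get_duration(filename='some_audio.wav')
print(round(duration, 3))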
def create_animated_video(self):
    # Load in Audio file
    selected_audio = filedialog.askopenfilename(parent=self,
                                                initialdir='Resources/',
                                                title='Select Audio File')
    if not selected_audio:
        return
    else:
        print(f'Selected Audio File: {selected_audio}')

    duration_s = librosa.get_duration(filename=selected_audio)
    duration_m = duration_s / 60
    print(f'Duration in Seconds: {duration_s}')
    print(f'Duration in Minutes: {duration_m}')

    # Load in Image/Video to use as Background
    selected_background = filedialog.askopenfilename(
        parent=self,
        initialdir='Resources/',
        title='Select The Background For The Video')
    # Bail out before probing the file type if nothing was selected
    if not selected_background:
        return
    else:
        print(f'Selected Background File: {selected_background}')
    selected_background_type = filetype.guess(selected_background)
    selected_background_mime = selected_background_type.mime
    print(selected_background_mime)

    # Load in Image to be Animated
    selected_animation = filedialog.askopenfilename(
        parent=self,
        initialdir='Resources/',
        title='Select The Asset to Animate. Transparent PNGs work best')
    if not selected_animation:
        return
    else:
        print(f'Selected File for Animation: {selected_animation}')
    selected_animation_type = filetype.guess(selected_animation)
    selected_animation_mime = selected_animation_type.mime
    print(selected_animation_mime)

    image = Image.open(selected_animation)
    animation_width, animation_height = image.size

    width = 1920
    height = 1080
    vid_size = width, height

    # Flag for Custom Dimensions
    custom_dims = False
    response = messagebox.askquestion(
        'Use Custom Dimensions?',
        'Do you want to specify custom Dimensions? Defaults to 1920x1080')
    if response == 'yes':
        ui_width = simpledialog.askinteger(
            title='Specify Width', prompt='Specify the Video Width')
        ui_height = simpledialog.askinteger(
            title='Specify Height', prompt='Specify the Video Height')
        if ui_width and ui_height:
            custom_dims = True
        else:
            print('Missing Dimension Input. Defaulting to 1920x1080')

    if selected_audio and selected_background and selected_animation:
        audio_path = os.path.basename(str(selected_audio))
        audio = AudioFileClip(selected_audio, fps=44100)
        bg_visual_clip = None
        animation_clip = None

        # Determine Type and Cast to Moviepy Clip
        bg_visual_clip = cast_to_clip(selected_background,
                                      selected_background_mime, duration_s)
        if custom_dims:
            bg_visual_clip = resize(bg_visual_clip, (ui_width, ui_height))

        if selected_animation:
            factor_table = {'50%': 2, '33%': 3, '25%': 4, '20%': 5}
            # Keep the default scale unless a valid choice is entered, so the
            # divisions below never operate on the raw input string.
            factor = 5
            factor_choice = simpledialog.askstring(
                'Specify Scale of Animated Visual',
                'Specify Scale of Animated Visual from [50%, 33%, 25%, 20%]')
            if isinstance(factor_choice, str) and factor_choice in factor_table:
                factor = factor_table[factor_choice]
                print(f'Watermark Scale set to [{factor}]')

            if 'image' in selected_animation_mime:
                animation_clip = (ImageClip(selected_animation)
                                  .set_duration(duration_s)
                                  .resize((animation_width / factor,
                                           animation_height / factor))
                                  .set_pos(('center', 'center')))
            elif 'gif' in selected_animation_mime or 'video' in selected_animation_mime:
                animation_clip = (VideoFileClip(selected_animation)
                                  .resize((animation_width / factor,
                                           animation_height / factor))
                                  .set_pos(('center', 'center')))
                animation_clip = animation_clip.fx(vfx.loop,
                                                   duration=duration_s)
            animation_clip = animation_clip.fx(
                vfx.rotate, lambda t: 90 * t,
                expand=False).set_duration(duration_s)

        if bg_visual_clip and animation_clip:
            bg_clip = bg_visual_clip.set_audio(audio).set_duration(duration_s)
            clip = CompositeVideoClip([bg_clip, animation_clip])
            clip.write_videofile('Exports/Test.mp4', fps=30)
print 'Total passed: ' + str(timedelta(seconds = f - s))

songname = sys.argv[1].split('.')
if songname[-1] == 'mp3':
    from pydub import AudioSegment
    song = AudioSegment.from_mp3('.'.join(songname))
    songname[-1] = "wav"
    songname = '.'.join(songname)
    song.export(songname, format = "wav")
else:
    songname = '.'.join(songname)

print 'Start reading file'
# read file
src, samplerate = load(songname)
dur = get_duration(y=src, sr=samplerate)

# set time
stime = time()

# get chromagram
print 'get chromagram'
chromagram = chroma_stft(y = src, sr = samplerate, hop_length = 512 * 8)
printDt(stime, time())

# count correlation
print 'count correlation'
correlation = np.corrcoef(np.cov(np.transpose(chromagram)))
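The last step builds a similarity matrix from the chromagram. A hedged sketch of a more direct frame-to-frame construction, assuming `chromagram` has shape (12, n_frames) as in the snippet above; this is an illustrative alternative, not the original script's exact computation:

# Sketch: frame-by-frame chroma self-similarity via correlation.  np.corrcoef
# treats each row as one variable, so transposing gives one row per frame.
import numpy as np

similarity = np.corrcoef(np.transpose(chromagram))  # shape (n_frames, n_frames)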
meta_index[os.path.join(in_path, 'train', m['audio_filepath'])] = m

speakers.sort()
samples = []
n_skipped = 0
n_spk_skipped = 0
n_samples_spk = []
total_dur = 0
for spk_dir in tqdm.tqdm(speakers):
    spk_name = os.path.split(spk_dir)[-1]
    spk_name = 'LSRU' + spk_name
    i = 0
    spk_s = 0
    base_wav_files = sorted(
        glob.glob(os.path.join(spk_dir, '**', '*.wav'), recursive=True))
    durations = [librosa.get_duration(filename=w) for w in base_wav_files]
    scores = [meta_index[w]['score'] for w in base_wav_files]
    wav_files = [(w, d, s)
                 for w, d, s in zip(base_wav_files, durations, scores)
                 if 1 <= d <= 20 and s >= -1]
    n_skipped += len(base_wav_files) - len(wav_files)
    if len(wav_files) < 100:
        n_skipped += len(wav_files)
        n_spk_skipped += 1
        continue
    for wav_file, dur, score in wav_files:
        filename = os.path.split(wav_file)[-1]
        script = meta_index[wav_file]['text_no_preprocessing']
        if any([c in "1234567890" for c in script]):
            n_skipped += 1
def predict(self, wav_file_path):
    ''' Function which generates local predictions from a wave file '''
    # Create a local directory to save the 2-second clips
    local_dir = "./fastai_dir/"
    if not os.path.exists(local_dir):
        os.makedirs(local_dir)

    # infer clip length
    max_length = get_duration(filename=wav_file_path)
    # NOTE: the inferred length is immediately overridden with a fixed
    # 60-second analysis window in the original code.
    max_length = 60

    # Generating 2 sec proposals with 1 sec hop length
    twoSecList = []
    for i in range(int(floor(max_length) - 1)):
        twoSecList.append([i, i + 2])

    # Creating a proposal dictionary
    two_sec_dict = {}
    two_sec_dict[Path(wav_file_path).name] = twoSecList

    # local directory
    extract_segments(str(Path(wav_file_path).parent), two_sec_dict,
                     local_dir, "")

    # Defining the audio config needed to create on-the-fly mel spectrograms
    config = AudioConfig(
        standardize=False,
        sg_cfg=SpectrogramConfig(
            f_min=0.0,         # Minimum frequency to display
            f_max=10000,       # Maximum frequency to display
            hop_length=256,
            n_fft=2560,        # Number of samples for Fourier transform
            n_mels=256,        # Mel bins
            pad=0,
            to_db_scale=True,  # Converting to dB scale
            top_db=100,        # Top decibel sound
            win_length=None,
            n_mfcc=20))
    config.duration = 4000      # 4 sec padding or snip
    config.resample_to = 20000  # Every sample at 20000 frequency

    # Creating an audio DataLoader
    test_data_folder = Path(local_dir)
    tfms = None
    test = AudioList.from_folder(
        test_data_folder, config=config).split_none().label_empty()
    testdb = test.transform(tfms).databunch(bs=32)

    # Scoring each 2 sec clip
    predictions = []
    pathList = []
    for item in testdb.x:
        predictions.append(self.model.predict(item)[2][1])
        pathList.append(str(item.path))

    # clean folder
    shutil.rmtree(local_dir)

    # Aggregating predictions
    # Creating a DataFrame
    prediction = pd.DataFrame({'FilePath': pathList, 'pred': predictions})
    # Converting prediction to float
    prediction['pred'] = prediction.pred.astype(float)
    # Extracting filename
    prediction['FileName'] = prediction.FilePath.apply(
        lambda x: x.split('/')[6].split("-")[0])
    # Extracting starting time from file name
    prediction['startTime'] = prediction.FileName.apply(
        lambda x: int(x.split('__')[1].split('.')[0].split('_')[0]))
    # Sorting the file based on startTime
    prediction = prediction.sort_values(['startTime']).reset_index(drop=True)

    # Rolling window (to average at the per-second level)
    submission = pd.DataFrame({
        'pred': list(prediction.rolling(2)['pred'].mean().values)
    }).reset_index().rename(columns={'index': 'StartTime'})
    # Updating first row
    submission.loc[0, 'pred'] = prediction.pred[0]
    # Adding last row
    lastLine = pd.DataFrame({
        'StartTime': [submission.StartTime.max() + 1],
        'pred': [prediction.pred[prediction.shape[0] - 1]]
    })
    submission = submission.append(lastLine, ignore_index=True)

    # initialize output JSON
    result_json = {}
    result_json["local_predictions"] = list(
        (submission['pred'] > 0.5).astype(int))
    result_json["local_confidences"] = list(submission['pred'])
    result_json["global_predictions"] = int(
        sum(result_json["local_predictions"]) >
        self.global_aggregation_percentile_threshold)
    result_json["global_confidence"] = submission.loc[
        (submission['pred'] > 0.5), 'pred'].mean()

    return result_json
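The per-second aggregation above averages each pair of overlapping 2-second clip scores. A small hedged sketch of that rolling mean on made-up clip scores:

# Sketch of the rolling-window aggregation: each 2-second clip overlaps its
# neighbour by 1 second, so averaging adjacent clip scores gives a
# per-second confidence.  The scores below are made up.
import pandas as pd

clip_scores = pd.DataFrame({'pred': [0.1, 0.8, 0.9, 0.2]})
per_second = clip_scores.rolling(2)['pred'].mean()
print(per_second.tolist())  # [nan, 0.45, 0.85, 0.55]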
import numpy as np
import matplotlib.pyplot as plt
import librosa
import matplotlib.patches as mpatches

with open("test_text_files/Female_1a_Amp_4096.txt") as f:
    rms_vals = [line.rstrip('\n') for line in f]
rms_vals = str(rms_vals).strip('[').strip(']').strip('[').strip(']').strip("'").strip('[').strip(']')
new_array = rms_vals.split(", ")
new_array = [float(i) for i in new_array]
print("RMS: " + str(new_array))

ta, rate = librosa.load("Female_1a.wav", sr=44100)
dur = librosa.get_duration(ta, sr=44100)

# make time stamps
times = np.linspace(0, dur, len(new_array))

# ONSET TEST SCRIPT
# make array of third character (we will see silent values)
with open("test_text_files/Female_1a_Labels.txt", 'r+') as f:
    third_char = [line.split()[2] for line in f]  # this gets first word

# make array of first character (start onset)
with open("test_text_files/Female_1a_Labels.txt", 'r+') as f:
    first_char = [line.split()[0] for line in f]

# find indices of SIL