def mir_1k_data_generator(train):
    """Yield one example per MIR-1K wav file of the requested split.

    Files whose name starts with ``abjones`` or ``amy`` are selected when
    ``train`` is true; all other files are selected when it is false.

    Parameters
    ----------
    train : bool
        Which split to iterate over.

    Yields
    ------
    tuple
        ``(channel0, channel1, mono, left_mag, right_mag, mixed_mag,
        max_value, mixed_phase)``.  The first three entries are waveforms at
        the file's native rate.  The ``*_mag`` entries are magnitude spectra
        of the 8 kHz signals divided by ``max_value`` (the peak of the mixture
        magnitude), and ``mixed_phase`` is the phase of the mixture spectrum.
    """
    for wav in glob.glob('Dataset/MIR-1K/Wavfile/*.wav'):
        filename = os.path.split(wav)[1]
        if filename.startswith(('abjones', 'amy')) != train:
            continue

        origin_source, origin_sr = librosa.load(wav, sr=None, mono=False)
        # TODO: the paper uses 8 kHz although the original audio is 16 kHz;
        # check whether the resampling step can be skipped.
        # Keyword arguments work on old librosa and are required from 0.10 on.
        resample_source = librosa.resample(
            origin_source, orig_sr=origin_sr, target_sr=8000)
        mixed_source_origin = librosa.to_mono(resample_source)
        left_resample_origin = resample_source[0]
        right_resample_origin = resample_source[1]
        # TODO: why are some of the sound-pressure values negative?

        # The mixture spectrum is needed for both magnitude and phase, so it
        # is computed once (assumes `to_spectrum` is deterministic).
        mixed_spectrum = to_spectrum(mixed_source_origin)
        mixed_source_magnitude_spectrum = np.abs(mixed_spectrum)
        # asfortranarray makes the channel slices F-contiguous; earlier
        # versions did not need this step.
        left_source_magnitude_spectrum = np.abs(
            to_spectrum(np.asfortranarray(left_resample_origin)))
        right_source_magnitude_spectrum = np.abs(
            to_spectrum(np.asfortranarray(right_resample_origin)))

        # Normalisation. TODO: try what happens without it.
        max_value = np.max(mixed_source_magnitude_spectrum)
        mixed_source_magnitude_spectrum = (
            mixed_source_magnitude_spectrum / max_value)
        left_source_magnitude_spectrum = (
            left_source_magnitude_spectrum / max_value)
        right_source_magnitude_spectrum = (
            right_source_magnitude_spectrum / max_value)

        # TODO: the paper only uses the magnitude; could the phase be used too?
        mixed_spec_phase = np.angle(mixed_spectrum)

        yield (
            origin_source[0, :],
            origin_source[1, :],
            librosa.to_mono(origin_source),
            left_source_magnitude_spectrum,
            right_source_magnitude_spectrum,
            mixed_source_magnitude_spectrum,
            max_value,
            mixed_spec_phase,
        )
def __test(y, top_db, ref, trim_duration):
    """Check ``librosa.effects.trim`` on one signal.

    Verifies that the returned interval indexes the trimmed signal, that the
    kept frames are above ``-top_db``, that the removed head and tail are at
    or below ``-top_db``, and that the trimmed duration is within 0.1 s of
    ``trim_duration``.
    """
    yt, idx = librosa.effects.trim(y, top_db=top_db, ref=ref)

    # Test for index position
    fidx = [slice(None)] * y.ndim
    fidx[-1] = slice(*idx.tolist())
    assert np.allclose(yt, y[tuple(fidx)])

    # Verify logamp of the retained signal.
    # `feature.rmse` was renamed to `feature.rms` and later removed.
    rms = librosa.feature.rms(y=librosa.to_mono(yt), center=False)
    logamp = librosa.power_to_db(rms**2, ref=ref, top_db=None)
    assert np.all(logamp > - top_db)

    # Verify logamp of the discarded head and tail
    rms_all = librosa.feature.rms(y=librosa.to_mono(y)).squeeze()
    logamp_all = librosa.power_to_db(rms_all**2, ref=ref, top_db=None)

    start = int(librosa.samples_to_frames(idx[0]))
    stop = int(librosa.samples_to_frames(idx[1]))
    assert np.all(logamp_all[:start] <= - top_db)
    assert np.all(logamp_all[stop:] <= - top_db)

    # Verify duration (the signal must be passed by keyword on librosa >= 0.10)
    duration = librosa.get_duration(y=yt)
    assert np.allclose(duration, trim_duration, atol=1e-1), duration
def __test(y, top_db, ref, trim_duration):
    """Check ``librosa.effects.trim`` on one signal.

    Verifies that the returned interval indexes the trimmed signal, that the
    kept frames are above ``-top_db``, that the removed head and tail are at
    or below ``-top_db``, and that the trimmed duration is within 0.1 s of
    ``trim_duration``.
    """
    yt, idx = librosa.effects.trim(y, top_db=top_db, ref=ref)

    # Test for index position
    fidx = [slice(None)] * y.ndim
    fidx[-1] = slice(*idx.tolist())
    assert np.allclose(yt, y[tuple(fidx)])

    # Verify logamp of the retained signal
    rms = librosa.feature.rms(y=librosa.to_mono(yt), center=False)
    logamp = librosa.power_to_db(rms**2, ref=ref, top_db=None)
    assert np.all(logamp > - top_db)

    # Verify logamp of the discarded head and tail
    rms_all = librosa.feature.rms(y=librosa.to_mono(y)).squeeze()
    logamp_all = librosa.power_to_db(rms_all**2, ref=ref, top_db=None)

    start = int(librosa.samples_to_frames(idx[0]))
    stop = int(librosa.samples_to_frames(idx[1]))
    assert np.all(logamp_all[:start] <= - top_db)
    assert np.all(logamp_all[stop:] <= - top_db)

    # Verify duration (the signal must be passed by keyword on librosa >= 0.10)
    duration = librosa.get_duration(y=yt)
    assert np.allclose(duration, trim_duration, atol=1e-1), duration
def __init__(self, track, part_size=25):
    """Compute the STFT of ``track`` and a zero-padded magnitude copy.

    Parameters
    ----------
    track
        Audio signal; it is folded to mono before the transform.
    part_size : int
        Stored as ``self.size``.  ``part_size // 2`` zero columns are
        prepended to the magnitude spectrogram and twice that many appended.
    """
    self.stft = librosa.stft(librosa.to_mono(track))
    padding = np.zeros((self.stft.shape[0], part_size // 2))
    # Reuse the transform computed above instead of running it a second time.
    self.stftPadded = np.abs(
        np.concatenate((padding, self.stft, padding, padding), axis=1))
    self.size = part_size
def visualize_wavs(self, wavs, sr):
    """Plot the first ten seconds of every (mixed, src1, src2) triple.

    Each triple gets a three-row figure: ``src1`` on top, ``src2`` in the
    middle and ``mixed`` at the bottom.  After all figures were shown the
    method waits for the user to press Enter.

    Parameters
    ----------
    wavs : iterable
        Triples ``(mixed, src1, src2)``.
    sr : int
        Sampling rate; also used to cut the signals to ``10 * sr`` samples.
    """
    # `waveplot` was removed in librosa 0.10 in favour of `waveshow`; use
    # whichever the installed version provides, preferring the original call.
    plot_wave = getattr(librosa.display, "waveplot", None)
    if plot_wave is None:
        plot_wave = librosa.display.waveshow

    for med in wavs:
        mixed, src1, src2 = med
        for position, signal in ((311, src1), (312, src2), (313, mixed)):
            plt.subplot(position)
            # `sr` is keyword-only in `waveshow`.
            plot_wave(librosa.to_mono(signal[:10 * sr]), sr=sr)
        plt.show()
    input("Visualization done.")
def make_spectrograms(data, test):
    """
    Generates all of the spectrograms from the musdb18 dataset.

    Every track is folded to mono, resampled to 22050 Hz and cut into
    three-second frames.  The spectrogram array of each frame is collected
    under the keys ``"mix"``, ``"vocals"`` and ``"instrumental"`` and the
    resulting dictionary is pickled to ``data/spectrograms/<data>``.

    :param data: ``"train"`` selects the train subset, anything else the
        test subset; the value is also used as the output file name.
    :param test: when true, stop after the first frame, pickle what was
        collected to ``data/spectrograms/<data>-1`` and render that frame via
        ``make_spectrogram_image``.
    :return: None
    """
    subset = "train" if data == "train" else "test"
    mus_data = musdb.DB(root="data/musdb18", subsets=subset)

    dictionary = {"mix": [], "vocals": [], "instrumental": []}

    # Converting samples to target rate of 22050
    target_sr = 22050
    # Length of frame; 66150 should be 3 seconds (appears as 6 seconds on graph)
    len_frame = target_sr * 3

    def mono_at_target_rate(audio, original_sr):
        """Fold ``audio`` (samples x channels) to mono and resample it."""
        return librosa.resample(librosa.to_mono(audio.T),
                                orig_sr=original_sr,
                                target_sr=target_sr,
                                res_type='kaiser_best',
                                fix=True,
                                scale=False)

    # Creating the spectrogram arrays from the selected subset
    num_tracks = len(mus_data)
    percent = 0.1
    for i, track in enumerate(mus_data):
        if i / num_tracks > percent:
            print(int(100 * percent), "%", "of " + data + " data generated")
            percent += 0.1

        original_sr = track.rate
        mix_data = mono_at_target_rate(track.audio, original_sr)
        vocal_data = mono_at_target_rate(
            track.targets['vocals'].audio, original_sr)
        instrumental_data = mono_at_target_rate(
            track.targets['accompaniment'].audio, original_sr)

        num_frames = int(len(mix_data) / len_frame)

        # Saving each frame as a spectrogram array
        for frame in range(num_frames):
            begin = frame * len_frame
            end = begin + len_frame
            dictionary["mix"].append(
                generate_spectrogram_array(mix_data[begin:end]))
            dictionary["vocals"].append(
                generate_spectrogram_array(vocal_data[begin:end]))
            dictionary["instrumental"].append(
                generate_spectrogram_array(instrumental_data[begin:end]))

            # NOTE(review): test mode exits after the very first frame;
            # confirm this is the intended early-exit point.
            if test:
                # `with` closes the handle even if pickling fails.
                with open("data/spectrograms/" + data + "-1", "wb") as handle:
                    pickle.dump(dictionary, handle)
                make_spectrogram_image(mix_data[begin:end], "test-image")
                return

    # pickle dictionary here
    with open("data/spectrograms/" + data, "wb") as handle:
        pickle.dump(dictionary, handle)
    return
def calculate_SDR(music, model, n_fft=2048, hop_length=512, slice_duration=2):
    """Score the vocal estimate of ``model`` on consecutive slices of a track.

    The mono mixture is cut into windows of ``slice_duration`` seconds.  For
    each window the model predicts a mask on the normalised magnitude STFT;
    the masked magnitude is combined with the mixture phase, inverted and
    compared with the reference vocal via
    ``mir_eval.separation.bss_eval_sources``.

    Parameters
    ----------
    music
        Track exposing ``rate``, ``audio`` and ``targets['vocals'].audio``.
    model
        Network called as ``model.forward(tensor)``; put into eval mode here.
    n_fft, hop_length : int
        STFT parameters used for both analysis and synthesis.
    slice_duration : int
        Window length in seconds.

    Returns
    -------
    list
        First element of the ``bss_eval_sources`` result for every scored
        window.  Windows with an all-zero reference, the final window, and
        windows for which the evaluation raises ``ValueError`` are skipped.
    """
    model.eval()
    scores = []
    sr = music.rate
    full_mixture = librosa.to_mono(music.audio.transpose())
    full_vocal = librosa.to_mono(music.targets['vocals'].audio.transpose())

    slice_len = slice_duration * sr
    total_len = len(music.audio)
    for i in range(0, total_len, slice_len):
        # Slice from the full signals; reusing the same names for the slices
        # would leave every window after the first one empty.
        mixture = full_mixture[i:i + slice_len]
        vocal = full_vocal[i:i + slice_len]

        if np.all(vocal == 0):
            # Silent reference: nothing to score.
            continue
        if i + slice_len >= total_len:
            break

        mixture_stft = librosa.stft(mixture,
                                    n_fft=n_fft,
                                    hop_length=hop_length,
                                    window='hann',
                                    center=True)
        magnitude_mixture_stft, mixture_phase = librosa.magphase(mixture_stft)
        normalized_magnitude_mixture_stft = torch.Tensor(
            Normalize().forward([magnitude_mixture_stft])[0])

        with torch.no_grad():
            mask = model.forward(
                normalized_magnitude_mixture_stft.unsqueeze(0)).squeeze(0)
            out = mask * normalized_magnitude_mixture_stft

        predicted_vocal_stft = out.numpy() * mixture_phase
        predicted_vocal_audio = librosa.istft(predicted_vocal_stft.squeeze(0),
                                              win_length=n_fft,
                                              hop_length=hop_length,
                                              window='hann',
                                              center=True)
        try:
            scores.append(
                mir_eval.separation.bss_eval_sources(
                    vocal[:predicted_vocal_audio.shape[0]],
                    predicted_vocal_audio)[0])
        except ValueError:
            print(vocal.all() == 0)
            print(predicted_vocal_stft.all() == 0)
            print('Error but skipping')

    return scores
def process(directory, sources, target_sr, save_only_mono=False):
    """Resample every track and its stems and save them as compressed arrays.

    For each track the mixture and the ``drums``, ``bass``, ``other``,
    ``vocals`` and ``accompaniment`` targets are resampled to ``target_sr``
    and cut to the shortest common length.  The result is written to
    ``<directory>/<index>.npz`` as float32.

    Parameters
    ----------
    directory : str
        Output directory; created on demand.
    sources : iterable
        Tracks exposing ``rate``, ``audio`` and ``targets[name].audio``.
    target_sr : int
        Sampling rate of the saved audio.
    save_only_mono : bool
        When true only the mono fold of each signal is saved; otherwise the
        saved array stacks ``[left, right, mono]``.
    """
    stem_names = ('drums', 'bass', 'other', 'vocals', 'accompaniment')

    for track_i, track in enumerate(sources):
        original_sr = track.rate

        def resampled(audio):
            # Sampling rates are passed by keyword: required by librosa >= 0.10.
            return librosa.core.resample(
                audio.T, orig_sr=original_sr, target_sr=target_sr)

        # Order: mix, drums, bass, other, vocal, accompaniment.
        stereo = [resampled(track.audio)]
        stereo.extend(
            resampled(track.targets[name].audio) for name in stem_names)

        length = min(t.shape[1] for t in stereo)
        if length <= 1:
            continue

        mono = np.array([librosa.to_mono(t[:, :length]) for t in stereo])
        if save_only_mono:
            together = mono
        else:
            left = np.array([t[0, :length] for t in stereo])
            right = np.array([t[1, :length] for t in stereo])
            together = np.array([left, right, mono])

        # exist_ok avoids the check-then-create race of a separate exists test.
        os.makedirs(directory, exist_ok=True)
        np.savez_compressed(f'{directory}/{track_i:04d}',
                            together.astype('float32'))
        print(f"Track: {track_i}, sampling rate: {target_sr}")
def get_audio(path,
              shape=None,
              mean=[0, 0, 0],
              std=[1, 1, 1],
              sample_len=16000,
              streams=1,
              sample_rate=1,
              extension='.wav',
              start=0,
              stop=None,
              channels=1,
              **kwargs):
    """Load an audio file and return it as stacked chunks of ``sample_len``.

    The file is loaded with ``sr=sample_len``, folded to mono when it has more
    dimensions than ``channels``, and zero-padded up to one full chunk when it
    is shorter than that.  A trailing partial chunk is dropped.

    Only ``path``, ``sample_len`` and ``channels`` are used; the remaining
    parameters are accepted and ignored.

    Returns
    -------
    torch.Tensor
        The chunks stacked along a new first dimension.
    """
    waveform, _ = libr.load(path, sr=sample_len)

    if waveform.ndim > channels:
        waveform = libr.to_mono(waveform)

    missing = sample_len - len(waveform)
    if missing > 0:
        waveform = np.append(waveform, np.zeros(missing))

    chunks = torch.from_numpy(waveform).split(sample_len)
    if len(waveform) % sample_len != 0:
        # The last chunk is shorter than the others; leave it out.
        chunks = chunks[:-1]

    return torch.stack(chunks)
def _decode_non_mp3_file_like(self, file, format=None):
    """Decode a non-MP3 audio file with soundfile.

    Parameters
    ----------
    file
        Path or file-like object handed to ``soundfile.read``.
    format : str, optional
        Only ``"opus"`` is inspected: it requires libsndfile >= 1.0.30.

    Returns
    -------
    tuple
        ``(array, sampling_rate)``.  The array is transposed after reading,
        folded to mono when ``self.mono`` is set, and resampled when
        ``self.sampling_rate`` is set and differs from the file's rate.

    Raises
    ------
    ImportError
        If ``librosa`` or ``soundfile`` is not installed.
    RuntimeError
        If an opus file is requested with a too-old libsndfile.
    """
    try:
        import librosa
        import soundfile as sf
    except ImportError as err:
        raise ImportError(
            "To support decoding audio files, please install 'librosa' and 'soundfile'."
        ) from err

    if format == "opus":
        if version.parse(
                sf.__libsndfile_version__) < version.parse("1.0.30"):
            raise RuntimeError(
                "Decoding .opus files requires 'libsndfile'>=1.0.30, " +
                "it can be installed via conda: `conda install -c conda-forge libsndfile>=1.0.30`"
            )

    array, sampling_rate = sf.read(file)
    array = array.T
    if self.mono:
        array = librosa.to_mono(array)
    if self.sampling_rate and self.sampling_rate != sampling_rate:
        # Rates are passed by keyword: positional rates fail on librosa >= 0.10.
        array = librosa.resample(array,
                                 orig_sr=sampling_rate,
                                 target_sr=self.sampling_rate,
                                 res_type="kaiser_best")
        sampling_rate = self.sampling_rate
    return array, sampling_rate
def pitch_shift(audio, steps, step_size=12):
    """
    Wraps librosa's `pitch_shift` function, and returns a new Audio object.
    Note that this folds to mono.

    Parameters
    ----------
    audio : Audio
        The Audio object to act on.

    steps : float
        The pitch shift amount.
        The default unit is semitones, as set by `step_size`.

    step_size : float > 0
        The number of equal-tempered steps per octave.
        The default is semitones, as set by `step_size=12`.
        Quarter-tones, for example, would be `step_size=24`.

    Returns
    -------
    Audio
        A new object built from the shifted mono samples, keeping
        ``audio.sample_rate``.
    """
    # `sr` and `n_steps` are keyword-only from librosa 0.10 on; the keyword
    # names are also accepted by earlier versions.
    shifted = librosa.effects.pitch_shift(
        librosa.to_mono(audio.raw_samples),
        sr=audio.sample_rate,
        n_steps=steps,
        bins_per_octave=step_size,
    )
    shifted_audio = Audio(raw_samples=shifted, sample_rate=audio.sample_rate)
    return shifted_audio
def resamplig(file):
    """Replace ``file`` with a mono 16 kHz version of itself.

    Parameters
    ----------
    file : str
        Path of the audio file; it is overwritten in place.
    """
    SAMPLE_RATE = 16000
    y, sr = librosa.load(file)
    # Rates are passed by keyword: positional rates fail on librosa >= 0.10.
    data = librosa.resample(y, orig_sr=sr, target_sr=SAMPLE_RATE)
    data = librosa.to_mono(data)
    # Delete the original only once the replacement samples exist, so a
    # failure while loading or resampling does not lose the input file.
    os.remove(file)
    # NOTE(review): `librosa.output` was removed in librosa 0.8 — confirm the
    # pinned librosa version still provides `write_wav`.
    librosa.output.write_wav(file, data, SAMPLE_RATE)
def file_to_input(filename, srate=44100):
    """Turn an audio file into a 4-D log-mel input array.

    Parameters
    ----------
    filename : str
        Audio file to read.
    srate : int
        Sampling rate of the features; the audio is resampled when needed.

    Returns
    -------
    np.ndarray
        Array of shape ``(1, 1, frames, 128)`` holding the transposed
        ``power_to_db`` mel spectrogram.

    Raises
    ------
    IOError
        If the file cannot be loaded.
    """
    try:
        y, sr = librosa.load(filename, sr=None)
    except Exception as err:
        # A bare `except` would also turn Ctrl-C into an IOError.
        raise IOError('Give me an audio file which I can read!!') from err

    if len(y.shape) > 1:
        print('Mono Conversion')
        y = librosa.to_mono(y)

    if sr != srate:
        print('Resampling to {}'.format(srate))
        # Rates are passed by keyword: positional rates fail on librosa >= 0.10.
        y = librosa.resample(y, orig_sr=sr, target_sr=srate)

    # `n_fft` and `hop_length` are read from module scope.
    mel_feat = librosa.feature.melspectrogram(y=y,
                                              sr=srate,
                                              n_fft=n_fft,
                                              hop_length=hop_length,
                                              n_mels=128)
    inpt = librosa.power_to_db(mel_feat).T

    # input needs to be 4D, batch_size X 1 X inpt_size[0] X inpt_size[1]
    inpt = np.reshape(inpt, (1, 1, inpt.shape[0], inpt.shape[1]))
    return inpt
def main():
    """Load an audio loop, play it with a click track and optionally save it.

    When ``--num_beats`` is given, the beat positions are replaced by that
    many evenly spaced beats across the loop.  The loop is then played with
    clicks on the beat frames and, if the user answers ``y``, saved as
    ``<stem>_LOOP.pkl``.
    """
    args = parse_args()

    filepath = Path(args.file).resolve()
    if not filepath.exists():
        raise FileNotFoundError()

    loop = AudioLoop(path=filepath,
                     hop_length=args.hop,
                     estimated_bpm=args.tempo,
                     align_beats_to_start=not args.no_align)

    beat_count = args.num_beats
    if beat_count is not None:
        print(f"Using given num_beats to compute the tempo and beat times")
        samples_per_beat = loop.samples / beat_count
        last_beat = samples_per_beat * (beat_count - 1)
        loop.beat_samples = np.rint(np.linspace(0, last_beat, beat_count))
        loop.beat_frames = np.rint(loop.beat_samples / args.hop)

    print(f"tempo: {loop.tempo}")

    click_track = librosa.core.clicks(frames=loop.beat_frames,
                                      sr=loop.sample_rate,
                                      length=loop.samples)
    mono_loop = librosa.to_mono(loop.audio.T)
    sd.play(mono_loop + click_track)

    val = input("save audio loop? (y/n)")
    sd.stop()

    if val == 'y':
        loop.save(f"{filepath.stem}_LOOP.pkl")
        print(f"Saved to {filepath.stem}_LOOP.pkl")

    print("Goodbye!")
def read_audios(file):
    """Split the first 15 minutes of ``file`` into fixed-length wav segments.

    The level values are read from the file in ``levels_path`` whose stem
    matches the audio file's stem (zeros are used when none matches).  Each
    segment of ``frame_length`` seconds is written to ``splitted_wav`` with
    its 1-based index and its level value in the file name.

    ``levels_path``, ``frame_length`` and ``splitted_wav`` are read from
    module scope.
    """
    stem = splitext(basename(file))[0]
    print(stem)

    # read spl values: keep the file list of the last directory visited
    for _root, _dirs, files in os.walk(levels_path):
        pass

    spl = np.zeros(64)
    # read spl-data belonging to this audio file
    for candidate in files:
        if stem == splitext(candidate)[0]:
            spl = np.loadtxt(levels_path + candidate, delimiter=";")

    y, sr = librosa.load(file, sr=None, mono=False)
    y_mono = librosa.to_mono(y)

    # Save as .wav
    name_prefix = splitted_wav + file.split('\\')[-1].split('.webm')[0]
    segment_count = int(900 * sr // (sr * frame_length))  # first 900 sec = 15 min
    for i in range(segment_count):
        first = int(i * sr * frame_length)
        last = int(sr * frame_length * (i + 1))
        target = (name_prefix + ' _ ' + str(i + 1) + ' _ ' + str(spl[i]) +
                  ' _ ' + 'dBA' + '.wav')
        soundfile.write(target, y_mono[first:last], sr, 'PCM_16')
def resample(in_file, out_file):
    """Write a mono, 48 kHz, 16-bit PCM version of ``in_file`` to ``out_file``.

    Parameters
    ----------
    in_file : str
        Audio file to read.
    out_file : str
        Destination of the resampled audio.  Pass the same path as
        ``in_file`` to convert in place.
    """
    target_rate = 48000
    audio_data, audio_sample_rate = soundfile.read(in_file, dtype='float32')
    # soundfile returns (frames, channels); librosa expects channels first.
    # Rates are passed by keyword: positional rates fail on librosa >= 0.10.
    resampled_data = librosa.resample(librosa.to_mono(audio_data.T),
                                      orig_sr=audio_sample_rate,
                                      target_sr=target_rate)
    # The result used to be written back over `in_file`, which ignored
    # `out_file` and destroyed the input.
    soundfile.write(out_file, resampled_data, target_rate, subtype='PCM_16')
def read_audio_file(file_path, target_sample_rate, duration=None, samples=None):
    """Read audio samples from a file.

    The audio is folded to mono and resampled to ``target_sample_rate``.
    If duration/samples argument is specified audio is padded or clipped to
    match the value; ``duration`` (seconds) takes precedence over ``samples``.

    Returns
    -------
    np.ndarray
        The mono samples.

    Raises
    ------
    Exception
        Whatever reading or resampling raised; the failure is logged first.
    """
    try:
        y, sample_rate = soundfile.read(file_path)
        y = librosa.to_mono(y.T)
        # Rates are passed by keyword: positional rates fail on librosa >= 0.10.
        y = librosa.resample(y,
                             orig_sr=sample_rate,
                             target_sr=target_sample_rate)
    except Exception:
        logging.error('Failed to read audio from "{}"'.format(file_path))
        raise

    if duration:
        total_samples = int(target_sample_rate * duration)
    elif samples:
        total_samples = samples
    else:
        return y

    if len(y) < total_samples:
        # Pad audio files smaller than duration with silence
        pad = total_samples - len(y)
        y = np.concatenate((y, np.zeros(pad)))
    elif len(y) > total_samples:
        # Clip audio files longer than duration
        y = y[:total_samples]

    return y
def __init__(self, signal, n_fft, window_size, hop_length, peak, audio_type, sr):
    """Store the analysis settings and precompute views of ``signal``.

    Besides the raw arguments the instance keeps the mono fold of the signal,
    its dB representation, the peak-normalised signal, the STFT magnitude
    (plain and in dB), the normalised dB spectrum from ``preprocessing`` and
    a frequency value per STFT bin.
    """
    # Plain settings.
    self.sr = sr
    self.n_fft = n_fft
    self.window_size = window_size
    self.hop_length = hop_length
    self.peak = peak
    self.audio_type = audio_type

    # Time-domain views.
    self.signal = signal
    mono = librosa.to_mono(signal)
    self.mono_signal = mono
    self.signal_db = librosa.amplitude_to_db(mono)
    self.x_norm = preprocessing.normalize(mono, peak)

    # Frequency-domain views.
    magnitude = np.abs(
        librosa.core.stft(mono,
                          n_fft=n_fft,
                          win_length=window_size,
                          hop_length=hop_length))
    self.fft = magnitude
    bin_count = magnitude.shape[0]
    self.num_bins = bin_count
    self.fft_db = librosa.amplitude_to_db(magnitude)
    self.norm_fft_db = preprocessing.compute_norm_fft_db(
        self.x_norm, n_fft, window_size, hop_length)

    # NOTE(review): the bin spacing used here is sr / num_bins, not
    # sr / n_fft — confirm this is intended.
    self.freqs = np.array([k * sr / bin_count for k in range(bin_count)])
def load_file(self):
    '''
    Load samples from an audio file

    Uses:
        self.filename: path to audio file from which to make spectrogram
        self.sample_rate: rate at which to resample audio
        self.verbosity: a warning is printed for multi-channel files when
            this is greater than 1

    Returns:
        samples: the samples from the wav file, mixed to mono when the file
            has several channels
    '''
    samples, sample_rate = load(
        self.filename,
        mono=False,  # Don't automatically load as mono, so we can warn if we force to mono
        sr=self.sample_rate,  # Resample
        res_type='kaiser_fast',
    )

    # Force to mono if wav has multiple channels
    if samples.ndim > 1:
        samples = to_mono(samples)
        if self.verbosity > 1:
            # Use the instance attribute: a bare `filename` is undefined here.
            print(
                f"WARNING: Multiple-channel file detected ({self.filename}). Automatically mixed to mono."
            )

    return samples
def plot_spectrogram(self):
    """Draw a log-frequency dB spectrogram of ``self.waveform`` in a new figure.

    The waveform is transposed and folded to mono before the STFT; levels are
    expressed relative to the maximum magnitude.
    """
    plt.figure()
    magnitude = np.abs(librosa.stft(librosa.to_mono(self.waveform.T)))
    levels_db = librosa.amplitude_to_db(magnitude, ref=np.max)
    mesh = librosa.display.specshow(levels_db, y_axis='log', x_axis='time')
    plt.colorbar(mesh, format="%+2.f dB")
def extract(args):
    """Compute all features of one audio file and save them as ``.npz``.

    Parameters
    ----------
    args : tuple
        ``(audio_directory, output_directory, af, overwrite)`` where ``af``
        is the audio file path.  The output is named after the audio file
        (without extension) inside ``output_directory``.

    The audio is read with soundfile, folded to mono and resampled to
    22050 Hz; if that fails it is loaded with ``load(af)`` instead.
    """
    audio_directory, output_directory, af, overwrite = args
    _, output_file = os.path.split(af.split(audio_directory)[1])
    output_file = os.path.splitext(output_file)[0]
    output_file = os.path.join(output_directory, output_file)

    # `np.savez_compressed` appends ".npz", so the finished file has to be
    # looked up under that name as well or the skip never triggers.
    already_saved = (os.path.exists(output_file)
                     or os.path.exists(output_file + '.npz'))
    if already_saved and not overwrite:
        print('Skipping {}. Already exists.'.format(output_file))
        return

    output = dict()
    try:
        y, _sr = soundfile.read(af)
        # soundfile returns (frames, channels); `to_mono` averages over the
        # first axis, so the array has to be channels-first.
        y = to_mono(y.T)
        sr = 22050
        y = resample(y, _sr, sr)
    except Exception:
        # Best-effort fallback for files soundfile cannot handle.
        y, sr = load(af)

    output['linspec_mag'], output['linspec_phase'] = linspec(y)
    output['melspec'] = melspec(y, sr=sr)
    output['logspec'] = logspec(y, sr=sr)
    output['hcqt_mag'], output['hcqt_phase'] = hcqt(y, sr=sr)
    output['vggish_melspec'] = vggish_melspec(y, sr=sr)

    # high-level
    (output['percussive_ratio'], output['percussive_rms'],
     output['total_rms']) = percussive_ratio(y, margin=3.0)
    output['onset_strength'] = onset_strength(y, detrend=True)
    output['tempogram'] = tempogram(y)
    output['onset_patterns'] = onset_patterns(y, sr=sr)

    np.savez_compressed(output_file, **output)
def swingify(file_path, outfile, factor, sr=None, format=None, max_length=None):
    """Apply the swing effect to an audio file and write the result.

    Parameters
    ----------
    file_path : str
        Input audio file.
    outfile : str
        Destination handed to ``sf.write``.
    factor
        Swing amount forwarded to ``synthesize``.
    sr : int, optional
        Sampling rate forwarded to ``librosa.load``.
    format : str, optional
        Output format forwarded to ``sf.write``.
    max_length : optional
        When truthy, only the first ``max_length`` seconds are processed.

    Returns
    -------
    The beats returned by ``get_beats``.
    """
    y, sr = librosa.load(file_path, mono=False, sr=sr)
    print(y.shape)

    if max_length:
        print('trimming audio to max_len: {} seconds'.format(max_length))
        # Trim along the sample axis for both 1-D and channel-first input.
        y = y[..., :max_length * sr]
        print(y.shape)

    # Beat tracking runs on the mono fold; synthesis on the raw channels.
    beats = get_beats(librosa.to_mono(y), sr, 512)

    channels = np.atleast_2d(y)
    # force stereo
    if channels.shape[0] < 2:
        print('doubling mono signal to be stereo')
        channels = np.vstack([channels, channels])

    rendered = synthesize(channels, beats, factor) * 0.7

    print(sr)
    sf.write(outfile, rendered.T, int(sr), format=format)
    return beats
def convert():
    """Resample every pickled record and write the converted list.

    Reads the records ``(name, array, rate)`` from all ``file_names``,
    resamples each array to ``target_rate``, folds it to mono when the
    module-level ``to_mono`` flag is set, and pickles the list of
    ``(name, array, target_rate)`` tuples to ``output_file``.  Progress is
    logged roughly every 5 %.
    """
    data_original = []
    for file_name in file_names:
        # NOTE: unpickling runs arbitrary code from the file — only feed this
        # function files from a trusted source.
        with open(file_name, 'rb') as source:
            data_original.extend(pickle.load(source))
    log('Original data length is {}'.format(len(data_original)))

    percentage = 0
    begin = timer()
    data_convert = []
    for data in data_original:
        name = data[0]
        # Rates are passed by keyword: positional rates fail on librosa >= 0.10.
        array = librosa.resample(data[1],
                                 orig_sr=data[2],
                                 target_sr=target_rate)
        if to_mono:
            array = librosa.to_mono(array)
        data_convert.append((name, array, target_rate))

        progress = len(data_convert) / len(data_original)
        if progress - percentage > 0.05:
            percentage = progress
            log('Now converted {} ({}%). Cost time {}'.format(
                len(data_convert), percentage * 100, timer() - begin))

    with open(output_file, 'wb') as file:
        pickle.dump(data_convert, file)
    log('Converted data length is {}. All finished.'.format(len(data_convert)))
def resample(data, fs, new_fs):
    """Resample ``data`` from ``fs`` to ``new_fs`` by interpolation.

    Multi-dimensional input is folded to a single channel first.  The output
    covers the time span from the first to the last input sample and holds
    ``round(span * new_fs)`` samples, each interpolated between its
    neighbouring input samples with ``interpolate.interp1d``.

    Parameters
    ----------
    data : np.ndarray
        Input samples.
    fs : number
        Sampling rate of ``data``.
    new_fs : number
        Sampling rate of the result.

    Returns
    -------
    np.ndarray
        The resampled signal.
    """
    # converting into single channel
    if data.ndim > 1:
        data = librosa.to_mono(data)

    src_rate = float(fs)
    dst_rate = float(new_fs)

    # time stamps of the input samples
    src_times = np.arange(data.size) / src_rate
    span = src_times[-1]

    # time stamps of the output samples on the same axis
    n_out = round(span * dst_rate)
    dst_times = np.arange(n_out) / dst_rate

    # fill in the values between the input samples
    interpolator = interpolate.interp1d(src_times, data)
    return interpolator(dst_times)
def read_audio(filepath, sr=None, mono=True, peak_norm=False):
    """
    Read audio

    The file is read with ``psf.read``; if that raises ``RuntimeError`` it is
    loaded with ``librosa.load`` instead.

    Parameters
    ----------
    filepath
        Path of the audio file.
    sr
        Target sampling rate; ``None`` keeps the file's own rate.
    mono
        Fold the signal to mono when true.
    peak_norm
        Divide the signal by its absolute peak when true.  A signal whose
        peak is zero is returned unchanged.

    Returns
    -------
    y, sr
        The samples and their sampling rate.
    """
    try:
        y, _sr = psf.read(filepath)
        y = y.T
    except RuntimeError:
        y, _sr = librosa.load(filepath, mono=False, sr=None)

    if sr is not None and sr != _sr:
        y = resampy.resample(y, _sr, sr, filter='kaiser_fast')
    else:
        sr = _sr

    if mono:
        y = librosa.to_mono(y)

    if peak_norm:
        peak = np.max(np.abs(y))
        # Dividing silence by a zero peak would fill the signal with NaN.
        if peak > 0:
            y /= peak

    return y, sr
def sox(x, fs, *args):
    """Run the ``sox`` command line tool on a signal.

    The signal is written to a temporary wav file, processed with
    ``sox <in> <out> -q <args...>`` and read back.  Both temporary files are
    removed afterwards, also when a step fails.

    Parameters
    ----------
    x : np.ndarray
        Audio signal.  NOTE(review): assumed to be 1-D or channels-first,
        matching the channels-first array that is returned — confirm against
        the callers.
    fs : int > 0
        Sampling rate of ``x``.
    *args
        Additional arguments for sox.

    Returns
    -------
    np.ndarray
        The processed signal; folded to mono when ``x`` was 1-D.
    """
    assert fs > 0

    infile = outfile = None
    try:
        fdesc, infile = tempfile.mkstemp(suffix=".wav")
        os.close(fdesc)
        fdesc, outfile = tempfile.mkstemp(suffix=".wav")
        os.close(fdesc)

        # soundfile expects (frames, channels); `.T` is a no-op for 1-D input.
        psf.write(infile, x.T, fs)

        arguments = ["sox", infile, outfile, "-q"]
        arguments.extend(args)
        subprocess.check_call(arguments)

        x_out, fs = psf.read(outfile)
        x_out = x_out.T
        if x.ndim == 1:
            x_out = librosa.to_mono(x_out)
    finally:
        # Clean up whatever was created, even if an earlier step raised.
        for path in (infile, outfile):
            if path is not None:
                os.unlink(path)

    return x_out
def specShow(sig):
    """Plot a spectrogram and a magnitude spectrum of ``sig`` side by side.

    ``sig`` may be an object with a ``frames`` attribute or a plain sequence.
    NaN values are replaced, the signal is transposed and folded to mono; if
    that fails the function returns without plotting.  The sampling rate
    ``sr`` is read from module scope.
    """
    plt.figure(figsize=(8, 5))

    # multiframe spectrogram
    # make mono
    try:
        sig = sig.frames
    except Exception:
        # Best effort: plain arrays have no `frames`.  `Exception` (not a
        # bare except) keeps Ctrl-C and SystemExit working.
        pass
    sig = np.nan_to_num(list(sig))
    try:
        sig = lib.to_mono(np.transpose(sig))
    except Exception:
        return

    X = lib.stft(sig)
    Xdb = lib.amplitude_to_db(abs(X))
    plt.figure(figsize=(14, 5))
    plt.subplot(1, 2, 1)
    lib.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz')

    # single frame spectrogram
    # `scipy.fft` is a module in current SciPy and cannot be called; the
    # NumPy transform is the function old SciPy re-exported under that name.
    X = np.fft.fft(sig)
    X_mag = librosa.core.amplitude_to_db(np.absolute(X))
    f = np.linspace(0, sr, len(X_mag))  # frequency variable
    plt.subplot(1, 2, 2)
    res = int(len(sig) / 2)
    plt.plot(f[:res], X_mag[:res])
    plt.xlabel('Frequency (Hz)')
def _transform(self, X, sr):
    """Return the loudest contiguous section of ``X``.

    A relative section length is drawn uniformly from ``self.duration``.  A
    window of that length slides over the absolute mono amplitude and the
    window with the largest sum is cut out of the validated signal.

    Parameters
    ----------
    X
        Input signal; validated with ``self._val.signal``.
    sr
        Unused.

    Returns
    -------
    The slice ``X[:, start:end]`` of the validated signal.
    """
    X = self._val.signal(X)
    duration = self.random_state.uniform(*self.duration)

    # Convert stereo signals to mono and take the absolute value
    mono_amp = np.abs(librosa.to_mono(X))

    # Calculate the length of the section in terms of frames
    total_frames = len(mono_amp)
    frames = ceil(total_frames * duration)

    # Volume of the first window; it is the loudest one seen so far
    section_amp = mono_amp[0:frames].sum()
    loudest_amp, loudest_idx = section_amp, (0, frames)

    # Slide the window one frame at a time, up to and including the window
    # that ends on the last frame.
    for end in range(frames + 1, total_frames + 1):
        start = end - frames
        # Add the frame that entered the window (end - 1) and drop the one
        # that left it (start - 1).
        section_amp += mono_amp[end - 1] - mono_amp[start - 1]

        # Update loudest section indices if current section is loudest
        if section_amp > loudest_amp:
            loudest_amp, loudest_idx = section_amp, (start, end)

    # Return section of the original signal which was the loudest
    return X[:, loudest_idx[0]:loudest_idx[1]]
def waveShow(sig):
    """Plot the waveform of ``sig``.

    ``sig`` may be an object with a ``frames`` attribute or a plain array; it
    is transposed and folded to mono before plotting.
    """
    try:
        sig = sig.frames
    except Exception:
        # Best effort: plain arrays have no `frames`.  `Exception` (not a
        # bare except) keeps Ctrl-C and SystemExit working.
        pass
    sig = lib.to_mono(np.transpose(sig))

    # `waveplot` was removed in librosa 0.10 in favour of `waveshow`; use
    # whichever the installed version provides, preferring the original call.
    draw = getattr(lib.display, "waveplot", None)
    if draw is None:
        draw = lib.display.waveshow
    draw(sig)
def load_file(filename, sample_rate=22050):
    '''
    Load samples from an audio file

    Inputs:
        filename: path to the audio file
        sample_rate: rate at which to resample audio

    Returns:
        samples: the samples from the file, mixed to mono when the file has
            several channels (no warning is printed)
        sample_rate: the sample rate reported by the loader, as an int
    '''
    # Load without automatic mono conversion so multi-channel files can be
    # detected below.
    audio, loaded_rate = load(
        filename,
        mono=False,
        sr=sample_rate,
        res_type='kaiser_best',
    )

    is_multichannel = audio.ndim > 1
    if is_multichannel:
        audio = to_mono(audio)

    return audio, int(loaded_rate)
def detect_onsets(self):
    """Return the onset times of ``self.waveform`` in seconds.

    The waveform is transposed and folded to mono, onsets are detected as
    frame indices at rate ``self.fs`` and then converted to times.
    """
    mono = librosa.to_mono(self.waveform.T)

    # Peak-picking settings forwarded to the onset detector.
    picking = dict(pre_max=500,
                   post_max=500,
                   pre_avg=100,
                   post_avg=100,
                   delta=0.01)
    onset_frames = librosa.onset.onset_detect(y=mono,
                                              sr=self.fs,
                                              units='frames',
                                              **picking)
    return librosa.frames_to_time(onset_frames, sr=self.fs)
def to_mono(stereo_array):
    """Call ``librosa.to_mono`` on the given audio array.

    :param stereo_array: input stereo array
    :returns: the result of ``librosa.to_mono`` for that array
    """
    mono_array = librosa.to_mono(stereo_array)
    return mono_array
def read_audio(current_file, sample_rate=None, mono=True):
    """Read audio file

    Parameters
    ----------
    current_file : dict
        Dictionary given by pyannote.database; its ``audio`` entry is the
        path of the file.
    sample_rate: int, optional
        Target sampling rate. Defaults to using native sampling rate.
    mono : bool, optional
        Convert multi-channel to mono. Defaults to True.

    Returns
    -------
    y : np.array
        Audio samples, transposed before being returned.  The array is 1-D
        when ``mono`` is true or a single channel was selected.
    sample_rate : int
        Sampling rate.

    Notes
    -----
    In case `current_file` contains a `channel` key, data of this (1-indexed)
    channel will be returned.
    """
    audio_path = current_file['audio']

    if audio_path[-4:] == '.sph':
        # Sphere files: dump to a temporary wav file and load that instead.
        from sphfile import SPHFile
        sph = SPHFile(audio_path)
        with tempfile.NamedTemporaryFile() as f:
            sph.write_wav(f.name)
            y, sample_rate = librosa.load(f.name, sr=sample_rate, mono=False)
    else:
        # All other files are loaded directly.
        y, sample_rate = librosa.load(audio_path, sr=sample_rate, mono=False)

    # reshape mono files to (1, n) [was (n, )]
    if y.ndim == 1:
        y = y.reshape(1, -1)

    # extract specific channel if requested
    channel = current_file.get('channel', None)
    if channel is not None:
        y = y[channel - 1, :]

    # convert to mono
    if mono:
        y = librosa.to_mono(y)

    return y.T, sample_rate
def __test(filename, mono):
    """Check ``librosa.to_mono`` on a file loaded with the given ``mono`` flag.

    The folded signal must be one-dimensional and as long as the last axis of
    the loaded signal; when the file was already loaded as mono, folding must
    leave it unchanged.
    """
    loaded, _ = librosa.load(filename, mono=mono)
    folded = librosa.to_mono(loaded)

    eq_(folded.ndim, 1)
    eq_(len(folded), loaded.shape[-1])

    if not mono:
        return
    assert np.allclose(loaded, folded)
def slice_clip(filename, start, stop, n_samples, sr, mono=True):
    '''Slice a fragment of audio from a file.

    This uses pysoundfile to efficiently seek without
    loading the entire stream.

    Parameters
    ----------
    filename : str
        Path to the input file

    start : int
        The sample index of `filename` at which the audio fragment should start

    stop : int
        The sample index of `filename` at which the audio fragment should stop
        (e.g. y = audio[start:stop])

    n_samples : int > 0
        The number of samples to load

    sr : int > 0
        The target sampling rate

    mono : bool
        Ensure monophonic audio

    Returns
    -------
    y : np.ndarray [shape=(n_samples,)]
        A fragment of audio sampled from `filename`, resampled to `sr` and
        padded or trimmed to exactly `n_samples` by `librosa.util.fix_length`
    '''
    with psf.SoundFile(str(filename), mode='r') as soundf:
        n_target = stop - start

        soundf.seek(start)

        y = soundf.read(n_target).T

        if mono:
            y = librosa.to_mono(y)

        # Resample to the requested sr.  Rates and size are passed by
        # keyword: they are keyword-only from librosa 0.10 on.
        y = librosa.resample(y, orig_sr=soundf.samplerate, target_sr=sr)

        # Clip to the target length exactly
        y = librosa.util.fix_length(y, size=n_samples)

        return y
def sample_clip(filename, n_samples, sr, mono=True):
    '''Sample a fragment of audio from a file.

    This uses pysoundfile to efficiently seek without
    loading the entire stream.

    Parameters
    ----------
    filename : str
        Path to the input file

    n_samples : int > 0
        The number of samples to load

    sr : int > 0
        The target sampling rate

    mono : bool
        Ensure monophonic audio

    Returns
    -------
    y : np.ndarray [shape=(n_samples,)]
        A fragment of audio sampled randomly from `filename`

    Raises
    ------
    ValueError
        If the source file is shorter than the requested length
    '''
    with psf.SoundFile(str(filename), mode='r') as soundf:
        # Number of source samples needed to get n_samples at the target rate
        n_target = int(np.ceil(n_samples * soundf.samplerate / sr))

        # Draw a random clip.  The upper bound of randint is exclusive, so
        # +1 keeps the last valid start position reachable and accepts a
        # file that is exactly as long as the clip.
        start = np.random.randint(0, len(soundf) - n_target + 1)

        soundf.seek(start)

        y = soundf.read(n_target).T

        if mono:
            y = librosa.to_mono(y)

        # Resample to the requested sr.  Rates and size are passed by
        # keyword: they are keyword-only from librosa 0.10 on.
        y = librosa.resample(y, orig_sr=soundf.samplerate, target_sr=sr)

        # Clip to the target length exactly
        y = librosa.util.fix_length(y, size=n_samples)

        return y
def __init__(self,
             file_path=None,
             raw_samples=None,
             convert_to_mono=False,
             sample_rate=44100,
             analysis_sample_rate=22050):
    """
    Audio constructor.
    Opens a file path, loads the audio with librosa, and prepares the features

    Parameters
    ----------
    file_path: string
        path to the audio file to load

    raw_samples: np.array
        samples to use for audio output; used when no ``file_path`` is given

    convert_to_mono: boolean
        (optional) converts the file to mono on loading

    sample_rate: number > 0 [scalar]
        (optional) sample rate to pass to librosa.

    analysis_sample_rate: number > 0 [scalar]
        (optional) rate of the mono copy stored in ``analysis_samples``.

    Raises
    ------
    ValueError
        If neither ``file_path`` nor ``raw_samples`` is given.
    """
    if file_path:
        y, sr = librosa.load(file_path, mono=convert_to_mono, sr=sample_rate)
    elif raw_samples is not None:
        # This assumes that we're passing in raw_samples
        # directly from another Audio's raw_samples.
        y = raw_samples
        sr = sample_rate
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError(
            "Audio requires either `file_path` or `raw_samples`.")

    self.file_path = file_path
    self.sample_rate = float(sr)
    self.analysis_sample_rate = float(analysis_sample_rate)
    self.num_channels = y.ndim
    self.duration = librosa.get_duration(y=y, sr=sr)

    # Rates are passed by keyword: positional rates fail on librosa >= 0.10.
    self.analysis_samples = librosa.resample(
        librosa.to_mono(y),
        orig_sr=sr,
        target_sr=self.analysis_sample_rate,
        res_type='kaiser_best')
    self.raw_samples = np.atleast_2d(y)

    self.zero_indexes = self._create_zero_indexes()
    self.features = self._create_features()
    self.timings = self._create_timings()
def get_random_wav(filename, sr, duration):
    """Return a random range of a two-channel wav file.

    The file is loaded at ``sr``, padded with ``pad_wav`` and a range is
    drawn from the padded signal with ``sample_range``.

    Returns
    -------
    tuple
        ``(mono, src1, src2)``: the mono fold of the range and its first and
        second channel.

    Raises
    ------
    AssertionError
        If the file does not have exactly two channels.
    """
    # Get a random range from wav
    wav, _ = librosa.load(filename, sr=sr, mono=False)
    assert (wav.ndim == 2) and (wav.shape[0] == 2), 'Require wav to have two channels'

    wav_pad = pad_wav(wav=wav, sr=sr, duration=duration)
    # Sample from the padded signal; the padded copy used to be computed and
    # then ignored.
    wav_sample = sample_range(wav=wav_pad, sr=sr, duration=duration)

    wav_sample_mono = librosa.to_mono(wav_sample)
    wav_sample_src1 = wav_sample[0, :]
    wav_sample_src2 = wav_sample[1, :]

    return wav_sample_mono, wav_sample_src1, wav_sample_src2
def load_wavs(filenames, sr):
    """Load two-channel wav files and split each into mixture and sources.

    Parameters
    ----------
    filenames : iterable of str
        Files to load.
    sr : int
        Sampling rate passed to ``librosa.load``.

    Returns
    -------
    tuple of lists
        ``(wavs_mono, wavs_src1, wavs_src2)``; per file the mono fold
        multiplied by two, the first channel and the second channel.

    Raises
    ------
    AssertionError
        If a file does not have exactly two channels.
    """
    wavs_mono, wavs_src1, wavs_src2 = [], [], []

    for filename in filenames:
        wav, _ = librosa.load(filename, sr=sr, mono=False)
        assert (wav.ndim == 2) and (wav.shape[0] == 2), 'Require wav to have two channels'

        # to_mono averages the channels; doubling cancels the averaging.
        wavs_mono.append(librosa.to_mono(wav) * 2)
        wavs_src1.append(wav[0, :])
        wavs_src2.append(wav[1, :])

    return wavs_mono, wavs_src1, wavs_src2
def __sox(y, sr, *args):
    '''Execute sox

    The signal is dumped to a temporary wav file, processed with
    ``sox <in> <out> -q <args...>`` and read back.  Both temporary files are
    removed afterwards, also when a step fails.

    Parameters
    ----------
    y : np.ndarray
        Audio time series

    sr : int > 0
        Sampling rate of `y`

    *args
        Additional arguments to sox

    Returns
    -------
    y_out : np.ndarray
        `y` after sox transformation
    '''
    assert sr > 0

    infile = outfile = None
    try:
        fdesc, infile = tempfile.mkstemp(suffix='.wav')
        os.close(fdesc)
        fdesc, outfile = tempfile.mkstemp(suffix='.wav')
        os.close(fdesc)

        # Dump the audio
        # NOTE(review): `librosa.output` was removed in librosa 0.8 — confirm
        # the pinned librosa version still provides `write_wav`.
        librosa.output.write_wav(infile, y, sr)

        arguments = ['sox', infile, outfile, '-q']
        arguments.extend(args)

        subprocess.check_call(arguments)

        y_out, sr = psf.read(outfile)
        y_out = y_out.T
        if y.ndim == 1:
            y_out = librosa.to_mono(y_out)
    finally:
        # Clean up whatever was created, even if an earlier step raised.
        for path in (infile, outfile):
            if path is not None:
                os.unlink(path)

    return y_out
def wav_data_to_samples(wav_data, sample_rate):
    """Read PCM-formatted WAV data and return a NumPy array of samples.

    Uses scipy to read and librosa to process WAV data. Two-channel audio is
    converted to mono.

    Args:
      wav_data: WAV audio data to read.
      sample_rate: The number of samples per second at which the audio will be
          returned. Resampling will be performed if necessary.

    Returns:
      A numpy array of audio samples, single-channel (mono) and sampled at the
      specified rate, in float32 format.

    Raises:
      AudioIOReadError: If scipy is unable to read the WAV data.
      AudioIOError: If the data is neither 16-bit nor 32-bit float PCM, or if
          audio processing fails.
    """
    try:
        # Read the wav file, converting sample rate & number of channels.
        native_sr, y = scipy.io.wavfile.read(six.BytesIO(wav_data))
    except Exception as e:  # pylint: disable=broad-except
        raise AudioIOReadError(e)

    if y.dtype == np.int16:
        # Convert to float32.
        y = int16_samples_to_float32(y)
    elif y.dtype == np.float32:
        # Already float32.
        pass
    else:
        raise AudioIOError(
            'WAV file not 16-bit or 32-bit float PCM, unsupported')

    try:
        # Convert to mono and the desired sample rate.
        if y.ndim == 2 and y.shape[1] == 2:
            y = y.T
            y = librosa.to_mono(y)
        if native_sr != sample_rate:
            # Rates are passed by keyword: positional rates fail on
            # librosa >= 0.10.
            y = librosa.resample(y, orig_sr=native_sr, target_sr=sample_rate)
    except Exception as e:  # pylint: disable=broad-except
        raise AudioIOError(e)

    return y
def swingify(file_path, outfile, factor, sr=None, format=None):
    """Apply the swing effect to an audio file and write the result.

    Parameters
    ----------
    file_path : str
        Input audio file.
    outfile : str
        Destination handed to ``sf.write``.
    factor
        Swing amount forwarded to ``synthesize``.
    sr : int, optional
        Sampling rate forwarded to ``librosa.load``.
    format : str, optional
        Output format forwarded to ``sf.write``.

    Returns
    -------
    The beats returned by ``get_beats``.
    """
    y, sr = librosa.load(file_path, mono=False, sr=sr)
    print(y.shape)

    # Beat tracking runs on the mono fold; synthesis on the raw channels.
    beats = get_beats(librosa.to_mono(y), sr, 512)

    channels = np.atleast_2d(y)
    # force stereo
    if channels.shape[0] < 2:
        print('doubling mono signal to be stereo')
        channels = np.vstack([channels, channels])

    rendered = synthesize(channels, beats, factor) * 0.7

    print(sr)
    sf.write(outfile, rendered.T, int(sr), format=format)
    return beats
def resample_all():
    """Convert every wav under ``scenes_stereo/`` to mono at 8000 Hz.

    The working directory is changed to ``scenes_stereo/``.  For each entry
    found there a folder of the same name is created below
    ``scenes_mono_8k/`` and every ``*.wav`` inside the entry is written to
    it after mono folding and resampling.
    """
    source_root = 'scenes_stereo/'
    target_root = 'scenes_mono_8k/'
    target_rate = 8000

    chdir(source_root)
    mkdir(target_root)

    for sub_folder in glob('*'):
        mkdir(target_root + sub_folder)

        wav_paths = glob(sub_folder + '/*.wav')
        for filename in wav_paths:
            print(filename)
            fs, sig = read(filename)
            mono_sig = to_mono(sig.T)
            converted = resample(mono_sig, fs, target_rate)
            write(target_root + filename, target_rate, converted)
import numpy as np
import os
import pprint
import librosa
# `librosa.display` is a submodule that is not guaranteed to be loaded by
# `import librosa`; import it explicitly for `specshow` below.
import librosa.display
import matplotlib.pyplot as plt
from audio_models import BeatInterval

print('#'*180)
print(os.path.abspath(librosa.__file__))

# load audio file
# (The earlier `librosa.util.example_audio_file()` call was dropped: its
# result was overwritten immediately and the function no longer exists in
# recent librosa releases.)
audio_path = '../src/audio/medium.m4a'
# audio_path = '../audio/.mp3'
y, sr = librosa.load(audio_path)
y_mono = librosa.to_mono(y)
print('y', y.shape)
print('sr', sr)
print('y_mono', y_mono.shape)
# assert(False)


# mel spectrogram
def mel_spetrogram(y, sr):
    """Plot the mel power spectrogram of ``y`` in dB relative to its peak.

    Parameters
    ----------
    y : np.ndarray
        Audio signal.
    sr : int
        Sampling rate of ``y``.
    """
    # The signal is passed by keyword: keyword-only from librosa 0.10 on.
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    # `power_to_db` replaces the removed `librosa.logamplitude`.
    log_S = librosa.power_to_db(S, ref=np.max)
    print('spectrogram', S.shape)
    plt.figure(figsize=(12,4))
    librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel')
    plt.title('mel power spectrogram')
    plt.colorbar(format='%+02.0f dB')
def lib_to_mono(data):
    """Return the result of ``lib.to_mono`` for ``data``."""
    mono = lib.to_mono(data)
    return mono