def extract_temp_channels(wav_path, temp_directory):
    """
    Split a stereo wav file into two mono wav files, one per channel

    The first channel is written to ``<basename>_A.wav`` and the second to
    ``<basename>_B.wav`` inside ``temp_directory``, using the sample rate,
    container format, endianness and subtype of the input file.  The audio
    is streamed block by block rather than loaded in one piece.

    Extraction is skipped only when *both* output files already exist, so a
    missing ``_B`` file is regenerated instead of being returned as a path
    that does not exist.

    Parameters
    ----------
    wav_path : str
        Path to stereo wav file
    temp_directory : str
        Directory to save extracted

    Returns
    -------
    a_path : str
        Path of the mono file holding the first channel
    b_path : str
        Path of the mono file holding the second channel
    """
    base = os.path.basename(os.path.splitext(wav_path)[0])
    a_path = os.path.join(temp_directory, base + '_A.wav')
    b_path = os.path.join(temp_directory, base + '_B.wav')
    if os.path.exists(a_path) and os.path.exists(b_path):
        return a_path, b_path

    # Copy the container settings so the mono files match the source.
    with soundfile.SoundFile(wav_path, 'r') as inf:
        sr = inf.samplerate
        sound_format = inf.format
        endian = inf.endian
        subtype = inf.subtype

    # frame_length == hop_length, so consecutive frames do not overlap.
    stream = librosa.stream(wav_path, block_length=256, frame_length=2048,
                            hop_length=2048, mono=False)

    out_kwargs = dict(samplerate=sr, channels=1, endian=endian,
                      subtype=subtype, format=sound_format)
    with soundfile.SoundFile(a_path, 'w', **out_kwargs) as af, \
            soundfile.SoundFile(b_path, 'w', **out_kwargs) as bf:
        for s in stream:
            # NOTE(review): indexing s[0] / s[1] assumes every block is
            # 2-D (channels, samples), i.e. the input really is stereo.
            af.write(s[0, :])
            bf.write(s[1, :])
    return a_path, b_path
def data_stream_librosa(**kwargs):
    """data_stream_librosa

    Open an audio file as a mono stream that yields one frame per block,
    in the manner the original author compares to aubio's source object.

    Recognised keyword arguments:
        filename: path of the audio file (defaults to a fixed local path)
        frame_length: samples per frame (default 512)
        hop_length: samples to advance between frames (default 512)

    Returns a ``(stream, sr)`` tuple: the librosa stream generator and the
    sample rate of the file.
    """
    filename = kwargs.get(
        'filename', '/home/src/QK/data/sound-arglaaa-2018-10-25/24.wav')
    frame_length = kwargs.get('frame_length', 512)
    hop_length = kwargs.get('hop_length', 512)

    # Streaming cannot resample on the fly, so the caller needs the file's
    # own sample rate alongside the stream.
    sr = librosa.get_samplerate(filename)

    stream = librosa.stream(
        filename,
        block_length=1,
        frame_length=frame_length,
        hop_length=hop_length,
        mono=True,
    )

    return (stream, sr)
def librosa_mix_test():
    """Mix the first block of every file in ``fs`` and print the first
    20 encoded values.

    The sample rate is read from the first file and every file is streamed
    with the frame size derived from it.  Each stream contributes its first
    stereo block, decimated by ``m = sr // SAMPLERATE`` so that it is
    ``FRAMESPERBUFFER`` samples long.  The summed channels are interleaved,
    scaled, cast to int32 and passed through ``wavdecode.to24le``.

    The decimation stride now follows ``m``; it used to be fixed at 4,
    which only matched the frame size when the rate ratio was exactly 4.
    """
    sr = librosa.get_samplerate(fs[0])
    m = sr // SAMPLERATE
    frame_length = FRAMESPERBUFFER * m
    hop_length = FRAMESPERBUFFER * m

    # Generator expression keeps the streams lazy, as map() did.
    strms = (
        librosa.stream(f, mono=False, fill_value=0., block_length=1,
                       frame_length=frame_length, hop_length=hop_length)
        for f in fs
    )

    frame = [np.zeros(FRAMESPERBUFFER), np.zeros(FRAMESPERBUFFER)]
    for s in strms:
        blk = next(s)
        # Keep every m-th sample: FRAMESPERBUFFER * m -> FRAMESPERBUFFER.
        frame[0] += blk[0][::m]
        frame[1] += blk[1][::m]

    # Interleave the two channels sample by sample.
    frame = np.column_stack((frame[0], frame[1])).ravel()
    # Scale floats to the signed 24-bit range.
    frame *= 0x800000
    # NOTE(review): fixed divisor of 4 is presumably mixing headroom or the
    # expected number of files - confirm before tying it to len(fs).
    frame /= 4
    data = frame.astype(np.int32)
    _bytes = wavdecode.to24le(data)
    print(np.array(_bytes)[:20])
def create_stream(self, key, filename, sampling_rate, duration, block_duration=5):
    """Return a librosa stream over ``train_audio/<key>/<filename>.wav``.

    The file is looked up under ``self.data_path``.  Each streamed block
    holds 128 frames; the frame size is chosen so that 128 frame-lengths
    add up to ``sampling_rate * block_duration`` samples, and frames
    advance by one fifth of a frame.  The last block is zero-padded.

    ``duration`` is accepted but not used here.

    Raises AssertionError when the wav file does not exist.
    """
    wav_file = os.path.join(self.data_path,
                            f"train_audio/{key}/{filename}.wav")
    assert os.path.exists(
        wav_file
    ), "The audio file you are trying to split does not exist in .wav format."

    frames_per_block = 128
    frame_size = (sampling_rate * block_duration) // frames_per_block
    hop_size = frame_size // 5

    return librosa.stream(
        path=wav_file,
        block_length=frames_per_block,  # frames in each block
        frame_length=frame_size,        # samples in each frame
        hop_length=hop_size,            # samples between frame starts
        fill_value=0)
def log_mel(filepath, out_folder, fs, N, overlap, win_type='hamming',
            n_mels=128, fmin=0.0, fmax=None, htk=True):
    """Save one log-mel spectrogram image per streamed chunk of a file.

    The file is streamed in chunks of ``5 * fs`` samples that advance by
    ``fs`` samples, the last chunk being zero-padded.  For every chunk the
    power STFT is mel-filtered, converted to dB and written as
    ``<dest_path>/<out_folder>/<basename>_<index>.png`` with any
    ``_16bit`` removed from the path.

    Parameters
    ----------
    filepath : str
        Audio file to process.
    out_folder : str
        Sub-folder of the module-level ``dest_path`` receiving the images.
    fs : int
        Sample rate used for chunk sizing and the mel filter bank.
        NOTE(review): the stream is not resampled, so this presumably has
        to equal the file's own sample rate - confirm with callers.
    N : int
        FFT size and analysis window length.
    overlap : int
        Overlap in samples between STFT windows (hop is ``N - overlap``).
    win_type : str
        Window name passed to ``scipy.signal.get_window``.
    n_mels, fmin, fmax, htk
        Forwarded to ``librosa.filters.mel``.

    Returns
    -------
    bool
        Always ``True``.
    """
    stream = librosa.stream(filepath, block_length=1, frame_length=5*fs,
                            hop_length=1*fs, fill_value=0)

    # Both depend only on the arguments, so build them once, not per chunk.
    window = signal.get_window(win_type, N)
    # Keyword form works on old librosa and is required from 0.10 on.
    mel_basis = librosa.filters.mel(sr=fs, n_fft=N, n_mels=n_mels,
                                    fmin=fmin, fmax=fmax, htk=htk)
    stem = os.path.splitext(os.path.basename(filepath))[0]

    for i, x_chunks in enumerate(stream):
        image_filename = os.path.join(dest_path, out_folder,
                                      stem + '_' + str(i))
        image_filename = image_filename.replace('_16bit', '')

        # Power spectrum
        S = np.abs(librosa.core.stft(x_chunks, n_fft=N, window=window,
                                     hop_length=N-overlap,
                                     center=False)) ** 2
        # Mel filtering, then dB
        mel_filtered = np.dot(mel_basis, S)
        coefficients = librosa.core.power_to_db(mel_filtered)

        fig = plt.figure()
        try:
            plt.imshow(coefficients)
            plt.savefig(image_filename + '.png')
        finally:
            # Without this every chunk leaves an open figure behind.
            plt.close(fig)

    return True
def file_load_stream(wav_name, mono=False):
    """Return a librosa stream of one-second frames with 50% overlap.

    The frame length equals the file's sample rate (one second of audio),
    the hop is half of that, and every block carries exactly one frame.

    Parameters
    ----------
    wav_name : str
        Path of the audio file.
    mono : bool
        Forwarded to ``librosa.stream``.

    Returns
    -------
    generator
        The stream returned by ``librosa.stream``.

    Raises
    ------
    Exception
        Whatever librosa raised while probing the file.  The failure is
        logged first and then re-raised; previously it was swallowed and
        the function died with an unrelated ``UnboundLocalError``.
    """
    try:
        sr = librosa.get_samplerate(wav_name)
        frame_size = sr
        hop_length = frame_size // 2
        stream = librosa.stream(wav_name,
                                block_length=1,
                                frame_length=frame_size,
                                hop_length=hop_length,
                                mono=mono)
    except Exception:
        logger.error("file_broken or not exists!! : {}".format(wav_name))
        raise
    return stream
def load_file(filepath):
    """Yield ``(stft, sample_rate)`` for each streamed block of a file.

    Blocks hold 128 frames of 1024 samples with a hop of 256 samples; the
    STFT of every block uses the same frame and hop sizes.
    """
    n_fft = 1024
    hop = 256
    rate = librosa.get_samplerate(filepath)

    blocks = librosa.stream(filepath,
                            block_length=128,
                            frame_length=n_fft,
                            hop_length=hop)
    for samples in blocks:
        # NOTE(review): stft is left at its default centering here; the
        # librosa streaming guide uses center=False per block - confirm
        # which the callers expect.
        spectrum = librosa.stft(samples, n_fft=n_fft, hop_length=hop)
        yield (spectrum, rate)
def a():
    """Open a 44.1 kHz mono PyAudio input and print the type of the mel
    spectrogram computed for every block read through ``librosa.stream``.

    The input stream is closed and the PyAudio instance terminated even
    when reading or analysis fails.

    NOTE(review): ``librosa.stream`` is documented as taking a file path
    (or soundfile-readable object); whether it accepts a PyAudio stream
    is not shown here - confirm this call works as intended.
    """
    sr = 44100
    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=sr,
                        input=True)
        try:
            for y_block in librosa.stream(stream, block_length=256,
                                          frame_length=2048,
                                          hop_length=2048):
                # Audio goes in by keyword: valid on old librosa and
                # required from 0.10 on.
                m_block = librosa.feature.melspectrogram(
                    y=y_block, sr=sr, n_fft=2048, hop_length=2048,
                    center=False)
                print(type(m_block))
        finally:
            stream.close()
    finally:
        # Releases the PortAudio resources held by this instance.
        p.terminate()
def _stream_generator(self, filename, sr): "Used for generating Mel Spectrograms when given sample rate is an integer multiple of the original" assert sr % self.sample_rate == 0, f'Sample Rate {sr} is not a multiple, use block generator instead' mult = sr // self.sample_rate stream = librosa.stream(filename, block_length=self.timesteps, frame_length=self.n_fft * mult, hop_length=self.hop_length * mult, fill_value=0) for block in stream: yield librosa.feature.melspectrogram(block, sr=sr, n_fft=self.n_fft * mult, hop_length=self.hop_length * mult, n_mels=self.n_mels, center=False)
def find_audio_sample(source_path, template_path):
    """Locate the best match of a short template inside a long audio file.

    The template is loaded whole at its own sample rate.  The source is
    streamed in blocks of 1024 frames, each frame as long as the template
    and starting 128 samples after the previous one.  Every frame is
    scored by the absolute value of its correlation with the template
    and the best-scoring position is reported.

    All ``block_length`` frames of a block are examined; the previous
    loop bound stopped one frame early, so the last frame of every block
    was never scored.

    Parameters
    ----------
    source_path : str
        Audio file to search in.
    template_path : str
        Audio file holding the sound to look for.

    Returns
    -------
    max_time : float
        Start time in seconds (at the source's sample rate) of the best
        match, or ``-1`` when no frame scored above zero.
    max_value : float
        Score of that match (``0`` when nothing was found).

    NOTE(review): the template is compared sample by sample, which
    presumably requires both files to share one sample rate - confirm.
    """
    template_sound, _ = librosa.load(template_path, sr=None)
    source_rate = librosa.get_samplerate(source_path)

    frame_length = len(template_sound)
    block_length = 1024
    hop_length = 128

    source_stream = librosa.stream(source_path,
                                   block_length=block_length,
                                   frame_length=frame_length,
                                   hop_length=hop_length)

    max_value = 0
    max_time = -1
    for i_block, block in enumerate(source_stream):
        # Consecutive blocks start block_length * hop_length samples apart.
        block_start = i_block * block_length * hop_length
        for i_frame in range(block_length):
            start = i_frame * hop_length
            frame = block[start:start + frame_length]
            if frame.shape[0] < frame_length:
                # Short final block: no complete frames are left.
                break

            # Equal-length inputs give a single correlation value.
            corr = abs(np.correlate(frame, template_sound)[0])
            if max_value < corr:
                max_time = (block_start + start) / source_rate
                max_value = corr

    return max_time, max_value
def block_stream(filename=None, y=None, sr=None, frame_length=2048,
                 hop_length=512, segment_duration=1, n_blocks=1,
                 full_frames=True):
    '''Load audio into blocks.

    If given a filename, the audio is streamed from file in blocks.
    If y and sr are given instead, the array is sliced into blocks.

    Arguments:
        filename (str, optional): audio file to stream. Mutually exclusive
            with ``y``.
        y, sr (np.ndarray, int, optional): audio samples and their sample
            rate. Both are required when ``filename`` is None. When
            ``filename`` is given, ``sr`` is the target rate and defaults
            to the file's own rate.
        frame_length (int): frame size in samples; also the minimum block
            size.
        hop_length (int): hop size in samples.
        segment_duration (float): multiplied by the sample rate to get the
            requested block size in samples.
        n_blocks (int): multiplier applied to the block size.
        full_frames (bool): when true, keep only blocks whose size equals
            the computed ``block_length``.

    Returns:
        tuple: ``(y_blocks, n_total, sr)`` - the blocks, the block count
        computed below, and the sample rate in effect.

    Raises:
        ParameterError: if both ``filename`` and ``y`` are given, or if
            neither ``filename`` nor both of ``y`` and ``sr`` are given.
    '''
    # load audio frame generator
    if filename is not None:
        if y is not None:
            raise ParameterError('Either y or filename must be equal to None')

        # get blocks from file
        duration = librosa.get_duration(filename=filename)
        orig_sr = librosa.get_samplerate(filename)
        # fall back to the file's own rate when no target rate was given
        sr = sr or orig_sr

        # see: https://librosa.github.io/librosa/_modules/librosa/core/audio.html#stream
        # block_length is in units of `frames` so reverse calculation
        block_length = max(segment_duration * orig_sr, frame_length)
        # NOTE(review): librosa documents these helpers as
        # samples_to_frames(samples, hop_length, n_fft) and
        # frames_to_samples(frames, hop_length, n_fft); here frame_length
        # lands in the hop_length slot and hop_length in the n_fft slot -
        # confirm that is intended.
        block_n_frames = librosa.core.samples_to_frames(
            block_length, frame_length, hop_length)
        # estimated number of blocks: total samples / samples per block
        n_total = duration * orig_sr / librosa.core.frames_to_samples(
            block_n_frames, frame_length, hop_length)
        n_total = int(n_total / n_blocks)
        y_blocks = librosa.stream(filename,
                                  block_length=block_n_frames * n_blocks,
                                  frame_length=frame_length,
                                  hop_length=hop_length)
        # will throw an error if audio is not valid
        y_blocks = (y for y in y_blocks
                    if librosa.util.valid_audio(y, mono=True))
        if sr != orig_sr:
            # resample if we have a different sr
            y_blocks = (librosa.resample(y, orig_sr, sr) for y in y_blocks)
    else:
        if y is None or sr is None:
            raise ParameterError(
                'At least one of (y, sr) or filename must be provided')
        librosa.util.valid_audio(y, mono=True)
        # get block length, make it evenly divisible into frames (with hop)
        block_length = max(segment_duration * sr, frame_length) * n_blocks  # min block size = 1 frame
        block_length = librosa.core.samples_to_frames(
            block_length, frame_length, hop_length)  # convert to even frames
        block_length = librosa.core.frames_to_samples(
            block_length, frame_length, hop_length)  # convert back
        # get frames from array: frame size == hop size, so blocks do not
        # overlap; transposed so that iterating yields one block at a time
        y_blocks = librosa.util.frame(y, block_length, block_length).T
        n_total = len(y_blocks)

    # NOTE(review): the collapsed source does not show whether this filter
    # belongs to the array branch only or to both branches; it is placed
    # at function level here - confirm against the original layout.
    if full_frames:
        # drop any frames that are incomplete
        y_blocks = (y for y in y_blocks if y.size == block_length)
    return y_blocks, n_total, sr
# We'll generate 64 frames at a time, each frame having 2048 samples # and 75% overlap. # n_fft = 2048 hop_length = 512 # fill_value pads out the last frame with zeros so that we have a # full frame at the end of the signal, even if the signal doesn't # divide evenly into full frames. sr = librosa.get_samplerate(filename) stream = librosa.stream(filename, block_length=16, frame_length=n_fft, hop_length=hop_length, mono=True, fill_value=0) ####################################################################### # For this example, we'll compute PCEN on each block, find the maximum # response over frequency, and store the results in a list. # Make an array to store the frequency-averaged PCEN values pcen_blocks = [] # Initialize the PCEN filter delays to steady state zi = None for y_block in stream: # Compute the STFT (without padding, so center=False) D = librosa.stft(y_block, n_fft=n_fft, hop_length=hop_length, center=False)
song_names[song_idx]) except: print('[Invalid Song Number: {}]'.format(song_number)) sys.exit() original_path, chiptune_path, piano_path = file_sets[song_idx] print('[fs: {}] [fs: {}] [fs: {}]'.format( librosa.core.get_samplerate(original_path), librosa.core.get_samplerate(chiptune_path), librosa.core.get_samplerate(piano_path))) fs = 44100 original_stream = librosa.stream(original_path, block_length=1, frame_length=int(fs / 10), hop_length=int(fs / 10)) chiptune_stream = librosa.stream(chiptune_path, block_length=1, frame_length=int(fs / 10), hop_length=int(fs / 10)) piano_stream = librosa.stream(piano_path, block_length=1, frame_length=int(fs / 10), hop_length=int(fs / 10)) original_buffer = np.empty((0), np.float32) chiptune_buffer = np.empty((0), np.float32) piano_buffer = np.empty((0), np.float32) once = True
# We'll generate 16 frames at a time, each frame having 4096 samples # and 50% overlap. # n_fft = 4096 hop_length = n_fft // 2 # fill_value pads out the last frame with zeros so that we have a # full frame at the end of the signal, even if the signal doesn't # divide evenly into full frames. sr = librosa.get_samplerate(filename) stream = librosa.stream(filename, block_length=16, frame_length=n_fft, hop_length=hop_length, mono=True, fill_value=0) ##################################################################### # For this example, we'll compute PCEN on each block, average over # frequency, and store the results in a list. # Make an array to store the frequency-averaged PCEN values pcen_blocks = [] # Initialize the PCEN filter delays to steady state zi = None for y_block in stream: # Compute the STFT (without padding, so center=False) D = librosa.stft(y_block, n_fft=n_fft, hop_length=hop_length,
def extract_energy_bands(file):
    """Compute smoothed, normalised energy in seven frequency bands.

    The file is streamed in blocks of 20 frames, each frame 60 seconds
    long with a 5-second hop.  Every block is cut into consecutive
    5-second chunks; for each chunk the power STFT is computed and the
    energy summed inside each of the bands 20-100, 100-200, 200-800,
    800-2000, 2000-5000, 5000-8000 and 8000-22050 Hz.

    The per-chunk band energies are divided by each band's maximum, then
    normalised so every row sums to one, and finally smoothed along time
    with a Savitzky-Golay filter (window 15, order 3).

    Frequencies are mapped to spectrogram rows over the 0..sr/2 range that
    a one-sided STFT covers; the previous mapping divided by the full
    sample rate and therefore placed every band at half its frequency.

    Parameters
    ----------
    file : str
        Path of the audio file to analyse.

    Returns
    -------
    pandas.DataFrame
        One row per analysed chunk, one column per band.
    """
    print("Extracting energy bands...")
    sample_rate = librosa.get_samplerate(file)
    logging.info(f'Starting analysis of {file}')

    # ================================================
    # Declare audio loading stream and related parameters
    # ================================================
    frame_length = sample_rate * 60
    hop_length = sample_rate * 5
    block_length = 5 * 4

    # Load the audio as a stream
    stream = librosa.stream(
        file,
        block_length=block_length,
        frame_length=frame_length,
        hop_length=hop_length,
    )

    logging.info(f'Sample rate: {sample_rate}')

    # ================================================
    # Define features to extract
    # ================================================
    # Consecutive edges delimit the bands: 20-100, 100-200, ... 8000-22050.
    band_edges_hz = [20, 100, 200, 800, 2000, 5000, 8000, 22050]
    energy_band_params = [
        {
            'sampleRate': sample_rate,
            'startCutoffFrequency': f_lo,
            'stopCutoffFrequency': f_hi,
        }
        for f_lo, f_hi in zip(band_edges_hz[:-1], band_edges_hz[1:])
    ]

    def hz_to_bin(f, n_bins, sr):
        # The n_bins rows of a one-sided spectrogram run from 0 Hz (row 0)
        # to sr / 2 (row n_bins - 1).
        return int(np.round(f / (sr / 2) * (n_bins - 1)))

    class EnergyBandSelf:
        ''' Feature extraction class that extracts the energy in a frequency band, given a power STFT spectrogram.'''

        def __init__(self,
                     sampleRate=44100,
                     startCutoffFrequency=20.0,
                     stopCutoffFrequency=11025.0):
            self.sr = sampleRate
            self.f_start = startCutoffFrequency
            self.f_stop = stopCutoffFrequency

        def __call__(self, S_power):
            n_bins = S_power.shape[0]
            n_start = hz_to_bin(self.f_start, n_bins, self.sr)
            n_stop = hz_to_bin(self.f_stop, n_bins, self.sr)
            # Total energy over the band's rows and all time columns.
            return np.sum(S_power[n_start:n_stop, :])

    energy_band_extractors = [
        EnergyBandSelf(**kwargs) for kwargs in energy_band_params
    ]

    # ================================================
    # Feature extraction
    # ================================================
    energy_band_features = []

    # Read the librosa docs to understand how the blocks and frames relate to each other:
    # https://librosa.org/blog/2019/07/29/stream-processing/#Blocks
    # NOTE(review): consecutive blocks overlap by frame_length - hop_length
    # samples, so chunks near the end of one block presumably reappear at
    # the start of the next - confirm whether that duplication is wanted.
    for i, y_block in enumerate(stream):
        for j, n_start in enumerate(range(0, len(y_block), hop_length)):
            if j == 0 and i % 5 == 0:
                t = (j * hop_length +
                     i * block_length * hop_length) / (60 * sample_rate)
                logging.info(f'Processing from minute {t:.0f}.')

            # Select the current hop-sized chunk of audio
            y_frame = y_block[..., n_start:n_start + hop_length]

            # Calculate the STFT power spectrogram for this chunk
            S = np.abs(librosa.stft(y_frame))**2

            # Calculate the energy in each of the predefined energy bands
            energy_band_features.append([e(S) for e in energy_band_extractors])

    # Back to a numpy array
    energy_band_features = np.array(energy_band_features)

    # ================================================
    # Plotting
    # ================================================
    # Scale each band by its own maximum, then make every row sum to one.
    toplot = energy_band_features / np.max(energy_band_features,
                                           axis=0)[np.newaxis, :]
    toplot = toplot / np.sum(toplot, axis=1)[:, np.newaxis]
    yhat = savgol_filter(toplot, 15, 3, axis=0)  # smooth the output a bit
    df = pd.DataFrame(data=yhat)
    return df
def multifile_stream(filenames, segment_duration, frame_length, hop_length,
                     sr=None):
    '''Chain the audio block streams of several files into one generator.

    Every file is streamed with ``librosa.stream`` in blocks sized from
    ``segment_duration`` (never smaller than one frame) and the blocks are
    yielded in file order.  When ``sr`` is not given, the sample rate of
    the first file becomes the target rate; blocks of files with another
    rate are resampled to it.

    The block-count bookkeeping that used to live here referred to an
    undefined ``n_blocks`` and failed with NameError on the first file.
    Its result was never reported, so it has been removed.

    Arguments:
        filenames (iterable of str): audio files, in playback order.
        segment_duration (float): requested block duration in seconds.
        frame_length (int): frame size in samples.
        hop_length (int): hop size in samples.
        sr (int, optional): target sample rate.

    Yields:
        np.ndarray: one audio block at a time.
    '''
    # TODO: report the total number of blocks (and sr) to the caller.
    # TODO: `remainder` is never filled in yet, so the completion branch
    # below does not run; incomplete trailing blocks are yielded as-is.
    remainder = None
    for filename in filenames:
        offset = 0

        orig_sr = librosa.get_samplerate(filename)
        sr = sr or orig_sr

        block_length = max(segment_duration * orig_sr, frame_length)
        block_n_frames = librosa.core.samples_to_frames(
            block_length, frame_length, hop_length)

        # TODO: this needs to be made a part of the generator expression
        if remainder is not None:
            # Seconds of audio still missing from the pending block.
            offset = 1. * (block_length - len(remainder)) / orig_sr
            rem_completed, _ = librosa.load(filename, sr=orig_sr,
                                            duration=offset)
            if sr != orig_sr:
                # resample if we have a different sr
                rem_completed = librosa.resample(
                    rem_completed, orig_sr=orig_sr, target_sr=sr)
            remainder = np.concatenate([remainder, rem_completed])
            if len(remainder) == block_length:
                # there were enough samples in the new file
                yield remainder
                remainder = None  # clear
            else:
                # very short file: keep accumulating from the next one
                continue

        y_blocks = librosa.stream(filename,
                                  offset=offset,
                                  block_length=block_n_frames,
                                  frame_length=frame_length,
                                  hop_length=hop_length)
        # will throw an error if audio is not valid
        y_blocks = (y for y in y_blocks
                    if librosa.util.valid_audio(y, mono=True))
        if sr != orig_sr:
            # resample if we have a different sr; bind the source rate now
            # because this generator is consumed lazily
            y_blocks = (
                librosa.resample(y, orig_sr=src_sr, target_sr=sr)
                for src_sr in (orig_sr,)
                for y in y_blocks
            )

        yield from y_blocks
prm_sound_filepath = "C:/Users/f.clement/Desktop/vod_cutter/temp_prm_audio.wav" # Implementation based on https://stackoverflow.com/questions/52572693/find-sound-effect-inside-an-audio-file # See also https://librosa.org/blog/2019/07/29/stream-processing/ source_sound, source_rate = librosa.load(ref_sound_filepath, sr=None) template_sound, template_rate = librosa.load(prm_sound_filepath, sr=None) source_rate = librosa.get_samplerate(ref_sound_filepath) frame_length = len(template_sound) hop_length = 128 block_length = 1024 source_stream = librosa.stream(ref_sound_filepath, block_length=block_length, frame_length=frame_length, hop_length=int(hop_length)) xs = [] ys = [] abs_ys = [] for i_block, block in enumerate(source_stream): i_frame = 0 while i_frame * hop_length < hop_length * (block_length - 1): frame = block[i_frame * hop_length:i_frame * hop_length + frame_length] if frame.shape[0] < frame_length: break
# Table of emotion labels; the columns used below are `name` and `EMOTION`.
lab = pd.read_csv("/home/gnlenfn/remote/lstm/emotion_classes.csv")
count = 0
mel_spec = pd.DataFrame()
for file in glob.glob(wav_loc):
    sr = librosa.get_samplerate(file)
    # Window and stride in seconds, converted to samples at the file's rate.
    frame_length = 0.02
    frame_stride = 0.01
    input_nfft = int(round(sr * frame_length))
    input_stride = int(round(sr * frame_stride))
    # Label rows whose `name` equals the file name without directory or
    # extension.
    label = np.array(
        lab.EMOTION[lab.name == file.split("/")[-1].split(".")[0]])
    # Blocks of 304 non-overlapping stride-sized frames; the last block is
    # zero-padded.
    # NOTE(review): the stream frame length is the stride, not the window
    # (input_nfft) or the n_fft of 800 used below - confirm this framing
    # is what the STFT is meant to see.
    stream = librosa.stream(file,
                            block_length=304,
                            frame_length=input_stride,
                            hop_length=input_stride,
                            fill_value=0)
    # Counts the blocks processed for this file.
    sub = 0
    for y_block in stream:
        sub += 1
        # Hamming-windowed STFT of the block, without edge padding.
        m_block = librosa.stft(
            y_block,
            n_fft=800,
            win_length=input_nfft,
            hop_length=input_stride,
            window='hamming',
            center=False,
            #fmax=4000
        )
import librosa

# Single definition of the input so the sample-rate probe and the stream
# always refer to the same file.
audio_path = 'D:/Vasanth/DeepLearning/SubtitleSynchronizer/videos1/GardenofEvil.flac'

sr = librosa.get_samplerate(audio_path)

# Set the frame parameters to be equivalent to the librosa defaults
# in the file's native sampling rate
frame_length = (2048 * sr) // 22050
hop_length = (512 * sr) // 22050

# Stream the data, working on 128 frames at a time
stream = librosa.stream(audio_path,
                        block_length=128,
                        frame_length=frame_length,
                        hop_length=hop_length)

# One chroma matrix per streamed block, in file order.
chromas = []
for y in stream:
    chroma_block = librosa.feature.chroma_stft(y=y, sr=sr,
                                               n_fft=frame_length,
                                               hop_length=hop_length,
                                               center=False)
    # Store the block's chroma; appending `chromas` to itself (as before)
    # discarded every result and built a self-referencing list.
    chromas.append(chroma_block)
import math
import sys
import librosa

# Make the parent directory importable so the project config can be found.
sys.path.append("./../")
from configs import config

for wav_file in wav_files:
    print(wav_file)
    sr = librosa.get_samplerate(wav_file)

    # Frame size: the configured duration in samples, rounded up to the
    # next power of two.
    exponent = math.ceil(math.log2((sr * config.frame_size_in_ms * 0.001)))
    frame_length = int(2 ** exponent)
    # Hop expressed as a percentage of the frame length.
    hop_length = int(config.percentage_overlap * frame_length / 100)
    print(sr, frame_length, hop_length)

    stream = librosa.stream(
        wav_file,
        block_length=1,
        frame_length=frame_length,
        hop_length=hop_length,
    )

    for frame in stream:
        # Only the first frame of each file is shown.
        print(list(frame))
        break