def __envelope(x, hop):
    """Return the peak-magnitude envelope of ``x`` over back-to-back frames.

    ``x`` is cut into frames of ``hop`` samples that advance by ``hop``
    samples (so consecutive frames do not overlap), the absolute value is
    taken, and the maximum along axis 1 of the framed array is returned.

    x is assumed to be multi-channel, of shape (n_channels, n_samples).
    """
    framed = util.frame(x, frame_length=hop, hop_length=hop)
    magnitudes = np.abs(framed)
    # Axis 1 is presumably the within-frame sample axis for multi-channel
    # input -- confirm against the installed ``util.frame``.
    return magnitudes.max(axis=1)
def __init__(self, y, sr=44100, stem="sample", qual="", suffix=".mp3",
             parent_path=".", hop_length=512, frame_length=2048,
             duration=None, **kwargs):
    """Wrap the signal ``y`` and frame it for the parent class.

    Parameters
    ----------
    y : array-like with a ``shape`` attribute
        Audio samples; the last axis is treated as time.
    sr : number
        Sampling rate, used to convert ``duration`` to a sample count.
    stem, qual, suffix : str
        Stored unchanged on the instance.
    parent_path : str or path-like
        Stored as a ``Path``.
    hop_length, frame_length : int
        Framing parameters, forwarded to ``frame`` and to the parent class.
    duration : number or None
        When given, ``self.end`` is capped at ``duration * sr`` samples.
    **kwargs
        Forwarded to the parent initializer and also set as attributes.
    """
    n_samples = y.shape[-1]
    if duration is None:
        end = n_samples
    else:
        end = min(duration * sr, n_samples)
    self.end = int(end)
    self._y = y

    features = frame(y, hop_length=hop_length, frame_length=frame_length)
    super().__init__(
        features=features,
        hop_length=hop_length,
        frame_length=frame_length,
        sr=sr,
        **kwargs
    )

    # Mirror every extra keyword argument as an instance attribute.
    for key, value in kwargs.items():
        setattr(self, key, value)

    self.stem = stem
    self.qual = qual
    self.suffix = suffix
    self.parent_path = Path(parent_path)

    # The sample range always spans the whole signal, independent of
    # ``duration``.
    self.start_samp = 0
    self.end_samp = n_samples
def find_start_end_rests(audio_data, sr, hop_length=HOP_LENGTH, n_fft=N_FFT):
    """
    Estimate where the leading rest ends and where the trailing rest starts.

    In order to evaluate the alignment procedure, we may want to exclude the
    (potential) start and end rests - the alignment is likely to be poor in
    these regions, but it does not matter.

    The function won't return a meaningful output if there are rests in the
    middle of the music piece.

    Parameters
    ----------
    audio_data : np.ndarray
        The raw waveform audio.
    sr : int
        The sample rate
    hop_length : int
        Hop, in samples, between consecutive envelope frames.
    n_fft : int
        Length, in samples, of each envelope frame.

    Returns
    -------
    times_start_end_rests : list
        List of two items, with first being the end time of the start rest
        and the second item being the start time of the end rest, in seconds.
        When no frame rises above the silence threshold (e.g. a constant
        signal) no rest boundary can be located and
        ``[0.0, len(audio_data) / sr]`` is returned.
    """
    # Compute the 5th percentile of the envelope and
    # deem anything at or below this value as silence
    envelope = frame(audio_data, hop_length=hop_length, frame_length=n_fft).max(axis=0)
    lower_bound = np.percentile(envelope, 5.0)

    track_length = len(audio_data) / float(sr)

    # Indices of frames that are NOT silent.  Written as the negation of
    # ``<=`` so NaN values are treated exactly as the original scan did.
    audible = np.flatnonzero(~(envelope <= lower_bound))
    if audible.size == 0:
        # Previously both scans ran off the array and raised IndexError.
        return [0.0, track_length]

    k = int(audible[0])
    j = int(audible[-1])

    # Return 0 if there is no start rest
    if k == 0:
        time_start = 0.0
    else:
        # The first value of the output of the frame function correspond to
        # the time of n_fft, then the times are spaced according to hop_length
        time_start = ((k - 1) * hop_length + n_fft) / float(sr)

    # Return the length of the track if there is no end rest
    if j == len(envelope) - 1:
        time_end = track_length
    else:
        time_end = ((j - 1) * hop_length + n_fft) / float(sr)

    times_start_end_rests = [time_start, time_end]
    return times_start_end_rests
def transform(dataset, frame_length=8000, hop_length=1000):
    """Frame a dataset and turn it into per-frame features and targets.

    Parameters
    ----------
    dataset : object
        Must expose ``audio``, ``station`` and ``label_vec``; each is iterated
        and every item is passed to ``frame``.
    frame_length : int
        Samples per frame (default 8000, the previously hard-coded value).
    hop_length : int
        Samples between frame starts (default 1000, as before).

    Returns
    -------
    X : np.ndarray
        ``mfcc_vec`` output of every framed audio item, concatenated along the
        last axis and transposed.
    S : np.ndarray
        First element of each station frame.
    Y : np.ndarray
        Per-frame sum of the label vector divided by ``frame_length``.
    """
    X = dataset.audio
    S = dataset.station
    Y = dataset.label_vec

    print('starting transform')
    X = [frame(x, frame_length=frame_length, hop_length=hop_length) for x in X]
    Y = [frame(y, frame_length=frame_length, hop_length=hop_length) for y in Y]
    S = [frame(s, frame_length=frame_length, hop_length=hop_length) for s in S]

    print('start mfcc')
    X = [mfcc_vec(x) for x in tqdm(X)]

    X = np.concatenate(X, axis=-1).T
    S = np.concatenate(S, axis=-1).T[:, 0]  # only take first element of station
    Y = np.concatenate(Y, axis=-1)
    # Normalise the summed labels by the frame length.
    Y = Y.sum(axis=0) / frame_length
    return X, S, Y
def extract_features(self, signal, frame_len = 2, hop_len = 0.5, fs = 16000):
    """Frame ``signal`` and run ``feature_extractor`` on every frame.

    ``frame_len`` and ``hop_len`` are in seconds and are converted to samples
    with ``fs``.  With the defaults this is a 2 s frame advanced by 0.5 s
    (1.5 s overlap) at 16 kHz.  The result is stored on ``self.X_`` and also
    returned.
    """
    samples_per_frame = int(frame_len * fs)
    samples_per_hop = int(hop_len * fs)
    # One row per frame after the transpose.
    S = np.transpose(frame(signal, samples_per_frame, samples_per_hop))
    # The original comment describes 201 sequences of 59-dimensional
    # MFCC-based features per frame -- this depends on feature_extractor;
    # TODO confirm.
    per_frame = [feature_extractor(s, fs) for s in S]
    X = np.swapaxes(per_frame, 1, 2)
    self.X_ = X
    return X
def enframe(x, win, hop_len):
    """Split ``x`` into frames along axis 0, optionally applying a window.

    Parameters
    ----------
    x : np.ndarray
        Signal to frame.
    win : int or np.ndarray
        Either the frame length in samples, or a window array whose length is
        the frame length and which is multiplied into the frames.
    hop_len : int
        Number of samples between frame starts.

    Returns
    -------
    np.ndarray
        The output of ``util.frame(..., axis=0)``, multiplied by ``win`` when
        a window array was given.

    Raises
    ------
    TypeError
        If ``win`` is neither an integer nor an ``np.ndarray``.
    """
    if isinstance(win, np.ndarray):
        window = win
        win_len = len(win)
    elif isinstance(win, (int, np.integer)):
        # NumPy integer scalars are accepted as well as plain ints.
        window = None
        win_len = int(win)
    else:
        # The previous bare ``raise`` had no active exception to re-raise and
        # surfaced as an unrelated RuntimeError.
        raise TypeError(
            'win type is not right: expected int or np.ndarray, got %s'
            % type(win).__name__
        )

    # Keyword arguments: newer librosa releases reject positional
    # frame_length / hop_length.
    x_frames = util.frame(x, frame_length=win_len, hop_length=hop_len, axis=0)
    if window is not None:
        x_frames = x_frames * window
    return x_frames
def get_chunks(dataset, effect, num_seconds, sr, context):
    """Draw random one-second segments and cut them into context windows.

    Parameters
    ----------
    dataset : mapping
        Indexed with ``'clean'`` (input signal) and ``effect`` (target
        signal); each entry is sliced with ``[:]``.
    effect : hashable
        Key of the target signal in ``dataset``.
    num_seconds : int
        Number of one-second segments to sample without replacement.  When
        it is not smaller than the number of available segments, all
        segments are used and a message is printed.
    sr : int
        Samples per segment (one second of audio).
    context : int
        Length, in samples, of each input window.

    Returns
    -------
    dict
        ``'x'`` with shape ``(-1, context, 1)`` and ``'y'`` with shape
        ``(-1, 1)``; each target is the sample aligned with the last sample
        of the corresponding input window.
    """
    x = dataset['clean'][:]
    y = dataset[effect][:]

    # Frame into 1 second long segments first.  Keywords are used (as in the
    # context framing below) because newer librosa releases reject
    # positional frame_length / hop_length.
    x = frame(x, frame_length=sr, hop_length=sr).T
    y = frame(y, frame_length=sr, hop_length=sr).T

    # Sample num_seconds number of segments
    n_segments = x.shape[0]
    if num_seconds < n_segments:
        ind = np.random.choice(np.arange(n_segments), num_seconds, replace=False)
        x = x[ind]
        y = y[ind]
    else:
        print('Number of seconds: {} is larger than dataset size: {}'.format(
            num_seconds, n_segments))

    # Segment further into context frames (hop of one sample)
    x = np.apply_along_axis(frame, 1, x, frame_length=context, hop_length=1)
    x = np.transpose(x, (0, 2, 1))
    x = x.reshape(-1, context, 1)

    # Drop the first context-1 targets so every window has a target.
    y = y[:, context - 1:]
    y = y.reshape(-1, 1)

    return {'x': x, 'y': y}
def compute_spectral_signature(song_id, cached = True, use_covar = True):
    """Compute (or fetch from the cache) the spectral signature of a song.

    The audio clip ``<AUDIOCLIPS_FOLDER>/<song_id>.mp3`` is loaded, cut into
    non-overlapping frames, MFCC vectors are computed per frame, and the
    vectors are clustered with KMeans.

    Parameters
    ----------
    song_id : object
        Identifier used for the clip file name and the signature cache.
    cached : bool
        When true and a signature is already cached, return the cached one.
    use_covar : bool
        When false, the covariance slot of every cluster is an empty list.

    Returns
    -------
    list or None
        A list of ``(mean, covariance, weight)`` tuples, one per cluster
        (``CLUSTERS_PER_SIGNATURE`` of them), where ``weight`` is the number
        of points in the cluster.  ``None`` if the clip could not be loaded
        or framed.
    """
    if cached and is_signature_cached(song_id):
        return fetch_signature(song_id)

    audioclip_path = join(AUDIOCLIPS_FOLDER, "{0}.mp3".format(song_id))

    try:
        waveform, sample_rate = load(audioclip_path, sr=SAMPLE_RATE)
        frame_length = core.time_to_samples(np.arange(0, 2, FRAME_TIMESTEP), sr = sample_rate)[1]
        # The signal is passed positionally: the name of frame()'s first
        # parameter differs between librosa releases, and a mismatch here
        # would be swallowed by the except below.
        frames = librosa_util.frame(waveform, frame_length = frame_length, hop_length = frame_length)
    except Exception as e:
        # Deliberate best-effort: a clip that cannot be processed yields None.
        logging.warning("Couldn't preprocess audioclip '{0}': {1}".format(audioclip_path, str(e)))
        return None

    # The 'frames' array has shape (<frame_length>, <number_of_frames>)
    # hence, we transpose it. This holds true for every call to the librosa
    # library that returns an array.
    frames = frames.T

    spectrograms = []
    for clip_frame in frames[FRAME_START: FRAME_START + FRAME_TOTAL]:
        # NOTE(review): ``sr`` receives the frame length in samples, not the
        # sample rate -- confirm this is intended before changing it, since
        # persisted signatures depend on it.
        spectrogram = feature.mfcc(y = clip_frame, sr = frame_length).T
        spectrograms.extend(
            entry[MFCSS_OFFSET : MFCSS_OFFSET+N_MFCCS] for entry in spectrogram
        )
    spectrograms = np.array(spectrograms)

    clusters = KMeans(n_clusters = CLUSTERS_PER_SIGNATURE)
    model = clusters.fit(spectrograms)

    # A song's "signature" is an array [ ( u_i, s_i, w_i ) ... ].
    # Where 0 <= i < CLUSTERS_PER_SIGNATURE
    # The triple (u_i, s_i, w_i) contains these variables:
    # u_i : Mean for Cluster i
    # s_i : Covariance for Cluster i
    # w_i : Weight for Cluster i
    signature = []
    # ``range`` instead of ``xrange`` so the loop runs on Python 3 as well.
    for label in range(CLUSTERS_PER_SIGNATURE):
        cluster_points = [
            point
            for point, assigned in zip(spectrograms, model.labels_)
            if assigned == label
        ]

        mean = model.cluster_centers_[label]
        # NOTE(review): np.cov treats each row as a variable by default, so
        # this is a points-by-points matrix -- confirm that is intended.
        covariance = np.cov(cluster_points) if use_covar else []
        weight = len(cluster_points)

        signature.append((mean, covariance, weight))

    persist_signature(song_id, signature)
    return signature
def test_framing():
    """Fuzz ``to_frames`` against ``frame`` on random signals, forever.

    Each round draws a random signal length, window length and stride,
    frames the same random signal with both implementations and checks that
    they agree.  The loop has no exit condition: it only stops when an
    assertion fails or the process is interrupted.
    """
    while True:
        # The draw order is kept so a seeded run produces the same cases.
        N = np.random.randint(500, 100000)
        window_len = np.random.randint(10, 100)
        stride_len = np.random.randint(1, 50)
        signal = np.random.rand(N)

        mine = to_frames(signal, window_len, stride_len, writeable=False)
        reference = frame(signal, frame_length=window_len, hop_length=stride_len)
        theirs = reference.T

        assert len(mine) == len(theirs), "len(mine) = {}, len(theirs) = {}".format(
            len(mine), len(theirs)
        )
        np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")
def spectrogram(self, audio_data):
    """Return the magnitude spectrogram of ``audio_data`` raised to ``self.power``.

    The signal is padded by ``n_fft // 2`` on both sides (mode
    ``self.pad_mode``), framed, multiplied by ``self.fft_window`` and each
    frame is transformed with ``self.fft``.  Only the first
    ``n_fft // 2 + 1`` bins are kept.  The result has one row per frame.
    """
    # Pad the time series so that frames are centered
    half_window = int(self.n_fft // 2)
    padded = np.pad(audio_data, half_window, mode=self.pad_mode)

    frames = util.frame(padded, frame_length=self.n_fft, hop_length=self.hop_length)
    n_bins = self.n_fft // 2 + 1

    magnitudes = []
    for windowed_frame in (self.fft_window * frames).T:
        spectrum = self.fft(windowed_frame)
        # Magnitude from the real and imaginary parts.
        magnitude = np.sqrt((spectrum.real**2) + (spectrum.imag**2))
        magnitudes.append(magnitude[:n_bins])

    return np.abs(np.stack(magnitudes))**self.power
def stft(x, frame_length=1024, hop_length=512):
    """Short-time Fourier transform over the last axis of ``x``.

    ``x`` is converted to a ``chainer.Variable`` if needed and zero-padded
    at the end when it is shorter than the computed target length.  Frames
    are gathered by index, multiplied by a Hamming window and passed to
    ``F.fft`` with a zero imaginary part.

    Returns
    -------
    (real, imag)
        The first ``frame_length // 2 + 1`` bins of the real and imaginary
        outputs; the FFT axis is the last one.
    """
    if not isinstance(x, chainer.Variable):
        x = chainer.as_variable(x)
    xp = x.xp

    n_samples = x.shape[-1]
    target_len = (
        n_samples // hop_length - frame_length // hop_length + 1
    ) * hop_length + frame_length
    n_missing = target_len - n_samples
    if n_missing > 0:
        tail_shape = list(x.shape)[:-1] + [n_missing]
        zeros_tail = xp.zeros(tail_shape).astype(x.dtype)
        x = F.concat((x, zeros_tail), -1)

    # Row i of ``index`` holds the sample positions of frame i.
    index = frame(np.arange(x.shape[-1]), frame_length, hop_length).T
    framed = x[..., index]
    tmp = framed * xp.hamming(frame_length).astype(x.dtype)

    imaginary_in = xp.zeros(tmp.shape).astype(x.dtype)
    yr, yi = F.fft((tmp, imaginary_in))

    n_bins = frame_length // 2 + 1
    return yr[..., :n_bins], yi[..., :n_bins]
def energy(filepath: str, frame_ms: int, sliding_ms: int) -> np.ndarray:
    '''
    Given an audio file, returns the energy (calculated as the area under
    the curve of the signal) for each frame of width frame_ms sliding each
    sliding_ms.

    This function uses the composite trapezoidal rule to approximate the
    area, since other methods are far too expensive (like Simpson's or
    Romberg's).  The signal is passed through normalize_gain before
    framing, and the integration step passed to trapz is frame_ms.
    '''
    time_series, sr = load(filepath)
    samples_per_ms = sr / 1000
    normalized = normalize_gain(time_series)

    frame_length = int(samples_per_ms * frame_ms)
    hop_length = int(samples_per_ms * sliding_ms)
    frames = frame(normalized, frame_length=frame_length, hop_length=hop_length)

    # Integrate each frame (axis 0 is reduced).
    return trapz(frames, dx=frame_ms, axis=0)
def vad(data, fs, fs_vad=16000, hop_length=30, vad_mode=0):
    """Voice activity detection with py-webrtcvad, without input validation.

    Parameters
    ----------
    data : ndarray
        Mono speech data; it is squeezed and must then be 1-D.  The samples
        are multiplied by 2**15 and cast to int16, so float data in
        [-1, 1] is presumably expected -- no range check is performed here.
    fs : int
        Sampling frequency of ``data``.
    fs_vad : int, optional
        Sampling frequency handed to webrtcvad.  Default is 16000.
    hop_length : int, optional
        Step size in milliseconds.  Default is 30.
    vad_mode : int, optional
        Aggressiveness passed to ``Vad.set_mode``.  Default is 0.

    Returns
    -------
    ndarray
        Per-sample activity (1 voiced, 0 unvoiced), truncated to
        ``data.size`` samples.

    Raises
    ------
    ValueError
        If ``data`` is not 1-D after squeezing.
    """
    data = data.squeeze()
    if data.ndim != 1:
        raise ValueError('data must be mono (1 ch).')

    # resampling
    if fs != fs_vad:
        resampled = resample(data, fs, fs_vad)
        peak = np.abs(resampled).max()
        if peak > 1.0:
            # Resampling can overshoot; rescale to avoid int16 clipping.
            resampled *= (0.99 / peak)
    else:
        resampled = data

    resampled = (resampled * 2.0**15).astype('int16')

    # Pad with zeros up to a whole number of hops.
    hop = fs_vad * hop_length // 1000
    framelen = resampled.size // hop + 1
    padlen = framelen * hop - resampled.size
    # np.pad is the public spelling; np.lib.pad is not available on NumPy 2.x.
    paded = np.pad(resampled, (0, padlen), 'constant', constant_values=0)
    framed = frame(paded, frame_length=hop, hop_length=hop).T

    detector = webrtcvad.Vad()
    detector.set_mode(vad_mode)
    valist = [detector.is_speech(tmp.tobytes(), fs_vad) for tmp in framed]

    # Expand the per-frame decisions back to the original sampling rate.
    hop_origin = fs * hop_length // 1000
    va_framed = np.zeros([len(valist), hop_origin])
    va_framed[valist] = 1

    return va_framed.reshape(-1)[:data.size]
def libstft(y, fs, n_fft=2048, hop_length=None, win_length=None, window='hann', center=None, dtype=np.complex64, pad_mode='reflect'):
    """Short-time Fourier transform that also returns the bin frequencies.

    Parameters
    ----------
    y : np.ndarray
        Input signal; checked with ``util.valid_audio``.
    fs : number
        Sampling rate, used only to build the frequency vector.
    n_fft : int
        FFT size and frame length.
    hop_length : int or None
        Samples between frames; defaults to ``win_length // 4``.
    win_length : int or None
        Window length; defaults to ``n_fft``.  Shorter windows are
        zero-padded (centered) to ``n_fft``.
    window : window specification
        Passed to ``get_window``.
    center : bool or None
        When truthy, ``y`` is padded by ``n_fft // 2`` on both sides.  The
        default (``None``) means no padding.
    dtype : complex dtype
        dtype of the returned matrix.
    pad_mode : str
        ``np.pad`` mode used when ``center`` is truthy.

    Returns
    -------
    stft_matrix : np.ndarray, shape (1 + n_fft // 2, n_frames)
    f : np.ndarray
        ``1 + n_fft // 2`` frequencies, evenly spaced from 0 to ``fs / 2``.
    """
    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    fft_window = get_window(window, win_length, fftbins=True)

    # Pad the window out to n_fft size
    fft_window = util.pad_center(fft_window, size=n_fft)

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))

    # Check audio is valid
    util.valid_audio(y)

    # Pad the time series so that frames are centered
    if center:
        y = np.pad(y, int(n_fft // 2), mode=pad_mode)

    # Window the time series.  Frames must be n_fft long to match the padded
    # window; framing at win_length failed to broadcast whenever
    # win_length != n_fft.
    y_frames = util.frame(y, frame_length=n_fft, hop_length=hop_length)

    # Pre-allocate the STFT matrix
    stft_matrix = np.empty((int(1 + n_fft // 2), y_frames.shape[1]),
                           dtype=dtype,
                           order='F')

    # how many columns can we fit within MAX_MEM_BLOCK?  At least one, so
    # the block loop below never gets a zero step.
    n_columns = max(1, int(util.MAX_MEM_BLOCK / (stft_matrix.shape[0] *
                                                 stft_matrix.itemsize)))

    n_bins = stft_matrix.shape[0]
    n_frames = stft_matrix.shape[1]
    for bl_s in range(0, n_frames, n_columns):
        bl_t = min(bl_s + n_columns, n_frames)
        stft_matrix[:, bl_s:bl_t] = fft.fft(fft_window *
                                            y_frames[:, bl_s:bl_t],
                                            axis=0)[:n_bins]

    f = np.linspace(0, np.pi, n_bins, endpoint=True) * fs / np.pi / 2
    return stft_matrix, f
def zero_crossing_rate(y, frame_length=2048, hop_length=512, center=True, **kwargs):
    """Compute the per-frame zero-crossing rate of ``y``.

    Parameters
    ----------
    y : np.ndarray
        Audio signal; checked with ``util.valid_audio``.
    frame_length : int
        Samples per frame.
    hop_length : int
        Samples between frame starts.
    center : bool
        When true, ``y`` is edge-padded by ``frame_length // 2`` on both
        sides before framing.
    **kwargs
        Forwarded to ``zero_crossings``.  ``axis`` is always overwritten
        with 0 and ``pad`` defaults to ``False``.

    Returns
    -------
    np.ndarray
        Mean of the crossing indicators over axis 0, with that axis kept.

    Side effects
    ------------
    Stores the raw crossing indicators in the module-level ``CROSSING`` and
    prints them (debugging aids kept because other code may read
    ``CROSSING``).
    """
    global CROSSING
    util.valid_audio(y)

    if center:
        y = np.pad(y, int(frame_length // 2), mode='edge')

    # Keyword arguments: newer librosa releases reject positional
    # frame_length / hop_length.
    y_framed = util.frame(y, frame_length=frame_length, hop_length=hop_length)

    kwargs['axis'] = 0
    kwargs.setdefault('pad', False)

    crossings = zero_crossings(y_framed, **kwargs)
    CROSSING = crossings
    print(crossings)

    return np.mean(crossings, axis=0, keepdims=True)
def __init__(self, y, fs, duration=32, hop_size=10):
    """
    Frame ``y`` and apply a Kaiser window to every frame.

    :param y: audio time series
    :param fs: sampling frequency (Number of samples per second)
    :param duration: Analysis frame duration (in msec)
    :param hop_size: Number of steps to advance between frames (default: 10 ms)
    """
    samples_per_ms = fs / 1000
    self.fs = fs
    # Analysis frame length (in samples)
    self.frame_length = int(duration * samples_per_ms)
    # Frame shift in samples (what librosa calls hop_length)
    self.shift_length = int(float(hop_size) * samples_per_ms)

    # Framing along axis 0: each row is one contiguous slice of y.
    framed = frame(
        y,
        frame_length=self.frame_length,
        hop_length=self.shift_length,
        axis=0,
    )
    self.window = np.kaiser(M=self.frame_length, beta=0.5)
    self.windowed_frames = np.multiply(framed, self.window)
def _fundamental(filepath: str, frame_ms: int, sliding_ms: int) -> np.ndarray:
    '''
    Given an audio file, splits it into frames of frame_ms milliseconds
    advancing by sliding_ms milliseconds and estimates the fundamental
    frequency of each frame with freq_from_fft.

    The method used is the "most precise" among the easy ones. Here's a good
    explanation of them: https://gist.github.com/endolith/255291

    Returns an array with the F0 of each frame.
    '''
    time_series, sr = load(filepath)
    samples_per_ms = sr / 1000
    normalized = normalize_gain(time_series)

    frames = frame(
        normalized,
        frame_length=int(samples_per_ms * frame_ms),
        hop_length=int(samples_per_ms * sliding_ms),
    )

    # One slot per frame (frames are the columns of ``frames``).
    fundamentals = np.empty(frames.shape[1])
    for position, samples in enumerate(frames.T):
        fundamentals[position] = freq_from_fft(samples, sr)
    return fundamentals
def py_webrtcvad(data, fs, fs_vad, hoplength=30, vad_mode=0):
    """
    Voice activity detection.
    This was implemented for easier use of py-webrtcvad.
    Thanks to: https://github.com/wiseman/py-webrtcvad.git

    Parameters
    ----------
    data : ndarray
        numpy array of mono (1 ch) speech data.
        1-d or 2-d, if 2-d, shape must be (1, time_length) or (time_length, 1).
        if data type is int, -32768 < data < 32767.
        if data type is float, -1 < data < 1; float data whose peak
        magnitude is >= 1 is rescaled to a peak of 0.9 and a message is
        printed.
    fs : int
        Sampling frequency of data.
    fs_vad : int
        Sampling frequency for webrtcvad.
        fs_vad must be 8000, 16000, 32000 or 48000.
    hoplength : int, optional
        Step size[milli second].
        hoplength must be 10, 20, or 30.
        Default is 30.
    vad_mode : int, optional
        set vad aggressiveness.
        As vad_mode increases, it becomes more aggressive.
        vad_mode must be 0, 1, 2 or 3.
        Default is 0.

    Returns
    -------
    vact : ndarray
        voice activity. time length of vact is same as input data.
        If 0, it is unvoiced, 1 is voiced.

    Raises
    ------
    ValueError
        If an argument is outside its allowed set, the dtype is neither
        int nor float, int data is out of the int16 range, or the data is
        not mono.
    """
    # check argument
    if fs_vad not in [8000, 16000, 32000, 48000]:
        raise ValueError('fs_vad must be 8000, 16000, 32000 or 48000.')

    if hoplength not in [10, 20, 30]:
        raise ValueError('hoplength must be 10, 20, or 30.')

    if vad_mode not in [0, 1, 2, 3]:
        raise ValueError('vad_mode must be 0, 1, 2 or 3.')

    # Imported after the argument checks so invalid arguments are reported
    # even when the optional dependencies are unavailable.
    import webrtcvad
    from librosa.core import resample
    from librosa.util import frame

    # check data
    if data.dtype.kind == 'i':
        if data.max() > 2**15 - 1 or data.min() < -2**15:
            raise ValueError(
                'when data type is int, data must be -32768 < data < 32767.')
        data = data.astype('f')
    elif data.dtype.kind == 'f':
        if np.abs(data).max() >= 1:
            data = data / np.abs(data).max() * 0.9
            print('input data was rescaled.')
        # Bring float data to the int16 value range.
        data = (data * 2**15).astype('f')
    else:
        raise ValueError('data dtype must be int or float.')

    data = data.squeeze()
    if data.ndim != 1:
        raise ValueError('data must be mono (1 ch).')

    # resampling (keyword arguments: newer librosa releases reject
    # positional sampling rates)
    if fs != fs_vad:
        resampled = resample(data, orig_sr=fs, target_sr=fs_vad)
    else:
        resampled = data

    resampled = resampled.astype('int16')

    # Pad with zeros up to a whole number of hops.
    hop = fs_vad * hoplength // 1000
    framelen = resampled.size // hop + 1
    padlen = framelen * hop - resampled.size
    # np.pad is the public spelling; np.lib.pad is not available on NumPy 2.x.
    paded = np.pad(resampled, (0, padlen), 'constant', constant_values=0)
    framed = frame(paded, frame_length=hop, hop_length=hop).T

    vad = webrtcvad.Vad()
    vad.set_mode(vad_mode)
    valist = [vad.is_speech(tmp.tobytes(), fs_vad) for tmp in framed]

    # Expand the per-frame decisions back to the original sampling rate.
    hop_origin = fs * hoplength // 1000
    va_framed = np.zeros([len(valist), hop_origin])
    va_framed[valist] = 1

    return va_framed.reshape(-1)[:data.size]
def vad(data, fs, fs_vad=16000, hop_length=30, vad_mode=0):
    """
    Voice activity detection.
    This was implemented for easier use of py-webrtcvad.

    Parameters
    ----------
    data : ndarray
        numpy array of mono (1 ch) speech data.
        1-d or 2-d, if 2-d, shape must be (1, time_length) or (time_length, 1).
        if data type is int, -32768 < data < 32767.
        if data type is float, -1 < data < 1; float data whose peak
        magnitude exceeds 1 is rescaled with ``MinMaxScaler((-1, 1))``.
    fs : int
        Sampling frequency of data.
    fs_vad : int, optional
        Sampling frequency for webrtcvad.
        fs_vad must be 8000, 16000, 32000 or 48000.
        Default is 16000.
    hop_length : int, optional
        Step size[milli second].
        hop_length must be 10, 20, or 30.
        Default is 30.
    vad_mode : int, optional
        set vad aggressiveness.
        As vad_mode increases, it becomes more aggressive.
        vad_mode must be 0, 1, 2 or 3.
        Default is 0.

    Returns
    -------
    vact : ndarray
        voice activity. time length of vact is same as input data.
        If 0, it is unvoiced, 1 is voiced.

    Raises
    ------
    ValueError
        If an argument is outside its allowed set, the dtype is neither
        int nor float, int data is out of the int16 range, or the data is
        not mono.
    """
    # check argument
    if fs_vad not in [8000, 16000, 32000, 48000]:
        raise ValueError('fs_vad must be 8000, 16000, 32000 or 48000.')

    if hop_length not in [10, 20, 30]:
        raise ValueError('hop_length must be 10, 20, or 30.')

    if vad_mode not in [0, 1, 2, 3]:
        raise ValueError('vad_mode must be 0, 1, 2 or 3.')

    # check data
    if data.dtype.kind == 'i':
        if data.max() > 2**15 - 1 or data.min() < -2**15:
            raise ValueError(
                'When data.type is int, data must be -32768 < data < 32767.')
        data = data.astype('f') / 2.0**15
    elif data.dtype.kind == 'f':
        if np.abs(data).max() > 1:
            # After librosa.load() the values may slightly exceed 1.0, so
            # rescale instead of rejecting the input.
            data = MinMaxScaler(
                (-1, 1)).fit_transform(data.reshape(-1, 1)).reshape(-1)
        data = data.astype('f')
    else:
        raise ValueError('data.dtype must be int or float.')

    data = data.squeeze()
    if data.ndim != 1:
        raise ValueError('data must be mono (1 ch).')

    # resampling
    if fs != fs_vad:
        resampled = resample(data, fs, fs_vad)
        peak = np.abs(resampled).max()
        if peak > 1.0:
            resampled *= (0.99 / peak)
            warn('Resampling causes data clipping. data was rescaled.')
    else:
        resampled = data

    resampled = (resampled * 2.0**15).astype('int16')

    # Pad with zeros up to a whole number of hops.
    hop = fs_vad * hop_length // 1000
    framelen = resampled.size // hop + 1
    padlen = framelen * hop - resampled.size
    # np.pad is the public spelling; np.lib.pad is not available on NumPy 2.x.
    paded = np.pad(resampled, (0, padlen), 'constant', constant_values=0)
    framed = frame(paded, frame_length=hop, hop_length=hop).T

    detector = webrtcvad.Vad()
    detector.set_mode(vad_mode)
    valist = [detector.is_speech(tmp.tobytes(), fs_vad) for tmp in framed]

    # Expand the per-frame decisions back to the original sampling rate.
    hop_origin = fs * hop_length // 1000
    va_framed = np.zeros([len(valist), hop_origin])
    va_framed[valist] = 1

    return va_framed.reshape(-1)[:data.size]
import torch.nn.functional as F
from torch.autograd import Variable
import h5py
from librosa.util import frame
import numpy as np

# NOTE(review): ``torch`` and ``nn`` are used below but their imports are not
# visible in this excerpt -- presumably they appear earlier in the file.

# Fixed seed for the torch random number generator.
torch.manual_seed(0)

# Read the full 'x' and 'y' datasets from the HDF5 file into memory.
h = h5py.File('data_nocab_norm.h5','r')
X = h['x'][:]
Y = h['y'][:]

# Use the first 100000 samples, halved and offset by 1.
x = X[0:100000]/2+1
y = Y[0:100000]/2+1

# Sliding windows of 41 samples with a hop of 1 (one window per row after the
# transpose); dropping the first 40 targets leaves one target per window,
# aligned with the window's last sample.
x = torch.from_numpy(frame(x, 41, 1).T).type(torch.FloatTensor)
y = torch.from_numpy(y[40:]).type(torch.FloatTensor)

# Size of the GRU hidden state.
hidden_size = 128

class RNN_Net(nn.Module):
    """GRU with one input feature, followed by a linear layer with one output.

    A PReLU module is created in ``__init__`` but is not used in the part of
    ``forward`` visible here.
    """

    def __init__(self):
        super(RNN_Net, self).__init__()
        self.gru = nn.GRU(1, hidden_size, batch_first = True)
        self.act = nn.PReLU()
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x, hidden):
        # Run the GRU, keep only the output of the last time step and map it
        # through the linear layer.
        # NOTE(review): no return statement is visible in this excerpt -- the
        # method may be truncated here.
        output, hidden = self.gru(x, hidden)
        output = output[:,-1,:]
        output = self.fc(output)
import csv
from librosa.util import frame
from librosa.core import load

# Tab-separated metadata files; every row names one audio file.
meta_files = [
    'iemocap2.txt', 'aibo2.txt', 'emodb2.txt', 'enterface2.txt', 'ldc2.txt'
]
#meta_files = ['tiny_dataset.txt']

# Framing parameters, in milliseconds.
frame_ms = 25
sliding_ms = 10

# Running totals over all metadata files.
n_frames = 0
n_files = 0

for meta_file in meta_files:
    with open(meta_file) as f:
        reader = csv.DictReader(f, dialect='excel-tab')
        for line in reader:
            filename = line.get('n_train_data.name')
            time_series, sr = load(filename)
            sr_ms = sr / 1000  # samples per millisecond
            frames = frame(
                time_series,
                frame_length=int(sr_ms * frame_ms),
                hop_length=int(sr_ms * sliding_ms),
            )
            # Frames are the columns of the framed array.
            n_frames += frames.shape[1]
            n_files += 1

print(f'Files: {n_files}')
print(f'Frames: {n_frames}')
def stft(y, n_fft=2048, hop_length=None, win_length=None, window=None, center=True, dtype=np.complex64):
    """Short-time Fourier transform with conjugated output.

    Parameters
    ----------
    y : object accepted by ``convertTFtoNP``
        Input signal; it is converted to a NumPy array before framing.
    n_fft : int
        FFT size and frame length.
    hop_length : int or None
        Samples between frames; defaults to ``win_length // 4``.
    win_length : int or None
        Window length; defaults to ``n_fft``.
    window : None, callable or array-like
        ``None`` selects a periodic (asymmetric) Hann window, a callable is
        invoked as ``window(win_length)``, anything else is used as the
        window vector.  The window is zero-padded (centered) to ``n_fft``.
    center : bool
        When true, ``y`` is validated with ``util.valid_audio`` and
        reflect-padded by ``n_fft // 2`` on both sides.
    dtype : complex dtype
        dtype of the returned matrix.

    Returns
    -------
    np.ndarray, shape (1 + n_fft // 2, n_frames)
        Complex conjugate of the one-sided FFT of every windowed frame.
    """
    # Import the submodules explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.signal`` / ``scipy.fftpack`` are loaded.
    import scipy.fftpack
    import scipy.signal
    from librosa import util

    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        # ``win_length`` is a plain integer here; the previous
        # ``win_length.value()`` call raised AttributeError.
        hop_length = int(win_length // 4)

    if window is None:
        # Default is an asymmetric Hann window.  get_window(..., fftbins=True)
        # yields the periodic window and is available across SciPy releases,
        # unlike ``scipy.signal.hann``.
        fft_window = scipy.signal.get_window('hann', win_length, fftbins=True)
    elif callable(window):
        # User supplied a window function
        fft_window = window(win_length)
    else:
        # User supplied a window vector.
        # Make sure it's an array:
        fft_window = np.asarray(window)

    # Pad the window out to n_fft size
    fft_window = util.pad_center(fft_window, size=n_fft)

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))

    # Pad the time series so that frames are centered
    if center:
        util.valid_audio(y)
        y_ = np.pad(convertTFtoNP(y), int(n_fft // 2), mode='reflect')
    else:
        # Previously ``y_`` was left undefined on this path (NameError).
        y_ = convertTFtoNP(y)

    # Window the time series.
    y_frames = util.frame(y_, frame_length=n_fft, hop_length=hop_length)

    # Pre-allocate the STFT matrix
    stft_matrix = np.empty((int(1 + n_fft // 2), y_frames.shape[1]),
                           dtype=dtype,
                           order='F')

    # how many columns can we fit within MAX_MEM_BLOCK?  At least one, so
    # the block loop below never gets a zero step.
    n_columns = max(1, int(util.MAX_MEM_BLOCK / (stft_matrix.shape[0] *
                                                 stft_matrix.itemsize)))

    n_bins = stft_matrix.shape[0]
    n_frames = stft_matrix.shape[1]
    for bl_s in range(0, n_frames, n_columns):
        bl_t = min(bl_s + n_columns, n_frames)

        # RFFT and Conjugate here to match phase from DPWE code
        stft_matrix[:, bl_s:bl_t] = scipy.fftpack.fft(
            fft_window * y_frames[:, bl_s:bl_t],
            axis=0)[:n_bins].conj()

    return stft_matrix
# Get the type of the signal file file_name = args.signal.split("/")[-1] file_format = file_name.split(".")[1] # Load signal - for now, only works with wav or numpy files if file_format == "npy": signal = np.load(args.signal) else: (rate, sig) = wavefile.load(args.signal) signal = sig[0] # Frame and compute MFCCs S = np.transpose( frame(signal, int(args.frame_len * 16), int(args.hop_len * 16))) # For now, only 16kHz sampling rate can be used X = list(map(lambda s: feature_extractor(s, 16000), S)) X = np.array(np.swapaxes(X, 1, 2)) X = X.astype( np.float16 ) # Compression to save memory, 16-bit MFCCs have also been used in the training of the current_best.h5 num_timesteps = X.shape[1] # =============================================== # Embedding extraction # =============================================== emb_model = load_model(args.model, custom_objects={ 'VLAD': VLAD,
def stft(y, n_fft=2048, hop_length=None, win_length=None, window='hann', center=True, dtype=np.complex64, pad_mode='reflect'):
    """Short-time Fourier transform (STFT) using a ``vorbis`` window.

    Returns a complex-valued matrix D such that
        `np.abs(D[f, t])` is the magnitude of frequency bin `f`
        at frame `t`

        `np.angle(D[f, t])` is the phase of frequency bin `f`
        at frame `t`

    Parameters
    ----------
    y : np.ndarray [shape=(n,)], real-valued
        the input signal (audio time series)

    n_fft : int > 0 [scalar]
        FFT window size

    hop_length : int > 0 [scalar]
        number of audio samples between STFT columns.
        If unspecified, defaults to `win_length // 4`.

    win_length : int <= n_fft [scalar]
        Each frame of audio is windowed by a window of length `win_length`
        which is then padded with zeros to match `n_fft`.
        If unspecified, defaults to ``win_length = n_fft``.

    window : any
        Accepted for signature compatibility but ignored: the window is
        always ``vorbis(win_length)``.

    center : boolean
        - If `True`, the signal `y` is padded so that frame
          `D[:, t]` is centered at `y[t * hop_length]`.
        - If `False`, then `D[:, t]` begins at `y[t * hop_length]`

    dtype : numeric type
        Complex numeric type for `D`.  Default is 64-bit complex.

    pad_mode : string
        If `center=True`, the padding mode to use at the edges of the signal.
        By default, STFT uses reflection padding.

    Returns
    -------
    D : np.ndarray [shape=(1 + n_fft/2, t), dtype=dtype]
        STFT matrix

    Examples
    --------
    >>> y, sr = librosa.load(librosa.util.example_audio_file())
    >>> D = np.abs(stft(y))

    Use left-aligned frames, instead of centered frames

    >>> D_left = np.abs(stft(y, center=False))

    Use a shorter hop length

    >>> D_short = np.abs(stft(y, hop_length=64))
    """
    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    # The ``window`` argument is intentionally not consulted here.
    fft_window = vorbis(win_length)

    # Pad the window out to n_fft size
    fft_window = util.pad_center(fft_window, size=n_fft)

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))

    # Check audio is valid
    util.valid_audio(y)

    # Pad the time series so that frames are centered
    if center:
        y = np.pad(y, int(n_fft // 2), mode=pad_mode)

    # Window the time series.
    y_frames = util.frame(y, frame_length=n_fft, hop_length=hop_length)

    # Pre-allocate the STFT matrix
    stft_matrix = np.empty((int(1 + n_fft // 2), y_frames.shape[1]),
                           dtype=dtype,
                           order='F')

    # how many columns can we fit within MAX_MEM_BLOCK?  At least one, so
    # the block loop below never gets a zero step.
    n_columns = max(1, int(util.MAX_MEM_BLOCK / (stft_matrix.shape[0] *
                                                 stft_matrix.itemsize)))

    n_bins = stft_matrix.shape[0]
    n_frames = stft_matrix.shape[1]
    for bl_s in range(0, n_frames, n_columns):
        bl_t = min(bl_s + n_columns, n_frames)

        stft_matrix[:, bl_s:bl_t] = fft.fft(fft_window *
                                            y_frames[:, bl_s:bl_t],
                                            axis=0)[:n_bins]

    return stft_matrix
"5544574287152993687.mp4": [], "5544620672795594434.mp4": [], "5547193787702629969.mp4": [], "5549784941472309008.mp4": [], "5552368364300855101.mp4": [], "5555325449284154780.mp4": [], "5555360238519252381.mp4": [] } datapath = f"./data/{d}" wavs = [f.name for f in os.scandir(datapath) if f.name.endswith(".wav")] wavs.sort() for wavfile in wavs: print(f"Diarizing file {wavfile} now.") (rate, sig) = wavefile.load(f"{datapath}/{wavfile}") signal = sig[0] S = np.transpose(frame(signal, int(2000 * 16), int(500 * 16))) X = list(map(lambda s: fe(s, 16000), S)) X = np.array(np.swapaxes(X, 1, 2)) X = X.astype(np.float16) num_timesteps = X.shape[1] if num_timesteps != 201: emb_model.layers.pop(0) new_input = Input(batch_shape=(None, num_timesteps, 30)) new_output = emb_model(new_input) emb_model = Model(new_input, new_output) embs = emb_model.predict(X) try: SD.cluster(rounds=10, clust_range=[2, 8],
import librosa
import numpy as np
import librosa.util as util
from librosa.filters import get_window

audio_path = "../AudioData/audio/D4_750.wav"
noise_path = "../AudioData/noise/Pink Noise.wav"

# Load the audio file
y, sr = librosa.load(audio_path)

# Split the audio into frames
win_len = n_fft = 200
hop_length = 80

# Pad the time series so that frames are centered
y = np.pad(y, int(n_fft // 2), mode='reflect')

# Window the time series.
y_frames = util.frame(y, frame_length=n_fft, hop_length=hop_length, axis=0)

# Get the window coefficients: print the symmetric and the periodic
# 10-point Hamming window for comparison.
fft_window = get_window('hamm', 10, fftbins=False)
# fft_window = fft_window[1:-1]
print(fft_window)
fft_window = get_window('hamm', 10, fftbins=True)
print(fft_window)

# Pad the window out to n_fft size (keyword argument: newer librosa releases
# reject a positional size)
fft_window = util.pad_center(fft_window, size=n_fft)

# Reshape so that the window can be broadcast
fft_window = fft_window.reshape((-1, 1))
def hht(self, y, hop_length=None, win_length=None, center=True, dtype=np.complex64, pad_mode='reflect'):
    """Hilbert-Huang transform (HHT)

    Parameters
    ----------
    y : np.ndarray [shape=(n,)], real-valued
        the input signal (audio time series)

    hop_length : int > 0 [scalar]
        number of audio samples between frames.
        If unspecified, defaults to `win_length / 2`.

    win_length : int [scalar]
        Only used to derive the default `hop_length`.
        If unspecified, defaults to ``self.n_hht``.  Frames are always
        ``self.n_hht`` samples long and are windowed by ``self.window``
        padded with zeros to ``self.n_hht``.

    center : boolean
        - If `True`, the signal `y` is padded by ``self.n_hht - 1`` samples
          on both sides before framing.
        - If `False`, no padding is applied.

    dtype : numeric type
        Numeric type of both output matrices.  Default is 64-bit complex.

    pad_mode : string
        If `center=True`, the padding mode to use at the edges of the signal.
        By default, HHT uses reflection padding.

    Returns
    -------
    hht_matrix : np.ndarray [shape=(27, t), dtype=dtype]
        Output of ``self.hht_based_feature`` for every frame.
    bjp_matrix : np.ndarray [shape=(n_hht-1, t), dtype=dtype]
        The ``bjp`` value returned by ``get_hht`` for every frame.
    """
    # By default, use the entire frame
    if win_length is None:
        win_length = self.n_hht

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length / 2)

    # Pad the window out to n_hht size (keyword argument: newer librosa
    # releases reject a positional size)
    hht_window = util.pad_center(self.window, size=self.n_hht)

    # Reshape so that the window can be broadcast
    hht_window = hht_window.reshape((-1, 1))

    # Check audio is valid
    util.valid_audio(y)

    # Pad the time series so that frames are centered
    if center:
        y = np.pad(y, self.n_hht - 1, mode=pad_mode)

    # Window the time series: one frame per row after the transpose.
    y_frames = util.frame(y, frame_length=self.n_hht, hop_length=hop_length).T
    n_frames = y_frames.shape[0]

    # Pre-allocate the HHT matrix
    hht_matrix = np.empty((27, n_frames), dtype=dtype, order='F')
    bjp_matrix = np.empty((self.n_hht - 1, n_frames), dtype=dtype, order='F')

    # The window does not change between frames; take its column once.
    window_column = hht_window[:, 0]
    for bl_s in range(n_frames):
        frame_signal = window_column * y_frames[bl_s, :]
        A, f, bjp = get_hht(frame_signal, self.fs)
        hht_matrix[:, bl_s] = self.hht_based_feature(A, f * self.fs, bjp)
        bjp_matrix[:, bl_s] = bjp

    return hht_matrix, bjp_matrix