Example #1
def __envelope(x, hop):
    """Compute the max-envelope of non-overlapping frames of x at length hop

    x is assumed to be multi-channel, of shape (n_channels, n_samples).
    """
    x_frame = np.abs(util.frame(x, frame_length=hop, hop_length=hop))
    return x_frame.max(axis=1)
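
A minimal usage sketch (assuming librosa >= 0.9, where util.frame takes
keyword frame_length/hop_length and frames multi-channel input along the
last axis):

import numpy as np
from librosa import util

x = np.random.randn(2, 22050)   # 2 channels, 1 s at 22.05 kHz
hop = 512
env = np.abs(util.frame(x, frame_length=hop, hop_length=hop)).max(axis=1)
print(env.shape)                # (2, 43): one peak value per channel and frame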
Example #2
 def __init__(self,
              y,
              sr=44100,
              stem="sample",
              qual="",
              suffix=".mp3",
              parent_path=".",
              hop_length=512,
              frame_length=2048,
              duration=None,
              **kwargs):
     end = y.shape[-1]
     if duration is not None:
         end = min(duration * sr, end)
     end = int(end)
     self.end = end
     self._y = y
     features = frame(y, hop_length=hop_length, frame_length=frame_length)
     super().__init__(features=features,
                      hop_length=hop_length,
                      frame_length=frame_length,
                      sr=sr,
                      **kwargs)
     for k in kwargs:
         setattr(self, k, kwargs[k])
     self.stem = stem
     self.qual = qual
     self.suffix = suffix
     self.parent_path = Path(parent_path)
     self.start_samp = 0
     self.end_samp = y.shape[-1]
Example #3
def find_start_end_rests(audio_data, sr, hop_length=HOP_LENGTH, n_fft=N_FFT):
    """
    In order to evaluate the alignment procedure, we may want to exclude the 
    (potential) start and end rests - the alignment is likely to be poor in 
    these regions, but it does not matter.
    This functions returns the estimated end of the starting rest and the
    estimated start of the ending rest.
    
    The function won't return a meaningful output if there are rests in the middle 
    of the music piece. 
    
    Parameters
    ----------
    audio_data : np.ndarray
        The raw waveform audio.       
    sr : int
        The sample rate
    
    Returns
    -------
    times_start_end_rests  : list 
        List of two items, with first being the end time of the start rest and the second item
        being the start time of the end rest, in seconds.        
    """
    
    # Compute the 5th percentile of the envelope and
    # deem anything below this value as silence
    envelope = frame(audio_data, hop_length=hop_length, frame_length=n_fft).max(axis=0)
    lower_bound = np.percentile(envelope, 5.0)
    
    # Implement the search as a loop; this should be faster than vectorisation
    k = 0
    while envelope[k] <= lower_bound:
        k += 1
        
    # Return 0 if there is no start rest
    if k == 0:
        time_start = 0.0
    else:
        # The first value of the output of the frame function corresponds to
        # the time of n_fft; subsequent times are spaced by hop_length
        time_start = ((k-1)*hop_length + n_fft)/float(sr)
             
    j = len(envelope)-1
    while envelope[j] <= lower_bound:
        j -= 1
    
    # Return the length of the track if there is no end rest
    if j == len(envelope)-1:
        time_end = len(audio_data)/float(sr)
    else:
        time_end = ((j-1)*hop_length + n_fft)/float(sr)
            
    times_start_end_rests = [time_start, time_end]
    
    return times_start_end_rests
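
A hedged usage sketch (HOP_LENGTH and N_FFT are module-level constants in
the original source; the keyword values below are illustrative):

import numpy as np
from librosa.util import frame

sr = 22050
silence = np.zeros(sr)                                    # 1 s of rest
tone = 0.5 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr)
audio = np.concatenate([silence, tone, silence])

start, end = find_start_end_rests(audio, sr, hop_length=512, n_fft=2048)
print(start, end)   # roughly 1.0 and 2.0 seconds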
Example #4
def transform(dataset):
    X = dataset.audio
    S = dataset.station
    Y = dataset.label_vec

    print('starting transform')
    X = [frame(x, frame_length=8000, hop_length=1000) for x in X]
    Y = [frame(y, frame_length=8000, hop_length=1000) for y in Y]
    S = [frame(s, frame_length=8000, hop_length=1000) for s in S]

    print('start mfcc')
    X = [mfcc_vec(x) for x in tqdm(X)]
    X = np.concatenate(X, axis=-1).T
    S = np.concatenate(S, axis=-1).T[:,
                                     0]  # only take first element of station
    Y = np.concatenate(Y, axis=-1)
    Y = Y.sum(axis=0) / 8000  # fraction of positive labels within each frame

    return X, S, Y
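
mfcc_vec is defined elsewhere in the source; a hedged sketch of a compatible
implementation (the sr and n_mfcc values here are assumptions):

import numpy as np
import librosa

def mfcc_vec(frames, sr=8000, n_mfcc=13):
    # one flattened MFCC matrix per framed column of a (8000, n_frames) array
    return np.stack([librosa.feature.mfcc(y=np.ascontiguousarray(col),
                                          sr=sr, n_mfcc=n_mfcc).ravel()
                     for col in frames.T], axis=-1)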
Example #5
 def extract_features(self, signal, frame_len=2, hop_len=0.5, fs=16000):

     # 2 s frames with a 0.5 s hop (1.5 s overlap), assuming a 16 kHz sampling rate
     S = np.transpose(frame(signal, int(frame_len*fs), int(hop_len*fs)))
     
     # 201 sequences of 59 dimensional MFCC based features
     X = list(map(lambda s: feature_extractor(s, fs), S))
     X = np.swapaxes(X, 1, 2)
     self.X_ = X
     return X
Example #6
def enframe(x, win, hop_len):
    if isinstance(win, int):
        win_len = win
    elif isinstance(win, np.ndarray):
        win_len = len(win)
    else:
        raise TypeError('win must be an int (frame length) or an np.ndarray (window coefficients).')
    x_frames = util.frame(x, win_len, hop_len, axis=0)

    if isinstance(win, np.ndarray):
        x_frames = x_frames * win
    return x_frames
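
A minimal sketch of calling enframe (assumes a librosa version where
util.frame accepts positional frame_length/hop_length, as used above):

import numpy as np
from scipy.signal.windows import hann

x = np.arange(16000, dtype=float)
frames_plain = enframe(x, 400, 160)                  # int -> no weighting
frames_hann = enframe(x, hann(400, sym=False), 160)  # ndarray -> windowed
print(frames_plain.shape, frames_hann.shape)         # (98, 400) both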
Example #7
def get_chunks(dataset, effect, num_seconds, sr, context):
    x = dataset['clean'][:]
    y = dataset[effect][:]
    # Frame into 1 second long segments first
    x = frame(x, sr, sr).T
    y = frame(y, sr, sr).T
    # Sample num_seconds number of segments
    if num_seconds < x.shape[0]:
        ind = np.random.choice(np.arange(x.shape[0]),
                               num_seconds,
                               replace=False)
        x = x[ind]
        y = y[ind]
    else:
        print('Number of seconds: {} is larger than dataset size: {}'.format(
            num_seconds, x.shape[0]))
    # Segment further into context frames
    x = np.apply_along_axis(frame, 1, x, frame_length=context, hop_length=1)
    x = np.transpose(x, (0, 2, 1))
    x = x.reshape(-1, context, 1)
    y = y[:, context - 1:]
    y = y.reshape(-1, 1)
    return {'x': x, 'y': y}
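
A hedged usage sketch: 'dataset' can be any mapping of 1-D arrays, e.g. an
h5py file or a plain dict (the effect name 'reverb' here is illustrative):

import numpy as np

sr = 16000
dataset = {'clean': np.random.randn(10 * sr),
           'reverb': np.random.randn(10 * sr)}
chunks = get_chunks(dataset, 'reverb', num_seconds=4, sr=sr, context=64)
print(chunks['x'].shape, chunks['y'].shape)   # (n, 64, 1) and (n, 1)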
Example #8
def compute_spectral_signature(song_id, cached = True, use_covar = True):

	if cached and is_signature_cached(song_id):
		return fetch_signature(song_id)

	audioclip_path = join(AUDIOCLIPS_FOLDER, "{0}.mp3".format(song_id))
	waveform, sample_rate, frame_length, frames = None, None, None, None
	try:
		waveform, sample_rate = load(audioclip_path, sr=SAMPLE_RATE)
		frame_length = core.time_to_samples(np.arange(0, 2, FRAME_TIMESTEP), sr = sample_rate)[1]
		frames = librosa_util.frame(y = waveform, frame_length = frame_length, hop_length = frame_length)
	except Exception as e:
		logging.warning("Couldn't preprocess audioclip '{0}': {1}".format(audioclip_path, str(e)))
		return None

	# The 'frames' array has shape (<frame_length>, <number_of_frames>)
	# hence, we transpose it. This holds true for every call to the librosa library that returns an array.
	frames = frames.T

	spectrograms = []
	for frame in frames[FRAME_START: FRAME_START + FRAME_TOTAL]:
		spectrogram = feature.mfcc(y = frame, sr = frame_length).T
		to_add = [ entry[MFCSS_OFFSET : MFCSS_OFFSET+N_MFCCS] for entry in spectrogram ]
		spectrograms += to_add
	
	spectrograms = np.array(spectrograms)
	clusters = KMeans(n_clusters = CLUSTERS_PER_SIGNATURE)
	model = clusters.fit(spectrograms)

	# A song's "signature" is an array [ ( u_i, s_i, w_i ) ... ]. Where 0 <= i < CLUSTERS_PER_SIGNATURE
	# The triple (u_i, s_i, w_i) contains these variables:
	# 	u_i : Mean for Cluster i
	#	s_i : Covariance for Cluster i
	#	w_i : Weight for Cluster i
	
	signature = []
	for label in range(CLUSTERS_PER_SIGNATURE):
		indexes = [ index for index, element in enumerate(model.labels_) if element == label ]
		cluster_points = [ spectrograms[i] for i in indexes ]

		mean = model.cluster_centers_[label]
		covariance = np.cov(cluster_points, rowvar=False) if use_covar else []  # covariance across features
		weight = len(cluster_points)
		cluster_params = (mean, covariance, weight)

		signature.append(cluster_params)

	persist_signature(song_id, signature)
	
	return signature
Example #9
def test_framing():
    while True:
        N = np.random.randint(500, 100000)
        window_len = np.random.randint(10, 100)
        stride_len = np.random.randint(1, 50)
        signal = np.random.rand(N)

        mine = to_frames(signal, window_len, stride_len, writeable=False)
        theirs = frame(signal, frame_length=window_len, hop_length=stride_len).T

        assert len(mine) == len(theirs), "len(mine) = {}, len(theirs) = {}".format(
            len(mine), len(theirs)
        )
        np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")
Example #10
 def spectrogram(self, audio_data):
     # Pad the time series so that frames are centered
     y = np.pad(audio_data, int(self.n_fft // 2), mode=self.pad_mode)
     y_frames = util.frame(y,
                           frame_length=self.n_fft,
                           hop_length=self.hop_length)
     windowed = (self.fft_window * y_frames).T
     fft_matrix = []
     for frame in windowed:
         complex_fft = self.fft(frame)
         # keep only the non-negative frequency bins of the amplitude spectrum
         amp_spectr = np.sqrt(((complex_fft.real**2) +
                               (complex_fft.imag**2)))[:self.n_fft // 2 + 1]
         fft_matrix.append(amp_spectr)
     fft_matrix = np.stack(fft_matrix)
     return np.abs(fft_matrix)**self.power
Example #11
def stft(x, frame_length=1024, hop_length=512):
    # input shape (..., time); the FFT is taken along the last axis
    if not isinstance(x, chainer.Variable):
        x = chainer.as_variable(x)
    xp = x.xp
    pad_len = (x.shape[-1] // hop_length - frame_length // hop_length +
               1) * hop_length + frame_length
    pad = pad_len - x.shape[-1]
    if pad > 0:
        shape = list(x.shape)
        pad = xp.zeros(shape[:-1] + [pad]).astype(x.dtype)
        x = F.concat((x, pad), -1)
    index = frame(np.arange(x.shape[-1]), frame_length, hop_length).T
    tmp = x[..., index] * xp.hamming(frame_length).astype(x.dtype)
    yr, yi = F.fft((tmp, xp.zeros(tmp.shape).astype(x.dtype)))
    return yr[..., :frame_length // 2 + 1], yi[..., :frame_length // 2 + 1]
Example #12
def energy(filepath: str, frame_ms: int, sliding_ms: int) -> np.ndarray:
    '''
    Given an audio file, returns the energy (calculated as the area
    under the curve of the signal) for each frame of width frame_ms,
    sliding by sliding_ms.
    This function uses the composite trapezoidal rule to approximate
    the area, since other methods (like Simpson's or Romberg's) are
    far too expensive.
    '''
    time_series, sr = load(filepath)
    sr_ms = sr / 1000
    time_series = normalize_gain(time_series)
    frames = frame(time_series,
                   frame_length=int(sr_ms * frame_ms),
                   hop_length=int(sr_ms * sliding_ms))
    return trapz(frames, dx=frame_ms, axis=0)
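
A hedged sketch of the same computation without the file I/O (np.trapz is
the composite trapezoidal rule used above):

import numpy as np
from librosa.util import frame

sr = 16000
t = np.arange(sr) / sr
signal = np.abs(np.sin(2 * np.pi * 5 * t))        # rectified 5 Hz wave
frames = frame(signal, frame_length=int(sr / 1000 * 25),
               hop_length=int(sr / 1000 * 10))    # 25 ms frames, 10 ms hop
energies = np.trapz(frames, dx=25, axis=0)        # one value per frame
print(energies.shape)                             # (98,)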
Example #13
def vad(data, fs, fs_vad=16000, hop_length=30, vad_mode=0):

    data = data.squeeze()
    if not data.ndim == 1:
        raise ValueError('data must be mono (1 ch).')

    # resampling
    if fs != fs_vad:
        resampled = resample(data, fs, fs_vad)
        if np.abs(resampled).max() > 1.0:
            resampled *= (0.99 / np.abs(resampled).max())
            # warn('Resampling causes data clipping. data was rescaled.')
    else:
        resampled = data

    resampled = (resampled * 2.0**15).astype('int16')

    hop = fs_vad * hop_length // 1000
    framelen = resampled.size // hop + 1
    padlen = framelen * hop - resampled.size
    paded = np.lib.pad(resampled, (0, padlen), 'constant', constant_values=0)
    framed = frame(paded, frame_length=hop, hop_length=hop).T

    vad = webrtcvad.Vad()
    vad.set_mode(vad_mode)
    valist = [vad.is_speech(tmp.tobytes(), fs_vad) for tmp in framed]

    hop_origin = fs * hop_length // 1000
    va_framed = np.zeros([len(valist), hop_origin])
    va_framed[valist] = 1

    return va_framed.reshape(-1)[:data.size]
Example #14
def libstft(y, fs, n_fft=2048, hop_length=None, win_length=None, window='hann',
            center=None, dtype=np.complex64, pad_mode='reflect'):
    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    fft_window = get_window(window, win_length, fftbins=True)

    # Pad the window out to n_fft size
    fft_window = util.pad_center(fft_window, n_fft)

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))

    # Check audio is valid
    util.valid_audio(y)

    # Pad the time series so that frames are centered
    if center:
        y = np.pad(y, int(n_fft // 2), mode=pad_mode)

    # Window the time series (frame with n_fft so the zero-padded window broadcasts).
    y_frames = util.frame(y, frame_length=n_fft, hop_length=hop_length)

    # Pre-allocate the STFT matrix
    stft_matrix = np.empty((int(1 + n_fft // 2), y_frames.shape[1]),
                           dtype=dtype,
                           order='F')

    # how many columns can we fit within MAX_MEM_BLOCK?
    n_columns = int(util.MAX_MEM_BLOCK / (stft_matrix.shape[0] *
                                          stft_matrix.itemsize))

    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])

        stft_matrix[:, bl_s:bl_t] = fft.fft(fft_window *
                                            y_frames[:, bl_s:bl_t],
                                            axis=0)[:stft_matrix.shape[0]]
    # Frequency axis in Hz: 0 .. fs/2
    f = np.linspace(0, np.pi, stft_matrix.shape[0], endpoint=True) * fs / np.pi / 2
    return stft_matrix, f
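
A hedged consistency check against librosa's own STFT (assumes librosa is
installed and that the module-level fft used above is numpy's or scipy's
FFT; magnitudes should agree for matching window, centering, and padding):

import numpy as np
import librosa

y = np.random.randn(22050).astype(np.float32)
D, f = libstft(y, fs=22050, n_fft=1024, hop_length=256, center=True)
D_ref = librosa.stft(y, n_fft=1024, hop_length=256, window='hann',
                     center=True, pad_mode='reflect')
print(np.allclose(np.abs(D), np.abs(D_ref), atol=1e-4))   # expect True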
Example #15
def zero_crossing_rate(y, frame_length=2048, hop_length=512, center=True,
                       **kwargs):
    global CROSSING
    util.valid_audio(y)

    if center:
        y = np.pad(y, int(frame_length // 2), mode='edge')

    y_framed = util.frame(y, frame_length, hop_length)

    kwargs['axis'] = 0
    kwargs.setdefault('pad', False)

    crossings = zero_crossings(y_framed, **kwargs)
    CROSSING = crossings
    print(crossings)

    return np.mean(crossings, axis=0, keepdims=True)
Example #16
    def __init__(self, y, fs, duration=32, hop_size=10):
        """
        :param y: audio time series
        :param fs: sampling frequency (Number of samples per second)
        :param duration: Analysis frame duration (in msec)
        :param hop_size: Number of steps to advance between frames (default: 10 ms)
        """

        self.fs = fs
        self.frame_length = int(duration * (fs / 1000))  # Analysis frame length (in samples)
        # shift_length corresponds to librosa's hop_length
        self.shift_length = int(float(hop_size) * (fs / 1000))

        # matrix whose rows are contiguous slices of y (frames along axis 0)
        frames = frame(y, frame_length=self.frame_length, hop_length=self.shift_length, axis=0)
        self.window = np.kaiser(M=self.frame_length, beta=0.5)
        self.windowed_frames = np.multiply(frames, self.window)
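
The core of this constructor as a standalone hedged sketch (values are
illustrative; axis=0 puts one frame per row so the window broadcasts):

import numpy as np
from librosa.util import frame

fs = 16000
y = np.random.randn(fs)
frame_length = int(32 * fs / 1000)    # 32 ms analysis frames
hop = int(10 * fs / 1000)             # 10 ms hop
frames = frame(y, frame_length=frame_length, hop_length=hop, axis=0)
windowed = frames * np.kaiser(frame_length, 0.5)
print(windowed.shape)                 # (n_frames, frame_length)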
Example #17
def _fundamental(filepath: str, frame_ms: int, sliding_ms: int) -> np.ndarray:
    '''
        Given an audio file, splits into frames and tries to
        guess the fundamental frequency of each one of them.
        The method used is the "most precise" among the easy
        ones. Here's a good explanation of them:
            https://gist.github.com/endolith/255291
        Returns an array with the F0 of each frame.
    '''
    time_series, sr = load(filepath)
    sr_ms = sr / 1000
    time_series = normalize_gain(time_series)
    frames = frame(time_series,
                   frame_length=int(sr_ms * frame_ms),
                   hop_length=int(sr_ms * sliding_ms))
    fundamentals = np.empty(frames.shape[1])
    for i, f in enumerate(frames.T):
        fundamentals[i] = freq_from_fft(f, sr)
    return fundamentals
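
freq_from_fft comes from the gist linked above; a hedged sketch in the same
spirit (FFT peak with parabolic interpolation of the log-magnitude):

import numpy as np
from scipy.signal.windows import blackmanharris

def freq_from_fft(sig, fs):
    windowed = sig * blackmanharris(len(sig))
    spectrum = np.abs(np.fft.rfft(windowed))
    i = int(np.argmax(spectrum))
    if 0 < i < len(spectrum) - 1:      # refine the peak location
        a, b, c = np.log(spectrum[i - 1:i + 2])
        i = i + 0.5 * (a - c) / (a - 2 * b + c)
    return fs * i / len(windowed)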
Example #18
def py_webrtcvad(data, fs, fs_vad, hoplength=30, vad_mode=0):
    """ Voice activity detection.
    This was implemented for easier use of py-webrtcvad.
    Thanks to: https://github.com/wiseman/py-webrtcvad.git
    Parameters
    ----------
    data : ndarray
        numpy array of mono (1 ch) speech data.
        1-d or 2-d; if 2-d, shape must be (1, time_length) or (time_length, 1).
        if data type is int, -32768 < data < 32767.
        if data type is float, -1 < data < 1.
    fs : int
        Sampling frequency of data.
    fs_vad : int
        Sampling frequency for webrtcvad.
        fs_vad must be 8000, 16000, 32000 or 48000.
    hoplength : int, optional
        Step size [ms].
        hoplength must be 10, 20, or 30.
        Default is 30.
    vad_mode : int, optional
        Set vad aggressiveness.
        As vad_mode increases, it becomes more aggressive.
        vad_mode must be 0, 1, 2 or 3.
        Default is 0.
    Returns
    -------
    vact : ndarray
        Voice activity, with the same time length as the input data:
        0 is unvoiced, 1 is voiced.
    """
    import webrtcvad
    from librosa.core import resample
    from librosa.util import frame

    # check argument
    if fs_vad not in [8000, 16000, 32000, 48000]:
        raise ValueError('fs_vad must be 8000, 16000, 32000 or 48000.')

    if hoplength not in [10, 20, 30]:
        raise ValueError('hoplength must be 10, 20, or 30.')

    if vad_mode not in [0, 1, 2, 3]:
        raise ValueError('vad_mode must be 0, 1, 2 or 3.')

    # check data
    if data.dtype.kind == 'i':
        if data.max() > 2**15 - 1 or data.min() < -2**15:
            raise ValueError(
                'when data type is int, data must be -32768 < data < 32767.')
        data = data.astype('f')

    elif data.dtype.kind == 'f':
        if np.abs(data).max() >= 1:
            data = data / np.abs(data).max() * 0.9
            print('input data was rescaled.')
            #warnings.warn('input data was rescaled.')
        data = (data * 2**15).astype('f')
    else:
        raise ValueError('data dtype must be int or float.')

    data = data.squeeze()
    if not data.ndim == 1:
        raise ValueError('data must be mono (1 ch).')

    # resampling
    if fs != fs_vad:
        resampled = resample(data, fs, fs_vad)
    else:
        resampled = data

    resampled = resampled.astype('int16')

    hop = fs_vad * hoplength // 1000
    framelen = resampled.size // hop + 1
    padlen = framelen * hop - resampled.size
    paded = np.lib.pad(resampled, (0, padlen), 'constant', constant_values=0)
    framed = frame(paded, frame_length=hop, hop_length=hop).T    
    
    vad = webrtcvad.Vad()
    vad.set_mode(vad_mode)
    valist = [vad.is_speech(tmp.tobytes(), fs_vad) for tmp in framed]

    hop_origin = fs * hoplength // 1000
    va_framed = np.zeros([len(valist), hop_origin])
    va_framed[valist] = 1

    return va_framed.reshape(-1)[:data.size]
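
A hedged usage sketch (requires the py-webrtcvad package; a pure tone is
only a stand-in for speech, so the detected ratio is indicative at best):

import numpy as np

fs = 16000
noise = 0.01 * np.random.randn(fs)
tone = 0.5 * np.sin(2 * np.pi * 200 * np.arange(fs) / fs)
data = np.concatenate([noise, tone]).astype('float32')

vact = py_webrtcvad(data, fs=fs, fs_vad=16000, hoplength=30, vad_mode=3)
print(vact.shape, vact.mean())   # per-sample 0/1 voice activity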
Example #19
def vad(data, fs, fs_vad=16000, hop_length=30, vad_mode=0):
    """ Voice activity detection.
    This was implementioned for easier use of py-webrtcvad.
    Parameters
    ----------
    data : ndarray
        numpy array of mono (1 ch) speech data.
        1-d or 2-d, if 2-d, shape must be (1, time_length) or (time_length, 1).
        if data type is int, -32768 < data < 32767.
        if data type is float, -1 < data < 1.
    fs : int
        Sampling frequency of data.
    fs_vad : int, optional
        Sampling frequency for webrtcvad.
        fs_vad must be 8000, 16000, 32000 or 48000.
        Default is 16000.
    hop_length : int, optional
        Step size[milli second].
        hop_length must be 10, 20, or 30.
        Default is 0.1.
    vad_mode : int, optional
        set vad aggressiveness.
        As vad_mode increases, it becomes more aggressive.
        vad_mode must be 0, 1, 2 or 3.
        Default is 0.

    Returns
    -------
    vact : ndarray
        voice activity. time length of vact is same as input data.
        If 0, it is unvoiced, 1 is voiced.
    """

    # check argument
    if fs_vad not in [8000, 16000, 32000, 48000]:
        raise ValueError('fs_vad must be 8000, 16000, 32000 or 48000.')

    if hop_length not in [10, 20, 30]:
        raise ValueError('hop_length must be 10, 20, or 30.')

    if vad_mode not in [0, 1, 2, 3]:
        raise ValueError('vad_mode must be 0, 1, 2 or 3.')

    # check data
    if data.dtype.kind == 'i':
        if data.max() > 2**15 - 1 or data.min() < -2**15:
            raise ValueError(
                'When data.type is int, data must be -32768 < data < 32767.')
        data = data.astype('f') / 2.0**15

    elif data.dtype.kind == 'f':
        if np.abs(data).max() > 1:
            # librosa.load() output can be slightly above 1.0,
            # so rescale instead of raising
            data = MinMaxScaler(
                (-1, 1)).fit_transform(data.reshape(-1, 1)).reshape(-1)
        data = data.astype('f')

    else:
        raise ValueError('data.dtype must be int or float.')

    data = data.squeeze()
    if not data.ndim == 1:
        raise ValueError('data must be mono (1 ch).')

    # resampling
    if fs != fs_vad:
        resampled = resample(data, fs, fs_vad)
        if np.abs(resampled).max() > 1.0:
            resampled *= (0.99 / np.abs(resampled).max())
            warn('Resampling causes data clipping. data was rescaled.')

    else:
        resampled = data

    resampled = (resampled * 2.0**15).astype('int16')

    hop = fs_vad * hop_length // 1000
    framelen = resampled.size // hop + 1
    padlen = framelen * hop - resampled.size
    paded = np.lib.pad(resampled, (0, padlen), 'constant', constant_values=0)
    framed = frame(paded, frame_length=hop, hop_length=hop).T

    vad = webrtcvad.Vad()
    vad.set_mode(vad_mode)
    valist = [vad.is_speech(tmp.tobytes(), fs_vad) for tmp in framed]

    hop_origin = fs * hop_length // 1000
    va_framed = np.zeros([len(valist), hop_origin])
    va_framed[valist] = 1

    return va_framed.reshape(-1)[:data.size]
Example #20
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import h5py
from librosa.util import frame
import numpy as np

torch.manual_seed(0)
h = h5py.File('data_nocab_norm.h5','r')

X = h['x'][:]
Y = h['y'][:]

x = X[0:100000]/2+1
y = Y[0:100000]/2+1

x = torch.from_numpy(frame(x, 41, 1).T).type(torch.FloatTensor)
y = torch.from_numpy(y[40:]).type(torch.FloatTensor)

hidden_size = 128

class RNN_Net(nn.Module):
    def __init__(self):
        super(RNN_Net, self).__init__()
        self.gru = nn.GRU(1, hidden_size, batch_first = True)
        self.act = nn.PReLU()
        self.fc = nn.Linear(hidden_size, 1)
    
    def forward(self, x, hidden):
        output, hidden = self.gru(x, hidden)
        output = output[:, -1, :]   # keep only the last time step
        output = self.fc(output)
        return output, hidden
Example #21
import csv

from librosa.util import frame
from librosa.core import load

meta_files = [
    'iemocap2.txt', 'aibo2.txt', 'emodb2.txt', 'enterface2.txt', 'ldc2.txt'
]
#meta_files = ['tiny_dataset.txt']
frame_ms = 25
sliding_ms = 10
n_frames = 0
n_files = 0
for meta_file in meta_files:
    with open(meta_file) as f:
        for line in csv.DictReader(f, dialect='excel-tab'):
            filename = line.get('n_train_data.name')
            time_series, sr = load(filename)
            sr_ms = sr / 1000
            frames = frame(time_series,
                           frame_length=int(sr_ms * frame_ms),
                           hop_length=int(sr_ms * sliding_ms))
            n_frames += frames.shape[1]
            n_files += 1

print(f'Files: {n_files}')
print(f'Frames: {n_frames}')
Example #22
def stft(y,
         n_fft=2048,
         hop_length=None,
         win_length=None,
         window=None,
         center=True,
         dtype=np.complex64):
    import scipy
    import six
    from librosa import util
    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft
        #win_length = tf.constant(n_fft)

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)
        #hop_length = win_length/4
        #hop_length.to_int64()

    if window is None:
        # Default is an asymmetric Hann window
        fft_window = scipy.signal.hann(win_length, sym=False)
        #fft_window = tf.constant(scipy.signal.hann(convertTFtoNP(win_length), sym=False))

    elif six.callable(window):
        # User supplied a window function

        fft_window = window(win_length)

    else:
        # User supplied a window vector.
        # Make sure it's an array:
        fft_window = np.asarray(window)

        # validate length compatibility
        # if fft_window.size != n_fft:
        #     raise ParameterError('Size mismatch between n_fft and len(window)')

    # Pad the window out to n_fft size
    fft_window = util.pad_center(fft_window, n_fft)
    #fft_window.assign(util.pad_center(convertTFtoNP(fft_window), n_fft))

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))
    #tf.reshape(fft_window, (-1,1))

    # Pad the time series so that frames are centered
    if center:
        util.valid_audio(y)
        y_ = np.pad(convertTFtoNP(y), int(n_fft // 2), mode='reflect')
    #    padding = int(n_fft // 2)
    #    y_frames = tf.pad(y, [[padding, padding],[padding,padding]], mode='REFLECT')

    # Window the time series.
    y_frames = util.frame(y_, frame_length=n_fft, hop_length=hop_length)
    #y_frames.assign(util.frame(convertTFtoNP(y_frames), frame_length=n_fft, hop_length=hop_length))

    # Pre-allocate the STFT matrix
    stft_matrix = np.empty((int(1 + n_fft // 2), y_frames.shape[1]),
                           dtype=dtype,
                           order='F')
    #stft_matrix = tf.zeros((int(1 + n_fft // 2), y_frames.get_shape()[1]._value),
    #                      dtype=dtype,
    #                      order='F')

    # how many columns can we fit within MAX_MEM_BLOCK?
    n_columns = int(util.MAX_MEM_BLOCK /
                    (stft_matrix.shape[0] * stft_matrix.itemsize))

    #n_columns = int(util.MAX_MEM_BLOCK / (stft_matrix.get_shape()[0]._value *
    #                                      convertTFtoNP(stft_matrix).itemsize))

    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        #for bl_s in range(0, stft_matrix.get_shape()[1]._value, n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
        #bl_t = min(bl_s + n_columns, stft_matrix.get_shape()[1]._value)
        # RFFT and Conjugate here to match phase from DPWE code
        stft_matrix[:, bl_s:bl_t] = scipy.fftpack.fft(
            fft_window * y_frames[:, bl_s:bl_t],
            axis=0)[:stft_matrix.shape[0]].conj()
        #tf.scatter_update(stft_matrix, tf.constant(range(bl_s,bl_t)), tf.conj(tf.slice(tf.fft(
        #                                    fft_window * tf.slice(
        #                                    y_frames, [0,bl_s],[y_frames.get_shape()[0]._value,bl_t-bl_s])),
        #                                    [0],[stft_matrix.get_shape()[0]._value])))

    return stft_matrix
Example #23
    # Get the type of the signal file
    file_name = args.signal.split("/")[-1]
    file_format = file_name.split(".")[1]

    # Load signal - for now, only works with wav or numpy files
    if file_format == "npy":
        signal = np.load(args.signal)
    else:
        (rate, sig) = wavefile.load(args.signal)
        signal = sig[0]

    # Frame and compute MFCCs (ms * 16 = samples; for now, only a 16 kHz
    # sampling rate can be used)
    S = np.transpose(
        frame(signal, int(args.frame_len * 16), int(args.hop_len * 16)))
    X = list(map(lambda s: feature_extractor(s, 16000), S))
    X = np.array(np.swapaxes(X, 1, 2))
    # Compress to float16 to save memory; 16-bit MFCCs were also used in the
    # training of current_best.h5
    X = X.astype(np.float16)
    num_timesteps = X.shape[1]

    # ===============================================
    #           Embedding extraction
    # ===============================================

    emb_model = load_model(args.model,
                           custom_objects={
                               'VLAD': VLAD,
Example #24
def stft(y,
         n_fft=2048,
         hop_length=None,
         win_length=None,
         window='hann',
         center=True,
         dtype=np.complex64,
         pad_mode='reflect'):
    """Short-time Fourier transform (STFT)

    Returns a complex-valued matrix D such that
        `np.abs(D[f, t])` is the magnitude of frequency bin `f`
        at frame `t`

        `np.angle(D[f, t])` is the phase of frequency bin `f`
        at frame `t`

    Parameters
    ----------
    y : np.ndarray [shape=(n,)], real-valued
        the input signal (audio time series)

    n_fft : int > 0 [scalar]
        FFT window size

    hop_length : int > 0 [scalar]
        number of audio samples between STFT columns.
        If unspecified, defaults to `win_length // 4`.

    win_length  : int <= n_fft [scalar]
        Each frame of audio is windowed by `window()`.
        The window will be of length `win_length` and then padded
        with zeros to match `n_fft`.

        If unspecified, defaults to ``win_length = n_fft``.

    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        - a window specification (string, tuple, or number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.hanning`
        - a vector or array of length `n_fft`

        .. see also:: `filters.get_window`

    center      : boolean
        - If `True`, the signal `y` is padded so that frame
          `D[:, t]` is centered at `y[t * hop_length]`.
        - If `False`, then `D[:, t]` begins at `y[t * hop_length]`

    dtype       : numeric type
        Complex numeric type for `D`.  Default is 64-bit complex.

    pad_mode : string
        If `center=True`, the padding mode to use at the edges of the signal.
        By default, STFT uses reflection padding.


    Returns
    -------
    D : np.ndarray [shape=(1 + n_fft/2, t), dtype=dtype]
        STFT matrix


    See Also
    --------
    istft : Inverse STFT

    ifgram : Instantaneous frequency spectrogram

    np.pad : array padding

    Notes
    -----
    This function caches at level 20.


    Examples
    --------

    >>> y, sr = librosa.load(librosa.util.example_audio_file())
    >>> D = np.abs(librosa.stft(y))
    >>> D
    array([[2.58028018e-03, 4.32422794e-02, 6.61255598e-01, ...,
            6.82710262e-04, 2.51654536e-04, 7.23036574e-05],
           [2.49403086e-03, 5.15930466e-02, 6.00107312e-01, ...,
            3.48026224e-04, 2.35853557e-04, 7.54836728e-05],
           [7.82410789e-04, 1.05394892e-01, 4.37517226e-01, ...,
            6.29352580e-04, 3.38571583e-04, 8.38094638e-05],
           ...,
           [9.48568513e-08, 4.74725084e-07, 1.50052492e-05, ...,
            1.85637656e-08, 2.89708542e-08, 5.74304337e-09],
           [1.25165826e-07, 8.58259284e-07, 1.11157215e-05, ...,
            3.49099771e-08, 3.11740926e-08, 5.29926236e-09],
           [1.70630571e-07, 8.92518756e-07, 1.23656537e-05, ...,
            5.33256745e-08, 3.33264900e-08, 5.13272980e-09]], dtype=float32)


    Use left-aligned frames, instead of centered frames

    >>> D_left = np.abs(librosa.stft(y, center=False))


    Use a shorter hop length

    >>> D_short = np.abs(librosa.stft(y, hop_length=64))


    Display a spectrogram

    >>> import matplotlib.pyplot as plt
    >>> librosa.display.specshow(librosa.amplitude_to_db(D,
    ...                                                  ref=np.max),
    ...                          y_axis='log', x_axis='time')
    >>> plt.title('Power spectrogram')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.tight_layout()

    """

    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    # Use a Vorbis window in place of the standard get_window call
    #fft_window = get_window(window, win_length, fftbins=True)
    fft_window = vorbis(win_length)

    # Pad the window out to n_fft size
    fft_window = util.pad_center(fft_window, n_fft)

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))

    # Check audio is valid
    util.valid_audio(y)

    # Pad the time series so that frames are centered
    if center:
        y = np.pad(y, int(n_fft // 2), mode=pad_mode)

    # Window the time series.
    y_frames = util.frame(y, frame_length=n_fft, hop_length=hop_length)

    # Pre-allocate the STFT matrix
    stft_matrix = np.empty((int(1 + n_fft // 2), y_frames.shape[1]),
                           dtype=dtype,
                           order='F')

    # how many columns can we fit within MAX_MEM_BLOCK?
    n_columns = int(util.MAX_MEM_BLOCK /
                    (stft_matrix.shape[0] * stft_matrix.itemsize))

    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])

        stft_matrix[:,
                    bl_s:bl_t] = fft.fft(fft_window * y_frames[:, bl_s:bl_t],
                                         axis=0)[:stft_matrix.shape[0]]

    return stft_matrix
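
vorbis() is defined elsewhere in this source; a hedged sketch using the
standard Vorbis/MDCT window, w(n) = sin((pi/2) * sin^2(pi * (n + 0.5) / N)):

import numpy as np

def vorbis(N):
    n = np.arange(N)
    return np.sin(0.5 * np.pi * np.sin(np.pi * (n + 0.5) / N) ** 2)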
Example #25
        "5544574287152993687.mp4": [],
        "5544620672795594434.mp4": [],
        "5547193787702629969.mp4": [],
        "5549784941472309008.mp4": [],
        "5552368364300855101.mp4": [],
        "5555325449284154780.mp4": [],
        "5555360238519252381.mp4": []
    }
    datapath = f"./data/{d}"
    wavs = [f.name for f in os.scandir(datapath) if f.name.endswith(".wav")]
    wavs.sort()
    for wavfile in wavs:
        print(f"Diarizing file {wavfile} now.")
        (rate, sig) = wavefile.load(f"{datapath}/{wavfile}")
        signal = sig[0]
        S = np.transpose(frame(signal, int(2000 * 16), int(500 * 16)))  # 2 s frames, 0.5 s hop at 16 kHz
        X = list(map(lambda s: fe(s, 16000), S))
        X = np.array(np.swapaxes(X, 1, 2))
        X = X.astype(np.float16)
        num_timesteps = X.shape[1]

        if num_timesteps != 201:
            emb_model.layers.pop(0)
            new_input = Input(batch_shape=(None, num_timesteps, 30))
            new_output = emb_model(new_input)
            emb_model = Model(new_input, new_output)

        embs = emb_model.predict(X)
        try:
            SD.cluster(rounds=10,
                       clust_range=[2, 8],
Example #26
import librosa
import numpy as np
import librosa.util as util
from librosa.filters import get_window

audio_path = "../AudioData/audio/D4_750.wav"
noise_path = "../AudioData/noise/Pink Noise.wav"
# Load the audio file
y, sr = librosa.load(audio_path)

# Split the audio file into frames
win_len = n_fft = 200
hop_length = 80
# Pad the time series so that frames are centered
y = np.pad(y, int(n_fft // 2), mode='reflect')
# Window the time series.
y_frames = util.frame(y, frame_length=n_fft, hop_length=hop_length, axis=0)

# Get the window coefficients
fft_window = get_window('hamm', 10, fftbins=False)
# fft_window = fft_window[1:-1]
print(fft_window)
fft_window = get_window('hamm', 10, fftbins=True)
print(fft_window)
# Pad the window out to n_fft size
fft_window = util.pad_center(fft_window, n_fft)
# Reshape so that the window can be broadcast
fft_window = fft_window.reshape((-1, 1))

Example #27
    def hht(self,
            y,
            hop_length=None,
            win_length=None,
            center=True,
            dtype=np.complex64,
            pad_mode='reflect'):
        """Hilbert-Huang transform (HHT)

        Parameters
        ----------
        y : np.ndarray [shape=(n,)], real-valued
            the input signal (audio time series)

        hop_length : int > 0 [scalar]
            number of audio samples between HHT columns.
            If unspecified, defaults to `win_length // 2`.

        win_length  : int <= n_hht [scalar]
            Each frame of audio is windowed by `window()`.
            The window will be of length `win_length` and then padded
            with zeros to match `n_hht`.

            If unspecified, defaults to ``win_length = n_hht``.

        center      : boolean
            - If `True`, the signal `y` is padded so that frame
              `D[:, t]` is centered at `y[t * hop_length]`.
            - If `False`, then `D[:, t]` begins at `y[t * hop_length]`

        dtype       : numeric type
            Complex numeric type for `D`.  Default is 64-bit complex.

        pad_mode : string
            If `center=True`, the padding mode to use at the edges of the signal.
            By default, HHT uses reflection padding.

        Returns
        -------
        hht_matrix : np.ndarray [shape=(27, t), dtype=dtype]
        bjp_matrix : np.ndarray [shape=(n_hht-1, t), dtype=dtype]

        """

        # By default, use the entire frame
        if win_length is None:
            win_length = self.n_hht

        # Set the default hop, if it's not already specified
        if hop_length is None:
            hop_length = int(win_length / 2)

        hht_window = self.window

        # Pad the window out to n_hht size
        hht_window = util.pad_center(hht_window, self.n_hht)

        # Reshape so that the window can be broadcast
        hht_window = hht_window.reshape((-1, 1))

        # Check audio is valid
        util.valid_audio(y)

        # Pad the time series so that frames are centered
        if center:
            y = np.pad(y, self.n_hht - 1, mode=pad_mode)

        # Window the time series.
        y_frames = util.frame(y,
                              frame_length=self.n_hht,
                              hop_length=hop_length).T

        # Pre-allocate the HHT matrix
        hht_matrix = np.empty((27, y_frames.shape[0]), dtype=dtype, order='F')

        bjp_matrix = np.empty((self.n_hht - 1, y_frames.shape[0]),
                              dtype=dtype,
                              order='F')

        for bl_s in range(hht_matrix.shape[1]):
            frame_signal = hht_window[:, 0] * y_frames[bl_s, :]
            A, f, bjp = get_hht(frame_signal, self.fs)
            hht_matrix[:, bl_s] = self.hht_based_feature(A, f * self.fs, bjp)
            bjp_matrix[:, bl_s] = bjp

        return hht_matrix, bjp_matrix