def segment(self, example, exclude_keys=None):
    """Split the audio entries of `example` into fixed-length segments.

    Every entry named in ``example['audio_keys']`` (minus the excluded
    ones) is truncated to ``example[NUM_SAMPLES]`` samples along its last
    axis and cut into non-overlapping segments of
    ``self.opts.time_segments`` samples (``end='cut'`` is passed to
    `segment_axis`).

    Note that `example` itself is modified: the segmented arrays and the
    new ``NUM_SAMPLES`` value are written back into it before it is
    copied.

    Args:
        example: Example dict containing ``'audio_keys'``, ``NUM_SAMPLES``
            and one array per audio key.
        exclude_keys: A single key or a list of keys from
            ``example['audio_keys']`` that must not be segmented.
            ``None`` excludes nothing.

    Returns:
        A shuffled list with one deep copy of `example` per segment. In
        each copy the segmented audio keys hold only that segment.

    Raises:
        FilterException: If no complete segment could be extracted.
        AssertionError: If the audio keys yield different segment counts.
    """
    if exclude_keys is None:
        exclude_keys = []
    elif isinstance(exclude_keys, str):
        exclude_keys = [exclude_keys]

    segment_len = shift = self.opts.time_segments
    num_samples = example[NUM_SAMPLES]
    audio_keys = [
        key for key in example['audio_keys'] if key not in exclude_keys
    ]
    for key in audio_keys:
        example[key] = segment_axis(
            example[key][..., :num_samples], segment_len,
            shift=shift, axis=-1, end='cut')

    # Number of segments per key (second to last axis after segmentation).
    lengths = [example[key].shape[-2] for key in audio_keys]
    # All keys must yield the same number of segments. Compare against the
    # first entry: indexing with -2 raised an IndexError whenever only a
    # single audio key was segmented.
    assert lengths.count(lengths[0]) == len(lengths), dict(
        zip(audio_keys, lengths))
    length = lengths[0]
    if length == 0:
        # Local import as in the original code; the exception is raised so
        # that the example gets filtered out.
        from lazy_dataset.core import FilterException
        print('was to short')
        raise FilterException

    example[NUM_SAMPLES] = self.opts.time_segments
    out_list = []
    for idx in range(length):
        new_example = deepcopy(example)
        for key in audio_keys:
            new_example[key] = new_example[key][..., idx, :]
        out_list.append(new_example)
    shuffle(out_list)
    return out_list
def __call__(self, sig):
    """Run the energy-based voice activity detection on `sig`.

    The signal is cut into frames of ``self.frame_size`` with a shift of
    ``self.frame_shift``. A frame counts as active when its energy (sum of
    squared samples) reaches ``self.threshold``. The frame-wise decision
    is then passed through ``activity_frame_to_time`` and, unless
    ``self.len_smooth_win`` is 0, through ``smooth_voice_activity``.

    Args:
        sig: Signal to analyse; it is framed along the default axis of
            `segment_axis`.

    Returns:
        The activity estimate produced by the steps described above.
    """
    frames = segment_axis(sig, self.frame_size, self.frame_shift)
    frame_energy = np.sum(frames ** 2, axis=-1)
    detected = self.activity_frame_to_time(frame_energy >= self.threshold)
    if self.len_smooth_win == 0:
        # Smoothing is disabled.
        return detected
    return self.smooth_voice_activity(detected)
def smooth_voice_activity(self, activity):
    """Smooth an activity mask with a sliding window along the last axis.

    A copy of `activity` is edge-padded by ``self.len_smooth_win // 2``
    values on both sides of the last axis. Windows of length
    ``self.len_smooth_win`` with shift 1 are summed, and every position
    whose window sum is at least ``self.len_smooth_win // 2`` is set to 1,
    all others to 0.

    Args:
        activity: Array with the activity decisions; it is not modified.

    Returns:
        A smoothed copy of `activity` with the same dtype.
    """
    smoothed = activity.copy()
    half_win = self.len_smooth_win // 2
    # Pad only the last axis, all leading axes stay untouched.
    pad_spec = [(0, 0)] * (smoothed.ndim - 1) + [(half_win, half_win)]
    windows = segment_axis(
        np.pad(smoothed, pad_spec, 'edge'),
        self.len_smooth_win, 1, end='pad',
    )
    active_count = np.sum(windows, axis=-1)
    smoothed[active_count >= half_win] = 1
    smoothed[active_count < half_win] = 0
    return smoothed
def activity_frame_to_time(self, frame_wise_activity):
    """Expand a frame-wise activity into a time-domain activity mask.

    Each frame covers ``self.frame_size`` positions and consecutive frames
    are ``self.frame_shift`` positions apart. A position of the result is
    ``True`` if at least one frame with a value greater than 0 covers it.

    Args:
        frame_wise_activity: Array-like whose last axis enumerates frames.

    Returns:
        Boolean array whose last axis has length
        ``num_frames * frame_shift + frame_size - frame_shift``.
    """
    frames = np.asarray(frame_wise_activity)
    # Repeat every frame decision for all positions inside the frame.
    per_position = np.broadcast_to(
        frames[..., None], frames.shape + (self.frame_size,))

    num_frames = per_position.shape[-2]
    len_time_sig = (
        num_frames * self.frame_shift + self.frame_size - self.frame_shift
    )
    time_activity = np.zeros(per_position.shape[:-2] + (len_time_sig,))

    # With end=None, segment_axis is expected to return a view into
    # time_activity, so the assignment below marks the covered positions in
    # time_activity itself.
    frame_view = segment_axis(
        time_activity, self.frame_size, self.frame_shift, end=None)
    frame_view[per_position > 0] = 1
    return time_activity != 0
def smooth_vad(vad_pred, threshold=0.1, window=25, divisor=1):
    """Binarize a VAD prediction and smooth it along the last axis.

    A copy of `vad_pred` is binarized first (values above `threshold`
    become 1, everything that is then below 1 becomes 0). The binary
    signal is edge-padded by ``window // 2`` on both sides, windows of
    length `window` with shift 1 are summed, and a position is set to 1
    when its window sum reaches ``(window // 2) // divisor``, otherwise
    to 0.

    Args:
        vad_pred: Array with VAD scores; it is not modified.
        threshold: Scores above this value count as active.
        window: Length of the smoothing window.
        divisor: Integer by which the required count ``window // 2`` is
            divided; larger values make the smoothing more permissive.

    Returns:
        Smoothed binary copy of `vad_pred`.

    >>> vad_pred = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.2, 0.1])
    >>> smooth_vad(vad_pred, window=3, divisor=1, threshold=0.3)
    array([0., 0., 1., 1., 1., 1., 1., 1., 0.])
    >>> smooth_vad(vad_pred, window=5, divisor=1, threshold=0.5)
    array([0., 0., 0., 0., 1., 1., 1., 1., 0.])
    >>> smooth_vad(vad_pred, window=5, divisor=2, threshold=0.5)
    array([0., 0., 0., 1., 1., 1., 1., 1., 1.])
    >>> smooth_vad(vad_pred[None, None], window=5, divisor=2, threshold=0.5)
    array([[[0., 0., 0., 1., 1., 1., 1., 1., 1.]]])
    """
    binary = vad_pred.copy()
    binary[binary > threshold] = 1.
    binary[binary < 1] = 0.

    half_window = window // 2
    # Pad only the last axis.
    pad_spec = [(0, 0)] * (binary.ndim - 1) + [(half_window, half_window)]
    windows = segment_axis(
        np.pad(binary, pad_spec, 'edge'), window, 1, end='pad')
    active_count = np.sum(windows, axis=-1)

    min_count = half_window // divisor
    binary[active_count >= min_count] = 1
    binary[active_count < min_count] = 0
    return binary
def segment(
        x: Union[list, np.ndarray, torch.Tensor],
        length: int,
        shift: int = None,
        anchor: Union[str, int] = 'left',
        axis: int = -1,
        mode: str = 'constant',
        padding: bool = False,
        rng=np.random
):
    """
    Segments a signal `x` along an axis. Either with a predefined anchor for
    the segment boundaries if `anchor` is an integer or with an internally
    calculated anchor if `anchor` is a string.

    Args:
        x: signal to be segmented, either a torch.Tensor, a numpy.ndarray
            or a list (which is converted to a numpy array)
        length: segment length
        shift: shift between segments, defaults to length
        anchor: anchor from which the segmentation boundaries are
            calculated. If it is a string, `get_anchor` is called to
            calculate an integer using `anchor` as anchor mode definition.
        axis: axis over which to segment
        mode: used in `_get_segment_length_for_mode`; has to be one of
            `possible_segment_modes`
        padding: May only be `True` if `anchor` is `0` or `left` since
            padding is only applied to the end of the signal.
            This may be the right choice for evaluation. If `False`, the
            residual values are discarded.
        rng: random number generator (`numpy.random`), forwarded to
            `get_anchor`

    Returns:
        The segments of `x`; the segment axis is moved to the front
        (axis 0).

    >>> np.random.seed(3)
    >>> segment(np.arange(0, 15), 10, 3, anchor='left')
    array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
           [ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12]])
    >>> segment(np.arange(0, 15), 10, 3, anchor='random')
    array([[ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11],
           [ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14]])
    >>> segment(np.arange(0, 15), 10, 3, anchor=5)
    array([[ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11],
           [ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14]])
    """
    if padding:
        # No padding is implemented for the beginning of a signal
        assert anchor in [0, 'left'], (padding, anchor)
        end = 'pad'
    else:
        end = 'cut'

    if x.__class__.__module__ == 'numpy':
        ndim = x.ndim
        moveaxis = np.moveaxis
    elif x.__class__.__module__ == 'torch':
        ndim = x.dim()
        # Probe for the feature instead of parsing the version string with
        # distutils' LooseVersion: distutils was removed in Python 3.12.
        # `torch.movedim` is the function the original code selected for
        # torch >= 1.7.0.
        if hasattr(torch, 'movedim'):
            moveaxis = torch.movedim
        else:
            # moveaxis code taken from
            # https://github.com/pytorch/pytorch/issues/36048
            def moveaxis(tensor: torch.Tensor, source: int,
                         destination: int) -> torch.Tensor:
                dim = tensor.dim()
                perm = list(range(dim))
                if destination < 0:
                    destination += dim
                perm.pop(source)
                perm.insert(destination, source)
                return tensor.permute(*perm)
    elif isinstance(x, list):
        x = np.array(x)
        ndim = x.ndim
        moveaxis = np.moveaxis
    else:
        raise TypeError('Unknown type for input signal x', type(x))

    # Normalize negative axis indices; the moveaxis fallback relies on it.
    axis = axis % ndim
    num_samples = x.shape[axis]
    assert num_samples >= length, (num_samples, length)
    assert mode in possible_segment_modes, (
        'Unknown length mode. Length mode has to be chosen'
        'from', possible_segment_modes, 'and is', mode
    )
    length, shift, num_samples = _get_segment_length_for_mode(
        num_samples, length, shift, mode)
    assert shift > 0, shift
    if isinstance(anchor, str):
        anchor = get_anchor(num_samples, length, shift, mode=anchor, rng=rng)
    assert isinstance(anchor, int), (anchor, type(anchor))
    start = anchor % shift

    # slice the array to remove samples discarded with the specified anchor
    slc = [slice(None)] * ndim
    slc[axis] = slice(start, None)
    x = x[tuple(slc)]
    return moveaxis(
        segment_axis(x, length, shift, end=end, axis=axis), axis, 0)
def eval_estimator(db_json, scenario, ref_node_id, vad_threshold,
                   activity_threshold):
    """Evaluate the sampling time offset (STO) estimation on one scenario.

    For every example of the selected data set and every node other than
    the reference node (node ids 0..3 are considered), the signals are
    coarsely aligned, the sampling rate offset (SRO) is estimated and
    compensated, time shifts are estimated, and finally the STO is
    estimated from the time shifts and the source distances. The
    estimation error of each node and the RMSE over all estimates are
    printed.

    Args:
        db_json: Database description passed to `AsyncWASN`.
        scenario: One of 'Scenario-1', 'Scenario-2', 'Scenario-3' or
            'Scenario-4'.
        ref_node_id: Id of the node used as reference.
        vad_threshold: Threshold passed to `VoiceActivityDetector`.
        activity_threshold: A frame is kept only if the summed activity
            inside the frame exceeds this value for both signals.
    """
    msg = ('scenario must be "Scenario-1", "Scenario-2", '
           '"Scenario-3" or "Scenario-4"')
    scenarios = ['Scenario-1', 'Scenario-2', 'Scenario-3', 'Scenario-4']
    assert scenario in scenarios, msg
    loader_names = dict(zip(scenarios, (
        'get_data_set_scenario_1',
        'get_data_set_scenario_2',
        'get_data_set_scenario_3',
        'get_data_set_scenario_4',
    )))
    if scenario in scenarios:
        db = getattr(AsyncWASN(db_json), loader_names[scenario])()

    sro_estimator = DynamicWACD()
    voice_activity_detector = VoiceActivityDetector(vad_threshold)

    # Three non-reference nodes are evaluated per example.
    errors = np.zeros(3 * len(db))
    for ex_id, example in enumerate(db):
        print(f'Process example {example["example_id"].split("_")[-1]}')
        all_dists = get_distances(example)
        audio_paths = example['audio_path']
        ref_sig = load_audio(audio_paths[f'node_{ref_node_id}'])
        other_nodes = [i for i in range(4) if i != ref_node_id]
        for cnt, node_id in enumerate(other_nodes):
            err_idx = 3 * ex_id + cnt
            sig = load_audio(audio_paths[f'node_{node_id}'])

            # Align the signals coarsely
            sig_sync, ref_sig_sync, offset = coarse_sync(
                sig, ref_sig, len_sync=320000)

            # Estimate the sampling rate offset (SRO)
            activity_sig = voice_activity_detector(sig_sync)
            activity_ref_sig = voice_activity_detector(ref_sig_sync)
            sro_est = sro_estimator(
                sig_sync, ref_sig_sync, activity_sig, activity_ref_sig)

            # Compensate for the SRO
            sig_sync = compensate_sro(sig_sync, sro_est)
            ref_sig_sync = ref_sig_sync[:len(sig_sync)]

            # Estimate the time shifts and distances
            sig_shifts = est_time_shift(sig_sync, ref_sig_sync, 16384, 2048)
            if offset > 0:
                dists = all_dists[int(np.round(offset)):, node_id]
                dists_ref = all_dists[:, ref_node_id]
            else:
                dists = all_dists[:, node_id]
                dists_ref = all_dists[int(np.round(-offset)):, ref_node_id]
            # One index per estimated time shift: 8192 plus a multiple of
            # the frame shift 2048 (presumably the centre of each
            # 16384-sample frame).
            frame_ids = 8192 + np.asarray(
                [i*2048 for i in range(len(sig_shifts))])
            dists = dists[frame_ids]
            dists_ref = dists_ref[frame_ids]

            # Discard estimates corresponding to periods in time
            # without source activity
            activity_ref_sig = voice_activity_detector(ref_sig_sync)
            activity_ref_sig = (
                segment_axis(activity_ref_sig, 16384, 2048).sum(-1)
                > activity_threshold
            )
            activity_sig = voice_activity_detector(sig_sync)
            activity_sig = (
                segment_axis(activity_sig, 16384, 2048).sum(-1)
                > activity_threshold
            )
            activity_mask = np.logical_and(activity_sig, activity_ref_sig)
            sig_shifts = sig_shifts[activity_mask]
            dists = dists[activity_mask]
            dists_ref = dists_ref[activity_mask]

            # Estimate the sampling time offset (STO)
            sto_est = est_sto(sig_shifts, dists, dists_ref) - offset

            # Calculate the estimation error
            sto_true = (example['sto'][f'node_{node_id}']
                        - example['sto'][f'node_{ref_node_id}'])
            errors[err_idx] = sto_true - sto_est
            print(f'node {node_id}: error = '
                  f'{np.round(errors[err_idx], 2)} samples')
    print(f'\nRMSE = {np.round(np.sqrt(np.mean(errors**2)), 2)} samples')
def stft(
        time_signal,
        size: int = 1024,
        shift: int = 256,
        *,
        # axis=-1,  # I never use this and it complicated the code
        window: [str, typing.Callable] = 'blackman',
        window_length: int = None,
        fading: typing.Optional[typing.Union[bool, str]] = 'full',
        pad: bool = True,
        symmetric_window: bool = False,
):
    """Short time Fourier transform of a torch tensor along its last axis.

    Args:
        time_signal: `torch.Tensor` with the time signal on the last axis.
        size: FFT size. Has to be equal to `window_length`.
        shift: Shift between successive frames in samples.
        window: Window specification forwarded to `_get_window`.
        window_length: Length of the window. ``None`` means `size`; any
            other value than `size` is not supported.
        fading: ``'full'`` or ``True`` pads ``window_length - shift`` zeros
            on both sides, ``'half'`` pads half of that on each side
            (rounded down on the left, rounded up on the right),
            ``False`` or ``None`` disables the padding.
        pad: If true, `segment_axis` pads the end of the signal
            (``end='pad'``), else the remainder is cut (``end='cut'``).
        symmetric_window: Forwarded to `_get_window`.

    Returns:
        `torch_complex.ComplexTensor` with the one-sided spectrum of each
        frame.

    Raises:
        NotImplementedError: If `window_length` differs from `size`.

    >>> import numpy as np
    >>> import random
    >>> from paderbox.transform.module_stft import stft as np_stft, istft as np_istft
    >>> kwargs = dict(
    ...     size=np.random.randint(100, 200),
    ...     shift=np.random.randint(40, 100),
    ...     window=random.choice(['blackman', 'hann', 'hamming']),
    ...     fading=random.choice(['full', 'half', False]),
    ... )
    >>> num_samples = np.random.randint(200, 500)
    >>> a = np.random.rand(num_samples)
    >>> A_np = np_stft(a, **kwargs)
    >>> A_pt = stft(torch.tensor(a), **kwargs)
    >>> np.testing.assert_allclose(
    ...     A_np, A_pt.numpy(), err_msg=str(kwargs), atol=1e-10)
    """
    assert isinstance(time_signal, torch.Tensor)
    if window_length is None:
        window_length = size
    elif window_length != size:
        # The second literal needs the f prefix, otherwise the placeholders
        # are printed verbatim instead of the actual values.
        raise NotImplementedError(
            'Torch does not support window_length != size\n'
            f'window_length = {window_length} != {size} = size')

    # Pad with zeros to have enough samples for the window function to fade.
    assert fading in [None, True, False, 'full', 'half'], (fading, type(fading))
    if fading not in [False, None]:
        if fading == 'half':
            pad_width = [
                (window_length - shift) // 2,
                math.ceil((window_length - shift) / 2),
            ]
        else:
            pad_width = [
                window_length - shift,
                window_length - shift,
            ]
        # A two element pad spec pads the last dimension only.
        time_signal = torch.nn.functional.pad(
            time_signal, pad_width, mode='constant')

    window = _get_window(
        window=window,
        symmetric_window=symmetric_window,
        window_length=window_length,
    )

    time_signal_seg = segment_axis(
        time_signal, window_length, shift=shift, axis=-1,
        end='pad' if pad else 'cut')

    windowed = time_signal_seg * window
    if hasattr(torch, 'rfft'):
        # Legacy API (removed in PyTorch 1.8): returns a real tensor whose
        # last dimension holds the real and imaginary part.
        out = torch.rfft(windowed, 1)
    else:
        # Replacement API: complex output, viewed as (..., 2) real tensor
        # to keep the layout of the legacy call.
        out = torch.view_as_real(torch.fft.rfft(windowed, dim=-1))
    assert out.shape[-1] == 2, out.shape
    return torch_complex.ComplexTensor(out[..., 0], out[..., 1])
def modmfcc(time_signal, sample_rate=16000,
            stft_win_len=400, stft_shift=160, numcep=30,
            number_of_filters=40, stft_size=512,
            lowest_frequency=0, highest_frequency=None,
            preemphasis_factor=0.97, ceplifter=22,
            stft_window=scipy.signal.windows.hamming,
            mod_length=16, mod_shift=8,
            mod_window=scipy.signal.windows.hamming,
            avg_length=1, avg_shift=1):
    """
    Compute Mod-MFCC features from an audio signal.

    The MFCC features of the signal are computed with `mfcc`. Afterwards
    the magnitude of an STFT over axis -2 of the MFCC output is taken.
    If `avg_length` is greater than 1, the result is additionally segmented
    along axis -3 and averaged over the segments.

    The window defaults use `scipy.signal.windows.hamming`; the former
    alias `scipy.signal.hamming` is no longer available in recent SciPy
    releases and would already fail when this function is defined.

    :param time_signal: the audio signal from which to compute features.
        Should be an channels x samples array.
    :param sample_rate: the sample rate of the signal we are working with.
        Default is 16000.
    :param stft_win_len: the length of the analysis window. In samples.
        Default is 400 (25 milliseconds @ 16kHz).
    :param stft_shift: the step between successive windows. In samples.
        Default is 160 (10 milliseconds @ 16kHz).
    :param numcep: the number of cepstrum to return, Default is 30.
    :param number_of_filters: number of filters in the filterbank,
        Default is 40.
    :param stft_size: the FFT size. Default is 512.
    :param lowest_frequency: lowest band edge of mel filters.
        In Hz, Default is 0.
    :param highest_frequency: highest band edge of mel filters.
        In Hz, Default is samplerate/2.
    :param preemphasis_factor: apply preemphasis filter with
        preemphasis_factor as coefficient. 0 is no filter. Default is 0.97.
    :param ceplifter: the liftering coefficient to use.
        ceplifter <= 0 disables lifter. Default is 22.
    :param stft_window: the window function to use for fbank features.
        Default is hamming window.
    :param mod_length: size of the STFT applied to the MFCC features.
        Default is 16.
    :param mod_shift: shift of the STFT applied to the MFCC features.
        Default is 8.
    :param mod_window: window function of the STFT applied to the MFCC
        features. Default is hamming window.
    :param avg_length: segment length used for the final averaging.
        Values <= 1 disable the averaging. Has to be >= avg_shift.
        Default is 1.
    :param avg_shift: shift between the averaged segments. Default is 1.
    :returns: A numpy array with the magnitudes of the modulation spectrum
        of the MFCC features (the exact shape depends on `mfcc` and
        `stft`).
    """
    x = mfcc(time_signal, sample_rate=sample_rate,
             window_length=stft_win_len, window=stft_window,
             stft_shift=stft_shift, stft_size=stft_size,
             number_of_filters=number_of_filters,
             lowest_frequency=lowest_frequency,
             highest_frequency=highest_frequency,
             preemphasis_factor=preemphasis_factor,
             ceplifter=ceplifter, numcep=numcep)

    # Modulation spectrum: STFT over axis -2 of the MFCC output, without
    # fading.
    x = np.abs(
        stft(x, size=mod_length, shift=mod_shift, window=mod_window,
             axis=-2, fading=False))

    assert avg_length >= avg_shift
    if avg_length > 1:
        # segment_axis adds the within-segment axis directly after the
        # segmented axis, hence the same index -3 is averaged afterwards.
        # NOTE(review): relies on that axis layout of segment_axis.
        x = segment_axis(x, length=avg_length, shift=avg_shift,
                         end='pad', axis=-3)
        x = np.mean(x, axis=-3)

    return x
def istft(
        stft_signal,
        size: int=1024,
        shift: int=256,
        *,
        window: [str, typing.Callable]=signal.windows.blackman,
        fading: typing.Optional[typing.Union[bool, str]] = 'full',
        window_length: int=None,
        symmetric_window: bool=False,
        num_samples: int=None,
        pad: bool=True,
):
    """
    Inverse short time Fourier transform (overlap-add synthesis).

    ..note::
        Be careful if you make modifications in the frequency domain (e.g.
        beamforming) because the synthesis window is calculated from the
        unmodified analysis window.

    :param stft_signal: Complex STFT signal with dimensions
        (..., frames, size/2+1).
    :param size: Scalar FFT-size.
    :param shift: Scalar FFT-shift. Typically shift is a fraction of size.
    :param window: Window function handle, forwarded to `_get_window`.
    :param fading: Removes the additional padding, if done during STFT.
        'full' or True removes ``window_length - shift`` samples on both
        sides, 'half' removes half of it on each side, None or False
        removes nothing.
    :param window_length: Sometimes one desires to use a shorter window
        than the fft size. The default is to use the fft-size as a window
        size.
    :param symmetric_window: symmetric or periodic window, forwarded to
        `_get_window`.
    :param num_samples: None or the number of samples that the original
        time signal has. When given, it is checked that the back
        transformed signal has a valid number of samples, and the signal
        is shortened to the original length. Only works when pad is True.
    :param pad: Necessary when num_samples is not None. This argument is
        only needed by the forward transform; here it is used to check
        that num_samples is valid.
    :return: Time signal with dimensions (..., samples).
    :raises ValueError: If num_samples is given while pad is False.
    """
    # Note: frame_axis and frequency_axis would make this function much
    # more complicated
    stft_signal = np.array(stft_signal)
    assert stft_signal.shape[-1] == size // 2 + 1, str(stft_signal.shape)

    if window_length is None:
        window_length = size

    analysis_window = _get_window(
        window=window,
        symmetric_window=symmetric_window,
        window_length=window_length,
    )
    synthesis_window = _biorthogonal_window_fastest(analysis_window, shift)

    total_len = stft_signal.shape[-2] * shift + window_length - shift
    time_signal = np.zeros(stft_signal.shape[:-2] + (total_len,))

    # Get the correct view to time_signal
    frame_view = segment_axis(time_signal, window_length, shift, end=None)

    # The [..., :window_length] is the inverse of the window padding in
    # rfft.
    frames = np.real(irfft(stft_signal, n=size))[..., :window_length]

    # Unbuffered inplace add: overlapping frames accumulate in time_signal
    np.add.at(frame_view, ..., synthesis_window * frames)

    # Compensate fade-in and fade-out
    assert fading in [None, True, False, 'full', 'half'], fading
    if fading not in [None, False]:
        if fading == 'half':
            trim = (window_length - shift) / 2
        else:
            trim = window_length - shift
        stop = time_signal.shape[-1] - ceil(trim)
        time_signal = time_signal[..., int(trim):stop]

    if num_samples is None:
        return time_signal

    if not pad:
        raise ValueError(
            pad,
            'When padding is False in the stft, the signal is cutted.'
            'This operation can not be inverted.',
        )

    assert time_signal.shape[-1] >= num_samples, (time_signal.shape, num_samples)
    assert time_signal.shape[-1] < num_samples + shift, (time_signal.shape, num_samples)
    return time_signal[..., :num_samples]
def stft(
        time_signal,
        size: int = 1024,
        shift: int = 256,
        *,
        axis=-1,
        window: [str, typing.Callable] = signal.windows.blackman,
        window_length: int = None,
        fading: typing.Optional[typing.Union[bool, str]] = 'full',
        pad: bool = True,
        symmetric_window: bool = False,
) -> np.ndarray:
    """
    ToDo: Open points:
     - sym_window need literature
     - fading why it is better?
     - should pad have more degrees of freedom?

    Calculates the short time Fourier transformation of a multi channel multi
    speaker time signal. It is able to add additional zeros for fade-in and
    fade out and should yield an STFT signal which allows perfect
    reconstruction.

    :param time_signal: Multi channel time signal with dimensions
        AA x ... x AZ x T x BA x ... x BZ.
    :param size: Scalar FFT-size.
    :param shift: Scalar FFT-shift, the step between successive frames in
        samples. Typically shift is a fraction of size.
    :param axis: Scalar axis of time. Default: -1 (the last axis).
    :param window: Window function handle. Default is windows.blackman
        window.
    :param fading: Pads the signal with zeros for better reconstruction.
        'full' or True pads ``window_length - shift`` zeros on both sides,
        'half' pads half of it on each side, None or False disables it.
    :param window_length: Sometimes one desires to use a shorter window than
        the fft size. In that case, the window is padded with zeros.
        The default is to use the fft-size as a window size.
    :param pad: If true zero pad the signal to match the shape, else cut
    :param symmetric_window: symmetric or periodic window. Assume window is
        periodic. Since the implementation of the windows in scipy.signal have a
        curious behaviour for odd window_length. Use window(len+1)[:-1]. Since
        is equal to the behaviour of MATLAB.
    :return: Single channel complex STFT signal with dimensions
        AA x ... x AZ x T' times size/2+1 times BA x ... x BZ.
    :raises ValueError: If the windowing or the FFT fails because the
        shapes do not match.
    """
    time_signal = np.asarray(time_signal)

    # Normalize negative axis indices, `axis + 1` is used below.
    axis = axis % time_signal.ndim

    if window_length is None:
        window_length = size

    # Pad with zeros to have enough samples for the window function to fade.
    assert fading in [None, True, False, 'full', 'half'], fading
    if fading not in [False, None]:
        # The builtin int replaces the alias np.int, which no longer exists
        # in current NumPy releases.
        pad_width = np.zeros((time_signal.ndim, 2), dtype=int)
        if fading == 'half':
            pad_width[axis, 0] = (window_length - shift) // 2
            pad_width[axis, 1] = ceil((window_length - shift) / 2)
        else:
            pad_width[axis, :] = window_length - shift
        time_signal = np.pad(time_signal, pad_width, mode='constant')

    window = _get_window(
        window=window,
        symmetric_window=symmetric_window,
        window_length=window_length,
    )

    time_signal_seg = segment_axis(
        time_signal,
        window_length,
        shift=shift,
        axis=axis,
        end='pad' if pad else 'cut'
    )

    # Multiply the window along the within-frame axis (axis + 1) while all
    # other axes are passed through unchanged.
    letters = string.ascii_lowercase[:time_signal_seg.ndim]
    mapping = letters + ',' + letters[axis + 1] + '->' + letters

    try:
        # ToDo: Implement this more memory efficient
        return rfft(
            np.einsum(mapping, time_signal_seg, window),
            n=size,
            axis=axis + 1,
        )
    except ValueError as e:
        raise ValueError(
            f'Could not calculate the stft, something does not match.\n'
            f'mapping: {mapping}, '
            f'time_signal_seg.shape: {time_signal_seg.shape}, '
            f'window.shape: {window.shape}, '
            f'size: {size}, '
            f'axis+1: {axis+1}'
        ) from e