コード例 #1
0
 def segment(self, example, exclude_keys=None):
     """Split the audio entries of ``example`` into fixed-length segments.

     Every entry named in ``example['audio_keys']`` (except the excluded
     ones) is truncated to ``example[NUM_SAMPLES]`` samples and cut along
     its last axis into non-overlapping segments of
     ``self.opts.time_segments`` samples (``end='cut'`` is passed to
     ``segment_axis``).

     Note: ``example`` itself is modified in place (its audio entries are
     replaced by their segmented version and ``NUM_SAMPLES`` is
     overwritten).

     Args:
         example: dict holding the audio entries, ``'audio_keys'`` and
             ``NUM_SAMPLES``.
         exclude_keys: a single key or a list of keys from
             ``example['audio_keys']`` that must not be segmented.

     Returns:
         A list with one deep copy of ``example`` per segment, reordered
         by ``shuffle``. In each copy the segmented audio entries hold a
         single segment.

     Raises:
         FilterException: if no complete segment could be extracted.
     """
     if exclude_keys is None:
         exclude_keys = []
     elif isinstance(exclude_keys, str):
         exclude_keys = [exclude_keys]
     segment_len = shift = self.opts.time_segments
     num_samples = example[NUM_SAMPLES]
     audio_keys = [
         key for key in example['audio_keys'] if key not in exclude_keys
     ]
     for key in audio_keys:
         example[key] = segment_axis(example[key][..., :num_samples],
                                     segment_len,
                                     shift=shift,
                                     axis=-1,
                                     end='cut')
     # After segmentation the second to last axis indexes the segments.
     lengths = [example[key].shape[-2] for key in audio_keys]
     # All entries must yield the same number of segments. Compare with
     # the first entry: indexing with -2 would raise an IndexError as soon
     # as only a single audio key is segmented.
     assert lengths.count(lengths[0]) == len(lengths), dict(
         zip(audio_keys, lengths))
     length = lengths[0]
     if length == 0:
         from lazy_dataset.core import FilterException
         print('was to short')
         raise FilterException
     out_list = list()
     example[NUM_SAMPLES] = self.opts.time_segments
     for idx in range(length):
         new_example = deepcopy(example)
         for key in audio_keys:
             # Keep only the idx-th segment in this copy.
             new_example[key] = new_example[key][..., idx, :]
         out_list.append(new_example)
     shuffle(out_list)
     return out_list
コード例 #2
0
ファイル: utils.py プロジェクト: fgnt/paderwasn
 def __call__(self, sig):
     """Return the activity of ``sig`` based on a frame energy threshold.

     The signal is cut into frames of ``self.frame_size`` with a shift of
     ``self.frame_shift``. A frame counts as active when the sum of its
     squared samples reaches ``self.threshold``. The frame-wise decision
     is mapped back by ``activity_frame_to_time`` and, unless
     ``self.len_smooth_win`` is 0, post-processed by
     ``smooth_voice_activity``.
     """
     frames = segment_axis(sig, self.frame_size, self.frame_shift)
     frame_energy = np.sum(frames**2, axis=-1)
     is_active = self.activity_frame_to_time(
         frame_energy >= self.threshold)
     if self.len_smooth_win == 0:
         return is_active
     return self.smooth_voice_activity(is_active)
コード例 #3
0
ファイル: utils.py プロジェクト: fgnt/paderwasn
 def smooth_voice_activity(self, activity):
     """Smooth ``activity`` along its last axis with a sliding window.

     A copy of ``activity`` is edge-padded by half the window length on
     both sides of the last axis and cut into windows of
     ``self.len_smooth_win`` entries with a shift of one. A position is
     set to 1 if the sum over its window is at least half the window
     length (integer division) and to 0 otherwise. The input array is
     not modified.
     """
     half_win = self.len_smooth_win // 2
     smoothed = activity.copy()
     # Only the last axis gets padded.
     pad_spec = [(0, 0) for _ in range(activity.ndim - 1)]
     pad_spec.append((half_win, half_win))
     padded = np.pad(smoothed, pad_spec, 'edge')
     windows = segment_axis(padded, self.len_smooth_win, 1, end='pad')
     active_count = np.sum(windows, axis=-1)
     smoothed[active_count >= half_win] = 1
     smoothed[active_count < half_win] = 0
     return smoothed
コード例 #4
0
ファイル: utils.py プロジェクト: fgnt/paderwasn
 def activity_frame_to_time(self, frame_wise_activity):
     """Expand a frame-wise activity to one boolean value per sample.

     The last axis of ``frame_wise_activity`` is treated as the frame
     axis. A sample of the returned boolean array is True if at least one
     frame with a value greater than 0 covers it, given frames of
     ``self.frame_size`` samples and a shift of ``self.frame_shift``.
     """
     per_frame = np.asarray(frame_wise_activity)
     # Repeat every frame decision for each sample of its frame.
     per_sample = np.broadcast_to(
         per_frame[..., None], per_frame.shape + (self.frame_size, ))
     num_frames = per_sample.shape[-2]
     overlap = self.frame_size - self.frame_shift
     num_samples = num_frames * self.frame_shift + overlap
     time_activity = np.zeros(per_sample.shape[:-2] + (num_samples, ))
     # NOTE(review): this relies on segment_axis(..., end=None) returning
     # a writable view, so that the assignment below ends up in
     # time_activity -- confirm against the segment_axis documentation.
     frame_view = segment_axis(
         time_activity, self.frame_size, self.frame_shift, end=None)
     frame_view[per_sample > 0] = 1
     return time_activity != 0
コード例 #5
0
ファイル: eval_sad.py プロジェクト: yisiying/padertorch
def smooth_vad(vad_pred, threshold=0.1, window=25, divisor=1):
    """
    Binarize a VAD prediction and smooth it along the last axis.

    Values above `threshold` are set to one, all remaining values to zero.
    Afterwards a window of `window` entries is moved over the edge-padded
    signal with a shift of one. A position becomes one if the sum over its
    window is at least `(window // 2) // divisor`, otherwise zero. The input
    array is not modified.

    >>> vad_pred = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.2, 0.1])
    >>> smooth_vad(vad_pred, window=3, divisor=1, threshold=0.3)
    array([0., 0., 1., 1., 1., 1., 1., 1., 0.])
    >>> smooth_vad(vad_pred, window=5, divisor=1, threshold=0.5)
    array([0., 0., 0., 0., 1., 1., 1., 1., 0.])
    >>> smooth_vad(vad_pred, window=5, divisor=2, threshold=0.5)
    array([0., 0., 0., 1., 1., 1., 1., 1., 1.])
    >>> smooth_vad(vad_pred[None, None], window=5, divisor=2, threshold=0.5)
    array([[[0., 0., 0., 1., 1., 1., 1., 1., 1.]]])
    """
    binary = vad_pred.copy()
    binary[binary > threshold] = 1.
    binary[binary < 1] = 0.

    half_window = window // 2
    # Pad only the last axis so that every position gets a full window.
    pad_spec = [(0, 0)] * (binary.ndim - 1) + [(half_window, half_window)]
    windows = segment_axis(
        np.pad(binary, pad_spec, 'edge'), window, 1, end='pad')
    votes = np.sum(windows, axis=-1)

    min_votes = half_window // divisor
    binary[votes >= min_votes] = 1
    binary[votes < min_votes] = 0
    return binary
コード例 #6
0
def segment(
        x: Union[list, np.ndarray, torch.Tensor], length: int,
        shift: int = None, anchor: Union[str, int] = 'left', axis: int = -1,
        mode: str = 'constant', padding: bool = False, rng=np.random
):
    """
    Segments a signal `x` along an axis. Either with a predefined anchor for
    the segment boundaries if anchor is an integer or with an internally
    calculated anchor if anchor is a string.

    Args:
        x: signal to be segmented, either torch.Tensor, numpy.array or a
            list (a list is converted to a numpy.array)
        anchor: anchor from which the segmentation boundaries are calculated.
            if it is a string `get_anchor` is called to calculate an integer
            using `anchor` as anchor mode definition.
        length: segment length
        shift: shift between segments, defaults to length
        axis: axis over which to segment
        mode: used in _get_segment_length_for_mode
        padding: May only be `True` if `anchor` is `0` or `left` since padding
            is only applied to the end of the signal. This may be the right
            choice for evaluation.
            If `False` the residual values are discarded.
        rng: random number generator (`numpy.random`)

    Returns:
        The segmented signal. The axis that enumerates the segments is
        moved to the front (axis 0).

    Raises:
        TypeError: if `x` is neither a numpy array, a torch tensor nor a
            list.

    >>> np.random.seed(3)
    >>> segment(np.arange(0, 15), 10, 3, anchor='left')
    array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
           [ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12]])
    >>> segment(np.arange(0, 15), 10, 3, anchor='random')
    array([[ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11],
           [ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14]])
    >>> segment(np.arange(0, 15), 10, 3, anchor=5)
    array([[ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11],
           [ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14]])
    """

    if padding:
        # No padding is implemented for the beginning of a signal
        assert anchor in [0, 'left'], (padding, anchor)
        end = 'pad'
    else:
        end = 'cut'

    if x.__class__.__module__ == 'numpy':
        ndim = x.ndim
        moveaxis = np.moveaxis
    elif x.__class__.__module__ == 'torch':
        ndim = x.dim()

        # Feature detection instead of a version comparison based on
        # distutils.version.LooseVersion: distutils has been removed from
        # the standard library in Python 3.12. torch.movedim is available
        # since torch 1.7.0, i.e. exactly the versions the former check
        # selected.
        if hasattr(torch, 'movedim'):
            moveaxis = torch.movedim
        else:
            # moveaxis code taken from
            # https://github.com/pytorch/pytorch/issues/36048
            def moveaxis(tensor: torch.Tensor, source: int,
                         destination: int) -> torch.Tensor:
                dim = tensor.dim()
                perm = list(range(dim))
                if destination < 0:
                    destination += dim
                perm.pop(source)
                perm.insert(destination, source)
                return tensor.permute(*perm)
    elif isinstance(x, list):
        x = np.array(x)
        ndim = x.ndim
        moveaxis = np.moveaxis
    else:
        raise TypeError('Unknown type for input signal x', type(x))
    # Normalize a negative axis to its non-negative counterpart.
    axis = axis % ndim

    num_samples = x.shape[axis]
    assert num_samples >= length, (num_samples, length)

    assert mode in possible_segment_modes, (
        'Unknown length mode. Length mode has to be chosen from',
        possible_segment_modes, 'and is', mode
    )

    length, shift, num_samples = _get_segment_length_for_mode(
        num_samples, length, shift, mode)

    assert shift > 0, shift
    if isinstance(anchor, str):
        anchor = get_anchor(num_samples, length, shift, mode=anchor, rng=rng)
    assert isinstance(anchor, int), (anchor, type(anchor))

    # The segment grid has a period of `shift`, hence only the anchor
    # position relative to that grid determines the first segment start.
    start = anchor % shift

    # slice the array to remove samples discarded with the specified anchor
    slc = [slice(None)] * ndim
    slc[axis] = slice(start, None)
    x = x[tuple(slc)]

    return moveaxis(
        segment_axis(x, length, shift, end=end, axis=axis), axis, 0)
コード例 #7
0
def eval_estimator(db_json,
                   scenario,
                   ref_node_id,
                   vad_threshold,
                   activity_threshold):
    """Evaluate the sampling time offset (STO) estimation on a data set.

    For every example and every node other than the reference node the
    signals are coarsely aligned, the sampling rate offset (SRO) is
    estimated and compensated, and the STO is estimated from the time shifts
    of frames in which both signals are active. The per-node error and the
    RMSE over all examples are printed.

    Args:
        db_json: passed to ``AsyncWASN`` to load the database.
        scenario: one of "Scenario-1", "Scenario-2", "Scenario-3" or
            "Scenario-4".
        ref_node_id: index of the reference node (the nodes 0 to 3 are
            processed).
        vad_threshold: passed to ``VoiceActivityDetector``.
        activity_threshold: minimum number of active samples (exclusive) a
            frame of 16384 samples needs to be used for the STO estimation.
    """
    scenarios = ['Scenario-1', 'Scenario-2', 'Scenario-3', 'Scenario-4']
    msg = ('scenario must be "Scenario-1", "Scenario-2", '
           '"Scenario-3" or "Scenario-4"')
    assert scenario in scenarios, msg

    loader_names = {
        'Scenario-1': 'get_data_set_scenario_1',
        'Scenario-2': 'get_data_set_scenario_2',
        'Scenario-3': 'get_data_set_scenario_3',
        'Scenario-4': 'get_data_set_scenario_4',
    }
    if scenario in loader_names:
        db = getattr(AsyncWASN(db_json), loader_names[scenario])()

    sro_estimator = DynamicWACD()
    voice_activity_detector = VoiceActivityDetector(vad_threshold)

    def frame_activity(signal):
        # Frame is active if it holds more than activity_threshold active
        # samples.
        sample_activity = voice_activity_detector(signal)
        active_samples = segment_axis(sample_activity, 16384, 2048).sum(-1)
        return active_samples > activity_threshold

    # Three non-reference nodes are evaluated per example.
    errors = np.zeros(3 * len(db))
    for ex_id, example in enumerate(db):
        print(f'Process example {example["example_id"].split("_")[-1]}')
        all_dists = get_distances(example)
        audio_paths = example['audio_path']
        ref_sig = load_audio(audio_paths[f'node_{ref_node_id}'])
        other_nodes = [i for i in range(4) if i != ref_node_id]
        for cnt, node_id in enumerate(other_nodes):
            sig = load_audio(audio_paths[f'node_{node_id}'])

            # Align the signals coarsely
            sig_sync, ref_sig_sync, offset = \
                coarse_sync(sig, ref_sig, len_sync=320000)

            # Estimate the sampling rate offset (SRO)
            vad_sig = voice_activity_detector(sig_sync)
            vad_ref = voice_activity_detector(ref_sig_sync)
            sro_est = sro_estimator(sig_sync, ref_sig_sync, vad_sig, vad_ref)

            # Compensate for the SRO
            sig_sync = compensate_sro(sig_sync, sro_est)
            ref_sig_sync = ref_sig_sync[:len(sig_sync)]

            # Estimate the time shifts and distances
            sig_shifts = est_time_shift(sig_sync, ref_sig_sync, 16384, 2048)
            # Drop the leading distances of the signal that was cut by the
            # coarse synchronization.
            if offset > 0:
                start, start_ref = int(np.round(offset)), 0
            else:
                start, start_ref = 0, int(np.round(-offset))
            dists = all_dists[start:, node_id]
            dists_ref = all_dists[start_ref:, ref_node_id]
            # Center sample of each frame used for the time shift estimates
            frame_ids = \
                8192 + np.asarray([i*2048 for i in range(len(sig_shifts))])
            dists = dists[frame_ids]
            dists_ref = dists_ref[frame_ids]

            # Discard estimates corresponding to periods in time
            # without source activity
            ref_is_active = frame_activity(ref_sig_sync)
            sig_is_active = frame_activity(sig_sync)
            activity_mask = np.logical_and(sig_is_active, ref_is_active)
            sig_shifts = sig_shifts[activity_mask]
            dists = dists[activity_mask]
            dists_ref = dists_ref[activity_mask]

            # Estimate the sampling time offset (STO)
            sto_est = est_sto(sig_shifts, dists, dists_ref) - offset

            # Calculate the estimation error
            true_stos = example['sto']
            sto = (true_stos[f'node_{node_id}']
                   - true_stos[f'node_{ref_node_id}'])
            errors[3*ex_id+cnt] = sto - sto_est
            print(f'node {node_id}: error = '
                  f'{np.round(errors[3*ex_id+cnt], 2)} samples')
    print(f'\nRMSE = {np.round(np.sqrt(np.mean(errors**2)), 2)} samples')
コード例 #8
0
ファイル: transform.py プロジェクト: yisiying/padertorch
def stft(
    time_signal,
    size: int = 1024,
    shift: int = 256,
    *,
    # axis=-1,  # I never use this and it complicated the code
    window: [str, typing.Callable] = 'blackman',
    window_length: int = None,
    fading: typing.Optional[typing.Union[bool, str]] = 'full',
    pad: bool = True,
    symmetric_window: bool = False,
):
    """
    Short time Fourier transform of a torch tensor along its last axis.

    Args:
        time_signal: torch.Tensor, the last axis is segmented into frames.
        size: FFT size. Has to be equal to `window_length`.
        shift: Shift between successive frames in samples.
        window: Window definition that is forwarded to `_get_window`.
        window_length: Length of the window, defaults to `size`.
        fading: If not `None` or `False`, the signal is zero padded on both
            sides. With 'half' the total padding is `window_length - shift`
            samples, otherwise `window_length - shift` samples are added to
            each side.
        pad: If True, `segment_axis` is called with `end='pad'`, else with
            `end='cut'`.
        symmetric_window: Forwarded to `_get_window`.

    Returns:
        torch_complex.ComplexTensor with the one-sided spectrum of every
        frame.

    Raises:
        NotImplementedError: if `window_length` differs from `size`.

    >>> import numpy as np
    >>> import random
    >>> from paderbox.transform.module_stft import stft as np_stft, istft as np_istft
    >>> kwargs = dict(
    ...     size=np.random.randint(100, 200),
    ...     shift=np.random.randint(40, 100),
    ...     window=random.choice(['blackman', 'hann', 'hamming']),
    ...     fading=random.choice(['full', 'half', False]),
    ... )
    >>> num_samples = np.random.randint(200, 500)
    >>> a = np.random.rand(num_samples)
    >>> A_np = np_stft(a, **kwargs)
    >>> A_pt = stft(torch.tensor(a), **kwargs)
    >>> np.testing.assert_allclose(
    ...     A_np, A_pt.numpy(), err_msg=str(kwargs), atol=1e-10)

    """
    assert isinstance(time_signal, torch.Tensor)
    if window_length is None:
        window_length = size
    elif window_length != size:
        # f-string, so that the actual values end up in the message.
        raise NotImplementedError(
            'Torch does not support window_length != size\n'
            f'window_length = {window_length} != {size} = size')

    # Pad with zeros to have enough samples for the window function to fade.
    assert fading in [None, True, False, 'full',
                      'half'], (fading, type(fading))
    if fading not in [False, None]:
        if fading == 'half':
            pad_width = [
                (window_length - shift) // 2,
                math.ceil((window_length - shift) / 2),
            ]
        else:
            pad_width = [
                window_length - shift,
                window_length - shift,
            ]
        time_signal = torch.nn.functional.pad(time_signal,
                                              pad_width,
                                              mode='constant')

    window = _get_window(
        window=window,
        symmetric_window=symmetric_window,
        window_length=window_length,
    )

    time_signal_seg = segment_axis(time_signal,
                                   window_length,
                                   shift=shift,
                                   axis=-1,
                                   end='pad' if pad else 'cut')

    windowed = time_signal_seg * window
    if hasattr(torch, 'rfft'):
        # Legacy API, removed in torch 1.8.
        out = torch.rfft(
            windowed,
            1,
            # size,
        )
    else:
        # torch.fft.rfft returns a complex tensor, view_as_real restores
        # the (..., 2) real/imaginary layout of the legacy torch.rfft.
        out = torch.view_as_real(torch.fft.rfft(windowed, dim=-1))
    assert out.shape[-1] == 2, out.shape
    return torch_complex.ComplexTensor(out[..., 0], out[..., 1])
コード例 #9
0
def modmfcc(time_signal,
            sample_rate=16000,
            stft_win_len=400,
            stft_shift=160,
            numcep=30,
            number_of_filters=40,
            stft_size=512,
            lowest_frequency=0,
            highest_frequency=None,
            preemphasis_factor=0.97,
            ceplifter=22,
            stft_window=scipy.signal.windows.hamming,
            mod_length=16,
            mod_shift=8,
            mod_window=scipy.signal.windows.hamming,
            avg_length=1,
            avg_shift=1):
    """
    Compute Mod-MFCC features from an audio signal.

    First MFCC features are computed, then the magnitude of an STFT along
    the second to last axis of the MFCC features is taken. Optionally the
    result is averaged over windows along the third to last axis.

    :param time_signal: the audio signal from which to compute features.
        Should be an channels x samples array.
    :param sample_rate: the sample rate of the signal we are working with.
        Default is 16000.
    :param stft_win_len: the length of the analysis window. In samples.
        Default is 400 (25 milliseconds @ 16kHz).
    :param stft_shift: the step between successive windows. In samples.
        Default is 160 (10 milliseconds @ 16kHz).
    :param numcep: the number of cepstrum to return, Default is 30.
    :param number_of_filters: number of filters in the filterbank,
        Default is 40.
    :param stft_size: the FFT size. Default is 512.
    :param lowest_frequency: lowest band edge of mel filters. In Hz,
        Default is 0.
    :param highest_frequency: highest band edge of mel filters. In Hz,
        Default is samplerate/2.
    :param preemphasis_factor: apply preemphasis filter with preemphasis_factor
        as coefficient. 0 is no filter. Default is 0.97.
    :param ceplifter: the liftering coefficient to use.
        ceplifter <= 0 disables lifter.
        Default is 22.
    :param stft_window: the window function to use for fbank features. Default is
        hamming window.
    :param mod_length: FFT size of the STFT that is applied to the MFCC
        features. Default is 16.
    :param mod_shift: shift of the STFT that is applied to the MFCC
        features. Default is 8.
    :param mod_window: the window function of the STFT that is applied to
        the MFCC features. Default is hamming window.
    :param avg_length: length of the averaging window. The averaging is
        skipped if it is not greater than 1. Default is 1.
    :param avg_shift: shift of the averaging window, must not be greater
        than avg_length. Default is 1.
    :returns: A numpy array containing the features.
        NOTE(review): the exact output layout depends on `mfcc` and `stft`,
        which are not visible here -- confirm before relying on it.
    """
    # The window defaults reference scipy.signal.windows: the aliases in the
    # scipy.signal namespace (scipy.signal.hamming) were removed in
    # SciPy 1.13.
    x = mfcc(time_signal,
             sample_rate=sample_rate,
             window_length=stft_win_len,
             window=stft_window,
             stft_shift=stft_shift,
             stft_size=stft_size,
             number_of_filters=number_of_filters,
             lowest_frequency=lowest_frequency,
             highest_frequency=highest_frequency,
             preemphasis_factor=preemphasis_factor,
             ceplifter=ceplifter,
             numcep=numcep)

    # Modulation spectrum: magnitude of an STFT over the MFCC features.
    x = np.abs(
        stft(x,
             size=mod_length,
             shift=mod_shift,
             window=mod_window,
             axis=-2,
             fading=False))
    assert avg_length >= avg_shift
    if avg_length > 1:
        # segment_axis inserts the window axis right behind the segmented
        # axis, hence axis -3 holds the values of one window afterwards.
        x = segment_axis(x,
                         length=avg_length,
                         shift=avg_shift,
                         end='pad',
                         axis=-3)
        x = np.mean(x, axis=-3)
    return x
コード例 #10
0
ファイル: module_stft.py プロジェクト: mdeegen/paderbox
def istft(
        stft_signal,
        size: int=1024,
        shift: int=256,
        *,
        window: [str, typing.Callable]=signal.windows.blackman,
        fading: typing.Optional[typing.Union[bool, str]] = 'full',
        window_length: int=None,
        symmetric_window: bool=False,
        num_samples: int=None,
        pad: bool=True,
):
    """
    Inverse short time Fourier transform (overlap-add synthesis).

    ..note::
        The synthesis window is derived from the unmodified analysis
        window. Keep this in mind when the signal is modified in the
        frequency domain (e.g. beamforming).

    :param stft_signal: Complex STFT signal
        with dimensions (..., frames, size/2+1).
    :param size: Scalar FFT-size.
    :param shift: Scalar FFT-shift. Typically shift is a fraction of size.
    :param window: Window function handle.
    :param fading: Removes the additional padding, if done during STFT.
        With 'half' the total removed padding is window_length - shift
        samples, otherwise window_length - shift samples are removed on
        each side.
    :param window_length: Length of the window if it is shorter than the
        FFT size. Defaults to the FFT size.
    :param symmetric_window: symmetric or periodic window, forwarded to
        `_get_window`.
    :param num_samples: None or the number of samples of the original time
        signal. When given, the length of the reconstructed signal is
        checked and the signal is shortened to num_samples.
        (Does only work when pad is True).
    :param pad: Has to be True when num_samples is given. It is only used
        to validate num_samples.

    :return: Time signal with dimensions (..., samples).
    :raises ValueError: if num_samples is given while pad is False.
    """
    # Note: frame_axis and frequency_axis would make this function much more
    #       complicated
    stft_signal = np.array(stft_signal)

    assert stft_signal.shape[-1] == size // 2 + 1, str(stft_signal.shape)

    if window_length is None:
        window_length = size

    analysis_window = _get_window(
        window=window,
        symmetric_window=symmetric_window,
        window_length=window_length,
    )
    synthesis_window = _biorthogonal_window_fastest(analysis_window, shift)

    num_frames = stft_signal.shape[-2]
    total_length = num_frames * shift + window_length - shift
    time_signal = np.zeros(stft_signal.shape[:-2] + (total_length, ))

    # Frame-wise view to time_signal, writes to it end up in time_signal
    frame_view = segment_axis(time_signal, window_length, shift, end=None)

    # The [..., :window_length] is the inverse of the window padding in rfft.
    frames = np.real(irfft(stft_signal, n=size))[..., :window_length]

    # Unbuffered inplace add, overlapping frames are accumulated
    np.add.at(frame_view, ..., synthesis_window * frames)

    # Compensate fade-in and fade-out
    assert fading in [None, True, False, 'full', 'half'], fading
    if fading not in [None, False]:
        trim = window_length - shift
        if fading == 'half':
            trim = trim / 2
        first = int(trim)
        last = time_signal.shape[-1] - ceil(trim)
        time_signal = time_signal[..., first:last]

    if num_samples is None:
        return time_signal

    if not pad:
        raise ValueError(
            pad,
            'When padding is False in the stft, the signal is cutted.'
            'This operation can not be inverted.',
        )

    assert time_signal.shape[-1] >= num_samples, (time_signal.shape, num_samples)
    assert time_signal.shape[-1] < num_samples + shift, (time_signal.shape, num_samples)
    return time_signal[..., :num_samples]
コード例 #11
0
ファイル: module_stft.py プロジェクト: mdeegen/paderbox
def stft(
        time_signal,
        size: int = 1024,
        shift: int = 256,
        *,
        axis=-1,
        window: [str, typing.Callable] = signal.windows.blackman,
        window_length: int = None,
        fading: typing.Optional[typing.Union[bool, str]] = 'full',
        pad: bool = True,
        symmetric_window: bool = False,
) -> np.array:
    """
    ToDo: Open points:
     - sym_window need literature
     - fading why it is better?
     - should pad have more degrees of freedom?

    Calculates the short time Fourier transformation of a multi channel multi
    speaker time signal. It is able to add additional zeros for fade-in and
    fade out and should yield an STFT signal which allows perfect
    reconstruction.

    :param time_signal: Multi channel time signal with dimensions
        AA x ... x AZ x T x BA x ... x BZ.
    :param size: Scalar FFT-size.
    :param shift: Scalar FFT-shift, the step between successive frames in
        samples. Typically shift is a fraction of size.
    :param axis: Scalar axis of time.
        Default: -1, i.e. the last axis.
    :param window: Window function handle. Default is windows.blackman window.
    :param fading: Pads the signal with zeros for better reconstruction.
        With 'half' the total padding is window_length - shift samples,
        otherwise window_length - shift samples are added to each side.
    :param window_length: Sometimes one desires to use a shorter window than
        the fft size. In that case, the window is padded with zeros.
        The default is to use the fft-size as a window size.
    :param pad: If true zero pad the signal to match the shape, else cut
    :param symmetric_window: symmetric or periodic window. Assume window is
        periodic. Since the implementation of the windows in scipy.signal have a
        curious behaviour for odd window_length. Use window(len+1)[:-1]. Since
        is equal to the behaviour of MATLAB.
    :return: Single channel complex STFT signal with dimensions
        AA x ... x AZ x T' times size/2+1 times BA x ... x BZ.
    :raises ValueError: if the windowing or the FFT fails because the
        shapes do not match.
    """
    time_signal = np.asarray(time_signal)

    # Normalize a negative axis, axis + 1 is used below.
    axis = axis % time_signal.ndim

    if window_length is None:
        window_length = size

    # Pad with zeros to have enough samples for the window function to fade.
    assert fading in [None, True, False, 'full', 'half'], fading
    if fading not in [False, None]:
        # The builtin int replaces np.int, which was deprecated in
        # NumPy 1.20 and removed in NumPy 1.24.
        pad_width = np.zeros((time_signal.ndim, 2), dtype=int)
        if fading == 'half':
            pad_width[axis, 0] = (window_length - shift) // 2
            pad_width[axis, 1] = ceil((window_length - shift) / 2)
        else:
            pad_width[axis, :] = window_length - shift
        time_signal = np.pad(time_signal, pad_width, mode='constant')

    window = _get_window(
        window=window,
        symmetric_window=symmetric_window,
        window_length=window_length,
    )

    time_signal_seg = segment_axis(
        time_signal,
        window_length,
        shift=shift,
        axis=axis,
        end='pad' if pad else 'cut'
    )

    # Multiply the window along the axis that holds the samples of a frame
    # (axis + 1), e.g. 'abc,c->abc' for a 2D signal with axis == 1.
    letters = string.ascii_lowercase[:time_signal_seg.ndim]
    mapping = letters + ',' + letters[axis + 1] + '->' + letters

    try:
        # ToDo: Implement this more memory efficient
        return rfft(
            np.einsum(mapping, time_signal_seg, window),
            n=size,
            axis=axis + 1,
        )
    except ValueError as e:
        raise ValueError(
            f'Could not calculate the stft, something does not match.\n'
            f'mapping: {mapping}, '
            f'time_signal_seg.shape: {time_signal_seg.shape}, '
            f'window.shape: {window.shape}, '
            f'size: {size}, '
            f'axis+1: {axis+1}'
        ) from e