def audio(signal,
          sampling_rate: int = 16000,
          batch_first: bool = False,
          normalize: bool = True):
    """
    Converts an audio signal to numpy and optionally peak-normalizes it.

    Args:
        signal: Shape (samples, batch [optional]). If `batch_first = True`,
            (batch [optional], samples).
        sampling_rate: Sampling rate of the audio signal
        batch_first: If `True`, the optional batch dimension is assumed to be
            the first axis, otherwise the second one.
        normalize: If `True`, the signal is normalized to a max amplitude of
            0.95 to prevent clipping

    Returns:
        A tuple `(signal, sampling_rate)`.
    """
    # NOTE(review): the `batch_first`/`normalize` defaults were restored to
    # match the sibling `audio` definition in this file -- confirm.
    signal = to_numpy(signal, detach=True)

    signal = _remove_batch_axis(signal, batch_first=batch_first, ndim=1)

    # Normalize so that there is no clipping
    if normalize:
        denominator = np.max(np.abs(signal))
        # Only rescale when there is something to scale. The scaling is done
        # out-of-place: an in-place `*= 0.95` on the untouched array would
        # fail for integer dtypes and could modify a buffer that is shared
        # with the caller.
        if denominator > 0:
            signal = (signal / denominator) * 0.95

    return signal, sampling_rate
def example_to_numpy(example, detach=False):
    """
    Moves a nested structure to numpy. Opposite of example_to_device.

    Dicts, tuples (including namedtuples), lists and dataclass instances are
    traversed recursively and rebuilt with the same class. Everything that
    is not a tensor or one of these containers is returned unchanged.

    Args:
        example: Nested structure that may contain tensors.
        detach: Forwarded to `to_numpy` for every tensor that is converted.

    Returns:
        example on where each tensor is converted to numpy
    """
    if isinstance(example, dict):
        return example.__class__({
            key: example_to_numpy(value, detach=detach)
            for key, value in example.items()
        })
    elif isinstance(example, (tuple, list)):
        converted = [
            example_to_numpy(element, detach=detach) for element in example
        ]
        if isinstance(example, tuple) and hasattr(example, '_fields'):
            # namedtuple classes take their fields as positional arguments,
            # not as a single iterable.
            return example.__class__(*converted)
        return example.__class__(converted)
    elif torch.is_tensor(example) or 'ComplexTensor' in str(type(example)):
        # Imported lazily so that structures without tensors can be
        # processed without touching `padertorch.utils`.
        from padertorch.utils import to_numpy
        return to_numpy(example, detach=detach)
    elif isinstance(example, np.ndarray):
        return example
    elif hasattr(example, '__dataclass_fields__'):
        return example.__class__(**{
            f: example_to_numpy(getattr(example, f), detach=detach)
            for f in example.__dataclass_fields__
        })
    else:
        return example
def mask_to_image(mask: _T_input,
                  batch_first: bool = False,
                  color: Optional[str] = None,
                  origin: str = 'lower') -> np.ndarray:
    """
    Creates an image from a mask `Tensor` or `ndarray`.

    For more details of the output shape, see the tensorboardx docs

    Args:
        mask: Mask to plot
        batch_first: If `True`, `mask` is expected to have shape
            `(batch [optional], frames, features)`. If `False`, the batch axis
            is assumed to be in the second position, i.e.,
            `(frames, batch [optional], features)`.
        color: A color map name. The name is forwarded to
               `matplotlib.pyplot.cm.get_cmap` to get the color map. If `None`,
               grayscale is used.
        origin: Origin of the plot. Can be `'upper'` or `'lower'`.

    Returns:
        Colorized image with shape (color (1 or 3), features, frames)
    """
    mask = to_numpy(mask, detach=True)

    # Scale to the 8-bit range; anything outside [0, 255] after scaling is
    # clipped before the cast to uint8.
    image = np.clip(mask * 255, 0, 255)
    image = image.astype(np.uint8)

    image = _remove_batch_axis(image, batch_first=batch_first)

    # Transpose before applying the origin and the color map.
    return _colorize(_apply_origin(image.T, origin), color)
 def convert(value):
     """
     Converts `value` to numpy if it is a tensor, memoized by object identity.

     `memo`, `detach` and `to_numpy` are taken from the enclosing scope.
     """
     # The cache key is the identity of the *input* object, so the same
     # object is converted only once and always maps to the same result.
     id_ = id(value)
     if id_ in memo:
         return memo[id_]
     # NOTE(review): the tail of this condition was restored following the
     # `'ComplexTensor' in str(type(...))` check used elsewhere in this
     # file -- confirm.
     if isinstance(value, torch.Tensor) or 'ComplexTensor' in str(
             type(value)):
         value = to_numpy(value, detach=detach)
     memo[id_] = value
     return value
def spectrogram_to_image(
        signal: _T_input,
        batch_first: bool = False,
        color: str = 'viridis',
        origin: str = 'lower',
        log: bool = True,
        visible_dB: float = 50,
) -> np.ndarray:
    """
    Creates an image from a spectrogram (power).

    Note:
        When the input is the absolute value of the STFT, the value for
        visible_dB is effectively two times larger (i.e. default 100) and
        the image looks more noisy.

    For more details of the output shape, see the tensorboardx docs

    Args:
        signal: Spectrogram to plot.
        batch_first: If `True`, `signal` is expected to have shape
            `(batch [optional], frames, features)`. If `False`, the batch axis
            is assumed to be in the second position, i.e.,
            `(frames, batch [optional], features)`.
        color: A color map name. The name is forwarded to
               `matplotlib.pyplot.cm.get_cmap` to get the color map.
        origin: Origin of the plot. Can be `'upper'` or `'lower'`.
        log: If `True`, the spectrogram is plotted in log domain and shows a
            50dB range. The 50dB can be changed with the argument `visible_dB`.
        visible_dB: Only used when `log` is `True`. Specifies how many dB will
            be visible in the plot. Assumes the input is the power of the STFT
            signal, i.e., the abs square of it.

    Returns:
        Colorized image with shape (channels (3), features, frames)
    """
    signal = to_numpy(signal, detach=True)

    # `np.finfo` only accepts inexact (float/complex) dtypes, so integer or
    # bool input is promoted to float before the normalization below.
    if not np.issubdtype(signal.dtype, np.inexact):
        signal = signal.astype(np.float64)

    # Normalize to a maximum of one. Adding `tiny` keeps the division
    # well-defined for an all-zero input.
    signal = signal / (np.max(np.abs(signal)) + np.finfo(signal.dtype).tiny)

    signal = _remove_batch_axis(signal, batch_first=batch_first)

    if log:
        # remove problematic small numbers
        floor = 10 ** (-visible_dB / 10)
        signal = np.maximum(signal, floor)

        # Scale such that X dB are visible (i.e. in the range 0 to 1)
        signal = (10 / visible_dB) * np.log10(signal) + 1

    signal = (signal * 255).astype(np.uint8)

    return _colorize(_apply_origin(signal.T, origin=origin), color)
def stft_to_image(
        signal: _T_input,
        batch_first: bool = False,
        color: str = 'viridis',
        origin: str = 'lower',
        visible_dB: float = 50,
) -> np.ndarray:
    """
    Creates an image from an STFT signal.
    For more details of the output shape, see the tensorboardx docs

    Args:
        signal: Shape (frames, batch [optional], features)
        batch_first: If `True`, `signal` is expected to have shape
            (batch [optional], frames, features).
        color: A color map name. The name is forwarded to
               `matplotlib.pyplot.cm.get_cmap` to get the color map. If `None`,
               grayscale is used.
        origin: Origin of the plot. Can be `'upper'` or `'lower'`.
        visible_dB: How many decibels are visible in the image.
                    Note: `paderbox.visualization.plot.stft` uses
                          `visible_dB == 60` internally. So by default it shows
                          10 dB more.

    Returns:
        Colorized image with shape (color (1 or 3), features, frames)

    Small test to see the effect of `visible_dB`:

        >>> visible_dB = 60
        >>> 10 ** (-visible_dB / 20)
        0.001

        >>> data = [1, 0.004, 0.003, 0.001_05, 0.001]
        >>> np.squeeze(stft_to_image(np.array(data)[:, None], color=None))
        array([255,  10,   0,   0,   0], dtype=uint8)

        >>> np.squeeze(stft_to_image(
        ...     np.array(data)[:, None], color=None, visible_dB=60))
        array([255,  51,  40,   1,   0], dtype=uint8)

    """
    signal = to_numpy(signal, detach=True)

    # The spectrogram helper expects a power spectrogram, i.e. |STFT|^2.
    # NOTE(review): the keyword arguments of this call were restored by
    # forwarding this function's own parameters -- confirm.
    return spectrogram_to_image(
        signal.real ** 2 + signal.imag ** 2,
        batch_first=batch_first,
        color=color,
        origin=origin,
        log=True,
        visible_dB=visible_dB,
    )
def audio(
        signal: _T_input,
        sampling_rate: int = 16000,
        batch_first: bool = False,
        normalize: bool = True,
) -> Tuple[np.ndarray, int]:
    """
    Adds an audio signal to tensorboard.

    Args:
        signal: Time-domain signal with shape (samples, batch [optional]).
            If `batch_first = True`, (batch [optional], samples).
        sampling_rate: Sampling rate of the audio signal
        batch_first: If `True`, `signal` is expected to have shape
            `(batch [optional], samples)`. If `False`, the batch axis
            is assumed to be in the second position, i.e.,
            `(samples, batch [optional])`.
        normalize: If `True`, the signal is normalized to a max amplitude of
            0.95 to prevent clipping.

    Returns:
        A tuple consisting of the signal and the sampling rate. See tensorboardX
        docs for further information on the return type.
    """
    signal = to_numpy(signal, detach=True)

    signal = _remove_batch_axis(signal, batch_first=batch_first, ndim=1)

    # Normalize so that there is no clipping
    if normalize:
        denominator = np.max(np.abs(signal))
        # A signal without any positive magnitude is passed through
        # unchanged; the division creates a new array, so the input buffer
        # is never modified.
        if denominator > 0:
            signal = (signal / denominator) * 0.95

    return signal, sampling_rate
def pit_loss_from_loss_matrix(
        pair_wise_loss_matrix,
        reduction: str = 'mean',
        algorithm: ['optimal', 'greedy'] = 'optimal',
        return_permutation: bool = False,
):
    """
    Calculates the PIT loss given a pair_wise_loss matrix.

    Args:
        pair_wise_loss_matrix: shape: (K, K)
        reduction: 'mean' or 'sum'
        algorithm: 'optimal' solves the assignment with
            `scipy.optimize.linear_sum_assignment`, 'greedy' uses
            `pb_bss.permutation_alignment._mapping_from_score_matrix`.
        return_permutation: If `True`, the selected column indices are
            returned in addition to the loss.

    Returns:
        The reduced loss of the selected assignment. If `return_permutation`
        is `True`, a tuple `(min_loss, col_ind)`.

    Raises:
        ValueError: If `algorithm` or `reduction` has an unknown value.

    >>> import numpy as np
    >>> score_matrix = np.array([[11., 10, 0],[4, 5, 10],[6, 0, 5]])
    >>> score_matrix
    array([[11., 10.,  0.],
           [ 4.,  5., 10.],
           [ 6.,  0.,  5.]])
    >>> pair_wise_loss_matrix = torch.tensor(-score_matrix)
    >>> pit_loss_from_loss_matrix(pair_wise_loss_matrix, reduction='sum', algorithm='optimal')
    tensor(-26., dtype=torch.float64)
    >>> pit_loss_from_loss_matrix(pair_wise_loss_matrix, reduction='sum', algorithm='greedy')
    tensor(-21., dtype=torch.float64)

    """
    # NOTE(review): the parameter list (apart from `algorithm`) was restored
    # from the docstring, the doctests and the body; the defaults
    # `reduction='mean'` and `return_permutation=False` are assumptions --
    # confirm against the callers.
    import scipy.optimize
    from padertorch.utils import to_numpy

    assert len(pair_wise_loss_matrix.shape) == 2, pair_wise_loss_matrix.shape
    assert pair_wise_loss_matrix.shape[-2] == pair_wise_loss_matrix.shape[-1], pair_wise_loss_matrix.shape
    sources = pair_wise_loss_matrix.shape[-1]

    # We have to detach here because pair_wise_loss_matrix should require grads
    pair_wise_loss_np = to_numpy(pair_wise_loss_matrix, detach=True)

    if algorithm == 'optimal':
        row_ind, col_ind = scipy.optimize.linear_sum_assignment(
            pair_wise_loss_np)
    elif algorithm == 'greedy':
        from pb_bss.permutation_alignment import _mapping_from_score_matrix
        # The helper works on scores, hence the negated loss.
        col_ind = _mapping_from_score_matrix(-pair_wise_loss_np,
                                             algorithm='greedy')
        row_ind = range(sources)
    else:
        raise ValueError(algorithm)

    # Index the original matrix (not the detached numpy copy) to select the
    # loss entries of the chosen assignment.
    if reduction == 'mean':
        min_loss = pair_wise_loss_matrix[row_ind, col_ind].mean()
    elif reduction == 'sum':
        min_loss = pair_wise_loss_matrix[row_ind, col_ind].sum()
    else:
        raise ValueError(reduction)

    if return_permutation:
        return min_loss, col_ind
    else:
        return min_loss