Example #1
def resample(y,
             orig_sr,
             target_sr,
             res_type='kaiser_best',
             fix=True,
             scale=False,
             **kwargs):
    # First, validate the audio buffer
    util.valid_audio(y, mono=False)

    if orig_sr == target_sr:
        return y

    ratio = float(target_sr) / orig_sr

    n_samples = int(np.ceil(y.shape[-1] * ratio))

    if res_type == 'scipy':
        y_hat = scipy.signal.resample(y, n_samples, axis=-1)
    else:
        y_hat = resampy.resample(y,
                                 orig_sr,
                                 target_sr,
                                 filter=res_type,
                                 axis=-1)

    if fix:
        y_hat = util.fix_length(y_hat, n_samples, **kwargs)

    if scale:
        y_hat /= np.sqrt(ratio)

    return np.ascontiguousarray(y_hat, dtype=y.dtype)
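A hypothetical usage sketch (the file path is illustrative; librosa.load with sr=None keeps the file's native sample rate):

import librosa

y, sr = librosa.load('audio.wav', sr=None)   # keep the native sample rate
y_16k = resample(y, sr, 16000)               # downsample to 16 kHz
# scale=True additionally rescales so that the total energy is preserved
y_16k_scaled = resample(y, sr, 16000, scale=True)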
Example #2
def to_mono(y):
    # Validate the buffer.  Stereo is ok here.
    util.valid_audio(y, mono=False)

    if y.ndim > 1:
        y = np.mean(y, axis=0)

    return y
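A minimal usage sketch, assuming the (2, n) stereo layout that librosa conventions dictate (the signal here is synthetic):

import numpy as np

stereo = np.random.randn(2, 22050)   # 1 second of fake stereo audio
mono = to_mono(stereo)               # shape (22050,): channel average
assert mono.ndim == 1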
Example #3
    def get_features(self, y, sample_rate):

        # convert to mono
        if self.mono:
            y = np.mean(y, axis=1, keepdims=True)

        # resample if sample rates mismatch
        if (self.sample_rate
                is not None) and (self.sample_rate != sample_rate):
            if y.shape[1] == 1:
                # librosa expects mono audio to be of shape (n,), but we have (n, 1).
                y = librosa.core.resample(y[:, 0], sample_rate,
                                          self.sample_rate)[:, None]
            else:
                y = librosa.core.resample(y.T, sample_rate, self.sample_rate).T
            sample_rate = self.sample_rate

        # augment data
        if self.augmentation is not None:
            y = self.augmentation(y, sample_rate)

        # TODO: how time consuming is this thing (needs profiling...)
        try:
            valid_audio(y[:, 0], mono=True)
        except ParameterError as e:
            msg = "Something went wrong when augmenting the waveform."
            raise ValueError(msg) from e

        return y
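This method keeps audio as (n_samples, n_channels), while librosa expects mono as (n,); a small sketch of the reshaping used above (synthetic data):

import numpy as np

y = np.random.randn(22050, 1)   # column-vector mono, shape (n, 1)
y_flat = y[:, 0]                # shape (n,), what librosa.core.resample expects
y_col = y_flat[:, None]         # back to shape (n, 1)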
Example #4
def libstft(y, fs, n_fft=2048, hop_length=None, win_length=None, window='hann',
            center=None, dtype=np.complex64, pad_mode='reflect'):
    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    fft_window = get_window(window, win_length, fftbins=True)

    # Pad the window out to n_fft size
    fft_window = util.pad_center(fft_window, n_fft)

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))

    # Check audio is valid
    util.valid_audio(y)

    # Pad the time series so that frames are centered
    if center:
        y = np.pad(y, int(n_fft // 2), mode=pad_mode)

    # Window the time series (frames must span n_fft samples so that the
    # zero-padded window broadcasts correctly against each frame)
    y_frames = util.frame(y, frame_length=n_fft, hop_length=hop_length)

    # Pre-allocate the STFT matrix
    stft_matrix = np.empty((int(1 + n_fft // 2), y_frames.shape[1]),
                           dtype=dtype,
                           order='F')

    # how many columns can we fit within MAX_MEM_BLOCK?
    n_columns = int(util.MAX_MEM_BLOCK / (stft_matrix.shape[0] *
                                          stft_matrix.itemsize))

    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])

        stft_matrix[:, bl_s:bl_t] = fft.fft(fft_window *
                                            y_frames[:, bl_s:bl_t],
                                            axis=0)[:stft_matrix.shape[0]]
    # Frequency axis in Hz: bins run from 0 up to the Nyquist frequency fs / 2
    f = np.linspace(0, fs / 2, stft_matrix.shape[0], endpoint=True)
    return stft_matrix, f
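A hypothetical call, assuming y is a mono signal at 22050 Hz; the second return value gives the frequency in Hz of each STFT row:

import numpy as np

S, freqs = libstft(y, fs=22050, n_fft=1024, hop_length=256, center=True)
magnitude = np.abs(S)   # shape (513, n_frames)
# freqs runs from 0 Hz up to the Nyquist frequency, 11025 Hz here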
Example #5
def zero_crossing_rate(y, frame_length=2048, hop_length=512, center=True,
                       **kwargs):
    util.valid_audio(y)

    if center:
        y = np.pad(y, int(frame_length // 2), mode='edge')

    y_framed = util.frame(y, frame_length, hop_length)

    kwargs['axis'] = 0
    kwargs.setdefault('pad', False)

    crossings = zero_crossings(y_framed, **kwargs)

    return np.mean(crossings, axis=0, keepdims=True)
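A minimal usage sketch (synthetic input; zero_crossings is assumed to be librosa.zero_crossings or an equivalent):

import numpy as np

y = np.sin(2 * np.pi * 440 * np.arange(22050) / 22050)
zcr = zero_crossing_rate(y)   # shape (1, n_frames)
# each column is the fraction of sign changes within one frame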
Example #6
    def get_mfcc(self, sig_frm):
        sig_frm = sig_frm / 32768.0
        window = 'hamming'
        win_length = sig_frm.shape[0]
        hop_length = win_length
        center = True
        n_fft = win_length
        fft_window = get_window(window, win_length, fftbins=True)
        fft_window = util.pad_center(fft_window, n_fft)
        fft_window = fft_window.reshape((-1, 1))
        util.valid_audio(sig_frm)
        sig_frm = sig_frm[:, None]
        stft_matrix = np.empty((int(1 + n_fft // 2), 1),
                               dtype=np.complex64,
                               order='F')
        stft = fft.fft(fft_window * sig_frm,
                       axis=0)[:stft_matrix.shape[0]].conj()
        powspec = np.abs(stft)**2
        melspec = librosa.feature.melspectrogram(S=powspec,
                                                 hop_length=hop_length,
                                                 n_fft=n_fft,
                                                 n_mels=40)
        # librosa.logamplitude was renamed power_to_db in librosa 0.6
        mfcc = librosa.feature.mfcc(S=librosa.power_to_db(melspec), n_mfcc=13)

        n_fft = 512
        fft_window = get_window(window, win_length, fftbins=True)
        fft_window = util.pad_center(fft_window, n_fft)
        fft_window = fft_window.reshape((-1, 1))
        y = np.pad(sig_frm[:, 0], int(n_fft // 2), mode='reflect')
        pad_frame = librosa.util.frame(y,
                                       frame_length=n_fft,
                                       hop_length=win_length * 2)[:, 0][:, None]
        stft_matrix = np.empty((int(1 + n_fft // 2), 1),
                               dtype=np.complex64,
                               order='F')
        stft = fft.fft(fft_window * pad_frame,
                       axis=0)[:stft_matrix.shape[0]].conj()
        powspec = np.abs(stft)**2
        spec = librosa.power_to_db(powspec)
        self.spec_tape_add(spec)
        return mfcc
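The division by 32768.0 above implies that sig_frm arrives as int16 PCM samples; a hypothetical call (the instance, named fe here only for illustration, must provide spec_tape_add):

import numpy as np

sig_frm = np.zeros(400, dtype=np.int16)   # e.g., one 25 ms frame at 16 kHz
mfcc = fe.get_mfcc(sig_frm)               # shape (13, 1)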
Example #7
def lpc(y, order):
    """Linear Prediction Coefficients via Burg's method

    This function applies Burg's method to estimate the coefficients of a
    linear filter on `y` of order `order`.  Burg's method is an extension of
    the Yule-Walker approach; both are sometimes referred to as LPC parameter
    estimation by autocorrelation.

    It follows the description and implementation approach given in the
    introduction of [1]_.  N.B. The paper's main subject is a different
    method, which is not implemented here; it was chosen for the clear
    explanation of Burg's technique in its introduction.

    .. [1] Larry Marple
           A New Autoregressive Spectrum Analysis Algorithm
           IEEE Transactions on Acoustics, Speech, and Signal Processing
           vol 28, no. 4, 1980

    Parameters
    ----------
    y : np.ndarray
        Time series to fit

    order : int > 0
        Order of the linear filter

    Returns
    -------
    a : np.ndarray of length order + 1
        LP prediction error coefficients, i.e. filter denominator polynomial

    Raises
    ------
    ParameterError
        - If y is not valid audio as per `util.valid_audio`
        - If order < 1 or not integer
    FloatingPointError
        - If y is ill-conditioned

    See also
    --------
    scipy.signal.lfilter

    Examples
    --------
    Compute LP coefficients of y at order 16 on entire series

    >>> y, sr = librosa.load(librosa.util.example_audio_file(), offset=30,
    ...                      duration=10)
    >>> librosa.lpc(y, 16)

    Compute LP coefficients, and plot LP estimate of original series

    >>> import matplotlib.pyplot as plt
    >>> import numpy as np
    >>> import scipy
    >>> y, sr = librosa.load(librosa.util.example_audio_file(), offset=30,
    ...                      duration=0.020)
    >>> a = librosa.lpc(y, 2)
    >>> y_hat = scipy.signal.lfilter(np.hstack([[0], -1 * a[1:]]), [1], y)
    >>> plt.figure()
    >>> plt.plot(y)
    >>> plt.plot(y_hat, linestyle='--')
    >>> plt.legend(['y', 'y_hat'])
    >>> plt.title('LP Model Forward Prediction')
    >>> plt.show()

    """
    if not isinstance(order, int) or order < 1:
        raise ParameterError("order must be an integer > 0")

    util.valid_audio(y, mono=True)

    return __lpc(y, order)
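The private helper __lpc is not shown here. As a companion to the docstring above, a minimal sketch of the Burg recursion it describes (an illustration, not librosa's actual implementation):

import numpy as np

def burg_lpc(y, order):
    # Burg's method: choose each reflection coefficient to minimize the sum
    # of forward and backward prediction error energies.
    a = np.zeros(order + 1)
    a[0] = 1.0
    f = np.asarray(y, dtype=float).copy()   # forward prediction errors
    b = f.copy()                            # backward prediction errors
    for m in range(order):
        fp, bp = f[1:], b[:-1]
        k = -2.0 * np.dot(bp, fp) / (np.dot(fp, fp) + np.dot(bp, bp))
        # Levinson-style update of the polynomial coefficients
        a[:m + 2] = a[:m + 2] + k * a[:m + 2][::-1]
        # propagate the shrinking error sequences to the next order
        f, b = fp + k * bp, bp + k * fp
    return a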
Example #8
def waveplot(
    y,
    sr=22050,
    max_points=5e4,
    x_axis="time",
    offset=0.0,
    max_sr=1000,
    ax=None,
    **kwargs,
):
    """Plot the amplitude envelope of a waveform.

    If ``y`` is monophonic, a filled curve is drawn between ``[-abs(y), abs(y)]``.

    If ``y`` is stereo, the curve is drawn between ``[-abs(y[1]), abs(y[0])]``,
    so that the left and right channels are drawn above and below the axis,
    respectively.

    Long signals (``duration >= max_points``) are down-sampled to at
    most ``max_sr`` before plotting.

    .. warning::
        This function is deprecated in librosa 0.8.1 and will be removed
        in 0.9.0.  Its functionality is replaced and extended by `waveshow`.

    Parameters
    ----------
    y : np.ndarray [shape=(n,) or (2,n)]
        audio time series (mono or stereo)

    sr : number > 0 [scalar]
        sampling rate of ``y``

    max_points : positive number or None
        Maximum number of time-points to plot: if the duration of ``y``
        (in samples) exceeds ``max_points``, then ``y`` is downsampled.

        If `None`, no downsampling is performed.

    x_axis : str or None
        Display of the x-axis ticks and tick markers. Accepted values are:

        - 'time' : markers are shown as milliseconds, seconds, minutes, or hours.
                    Values are plotted in units of seconds.

        - 's' : markers are shown as seconds.

        - 'ms' : markers are shown as milliseconds.

        - 'lag' : like time, but past the halfway point counts as negative values.

        - 'lag_s' : same as lag, but in seconds.

        - 'lag_ms' : same as lag, but in milliseconds.

        - `None`, 'none', or 'off': ticks and tick markers are hidden.


    ax : matplotlib.axes.Axes or None
        Axes to plot on instead of the default `plt.gca()`.

    offset : float
        Horizontal offset (in seconds) to start the waveform plot

    max_sr : number > 0 [scalar]
        Maximum sampling rate for the visualization

    kwargs
        Additional keyword arguments to `matplotlib.pyplot.fill_between`

    Returns
    -------
    pc : matplotlib.collections.PolyCollection
        The PolyCollection created by `fill_between`.

    See also
    --------
    waveshow
    librosa.resample
    matplotlib.pyplot.fill_between


    Examples
    --------
    Plot a monophonic waveform

    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.ex('choice'), duration=10)
    >>> fig, ax = plt.subplots(nrows=3, sharex=True, sharey=True)
    >>> librosa.display.waveplot(y, sr=sr, ax=ax[0])
    >>> ax[0].set(title='Monophonic')
    >>> ax[0].label_outer()

    Or a stereo waveform

    >>> y, sr = librosa.load(librosa.ex('choice', hq=True), mono=False, duration=10)
    >>> librosa.display.waveplot(y, sr=sr, ax=ax[1])
    >>> ax[1].set(title='Stereo')
    >>> ax[1].label_outer()

    Or harmonic and percussive components with transparency

    >>> y, sr = librosa.load(librosa.ex('choice'), duration=10)
    >>> y_harm, y_perc = librosa.effects.hpss(y)
    >>> librosa.display.waveplot(y_harm, sr=sr, alpha=0.25, ax=ax[2])
    >>> librosa.display.waveplot(y_perc, sr=sr, color='r', alpha=0.5, ax=ax[2])
    >>> ax[2].set(title='Harmonic + Percussive')
    """

    util.valid_audio(y, mono=False)

    if not (isinstance(max_sr, (int, np.integer)) and max_sr > 0):
        raise ParameterError("max_sr must be a strictly positive integer")

    target_sr = sr
    hop_length = 1

    # Pad an extra channel dimension, if necessary
    if y.ndim == 1:
        y = y[np.newaxis, :]

    if max_points is not None:
        if max_points <= 0:
            raise ParameterError("max_points must be strictly positive")

        if max_points < y.shape[-1]:
            target_sr = min(max_sr, (sr * y.shape[-1]) // max_points)

        hop_length = sr // target_sr

    # Reduce by envelope calculation
    y = __envelope(y, hop_length)

    y_top = y[0]
    y_bottom = -y[-1]

    axes = __check_axes(ax)

    kwargs.setdefault("color", next(axes._get_lines.prop_cycler)["color"])

    locs = offset + core.times_like(y_top, sr=sr, hop_length=hop_length)

    out = axes.fill_between(locs, y_bottom, y_top, **kwargs)

    axes.set_xlim([locs.min(), locs.max()])

    # Construct tickers and locators
    __decorate_axis(axes.xaxis, x_axis)

    return out
Example #9
def waveshow(
    y,
    sr=22050,
    max_points=11025,
    x_axis="time",
    offset=0.0,
    marker="",
    where="post",
    label=None,
    ax=None,
    **kwargs,
):
    """Visualize a waveform in the time domain.

    This function constructs a plot which adaptively switches between a raw
    samples-based view of the signal (`matplotlib.pyplot.step`) and an
    amplitude-envelope view of the signal (`matplotlib.pyplot.fill_between`)
    depending on the time extent of the plot's viewport.

    More specifically, when the plot spans a time interval of less than ``max_points /
    sr`` (by default, 1/2 second), the samples-based view is used, and otherwise a
    downsampled amplitude envelope is used.
    This is done to limit the complexity of the visual elements to guarantee an
    efficient, visually interpretable plot.

    When using interactive rendering (e.g., in a Jupyter notebook or IPython
    console), the plot will automatically update as the view-port is changed, either
    through widget controls or programmatic updates.

    .. note:: When visualizing stereo waveforms, the amplitude envelope will be generated
        so that the upper limits derive from the left channel, and the lower limits derive
        from the right channel, which can produce a vertically asymmetric plot.

        When zoomed in to the sample view, only the first channel will be shown.
        If you want to visualize both channels at the sample level, it is recommended to
        plot each signal independently.


    Parameters
    ----------
    y : np.ndarray [shape=(n,) or (2,n)]
        audio time series (mono or stereo)

    sr : number > 0 [scalar]
        sampling rate of ``y`` (samples per second)

    max_points : positive integer
        Maximum number of samples to draw.  When the plot covers a time extent
        smaller than ``max_points / sr`` (default: 1/2 second), samples are drawn.

        If drawing raw samples would exceed `max_points`, then a downsampled
        amplitude envelope extracted from non-overlapping windows of `y` is
        visualized instead.  The parameters of the amplitude envelope are defined so
        that the resulting plot cannot produce more than `max_points` frames.

    x_axis : str or None
        Display of the x-axis ticks and tick markers. Accepted values are:

        - 'time' : markers are shown as milliseconds, seconds, minutes, or hours.
                    Values are plotted in units of seconds.

        - 's' : markers are shown as seconds.

        - 'ms' : markers are shown as milliseconds.

        - 'lag' : like time, but past the halfway point counts as negative values.

        - 'lag_s' : same as lag, but in seconds.

        - 'lag_ms' : same as lag, but in milliseconds.

        - `None`, 'none', or 'off': ticks and tick markers are hidden.


    ax : matplotlib.axes.Axes or None
        Axes to plot on instead of the default `plt.gca()`.

    offset : float
        Horizontal offset (in seconds) to start the waveform plot

    marker : string
        Marker symbol to use for sample values. (default: no markers)

        See also: `matplotlib.markers`.

    where : string, {'pre', 'mid', 'post'}
        This setting determines how both waveform and envelope plots interpolate
        between observations.

        See `matplotlib.pyplot.step` for details.

        Default: 'post'

    label : string [optional]
        The label string applied to this plot.
        Note that the label is attached only to the envelope
        (`fill_between`) artist, not to the sample-level step plot.

    kwargs
        Additional keyword arguments to `matplotlib.pyplot.fill_between` and
        `matplotlib.pyplot.step`.

        Note that only those arguments which are common to both functions will be
        supported.

    Returns
    -------
    librosa.display.AdaptiveWaveplot
        An object of type `librosa.display.AdaptiveWaveplot`

    See also
    --------
    AdaptiveWaveplot
    matplotlib.pyplot.step
    matplotlib.pyplot.fill_between
    matplotlib.markers


    Examples
    --------
    Plot a monophonic waveform with an envelope view

    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.ex('choice'), duration=10)
    >>> fig, ax = plt.subplots(nrows=3, sharex=True)
    >>> librosa.display.waveshow(y, sr=sr, ax=ax[0])
    >>> ax[0].set(title='Envelope view, mono')
    >>> ax[0].label_outer()

    Or a stereo waveform

    >>> y, sr = librosa.load(librosa.ex('choice', hq=True), mono=False, duration=10)
    >>> librosa.display.waveshow(y, sr=sr, ax=ax[1])
    >>> ax[1].set(title='Envelope view, stereo')
    >>> ax[1].label_outer()

    Or harmonic and percussive components with transparency

    >>> y, sr = librosa.load(librosa.ex('choice'), duration=10)
    >>> y_harm, y_perc = librosa.effects.hpss(y)
    >>> librosa.display.waveshow(y_harm, sr=sr, alpha=0.5, ax=ax[2], label='Harmonic')
    >>> librosa.display.waveshow(y_perc, sr=sr, color='r', alpha=0.5, ax=ax[2], label='Percussive')
    >>> ax[2].set(title='Multiple waveforms')
    >>> ax[2].legend()

    Zooming in on a plot to show raw sample values

    >>> fig, (ax, ax2) = plt.subplots(nrows=2, sharex=True)
    >>> ax.set(xlim=[6.0, 6.01], title='Sample view', ylim=[-0.2, 0.2])
    >>> librosa.display.waveshow(y, sr=sr, ax=ax, marker='.', label='Full signal')
    >>> librosa.display.waveshow(y_harm, sr=sr, alpha=0.5, ax=ax2, label='Harmonic')
    >>> librosa.display.waveshow(y_perc, sr=sr, color='r', alpha=0.5, ax=ax2, label='Percussive')
    >>> ax.label_outer()
    >>> ax.legend()
    >>> ax2.legend()

    """
    util.valid_audio(y, mono=False)

    # Pad an extra channel dimension, if necessary
    if y.ndim == 1:
        y = y[np.newaxis, :]

    if max_points <= 0:
        raise ParameterError(
            "max_points={} must be strictly positive".format(max_points))

    # Create the adaptive drawing object
    axes = __check_axes(ax)

    if "color" not in kwargs:
        kwargs.setdefault("color", next(axes._get_lines.prop_cycler)["color"])

    # Reduce by envelope calculation
    # this choice of hop ensures that the envelope has at most max_points values
    hop_length = max(1, y.shape[-1] // max_points)
    y_env = __envelope(y, hop_length)

    # Split the envelope into top and bottom
    y_bottom, y_top = -y_env[-1], y_env[0]

    times = offset + core.times_like(y, sr=sr, hop_length=1)

    # Only plot up to max_points worth of data here
    (steps, ) = axes.step(times[:max_points],
                          y[0, :max_points],
                          marker=marker,
                          where=where,
                          **kwargs)

    envelope = axes.fill_between(
        times[:len(y_top) * hop_length:hop_length],
        y_bottom,
        y_top,
        step=where,
        label=label,
        **kwargs,
    )
    adaptor = AdaptiveWaveplot(times,
                               y[0],
                               steps,
                               envelope,
                               sr=sr,
                               max_samples=max_points)

    axes.callbacks.connect("xlim_changed", adaptor.update)

    # Force an initial update to ensure the state is consistent
    adaptor.update(axes)

    # Construct tickers and locators
    __decorate_axis(axes.xaxis, x_axis)

    return adaptor
Example #10
def stft(y,
         n_fft=2048,
         hop_length=None,
         win_length=None,
         window='hann',
         center=True,
         dtype=np.complex64,
         pad_mode='reflect'):
    """Short-time Fourier transform (STFT)

    Returns a complex-valued matrix D such that
        `np.abs(D[f, t])` is the magnitude of frequency bin `f`
        at frame `t`

        `np.angle(D[f, t])` is the phase of frequency bin `f`
        at frame `t`

    Parameters
    ----------
    y : np.ndarray [shape=(n,)], real-valued
        the input signal (audio time series)

    n_fft : int > 0 [scalar]
        FFT window size

    hop_length : int > 0 [scalar]
        number of audio samples between adjacent STFT columns.
        If unspecified, defaults to ``win_length // 4``.

    win_length  : int <= n_fft [scalar]
        Each frame of audio is windowed by `window()`.
        The window will be of length `win_length` and then padded
        with zeros to match `n_fft`.

        If unspecified, defaults to ``win_length = n_fft``.

    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        - a window specification (string, tuple, or number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.hanning`
        - a vector or array of length `n_fft`

        .. see also:: `filters.get_window`

    center      : boolean
        - If `True`, the signal `y` is padded so that frame
          `D[:, t]` is centered at `y[t * hop_length]`.
        - If `False`, then `D[:, t]` begins at `y[t * hop_length]`

    dtype       : numeric type
        Complex numeric type for `D`.  Default is 64-bit complex.

    pad_mode : string
        If `center=True`, the padding mode to use at the edges of the signal.
        By default, STFT uses reflection padding.


    Returns
    -------
    D : np.ndarray [shape=(1 + n_fft/2, t), dtype=dtype]
        STFT matrix


    See Also
    --------
    istft : Inverse STFT

    ifgram : Instantaneous frequency spectrogram

    np.pad : array padding

    Notes
    -----
    This function caches at level 20.


    Examples
    --------

    >>> y, sr = librosa.load(librosa.util.example_audio_file())
    >>> D = np.abs(librosa.stft(y))
    >>> D
    array([[2.58028018e-03, 4.32422794e-02, 6.61255598e-01, ...,
            6.82710262e-04, 2.51654536e-04, 7.23036574e-05],
           [2.49403086e-03, 5.15930466e-02, 6.00107312e-01, ...,
            3.48026224e-04, 2.35853557e-04, 7.54836728e-05],
           [7.82410789e-04, 1.05394892e-01, 4.37517226e-01, ...,
            6.29352580e-04, 3.38571583e-04, 8.38094638e-05],
           ...,
           [9.48568513e-08, 4.74725084e-07, 1.50052492e-05, ...,
            1.85637656e-08, 2.89708542e-08, 5.74304337e-09],
           [1.25165826e-07, 8.58259284e-07, 1.11157215e-05, ...,
            3.49099771e-08, 3.11740926e-08, 5.29926236e-09],
           [1.70630571e-07, 8.92518756e-07, 1.23656537e-05, ...,
            5.33256745e-08, 3.33264900e-08, 5.13272980e-09]], dtype=float32)


    Use left-aligned frames, instead of centered frames

    >>> D_left = np.abs(librosa.stft(y, center=False))


    Use a shorter hop length

    >>> D_short = np.abs(librosa.stft(y, hop_length=64))


    Display a spectrogram

    >>> import matplotlib.pyplot as plt
    >>> librosa.display.specshow(librosa.amplitude_to_db(D,
    ...                                                  ref=np.max),
    ...                          y_axis='log', x_axis='time')
    >>> plt.title('Power spectrogram')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.tight_layout()

    """

    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    # NOTE: this variant deliberately swaps the standard get_window() call
    # for a Vorbis window; vorbis() is not defined in this snippet
    # (a minimal sketch follows after this example).
    #fft_window = get_window(window, win_length, fftbins=True)
    fft_window = vorbis(win_length)

    # Pad the window out to n_fft size
    fft_window = util.pad_center(fft_window, n_fft)

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))

    # Check audio is valid
    util.valid_audio(y)

    # Pad the time series so that frames are centered
    if center:
        y = np.pad(y, int(n_fft // 2), mode=pad_mode)

    # Window the time series.
    y_frames = util.frame(y, frame_length=n_fft, hop_length=hop_length)

    # Pre-allocate the STFT matrix
    stft_matrix = np.empty((int(1 + n_fft // 2), y_frames.shape[1]),
                           dtype=dtype,
                           order='F')

    # how many columns can we fit within MAX_MEM_BLOCK?
    n_columns = int(util.MAX_MEM_BLOCK /
                    (stft_matrix.shape[0] * stft_matrix.itemsize))

    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])

        stft_matrix[:,
                    bl_s:bl_t] = fft.fft(fft_window * y_frames[:, bl_s:bl_t],
                                         axis=0)[:stft_matrix.shape[0]]

    return stft_matrix
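The vorbis() helper used above is never defined in this snippet. Assuming the standard Vorbis window, w[k] = sin(pi/2 * sin^2(pi * (k + 0.5) / N)), a minimal sketch:

import numpy as np

def vorbis(n):
    # Vorbis (MDCT) window of length n
    k = np.arange(n)
    return np.sin(0.5 * np.pi * np.sin(np.pi * (k + 0.5) / n) ** 2)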
Example #11
    def hht(self,
            y,
            hop_length=None,
            win_length=None,
            center=True,
            dtype=np.complex64,
            pad_mode='reflect'):
        """Hilbert-Huang transform (HHT)

        Parameters
        ----------
        y : np.ndarray [shape=(n,)], real-valued
            the input signal (audio time series)

        hop_length : int > 0 [scalar]
            number of audio samples between adjacent frames.
            If unspecified, defaults to ``win_length // 2``.

        win_length  : int <= n_hht [scalar]
            Each frame of audio is windowed by the instance's window.
            The window will be of length `win_length` and then padded
            with zeros to match `n_hht`.

            If unspecified, defaults to ``win_length = n_hht``.

        center      : boolean
            - If `True`, the signal `y` is padded so that frame
              `D[:, t]` is centered at `y[t * hop_length]`.
            - If `False`, then `D[:, t]` begins at `y[t * hop_length]`

        dtype       : numeric type
            Complex numeric type for `D`.  Default is 64-bit complex.

        pad_mode : string
            If `center=True`, the padding mode to use at the edges of the signal.
            By default, HHT uses reflection padding.

        Returns
        -------
        hht_matrix : np.ndarray [shape=(27, t), dtype=dtype]
        bjp_matrix : np.ndarray [shape=(n_hht-1, t), dtype=dtype]

        """

        # By default, use the entire frame
        if win_length is None:
            win_length = self.n_hht

        # Set the default hop, if it's not already specified
        if hop_length is None:
            hop_length = int(win_length / 2)

        hht_window = self.window

        # Pad the window out to n_hht size
        hht_window = util.pad_center(hht_window, self.n_hht)

        # Reshape so that the window can be broadcast
        hht_window = hht_window.reshape((-1, 1))

        # Check audio is valid
        util.valid_audio(y)

        # Pad the time series so that frames are centered
        if center:
            y = np.pad(y, self.n_hht - 1, mode=pad_mode)

        # Window the time series.
        y_frames = util.frame(y,
                              frame_length=self.n_hht,
                              hop_length=hop_length).T

        # Pre-allocate the HHT matrix
        hht_matrix = np.empty((27, y_frames.shape[0]), dtype=dtype, order='F')

        bjp_matrix = np.empty((self.n_hht - 1, y_frames.shape[0]),
                              dtype=dtype,
                              order='F')

        for bl_s in range(hht_matrix.shape[1]):
            frame_signal = hht_window[:, 0] * y_frames[bl_s, :]
            A, f, bjp = get_hht(frame_signal, self.fs)
            hht_matrix[:, bl_s] = self.hht_based_feature(A, f * self.fs, bjp)
            bjp_matrix[:, bl_s] = bjp

        return hht_matrix, bjp_matrix
Example #12
def stft(y,
         n_fft=2048,
         hop_length=None,
         win_length=None,
         window=None,
         center=True,
         dtype=np.complex64):
    import scipy
    import six
    from librosa import util
    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft
        #win_length = tf.constant(n_fft)

    # Set the default hop, if it's not already specified
    if hop_length is None:
        # win_length is a plain int at this point, so no .value() call applies
        hop_length = int(win_length // 4)
        #hop_length = win_length/4
        #hop_length.to_int64()

    if window is None:
        # Default is an asymmetric Hann window
        fft_window = scipy.signal.hann(win_length, sym=False)
        #fft_window = tf.constant(scipy.signal.hann(convertTFtoNP(win_length), sym=False))

    elif six.callable(window):
        # User supplied a window function

        fft_window = window(win_length)

    else:
        # User supplied a window vector.
        # Make sure it's an array:
        fft_window = np.asarray(window)

        # validate length compatibility
        # if fft_window.size != n_fft:
        #     raise ParameterError('Size mismatch between n_fft and len(window)')

    # Pad the window out to n_fft size
    fft_window = util.pad_center(fft_window, n_fft)
    #fft_window.assign(util.pad_center(convertTFtoNP(fft_window), n_fft))

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))
    #tf.reshape(fft_window, (-1,1))

    # Pad the time series so that frames are centered
    if center:
        util.valid_audio(y)
        y_ = np.pad(convertTFtoNP(y), int(n_fft // 2), mode='reflect')
    else:
        # ensure y_ is defined when centering is disabled
        y_ = convertTFtoNP(y)
    #    padding = int(n_fft // 2)
    #    y_frames = tf.pad(y, [[padding, padding],[padding,padding]], mode='REFLECT')

    # Window the time series.
    y_frames = util.frame(y_, frame_length=n_fft, hop_length=hop_length)
    #y_frames.assign(util.frame(convertTFtoNP(y_frames), frame_length=n_fft, hop_length=hop_length))

    # Pre-allocate the STFT matrix
    stft_matrix = np.empty((int(1 + n_fft // 2), y_frames.shape[1]),
                           dtype=dtype,
                           order='F')
    #stft_matrix = tf.zeros((int(1 + n_fft // 2), y_frames.get_shape()[1]._value),
    #                      dtype=dtype,
    #                      order='F')

    # how many columns can we fit within MAX_MEM_BLOCK?
    n_columns = int(util.MAX_MEM_BLOCK /
                    (stft_matrix.shape[0] * stft_matrix.itemsize))

    #n_columns = int(util.MAX_MEM_BLOCK / (stft_matrix.get_shape()[0]._value *
    #                                      convertTFtoNP(stft_matrix).itemsize))

    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        #for bl_s in range(0, stft_matrix.get_shape()[1]._value, n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
        #bl_t = min(bl_s + n_columns, stft_matrix.get_shape()[1]._value)
        # RFFT and Conjugate here to match phase from DPWE code
        stft_matrix[:, bl_s:bl_t] = scipy.fftpack.fft(
            fft_window * y_frames[:, bl_s:bl_t],
            axis=0)[:stft_matrix.shape[0]].conj()
        #tf.scatter_update(stft_matrix, tf.constant(range(bl_s,bl_t)), tf.conj(tf.slice(tf.fft(
        #                                    fft_window * tf.slice(
        #                                    y_frames, [0,bl_s],[y_frames.get_shape()[0]._value,bl_t-bl_s])),
        #                                    [0],[stft_matrix.get_shape()[0]._value])))

    return stft_matrix
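convertTFtoNP() is also never defined in this snippet. Under eager TensorFlow, a trivial stand-in (a hypothetical helper, sketched only so the example is self-contained) would be:

def convertTFtoNP(tensor):
    # Materialize a TensorFlow tensor as a NumPy array (eager mode only;
    # graph mode would need a session run instead).
    return tensor.numpy()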