Example #1
0
def synthesize(
        fs,
        f0s,
        SPEC,
        NM=None,
        wavlen=None,
        ener_multT0=False,
        nm_cont=False,
        nm_lowpasswinlen=9,
        hp_f0coef=0.5,
        antipreechohwindur=0.001,
        pp_f0_rmsteps=False,
        pp_f0_smooth=None,
        pp_atten1stharminsilences=None,
        verbose=1):
    """Synthesize a waveform pulse by pulse from f0, an amplitude envelope and a noise mask.

    Pulse positions are generated from the (interpolated) f0 curve. For each
    pulse, a spectrum is built from a delayed Dirac, the minimum-phase version
    of the spectral envelope and a noise segment applied through the noise
    mask; the resulting time-domain segment is overlap-added in the output.

    The function relies on names expected at module level: ``np``,
    ``scipy.signal``, ``warnings``, the helper module ``sp`` and ``getwinlen``.
    The inputs are copied, they are not modified.

    Args:
        fs: Sampling frequency [Hz].
        f0s: 2-column array: column 0 holds the frame times [s], column 1 the
            f0 values [Hz]. Non-positive f0 values are replaced by linear
            interpolation between the positive ones.
        SPEC: Amplitude spectral envelope, one row per frame of ``f0s``. The
            DFT length is deduced as ``(SPEC.shape[1] - 1) * 2``.
        NM: Noise mask with the same shape as ``SPEC`` (clipped to [0, 1]
            before use). Zeros are used if None.
        wavlen: Number of samples of the output. If None, it is
            ``round(f0s[-1, 0] * fs)``.
        ener_multT0: If True, multiply each pulse spectrum by sqrt(fs/f0).
        nm_cont: If False, force a binary noise mask by thresholding at 0.5.
        nm_lowpasswinlen: Length of the Hann window used to smooth the noise
            mask along frequency (no smoothing if not greater than 1).
        hp_f0coef: Factor of f0 giving the cut-off of the high-pass filter
            applied to the envelope (def. 0.5*f0). None disables the filter.
        antipreechohwindur: Duration [s] of the half window damping each
            synthesized segment at its beginning AND at its end.
        pp_f0_rmsteps: If True, post-process f0 with ``sp.f0s_rmsteps``.
        pp_f0_smooth: If not None, window duration [s] of the median and FIR
            filters used to smooth the log-f0 curve.
        pp_atten1stharminsilences: If not None, energy threshold (relative to
            the median energy of the active frames, in the unit returned by
            ``sp.mag2db``) below which the high-pass cut-off is raised to
            1.5*f0. Typical value is -25.
        verbose: >0 prints a summary line, >1 also prints a progress line,
            >2 also plots the result and opens the IPython debugger.

    Returns:
        1-D array of ``wavlen`` samples.

    Raises:
        ValueError: If the sizes of ``f0s``, ``SPEC`` and ``NM`` do not match,
            or if a synthesis window is longer than the DFT length.
    """
    import sys  # Only needed for the progress line (valid on Python 2 and 3)

    winnbper = 4  # Number of periods in a synthesis windows. It still contains only one single pulse, but leaves space for the VTF to decay without being cut abruptly.

    # Copy the inputs to avoid modifying them
    f0s = f0s.copy()
    SPEC = SPEC.copy()
    if NM is None:
        NM = np.zeros(SPEC.shape)
    else:
        NM = NM.copy()

    # Check the size of the inputs
    if f0s.shape[0] != SPEC.shape[0]:
        raise ValueError(
            'F0 size {} and spectrogram size {} do not match'.format(
                f0s.shape[0], SPEC.shape[0]))  # pragma: no cover
    if SPEC.shape != NM.shape:
        raise ValueError(
            'spectrogram size {} and NM size {} do not match.'.format(
                SPEC.shape, NM.shape))  # pragma: no cover

    if wavlen is None:
        wavlen = int(np.round(f0s[-1, 0] * fs))
    dftlen = (SPEC.shape[1] - 1) * 2
    # Floor division: these values are used as array sizes
    nbins = dftlen // 2 + 1
    bins = np.arange(nbins)
    shift = np.median(np.diff(f0s[:, 0]))
    if verbose > 0:
        print(
            'PML Synthesis (dur={}s, fs={}Hz, f0 in [{:.0f},{:.0f}]Hz, shift={}s, dftlen={})'
            .format(wavlen / float(fs), fs, np.min(f0s[:, 1]),
                    np.max(f0s[:, 1]), shift, dftlen))

    # Prepare the features

    # Enforce continuous f0
    voiced = f0s[:, 1] > 0
    f0s[:, 1] = np.interp(f0s[:, 0], f0s[voiced, 0], f0s[voiced, 1])
    # If asked, removes steps in the f0 curve
    if pp_f0_rmsteps:
        f0s = sp.f0s_rmsteps(f0s)
    # If asked, smooth the f0 curve using median and FIR filters
    if pp_f0_smooth is not None:
        print('    Smoothing f0 curve using {}[s] window'.format(pp_f0_smooth))
        import scipy.signal as sig
        lf0 = np.log(f0s[:, 1])
        bcoefslen = int(0.5 * pp_f0_smooth / shift) * 2 + 1  # Has to be odd
        lf0 = sig.medfilt(lf0, bcoefslen)
        bcoefs = np.hamming(bcoefslen)
        bcoefs = bcoefs / sum(bcoefs)
        lf0 = sig.filtfilt(bcoefs, [1], lf0)
        f0s[:, 1] = np.exp(lf0)

    winlenmax = getwinlen(np.min(f0s[:, 1]), fs, winnbper)
    if winlenmax > dftlen:
        # Report the very same value as the one used for clipping
        f0floor = winnbper * fs / float(dftlen - 2)
        warnings.warn(
            '\n\nWARNING: The maximum window length ({}) is bigger than the DFT length ({}). Please, increase the DFT length of your spectral features (the second dimension) or check if the f0 curve has extremly low values and try to clip them to higher values (at least higher than 50Hz). The f0 curve has been clipped to {}Hz.\n\n'
            .format(winlenmax, dftlen, f0floor))  # pragma: no cover
        f0s[:, 1] = np.clip(f0s[:, 1], f0floor, 1e6)

    # Remove noise below f0, as it is supposed to be already the case
    for n in range(NM.shape[0]):
        NM[n, :int((float(dftlen) / fs) * 2 * f0s[n, 1])] = 0.0

    if not nm_cont:
        print('    Forcing binary noise mask')
        NM[NM <= 0.5] = 0.0  # To be sure that voiced segments are not hoarse
        NM[NM > 0.5] = 1.0  # To be sure the noise segments are fully noisy

    # Generate the pulse positions [1](2) (i.e. the synthesis instants, the GCIs in voiced segments)
    ts = [0.0]
    while ts[-1] < float(wavlen) / fs:
        cf0 = np.interp(ts[-1], f0s[:, 0], f0s[:, 1])
        if cf0 < 50.0: cf0 = 50
        ts.append(ts[-1] + (1.0 / cf0))
    ts = np.array(ts)
    # From here on, f0s is sampled at the pulse positions
    f0s = np.vstack((ts, np.interp(ts, f0s[:, 0], f0s[:, 1]))).T

    # Resample the features to the pulse positions

    # Spectral envelope uses the nearest, to avoid over-smoothing
    SPECR = np.zeros((f0s.shape[0], nbins))
    for n, t in enumerate(f0s[:, 0]):  # Nearest: Way better for plosives
        idx = int(np.round(t / shift))
        idx = np.clip(idx, 0, SPEC.shape[0] - 1)
        SPECR[n, :] = SPEC[idx, :]

    # Keep trace of the median energy [dB] over the whole signal
    ener = np.mean(SPECR, axis=1)
    idxacs = np.where(sp.mag2db(ener) > sp.mag2db(np.max(ener)) -
                      30)[0]  # Get approx active frames # TODO Param
    enermed = sp.mag2db(np.median(ener[idxacs]))  # Median energy [dB]
    ener = sp.mag2db(ener)

    # Resample the noise feature to the pulse positions
    # Smooth the frequency response of the mask in order to avoid Gibbs
    # (poor Gibbs nobody want to see him)
    nm_lowpasswin = np.hanning(nm_lowpasswinlen)
    nm_lowpasswin /= np.sum(nm_lowpasswin)
    NMR = np.zeros((f0s.shape[0], nbins))
    for n, t in enumerate(f0s[:, 0]):
        idx = int(np.round(t / shift))  # Nearest is better for plosives
        idx = np.clip(idx, 0, NM.shape[0] - 1)
        NMR[n, :] = NM[idx, :]
        if nm_lowpasswinlen > 1:
            NMR[n, :] = scipy.signal.filtfilt(nm_lowpasswin, [1.0], NMR[n, :])

    NMR = np.clip(NMR, 0.0, 1.0)

    # The complete waveform that we will fill with the pulses
    wav = np.zeros(wavlen)
    # Half window on the left of the synthesized segment to avoid pre-echo
    dampinhwin = np.hanning(
        1 +
        2 * int(np.round(antipreechohwindur * fs)))  # 1ms forced dampingwindow
    # Keep the rising half only (floor division: it is a slice bound)
    dampinhwin = dampinhwin[:(len(dampinhwin) - 1) // 2 + 1]

    for n, t in enumerate(f0s[:, 0]):
        f0 = f0s[n, 1]

        if verbose > 1:
            sys.stdout.write(
                "\rPM Synthesis (python) t={:4.3f}s f0={:3.3f}Hz               ".format(
                    t, f0))

        # Window's length
        # TODO It should be ensured that the beggining and end of the
        #      noise is within the window. Nothing is doing this currently!
        winlen = getwinlen(f0, fs, winnbper)
        # TODO We also assume that the VTF's decay is shorter
        #      than winnbper-1 periods (dangerous with high pitched and tense voice).
        if winlen > dftlen:
            raise ValueError(
                'The window length ({}) is bigger than the DFT length ({}). Please, increase the dftlen of your spectral features or check if the f0 curve has extremly low values and try to clip them to higher values (at least higher than 50[Hz])'
                .format(winlen, dftlen))  # pragma: no cover

        # Set the rough position of the pulse in the window (the closest sample)
        # We keep 1/winnbper of the window on the left because the
        # pulse signal is minimum phase. And the remaining part
        # on the right to let the VTF decay.
        pulseposinwin = int((1.0 / winnbper) * winlen)

        # The sample indices of the current pulse wrt. the final waveform
        pulsesample = int(round(fs * t))  # Closest sample to the pulse time
        winidx = pulsesample + np.arange(winlen) - pulseposinwin

        # Build the pulse spectrum

        # Let start with a Dirac
        S = np.ones(nbins, dtype=np.complex64)

        # Add the delay to place the Dirac at the "GCI": exp(-j*2*pi*t_i)
        # (the second term is the sub-sample remainder of the pulse position)
        delay = -pulseposinwin - fs * (t - pulsesample / float(fs))
        S *= np.exp((delay * 2j * np.pi / dftlen) * bins)

        # Add the spectral envelope
        # Both amplitude and phase
        E = SPECR[n, :]  # Take the amplitude from the given one
        if hp_f0coef is not None:
            # High-pass it to avoid any residual DC component.
            fcut = hp_f0coef * f0
            if pp_atten1stharminsilences is not None and ener[
                    n] - enermed < pp_atten1stharminsilences:
                fcut = 1.5 * f0  # Try to cut between first and second harm
            HP = sp.butter2hspec(fcut, 4, fs, dftlen, high=True)
            E *= HP
            # Not necessarily good as it is non-causal, so make it causal...
            # ... together with the VTF response below.
        # Build the phase of the envelope from the amplitude
        E = sp.hspec2minphasehspec(E, replacezero=True)  # We spend 2 FFT here!
        S *= E  # Add it to the current pulse

        # Add energy correction wrt f0.
        # STRAIGHT and AHOCODER vocoders do it.
        # (why ? to equalize the energy when changing the pulse's duration ?)
        if ener_multT0:
            S *= np.sqrt(fs / f0)

        # Generate the segment of Gaussian noise
        # Use mid-points before/after pulse position
        if n > 0: leftbnd = int(np.round(fs * 0.5 * (f0s[n - 1, 0] + t)))
        else: leftbnd = int(np.round(fs * (t - 0.5 / f0s[n, 1])))  # int(0)
        if n < f0s.shape[0] - 1:
            rightbnd = int(np.round(fs * 0.5 * (t + f0s[n + 1, 0]))) - 1
        else:
            rightbnd = int(np.round(
                fs * (t + 0.5 / f0s[n, 1])))  #rightbnd=int(wavlen-1)
        gausswinlen = rightbnd - leftbnd  # The length of the noise segment
        gaussnoise4win = np.random.normal(size=(gausswinlen))  # The noise

        GN = np.fft.rfft(gaussnoise4win,
                         dftlen)  # Move the noise to freq domain
        # Normalize it by its energy (@Yannis, That's your answer at SSW9!)
        GN /= np.sqrt(np.mean(np.abs(GN)**2))
        # Place the noise within the pulse's window
        delay = (pulseposinwin - (leftbnd - winidx[0]))
        GN *= np.exp((delay * 2j * np.pi / dftlen) * bins)

        # Add it to the pulse spectrum, under the condition of the mask
        # (a mask value of 0 leaves the pulse untouched, since GN**0 == 1)
        S *= GN**NMR[n, :]

        # That's it! the pulse spectrum is ready!

        # Move it to time domain
        deter = np.fft.irfft(S)[0:winlen]

        # Add half window on the left of the synthesized segment
        # to avoid any possible pre-echo
        deter[:leftbnd - winidx[0] - len(dampinhwin)] = 0.0
        deter[leftbnd - winidx[0] - len(dampinhwin):leftbnd -
              winidx[0]] *= dampinhwin

        # Add half window on the right
        # to avoid cutting the VTF response abruptly
        deter[-len(dampinhwin):] *= dampinhwin[::-1]

        # Write the synthesized segment in the final waveform
        if winidx[0] < 0 or winidx[-1] >= wavlen:
            # The window is partly outside of the waveform ...
            # ... thus copy only the existing part
            itouse = np.logical_and(winidx >= 0, winidx < wavlen)
            wav[winidx[itouse]] += deter[itouse]
        else:
            wav[winidx] += deter

    if verbose > 1:
        # Erase the progress line
        sys.stdout.write(
            '\r                                                               \r')

    if verbose > 2:  # pragma: no cover
        import matplotlib.pyplot as plt
        plt.ion()
        _, axs = plt.subplots(3, 1, sharex=True, sharey=False)
        times = np.arange(len(wav)) / float(fs)
        axs[0].plot(times, wav, 'k')
        axs[0].set_ylabel('Waveform\nAmplitude')
        axs[0].grid()
        axs[1].plot(f0s[:, 0], f0s[:, 1], 'k')
        axs[1].set_ylabel('F0\nFrequency [Hz]')
        axs[1].grid()
        axs[2].imshow(sp.mag2db(SPEC).T,
                      origin='lower',
                      aspect='auto',
                      interpolation='none',
                      extent=(f0s[0, 0], f0s[-1, 0], 0, 0.5 * fs))
        axs[2].set_ylabel('Amp. Envelope\nFrequency [Hz]')

        from IPython.core.debugger import Pdb
        Pdb().set_trace()

    return wav
Example #2
0
def synthesize(
        fs,
        f0s,
        SPEC,
        NM=None,
        wavlen=None,
        f0s_rmsteps=False,
        ener_multT0=False,
        nm_lowpasswinlen=9,
        hp_f0coef=0.5,
        antipreechohwindur=0.001,
        verbose=1):
    """Synthesize a waveform pulse by pulse from f0, an amplitude envelope and a noise mask.

    Pulse positions are generated from the (interpolated) f0 curve. For each
    pulse, a spectrum is built from a delayed Dirac, the minimum-phase version
    of the spectral envelope and a noise segment applied through the noise
    mask; the resulting time-domain segment is overlap-added in the output.

    The function relies on names expected at module level: ``np``,
    ``scipy.signal`` and the helper module ``sp``.
    The inputs are copied, they are not modified.

    Args:
        fs: Sampling frequency [Hz].
        f0s: 2-column array: column 0 holds the frame times [s], column 1 the
            f0 values [Hz]. Non-positive f0 values are replaced by linear
            interpolation between the positive ones.
        SPEC: Amplitude spectral envelope, one row per frame of ``f0s``. The
            DFT length is deduced as ``(SPEC.shape[1] - 1) * 2``.
        NM: Noise mask with the same shape as ``SPEC`` (clipped to [0, 1]
            before use). Zeros are used if None.
        wavlen: Number of samples of the output. If None, it is
            ``round(f0s[-1, 0] * fs)``.
        f0s_rmsteps: If True, post-process f0 with ``sp.f0s_rmsteps``.
        ener_multT0: If True, multiply each pulse spectrum by sqrt(fs/f0).
        nm_lowpasswinlen: Length of the Hann window used to smooth the noise
            mask along frequency.
        hp_f0coef: Factor of f0 giving the cut-off of the high-pass filter
            applied to the envelope (def. 0.5*f0). None disables the filter.
        antipreechohwindur: Duration [s] of the half window damping the
            beginning of each synthesized segment.
        verbose: >0 prints a summary line, >1 also prints a progress line,
            >2 also plots the result and opens the IPython debugger.

    Returns:
        1-D array of ``wavlen`` samples.

    Raises:
        ValueError: If the sizes of ``f0s``, ``SPEC`` and ``NM`` do not match,
            or if a synthesis window is longer than the DFT length.
    """
    import sys  # Only needed for the progress line (valid on Python 2 and 3)

    # Copy the inputs to avoid modifying them
    f0s = f0s.copy()
    SPEC = SPEC.copy()
    if NM is None:
        NM = np.zeros(SPEC.shape)
    else:
        NM = NM.copy()

    # Check the size of the inputs
    if f0s.shape[0] != SPEC.shape[0]:
        # Report the number of f0 frames (there is no variable f0 at this point)
        raise ValueError(
            'F0 size {} and spectrogram size {} do not match'.format(
                f0s.shape[0], SPEC.shape[0]))
    if SPEC.shape != NM.shape:
        raise ValueError(
            'spectrogram size {} and NM size {} do not match.'.format(
                SPEC.shape, NM.shape))

    if wavlen is None:
        wavlen = int(np.round(f0s[-1, 0] * fs))
    dftlen = (SPEC.shape[1] - 1) * 2
    # Floor division: these values are used as array sizes
    nbins = dftlen // 2 + 1
    bins = np.arange(nbins)
    shift = np.median(np.diff(f0s[:, 0]))
    if verbose > 0:
        print(
            'PM Synthesis (dur={}s, fs={}Hz, f0 in [{:.0f},{:.0f}]Hz, shift={}s, dftlen={})'
            .format(wavlen / float(fs), fs, np.min(f0s[:, 1]),
                    np.max(f0s[:, 1]), shift, dftlen))

    # Prepare the features

    # Enforce continuous f0
    voiced = f0s[:, 1] > 0
    f0s[:, 1] = np.interp(f0s[:, 0], f0s[voiced, 0], f0s[voiced, 1])
    # If asked, removes steps in the f0 curve
    if f0s_rmsteps:
        f0s = sp.f0s_rmsteps(f0s)

    # Remove noise below f0, as it is supposed to be already the case
    for n in range(NM.shape[0]):
        NM[n, :int((float(dftlen) / fs) * 2 * f0s[n, 1])] = 0.0

    # Generate the pulse positions [1](2) (i.e. the synthesis instants, the GCIs in voiced segments)
    ts = [0.0]
    while ts[-1] < float(wavlen) / fs:
        cf0 = np.interp(ts[-1], f0s[:, 0], f0s[:, 1])
        if cf0 < 50.0: cf0 = 50
        ts.append(ts[-1] + (1.0 / cf0))
    ts = np.array(ts)
    # From here on, f0s is sampled at the pulse positions
    f0s = np.vstack((ts, np.interp(ts, f0s[:, 0], f0s[:, 1]))).T

    # Resample the features to the pulse positions

    # Spectral envelope uses the nearest, to avoid over-smoothing
    SPECR = np.zeros((f0s.shape[0], nbins))
    for n, t in enumerate(f0s[:, 0]):  # Nearest: Way better for plosives
        idx = int(np.round(t / shift))
        idx = np.clip(idx, 0, SPEC.shape[0] - 1)
        SPECR[n, :] = SPEC[idx, :]

    # Resample the noise feature to the pulse positions
    # Smooth the frequency response of the mask in order to avoid Gibbs
    # (poor Gibbs nobody want to see him)
    nm_lowpasswin = np.hanning(nm_lowpasswinlen)
    nm_lowpasswin /= np.sum(nm_lowpasswin)
    NMR = np.zeros((f0s.shape[0], nbins))
    for n, t in enumerate(f0s[:, 0]):
        idx = int(np.round(t / shift))  # Nearest is better for plosives
        idx = np.clip(idx, 0, NM.shape[0] - 1)
        NMR[n, :] = NM[idx, :]
        NMR[n, :] = scipy.signal.filtfilt(nm_lowpasswin, [1.0], NMR[n, :])

    NMR = np.clip(NMR, 0.0, 1.0)

    # The complete waveform that we will fill with the pulses
    wav = np.zeros(wavlen)
    # Half window on the left of the synthesized segment to avoid pre-echo
    dampinhwin = np.hanning(
        1 +
        2 * int(np.round(antipreechohwindur * fs)))  # 1ms forced dampingwindow
    # Keep the rising half only (floor division: it is a slice bound)
    dampinhwin = dampinhwin[:(len(dampinhwin) - 1) // 2 + 1]

    for n, t in enumerate(f0s[:, 0]):
        f0 = f0s[n, 1]

        if verbose > 1:
            sys.stdout.write(
                "\rPM Synthesis (python) t={:4.3f}s f0={:3.3f}Hz               ".format(
                    t, f0))

        # Window's length
        # TODO It should be ensured that the beggining and end of the
        #      noise is within the window. Nothing is doing this currently!
        winlen = int(np.max(
            (0.050 * fs, 3 * fs / f0)) / 2) * 2 + 1  # Has to be odd
        # TODO We also assume that the VTF's decay is shorter
        #      than 2 periods (dangerous with high pitched tense voice).
        if winlen > dftlen: raise ValueError('winlen>dftlen')

        # Set the rough position of the pulse in the window (the closest sample)
        # We keep a third of the window (1 period) on the left because the
        # pulse signal is minimum phase. And 2/3rd (remaining 2 periods)
        # on the right to let the VTF decay.
        pulseposinwin = int(0.33 * winlen)

        # The sample indices of the current pulse wrt. the final waveform
        pulsesample = int(round(fs * t))  # Closest sample to the pulse time
        winidx = pulsesample + np.arange(winlen) - pulseposinwin

        # Build the pulse spectrum

        # Let start with a Dirac
        S = np.ones(nbins, dtype=np.complex64)

        # Add the delay to place the Dirac at the "GCI": exp(-j*2*pi*t_i)
        # (the second term is the sub-sample remainder of the pulse position)
        delay = -pulseposinwin - fs * (t - pulsesample / float(fs))
        S *= np.exp((delay * 2j * np.pi / dftlen) * bins)

        # Add the spectral envelope
        # Both amplitude and phase
        E = SPECR[n, :]  # Take the amplitude from the given one
        if hp_f0coef is not None:
            # High-pass it to avoid any residual DC component.
            HP = sp.butter2hspec(hp_f0coef * f0, 4, fs, dftlen, high=True)
            E *= HP
            # Not necessarily good as it is non-causal, so make it causal...
            # ... together with the VTF response below.
        # Build the phase of the envelope from the amplitude
        E = sp.hspec2minphasehspec(E, replacezero=True)  # We spend 2 FFT here!
        S *= E  # Add it to the current pulse

        # Add energy correction wrt f0.
        # STRAIGHT and AHOCODER vocoders do it.
        # (why ? to equalize the energy when changing the pulse's duration ?)
        if ener_multT0:
            S *= np.sqrt(fs / f0)

        # Generate the segment of Gaussian noise
        # Use mid-points before/after pulse position
        if n > 0: leftbnd = int(np.round(fs * 0.5 * (f0s[n - 1, 0] + t)))
        else: leftbnd = int(np.round(fs * (t - 0.5 / f0s[n, 1])))  # int(0)
        if n < f0s.shape[0] - 1:
            rightbnd = int(np.round(fs * 0.5 * (t + f0s[n + 1, 0]))) - 1
        else:
            rightbnd = int(np.round(
                fs * (t + 0.5 / f0s[n, 1])))  #rightbnd=int(wavlen-1)
        gausswinlen = rightbnd - leftbnd  # The length of the noise segment
        gaussnoise4win = np.random.normal(size=(gausswinlen))  # The noise

        GN = np.fft.rfft(gaussnoise4win,
                         dftlen)  # Move the noise to freq domain
        # Normalize it by its energy (@Yannis, That's your answer at SSW9!)
        GN /= np.sqrt(np.mean(np.abs(GN)**2))
        # Place the noise within the pulse's window
        delay = (pulseposinwin - (leftbnd - winidx[0]))
        GN *= np.exp((delay * 2j * np.pi / dftlen) * bins)

        # Add it to the pulse spectrum, under the condition of the mask
        # (a mask value of 0 leaves the pulse untouched, since GN**0 == 1)
        S *= GN**NMR[n, :]

        # That's it! the pulse spectrum is ready!

        # Move it to time domain
        deter = np.fft.irfft(S)[0:winlen]

        # Add half window on the left of the synthesized segment
        # to avoid any possible pre-echo
        deter[:leftbnd - winidx[0] - len(dampinhwin)] = 0.0
        deter[leftbnd - winidx[0] - len(dampinhwin):leftbnd -
              winidx[0]] *= dampinhwin

        # Write the synthesized segment in the final waveform
        if winidx[0] < 0 or winidx[-1] >= wavlen:
            # The window is partly outside of the waveform ...
            # ... thus copy only the existing part
            itouse = np.logical_and(winidx >= 0, winidx < wavlen)
            wav[winidx[itouse]] += deter[itouse]
        else:
            wav[winidx] += deter

    if verbose > 1:
        # Erase the progress line
        sys.stdout.write(
            '\r                                                               \r')

    if verbose > 2:
        import matplotlib.pyplot as plt
        plt.ion()
        _, axs = plt.subplots(3, 1, sharex=True, sharey=False)
        times = np.arange(len(wav)) / float(fs)
        axs[0].plot(times, wav, 'k')
        axs[0].set_ylabel('Waveform\nAmplitude')
        axs[0].grid()
        axs[1].plot(f0s[:, 0], f0s[:, 1], 'k')
        axs[1].set_ylabel('F0\nFrequency [Hz]')
        axs[1].grid()
        axs[2].imshow(sp.mag2db(SPEC).T,
                      origin='lower',
                      aspect='auto',
                      interpolation='none',
                      extent=(f0s[0, 0], f0s[-1, 0], 0, 0.5 * fs))
        axs[2].set_ylabel('Amp. Envelope\nFrequency [Hz]')

        from IPython.core.debugger import Pdb
        Pdb().set_trace()

    return wav