def extract(file_list, train_scaler=False):
    """Extract log filterbank features for every file named in ``file_list``.

    Args:
        file_list: Path of a text file that ``np.loadtxt`` can read as
            strings, one audio file name per entry.
        train_scaler: When true, incrementally fit a ``StandardScaler`` on
            the extracted features and dump it to ``scaler.pkl``.

    Returns:
        Whatever ``mp_with_pbar`` returns for the feature extraction job
        (used below as an iterable of HDF5 feature file names).
    """
    # np.loadtxt yields a 0-d array for a single-entry list; keep it iterable.
    fns = np.atleast_1d(np.loadtxt(file_list, dtype='str'))

    print("Extracting features")
    mp_func = partial(log_filterbank_energy,
                      output_dir="features",
                      sampling_rate=sampling_rate,
                      nfft=nfft,
                      n_freq_bins=n_freq_bins,
                      spectral_frame_length_s=spectral_frame_length_s,
                      frame_length_s=frame_length_s,
                      hop_length_s=hop_length_s,
                      force=False,
                      mel_scale=True)
    feature_fns = mp_with_pbar(mp_func, fns, mp.cpu_count())

    if train_scaler:
        print("Training scaler")
        scaler = StandardScaler()
        for fn in tqdm(feature_fns, total=len(feature_fns)):
            with h5py.File(fn, 'r') as f:
                # Read the slice into memory while the file is still open:
                # a dataset handle cannot be indexed once its file is closed.
                spec = f['data'][:, :, 0]
            scaler.partial_fit(spec.T)

        joblib.dump(scaler, "scaler.pkl")

    return feature_fns
Esempio n. 2
0
def fft_and_melscale(song,
                     nhop=512,
                     nffts=(1024, 2048, 4096),
                     mel_nband=80,
                     mel_freqlo=27.5,
                     mel_freqhi=16000.0,
                     include_zero_cross=False):
    """
    FFT and mel-scale feature extraction.

    fft: for each FFT size in ``nffts`` (default 1024, 2048, 4096) the signal
        is cut into frames of that length and a fast Fourier transform is
        applied to every windowed frame.
    melscale: the frequency dimension is reduced with a mel filterbank and
        the result is put on a log10 scale.

    Args:
        song: object providing ``data`` and ``samplerate`` attributes.
        nhop: hop size handed to ``make_frame``.
        nffts: iterable of FFT sizes; one feature channel is produced per size.
        mel_nband, mel_freqlo, mel_freqhi: passed positionally to ``mel``.
        include_zero_cross: when true, store the zero-crossing indices of
            ``song.data`` on ``song.zero_crossing`` and print them.

    Returns:
        ``np.array`` built from the list of per-FFT-size feature matrices.
    """
    feat_channels = []
    for nfft in nffts:
        window = signal.blackmanharris(nfft)
        filt = mel(song.samplerate, nfft, mel_nband, mel_freqlo, mel_freqhi)

        # Frame the signal, then keep only the non-negative frequency bins.
        frame = make_frame(song.data, nhop, nfft)
        spectrum = fft(window * frame)[:, :nfft // 2 + 1]

        # Project the power spectrum onto the mel bands and take the log;
        # the 0.1 offset keeps log10 finite for silent frames.
        mel_power = np.dot(filt, np.transpose(np.abs(spectrum)**2))
        feat_channels.append(20 * np.log10(mel_power + 0.1))

    if include_zero_cross:
        song.zero_crossing = np.where(np.diff(np.sign(song.data)))[0]
        print(song.zero_crossing)
    return np.array(feat_channels)
def extract_f0_func_audiofile(audio_file, gender='M'):
    """Compute a normalised mel spectrogram and normalised F0 for one file.

    Args:
        audio_file: path handed to ``sf.read``.
        gender: ``'M'`` or ``'F'``; selects the F0 search range given to
            ``sptk.rapt`` (50-250 for ``'M'``, 100-600 for ``'F'``).

    Returns:
        Tuple ``(S, f0_norm)``: the scaled dB mel spectrogram and the output
        of ``speaker_normalization`` for the RAPT F0 track.

    Raises:
        ValueError: if ``gender`` is neither ``'M'`` nor ``'F'``.
    """
    # Validate first so a bad argument fails before any filter design or I/O.
    if gender == 'M':
        lo, hi = 50, 250
    elif gender == 'F':
        lo, hi = 100, 600
    else:
        raise ValueError(f"gender must be 'M' or 'F', got {gender!r}")

    # NOTE(review): the mel basis and high-pass filter assume 16 kHz audio,
    # while RAPT below uses the rate read from the file - confirm inputs are
    # always 16 kHz.
    mel_basis = mel(16000, 1024, fmin=90, fmax=7600, n_mels=80).T
    min_level = np.exp(-100 / 20 * np.log(10))
    b, a = butter_highpass(30, 16000, order=5)

    prng = RandomState(0)
    x, fs = sf.read(audio_file)
    if len(x.shape) >= 2:
        # Keep only the first channel of multi-channel audio.
        x = x[:, 0]
    if x.shape[0] % 256 == 0:
        # Append one tiny sample when the length is an exact multiple of 256.
        x = np.concatenate((x, np.array([1e-06])), axis=0)
    y = signal.filtfilt(b, a, x)
    wav = y * 0.95 + (prng.rand(y.shape[0]) - 0.5) * 1e-06

    # Magnitude STFT -> mel -> dB -> affine rescale.
    D = pySTFT(wav).T
    D_mel = np.dot(D, mel_basis)
    D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16
    S = (D_db + 100) / 100

    f0_rapt = sptk.rapt(wav.astype(np.float32) * 32768, fs, 256, min=lo, max=hi, otype=2)
    # -1e10 is the value treated here as "no F0" for a frame.
    index_nonzero = (f0_rapt != -1e10)
    voiced = f0_rapt[index_nonzero]
    mean_f0, std_f0 = np.mean(voiced), np.std(voiced)

    f0_norm = speaker_normalization(f0_rapt, index_nonzero, mean_f0, std_f0)

    return S, f0_norm
Esempio n. 4
0
def log_melsp_01(x,
                 sr=16000,
                 n_fft=1024,
                 hop_length=256,
                 n_mels=80,
                 fmin=80,
                 fmax=8000):
    '''
    Compute a log-mel spectrogram of ``x`` rescaled and clipped to [0, 1].

    Parameters
    x: waveform samples (noise is drawn per element of the first axis, so a
       1-D signal is presumably expected - confirm against callers)
    sr: sampling rate used for the mel basis and the high-pass filter
    n_fft: FFT length passed to ``pySTFT`` and ``mel``
    hop_length: hop length passed to ``pySTFT``
    n_mels, fmin, fmax: mel filterbank parameters

    Returns:
    float32 array of the clipped, normalised dB mel spectrogram
    '''
    mel_basis = mel(sr, n_fft, fmin=fmin, fmax=fmax, n_mels=n_mels).T
    min_level = np.exp(-100 / 20 * np.log(10))
    # Design the 30 Hz high-pass for the actual sampling rate rather than a
    # fixed 16 kHz, so non-default ``sr`` values get the intended cutoff.
    b, a = butter_highpass(30, sr, order=5)

    # Remove drifting noise
    y = signal.filtfilt(b, a, x)
    # Add a little random noise for model robustness
    prng = RandomState()
    wav = y * 0.96 + (prng.rand(y.shape[0]) - 0.5) * 1e-06
    # Compute spectrogram
    D = pySTFT(wav, fft_length=n_fft, hop_length=hop_length).T
    # Convert to mel and normalize
    D_mel = np.dot(D, mel_basis)
    D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16
    S = np.clip((D_db + 100) / 100, 0, 1)

    return S.astype(np.float32)
Esempio n. 5
0
def pncc(audio_wave,
         n_fft=1024,
         sr=16000,
         window="hamming",
         n_mels=40,
         n_pncc=13,
         weight_N=4,
         power=2,
         dct=True):
    """Run the PNCC-style feature pipeline on ``audio_wave``.

    The signal is pre-emphasised, transformed with ``stft``, projected onto a
    mel filterbank and then passed through the chain of helper stages below.

    Args:
        audio_wave: input samples handed to ``scipy.signal.lfilter``.
        n_fft: FFT size for ``stft`` and the mel filterbank.
        sr: sampling rate for the mel filterbank.
        window: window specification forwarded to ``stft``.
        n_mels: number of mel filters.
        n_pncc: number of DCT rows requested from ``filters.dct``.
        weight_N: ``N`` argument of ``weight_smoothing``.
        power: exponent applied to the STFT magnitude and the mel filters.
        dct: when true return the DCT-projected features, otherwise the
            features before the DCT.

    Returns:
        ``dct_v.T`` when ``dct`` is true, else ``v_.T``.
    """
    pre_emphasis_signal = scipy.signal.lfilter([1.0, -0.97], 1, audio_wave)
    stft_pre_emphasis_signal = np.abs(
        stft(pre_emphasis_signal, n_fft=n_fft, window=window))**power
    mel_filter = np.abs(filters.mel(sr, n_fft=n_fft, n_mels=n_mels))**power
    power_stft_pre_signal = np.dot(stft_pre_emphasis_signal.T, mel_filter.T)
    q_ = medium_time_power_calculation(power_stft_pre_signal)
    q_le = asymmetric_lawpass_filtering(q_, 0.999, 0.5)
    pre_q_0 = q_ - q_le
    q_0 = halfwave_rectification(pre_q_0)
    q_f = asymmetric_lawpass_filtering(q_0)
    q_th = temporal_masking(q_0)
    r_sp = after_temporal_masking(q_th, q_f)
    r_ = switch_excitation_or_non_excitation(r_sp=r_sp,
                                             q_f=q_f,
                                             q_le=q_le,
                                             q_power_stft_pre_signal=q_)
    s_ = weight_smoothing(r_=r_, q_=q_, N=weight_N)
    t_ = time_frequency_normalization(p_=power_stft_pre_signal, s_=s_)
    u_ = mean_power_normalization(t_, r_)
    v_ = power_function_nonlinearity(u_)

    if not dct:
        return v_.T

    # Only build and apply the DCT basis when the caller asked for it.
    dct_v = np.dot(filters.dct(n_pncc, v_.shape[1]), v_.T)
    return dct_v.T
Esempio n. 6
0
 def __init__(
         self,
         n_fft: int = 512,
         n_mels: int = 80,
         sample_rate: int = 16000,
         hop_length: int = 200,
         f_max=8000,  # default
         f_min=0,  # default
         power=2.0,  # default
         win_length=None,
         window='hann',  # default
         center=True,
         pad_mode='reflect',  # default
         norm=None,  # default for pytorch
         htk=True  # default for pytorch
 ):
     """Store the STFT settings and precompute the mel basis and FFT window.

     NOTE(review): ``win_length`` and ``center`` are accepted but never
     read here; the window length is always set to ``n_fft``.
     """
     # Plain STFT configuration.
     self.sample_rate = sample_rate
     self.n_fft = n_fft
     self.win_length = n_fft
     self.hop_length = hop_length
     self.pad_mode = pad_mode
     self.power = power

     # Mel filterbank, built once from the filter-related arguments.
     mel_options = dict(sr=sample_rate,
                        n_fft=n_fft,
                        n_mels=n_mels,
                        fmin=f_min,
                        fmax=f_max,
                        norm=norm,
                        htk=htk)
     self.mel_basis = filters.mel(**mel_options)

     # Analysis window reshaped into a single column.
     flat_window = get_window(window, self.win_length, fftbins=True)
     self.fft_window = flat_window.reshape((-1, 1))
Esempio n. 7
0
    def extract(audio_fn):
        """Load one audio file and return its log (mel) spectrogram.

        Reads ``sampling_rate``, ``duration``, ``window``, ``frame_length``,
        ``overlap_length``, ``nfft``, ``mel_scale`` and ``n_freq_bins`` from
        the enclosing scope.

        Returns:
            The log spectrogram with a trailing singleton axis, or ``None``
            when the file cannot be loaded or normalised (the error is
            logged).
        """
        # Read and Resample the audio
        try:
            data, _ = librosa.core.load(audio_fn, sr=sampling_rate)
            data = normalize(data)
        except Exception as e:
            # Deliberate best-effort: log and let the caller skip this file.
            logging.exception(e)
            return None

        # ensure length: truncate long clips, zero-pad short ones at the end.
        # np.pad with a 1-tuple pads BOTH ends by that amount, which would
        # overshoot `duration`, so the (before, after) widths are explicit.
        n_missing = duration - len(data)
        if n_missing < 0:
            data = data[:duration]
        elif n_missing > 0:
            data = np.pad(data, (0, n_missing),
                          mode='constant',
                          constant_values=0)

        # spectrogram
        f, t, Sxx = sp.signal.spectrogram(data,
                                          fs=sampling_rate,
                                          window=window,
                                          nperseg=frame_length,
                                          noverlap=overlap_length,
                                          nfft=nfft)

        if mel_scale:
            # spectrogram -> log mel fb
            f_to_mel = filters.mel(sr=sampling_rate,
                                   n_fft=nfft,
                                   n_mels=n_freq_bins)
            Sxx = f_to_mel.dot(Sxx)

        # The 1e-8 offset keeps the log finite for empty bins.
        Sxx = np.expand_dims(np.log(1e-8 + Sxx), axis=-1)

        return Sxx
Esempio n. 8
0
def frft_MFCC(S,
              fs,
              n_mfcc=13,
              n_mels=128,
              dct_type=2,
              norm='ortho',
              power=2,
              pic=None):
    """Derive MFCC-style coefficients from the spectrum ``S``.

    ``S`` is raised to ``power`` in magnitude, mapped through a mel
    filterbank, converted to dB and DCT-transformed along axis 0; the first
    ``n_mfcc`` rows are returned. When ``pic`` is given the result is also
    plotted through ``visual.specgram``.
    """
    # The FFT size is implied by the number of rows in S.
    fft_size = 2 * (S.shape[0] - 1)
    spectrum_power = np.abs(S)**power

    mel_fb = filters.mel(sr=fs,
                         n_fft=fft_size,
                         n_mels=n_mels,
                         fmin=0.0,
                         fmax=None,
                         htk=False,
                         norm=1)
    mel_db = lib.core.power_to_db(np.dot(mel_fb, spectrum_power))

    cepstrum = fftpack.dct(mel_db, axis=0, type=dct_type, norm=norm)
    feature = cepstrum[:n_mfcc]

    if pic is not None:
        visual.specgram(X=feature,
                        title='frft_mfcc',
                        xlabel='Time',
                        ylabel='frft_mfccs',
                        pic=pic + '_frft_mfcc')
    return feature
Esempio n. 9
0
def apply_melfb(spec, fs, n_mels=128, amin=1e-10):
    """Project ``spec`` onto a mel filterbank, flooring values at ``amin``.

    The FFT size is inferred from the size of the last axis of ``spec``.
    Both the input and the projected output are floored at ``amin``.
    """
    n_bins = spec.shape[-1]
    mel_fb = mel(fs, n_bins * 2 - 2, n_mels=n_mels)
    floored = np.maximum(spec, amin)
    return np.maximum(floored @ mel_fb.T, amin)
Esempio n. 10
0
def fftandmelscaleikkatsu(song,
                          nhop=512,
                          nffts=(1024, 2048, 4096),
                          mel_nband=80,
                          mel_freqlo=27.5,
                          mel_freqhi=16000.0,
                          include_zero_cross=False):
    """Compute log mel features of ``song`` for several FFT sizes.

    Args:
        song: object providing ``data`` and ``samplerate`` attributes.
        nhop: hop size handed to ``Frame``.
        nffts: iterable of FFT sizes; one feature channel is produced per size.
        mel_nband, mel_freqlo, mel_freqhi: passed positionally to ``mel``.
        include_zero_cross: when true, store the zero-crossing indices of
            ``song.data`` on ``song.zero_crossing`` and print them.

    Returns:
        ``np.array`` built from the list of per-FFT-size feature matrices.
        The frame and feature shapes are printed for every FFT size.
    """
    feat_channels = []
    for nfft in nffts:
        window = signal.blackmanharris(nfft)
        filt = mel(song.samplerate, nfft, mel_nband, mel_freqlo, mel_freqhi)
        frame = Frame(song.data, nhop, nfft)
        print(frame.shape)

        # Keep the non-negative frequency bins, project the power spectrum
        # onto the mel bands and take the log (0.1 keeps log10 finite).
        spectrum = fft(window * frame)[:, :nfft // 2 + 1]
        mel_power = np.dot(filt, np.transpose(np.abs(spectrum)**2))
        processedframe = 20 * np.log10(mel_power + 0.1)
        print(processedframe.shape)
        feat_channels.append(processedframe)

    if include_zero_cross:
        song.zero_crossing = np.where(np.diff(np.sign(song.data)))[0]
        print(song.zero_crossing)
    return np.array(feat_channels)
Esempio n. 11
0
def test_mel_filterbank(N=15):
    """Compare ``mel_filterbank`` with the reference ``mel`` on N random setups.

    The random seed is fixed, so the same configurations are drawn each run.
    """
    np.random.seed(12345)

    for _ in range(N):
        # Draw order matters for reproducibility: fs, filters, window, norm.
        fs = np.random.randint(50, 10000)
        n_filters = np.random.randint(2, 20)
        window_len = np.random.randint(10, 100)
        norm = np.random.randint(2)

        reference_norm = norm if norm == 1 else None

        mine = mel_filterbank(window_len,
                              n_filters,
                              fs,
                              min_freq=0,
                              max_freq=None,
                              normalize=bool(norm))
        theirs = mel(fs,
                     n_fft=window_len,
                     n_mels=n_filters,
                     htk=True,
                     norm=reference_norm)

        np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")
Esempio n. 12
0
    def __init__(self,
                 filter_length=1024,
                 hop_length=256,
                 win_length=1024,
                 n_mel_channels=80,
                 sampling_rate=22050,
                 mel_fmin=0.0,
                 mel_fmax=8000.0):
        """Set up the STFT front end and the mel basis as an ``nd`` array."""
        super(TacotronSTFT, self).__init__()

        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)

        # Build the filterbank and wrap it in one step.
        self.mel_basis = nd.array(
            mel(sampling_rate, filter_length, n_mel_channels, mel_fmin,
                mel_fmax))
Esempio n. 13
0
    def _wav_to_spec(self,
                     wav,
                     sample_rate,
                     wav_path=None,
                     introduce_noise=False):
        """Turn raw audio samples into a normalised mel spectrogram.

        Args:
            wav (numpy array): audio samples, e.g. the first value returned
                by ``sf.read``
            sample_rate (int): sampling rate of ``wav`` (``sf.read(...)[1]``)
            wav_path (str): when given and resampling was needed, the
                resampled audio is written to this path
            introduce_noise (bool): when true, scale the signal by 0.96 and
                add small random noise drawn from ``self._prng``

        Returns:
            np.array: mel spectrogram with values clipped to [0, 1]
        """
        mel_filters = mel(Config.audio_sr,
                          Config.n_fft,
                          fmin=Config.fmin,
                          fmax=Config.fmax,
                          n_mels=Config.n_mels).T
        amplitude_floor = np.exp(Config.min_level_db / 20 * np.log(10))
        hp_b, hp_a = self._butter_highpass(30, Config.audio_sr, order=5)

        # Bring the audio to the configured rate when it differs.
        needs_resampling = sample_rate != Config.audio_sr
        if needs_resampling:
            wav = librosa.resample(wav, sample_rate, Config.audio_sr)
            print(
                f"Wav file with sr {sample_rate} != {Config.audio_sr}, Now resampling to {Config.audio_sr}, then try to write to {wav_path}"
            )

            if wav_path:
                # Persist the resampled audio over the given path.
                sf.write(wav_path, wav, Config.audio_sr)

        # High-pass filter to remove drifting noise.
        wav = signal.filtfilt(hp_b, hp_a, wav)

        # Optional tiny random perturbation for model robustness.
        if introduce_noise:
            log.info(f"Introducing random noise into wav.file")

            jitter = (self._prng.rand(wav.shape[0]) - 0.5) * 1e-06
            wav = wav * 0.96 + jitter

        # Spectrogram of the filtered signal.
        magnitudes = self._pySTFT(wav,
                                  fft_length=Config.n_fft,
                                  hop_length=Config.hop_length).T

        # Mel projection, amplitude -> dB, then rescale into [0, 1].
        mel_magnitudes = np.dot(magnitudes, mel_filters)
        floored = np.maximum(amplitude_floor, mel_magnitudes)
        mel_db = 20 * np.log10(floored) - Config.ref_level_db
        scaled = (mel_db - Config.min_level_db) / -Config.min_level_db

        return np.clip(scaled, 0, 1)
Esempio n. 14
0
def pncc(audio_wave,
         n_fft=512,
         sr=16000,
         winlen=0.020,
         winstep=0.010,
         n_mels=128,
         n_pncc=13,
         weight_N=4,
         power=2):
    """Run the PNCC-style pipeline and return the DCT-projected features.

    The signal is pre-emphasised, reduced to mono, analysed with a
    rectangular-window ``stft`` (no centring), mapped through a mel
    filterbank and passed through the helper stages below before the final
    DCT projection onto ``n_pncc`` rows.
    """
    # Frame geometry in samples.
    win_samples = int(sr * winlen)
    hop_samples = int(sr * winstep)

    emphasised = scipy.signal.lfilter([1.0, -0.97], 1, audio_wave)
    mono = to_mono(emphasised.T)
    spectrum = np.abs(
        stft(mono,
             n_fft=n_fft,
             hop_length=hop_samples,
             win_length=win_samples,
             window=np.ones(win_samples),
             center=False))**power

    mel_fb = np.abs(filters.mel(sr, n_fft=n_fft, n_mels=n_mels))**power
    mel_power = np.dot(spectrum.T, mel_fb.T)

    # Medium-time power and its slowly varying lower envelope.
    medium_power = medium_time_power_calculation(mel_power)
    envelope = asymmetric_lawpass_filtering(medium_power, 0.999, 0.5)

    rectified = halfwave_rectification(medium_power - envelope)
    floor = asymmetric_lawpass_filtering(rectified)
    masked = temporal_masking(rectified)

    switched = switch_excitation_or_non_excitation(masked, floor, envelope,
                                                   medium_power)

    smoothed_weights = weight_smoothing(switched, medium_power, L=n_mels)
    transfer = time_frequency_normalization(mel_power, smoothed_weights)
    normalised = mean_power_normalization(transfer, switched, L=n_mels)
    compressed = power_function_nonlinearity(normalised)

    dct_basis = filters.dct(n_pncc, compressed.shape[1])
    return np.dot(compressed, dct_basis.T)
Esempio n. 15
0
    def __init__(self, n_mels, sample_rate, filter_length, hop_length,
                 win_length=None, mel_fmin=0.0, mel_fmax=None):
        """Create the STFT module and register the mel basis as a buffer."""
        super(MelSpectrogram, self).__init__()
        self.stft = STFT(filter_length, hop_length, win_length)

        # HTK-style filterbank, stored as a float tensor buffer.
        filterbank = mel(sample_rate, filter_length, n_mels,
                         mel_fmin, mel_fmax, htk=True)
        self.register_buffer('mel_basis',
                             torch.from_numpy(filterbank).float())
Esempio n. 16
0
def get_filters(config: Dict):
    """Build the filters described by ``config``.

    Reads the ``"rate"``, ``"window"``, ``"fmax"`` and ``"mels"`` keys.

    Returns:
        Tuple ``(mel_basis, min_level, b, a)``: the transposed mel
        filterbank, the amplitude floor, and the two values returned by
        ``butter_highpass(30, rate, order=5)``.
    """
    rate = config["rate"]
    filterbank = mel(rate,
                     config["window"],
                     fmin=90,
                     fmax=config["fmax"],
                     n_mels=config["mels"])
    amplitude_floor = np.exp(-100 / 20 * np.log(10))
    highpass_b, highpass_a = butter_highpass(30, rate, order=5)

    return filterbank.T, amplitude_floor, highpass_b, highpass_a
Esempio n. 17
0
 def __init__(self, frame_stream, specfmt="dB", mels_N=12):
     '''
     DFTStream(frame_stream, specfmt, mels_N)
     Create a stream of discrete Fourier transform (DFT) frames using the
     specified sample frame stream. Only bins up to the Nyquist rate are
     returned in the stream. Optional arguments:

     specfmt - DFT output:
         "complex" - return complex DFT results
         "dB" [default] - return power spectrum 20log10(magnitude)
         "mag^2" - magnitude squared spectrum
         "Mel" - melodic scale
     mels_N - Number of Mel filters to use.  Only applicable when
         specfmt == "Mel".

     Raises ValueError when specfmt is not one of the names above.
     '''

     self.format_types = {"complex" : 0,
                          "mag^2" : 1,
                          "dB" : 2,
                          "Mel" : 3}
     self.framer = frame_stream
     self.frame_len = frame_stream.get_framelen_samples()
     try:
         self.format = self.format_types[specfmt]
     except KeyError:
         raise ValueError("Unknown specfmt {}.  Use one of [{}]".format(
             specfmt, ", ".join(self.format_types.keys())))

     # Number of frequency bins is the same as the number of bins in the
     # frame
     self.dft_bins = self.frame_len

     # Only bins up to the Nyquist rate are usable.  The DFT routine that
     # we are using will return up to and including the Nyquist (half bins
     # plus 1 if even)
     self.Nyquist_Hz = self.framer.get_Fs() / 2.0
     # We add 1.1 instead of 1, see numpy.around for details which
     # np.round uses.
     # The builtin int is used because the np.int alias no longer exists
     # in current NumPy releases; the conversion itself is unchanged.
     self.bins_Nyquist = int(np.round((self.frame_len+1.1)/2.0))

     self.window = signal.get_window("hamming", self.frame_len)

     if self.format == self.format_types["Mel"]:
         # Construct Mel filters
         self.mel_filters = mel(self.framer.get_Fs(),
             self.dft_bins, mels_N)
         # Center frequencies of the Mel filters in Hz
         # Returns two more than are actually used (0 Hz and Nyquist)
         self.bins_Hz = mel_frequencies(mels_N+2,
                fmin=0, fmax=self.Nyquist_Hz)
         self.bins_Hz = self.bins_Hz[1:-1]  # Remove ends
         self.bins_N = len(self.bins_Hz)
     else:
         # Evenly spaced bin frequencies from 0 Hz up to (not including)
         # the Nyquist rate.
         self.bins_Hz = np.arange(self.bins_Nyquist) / self.bins_Nyquist * self.Nyquist_Hz
         self.bins_N = self.bins_Hz.shape[0]
Esempio n. 18
0
 def get_coeffs(self,A,num_ceps=13,num_filters=16,f_bins=400,fs=100,normalize=True,corr=False):
     """Compute filterbank energies of ``A`` and their cepstral coefficients.

     Args:
         A: array multiplied by the mel filterbank via ``np.dot(fbank, A)``.
         num_ceps: number of cepstral coefficients kept (columns
             1..num_ceps of the DCT output; column 0 is skipped).
         num_filters: number of mel filters.
         f_bins: FFT size handed to ``filters.mel``.
         fs: sampling rate handed to ``filters.mel``.
         normalize: when truthy, subtract the per-column mean (plus 1e-8)
             from both outputs.
         corr: when truthy return the filterbank coefficients, otherwise
             the cepstral coefficients.

     Returns:
         ``fbank_coeffs`` if ``corr`` else ``cc``.
     """
     fbank = filters.mel(fs, f_bins, num_filters, norm=None)
     fbank_coeffs = np.dot(fbank, A).T
     cc = fftpack.dct(fbank_coeffs, type=2, norm='ortho')[:, 1:(num_ceps + 1)]
     # Flags are tested by truthiness rather than compared to True/False.
     if normalize:
         cc -= (np.mean(cc, axis=0) + 1e-8)
         fbank_coeffs -= (np.mean(fbank_coeffs, axis=0) + 1e-8)
     return fbank_coeffs if corr else cc
Esempio n. 19
0
    def __init__(self, sampling_rate: int = 22050, n_fft: int = 1024, window_size: int = 1024, hop_size: int = 256,
                 num_mels: int = 80, fmin: float = 0., fmax: float = 8000.):
        """Store the STFT geometry and register the mel filter and window."""
        super().__init__()
        self.n_fft = n_fft
        self.hop_size = hop_size
        self.window_size = window_size
        # Padding derived from the FFT size and the hop size.
        self.pad_size = (n_fft - hop_size) // 2

        # Buffers are registered in this order: mel filter first, then window.
        filterbank = mel(sampling_rate, n_fft, num_mels, fmin, fmax)
        self.register_buffer('mel_filter', torch.FloatTensor(filterbank))

        hann = torch.hann_window(window_size)
        self.register_buffer('window', hann)
def mel_scaled_spectrogram(spectrogram: ndarray,
                           sr: int,
                           n_mels: Optional[int] = 128,
                           fmin: Optional[float] = 0.0,
                           fmax: Optional[Union[float, None]] = None,
                           htk: Optional[bool] = False):
    """Calculates the mel scaled version of the spectrogram.

    :param spectrogram: Spectrogram to be used, either 2-D or 3-D with a \
                        leading channel axis. The second-to-last axis is \
                        taken as the frequency axis.
    :type spectrogram: numpy.ndarray
    :param sr: Sampling frequency of the original signal.
    :type sr: int
    :param n_mels: Amount of mel filters to use, defaults to 128.
    :type n_mels: int, optional
    :param fmin: Minimum frequency for mel filters, defaults to 0.0.
    :type fmin: float, optional
    :param fmax: Maximum frequency for mel filters. If `None`, \
                 sr/2.0 is used. Defaults to None
    :type fmax: float|None, optional
    :param htk: Use HTK formula, instead of Slaney, defaults to False.
    :type htk: bool, optional
    :raises AttributeError: If the spectrogram is neither 2-D nor 3-D.
    :return: Mel scaled version of the input spectrogram, with shape \
             (channels, nb_mels, values) for channels >= 2, else \
             (nb_mels, values).
    :rtype: numpy.ndarray
    """
    ndim = spectrogram.ndim

    if ndim not in [2, 3]:
        raise AttributeError('Input spectrogram must be of shape '
                             '(channels, nb_frames, frames). '
                             f'Current input has {ndim} dimensions. '
                             f'Allowed are either 2 or 3.')

    # The FFT size follows from the NUMBER of frequency bins, i.e. the size
    # of the frequency axis, not from the values stored in the array.
    n_freq_bins = spectrogram.shape[ndim - 2]
    n_fft = 2 * (n_freq_bins - 1)

    mel_filters = mel(
        sr=sr,
        n_fft=n_fft,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
        htk=htk)

    if ndim == 2:
        mel_spectrogram = np_dot(mel_filters, spectrogram)
    else:
        # Apply the filterbank per channel and stack along a new first axis.
        mel_spectrogram = np_cat([expand_dims(np_dot(mel_filters, i), 0)
                                  for i in spectrogram], axis=0)

    return mel_spectrogram
Esempio n. 21
0
def melspectrogram(y=None,
                   sr=16000,
                   n_fft=400,
                   hop_length=160,
                   power=2.0,
                   **kwargs):
    """Compute a mel-scaled spectrogram of the time series `y`.

    The magnitude of the (non-centred) short-time Fourier transform of `y`
    is raised to `power` and then mapped onto the mel basis with
    `np.dot(mel_basis, S)`. With the default `power=2` this operates on a
    power spectrum.

    Parameters
    ----------
    y : np.ndarray
        audio time-series, passed straight to `stft`

    sr : number
        sampling rate of `y`, used to build the mel basis

    n_fft : int
        length of the FFT window

    hop_length : int
        number of samples between successive frames

    power : float
        exponent applied to the magnitude spectrogram

    kwargs : additional keyword arguments
        forwarded unchanged to `filters.mel`

    Returns
    -------
    S : np.ndarray
        mel spectrogram, i.e. the mel basis applied to `|stft(y)|**power`
    """
    # Magnitude spectrogram without centring, raised to the chosen power.
    transform = stft(y, n_fft=n_fft, hop_length=hop_length, center=False)
    spectrogram = np.abs(transform)**power

    # Mel filterbank for this rate / FFT size.
    filterbank = filters.mel(sr, n_fft, **kwargs)

    return np.dot(filterbank, spectrogram)
Esempio n. 22
0
def analyze(audio, time):
    """Return the unit-norm mel projection of a half-second log spectrum.

    A segment of ``int(44100 * 0.5)`` samples starting at sample
    ``int(44100 * time)`` is taken from ``audio``; the log10 power of its
    FFT (lower half plus one bin) is projected onto 256 mel filters.

    Returns:
        Tuple ``(mel_spec / norm, norm)`` where ``norm`` is the L2 norm of
        the projection.
    """
    sample_rate = 44100
    n_fft = int(sample_rate * 0.5)
    mel_basis = filters.mel(sample_rate, n_fft, n_mels=256)

    first = int(sample_rate * time)
    segment = audio[first:first + n_fft]

    log_power = np.log10(np.abs(np.fft.fft(segment))**2)
    # Keep the lower half of the spectrum plus one bin.
    half_spectrum = log_power[:int(len(log_power) / 2) + 1]

    mel_spec = np.dot(mel_basis, half_spectrum)
    norm = np.linalg.norm(mel_spec, ord=2)
    return mel_spec / norm, norm
Esempio n. 23
0
def pncc(audio_wave,
         n_fft=1024,
         sr=16000,
         window="hamming",
         n_mels=40,
         n_pncc=13,
         weight_N=4,
         power=2,
         dct=True):
    """Run the PNCC-style pipeline up to the power-law nonlinearity.

    The signal is pre-emphasised, transformed with ``stft``, projected onto
    a mel filterbank and passed through the helper stages below.

    NOTE(review): ``dct`` and ``n_pncc`` are accepted but have no effect on
    the result - the value returned is the output of
    ``power_function_nonlinearity``, not a DCT projection. The DCT that used
    to be computed here was discarded, so that dead work has been removed
    while the return value is left unchanged for existing callers. Confirm
    whether the DCT output was the intended return value.

    Returns:
        The output of ``power_function_nonlinearity`` (not transposed).
    """
    pre_emphasis_signal = scipy.signal.lfilter([1.0, -0.97], 1, audio_wave)
    stft_pre_emphasis_signal = np.abs(
        stft(pre_emphasis_signal, n_fft=n_fft, window=window))**power
    mel_filter = np.abs(filters.mel(sr, n_fft=n_fft, n_mels=n_mels))**power
    power_stft_signal = np.dot(stft_pre_emphasis_signal.T, mel_filter.T)
    medium_time_power = medium_time_power_calculation(power_stft_signal)
    lower_envelope = asymmetric_lawpass_filtering(medium_time_power, 0.999,
                                                  0.5)

    subtracted_lower_envelope = medium_time_power - lower_envelope
    rectified_signal = halfwave_rectification(subtracted_lower_envelope)

    floor_level = asymmetric_lawpass_filtering(rectified_signal)

    temporal_masked_signal = temporal_masking(rectified_signal)
    temporal_masked_signal = after_temporal_masking(temporal_masked_signal,
                                                    floor_level)

    final_output = switch_excitation_or_non_excitation(temporal_masked_signal,
                                                       floor_level,
                                                       lower_envelope,
                                                       medium_time_power)

    spectral_weight_smoothing = weight_smoothing(final_output,
                                                 medium_time_power, weight_N)

    transfer_function = time_frequency_normalization(
        power_stft_signal=power_stft_signal,
        spectral_weight_smoothing=spectral_weight_smoothing)

    normalized_power = mean_power_normalization(transfer_function,
                                                final_output)

    power_law_nonlinearity = power_function_nonlinearity(normalized_power)

    return power_law_nonlinearity
Esempio n. 24
0
    def __init__(self, sample_rate, preemphasis, frequency, frame_length,
                 frame_shift, min_dbs, ref_dbs, mels_size, griff_lim_iters,
                 power):
        """Store the audio settings and build the mel filterbank.

        ``frame_length`` and ``frame_shift`` are divided by 1e3 and
        multiplied by ``sample_rate`` to obtain the window and hop lengths
        in samples; ``n_fft`` is derived from ``frequency``.
        """
        # Settings copied through unchanged.
        self.preemphasis = preemphasis
        self.power = power
        self.min_dbs = min_dbs
        self.ref_dbs = ref_dbs
        self.griff_lim_iters = griff_lim_iters

        # Derived STFT geometry.
        self.n_fft = (frequency - 1) * 2
        self.win_length = int(frame_length / 1e3 * sample_rate)
        self.hop_length = int(frame_shift / 1e3 * sample_rate)

        # Filterbank matrix that combines FFT bins into Mel-frequency bins.
        mel_options = dict(sr=sample_rate, n_fft=self.n_fft, n_mels=mels_size)
        self.mel_basis = filters.mel(**mel_options)
Esempio n. 25
0
def transform2mel(spectrogram,
                  samplerate,
                  fft_window_size,
                  n_mel_bands=80,
                  freq_min=0,
                  freq_max=None):
    '''Transform to Mel

    Convert a spectrogram to a Mel-scale spectrogram by grouping the original
    frequency bins into Mel frequency bands (using the Mel filter from
    librosa).

    Parameters
    spectrogram: input spectrogram, indexed as [frequency_bin, frame]
    samplerate: sample rate of the audio signal (passed to librosa as ``sr``)
    fft_window_size: FFT size used for the analysis (passed to librosa as ``n_fft``)
    n_mel_bands: number of desired Mel bands, typically 20, 40, 80; when
        ``None`` the argument is omitted so librosa's own default is used
    freq_min: minimum frequency (passed to librosa as ``fmin``)
    freq_max: cut-off frequency (passed to librosa as ``fmax``)

    Returns:
    mel_spectrogram: np.array of shape (n_mel_bands, frames); the number of
        frames of the input spectrogram is preserved
    '''

    from librosa.filters import mel

    # Keyword arguments only: librosa >= 0.10 rejects positional sr / n_fft,
    # while older releases accept the same keywords.
    mel_kwargs = {
        'sr': samplerate,
        'n_fft': fft_window_size,
        'fmin': freq_min,
        'fmax': freq_max,
    }
    if n_mel_bands is not None:
        mel_kwargs['n_mels'] = n_mel_bands
    mel_basis = mel(**mel_kwargs)

    # Number of filterbank columns (fft_window_size / 2 + 1); spectrogram rows
    # beyond that count are not used.
    freq_bin_max = mel_basis.shape[1]

    # One matrix product applies every Mel filter to every frame at once.
    mel_spectrogram = np.dot(mel_basis, spectrogram[0:freq_bin_max, :])
    return mel_spectrogram
Esempio n. 26
0
    def __init__(self, dlnet_config: dict, ds_config: str):
        """
        Init wrapper object. Reads DL Network config and dataset config.

        The passed ``dlnet_config`` dict is stored by reference as
        ``self.config`` and is extended in place with the keys ``'classes'``
        and ``'input_shape'``.

        Parameters
        ----------
        dlnet_config : dict
            Config for DL Network and preprocessing
        ds_config : str
            Path to Dataset config to extract classes
        """
        # Set config:
        self.config = dlnet_config

        # Set random seed. This must be a call: assigning to ``random.seed``
        # would replace the function and leave the generator unseeded.
        random.seed(self.config['random_seed'])

        # Classes
        if self.config['binary']:
            self.config['classes'] = ['compressed_wav', 'uncompr_wav']
        else:
            self.config['classes'] = self.get_classes_from_dataset(ds_config)

        # Input shape and filter settings:
        if self.config['calculate_mel']:
            # Mel filter init. sr / n_fft are passed by keyword because recent
            # librosa releases make them keyword-only (assumes ``filters`` is
            # ``librosa.filters`` -- confirm against the file's imports).
            self._mel_filter = filters.mel(sr=self.config['sr'],
                                           n_fft=self.config['n_fft'],
                                           n_mels=self.config['n_mels'],
                                           norm='slaney')
            self.config['input_shape'] = (self.config['n_mels'],
                                          self.config['n_frames'], 1)
        elif self.config['filter_signal']:
            # Crop spectrogram
            # frequency array
            self._freqs = np.fft.rfftfreq(self.config['n_fft'],
                                          d=1 / self.config['sr'])
            # bin whose centre frequency is closest to filter_config[1]
            self._cutoff_bin = int(
                np.argmin(np.abs(self._freqs -
                                 self.config['filter_config'][1])))
            # only the bins from the cutoff bin upwards are counted
            self.config['input_shape'] = (
                int(len(self._freqs) - self._cutoff_bin),
                self.config['n_frames'], 1)
        else:
            # full one-sided spectrum: n_fft / 2 + 1 bins
            self.config['input_shape'] = (int(self.config['n_fft'] / 2 + 1),
                                          self.config['n_frames'], 1)
Esempio n. 27
0
    def __init__(self,
                 flows,
                 n_group,
                 sr,
                 window_size,
                 n_mels,
                 hp,
                 use_conv1x1=False):
        super().__init__()
        self.flows = flows
        self.n_group = n_group
        self.win_size = window_size
        self.hop_size = hp.audio.hop_length
        self.n_mels = n_mels
        self.sr = sr
        self.sub_sr = self.hop_size // n_group

        self.upsampler = nn.Sequential(
            nn.ConvTranspose1d(n_mels,
                               n_mels,
                               self.sub_sr * 2 + 1,
                               self.sub_sr,
                               padding=self.sub_sr), nn.LeakyReLU(0.4, True))
        self.upsampler.apply(add_weight_norms)

        self.WNs = nn.ModuleList()

        if use_conv1x1:
            self.invconv1x1 = nn.ModuleList()

        # Set up layers with the right sizes based on how many dimensions
        # have been output already
        for k in range(flows):
            self.WNs.append(
                WN2D(n_group, n_mels, hp.model.dilation_channels,
                     hp.model.residual_channels, hp.model.skip_channels))
            if use_conv1x1:
                self.invconv1x1.append(
                    InvertibleConv1x1(n_group, memory_efficient=False))

        filters = mel(sr, window_size, n_mels, fmax=8000)
        self.filter_idx = np.nonzero(filters)
        self.register_buffer('filter_value',
                             torch.Tensor(filters[self.filter_idx]))
        self.filter_size = torch.Size(filters.shape)
        self.register_buffer('window', torch.hann_window(window_size))
Esempio n. 28
0
    def __init__(self, flows, n_group, n_early_every, n_early_size, sr,
                 window_size, hop_size, n_mels, memory_efficient, **kwargs):
        """Build the flow layers and the Mel filter buffers.

        For every flow an ``InvertibleConv1x1`` and an ``AffineCouplingBlock``
        wrapping ``WN`` are created. Every ``n_early_every`` flows (except at
        the first one) the working channel count shrinks by ``n_early_size``;
        each such split size, plus the final remaining channel count, is
        recorded in ``self.z_split_sizes``.

        ``memory_efficient`` is forwarded to both layer types and ``**kwargs``
        is forwarded to ``AffineCouplingBlock``.
        """
        super().__init__()
        self.flows = flows
        self.n_group = n_group
        self.n_early_every = n_early_every
        self.n_early_size = n_early_size
        self.win_size = window_size
        self.hop_size = hop_size
        self.n_mels = n_mels
        self.sr = sr

        self.upsample_factor = hop_size // n_group

        self.invconv1x1 = nn.ModuleList()
        self.WNs = nn.ModuleList()

        # Set up layers with the right sizes based on how many dimensions
        # have been output already
        n_remaining_channels = n_group
        self.z_split_sizes = []
        for k in range(flows):
            if k % self.n_early_every == 0 and k:
                n_remaining_channels -= n_early_size
                self.z_split_sizes.append(n_early_size)
            self.invconv1x1.append(
                InvertibleConv1x1(n_remaining_channels,
                                  memory_efficient=memory_efficient))
            self.WNs.append(
                AffineCouplingBlock(WN,
                                    memory_efficient=memory_efficient,
                                    in_channels=n_remaining_channels // 2,
                                    aux_channels=n_mels,
                                    **kwargs))
        self.z_split_sizes.append(n_remaining_channels)

        # Keep the Mel filterbank in sparse form: indices of the non-zero
        # entries, their values (as a buffer) and the dense shape.
        mel_bank = mel(sr, window_size, n_mels, fmax=8000)
        self.filter_idx = np.nonzero(mel_bank)
        self.register_buffer('filter_value',
                             torch.Tensor(mel_bank[self.filter_idx]))
        self.filter_size = torch.Size(mel_bank.shape)
        self.register_buffer('window', torch.hann_window(window_size))
Esempio n. 29
0
    def __init__(self,
                 sr=22050,
                 n_fft=2048,
                 n_mels=128,
                 hop_length=512,
                 window='hann',
                 center=True,
                 pad_mode='reflect',
                 htk=False,
                 fmin=0.0,
                 fmax=None,
                 norm=1,
                 trainable_mel=False,
                 trainable_STFT=False):
        """Create the STFT kernels and the Mel filterbank as torch tensors.

        ``hop_length`` is stored as ``self.stride``. The sine/cosine kernels
        come from ``create_fourier_kernels`` and the filterbank from ``mel``;
        the time taken to build each is printed. With ``trainable_mel`` /
        ``trainable_STFT`` set, the corresponding tensors are wrapped in
        ``torch.nn.Parameter``.
        """
        super(MelSpectrogram, self).__init__()
        self.n_fft = n_fft
        self.pad_mode = pad_mode
        self.center = center
        self.stride = hop_length

        # Fourier kernels for the STFT.
        t_begin = time()
        kernels = create_fourier_kernels(n_fft,
                                         freq_bins=None,
                                         window=window,
                                         freq_scale='no',
                                         sr=sr)
        sin_kernel, cos_kernel, self.bins2freq, _ = kernels
        self.wsin = torch.tensor(sin_kernel, dtype=torch.float)
        self.wcos = torch.tensor(cos_kernel, dtype=torch.float)
        stft_elapsed = time() - t_begin
        print("STFT filter created, time used = {:.4f} seconds".format(
            stft_elapsed))

        # Kernel for the Mel spectrogram.
        t_begin = time()
        mel_matrix = mel(sr, n_fft, n_mels, fmin, fmax, htk=htk, norm=norm)
        self.mel_basis = torch.tensor(mel_matrix)
        mel_elapsed = time() - t_begin
        print("Mel filter created, time used = {:.4f} seconds".format(
            mel_elapsed))

        # The flags are compared with ``== True`` on purpose so that values
        # which are merely truthy keep being treated as "not trainable".
        if trainable_mel == True:
            self.mel_basis = torch.nn.Parameter(self.mel_basis)
        if trainable_STFT == True:
            self.wsin = torch.nn.Parameter(self.wsin)
            self.wcos = torch.nn.Parameter(self.wcos)
Esempio n. 30
0
class AudioProcessor:
    """Turn audio files into normalized Mel spectrograms.

    All settings are class attributes and every method is a classmethod, so
    the class is used without instantiation.
    """

    sample_rate = 16000
    top_db = 15
    ref_db = 20
    max_db = 100
    fft_len = 1024
    hop_len = 256
    # Transposed Mel filterbank, so a (frames, bins) magnitude matrix can be
    # right-multiplied by it.
    mel_basis = mel(sample_rate, fft_len, fmin=90, fmax=7600, n_mels=80).T
    # Floor applied to the Mel magnitudes before taking the logarithm.
    min_level = np.exp(-100 / 20 * np.log(10))

    @classmethod
    def butter_highpass(cls, cutoff=30, order=5):
        """Return the coefficients of a digital Butterworth high-pass filter.

        ``cutoff`` is divided by half of ``sample_rate`` before being handed
        to ``butter``.
        """
        half_rate = 0.5 * cls.sample_rate
        return butter(order, cutoff / half_rate, btype='high', analog=False)

    @classmethod
    def short_time_fourier_transform(cls, wav):
        """Return the magnitude of the STFT of ``wav``."""
        return np.abs(stft(wav, n_fft=cls.fft_len, hop_length=cls.hop_len))

    @classmethod
    def file2spectrogram(cls, file_path):
        """Load an audio file and return its normalized Mel spectrogram.

        Steps: load at ``sample_rate``, trim with ``top_db``, high-pass
        filter, scale by 0.96, take the STFT magnitude, project onto the Mel
        basis, convert to dB, shift by ``ref_db``, rescale with ``max_db``
        and clip to [0, 1]. The result is returned as float32.
        """
        signal = load(file_path, sr=cls.sample_rate)[0]
        signal = trim(signal, top_db=cls.top_db)[0]
        numerator, denominator = cls.butter_highpass()
        signal = filtfilt(numerator, denominator, signal) * 0.96

        magnitude = cls.short_time_fourier_transform(signal)
        mel_magnitude = np.dot(magnitude.T, cls.mel_basis)

        decibels = 20 * np.log10(np.maximum(cls.min_level, mel_magnitude))
        shifted = decibels - cls.ref_db
        scaled = (shifted + cls.max_db) / cls.max_db

        return np.clip(scaled, 0, 1).astype(np.float32)
Esempio n. 31
0
def transform2mel(spectrogram,samplerate,fft_window_size,n_mel_bands = 80,freq_min = 0,freq_max = None):
    '''Transform to Mel

    Convert a spectrogram to a Mel-scale spectrogram by grouping the original
    frequency bins into Mel frequency bands (using the Mel filter from
    librosa).

    Parameters
    spectrogram: input spectrogram, indexed as [frequency_bin, frame]
    samplerate: sample rate of the audio signal (passed to librosa as ``sr``)
    fft_window_size: FFT size used for the analysis (passed to librosa as ``n_fft``)
    n_mel_bands: number of desired Mel bands, typically 20, 40, 80; when
        ``None`` the argument is omitted so librosa's own default is used
    freq_min: minimum frequency (passed to librosa as ``fmin``)
    freq_max: cut-off frequency (passed to librosa as ``fmax``)

    Returns:
    mel_spectrogram: np.array of shape (n_mel_bands, frames); the number of
        frames of the input spectrogram is preserved
    '''

    from librosa.filters import mel

    # Keyword arguments only: librosa >= 0.10 rejects positional sr / n_fft,
    # while older releases accept the same keywords.
    mel_kwargs = dict(sr=samplerate, n_fft=fft_window_size,
                      fmin=freq_min, fmax=freq_max)
    if n_mel_bands is not None:
        mel_kwargs['n_mels'] = n_mel_bands
    mel_basis = mel(**mel_kwargs)

    # Number of filterbank columns (fft_window_size / 2 + 1); spectrogram rows
    # beyond that count are not used.
    freq_bin_max = mel_basis.shape[1]

    # One matrix product applies every Mel filter to every frame at once.
    mel_spectrogram = np.dot(mel_basis, spectrogram[0:freq_bin_max, :])
    return mel_spectrogram