def __init__(self, n_fft=2048, hop_length=1024, n_mels=128, n_mfcc=40, norm='ortho',
             sample_rate=16000, f_min=40, f_max=7600, pad_end=True, center=False):
    """Uses log mels."""
    super().__init__()
    self.norm = norm
    self.n_mfcc = n_mfcc
    self.melspec = MelSpec(n_fft, hop_length, n_mels, sample_rate, power=2,
                           f_min=f_min, f_max=f_max, pad_end=pad_end, center=center)
    # create_dct returns a (n_mels, n_mfcc) matrix; register it as a buffer so
    # it follows the module across devices without being trained.
    dct_mat = create_dct(self.n_mfcc, self.melspec.n_mels, self.norm)
    self.register_buffer('dct_mat', dct_mat)
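# A minimal sketch of the matching forward pass, assuming MelSpec returns a
# (..., n_mels, time) power spectrogram; the 1e-6 log offset is an assumption,
# not part of the snippet above.
def forward(self, waveform):
    mel = self.melspec(waveform)                    # (..., n_mels, time)
    log_mel = torch.log(mel + 1e-6)                 # "uses log mels"
    # create_dct yields (n_mels, n_mfcc), so right-multiply frame-wise.
    return torch.matmul(log_mel.transpose(-2, -1), self.dct_mat).transpose(-2, -1)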
def __init__(self, sample_rate: int = 16000, n_mfcc: int = 40, dct_type: int = 2,
             norm: str = 'ortho', log_mels: bool = False,
             melkwargs: Optional[dict] = None) -> None:
    super(MFCC, self).__init__()
    supported_dct_types = [2]
    if dct_type not in supported_dct_types:
        raise ValueError('DCT type not supported: {}'.format(dct_type))
    self.sample_rate = sample_rate
    self.n_mfcc = n_mfcc
    self.dct_type = dct_type
    self.norm = norm
    self.top_db = 80.0
    self.amplitude_to_DB = AmplitudeToDB('power', self.top_db)
    if melkwargs is not None:
        self.MelSpectrogram = MelSpectrogram(sample_rate=self.sample_rate, **melkwargs)
    else:
        self.MelSpectrogram = MelSpectrogram(sample_rate=self.sample_rate)
    if self.n_mfcc > self.MelSpectrogram.n_mels:
        raise ValueError('Cannot select more MFCC coefficients than # mel bins')
    dct_mat = F.create_dct(self.n_mfcc, self.MelSpectrogram.n_mels, self.norm)
    self.register_buffer('dct_mat', dct_mat)
    self.log_mels = log_mels
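# Usage sketch for the constructor above via the public
# torchaudio.transforms.MFCC transform; the melkwargs values are illustrative.
import torch
import torchaudio

mfcc_transform = torchaudio.transforms.MFCC(
    sample_rate=16000,
    n_mfcc=40,
    melkwargs={'n_fft': 400, 'hop_length': 160, 'n_mels': 128},
)
waveform = torch.randn(1, 16000)      # one second of noise, (channel, time)
coeffs = mfcc_transform(waveform)     # -> (channel, n_mfcc, time)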
def __init__(self, sample_rate: int, mel_size: int, n_fft: int, win_length: int,
             n_mfcc: int, hop_length: int, min_db: float, max_db: float,
             mel_min: float = 0., mel_max: Optional[float] = None, norm: str = 'ortho'):
    super().__init__()
    self.n_mfcc = n_mfcc
    self.mel_func = LogMelSpectrogram(
        sample_rate, mel_size, n_fft, win_length, hop_length,
        min_db, max_db, mel_min, mel_max
    )
    # create_dct returns (mel_size, n_mfcc); the transpose stores it as
    # (n_mfcc, mel_size) so it can be left-multiplied against a
    # (mel_size, time) log-mel spectrogram.
    dct_mat = audio_func.create_dct(n_mfcc, mel_size, norm)
    self.register_buffer('dct_mat', dct_mat.transpose(0, 1))
def test_torchscript_create_dct(self):
    @torch.jit.script
    def jit_method(n_mfcc, n_mels, norm):
        # type: (int, int, Optional[str]) -> Tensor
        return F.create_dct(n_mfcc, n_mels, norm)

    n_mfcc = 40
    n_mels = 128
    norm = 'ortho'
    jit_out = jit_method(n_mfcc, n_mels, norm)
    py_out = F.create_dct(n_mfcc, n_mels, norm)
    self.assertTrue(torch.allclose(jit_out, py_out))
def mfcc(signal, samplerate=16000, winlen=0.025, hoplen=0.01, numcep=13, nfilt=26,
         nfft=None, lowfreq=0, highfreq=None, preemph=0.97, ceplifter=22,
         plusEnergy=True, dct=None,
         winfunc=lambda x: torch.ones((x,), device=device)):
    """
    Compute MFCC from an audio signal.

    :param signal: (time,) audio signal tensor
    :param samplerate: sample rate of the signal, in Hz
    :param winlen: analysis window length, in seconds
    :param hoplen: hop between successive windows, in seconds
    :param numcep: the number of cepstra to retain
    :param nfilt: number of mel filterbank channels
    :param nfft: FFT size; derived from samplerate and winlen when None
    :param lowfreq: lowest filterbank edge, in Hz
    :param highfreq: highest filterbank edge, in Hz
    :param preemph: pre-emphasis coefficient
    :param ceplifter: cepstral liftering coefficient
    :param plusEnergy: replace the 0th cepstrum with log frame energy
    :param dct: precomputed DCT matrix of shape (nfilt, numcep), or None
    :param winfunc: window function taking a frame length and returning a tensor
    :return: (nframes, numcep)
    """
    nfft = nfft or calculate_nfft(samplerate, winlen)
    feat, energy = fbank(signal, samplerate, winlen, hoplen, nfilt, nfft,
                         lowfreq, highfreq, preemph, winfunc)
    feat = torch.log(feat)
    # `if not dct` is ambiguous for a multi-element tensor; test for None instead.
    if dct is None:
        dct = AF.create_dct(numcep, nfilt, norm='ortho').to(device)
    feat = feat.mm(dct)
    feat = lifter(feat, ceplifter)
    if plusEnergy:
        feat[:, 0] = torch.log(energy)
    return feat
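# Hypothetical call site for mfcc() above; fbank, calculate_nfft, lifter and
# the module-level `device` are assumed to be defined in the same file.
signal = torch.randn(16000, device=device)      # one second of audio at 16 kHz
feat = mfcc(signal, samplerate=16000, numcep=13, nfilt=26)
print(feat.shape)                               # (nframes, 13)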
def func(_):
    n_mfcc = 40
    n_mels = 128
    norm = "ortho"
    return F.create_dct(n_mfcc, n_mels, norm)
# Script reproducing torchaudio.transforms.MFCC step by step; the redundant
# self-assignments left over from the original class code are dropped.
import numpy as np
import torch
import torchaudio
from torchaudio import functional as F

sample_rate = 16000
n_mfcc = 40
dct_type = 2
norm = 'ortho'
log_mels = False
top_db = 80.0
amplitude_to_DB = torchaudio.transforms.AmplitudeToDB('power', top_db)
MelSpectrogram = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate)
dct_mat = F.create_dct(n_mfcc, MelSpectrogram.n_mels, norm)

# Reference output from the stock transform.
tm = torchaudio.transforms.MFCC()
waveform = torch.tensor(np.sin(np.arange(1, 1000)), dtype=torch.float)
a = tm(waveform)

# pack batch
shape = waveform.size()
waveform = waveform.reshape(-1, shape[-1])
mel_specgram = MelSpectrogram(waveform)
if log_mels:
    log_offset = 1e-6
    mel_specgram = torch.log(mel_specgram + log_offset)
else:
    mel_specgram = amplitude_to_DB(mel_specgram)
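# Assumed continuation, following torchaudio's documented MFCC pipeline: apply
# the DCT to the dB-scaled mel spectrogram and compare with the stock output.
mfcc_manual = torch.matmul(mel_specgram.transpose(-2, -1), dct_mat).transpose(-2, -1)
print(torch.allclose(mfcc_manual.squeeze(0), a, atol=1e-5))   # expected: True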
def build_LFR_features(inputs, m, n):
    """
    Actually, this implements stacking frames and skipping frames.
    If m = 1 and n = 1, it just returns the original features.
    If m = 1 and n > 1, it works like skipping.
    If m > 1 and n = 1, it works like stacking, but only supports right frames.
    If m > 1 and n > 1, it works like LFR.

    Args:
        inputs: T x D np.ndarray
        m: number of frames to stack
        n: number of frames to skip
    """
    LFR_inputs = []
    T = inputs.shape[0]
    T_lfr = int(np.ceil(T / n))
    for i in range(T_lfr):
        if m <= T - i * n:
            LFR_inputs.append(np.hstack(inputs[i * n:i * n + m]))
        else:
            # Process the last LFR frame: pad by repeating the final input frame.
            num_padding = m - (T - i * n)
            frame = np.hstack(inputs[i * n:])
            for _ in range(num_padding):
                frame = np.hstack((frame, inputs[-1]))
            LFR_inputs.append(frame)
    return np.vstack(LFR_inputs)


dct_mat = F.create_dct(48, 80, 'ortho')
print('MFCC 80 -> 48 LFR')
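# Example with synthetic 80-dim frames: stack 4 frames and skip 3, a common
# low-frame-rate setting (the numbers here are illustrative).
frames = np.random.randn(100, 80).astype(np.float32)
lfr = build_LFR_features(frames, m=4, n=3)
print(lfr.shape)   # (ceil(100 / 3), 4 * 80) == (34, 320)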
def __init__(self, n_mfcc: int, mel_size: int, norm: str = 'ortho'):
    super().__init__()
    self.n_mfcc = n_mfcc
    # create_dct returns (mel_size, n_mfcc); store the transpose so the DCT
    # can be applied by left-multiplying a (mel_size, time) mel spectrogram.
    dct_mat = audio_func.create_dct(n_mfcc, mel_size, norm)
    self.register_buffer('dct_mat', dct_mat.transpose(0, 1))
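# Sketch of the forward pass this buffer layout implies (assumed, not part of
# the snippet): with the transpose, dct_mat is (n_mfcc, mel_size) and
# left-multiplies a (..., mel_size, time) log-mel input via broadcasting.
def forward(self, log_mel):
    return torch.matmul(self.dct_mat, log_mel)   # -> (..., n_mfcc, time)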