def parse_audio(self, audio_path):
    """Load an audio file and return its log-magnitude STFT spectrogram.

    The waveform is loaded (with random augmentation when ``self.augment``
    is set), optionally mixed with noise, transformed with an STFT,
    compressed with ``log(1 + |S|)`` and, when ``self.normalize`` is set,
    standardized using the mean and standard deviation of the whole
    spectrogram.

    Args:
        audio_path: Path of the audio file, passed to the loader as-is.

    Returns:
        A ``torch.FloatTensor`` holding the (optionally normalized)
        log-magnitude spectrogram.
    """
    if self.augment:
        y = load_randomly_augmented_audio(audio_path, self.sample_rate)
    else:
        # NOTE(review): this path does not pass self.sample_rate to the
        # loader -- confirm load_audio yields audio at the expected rate.
        y = load_audio(audio_path)

    if self.noiseInjector:
        # Logged whenever an injector is configured, before the random draw
        # below decides whether noise is actually added.
        logging.info("inject noise")
        # Single Bernoulli trial with success probability self.noise_prob.
        add_noise = np.random.binomial(1, self.noise_prob)
        if add_noise:
            y = self.noiseInjector.inject_noise(y)

    # Window and hop sizes are converted to sample counts; presumably
    # window_size / window_stride are expressed in seconds -- TODO confirm.
    n_fft = int(self.sample_rate * self.window_size)
    win_length = n_fft
    hop_length = int(self.sample_rate * self.window_stride)

    # Short-time Fourier transform (STFT)
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                     win_length=win_length, window=self.window)
    # Only the magnitude is used; the phase component is discarded.
    spect, _ = librosa.magphase(D)

    # S = log(S + 1)
    spect = torch.FloatTensor(np.log1p(spect))

    if self.normalize:
        mean = spect.mean()
        std = spect.std()
        spect.add_(-mean)
        # A constant spectrogram (e.g. an all-silent clip) has zero standard
        # deviation; dividing would turn every value into NaN, so scaling is
        # skipped and the centered values are returned instead.
        if std > 0:
            spect.div_(std)
    return spect
def transform_mfcc_from_file(fname, endian='in16', sr=16000, L=None,
                             win_size=0.025, win_step=0.01, num_cep=13,
                             nfilt=26, preemph=0.97, appendEnergy=True):
    """Load an audio file and compute features via ``transform_mfcc``.

    Args:
        fname: Audio file to read; forwarded to ``load_audio``.
        endian: Forwarded to ``load_audio`` as ``endian``.
            NOTE(review): the default 'in16' looks like it may be a typo
            (e.g. for 'int16') -- confirm against ``load_audio``.
        sr: Requested sample rate, forwarded to ``load_audio``. The rate
            that ``load_audio`` returns is the one used for extraction.
        L: When truthy, the loaded buffer is passed through
            ``chop_audio(buff, L)`` before feature extraction.
        win_size, win_step, num_cep, nfilt, preemph, appendEnergy:
            Forwarded positionally, in this order, to ``transform_mfcc``.

    Returns:
        Whatever ``transform_mfcc`` returns for the loaded audio.
    """
    rate, samples = load_audio(fname, sr=sr, endian=endian)

    # Optional chopping; skipped for None, 0 or any other falsy L.
    if L:
        samples = chop_audio(samples, L)

    return transform_mfcc(samples, rate, win_size, win_step,
                          num_cep, nfilt, preemph, appendEnergy)