Esempio n. 1
0
    def parse_audio(self, audio_path):
        """Load an audio file and return its log-magnitude spectrogram.

        The waveform is read from ``audio_path`` (through the randomly
        augmenting loader when ``self.augment`` is set), optionally mixed
        with noise by ``self.noiseInjector``, transformed with a short-time
        Fourier transform and compressed with ``log(1 + magnitude)``.

        Args:
            audio_path: Location of the audio file; passed unchanged to the
                loader helpers.

        Returns:
            A ``torch.FloatTensor`` holding ``log1p`` of the STFT magnitude.
            When ``self.normalize`` is set, the tensor is shifted to zero
            mean and, if its standard deviation is positive, scaled to unit
            standard deviation.
        """
        if self.augment:
            y = load_randomly_augmented_audio(audio_path, self.sample_rate)
        else:
            y = load_audio(audio_path)

        if self.noiseInjector:
            # One Bernoulli draw: noise is added with probability noise_prob.
            add_noise = np.random.binomial(1, self.noise_prob)
            if add_noise:
                # Log only when noise is really applied, so the log reflects
                # what happened to this sample.
                logging.info("inject noise")
                y = self.noiseInjector.inject_noise(y)

        # Window and hop lengths in samples, derived from the sample rate.
        # NOTE(review): presumably window_size / window_stride are in
        # seconds -- confirm against the class constructor.
        n_fft = int(self.sample_rate * self.window_size)
        win_length = n_fft
        hop_length = int(self.sample_rate * self.window_stride)

        # Short-time Fourier transform (STFT)
        D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                         win_length=win_length, window=self.window)
        # Only the magnitude is used; the phase component is discarded.
        spect, _ = librosa.magphase(D)

        # S = log(S+1)
        spect = np.log1p(spect)
        spect = torch.FloatTensor(spect)

        if self.normalize:
            mean = spect.mean()
            std = spect.std()
            spect.add_(-mean)
            # A constant spectrogram (e.g. digital silence) has zero std;
            # dividing would turn every value into NaN. The comparison is
            # also False for a NaN std, so that case is skipped as well.
            if std > 0:
                spect.div_(std)

        return spect
def transform_mfcc_from_file(fname, endian='in16', sr=16000, L=None, win_size=0.025, win_step=0.01,
                             num_cep=13, nfilt=26, preemph=0.97, appendEnergy=True):
    """Compute MFCC features for the audio stored in ``fname``.

    The file is read with ``load_audio``, optionally passed through
    ``chop_audio`` when ``L`` is truthy, and the resulting samples are handed
    to ``transform_mfcc`` together with the sample rate reported by the
    loader.

    Args:
        fname: Audio file to read; forwarded to ``load_audio``.
        endian: Forwarded to ``load_audio`` as ``endian``.
            NOTE(review): the default ``'in16'`` may be a typo for
            ``'int16'`` -- confirm against ``load_audio``.
        sr: Sample rate requested from ``load_audio``. The rate that
            ``load_audio`` returns is the one used for feature extraction.
        L: When truthy, the second argument given to ``chop_audio``;
            ``None`` or ``0`` skips chopping.
        win_size, win_step, num_cep, nfilt, preemph, appendEnergy:
            Forwarded positionally, in this order, to ``transform_mfcc``.

    Returns:
        Whatever ``transform_mfcc`` returns for the loaded samples.
    """
    rate, samples = load_audio(fname, sr=sr, endian=endian)

    # Chopping is opt-in: a falsy L leaves the samples untouched.
    samples = chop_audio(samples, L) if L else samples

    return transform_mfcc(samples, rate, win_size, win_step, num_cep, nfilt,
                          preemph, appendEnergy)