def parse_audio(self, audio_path: str, augment_method: int) -> Tensor:
    """
    Load an audio file and turn it into a model-ready feature tensor.

    Args:
        audio_path (str): path of the audio file to load
        augment_method (int): flag selecting which augmentation to apply

    Returns:
        torch.FloatTensor: feature extracted from the audio file
    """
    signal = load_audio(audio_path, self.del_silence)

    # Noise injection happens on the raw waveform, before feature extraction.
    if augment_method in (SpectrogramParser.NOISE_INJECTION, SpectrogramParser.HYBRID_AUGMENT):
        signal = self.noise_injector(signal)

    feature_vector = self.transforms(signal)

    if self.normalize:
        feature_vector -= feature_vector.mean()
        feature_vector /= np.std(feature_vector)

    # Refer to "Sequence to Sequence Learning with Neural Network" paper:
    # reversing the input sequence can help seq2seq models.
    if self.input_reverse:
        # Reverse along the second axis, then lay the result out transposed.
        time_reversed = feature_vector[:, ::-1]
        feature_vector = FloatTensor(np.ascontiguousarray(np.swapaxes(time_reversed, 0, 1)))
    else:
        feature_vector = FloatTensor(feature_vector).transpose(0, 1)

    # SpecAugment operates on the extracted feature, after reshaping.
    if augment_method in (SpectrogramParser.SPEC_AUGMENT, SpectrogramParser.HYBRID_AUGMENT):
        feature_vector = self.spec_augment(feature_vector)

    return feature_vector
def parse_audio(audio_path: str, del_silence: bool = True) -> Tensor:
    """
    Parse an audio file into a mean-normalized log-mel filterbank feature.

    Args:
        audio_path (str): path of the audio file to load
        del_silence (bool): whether to remove silence from the signal

    Returns:
        Tensor: filterbank feature; kaldi.fbank yields (num_frames, num_mel_bins),
        and that orientation is returned unchanged.
    """
    signal = load_audio(audio_path, del_silence)
    # FIX: the original transposed the fbank output to numpy and then
    # transposed the rebuilt Tensor straight back — a no-op pair that only
    # forced an extra non-contiguous copy. Both transposes are removed.
    feature_vector = torchaudio.compliance.kaldi.fbank(
        Tensor(signal).unsqueeze(0),
        num_mel_bins=80,
        frame_length=20,
        frame_shift=10,
        window_type='hamming',
    ).numpy()
    feature_vector -= feature_vector.mean()
    return Tensor(feature_vector)
def parse_audio(audio_path: str, del_silence: bool = True) -> Tensor:
    """
    Parse an audio file into a time-reversed MFCC feature tensor.

    Args:
        audio_path (str): path of the audio file to load
        del_silence (bool): whether to remove silence from the signal

    Returns:
        torch.FloatTensor: MFCC feature laid out as (num_frames, n_mfcc) with
        the time axis reversed (cf. "Sequence to Sequence Learning with
        Neural Networks").
    """
    signal = load_audio(audio_path, del_silence)
    mfcc = librosa.feature.mfcc(y=signal, sr=16000, n_mfcc=40, n_fft=320, hop_length=160)
    mfcc -= mfcc.mean()
    # BUG FIX: the original converted to a torch Tensor and then sliced with a
    # negative step (mfcc[:, ::-1]), which PyTorch does not support and raises
    # ValueError ("step must be greater than zero"). Perform the time reversal
    # in numpy on the (n_mfcc, time) array, then convert once — matching the
    # input_reverse pattern used by the other parse_audio variants in this file.
    mfcc = mfcc[:, ::-1]
    return torch.FloatTensor(np.ascontiguousarray(np.swapaxes(mfcc, 0, 1)))
def parse_audio(audio_path: str, del_silence: bool = False, audio_extension: str = 'pcm') -> Tensor:
    """
    Parse an audio file into a fully normalized log-mel filterbank feature.

    Args:
        audio_path (str): path of the audio file to load
        del_silence (bool): whether to remove silence from the signal
        audio_extension (str): file extension of the audio (default: 'pcm')

    Returns:
        torch.FloatTensor: filterbank feature; kaldi.fbank yields
        (num_frames, num_mel_bins), and that orientation is returned unchanged.
    """
    signal = load_audio(audio_path, del_silence, extension=audio_extension)
    # FIX: the original transposed the fbank output to numpy and then
    # transposed the rebuilt FloatTensor straight back — a no-op pair that only
    # forced an extra non-contiguous copy. Both transposes are removed.
    feature = torchaudio.compliance.kaldi.fbank(
        waveform=Tensor(signal).unsqueeze(0),
        num_mel_bins=80,
        frame_length=20,
        frame_shift=10,
        window_type='hamming',
    ).numpy()
    # Global mean/std normalization of the whole feature map.
    feature -= feature.mean()
    feature /= np.std(feature)
    return torch.FloatTensor(feature)
def parse_audio(self, audio_path: str, augment_method: int) -> Tensor:
    """
    Load an audio file and convert it into a feature tensor.

    Args:
        audio_path (str): path of the audio file to load
        augment_method (int): flag selecting which augmentation to apply

    Returns:
        torch.FloatTensor: feature extracted from the audio file, or None
        when the audio could not be loaded.
    """
    signal = load_audio(audio_path, self.del_silence, extension=self.audio_extension)

    # Guard clause: loading may fail; report and bail out instead of crashing.
    if signal is None:
        logger.info("Audio is None : {0}".format(audio_path))
        return None

    feature = self.transforms(signal)

    if self.normalize:
        # Global mean/std normalization (same value as the in-place form).
        feature = (feature - feature.mean()) / np.std(feature)

    # Refer to "Sequence to Sequence Learning with Neural Network" paper:
    # reversing the input sequence can help seq2seq models.
    if self.input_reverse:
        time_reversed = feature[:, ::-1]
        feature = FloatTensor(np.ascontiguousarray(np.swapaxes(time_reversed, 0, 1)))
    else:
        feature = FloatTensor(feature).transpose(0, 1)

    # SpecAugment operates on the extracted feature, after reshaping.
    if augment_method == SpectrogramParser.SPEC_AUGMENT:
        feature = self.spec_augment(feature)

    return feature