def __init__(
        self,
        feature_extract_by: str = 'librosa',   # which library to use for feature extraction
        sample_rate: int = 16000,              # sample rate of the audio signal
        n_mels: int = 80,                      # number of mel bins / mfc coefficients to retain
        frame_length: int = 20,                # frame length (ms) for the spectrogram
        frame_shift: int = 10,                 # hop length (ms) between STFT windows
        del_silence: bool = False,             # whether to delete silent sections
        input_reverse: bool = True,            # whether to reverse the input sequence
        normalize: bool = False,               # whether to normalize the spectrum
        transform_method: str = 'mel',         # feature type: 'mel', 'mfcc', 'spect' or 'fbank'
        time_mask_para: int = 70,              # SpecAugment: max width of each time mask
        freq_mask_para: int = 12,              # SpecAugment: max width of each frequency mask
        time_mask_num: int = 2,                # SpecAugment: number of time masks to apply
        freq_mask_num: int = 2,                # SpecAugment: number of frequency masks to apply
        sos_id: int = 1,                       # start-of-sentence token id
        eos_id: int = 2,                       # end-of-sentence token id
        target_dict: dict = None,              # target transcript mapping — presumably audio id -> script; confirm against caller
        noise_augment: bool = False,           # whether to apply noise augmentation
        dataset_path: str = None,              # noise dataset path
        noiseset_size: int = 0,                # size of the noise dataset
        noise_level: float = 0.7,              # noise mixing level
) -> None:
    """
    Initialize the spectrogram parser.

    Stores flags and token ids, configures SpecAugment, and builds the
    feature-extraction transform selected by ``transform_method``.

    Raises:
        ValueError: if ``transform_method`` is not one of
            'mel', 'mfcc', 'spect' or 'fbank' (case-insensitive).
    """
    super().__init__(dataset_path, noiseset_size, sample_rate, noise_level, noise_augment)
    self.del_silence = del_silence
    self.input_reverse = input_reverse
    self.normalize = normalize
    self.sos_id = sos_id
    self.eos_id = eos_id
    self.target_dict = target_dict
    self.spec_augment = SpecAugment(time_mask_para, freq_mask_para, time_mask_num, freq_mask_num)

    # Hoist the case normalization instead of recomputing it in every branch.
    method = transform_method.lower()
    if method == 'mel':
        self.transforms = MelSpectrogram(sample_rate, n_mels, frame_length, frame_shift, feature_extract_by)
    elif method == 'mfcc':
        self.transforms = MFCC(sample_rate, n_mels, frame_length, frame_shift, feature_extract_by)
    elif method == 'spect':
        self.transforms = Spectrogram(sample_rate, frame_length, frame_shift, feature_extract_by)
    elif method == 'fbank':
        self.transforms = FilterBank(sample_rate, n_mels, frame_length, frame_shift)
    else:
        raise ValueError("Unsupported feature : {0}".format(transform_method))
def __init__(
        self,
        feature_extract_by: str = 'librosa',   # backend library used for feature extraction
        sample_rate: int = 16000,              # sampling rate of the audio signal
        n_mels: int = 80,                      # number of mel bins / mfc coefficients
        frame_length: int = 20,                # frame length (ms) for the spectrogram
        frame_shift: int = 10,                 # hop length (ms) between STFT windows
        del_silence: bool = False,             # drop silent sections when True
        input_reverse: bool = True,            # reverse the input sequence when True
        normalize: bool = False,               # normalize the spectrum when True
        transform_method: str = 'mel',         # one of: mel, fbank, spect, mfcc
        freq_mask_para: int = 12,              # SpecAugment: max frequency-mask width
        time_mask_num: int = 2,                # SpecAugment: number of time masks
        freq_mask_num: int = 2,                # SpecAugment: number of frequency masks
        sos_id: int = 1,                       # start-of-sentence token id
        eos_id: int = 2,                       # end-of-sentence token id
        dataset_path: str = None,              # noise dataset path
        audio_extension: str = 'pcm'           # audio file extension
) -> None:
    """Store parser flags/ids, set up SpecAugment, and select the feature transform.

    Raises:
        ValueError: if ``transform_method`` is not a supported feature type.
    """
    super(SpectrogramParser, self).__init__(dataset_path)
    self.del_silence = del_silence
    self.input_reverse = input_reverse
    self.normalize = normalize
    self.sos_id = sos_id
    self.eos_id = eos_id
    self.spec_augment = SpecAugment(freq_mask_para, time_mask_num, freq_mask_num)
    self.audio_extension = audio_extension

    # Dispatch table: feature name -> deferred constructor, so only the
    # selected transform is ever instantiated.
    builders = {
        'mel': lambda: MelSpectrogram(sample_rate, n_mels, frame_length, frame_shift, feature_extract_by),
        'mfcc': lambda: MFCC(sample_rate, n_mels, frame_length, frame_shift, feature_extract_by),
        'spect': lambda: Spectrogram(sample_rate, frame_length, frame_shift, feature_extract_by),
        'fbank': lambda: FilterBank(sample_rate, n_mels, frame_length, frame_shift),
    }
    key = transform_method.lower()
    if key not in builders:
        raise ValueError("Unsupported feature : {0}".format(transform_method))
    self.transforms = builders[key]()