def mfcc_sad(debugging):
    """Assemble the mel-spectrogram/MFCC pipeline that includes SAD steps.

    The pipeline chains, in order: audio reading, pre-emphasis, a
    path-to-name conversion through ``WAV_FILES``, STFT, power and mel
    spectrograms, MFCCs, two SAD extractors, SAD application and
    normalization on ``'mspec'``, removal of intermediate features and a
    final cast to ``float16``.

    Parameters
    ----------
    debugging
        Forwarded unchanged as the ``debug`` argument of
        ``pp.make_pipeline``.

    Returns
    -------
    The pipeline object produced by ``pp.make_pipeline``.
    """
    # ====== input ====== #
    loading = [
        pp.speech.AudioReader(sr=16000, sr_new=8000),
        pp.speech.PreEmphasis(coeff=0.97),
        pp.base.Converter(converter=WAV_FILES,
                          input_name='path', output_name='name'),
    ]
    # ====== STFT and spectral features ====== #
    spectral = [
        pp.speech.STFTExtractor(frame_length=0.025, step_length=0.01,
                                n_fft=512, energy=False),
        pp.speech.PowerSpecExtractor(power=2.0, output_name='spec'),
        pp.speech.MelsSpecExtractor(n_mels=24, fmin=20, fmax=3700,
                                    output_name='mspec'),
        pp.speech.MFCCsExtractor(n_ceps=24, remove_first_coef=True,
                                 first_coef_energy=True,
                                 output_name='mfcc'),
    ]
    # ====== SAD ====== #
    # NOTE(review): both SAD extractors are given output_name='sad' --
    # confirm that the second one is meant to share that name.
    detection = [
        pp.speech.SADthreshold(energy_threshold=0.5, smooth_window=5,
                               output_name='sad'),
        pp.speech.SADgmm(nb_mixture=3, smooth_window=3,
                         input_name='energy', output_name='sad'),
        pp.speech.ApplyingSAD(input_name=('mspec',)),
        pp.speech.AcousticNorm(input_name=('mspec',),
                               mean_var_norm=True,
                               windowed_mean_var_norm=True,
                               win_length=121),
    ]
    # ====== cleaning ====== #
    cleanup = [
        pp.base.DeleteFeatures(input_name=('stft', 'raw', 'spec', 'sad',
                                           'sad_threshold', 'energy')),
        pp.base.AsType(dtype='float16'),
    ]
    return pp.make_pipeline(steps=loading + spectral + detection + cleanup,
                            debug=debugging)
def mfcc(augmentation=None):
    """Assemble the MFCC / mel-spectrogram extraction pipeline.

    Parameters
    ----------
    augmentation
        When it is an instance of ``string_types`` the audio comes from
        ``SREAugmentor(augmentation)``; otherwise ``SREAudioReader()`` is
        used.  When it is ``None`` the SAD is computed by ``SADthreshold``;
        for any other value the SAD comes from ``SADreader`` and ``'sad'``
        is added to the list of deleted features.

    Returns
    -------
    The pipeline object produced by ``pp.make_pipeline``.
    """
    # features removed at the end of the pipeline
    delete_list = ['stft', 'spec', 'raw', 'mfcc_energy', 'sad_threshold']
    if augmentation is not None:
        delete_list.append('sad')

    steps = []
    # ====== audio source ====== #
    if isinstance(augmentation, string_types):
        steps.append(SREAugmentor(augmentation))
    else:
        steps.append(SREAudioReader())
    steps.append(pp.speech.PreEmphasis(coeff=0.97, input_name='raw'))
    # ====== STFT ====== #
    steps.append(
        pp.speech.STFTExtractor(frame_length=Config.FRAME_LENGTH,
                                step_length=Config.STEP_LENGTH,
                                n_fft=Config.NFFT,
                                window=Config.WINDOW,
                                padding=False,
                                energy=False))
    # ====== for x-vector ====== #
    steps.append(
        pp.speech.PowerSpecExtractor(power=2.0,
                                     input_name='stft',
                                     output_name='spec'))
    steps.append(
        pp.speech.MelsSpecExtractor(n_mels=24, fmin=20, fmax=3700,
                                    input_name=('spec', 'sr'),
                                    output_name='mspec'))
    steps.append(
        pp.speech.MFCCsExtractor(n_ceps=24,
                                 remove_first_coef=True,
                                 first_coef_energy=True,
                                 input_name='mspec',
                                 output_name='mfcc'))
    # ====== extract SAD ====== #
    if augmentation is None:
        sad_step = pp.speech.SADthreshold(energy_threshold=0.5,
                                          energy_mean_scale=0.5,
                                          frame_context=2,
                                          proportion_threshold=0.12,
                                          smooth_window=5,
                                          input_name='mfcc_energy',
                                          output_name='sad')
    else:
        sad_step = SADreader(
            ds_path=os.path.join(PATH_ACOUSTIC_FEATURES, 'mfcc'))
    steps.append(sad_step)
    # unvoiced frames are dropped only in the EXTRACT_FEATURES state
    extracting = CURRENT_STATE == SystemStates.EXTRACT_FEATURES
    steps.append(
        pp.speech.ApplyingSAD(input_name=('mspec', 'mfcc'),
                              sad_name='sad',
                              keep_unvoiced=not extracting))
    # ====== normalization ====== #
    steps.append(
        pp.speech.AcousticNorm(mean_var_norm=True,
                               windowed_mean_var_norm=True,
                               win_length=301,
                               input_name=('mspec', 'mfcc')))
    # ====== post processing ====== #
    steps.append(pp.base.DeleteFeatures(input_name=delete_list))
    steps.append(pp.base.AsType(dtype='float16'))
    return pp.make_pipeline(steps=steps)
def bnf(augmentation=None):
    """Placeholder for the bottleneck-feature (BNF) recipe; always raises.

    Parameters
    ----------
    augmentation
        Accepted for signature parity with the other recipes; it is not
        read by this function.

    Raises
    ------
    NotImplementedError
        Unconditionally, before any extractor is constructed.
    """
    raise NotImplementedError("The 'bnf' feature recipe is not implemented.")
    # NOTE: everything below this line is unreachable because of the raise
    # above.  It is kept as the draft of the recipe.
    bnf_network = N.models.BNF_2048_MFCC40()
    recipe = pp.make_pipeline(steps=[
        SREAudioReader(),
        pp.speech.PreEmphasis(coeff=0.97),
        # ====== STFT ====== #
        pp.speech.STFTExtractor(frame_length=Config.FRAME_LENGTH,
                                step_length=Config.STEP_LENGTH,
                                n_fft=Config.NFFT,
                                window=Config.WINDOW),
        # ====== SAD ====== #
        pp.base.RenameFeatures(input_name='stft_energy',
                               output_name='energy'),
        pp.speech.SADgmm(nb_mixture=3, smooth_window=3,
                         input_name='energy', output_name='sad'),
        # ====== BNF ====== #
        pp.speech.PowerSpecExtractor(power=2.0, output_name='spec'),
        pp.speech.MelsSpecExtractor(n_mels=Config.NCEPS,
                                    fmin=Config.FMIN, fmax=Config.FMAX,
                                    input_name='spec',
                                    output_name='mspec'),
        pp.speech.MFCCsExtractor(n_ceps=Config.NCEPS,
                                 remove_first_coef=False,
                                 input_name='mspec',
                                 output_name='mfcc'),
        pp.base.AsType(dtype='float32', input_name='mfcc'),
        pp.speech.BNFExtractor(input_name='mfcc', output_name='bnf',
                               sad_name='sad', network=bnf_network,
                               remove_non_speech=True,
                               stack_context=10, pre_mvn=True,
                               batch_size=5218),
        # ====== normalization ====== #
        pp.speech.AcousticNorm(input_name=('bnf',),
                               mean_var_norm=True,
                               windowed_mean_var_norm=True,
                               win_length=301),
        # ====== cleaning ====== #
        pp.base.DeleteFeatures(input_name=('stft', 'raw', 'energy',
                                           'sad_threshold', 'spec',
                                           'mspec', 'mfcc')),
        pp.base.AsType(dtype='float16'),
    ])
    return recipe
def bnf_sad(debugging):
    """Assemble the MFCC + bottleneck-feature pipeline with SAD applied.

    A ``N.models.BNF_2048_MFCC40`` network is instantiated first and handed
    to the ``BNFExtractor`` step.  ``'mspec'`` and ``'mfcc'`` go through
    ``ApplyingSAD`` and the BNF extractor is created with
    ``remove_non_speech=True``.

    Parameters
    ----------
    debugging
        Forwarded unchanged as the ``debug`` argument of
        ``pp.make_pipeline``.

    Returns
    -------
    The pipeline object produced by ``pp.make_pipeline``.
    """
    bnf_network = N.models.BNF_2048_MFCC40()
    # ====== input and STFT ====== #
    loading = [
        pp.speech.AudioReader(sr=16000, sr_new=8000,
                              best_resample=True, remove_dc=True),
        pp.speech.PreEmphasis(coeff=0.97),
        pp.base.Converter(converter=WAV_FILES,
                          input_name='path', output_name='name'),
        pp.speech.STFTExtractor(frame_length=0.025, step_length=0.010,
                                window='hamm', n_fft=512),
    ]
    # ====== SAD ====== #
    detection = [
        pp.base.RenameFeatures(input_name='stft_energy',
                               output_name='energy'),
        pp.speech.SADgmm(nb_mixture=3, smooth_window=3,
                         input_name='energy', output_name='sad'),
    ]
    # ====== spectrogram ====== #
    spectral = [
        pp.speech.PowerSpecExtractor(power=2.0, output_name='spec'),
        pp.speech.MelsSpecExtractor(n_mels=24, fmin=100, fmax=4000,
                                    input_name='spec',
                                    output_name='mspec'),
        pp.speech.MFCCsExtractor(n_ceps=20, remove_first_coef=True,
                                 input_name='mspec', output_name='mfcc'),
        pp.base.DeltaExtractor(input_name='mfcc', order=(0, 1, 2)),
        pp.speech.ApplyingSAD(input_name=('mspec', 'mfcc')),
    ]
    # ====== BNF ====== #
    bottleneck = [
        pp.speech.MelsSpecExtractor(n_mels=40, fmin=100, fmax=4000,
                                    input_name='spec',
                                    output_name='mspec_bnf'),
        pp.speech.MFCCsExtractor(n_ceps=40, remove_first_coef=False,
                                 input_name='mspec_bnf',
                                 output_name='mfcc_bnf'),
        pp.base.AsType(dtype='float32', input_name='mfcc_bnf'),
        pp.speech.BNFExtractor(input_name='mfcc_bnf', output_name='bnf',
                               sad_name='sad', network=bnf_network,
                               remove_non_speech=True,
                               stack_context=10, pre_mvn=True,
                               batch_size=5218),
    ]
    # ====== normalization and cleaning ====== #
    finishing = [
        pp.speech.AcousticNorm(input_name=('mspec', 'bnf', 'mfcc'),
                               mean_var_norm=True,
                               windowed_mean_var_norm=True,
                               win_length=301),
        pp.base.DeleteFeatures(input_name=('stft', 'raw', 'energy', 'sad',
                                           'sad_threshold', 'spec',
                                           'mspec_bnf', 'mfcc_bnf')),
        pp.base.AsType(dtype='float16'),
    ]
    return pp.make_pipeline(
        steps=loading + detection + spectral + bottleneck + finishing,
        debug=debugging)
def mfcc_sad(debugging):
    """Assemble the mel-spectrogram/MFCC pipeline that includes SAD steps.

    The pipeline chains, in order: audio reading, pre-emphasis, a
    path-to-name conversion through ``WAV_FILES``, STFT, power and mel
    spectrograms, MFCCs, two SAD extractors, SAD application and
    normalization on ``'mspec'``, removal of intermediate features and a
    final cast to ``float16``.

    Parameters
    ----------
    debugging
        Forwarded unchanged as the ``debug`` argument of
        ``pp.make_pipeline``.

    Returns
    -------
    The pipeline object produced by ``pp.make_pipeline``.
    """
    # ====== input ====== #
    loading = [
        pp.speech.AudioReader(sr=16000, sr_new=8000),
        pp.speech.PreEmphasis(coeff=0.97),
        pp.base.Converter(converter=WAV_FILES,
                          input_name='path', output_name='name'),
    ]
    # ====== STFT and spectral features ====== #
    spectral = [
        pp.speech.STFTExtractor(frame_length=0.025, step_length=0.01,
                                n_fft=512, energy=False),
        pp.speech.PowerSpecExtractor(power=2.0, output_name='spec'),
        pp.speech.MelsSpecExtractor(n_mels=24, fmin=20, fmax=3700,
                                    output_name='mspec'),
        pp.speech.MFCCsExtractor(n_ceps=24, remove_first_coef=True,
                                 first_coef_energy=True,
                                 output_name='mfcc'),
    ]
    # ====== SAD ====== #
    # NOTE(review): both SAD extractors are given output_name='sad' --
    # confirm that the second one is meant to share that name.
    detection = [
        pp.speech.SADthreshold(energy_threshold=0.5, smooth_window=5,
                               output_name='sad'),
        pp.speech.SADgmm(nb_mixture=3, smooth_window=3,
                         input_name='energy', output_name='sad'),
        pp.speech.ApplyingSAD(input_name=('mspec',)),
        pp.speech.AcousticNorm(input_name=('mspec',),
                               mean_var_norm=True,
                               windowed_mean_var_norm=True,
                               win_length=121),
    ]
    # ====== cleaning ====== #
    cleanup = [
        pp.base.DeleteFeatures(input_name=('stft', 'raw', 'spec', 'sad',
                                           'sad_threshold', 'energy')),
        pp.base.AsType(dtype='float16'),
    ]
    return pp.make_pipeline(steps=loading + spectral + detection + cleanup,
                            debug=debugging)
def mfcc(augmentation=None):
    """Assemble the MFCC / mel-spectrogram extraction pipeline.

    Parameters
    ----------
    augmentation
        When it is an instance of ``string_types`` the audio comes from
        ``SREAugmentor(augmentation)``; otherwise ``SREAudioReader()`` is
        used.  When it is ``None`` the SAD is computed by ``SADthreshold``;
        for any other value the SAD comes from ``SADreader`` and ``'sad'``
        is added to the list of deleted features.

    Returns
    -------
    The pipeline object produced by ``pp.make_pipeline``.
    """
    # features removed at the end of the pipeline
    delete_list = ['stft', 'spec', 'raw', 'mfcc_energy', 'sad_threshold']
    if augmentation is not None:
        delete_list.append('sad')

    steps = []
    # ====== audio source ====== #
    if isinstance(augmentation, string_types):
        steps.append(SREAugmentor(augmentation))
    else:
        steps.append(SREAudioReader())
    steps.append(pp.speech.PreEmphasis(coeff=0.97, input_name='raw'))
    # ====== STFT ====== #
    steps.append(
        pp.speech.STFTExtractor(frame_length=Config.FRAME_LENGTH,
                                step_length=Config.STEP_LENGTH,
                                n_fft=Config.NFFT,
                                window=Config.WINDOW,
                                padding=False,
                                energy=False))
    # ====== for x-vector ====== #
    steps.append(
        pp.speech.PowerSpecExtractor(power=2.0,
                                     input_name='stft',
                                     output_name='spec'))
    steps.append(
        pp.speech.MelsSpecExtractor(n_mels=24, fmin=20, fmax=3700,
                                    input_name=('spec', 'sr'),
                                    output_name='mspec'))
    steps.append(
        pp.speech.MFCCsExtractor(n_ceps=24,
                                 remove_first_coef=True,
                                 first_coef_energy=True,
                                 input_name='mspec',
                                 output_name='mfcc'))
    # ====== extract SAD ====== #
    if augmentation is None:
        sad_step = pp.speech.SADthreshold(energy_threshold=0.5,
                                          energy_mean_scale=0.5,
                                          frame_context=2,
                                          proportion_threshold=0.12,
                                          smooth_window=5,
                                          input_name='mfcc_energy',
                                          output_name='sad')
    else:
        sad_step = SADreader(
            ds_path=os.path.join(PATH_ACOUSTIC_FEATURES, 'mfcc'))
    steps.append(sad_step)
    # unvoiced frames are dropped only in the EXTRACT_FEATURES state
    extracting = CURRENT_STATE == SystemStates.EXTRACT_FEATURES
    steps.append(
        pp.speech.ApplyingSAD(input_name=('mspec', 'mfcc'),
                              sad_name='sad',
                              keep_unvoiced=not extracting))
    # ====== normalization ====== #
    steps.append(
        pp.speech.AcousticNorm(mean_var_norm=True,
                               windowed_mean_var_norm=True,
                               win_length=301,
                               input_name=('mspec', 'mfcc')))
    # ====== post processing ====== #
    steps.append(pp.base.DeleteFeatures(input_name=delete_list))
    steps.append(pp.base.AsType(dtype='float16'))
    return pp.make_pipeline(steps=steps)
def bnf(augmentation=None):
    """Placeholder for the bottleneck-feature (BNF) recipe; always raises.

    Parameters
    ----------
    augmentation
        Accepted for signature parity with the other recipes; it is not
        read by this function.

    Raises
    ------
    NotImplementedError
        Unconditionally, before any extractor is constructed.
    """
    raise NotImplementedError("The 'bnf' feature recipe is not implemented.")
    # NOTE: everything below this line is unreachable because of the raise
    # above.  It is kept as the draft of the recipe.
    bnf_network = N.models.BNF_2048_MFCC40()
    recipe = pp.make_pipeline(steps=[
        SREAudioReader(),
        pp.speech.PreEmphasis(coeff=0.97),
        # ====== STFT ====== #
        pp.speech.STFTExtractor(frame_length=Config.FRAME_LENGTH,
                                step_length=Config.STEP_LENGTH,
                                n_fft=Config.NFFT,
                                window=Config.WINDOW),
        # ====== SAD ====== #
        pp.base.RenameFeatures(input_name='stft_energy',
                               output_name='energy'),
        pp.speech.SADgmm(nb_mixture=3, smooth_window=3,
                         input_name='energy', output_name='sad'),
        # ====== BNF ====== #
        pp.speech.PowerSpecExtractor(power=2.0, output_name='spec'),
        pp.speech.MelsSpecExtractor(n_mels=Config.NCEPS,
                                    fmin=Config.FMIN, fmax=Config.FMAX,
                                    input_name='spec',
                                    output_name='mspec'),
        pp.speech.MFCCsExtractor(n_ceps=Config.NCEPS,
                                 remove_first_coef=False,
                                 input_name='mspec',
                                 output_name='mfcc'),
        pp.base.AsType(dtype='float32', input_name='mfcc'),
        pp.speech.BNFExtractor(input_name='mfcc', output_name='bnf',
                               sad_name='sad', network=bnf_network,
                               remove_non_speech=True,
                               stack_context=10, pre_mvn=True,
                               batch_size=5218),
        # ====== normalization ====== #
        pp.speech.AcousticNorm(input_name=('bnf',),
                               mean_var_norm=True,
                               windowed_mean_var_norm=True,
                               win_length=301),
        # ====== cleaning ====== #
        pp.base.DeleteFeatures(input_name=('stft', 'raw', 'energy',
                                           'sad_threshold', 'spec',
                                           'mspec', 'mfcc')),
        pp.base.AsType(dtype='float16'),
    ])
    return recipe
# Feature-extraction pipeline: audio reader -> pre-emphasis -> name
# conversion -> STFT -> power/mel spectrogram -> MFCC (+ deltas) ->
# threshold SAD -> normalization -> float16 cast.
extractors = pp.make_pipeline(
    steps=[
        pp.speech.AudioReader(sr_new=8000,
                              best_resample=True,
                              remove_dc=True),
        pp.speech.PreEmphasis(coeff=0.97),
        # 'name' is the file's base name up to its first '.'
        pp.base.Converter(
            converter=lambda x: os.path.basename(x).split('.')[0],
            input_name='path',
            output_name='name'),
        # ====== STFT ====== #
        pp.speech.STFTExtractor(frame_length=0.025,
                                step_length=0.005,
                                n_fft=512,
                                window='hamm',
                                energy=False),
        # ====== spectrogram ====== #
        pp.speech.PowerSpecExtractor(power=2.0, output_name='spec'),
        pp.speech.MelsSpecExtractor(n_mels=24,
                                    fmin=64,
                                    fmax=4000,
                                    input_name=('spec', 'sr'),
                                    output_name='mspec'),
        pp.speech.MFCCsExtractor(n_ceps=20,
                                 remove_first_coef=True,
                                 first_coef_energy=True,
                                 input_name='mspec',
                                 output_name='mfcc'),
        pp.base.DeltaExtractor(input_name='mfcc', order=(0, 1, 2)),
        # ====== SAD ====== #
        # the SAD step reads 'energy', so 'mfcc_energy' is renamed first
        pp.base.RenameFeatures(input_name='mfcc_energy',
                               output_name='energy'),
        pp.speech.SADthreshold(energy_threshold=0.55,
                               smooth_window=5,
                               input_name='energy',
                               output_name='sad'),
        # ====== normalization ====== #
        pp.base.DeleteFeatures(
            input_name=('stft', 'spec', 'sad_threshold')),
        pp.speech.AcousticNorm(mean_var_norm=True,
                               windowed_mean_var_norm=True,
                               input_name=('mspec', 'mfcc')),
        # ====== post processing ====== #
        pp.base.AsType(dtype='float16'),
    ],
    debug=False)
mpi = MPI(jobs=cmds, func=mpi_fn, ncpu=cpu_count() - 1, batch=12) for i in mpi: prog.add(i) # =========================================================================== # Extract Acoustic features # =========================================================================== jobs = get_all_files(wav_path, filter_func=lambda x: '.wav' == x[-4:]) assert len(jobs) == TOTAL_FILES # ====== configuration ====== # if not os.path.exists(outpath) or args.ds: extractors = pp.make_pipeline(steps=[ pp.speech.AudioReader(sr=None, sr_new=8000, best_resample=True, remove_dc=True), pp.base.Converter( converter=lambda x: os.path.basename(x).split('.')[0], input_name='path', output_name='name'), pp.base.AsType(dtype='float16', input_name='raw') ], debug=False) processor = pp.FeatureProcessor(jobs=jobs, path=outpath, extractor=extractors, n_cache=0.08, ncpu=None, override=True) processor.run() pp.validate_features(processor, path='/tmp/tidigits', nb_samples=12,
# Full acoustic pipeline: STFT, GMM-based SAD, mel spectrogram, bottleneck
# features (BNF), MFCCs with deltas, SDC and normalization.
extractors = pp.make_pipeline(steps=[
    pp.speech.AudioReader(sr=FeatureConfigs.sr, dataset=audio),
    pp.speech.PreEmphasis(coeff=0.97),
    pp.speech.Dithering(),
    # ====== STFT ====== #
    pp.speech.STFTExtractor(frame_length=FeatureConfigs.frame_length,
                            step_length=FeatureConfigs.step_length,
                            n_fft=FeatureConfigs.n_fft,
                            window=FeatureConfigs.window),
    # ====== SAD ====== #
    pp.base.RenameFeatures(input_name='stft_energy', output_name='energy'),
    pp.speech.SADgmm(nb_mixture=3, nb_train_it=25,
                     input_name='energy', output_name='sad'),
    # ====== for x-vector ====== #
    pp.speech.PowerSpecExtractor(power=2.0, output_name='spec'),
    pp.speech.MelsSpecExtractor(n_mels=24,
                                fmin=FeatureConfigs.fmin,
                                fmax=FeatureConfigs.fmax,
                                input_name=('spec', 'sr'),
                                output_name='mspec'),
    # ====== BNF ====== #
    pp.speech.MelsSpecExtractor(n_mels=FeatureConfigs.n_mels,
                                fmin=FeatureConfigs.fmin,
                                fmax=FeatureConfigs.fmax,
                                input_name=('spec', 'sr'),
                                output_name='mspec_bnf'),
    # The BNF MFCCs are derived from the dedicated 'mspec_bnf' spectrogram
    # computed just above, not from the 24-band 'mspec'; otherwise
    # 'mspec_bnf' would be computed and deleted without ever being read.
    pp.speech.MFCCsExtractor(n_ceps=FeatureConfigs.n_ceps,
                             remove_first_coef=False,
                             input_name='mspec_bnf',
                             output_name='mfcc_bnf'),
    pp.base.AsType(dtype='float32', input_name='mfcc_bnf'),
    pp.speech.BNFExtractor(input_name='mfcc_bnf', output_name='bnf',
                           stack_context=10, pre_mvn=True,
                           sad_name='sad', remove_non_speech=False,
                           network=bnf_network,
                           batch_size=2048),
    # ====== MFCCs with deltas ====== #
    pp.speech.MFCCsExtractor(n_ceps=20, remove_first_coef=True,
                             input_name='mspec', output_name='mfcc'),
    pp.base.DeltaExtractor(input_name='mfcc', order=(0, 1, 2)),
    # ====== SDC ====== #
    pp.speech.MFCCsExtractor(n_ceps=7, remove_first_coef=True,
                             input_name='mspec', output_name='sdc'),
    pp.speech.RASTAfilter(rasta=True, input_name='sdc', output_name='sdc'),
    # ====== normalization ====== #
    pp.base.DeleteFeatures(input_name=('stft', 'spec',
                                       'mspec_bnf', 'mfcc_bnf',
                                       'sad_threshold')),
    pp.speech.AcousticNorm(mean_var_norm=True, windowed_mean_var_norm=True,
                           sad_name=None, ignore_sad_error=True,
                           input_name=('mspec', 'mfcc', 'sdc', 'bnf')),
    # ====== post processing ====== #
    pp.base.EqualizeShape0(input_name=('mspec', 'mfcc', 'sdc', 'bnf',
                                       'energy', 'sad')),
    pp.base.AsType(dtype='float16'),
], debug=args.debug)
def bnf_all(debugging):
    """Assemble the MFCC + bottleneck-feature pipeline without SAD removal.

    A ``N.models.BNF_2048_MFCC40`` network is instantiated first and handed
    to the ``BNFExtractor`` step.  No ``ApplyingSAD`` step is present and
    the BNF extractor is created with ``remove_non_speech=False``.

    Parameters
    ----------
    debugging
        Forwarded unchanged as the ``debug`` argument of
        ``pp.make_pipeline``.

    Returns
    -------
    The pipeline object produced by ``pp.make_pipeline``.
    """
    bnf_network = N.models.BNF_2048_MFCC40()
    # ====== input and STFT ====== #
    loading = [
        pp.speech.AudioReader(sr=16000, sr_new=8000,
                              best_resample=True, remove_dc=True),
        pp.speech.PreEmphasis(coeff=0.97),
        pp.base.Converter(converter=WAV_FILES,
                          input_name='path', output_name='name'),
        pp.speech.STFTExtractor(frame_length=0.025, step_length=0.010,
                                window='hamm', n_fft=512),
    ]
    # ====== SAD ====== #
    detection = [
        pp.base.RenameFeatures(input_name='stft_energy',
                               output_name='energy'),
        pp.speech.SADgmm(nb_mixture=3, smooth_window=3,
                         input_name='energy', output_name='sad'),
    ]
    # ====== spectrogram ====== #
    spectral = [
        pp.speech.PowerSpecExtractor(power=2.0, output_name='spec'),
        pp.speech.MelsSpecExtractor(n_mels=24, fmin=100, fmax=4000,
                                    input_name='spec',
                                    output_name='mspec'),
        pp.speech.MFCCsExtractor(n_ceps=20, remove_first_coef=True,
                                 input_name='mspec', output_name='mfcc'),
        pp.base.DeltaExtractor(input_name='mfcc', order=(0, 1, 2)),
    ]
    # ====== BNF ====== #
    bottleneck = [
        pp.speech.MelsSpecExtractor(n_mels=40, fmin=100, fmax=4000,
                                    input_name='spec',
                                    output_name='mspec_bnf'),
        pp.speech.MFCCsExtractor(n_ceps=40, remove_first_coef=False,
                                 input_name='mspec_bnf',
                                 output_name='mfcc_bnf'),
        pp.base.AsType(dtype='float32', input_name='mfcc_bnf'),
        pp.speech.BNFExtractor(input_name='mfcc_bnf', output_name='bnf',
                               sad_name='sad', network=bnf_network,
                               remove_non_speech=False,
                               stack_context=10, pre_mvn=True,
                               batch_size=1234),
    ]
    # ====== normalization and cleaning ====== #
    finishing = [
        pp.speech.AcousticNorm(input_name=('mspec', 'bnf', 'mfcc'),
                               mean_var_norm=True,
                               windowed_mean_var_norm=True,
                               win_length=301),
        pp.base.DeleteFeatures(input_name=('stft', 'raw', 'energy', 'sad',
                                           'sad_threshold', 'spec',
                                           'mspec_bnf', 'mfcc_bnf')),
        pp.base.AsType(dtype='float16'),
    ]
    return pp.make_pipeline(
        steps=loading + detection + spectral + bottleneck + finishing,
        debug=debugging)
print(fmt % tuple(line)) # =========================================================================== # More detail pipeline # =========================================================================== pp1 = make_pipeline(steps=[ speech.AudioReader(), speech.STFTExtractor(frame_length=0.025, padding=False), # spectra analysis speech.PowerSpecExtractor(output_name='spec', power=1.0), speech.PowerSpecExtractor(output_name='pspec', power=2.0), speech.Power2Db(input_name='pspec', output_name='db'), # Cepstra analysis speech.MelsSpecExtractor(n_mels=40, input_name=('pspec', 'sr')), speech.MFCCsExtractor(n_ceps=13, input_name='mspec'), # others speech.PitchExtractor(frame_length=0.025, f0=True), speech.SADgmm(input_name='stft_energy'), speech.RASTAfilter(input_name='mfcc', output_name='rasta'), base.EqualizeShape0(input_name=None), speech.AcousticNorm(input_name=('mfcc', 'mspec', 'spec'), output_name=('mfcc_norm', 'mspec_norm', 'spec_norm')), speech.ApplyingSAD(input_name='mfcc', output_name='mfcc_sad'), base.StackFeatures(n_context=4, input_name='mfcc') ]) formatted_printer(feats=pp1.transform(AUDIO_PATH)) print("///////////////////////////") # =========================================================================== # Fast pipeline # =========================================================================== pp2 = make_pipeline(steps=[
# Full acoustic pipeline: STFT, GMM-based SAD, mel spectrogram, bottleneck
# features (BNF), MFCCs with deltas, SDC and normalization.
extractors = pp.make_pipeline(
    steps=[
        pp.speech.AudioReader(sr=FeatureConfigs.sr, dataset=audio),
        pp.speech.PreEmphasis(coeff=0.97),
        pp.speech.Dithering(),
        # ====== STFT ====== #
        pp.speech.STFTExtractor(frame_length=FeatureConfigs.frame_length,
                                step_length=FeatureConfigs.step_length,
                                n_fft=FeatureConfigs.n_fft,
                                window=FeatureConfigs.window),
        # ====== SAD ====== #
        pp.base.RenameFeatures(input_name='stft_energy',
                               output_name='energy'),
        pp.speech.SADgmm(nb_mixture=3, nb_train_it=25,
                         input_name='energy', output_name='sad'),
        # ====== for x-vector ====== #
        pp.speech.PowerSpecExtractor(power=2.0, output_name='spec'),
        pp.speech.MelsSpecExtractor(n_mels=24,
                                    fmin=FeatureConfigs.fmin,
                                    fmax=FeatureConfigs.fmax,
                                    input_name=('spec', 'sr'),
                                    output_name='mspec'),
        # ====== BNF ====== #
        pp.speech.MelsSpecExtractor(n_mels=FeatureConfigs.n_mels,
                                    fmin=FeatureConfigs.fmin,
                                    fmax=FeatureConfigs.fmax,
                                    input_name=('spec', 'sr'),
                                    output_name='mspec_bnf'),
        # The BNF MFCCs are derived from the dedicated 'mspec_bnf'
        # spectrogram computed just above, not from the 24-band 'mspec';
        # otherwise 'mspec_bnf' would be computed and deleted without ever
        # being read.
        pp.speech.MFCCsExtractor(n_ceps=FeatureConfigs.n_ceps,
                                 remove_first_coef=False,
                                 input_name='mspec_bnf',
                                 output_name='mfcc_bnf'),
        pp.base.AsType(dtype='float32', input_name='mfcc_bnf'),
        pp.speech.BNFExtractor(input_name='mfcc_bnf', output_name='bnf',
                               stack_context=10, pre_mvn=True,
                               sad_name='sad', remove_non_speech=False,
                               network=bnf_network,
                               batch_size=2048),
        # ====== MFCCs with deltas ====== #
        pp.speech.MFCCsExtractor(n_ceps=20, remove_first_coef=True,
                                 input_name='mspec', output_name='mfcc'),
        pp.base.DeltaExtractor(input_name='mfcc', order=(0, 1, 2)),
        # ====== SDC ====== #
        pp.speech.MFCCsExtractor(n_ceps=7, remove_first_coef=True,
                                 input_name='mspec', output_name='sdc'),
        pp.speech.RASTAfilter(rasta=True,
                              input_name='sdc', output_name='sdc'),
        # ====== normalization ====== #
        pp.base.DeleteFeatures(input_name=('stft', 'spec',
                                           'mspec_bnf', 'mfcc_bnf',
                                           'sad_threshold')),
        pp.speech.AcousticNorm(mean_var_norm=True,
                               windowed_mean_var_norm=True,
                               sad_name=None, ignore_sad_error=True,
                               input_name=('mspec', 'mfcc', 'sdc', 'bnf')),
        # ====== post processing ====== #
        pp.base.EqualizeShape0(input_name=('mspec', 'mfcc', 'sdc', 'bnf',
                                           'energy', 'sad')),
        pp.base.AsType(dtype='float16'),
    ],
    debug=args.debug)
# NOTE(review): this section was reconstructed from a collapsed line; the
# nesting of the statements below (what belongs to the `if`, and whether
# the MPI loop sat inside an enclosing block) should be confirmed.
mpi = MPI(jobs=cmds, func=mpi_fn, ncpu=cpu_count() - 1, batch=12)
for i in mpi:
    prog.add(i)
# ===========================================================================
# Extract Acoustic features
# ===========================================================================
# collect every file under wav_path whose last four characters are '.wav'
jobs = get_all_files(wav_path,
                     filter_func=lambda x: x[-4:] == '.wav')
assert len(jobs) == TOTAL_FILES
# ====== configuration ====== #
# processing runs when the output path is missing or args.ds is truthy
if not os.path.exists(outpath) or args.ds:
    extractors = pp.make_pipeline(
        steps=[
            pp.speech.AudioReader(sr=None,
                                  sr_new=8000,
                                  best_resample=True,
                                  remove_dc=True),
            # 'name' is the file's base name up to its first '.'
            pp.base.Converter(
                converter=lambda x: os.path.basename(x).split('.')[0],
                input_name='path',
                output_name='name'),
            pp.base.AsType(dtype='float16', input_name='raw'),
        ],
        debug=False)
    processor = pp.FeatureProcessor(jobs=jobs,
                                    path=outpath,
                                    extractor=extractors,
                                    n_cache=0.08,
                                    ncpu=None,
                                    override=True)
    processor.run()
    pp.validate_features(processor,
                         path='/tmp/tidigits',
                         nb_samples=12,
                         override=True)
    # store the README text next to the extracted features
    with open(os.path.join(outpath, 'README'), 'w') as f:
        f.write(README)
# ====== check the preprocessed dataset ====== #
ds = F.Dataset(outpath, read_only=True)
print(ds)
print(ctext(ds.md5, 'yellow'))
ds.close()
len(os.listdir(PATH_ACOUSTIC_FEATURES)) != 14 or \ bool(args.acous): extractors = pp.make_pipeline(steps=[ pp.speech.AudioReader(sr_new=8000, best_resample=True, remove_dc=True), pp.speech.PreEmphasis(coeff=0.97), pp.base.Converter(converter=lambda x: os.path.basename(x).split('.')[0], input_name='path', output_name='name'), # ====== STFT ====== # pp.speech.STFTExtractor(frame_length=0.025, step_length=0.005, n_fft=512, window='hamm', energy=False), # ====== spectrogram ====== # pp.speech.PowerSpecExtractor(power=2.0, output_name='spec'), pp.speech.MelsSpecExtractor(n_mels=24, fmin=64, fmax=4000, input_name=('spec', 'sr'), output_name='mspec'), pp.speech.MFCCsExtractor(n_ceps=20, remove_first_coef=True, first_coef_energy=True, input_name='mspec', output_name='mfcc'), pp.base.DeltaExtractor(input_name='mfcc', order=(0, 1, 2)), # ====== SAD ====== # pp.base.RenameFeatures(input_name='mfcc_energy', output_name='energy'), pp.speech.SADthreshold(energy_threshold=0.55, smooth_window=5, input_name='energy', output_name='sad'), # ====== normalization ====== # pp.base.DeleteFeatures(input_name=('stft', 'spec', 'sad_threshold')), pp.speech.AcousticNorm(mean_var_norm=True, windowed_mean_var_norm=True, input_name=('mspec', 'mfcc')), # ====== post processing ====== # pp.base.AsType(dtype='float16'), ], debug=False) with np.warnings.catch_warnings(): np.warnings.filterwarnings('ignore')