def __init__(
    self,
    file_name,
    sequence_len: int,
    hop: int,
    sr: int = 44100,
    fft_size: int = 4096,
    fft_hop: int = 441,
    n_freq_bins: int = 256,
    freq_compression: str = "linear",
    f_min: int = 200,
    f_max: int = 18000,
    cache_dir=None,
) -> None:
    """Load one audio file and wrap its spectrogram pipeline in a cache.

    Args:
        file_name: Audio file handed to ``T.load_audio_file``.
        sequence_len: Stored unchanged on ``self.sequence_len``.
        hop: Stored unchanged on ``self.hop``.
        sr: Sampling rate used for loading and for frequency compression.
        fft_size: FFT size for ``T.Spectrogram`` and ``T.CachedSpectrogram``.
        fft_hop: Hop length for ``T.Spectrogram`` and
            ``T.CachedSpectrogram``.
        n_freq_bins: Number of bins requested from the frequency
            compression transform.
        freq_compression: One of ``"linear"``, ``"mel"`` or ``"mfcc"``.
        f_min: Lower frequency bound passed to the compression transform.
        f_max: Upper frequency bound passed to the compression transform.
        cache_dir: Forwarded as-is to ``T.CachedSpectrogram``.

    Raises:
        ValueError: If ``freq_compression`` is not a supported value.
    """
    self.sequence_len = sequence_len
    self.hop = hop
    self.audio = T.load_audio_file(file_name, sr=sr, mono=True)
    # Size of the loaded audio along axis 1 (presumably the number of
    # samples -- confirm against T.load_audio_file).
    self.n_frames = self.audio.shape[1]

    transforms = [
        T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
        T.Spectrogram(fft_size, fft_hop, center=False),
    ]

    if freq_compression == "linear":
        transforms.append(T.Interpolate(n_freq_bins, sr, f_min, f_max))
    elif freq_compression == "mel":
        transforms.append(
            T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
        )
    elif freq_compression == "mfcc":
        t_mel = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
        transforms.append(T.Compose(t_mel, T.M2MFCC()))
    else:
        # Raising a plain string is itself a TypeError in Python 3, so raise
        # a real exception that names the offending value.
        raise ValueError(
            "Undefined frequency compression: {}".format(freq_compression)
        )

    transforms.append(
        T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"])
    )
    transforms.append(
        T.Normalize(
            min_level_db=DefaultSpecDatasetOps["min_level_db"],
            ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
        )
    )

    # NOTE: no explicit file_reader is passed here, so CachedSpectrogram
    # falls back to whatever its own default is.
    self.t = T.CachedSpectrogram(
        cache_dir=cache_dir,
        spec_transform=T.Compose(transforms),
        n_fft=fft_size,
        hop_length=fft_hop,
    )
def __init__(
    self,
    file_name,
    sequence_len: int,
    hop: int,
    sr: int = 44100,
    fft_size: int = 4096,
    fft_hop: int = 441,
    n_freq_bins: int = 256,
    freq_compression: str = "linear",
    f_min: int = 200,
    f_max: int = 18000,
    center=True,
) -> None:
    """Load one audio file and prepare separate spectrogram transforms.

    Unlike a single composed pipeline, the stages are kept as individual
    attributes (``spec_transforms``, ``t_compr_f``, ``t_compr_a``,
    ``t_norm``).

    Args:
        file_name: Audio file handed to ``T.load_audio_file``; also stored
            on ``self.filename``.
        sequence_len: Stored unchanged on ``self.sequence_len``.
        hop: Stored unchanged on ``self.hop``.
        sr: Sampling rate used for loading and for frequency compression.
        fft_size: FFT size for ``T.Spectrogram``.
        fft_hop: Hop length for ``T.Spectrogram``.
        n_freq_bins: Number of bins requested from the frequency
            compression transform.
        freq_compression: One of ``"linear"``, ``"mel"`` or ``"mfcc"``.
        f_min: Lower frequency bound passed to the compression transform.
        f_max: Upper frequency bound passed to the compression transform.
        center: Forwarded to ``T.Spectrogram`` as its ``center`` argument.

    Raises:
        ValueError: If ``freq_compression`` is not a supported value.
    """
    self.sp = signal.signal_proc()
    self.hop = hop
    self.center = center
    self.filename = file_name
    self.sequence_len = sequence_len
    self.audio = T.load_audio_file(file_name, sr=sr, mono=True)
    # Size of the loaded audio along axis 1 (presumably the number of
    # samples -- confirm against T.load_audio_file).
    self.n_frames = self.audio.shape[1]

    self.spec_transforms = T.Compose(
        [
            T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
            T.Spectrogram(fft_size, fft_hop, center=self.center),
        ]
    )

    if freq_compression == "linear":
        self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
    elif freq_compression == "mel":
        self.t_compr_f = T.F2M(
            sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max
        )
    elif freq_compression == "mfcc":
        t_mel = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
        self.t_compr_f = T.Compose(t_mel, T.M2MFCC())
    else:
        # Raising a plain string is itself a TypeError in Python 3, so raise
        # a real exception that names the offending value.
        raise ValueError(
            "Undefined frequency compression: {}".format(freq_compression)
        )

    self.t_compr_a = T.Amp2Db(
        min_level_db=DefaultSpecDatasetOps["min_level_db"]
    )
    self.t_norm = T.Normalize(
        min_level_db=DefaultSpecDatasetOps["min_level_db"],
        ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
    )
def __init__(
    self,
    file_name,
    sequence_len: int,
    hop: int,
    sr: int = 44100,
    fft_size: int = 4096,
    fft_hop: int = 441,
    n_freq_bins: int = 256,
    freq_compression: str = "linear",
    f_min: int = 200,
    f_max: int = 18000,
) -> None:
    """Load one audio file and compose its full spectrogram pipeline.

    Args:
        file_name: Audio file handed to ``T.load_audio_file``.
        sequence_len: Stored unchanged on ``self.sequence_len``.
        hop: Stored unchanged on ``self.hop``.
        sr: Sampling rate used for loading and for frequency compression.
        fft_size: FFT size for ``T.Spectrogram``.
        fft_hop: Hop length for ``T.Spectrogram``.
        n_freq_bins: Number of bins requested from the frequency
            compression transform.
        freq_compression: One of ``"linear"``, ``"mel"`` or ``"mfcc"``.
        f_min: Lower frequency bound passed to the compression transform.
        f_max: Upper frequency bound passed to the compression transform.

    Raises:
        ValueError: If ``freq_compression`` is not a supported value.
    """
    self.sequence_len = sequence_len
    self.hop = hop
    self.audio = T.load_audio_file(file_name, sr=sr, mono=True)
    # total num of samples in the audio (transposed mono)
    self.n_frames = self.audio.shape[1]

    transforms = [
        T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
        T.Spectrogram(fft_size, fft_hop, center=False),
    ]

    if freq_compression == "linear":
        transforms.append(T.Interpolate(n_freq_bins, sr, f_min, f_max))
    elif freq_compression == "mel":
        transforms.append(
            T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
        )
    elif freq_compression == "mfcc":
        t_mel = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
        transforms.append(T.Compose(t_mel, T.M2MFCC()))
    else:
        # Raising a plain string is itself a TypeError in Python 3, so raise
        # a real exception that names the offending value.
        raise ValueError(
            "Undefined frequency compression: {}".format(freq_compression)
        )

    transforms.append(
        T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"])
    )
    transforms.append(
        T.Normalize(
            min_level_db=DefaultSpecDatasetOps["min_level_db"],
            ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
        )
    )
    self.t = T.Compose(transforms)
def __init__(
    self,
    file_names: Iterable[str],
    working_dir=None,
    cache_dir=None,
    sr=44100,
    n_fft=4096,
    hop_length=441,
    freq_compression="linear",
    n_freq_bins=256,
    f_min=0,
    f_max=18000,
    seq_len=128,
    augmentation=False,
    noise_files=None,
    min_max_normalize=False,
    *args,
    **kwargs
) -> None:
    """Set up the spectrogram, augmentation and normalization transforms.

    Args:
        file_names: Audio files, forwarded to the parent constructor.
        working_dir: Forwarded to the parent constructor.
        cache_dir: If given, spectrograms go through
            ``T.CachedSpectrogram`` with this directory; otherwise the
            plain composed transform is used.
        sr: Sampling rate used for loading and for frequency compression.
        n_fft: FFT size for ``T.Spectrogram``.
        hop_length: Hop length for ``T.Spectrogram``.
        freq_compression: One of ``"linear"``, ``"mel"`` or ``"mfcc"``.
        n_freq_bins: Number of bins requested from the frequency
            compression transform.
        f_min: Lower frequency bound passed to the compression transform.
        f_max: Upper frequency bound passed to the compression transform.
        seq_len: Length passed to ``T.PaddedSubsequenceSampler`` and used as
            ``min_length`` for ``T.RandomAddNoise``.
        augmentation: Enables the amplitude/time-stretch/pitch-shift
            transforms, optional noise addition and random subsequence
            sampling.
        noise_files: Noise files for ``T.RandomAddNoise``; ``None`` or an
            empty collection disables noise addition.
        min_max_normalize: Use ``T.MinMaxNormalize`` instead of the
            dB-based ``T.Normalize``.
        *args: Forwarded to the parent constructor.
        **kwargs: Forwarded to the parent constructor.

    Raises:
        ValueError: If ``freq_compression`` is not a supported value.
    """
    super().__init__(file_names, working_dir, sr, *args, **kwargs)
    if self.dataset_name is not None:
        self._logger.info("Init dataset {}...".format(self.dataset_name))

    self.n_fft = n_fft
    self.hop_length = hop_length
    self.f_min = f_min
    self.f_max = f_max

    valid_freq_compressions = ["linear", "mel", "mfcc"]
    if freq_compression not in valid_freq_compressions:
        # The original used ", format(...)" (comma instead of dot), which
        # called the builtin format() with a list spec and raised TypeError
        # before the ValueError could be built.
        raise ValueError(
            "{} is not a valid freq_compression. Must be one of {}".format(
                freq_compression, valid_freq_compressions
            )
        )
    self.freq_compression = freq_compression

    self.possible_call_labels = re.compile("|".join(["call"]))
    self.possible_nocall_labels = re.compile("|".join(["noise"]))

    self._logger.debug("Number of files : {}".format(len(self.file_names)))
    _n_calls = sum(1 for f in self.file_names if self.is_call(f))
    self._logger.debug("Number of calls: {}".format(_n_calls))
    self._logger.debug(
        "Number of noise: {}".format(len(self.file_names) - _n_calls)
    )

    self.augmentation = augmentation

    spec_transforms = [
        lambda fn: T.load_audio_file(fn, sr=sr),
        T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
        T.Spectrogram(n_fft, hop_length, center=False),
    ]
    self.file_reader = AsyncFileReader()
    if cache_dir is None:
        self.t_spectrogram = T.Compose(spec_transforms)
    else:
        # NOTE(review): a second AsyncFileReader is created here instead of
        # reusing self.file_reader -- confirm whether that is intended.
        self.t_spectrogram = T.CachedSpectrogram(
            cache_dir=cache_dir,
            spec_transform=T.Compose(spec_transforms),
            n_fft=n_fft,
            hop_length=hop_length,
            file_reader=AsyncFileReader(),
        )

    if augmentation:
        self._logger.debug(
            "Init augmentation transforms for time and pitch shift"
        )
        self.t_amplitude = T.RandomAmplitude(3, -6)
        self.t_timestretch = T.RandomTimeStretch()
        self.t_pitchshift = T.RandomPitchSift()
    else:
        self._logger.debug("Running without augmentation")

    if self.freq_compression == "linear":
        self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
    elif self.freq_compression == "mel":
        self.t_compr_f = T.F2M(
            sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max
        )
    elif self.freq_compression == "mfcc":
        # The MFCC step is kept as a separate transform (t_compr_mfcc)
        # rather than being composed into t_compr_f.
        self.t_compr_f = T.Compose(
            T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
        )
        self.t_compr_mfcc = T.M2MFCC(n_mfcc=32)
    else:
        # Defensive only: the value was validated above. A plain string
        # cannot be raised in Python 3, so use a real exception.
        raise ValueError(
            "Undefined frequency compression: {}".format(
                self.freq_compression
            )
        )

    # Always define t_addnoise so the attribute exists even when
    # augmentation is disabled.
    self.t_addnoise = None
    if augmentation and noise_files:
        self._logger.debug(
            "Init augmentation transform for random noise addition"
        )
        self.t_addnoise = T.RandomAddNoise(
            noise_files,
            self.t_spectrogram,
            T.Compose(self.t_timestretch, self.t_pitchshift, self.t_compr_f),
            min_length=seq_len,
            return_original=True,
        )

    self.t_compr_a = T.Amp2Db(
        min_level_db=DefaultSpecDatasetOps["min_level_db"]
    )

    if min_max_normalize:
        self.t_norm = T.MinMaxNormalize()
        self._logger.debug("Init min-max-normalization activated")
    else:
        self.t_norm = T.Normalize(
            min_level_db=DefaultSpecDatasetOps["min_level_db"],
            ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
        )
        self._logger.debug("Init 0/1-dB-normalization activated")

    self.t_subseq = T.PaddedSubsequenceSampler(
        seq_len, dim=1, random=augmentation
    )
def __init__(
    self,
    file_names: Iterable[str],
    working_dir=None,
    cache_dir=None,
    sr=44100,
    n_fft=2048,  # 4096
    hop_length=220,  # 441
    freq_compression="linear",
    n_freq_bins=256,  # determines the width of the image
    f_min=0,
    f_max=18000,
    # should be adjusted together with sequence_len in class
    # StridedAudioDataset (called by predict.py)
    seq_len=128,
    augmentation=False,
    noise_files=None,
    *args,
    **kwargs
) -> None:
    """Set up the spectrogram, augmentation and normalization transforms.

    Args:
        file_names: Audio files, forwarded to the parent constructor.
        working_dir: Forwarded to the parent constructor.
        cache_dir: If given, spectrograms go through
            ``T.CachedSpectrogram`` with this directory; otherwise the
            plain composed transform is used.
        sr: Sampling rate used for loading and for frequency compression.
        n_fft: FFT size for ``T.Spectrogram``.
        hop_length: Hop length for ``T.Spectrogram``.
        freq_compression: One of ``"linear"``, ``"mel"`` or ``"mfcc"``.
        n_freq_bins: Number of bins requested from the frequency
            compression transform.
        f_min: Lower frequency bound passed to the compression transform.
        f_max: Upper frequency bound passed to the compression transform.
        seq_len: Length passed to ``T.PaddedSubsequenceSampler`` and used as
            ``min_length`` for ``T.RandomAddNoise``.
        augmentation: Enables the amplitude/time-stretch/pitch-shift
            transforms, optional noise addition and random subsequence
            sampling.
        noise_files: Noise files for ``T.RandomAddNoise``; ``None`` or an
            empty collection disables noise addition.
        *args: Forwarded to the parent constructor.
        **kwargs: Forwarded to the parent constructor.

    Raises:
        ValueError: If ``freq_compression`` is not a supported value.
    """
    super().__init__(file_names, working_dir, sr, *args, **kwargs)
    if self.dataset_name is not None:
        self._logger.info("Init dataset {}...".format(self.dataset_name))

    self.n_fft = n_fft
    self.hop_length = hop_length
    self.f_min = f_min
    self.f_max = f_max

    # mel: log transformation of freq (Hz scale to Mel scale)
    # attention: Mel-spectrograms as a network input led to an excessive
    # loss of resolution in higher frequency bands, which was a big problem
    # considering the high-frequency pulsed calls and whistles.
    valid_freq_compressions = ["linear", "mel", "mfcc"]
    if freq_compression not in valid_freq_compressions:
        # The original used ", format(...)" (comma instead of dot), which
        # called the builtin format() with a list spec and raised TypeError
        # before the ValueError could be built.
        raise ValueError(
            "{} is not a valid freq_compression. Must be one of {}".format(
                freq_compression, valid_freq_compressions
            )
        )
    self.freq_compression = freq_compression

    # combine a RegExp pattern into pattern objects for pattern matching
    self.possible_call_labels = re.compile("|".join(["call"]))
    self.possible_nocall_labels = re.compile("|".join(["noise"]))

    self._logger.debug("Number of files : {}".format(len(self.file_names)))
    _n_calls = sum(1 for f in self.file_names if self.is_call(f))
    self._logger.debug("Number of calls: {}".format(_n_calls))
    self._logger.debug(
        "Number of noise: {}".format(len(self.file_names) - _n_calls)
    )

    self.augmentation = augmentation

    spec_transforms = [
        lambda fn: T.load_audio_file(fn, sr=sr),  # return: a vector tensor
        T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
        T.Spectrogram(n_fft, hop_length, center=False),
    ]
    self.file_reader = AsyncFileReader()
    # if user chooses to not cache .spec by omitting the directory
    if cache_dir is None:
        self.t_spectrogram = T.Compose(spec_transforms)
    else:
        # where .spec is created and stored
        # n_fft, hop_length: meta in spec_dict
        # NOTE(review): a second AsyncFileReader is created here instead of
        # reusing self.file_reader -- confirm whether that is intended.
        self.t_spectrogram = T.CachedSpectrogram(
            cache_dir=cache_dir,
            spec_transform=T.Compose(spec_transforms),
            n_fft=n_fft,
            hop_length=hop_length,
            file_reader=AsyncFileReader(),
        )

    if augmentation:
        self._logger.debug(
            "Init augmentation transforms for time and pitch shift"
        )
        self.t_amplitude = T.RandomAmplitude(3, -6)
        self.t_timestretch = T.RandomTimeStretch()
        self.t_pitchshift = T.RandomPitchSift()
    else:
        self._logger.debug("Running without augmentation")

    if self.freq_compression == "linear":
        self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
    elif self.freq_compression == "mel":
        self.t_compr_f = T.F2M(
            sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max
        )
    elif self.freq_compression == "mfcc":
        self.t_compr_f = T.Compose(
            T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max),
            T.M2MFCC(),
        )
    else:
        # Defensive only: the value was validated above. A plain string
        # cannot be raised in Python 3, so use a real exception.
        raise ValueError(
            "Undefined frequency compression: {}".format(
                self.freq_compression
            )
        )

    # Always define t_addnoise so the attribute exists even when
    # augmentation is disabled.
    self.t_addnoise = None
    if augmentation and noise_files:
        self._logger.debug(
            "Init augmentation transform for random noise addition"
        )
        # if return_original = True, both augmented and original specs are
        # returned
        self.t_addnoise = T.RandomAddNoise(
            noise_files,
            self.t_spectrogram,
            T.Compose(self.t_timestretch, self.t_pitchshift, self.t_compr_f),
            min_length=seq_len,
            return_original=True,
        )

    self.t_compr_a = T.Amp2Db(
        min_level_db=DefaultSpecDatasetOps["min_level_db"]
    )
    self.t_norm = T.Normalize(
        min_level_db=DefaultSpecDatasetOps["min_level_db"],
        ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
    )
    self.t_subseq = T.PaddedSubsequenceSampler(
        seq_len, dim=1, random=augmentation
    )
def __init__(
    self,
    file_names: Iterable[str],
    working_dir=None,
    cache_dir=None,
    sr=44100,
    n_fft=1024,
    hop_length=512,
    freq_compression="linear",
    n_freq_bins=256,
    f_min=None,
    f_max=18000,
    *args,
    **kwargs
) -> None:
    """Set up spectrogram, compression and normalization transforms.

    Args:
        file_names: Audio files, forwarded to the parent constructor.
        working_dir: Forwarded to the parent constructor.
        cache_dir: If given, spectrograms go through
            ``T.CachedSpectrogram`` with this directory; otherwise the
            plain composed transform is used.
        sr: Sampling rate used for loading and for frequency compression.
        n_fft: FFT size for ``T.Spectrogram``.
        hop_length: Hop length for ``T.Spectrogram``.
        freq_compression: One of ``"linear"``, ``"mel"`` or ``"mfcc"``.
        n_freq_bins: Number of bins requested from the frequency
            compression transform.
        f_min: Lower frequency bound passed to the compression transform.
        f_max: Upper frequency bound passed to the compression transform.
        *args: Forwarded to the parent constructor.
        **kwargs: Forwarded to the parent constructor.

    Raises:
        ValueError: If ``freq_compression`` is not a supported value.
    """
    super().__init__(file_names, working_dir, sr, *args, **kwargs)
    if self.dataset_name is not None:
        self._logger.info("Init dataset {}...".format(self.dataset_name))

    # The original assigned self.sp twice; one instance is enough.
    self.sp = signal.signal_proc()
    self.sr = sr
    self.f_min = f_min
    self.f_max = f_max
    self.n_fft = n_fft
    self.hop_length = hop_length
    self.freq_compression = freq_compression

    valid_freq_compressions = ["linear", "mel", "mfcc"]
    if self.freq_compression not in valid_freq_compressions:
        # The original used ", format(...)" (comma instead of dot), which
        # called the builtin format() with a list spec and raised TypeError
        # before the ValueError could be built.
        raise ValueError(
            "{} is not a valid freq_compression. Must be one of {}".format(
                self.freq_compression, valid_freq_compressions
            )
        )

    self._logger.debug(
        "Number of test files: {}".format(len(self.file_names))
    )

    spec_transforms = [
        lambda fn: T.load_audio_file(fn, sr=sr),
        T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
        T.Spectrogram(n_fft, hop_length, center=False),
    ]
    self.file_reader = AsyncFileReader()
    if cache_dir is None:
        self.t_spectrogram = T.Compose(spec_transforms)
    else:
        # NOTE(review): a second AsyncFileReader is created here instead of
        # reusing self.file_reader -- confirm whether that is intended.
        self.t_spectrogram = T.CachedSpectrogram(
            cache_dir=cache_dir,
            spec_transform=T.Compose(spec_transforms),
            n_fft=n_fft,
            hop_length=hop_length,
            file_reader=AsyncFileReader(),
        )

    if self.freq_compression == "linear":
        self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
    elif self.freq_compression == "mel":
        self.t_compr_f = T.F2M(
            sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max
        )
    elif self.freq_compression == "mfcc":
        self.t_compr_f = T.Compose(
            T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max),
            T.M2MFCC(),
        )
    else:
        # Defensive only: the value was validated above. A plain string
        # cannot be raised in Python 3, so use a real exception.
        raise ValueError(
            "Undefined frequency compression: {}".format(
                self.freq_compression
            )
        )

    self.t_compr_a = T.Amp2Db(
        min_level_db=DefaultSpecDatasetOps["min_level_db"]
    )
    self.t_norm = T.Normalize(
        min_level_db=DefaultSpecDatasetOps["min_level_db"],
        ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
    )
def __init__(
    self,
    file_names: Iterable[str],
    working_dir=None,
    cache_dir=None,
    sr=44100,
    n_fft=4096,
    hop_length=441,
    freq_compression="linear",
    n_freq_bins=256,
    f_min=0,
    f_max=18000,
    seq_len=128,
    augmentation=False,
    noise_files_train=None,
    noise_files_val=None,
    noise_files_test=None,
    random=False,
    *args,
    **kwargs
) -> None:
    """Set up transforms and per-split noise addition for denoising data.

    Exactly one noise source is selected from the combination of
    ``augmentation``, ``self.dataset_name`` and the matching noise file
    list: ``"train"`` requires augmentation, ``"val"`` and ``"test"``
    require it to be off. Any other combination is rejected.

    Args:
        file_names: Audio files, forwarded to the parent constructor.
        working_dir: Forwarded to the parent constructor.
        cache_dir: If given, spectrograms go through
            ``T.CachedSpectrogram`` with this directory; otherwise the
            plain composed transform is used.
        sr: Sampling rate used for loading and for frequency compression.
        n_fft: FFT size for ``T.Spectrogram``.
        hop_length: Hop length for ``T.Spectrogram``.
        freq_compression: One of ``"linear"``, ``"mel"`` or ``"mfcc"``.
        n_freq_bins: Number of bins requested from the frequency
            compression transform.
        f_min: Lower frequency bound passed to the compression transform.
        f_max: Upper frequency bound passed to the compression transform.
        seq_len: Length passed to ``T.PaddedSubsequenceSampler`` and used as
            ``min_length`` for ``T.RandomAddNoise``.
        augmentation: Enables the amplitude transform and random
            subsequence sampling, and selects the training noise branch.
        noise_files_train: Noise files used when the dataset is ``"train"``.
        noise_files_val: Noise files used when the dataset is ``"val"``.
        noise_files_test: Noise files used when the dataset is ``"test"``.
        random: Stored unchanged on ``self.random``.
        *args: Forwarded to the parent constructor.
        **kwargs: Forwarded to the parent constructor.

    Raises:
        ValueError: If ``freq_compression`` is not a supported value, or if
            no noise file list matches the dataset split.
    """
    super().__init__(file_names, working_dir, sr, *args, **kwargs)
    if self.dataset_name is not None:
        self._logger.info("Init dataset {}...".format(self.dataset_name))

    self.sp = signal.signal_proc()
    self.df = 15.0
    self.exp_e = 0.1
    self.bin_pow = 2.0
    self.gaus_mean = 0.0
    self.gaus_stdv = 12.5
    self.poisson_lambda = 15.0
    self.orig_noise_value = -5

    self.f_min = f_min
    self.f_max = f_max
    self.n_fft = n_fft
    self.random = random
    self.hop_length = hop_length
    self.augmentation = augmentation
    self.file_reader = AsyncFileReader()
    # Fresh lists per instance: these are stored on self, so a shared
    # mutable default would be aliased across all instances.
    self.noise_files_val = [] if noise_files_val is None else noise_files_val
    self.noise_files_test = (
        [] if noise_files_test is None else noise_files_test
    )
    self.freq_compression = freq_compression
    self.noise_files_train = (
        [] if noise_files_train is None else noise_files_train
    )

    valid_freq_compressions = ["linear", "mel", "mfcc"]
    if self.freq_compression not in valid_freq_compressions:
        # The original read the misspelled attribute self.freq_compressio
        # (AttributeError) and used ", format(...)" instead of ".format(...)".
        raise ValueError(
            "{} is not a valid freq_compression. Must be one of {}".format(
                self.freq_compression, valid_freq_compressions
            )
        )

    self._logger.debug(
        "Number of files to denoise : {}".format(len(self.file_names))
    )

    spec_transforms = [
        lambda fn: T.load_audio_file(fn, sr=sr),
        T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
        T.Spectrogram(n_fft, hop_length, center=False),
    ]
    if cache_dir is None:
        self.t_spectrogram = T.Compose(spec_transforms)
    else:
        # NOTE(review): a second AsyncFileReader is created here instead of
        # reusing self.file_reader -- confirm whether that is intended.
        self.t_spectrogram = T.CachedSpectrogram(
            cache_dir=cache_dir,
            spec_transform=T.Compose(spec_transforms),
            n_fft=n_fft,
            hop_length=hop_length,
            file_reader=AsyncFileReader(),
        )

    if self.augmentation:
        self._logger.debug(
            "Init augmentation transforms for intensity, time, and pitch shift"
        )
        self.t_amplitude = T.RandomAmplitude(3, -6)
        self.t_timestretch = T.RandomTimeStretch()
        self.t_pitchshift = T.RandomPitchSift()
    else:
        # only for noise augmentation during validation phase - intensity,
        # time and pitch augmentation is not used during validation/test
        self.t_timestretch = T.RandomTimeStretch()
        self.t_pitchshift = T.RandomPitchSift()
        self._logger.debug(
            "Running without intensity, time, and pitch augmentation"
        )

    if self.freq_compression == "linear":
        self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
    elif self.freq_compression == "mel":
        self.t_compr_f = T.F2M(
            sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max
        )
    elif self.freq_compression == "mfcc":
        # The MFCC step is kept as a separate transform (t_compr_mfcc)
        # rather than being composed into t_compr_f.
        self.t_compr_f = T.Compose(
            T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
        )
        self.t_compr_mfcc = T.M2MFCC(n_mfcc=32)
    else:
        # Defensive only: the value was validated above. A plain string
        # cannot be raised in Python 3, so use a real exception.
        raise ValueError(
            "Undefined frequency compression: {}".format(
                self.freq_compression
            )
        )

    # Pick the noise source for this split; the three branches only differ
    # in the file list and the label used in the log message.
    if (
        self.augmentation
        and self.noise_files_train
        and self.dataset_name == "train"
    ):
        split_label = "training"
        split_noise_files = self.noise_files_train
    elif (
        not self.augmentation
        and self.noise_files_val
        and self.dataset_name == "val"
    ):
        split_label = "validation"
        split_noise_files = self.noise_files_val
    elif (
        not self.augmentation
        and self.noise_files_test
        and self.dataset_name == "test"
    ):
        split_label = "test"
        split_noise_files = self.noise_files_test
    else:
        self.t_addnoise = None
        # The original raised a plain string, which surfaces as an
        # unrelated TypeError; raise the intended message properly.
        raise ValueError(
            "ERROR: Init noise files for noise adding does not have a "
            "proper setup per split!"
        )

    self._logger.debug(
        "Init {} real-world noise files for noise2noise adding".format(
            split_label
        )
    )
    # NOTE(review): min_snr (-2) is larger than max_snr (-8) -- confirm this
    # ordering is what T.RandomAddNoise expects.
    self.t_addnoise = T.RandomAddNoise(
        split_noise_files,
        self.t_spectrogram,
        T.Compose(self.t_timestretch, self.t_pitchshift, self.t_compr_f),
        min_length=seq_len,
        min_snr=-2,
        max_snr=-8,
        return_original=True,
    )

    self.t_compr_a = T.Amp2Db(
        min_level_db=DefaultSpecDatasetOps["min_level_db"]
    )
    self.t_norm = T.Normalize(
        min_level_db=DefaultSpecDatasetOps["min_level_db"],
        ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
    )
    self.t_subseq = T.PaddedSubsequenceSampler(
        seq_len, dim=1, random=augmentation
    )