def test_compute_num_augs_per_species(self):
    # Given population:
    #        num_samples
    #  foo       10
    #  bar       25
    #  fum       50
    aug_goals = AugmentationGoals.MEDIAN
    population = pd.DataFrame.from_dict({'foo': 10, 'bar': 25, 'fum': 50},
                                        orient='index',
                                        columns=['num_samples'])
    num_samples = population.loc[:, 'num_samples']
    med = num_samples.median()
    print(f"Median: {med}")

    # Species foo must receive med-10 = 15 augmentations
    #         bar              med-25 =  0 augmentations
    #         fum              med-50 = -25 --> 0 augmentations
    truth = {'foo': 15, 'bar': 0, 'fum': 0}
    res = Utils.compute_num_augs_per_species(aug_goals, population)
    self.assertDictEqual(truth, res)
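# Hedged illustration, not the library code: the expected values in the test
# above imply that for the MEDIAN goal each species needs
# max(0, median - its_sample_count) augmentations. A minimal standalone sketch
# of that arithmetic, assuming a DataFrame with a 'num_samples' column as
# built in the test (the sketch name is hypothetical):

import pandas as pd

def _median_aug_goals_sketch(population: pd.DataFrame) -> dict:
    # Augmentations per species needed to reach the median sample count,
    # clipped at zero for species already at or above the median:
    med = population['num_samples'].median()
    return {species: max(0, int(med - n))
            for species, n in population['num_samples'].items()}

# With the population above: {'foo': 15, 'bar': 0, 'fum': 0}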
def create_dest_dirs(self, species_list):
    '''
    Creates all directories that will hold new
    spectrogram snippets for each species.
    For each directory: if dir exists:

       o if overwrite_policy is True, wipe the dir
       o if overwrite_policy is SKIP, leave the
         directory in place, contents intact
       o else ask user.
            If response is Yes, wipe the dir
            else raise FileExistsError

    :param species_list: names of species to process
    :type species_list: [str]
    :return: top level dir for spectrograms (same as self.out_dir)
    :rtype: str
    :raise FileExistsError: if a dest dir exists and not allowed to wipe it.
    '''
    # Root dir of each species' spectro snippets:
    Utils.create_folder(self.out_dir, overwrite_policy=self.overwrite_policy)

    # One dir each for the spectrogram snippets of one species:
    for species in species_list:
        species_spectros_dir = os.path.join(self.out_dir, species)
        if not Utils.create_folder(species_spectros_dir,
                                   overwrite_policy=self.overwrite_policy):
            raise FileExistsError(
                f"Target dir {species_spectros_dir} exists; aborting")

    return self.out_dir
def cull(self, dir_root, fextension):
    # Good recording code ranges:
    good_rec_id_rngs = [range(50000, 70001),
                        range(170000, 180001),
                        ]

    # Get all audio file paths relative
    # to dir_root:
    pattern = f'*{fextension}'
    wav_paths = Utils.find_in_dir_tree(dir_root, pattern=pattern)

    #*********
    # wav_paths = ['/foo/bar/AM01_20190711_049999.wav', # no
    #              '/foo/bar/AM01_20190711_050000.wav', # yes
    #              '/foo/bar/AM01_20190711_070000.wav', # yes
    #              '/foo/bar/AM01_20190711_070001.wav', # no
    #              '/foo/bar/AM01_20190711_169999.wav', # no
    #              '/foo/bar/AM01_20190711_170000.wav', # yes
    #              '/foo/bar/AM01_20190711_170001.wav', # no
    #              ]
    #*********

    # Get just the filename without parents
    # and extension:
    to_delete = []
    for aud_path in wav_paths:
        ser_num = self.extract_ser_num(aud_path)
        if ser_num in good_rec_id_rngs[0] \
           or ser_num in good_rec_id_rngs[1]:
            continue
        else:
            to_delete.append(aud_path)

    print(f"Examined {len(wav_paths)} {pattern} files...")
    if len(to_delete) > 0:
        if Utils.user_confirm(f"List the {len(to_delete)} bad files? (n/Y)", default='Y'):
            for fpath in to_delete:
                print(f"{os.path.getsize(fpath)} bytes: {fpath}")

        if Utils.user_confirm(f"Delete {len(to_delete)} aud files? (N/y):", default='N'):
            num_deleted = 0
            for fname in to_delete:
                try:
                    os.remove(fname)
                except Exception as e:
                    print(f"Could not delete {fname}: {repr(e)}")
                else:
                    num_deleted += 1
            print(f"Removed {num_deleted} files")
        else:
            print('Canceling')
    else:
        print('No files are out of good recorder serial number ranges')
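# self.extract_ser_num() is not shown in this excerpt. Judging from the
# commented example paths above (e.g. AM01_20190711_049999.wav), it
# presumably parses the trailing recorder serial number out of the file
# name. A hypothetical sketch under that assumption:

import os
import re

def _extract_ser_num_sketch(aud_path):
    # Digits after the last underscore of the bare file name:
    fname_stem = os.path.splitext(os.path.basename(aud_path))[0]
    match = re.search(r'_(\d+)$', fname_stem)
    if match is None:
        raise ValueError(f"No serial number found in {aud_path}")
    return int(match.group(1))

# _extract_ser_num_sketch('/foo/bar/AM01_20190711_049999.wav') --> 49999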
def test_compute_num_augs_per_species(self):
    aug_volumes = AugmentationGoals.MAX
    sample_distrib_df = Utils.sample_compositions_by_species(self.spectros_dir)
    augs_to_do = Utils.compute_num_augs_per_species(aug_volumes, sample_distrib_df)
    self.assertEqual(augs_to_do['AMADEC'], 4)
    self.assertEqual(augs_to_do['FORANA'], 0)
def test_sample_compositions_by_species(self):
    dist_df = Utils.sample_compositions_by_species(self.spectros_dir)
    #truth = pd.DataFrame.from_dict({'AMADEC' : 1, 'FORANA' : 5}, orient='index', columns=['num_samples'])
    self.assertListEqual(list(dist_df.columns), ['num_samples'])
    self.assertEqual(int(dist_df.loc['AMADEC']), 1)
    self.assertEqual(int(dist_df.loc['FORANA']), 5)
def prep_aug_tmp_dirs(self, dst_tmp_dir):
    '''
    Copies the AMADEC single-spectrogram directory and the
    FORANA 5-spectrogram directory to the given tmp dir.
    Creates dir 'aug_spectros' in that same tmp dir, and
    returns the path to that aug_spectros dir.

    :param dst_tmp_dir: temporary directory
    :type dst_tmp_dir: str
    :return: output directory for future spectro augmentations
    :rtype: str
    '''
    # Do all testing in the tmp dir, where
    # all files/dirs will be deleted automatically:
    for species_dir in Utils.listdir_abs(self.full_species_root):
        species_name = Path(species_dir).stem
        dst_species_dir = os.path.join(dst_tmp_dir, species_name)
        shutil.copytree(species_dir, dst_species_dir)

    # Dir where augmentations are to be placed,
    # one subdir per species:
    out_dir = os.path.join(dst_tmp_dir, 'aug_spectros')
    os.mkdir(out_dir)

    return out_dir
def test_chop_one_spectrogram_file(self):
    with tempfile.TemporaryDirectory(dir='/tmp', prefix='chopping') as dir_nm:
        chopper = SpectrogramChopper(
            self.spectro_root,
            dir_nm,
            overwrite_policy=WhenAlreadyDone.OVERWRITE
            )
        species = Path(self.spectro_file).parent.stem
        outdir = os.path.join(dir_nm, species)
        true_snippet_time_width = chopper.chop_one_spectro_file(
            self.spectro_file,
            outdir,
            'DOVE',
            skip_size=self.skip_size
            )
        snippet_names = os.listdir(outdir)

        num_expected_snippets = 0
        cur_time = true_snippet_time_width
        while cur_time < self.duration:
            num_expected_snippets += 1
            cur_time += self.skip_size

        self.assertEqual(len(snippet_names), num_expected_snippets)

        # Check embedded metadata of one snippet:
        _spectro, metadata = SoundProcessor.load_spectrogram(Utils.listdir_abs(outdir)[0])
        self.assertEqual(round(float(metadata['duration(secs)']), 3),
                         round(true_snippet_time_width, 3)
                         )
        self.assertEqual(metadata['species'], 'DOVE')
def sign_of_life(cls, job, num_already_present_imgs, outdir, start_time, force_rewrite=False):
    # Time for sign of life?
    now_time = datetime.datetime.now()
    time_duration = now_time - start_time
    # Print every 3 seconds, but not right at the start:
    if force_rewrite \
       or (time_duration.seconds > 0 and time_duration.seconds % 3 == 0):
        # A human readable duration string, down to minutes:
        duration_str = FileUtils.time_delta_str(time_duration, granularity=4)
        # Get current and new spectro imgs in outdir:
        num_now_present_imgs = len(Utils.find_in_dir_tree(outdir, pattern="*.png"))
        num_newly_present_imgs = num_now_present_imgs - num_already_present_imgs
        # Keep printing number of done snippets in the same
        # terminal line:
        print((f"{job.name}---Number of spectros: {num_now_present_imgs} "
               f"({num_newly_present_imgs} new) after {duration_str}"),
              end='\r')
        return num_newly_present_imgs
    else:
        return num_already_present_imgs
def test_compute_worker_assignments_one_spectro_done(self):
    # Scenario: one spectro was already done:
    with tempfile.TemporaryDirectory(dir='/tmp', prefix='test_spectro') as dst_dir:
        # Fake-create an existing spectrogram:
        os.mkdir(os.path.join(dst_dir, 'HENLES_S'))
        done_spectro_path = os.path.join(
            dst_dir,
            'HENLES_S/SONG_Henicorhinaleucosticta_xc259378.png')
        Path(done_spectro_path).touch()

        num_tasks_done = len(Utils.find_in_dir_tree(dst_dir,
                                                    pattern='*.png',
                                                    entry_type='file'))
        true_num_assignments = self.num_sound_files - num_tasks_done

        self.verify_worker_assignments(self.sound_root,
                                       dst_dir,
                                       WhenAlreadyDone.SKIP,
                                       true_num_assignments)

        # We are to overwrite existing files, so
        # all sound files will need to be done:
        true_num_assignments = self.num_sound_files
        self.verify_worker_assignments(self.sound_root,
                                       dst_dir,
                                       WhenAlreadyDone.OVERWRITE,
                                       true_num_assignments)
def setUpClass(cls):
    cls.cur_dir = os.path.dirname(__file__)
    cls.sound_root = os.path.join(cls.cur_dir, 'sound_data')

    # Number of cores to use:
    num_cores = mp.cpu_count()
    cls.num_workers = round(num_cores * Utils.MAX_PERC_OF_CORES_TO_USE / 100)

    cls.num_sound_files = len(Utils.find_in_dir_tree(cls.sound_root,
                                                     pattern='*.mp3',
                                                     entry_type='file'))

    cls.assignments = np.array(
        [[('HENLES_S', 'SONG_Henicorhinaleucosticta_xc259380.mp3'),
          ('HENLES_S', 'SONG_Henicorhinaleucosticta_xc259381.mp3')],
         [('HENLES_S', 'SONG_Henicorhinaleucosticta_xc259383.mp3'),
          ('HENLES_S', 'SONG_Henicorhinaleucosticta_xc259379.mp3')],
         [('HENLES_S', 'SONG_Henicorhinaleucosticta_xc259378.mp3'),
          ('HENLES_S', 'SONG_Henicorhinaleucosticta_xc259384.mp3')],
         [('DYSMEN_S', 'SONG_Dysithamnusmentalis_xc513.mp3'),
          ('DYSMEN_S', 'SONG_Dysithamnusmentalis_xc518466.mp3')],
         [('DYSMEN_S', 'SONG_Dysithamnusmentalis_xc531750.mp3'),
          ('DYSMEN_S', 'SONG_Dysithamnusmentalis_xc50519.mp3')],
         [('DYSMEN_S', 'SONG_Dysithamnusmentalis_xc511477.mp3'),
          ('DYSMEN_S', 'SONG_Dysithamnusmentalis_xc548015.mp3')]
         ])
def test_binary_in_interval_search(self):
    intervals = [Interval(1, 3), Interval(4, 5), Interval(6, 7)]

    res = Utils.binary_in_interval_search(intervals, 0, 'low_val', 'high_val')
    assert (res == -1)

    res = Utils.binary_in_interval_search(intervals, 1, 'low_val', 'high_val')
    assert (res == 0)

    res = Utils.binary_in_interval_search(intervals, 2, 'low_val', 'high_val')
    assert (res == 0)

    res = Utils.binary_in_interval_search(intervals, 3, 'low_val', 'high_val')
    assert (res == -1)

    res = Utils.binary_in_interval_search(intervals, 4, 'low_val', 'high_val')
    assert (res == 1)

    res = Utils.binary_in_interval_search(intervals, 5, 'low_val', 'high_val')
    assert (res == -1)

    res = Utils.binary_in_interval_search(intervals, 8, 'low_val', 'high_val')
    assert (res == -1)
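# The assertions above imply half-open intervals [low_val, high_val): the
# value 3 is not considered inside Interval(1, 3), while 1 and 2 are. A
# minimal standalone sketch of such a search; it assumes the interval
# objects expose their bounds as attributes named by the two key arguments
# (the real Utils/Interval implementation may access them differently):

def _binary_in_interval_search_sketch(intervals, value, low_key, high_key):
    # Return the index of the interval containing value, or -1:
    lo, hi = 0, len(intervals) - 1
    while lo <= hi:
        mid = (lo + hi) // 2
        low = getattr(intervals[mid], low_key)
        high = getattr(intervals[mid], high_key)
        if value < low:
            hi = mid - 1
        elif value >= high:
            lo = mid + 1
        else:
            return mid
    return -1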
def chop_all(self):
    '''
    Workhorse: Assuming self.in_dir is the root of all
    species audio samples:

        self.in_dir

            Species1        Species2   ...     Speciesn
             smpl1_1.mp3      smpl2_1.mp3        smpln_1.mp3
             smpl1_2.mp3      smpl2_2.mp3        smpln_2.mp3
                               ...

    Chops each .mp3 (or .wav) file into window_len snippets.
    Saves those snippets in a new directory. Creates a spectrogram
    for each snippet, and saves those in a different, new directory.

    Resulting directories under self.out_dir will be:

        self.out_dir
            spectrograms
            wav-files

    If self.specific_species is None, audio files under all
    species are chopped. Else, self.specific_species is expected
    to be a list of species names that correspond to the names
    of species directories above: Species1, Species2, etc.

    Returns a 2-tuple: (number of created .wav audio snippet files,
                        number of created .png spectrogram snippet files)
    '''
    for species in self.species_list:
        audio_files = os.listdir(os.path.join(self.in_dir, species))
        num_files = len(audio_files)
        for i, sample_name in enumerate(audio_files):
            # Chop one audio file:
            self.log.info(f"Chopping {species} audio {i}/{num_files}")
            self.chop_one_audio_file(self.in_dir, species, sample_name, self.out_dir)
        self.num_chopped += num_files

    num_spectros = len(utils.find_in_dir_tree(self.spectrogram_dir_path, pattern='*.png'))
    num_audios = len(utils.find_in_dir_tree(self.wav_dir_path, pattern='*.wav'))
    return (num_audios, num_spectros)
def setUpClass(cls):
    cls.cur_dir = os.path.dirname(__file__)
    cls.model_path = os.path.join(
        cls.cur_dir,
        '../../birdsong/tests/models/mod_2021-05-04T13_02_14_net_resnet18_pre_True_frz_0_lr_0.01_opt_SGD_bs_128_ks_7_folds_10_gray_True_classes_34_ep9.pth'
        )
    cls.snips_dir = os.path.join(
        cls.cur_dir,
        '../../birdsong/utils/tests/data/fld_snippets'
        )
    cls.example_img_path = Utils.listdir_abs(cls.snips_dir)[0]
def test_listdir_abs(self):
    # Get the built-in directory listing
    # with just the file names:
    nearly_truth = os.listdir(self.cur_dir)

    abs_paths = Utils.listdir_abs(self.cur_dir)
    self.assertEqual(len(nearly_truth), len(abs_paths))

    # Check existence of first file or dir:
    self.assertTrue(os.path.exists(abs_paths[0]))
def test_find_in_tree_gen(self):
    res = list(Utils.find_in_tree_gen(self.spectros_dir, pattern='*.png'))
    expected = [
        f"{self.spectros_dir}/AMADEC/Amaziliadecora1061880.png",
        f"{self.spectros_dir}/FORANA/SONG_XC609364-41759.png",
        f"{self.spectros_dir}/FORANA/SONG_XC253440-FORANA04.png",
        f"{self.spectros_dir}/FORANA/SONG_XC520628-passarochao.png",
        f"{self.spectros_dir}/FORANA/SONG_XC360575-BFAN.png",
        f"{self.spectros_dir}/FORANA/SONG_XC171241-Formicarius_analis.png"
        ]
    self.assertSetEqual(set(res), set(expected))
def test_orig_file_name(self):
    # Identity:
    aug_nm = "foo.wav"
    orig = Utils.orig_file_name(aug_nm)
    self.assertEqual(orig, aug_nm)

    aug_nm = "Amaziliadecora1061880-volume-10.wav"
    orig = Utils.orig_file_name(aug_nm)
    self.assertEqual(orig, 'Amaziliadecora1061880.wav')

    aug_nm = 'Amaziliadecora1061883-rain_bgd0ms.wav'
    orig = Utils.orig_file_name(aug_nm)
    self.assertEqual(orig, 'Amaziliadecora1061883.wav')

    aug_nm = 'Amaziliadecora1061886-shift4600ms.wav'
    orig = Utils.orig_file_name(aug_nm)
    self.assertEqual(orig, 'Amaziliadecora1061886.wav')

    # With relative directory:
    aug_nm = 'foo/bar/Amaziliadecora1061886-shift4600ms.wav'
    orig = Utils.orig_file_name(aug_nm)
    self.assertEqual(orig, 'foo/bar/Amaziliadecora1061886.wav')

    # With absolute directory:
    aug_nm = '/foo/bar/Amaziliadecora1061886-shift4600ms.wav'
    orig = Utils.orig_file_name(aug_nm)
    self.assertEqual(orig, '/foo/bar/Amaziliadecora1061886.wav')
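# The cases above suggest that Utils.orig_file_name() strips everything from
# the first '-' of the bare file name onward (the augmentation descriptor),
# while keeping directory and extension. A hypothetical sketch of that rule,
# not the actual Utils code:

import os

def _orig_file_name_sketch(aug_name):
    dir_part, fname = os.path.split(aug_name)
    stem, ext = os.path.splitext(fname)
    orig_stem = stem.split('-', 1)[0]
    return os.path.join(dir_part, orig_stem + ext)

# _orig_file_name_sketch('foo.wav')  --> 'foo.wav'
# _orig_file_name_sketch('/foo/bar/Amaziliadecora1061886-shift4600ms.wav')
#                                    --> '/foo/bar/Amaziliadecora1061886.wav'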
def check_spectro_sanity(self, dirs_filled):
    '''
    Raises assertion error if any file in
    the passed-in list of directories is less
    than 5000 bytes long.

    :param dirs_filled: list of directories whose content files to check for size
    :type dirs_filled: [str]
    '''
    # Check that each spectro is of
    # reasonable size:
    for species_dst_dir in dirs_filled:
        for spec_file in Utils.listdir_abs(species_dst_dir):
            self.assertTrue(os.stat(spec_file).st_size > 5000)
def record_creation_times(self, dirs_filled):
    '''
    Given a list of directories, return a dict mapping
    the absolute path of each contained file to its Unix
    modification time in fractional epoch seconds.

    :param dirs_filled: list of directories whose files to examine
    :type dirs_filled: [str]
    :return: dict of modification times
    :rtype: {str : float}
    '''
    file_times = {}
    for species_dst_dir in dirs_filled:
        for spec_fname in Utils.listdir_abs(species_dst_dir):
            file_times[spec_fname] = os.path.getmtime(spec_fname)

    return file_times
def test_generate_all_augmentations_max(self):
    with tempfile.TemporaryDirectory(dir='/tmp', prefix='test_spectro') as dst_dir:
        out_dir = self.prep_aug_tmp_dirs(dst_dir)

        # Tell the augmenter where the src and dest roots are:
        self.spectro_augmenter_max.input_dir_path = dst_dir
        self.spectro_augmenter_max.output_dir_path = out_dir

        # AMADEC has 1 spectro, FORANA has 5.
        # MAX is 5, so AMADEC needs 4 augmentations:
        num_augs_needed = 4

        self.spectro_augmenter_max.generate_all_augmentations()

        # Should have one directory in aug_spectros:
        new_dirs = Utils.listdir_abs(out_dir)
        self.assertTrue(len(new_dirs) == 1)

        # AMADEC subdir should have num_augs_needed new files:
        new_files = os.listdir(new_dirs[0])
        self.assertEqual(len(new_files), num_augs_needed)
def setUpClass(cls):
    super(TestChopSpectrograms, cls).setUpClass()
    cls.skip_size = 2  # sec
    cls.cur_dir = os.path.dirname(__file__)
    cls.spectro_root = os.path.join(cls.cur_dir, 'spectro_data_long')
    cls.spectro_file = os.path.join(cls.spectro_root, 'DOVE/dove_long.png')
    cls.num_spectro_files = len(Utils.find_in_dir_tree(cls.spectro_root,
                                                       pattern='*.png',
                                                       entry_type='file'))
    _spectro, metadata = SoundProcessor.load_spectrogram(cls.spectro_file)
    try:
        cls.duration = float(metadata['duration'])
    except KeyError:
        raise AssertionError(
            f"Spectrogram test file {os.path.basename(cls.spectro_file)} has no duration metadata")
    cls.default_win_len = 5  # seconds
def create_dest_dirs(self, species_list):
    '''
    Creates all directories that will hold new audio snippets
    and spectrograms for each species.
    For each directory: if dir exists:

       o if overwrite_policy is True, wipe the dir
       o else ask user.
            If response is Yes, wipe the dir
            else raise FileExistsError

    :param species_list: names of species to process
    :type species_list: [str]
    :return: top level dirs for audio snippets and spectrograms
    :rtype: (str, str)
    :raise FileExistsError: if a dest dir exists and not allowed to wipe it.
    '''
    # Root dir of the two dirs that will hold new
    # audio snippet and spectrogram files:
    utils.create_folder(self.out_dir, overwrite_policy=self.overwrite_policy)

    # Below the root:
    spectrogram_dir_path = os.path.join(self.out_dir, 'spectrograms/')
    wav_dir_path = os.path.join(self.out_dir, 'wav-files/')

    if not utils.create_folder(spectrogram_dir_path,
                               overwrite_policy=self.overwrite_policy):
        raise FileExistsError(f"Target dir {spectrogram_dir_path} exists; aborting")
    if not utils.create_folder(wav_dir_path,
                               overwrite_policy=self.overwrite_policy):
        raise FileExistsError(f"Target dir {wav_dir_path} exists; aborting")

    # One dir each for the audio and spectrogram
    # snippets of one species:
    for species in species_list:
        species_spectros_dir = os.path.join(spectrogram_dir_path, species)
        if not utils.create_folder(species_spectros_dir,
                                   overwrite_policy=self.overwrite_policy):
            raise FileExistsError(f"Target dir {species_spectros_dir} exists; aborting")

        species_audio_dir = os.path.join(wav_dir_path, species)
        if not utils.create_folder(species_audio_dir,
                                   overwrite_policy=self.overwrite_policy):
            raise FileExistsError(f"Target dir {species_audio_dir} exists; aborting")

    return (wav_dir_path, spectrogram_dir_path)
def cull_spectro_paths(cls,
                       species_or_recorder_name,
                       dst_dir,
                       rec_paths,
                       overwrite_policy=WhenAlreadyDone.ASK):
    #******* DISABLED ************
    # Method analogous to cull_rec_paths() in create_spectrograms().
    # Currently the code below is just a copy from create_spectrograms().
    # If we end up needing culling, update this body.
    return rec_paths
    #******* DISABLED ************
    # NEVER REACHED
    new_rec_paths = []
    for aud_fname in rec_paths:
        fname_stem = Path(aud_fname).stem
        dst_path = os.path.join(dst_dir, species_or_recorder_name, f"{fname_stem}.png")
        if not os.path.exists(dst_path):
            # Destination spectrogram does not exist;
            # keep this audio file in the to-do list:
            new_rec_paths.append(aud_fname)
            continue
        if overwrite_policy == WhenAlreadyDone.OVERWRITE:
            os.remove(dst_path)
            new_rec_paths.append(aud_fname)
            continue
        if overwrite_policy == WhenAlreadyDone.SKIP:
            # Don't even assign audio file to a worker,
            # since its spectro already exists:
            continue
        if overwrite_policy == WhenAlreadyDone.ASK:
            if Utils.user_confirm(f"Spectrogram for {dst_path} exists; overwrite?"):
                os.remove(dst_path)
                new_rec_paths.append(aud_fname)
                continue
    return new_rec_paths
def __init__(self,
             input_dir_path,
             plot=False,
             overwrite_policy=WhenAlreadyDone.ASK,
             aug_goals=AugmentationGoals.MEDIAN,
             random_augs=False,
             multiple_augs=False,
             ):
    '''
    :param input_dir_path: directory holding .wav files
    :type input_dir_path: str
    :param plot: whether or not to plot informative charts
        along the way
    :type plot: bool
    :param overwrite_policy: what to do when previously created
        work is encountered: ASK, OVERWRITE, or SKIP
    :type overwrite_policy: WhenAlreadyDone
    :param aug_goals: either an AugmentationGoals member,
        or a dict with a separate AugmentationGoals
        for each species: {species : AugmentationGoals}
        (See definition of AugmentationGoals; TENTH/MAX/MEDIAN)
    :type aug_goals: {AugmentationGoals | {str : AugmentationGoals}}
    :param random_augs: if True, randomly choose the augmentation
        to use for each new sample
    :type random_augs: bool
    :param multiple_augs: whether to allow multiple augmentations
        per sample (e.g. time shift and volume)
    :type multiple_augs: bool
    '''
    self.log = LoggingService()

    if not isinstance(overwrite_policy, WhenAlreadyDone):
        raise TypeError(f"Overwrite policy must be a member of WhenAlreadyDone, not {type(overwrite_policy)}")

    if not os.path.isabs(input_dir_path):
        raise ValueError(f"Input path must be a full, absolute path; not {input_dir_path}")

    self.input_dir_path = input_dir_path
    self.multiple_augs = multiple_augs
    self.plot = plot
    self.overwrite_policy = overwrite_policy

    self.species_names = Utils.find_species_names(self.input_dir_path)

    # If aug_goals is not a dict mapping
    # each species to an aug_goals, but just
    # a single AugmentationGoals, create
    # a dict from all bird species, mapping
    # each to that same value:
    if type(aug_goals) != dict:
        aug_goals = {species : aug_goals
                     for species in self.species_names
                     }

    # Get dataframe with row labels being the
    # species, and one col with number of samples
    # in the respective species:
    #       num_samples
    # sp1       10
    # sp2       15
    #      ...
    self.sample_distrib_df = Utils.sample_compositions_by_species(input_dir_path,
                                                                  augmented=False)

    if plot:
        # Plot a distribution:
        self.sample_distrib_df.plot.bar()

    # Build a dict with number of augmentations to do
    # for each species:
    self.augs_to_do = Utils.compute_num_augs_per_species(aug_goals,
                                                         self.sample_distrib_df)

    # Get input dir path without trailing slash:
    #**** canonical_in_path = str(Path(input_dir_path))

    # Create the descriptive name of an output directory
    # for the augmented samples:
    if random_augs:
        self.output_dir_path = os.path.join(Path(input_dir_path).parent,
                                            'augmented_samples_random')
    else:
        assert(self.ADD_NOISE + self.TIME_SHIFT + self.VOLUME == 1)
        dir_nm = f"Augmented_samples_-{self.ADD_NOISE:.2f}n-{self.TIME_SHIFT:.2f}ts-{self.VOLUME:.2f}w"
        self.output_dir_path = os.path.join(Path(input_dir_path).parent, dir_nm)

    if self.multiple_augs:
        self.output_dir_path += "/"
    else:
        # Indicate that augmentations are mutually exclusive:
        self.output_dir_path += "-exc/"

    self.log.info(f"Results will be in {self.output_dir_path}")

    Utils.create_folder(self.output_dir_path, self.overwrite_policy)

    # Hide the UserWarning: PySoundFile failed. Trying audioread instead.
    warnings.filterwarnings(action="ignore",
                            message="PySoundFile failed. Trying audioread instead.",
                            category=UserWarning,
                            module='',
                            lineno=0)
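# Illustration of the per-species form of aug_goals accepted above; the
# species codes and goal choices are examples only, not taken from the code:
#
#     aug_goals = {'AMADEC' : AugmentationGoals.MAX,
#                  'FORANA' : AugmentationGoals.MEDIAN}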
def augment_one_species(self, in_dir, out_dir, num_augs_to_do):
    '''
    Takes one species, and a number of audio
    augmentations to do. Generates the files,
    and returns a list of the newly created
    files (full paths), plus the number of failures.

    The maximum number of augmentations created
    depends on the number of audio augmentation
    methods available (currently 3), and the number
    of audio files available for the given species:

       num-available-audio-augs * num-of-audio-files

    If num_augs_to_do is higher than the above maximum,
    only that maximum is created. The rest will need to
    be accomplished by spectrogram augmentation in a
    different portion of the workflow.

    Augmentations are effectively done round robin across all of
    the species' audio files such that each file is
    augmented roughly the same number of times until
    num_augs_to_do is accomplished.

    :param in_dir: directory holding one species' audio files
    :type in_dir: str
    :param out_dir: destination for new audio files
    :type out_dir: str
    :param num_augs_to_do: number of augmentations
    :type num_augs_to_do: int
    :returns: 2-tuple: list of newly created file paths,
        and number of failed augmentation attempts
    :rtype: ([str], int)
    '''
    # By convention, species name is the last part of the directory:
    species_name = Path(in_dir).stem

    # Create subfolder for the given species:
    if not Utils.create_folder(out_dir, self.overwrite_policy):
        self.log.info(f"Skipping augmentations for {species_name}")
        return [], 0

    # Get dict: {full-path-to-an-audio_file : 0}
    # The zeroes will be counts of augmentations
    # needed for that file:
    in_wav_files = {full_in_path : 0
                    for full_in_path
                    in Utils.listdir_abs(in_dir)
                    }
    # Cannot do augmentations for species with 0 samples:
    if len(in_wav_files) == 0:
        self.log.info(f"Skipping {species_name} since there are no original samples.")
        return [], 0

    # Distribute augmentations across the original
    # input files:
    aug_assigned = 0
    while aug_assigned < num_augs_to_do:
        for fname in in_wav_files.keys():
            in_wav_files[fname] += 1
            aug_assigned += 1
            if aug_assigned >= num_augs_to_do:
                break

    new_sample_paths = []
    failures = 0

    for in_fname, num_augs_this_file in in_wav_files.items():

        # Create augs with different methods:

        # Pick audio aug methods to apply (without replacement).
        # Note that if more augs are to be applied to each file
        # than methods are available, some methods will need
        # to be applied multiple times; no problem, as each
        # method includes randomness:
        max_methods_sample_size = min(len(list(AudAugMethod)), num_augs_this_file)
        methods = random.sample(list(AudAugMethod), max_methods_sample_size)

        # Now have something like:
        #     [volume, time-shift], or all methods: [volume, time-shift, noise]

        if num_augs_this_file > len(methods):
            # Repeat the methods as often as
            # needed:
            num_method_set_repeats = int(math.ceil(num_augs_this_file / len(methods)))
            # The slice to num_augs_this_file chops off
            # the possible excess from the array replication:
            method_seq = (methods * num_method_set_repeats)[:num_augs_this_file]

            # Assuming num_augs_this_file is 7, we now have method_seq:
            #    [m1,m2,m3,m1,m2,m3,m1]
        else:
            method_seq = methods

        for method in method_seq:
            out_path_or_err = self.create_new_sample(in_fname, out_dir, method)
            if isinstance(out_path_or_err, Exception):
                failures += 1
            else:
                new_sample_paths.append(out_path_or_err)

    self.log.info(f"Audio aug report: {len(new_sample_paths)} new files; {failures} failures")
    return new_sample_paths, failures
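# Worked example of the round-robin assignment loop above, pulled out as a
# standalone sketch: 7 augmentations over 3 files end up distributed 3/2/2.

def _distribute_augs_sketch(file_names, num_augs_to_do):
    # Mirror of the while/for loop in augment_one_species():
    counts = {fname: 0 for fname in file_names}
    assigned = 0
    while assigned < num_augs_to_do:
        for fname in counts:
            counts[fname] += 1
            assigned += 1
            if assigned >= num_augs_to_do:
                break
    return counts

# _distribute_augs_sketch(['a.wav', 'b.wav', 'c.wav'], 7)
#     --> {'a.wav': 3, 'b.wav': 2, 'c.wav': 2}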
def img_generator(self, in_img_or_dir):
    if os.path.isfile(in_img_or_dir):
        return iter([in_img_or_dir])
    return Utils.find_in_tree_gen(in_img_or_dir, '*.png')
def create_snips_gen_for_sel_tbls(self, snippets_src, sel_tables_src):
    '''
    Given one or more Raven selection tables, and one or
    more recording snippet paths, return a dict:

       {<recording-id> : SelTblSnipsAssoc-inst<table-path, snippets-dir>}

    where recording-id is like AM01_20190719_063242;
    table-path is the full path to one selection
    table with the respective recording-id, and snippets-dir
    is the root of a directory containing the snippets covered
    in the recording.

    Usage concept:
       o There are relatively few selection tables, since
         they are human-generated
       o There can be thousands of snippet .png files whose
         time spans are covered in one table
       o The data structure returned from this method can
         be used like this:

             tbl_snips_match = create_snips_gen_for_sel_tbls('/foo/my_snips',
                                                             '/bar/my_tbls')
             # For each selection table, work on the snippets
             # that are covered by that table:
             for rec_id in tbl_snips_match:
                 for snip_path in tbl_snips_match[rec_id].snips_iterator():
                     <do something with spectrogram snippet>

    :param snippets_src: iterable over absolute paths to snippets,
        or the absolute path to a directory
    :type snippets_src: {Iterator(str) | str}
    :param sel_tables_src: absolute path to selection table,
        or path to a directory that contains selection tables,
        or iterator over absolute paths to selection tables
    :type sel_tables_src: {Iterator(str) | str}
    :return: dict mapping recording ID to SelTblSnipsAssoc instances
    :rtype: {str : SelTblSnipsAssoc}
    '''

    # Table paths may be an individual
    # file, a directory, or a generator
    # of absolute paths. Sanity checks:

    if type(sel_tables_src) == str:
        if not os.path.isabs(sel_tables_src):
            raise ValueError(
                "Table paths must be a generator, or an absolute path to a selection table or dir")
        if os.path.isfile(sel_tables_src):
            sel_tables_src = [sel_tables_src]
        elif os.path.isdir(sel_tables_src):
            sel_tables_src = Utils.listdir_abs(sel_tables_src)
    # If not a string, sel_tables_src better be a generator:
    elif not isinstance(sel_tables_src, types.GeneratorType):
        raise ValueError(
            "Table paths must be a generator, or an absolute path to a selection table or dir")

    # Same checks for snippet location:
    if type(snippets_src) == str:
        if not os.path.isabs(snippets_src) \
           or not os.path.isdir(snippets_src):
            raise ValueError(
                "Snippet paths must be a generator, or an absolute path to a snippet dir")
        snippets_src = iter(Utils.listdir_abs(snippets_src))
    # If not a string, snippets_src better be a generator:
    elif not isinstance(snippets_src, types.GeneratorType):
        raise ValueError(
            "Snippets src must be a generator, or an absolute path to dir")

    # Build a dict:
    #    {<recording_id> : <dir-of-matching-snippets>}
    recording_selection_tables = {}

    for table_path in sel_tables_src:
        recording_id = self.extract_recording_id(table_path)
        if recording_id is not None:
            recording_selection_tables[recording_id] = \
                SelTblSnipsAssoc(table_path, snippets_src)

    return recording_selection_tables
def __init__(self,
             selection_tbl_loc,
             spectrogram_locs,
             out_dir,
             unittesting=False):
    '''
    Create snippet copies into out_dir for all snippets
    that are covered by any of the given selection tables.

    :param selection_tbl_loc: path to an individual selection
        table, or a directory containing selection tables.
        Each tbl is a tsv file with extension .txt
    :type selection_tbl_loc: str
    :param spectrogram_locs: individual spectrogram snippet,
        or directory of spectrogram snippets
    :type spectrogram_locs: str
    :param out_dir: destination of snippet copies
    :type out_dir: str
    :param unittesting: if True, does not initialize
        the instance, or run any operations
    :type unittesting: bool
    '''

    if unittesting:
        return

    if not os.path.exists(selection_tbl_loc):
        print(f"Cannot open {selection_tbl_loc}")
        sys.exit(1)

    if not os.path.exists(spectrogram_locs):
        print(f"Spectrogram snippets {spectrogram_locs} not found")
        sys.exit(1)

    # Is the path to the sel tbl an individual tsv file?
    if os.path.isfile(selection_tbl_loc):
        table_paths = iter([selection_tbl_loc])
    else:
        # Caller gave a directory of .txt selection tables.
        # Get them all recursively:
        table_paths = Utils.find_in_tree_gen(selection_tbl_loc, pattern="*.txt")

    # Is the snippets path an individual .png snippet file?
    if os.path.isfile(spectrogram_locs):
        snippet_paths = iter([spectrogram_locs])
    else:
        # Caller gave a directory of .png files.
        # Get them all recursively:
        snippet_paths = Utils.find_in_tree_gen(spectrogram_locs, pattern="*.png")

    # If out_dir does not exist, create it,
    # and all dirs along the path:
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Get dict:
    #    {<recording-id> : SelTblSnipsAssoc-instance}
    # where each SelTblSnipsAssoc instance is a generator
    # of snippet metadata from snippets that are covered in
    # the selection table that is associated with the instance.
    # In addition, the absolute snippet path is added as
    # the entry for key 'snip_path'.
    #
    # The generator feeds out the snippet metadata in order of
    # start time.
    #
    # For brevity, call each instance of SelTblSnipsAssoc
    # an 'assoc':
    rec_id_assocs = self.create_snips_gen_for_sel_tbls(snippet_paths, table_paths)

    for assoc in rec_id_assocs.values():
        # The assoc focuses on a single selection
        # table, and the snippets it covers.
        # Get the info contained in each row of
        # the sel tbl. This will be a list of dicts, each with
        # the information from one selection tbl row:
        selections = Utils.read_raven_selection_table(assoc.raven_sel_tbl_path)

        # Go through each snippet in the association, enrich its
        # metadata with species info. Then copy the enriched
        # snippet to the target dir:
        for snip_metadata in iter(assoc):
            self.match_snippet(selections, snip_metadata, out_dir)
def add_background(cls, file_name, noise_path, out_dir, len_noise_to_add=5.0):
    '''
    Takes an absolute file path, and the path to a
    directory that contains noise to overlay onto the
    given sound file (wind, rain, etc.).

    Returns a numpy structure corresponding to the
    original audio with the noise overlaid, plus the
    sample rate of the new sample. A file name is suggested
    for the sample. It is composed of elements such
    as the nature and duration of the noise. Client
    may choose to ignore or use.

    :param file_name: absolute path to sound file
    :type file_name: str
    :param noise_path: absolute path to directory with noise files
    :type noise_path: str
    :param out_dir: destination directory of new audio file
    :type out_dir: str
    :param len_noise_to_add: how much of a noise snippet
        to overlay (seconds)
    :type len_noise_to_add: float
    :return: full path of new audio file
    :rtype: str
    '''
    len_noise_to_add = float(len_noise_to_add)
    backgrounds = os.listdir(noise_path)

    # Pick a random noise file:
    background_name = backgrounds[random.randint(0, len(backgrounds) - 1)]

    cls.log.info(f"Adding {background_name} to {file_name}.")

    # We will be working with 1 second as the smallest unit of time.
    # Load all of both wav files and determine the length of each:
    noise, noise_sr = SoundProcessor.load_audio(
        os.path.join(noise_path, background_name))   # type(noise) = np.ndarray
    orig_recording, orig_sr = SoundProcessor.load_audio(file_name)

    new_sr = math.gcd(noise_sr, orig_sr)
    if noise_sr != orig_sr:
        # Resample both noise and orig records so that they have same sample rate:
        cls.log.info(f"Resampling: {background_name} and {file_name}")
        noise = librosa.resample(noise, noise_sr, new_sr)
        orig_recording = librosa.resample(orig_recording, orig_sr, new_sr)
        # input("ready?")

    noise_duration = librosa.get_duration(noise, noise_sr)
    if noise_duration < len_noise_to_add:
        cls.log.info(f"Duration:{noise_duration} < len_noise_to_add:{len_noise_to_add}. "
                     f"Will only add {noise_duration}s of noise")
        samples_per_segment = len(noise)
    elif noise_duration >= len_noise_to_add:
        # Randomly choose a noise segment; this is the
        # number of samples per len_noise_to_add seconds:
        samples_per_segment = int(new_sr * len_noise_to_add)
        # Place noise randomly:
        subsegment_start = random.randint(0, len(noise) - samples_per_segment)
        noise = noise[subsegment_start : subsegment_start + samples_per_segment]
        cls.log.info(f"len(noise) after random segment: {len(noise)}; "
                     f"noise duration: {len(noise)/new_sr}")

    orig_duration = librosa.core.get_duration(orig_recording, orig_sr)
    # If orig_recording is shorter than the noise we want to add,
    # just add 5% noise:
    if orig_duration < len_noise_to_add:
        cls.log.info(f"Recording: {file_name} was shorter than len_noise_to_add. "
                     f"Adding 5% of recording len worth of noise.")
        new_noise_len = orig_duration * 0.05
        noise = noise[:int(new_noise_len * new_sr)]

    noise_start_loc = random.randint(0, len(orig_recording) - samples_per_segment)
    cls.log.info(f"Inserting noise starting at {noise_start_loc/new_sr} seconds.")
    # Split original into three parts: before_noise, during_noise, after_noise:
    before_noise = orig_recording[:noise_start_loc]
    during_noise = orig_recording[noise_start_loc : noise_start_loc + samples_per_segment]
    after_noise = orig_recording[noise_start_loc + samples_per_segment:]
    assert len(during_noise) == len(noise)

    segment_with_noise = during_noise + Utils.noise_multiplier(orig_recording, noise) * noise
    first_half = np.concatenate((before_noise, segment_with_noise))
    new_sample = np.concatenate((first_half, after_noise))  # what i think it should be
    new_duration = librosa.get_duration(new_sample, float(new_sr))
    assert new_duration == orig_duration

    # File name w/o extension:
    sample_file_stem = Path(file_name).stem
    noise_file_stem = Path(background_name).stem
    noise_dur = str(int(noise_start_loc / new_sr * 1000))
    file_name = f"{sample_file_stem}-{noise_file_stem}_bgd{noise_dur}ms.wav"

    # Ensure that the fname doesn't exist:
    uniq_fname = Utils.unique_fname(out_dir, file_name)
    out_path = os.path.join(out_dir, uniq_fname)

    soundfile.write(out_path, new_sample, new_sr)
    return out_path
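# The core overlay step above splits the recording into before/during/after
# slices and adds scaled noise to the middle one. A self-contained numpy
# sketch of just that arithmetic; the scale factor is hard-coded here,
# whereas the real code derives it via Utils.noise_multiplier():

import numpy as np

def _overlay_noise_sketch(recording, noise, start, scale=0.3):
    # Mix 'noise' into 'recording' beginning at sample index 'start';
    # assumes the noise segment fits entirely inside the recording:
    end = start + len(noise)
    before, during, after = recording[:start], recording[start:end], recording[end:]
    return np.concatenate((before, during + scale * noise, after))

# demo = _overlay_noise_sketch(np.zeros(16000), np.ones(4000), start=6000)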
def test_from_commandline(self):
    with tempfile.TemporaryDirectory(dir='/tmp', prefix='test_spectro') as dst_dir:
        args = Arguments()
        args.input = self.spectro_root
        args.outdir = dst_dir
        args.workers = None

        # Number of spectrogram .png files
        # in source tree:
        spectros_to_chop = Utils.find_in_dir_tree(self.spectro_root, '*.png')

        manager = mp.Manager()
        global_info = manager.dict()
        global_info['jobs_status'] = manager.list()

        # ------ Chop spectrograms:
        SpectrogramChopper.run_workers(
            args,
            global_info,
            overwrite_policy=WhenAlreadyDone.OVERWRITE
            )

        dirs_filled = [os.path.join(dst_dir, species_dir)
                       for species_dir in os.listdir(dst_dir)]

        num_spectros_done = sum([len(Utils.find_in_dir_tree(one_filled_dir, '*.png'))
                                 for one_filled_dir in dirs_filled])

        self.assertTrue(num_spectros_done > len(spectros_to_chop))
        self.check_spectro_sanity(dirs_filled)

        # Remember the creation times:
        file_times = self.record_creation_times(dirs_filled)

        # ------ SKIP the existing spectrograms:
        # Run again, asking to skip already existing
        # spectros:
        global_info = manager.dict()
        global_info['jobs_status'] = manager.list()

        SpectrogramChopper.run_workers(
            args,
            global_info,
            overwrite_policy=WhenAlreadyDone.SKIP
            )

        dirs_filled = [os.path.join(dst_dir, species_dir)
                       for species_dir in os.listdir(dst_dir)]

        # Mod times of png files must NOT have changed,
        # b/c of skipping:
        new_file_times = self.record_creation_times(dirs_filled)
        self.assertDictEqual(new_file_times, file_times)

        # ------ Force RECREATION of spectrograms:
        # Run again with OVERWRITE, forcing the
        # spectros to be done again:
        global_info = manager.dict()
        global_info['jobs_status'] = manager.list()

        SpectrogramChopper.run_workers(
            args,
            global_info,
            overwrite_policy=WhenAlreadyDone.OVERWRITE
            )

        dirs_filled = [os.path.join(dst_dir, species_dir)
                       for species_dir in os.listdir(dst_dir)]

        self.check_spectro_sanity(dirs_filled)

        # File times must be *different* from previous
        # run because we asked to overwrite:
        new_file_times = self.record_creation_times(dirs_filled)
        for fname in file_times.keys():
            try:
                self.assertTrue(new_file_times[fname] != file_times[fname])
            except KeyError as e:
                print(repr(e))
# Enforce args to set_info or add_info having an
# even number of entries, i.e. coming as
# 'name'/'value' pairs:
if (setting and len(info_to_set) % 2 != 0) \
   or (adding and len(info_to_add) % 2 != 0):
    print("Info entries must be pairs of keys and values; got an odd number of entries")
    sys.exit(1)

# Safety precaution just for setting
# (and thereby overwriting) metadata:
if setting and not args.force:
    if not Utils.user_confirm("Really want to overwrite png file metadata? (N/y)",
                              default='n'):
        print("Canceling")
        sys.exit(0)

if args.printout:
    print("Metadata before:")
    PNGMetadataManipulator.extract_metadata(args.snippet_src,
                                            show=args.show,
                                            printout=args.printout)
    print("")

# Setting info_to_set:
if args.outfile is None:
    # Overwrite the input file,
    # i.e. add metadata in place: