def setUp(self):
    """Build spectrogram parameters, a temporary output directory, and the
    .cbin audio / 'notmat' annotation fixtures shared by the tests."""
    self.spect_params = {
        'fft_size': 512,
        'step_size': 64,
        'freq_cutoffs': (500, 10000),
        'thresh': 6.25,
        'transform_type': 'log_spect',
    }
    self.tmp_output_dir = tempfile.mkdtemp()

    # ---- cbins -------------------------------
    self.audio_dir_cbin = TEST_DATA_DIR.joinpath('cbins', 'gy6or6', '032312')
    # sort as Path objects first, then convert to strings
    cbin_paths = sorted(self.audio_dir_cbin.glob('*.cbin'))
    self.audio_files_cbin = [str(cbin_path) for cbin_path in cbin_paths]

    self.annot_files_cbin = files_from_dir(annot_dir=self.audio_dir_cbin,
                                           annot_format='notmat')
    transcriber = crowsetta.Transcriber(annot_format='notmat')
    self.annot_list_cbin = transcriber.from_file(
        annot_file=self.annot_files_cbin)
    self.labelset_cbin = set('iabcdefghjk')

    # partition (annotation file, audio file name) pairs so we can verify
    # that labelset works:
    #   "good" = all labels in annotation are in labelset
    #   "bad"  = has labels not in labelset
    self.good = []
    self.bad = []
    for annot_file, annot in zip(self.annot_files_cbin, self.annot_list_cbin):
        pair = (annot_file, Path(annot.audio_file).name)
        if set(annot.seq.labels).issubset(self.labelset_cbin):
            self.good.append(pair)
        else:
            self.bad.append(pair)
def setUp(self):
    """Collect the .mat spectrogram files, the 'yarden' annotation list, and
    the labelset used by the tests."""
    llb3_dir = TEST_DATA_DIR.joinpath('mat', 'llb3')

    self.spect_format = 'mat'
    self.spect_dir = llb3_dir.joinpath('spect')
    # paths are converted to strings before sorting
    self.spect_files = sorted(
        str(mat_path) for mat_path in self.spect_dir.glob('*.mat'))

    self.annot_mat = str(llb3_dir.joinpath('llb3_annot_subset.mat'))
    self.scribe = crowsetta.Transcriber(annot_format='yarden')
    self.annot_list = self.scribe.from_file(self.annot_mat)

    self.labelset_mat = {
        1, 2, 3,
        6, 7, 8, 9, 10, 11, 12, 13, 14,
        16, 17, 18, 19,
    }
def test_lbl_tb2segments_recovers_onsets_offsets_labels_from_real_data(self):
    """Label time bins from real annotations, convert them back to segments,
    and check that labels, onsets, and offsets match the annotations."""
    # TODO: make all this into fixture(s?) when switching to PyTest
    transcriber = crowsetta.Transcriber(annot_format='notmat')
    # keep only annotations whose labels are all in the labelmap
    annot_list = [
        annot
        for annot in transcriber.from_file(annot_file=ANNOT_PATHS)
        if all(lbl in LABELMAP.keys() for lbl in annot.seq.labels)
    ]
    spect_annot_map = vak.annotation.source_annot_map(
        SPECT_PATHS,
        annot_list,
    )

    # first pass: build a vector of labeled time bins for every spectrogram
    lbl_tb_list = []
    for spect_file, annot in spect_annot_map.items():
        seq = annot.seq
        lbls_int = [LABELMAP[lbl] for lbl in seq.labels]
        time_bins = vak.files.spect.load(spect_file)[TIMEBINS_KEY]
        lbl_tb = vak.labeled_timebins.label_timebins(
            lbls_int,
            seq.onsets_s,
            seq.offsets_s,
            time_bins,
            unlabeled_label=LABELMAP['unlabeled'],
        )
        lbl_tb_list.append(lbl_tb)

    # second pass: recover segments and compare against the annotation
    for lbl_tb, annot in zip(lbl_tb_list, spect_annot_map.values()):
        expected = annot.seq
        labels, onsets_s, offsets_s = vak.labeled_timebins.lbl_tb2segments(
            lbl_tb, LABELMAP, TIMEBIN_DUR)

        self.assertTrue(np.array_equal(labels, expected.labels))
        onsets_match = np.allclose(onsets_s, expected.onsets_s,
                                   atol=0.001, rtol=0.03)
        self.assertTrue(onsets_match)
        offsets_match = np.allclose(offsets_s, expected.offsets_s,
                                    atol=0.001, rtol=0.03)
        self.assertTrue(offsets_match)
def test_source_annot_map_cbin_yarden(self):
    """Every .mat spectrogram file should map to an annotation loaded from the
    'yarden' annotation file."""
    mat_dir = TEST_DATA_DIR.joinpath('mat', 'llb3')

    transcriber = crowsetta.Transcriber(annot_format='yarden')
    annot_list = transcriber.from_file(
        annot_file=str(mat_dir.joinpath('llb3_annot_subset.mat')))

    spect_files = [
        str(mat_path) for mat_path in mat_dir.joinpath('spect').glob('*.mat')
    ]

    source_annot_map = vak.annotation.source_annot_map(
        source_files=spect_files, annot_list=annot_list)

    # iterate over a snapshot, because entries are removed as they are checked
    checked_pairs = list(source_annot_map.items())
    for source, annot in checked_pairs:
        self.assertTrue(source in spect_files)
        self.assertTrue(annot in annot_list)
        source_annot_map.pop(source)

    # if every source file got mapped to an annot, and we mapped all of them,
    # then dictionary should be empty after loop
    self.assertTrue(source_annot_map == {})
def test_source_annot_map_cbin_notmat(self):
    """Every .cbin audio file should map to an annotation loaded from the
    .not.mat files in the same directory."""
    cbin_dir = TEST_DATA_DIR.joinpath('cbins', 'gy6or6', '032312')

    transcriber = crowsetta.Transcriber(annot_format='notmat')
    notmats = [str(notmat_path) for notmat_path in cbin_dir.glob('*.not.mat')]
    annot_list = transcriber.from_file(annot_file=notmats)

    audio_files = [str(cbin_path) for cbin_path in cbin_dir.glob('*.cbin')]

    source_annot_map = vak.annotation.source_annot_map(
        source_files=audio_files, annot_list=annot_list)

    # iterate over a snapshot, because entries are removed as they are checked
    checked_pairs = list(source_annot_map.items())
    for source, annot in checked_pairs:
        self.assertTrue(source in audio_files)
        self.assertTrue(annot in annot_list)
        source_annot_map.pop(source)

    # if every source file got mapped to an annot, and we mapped all of them,
    # then dictionary should be empty after loop
    self.assertTrue(source_annot_map == {})
def test_source_annot_map_wav_koumura(self):
    """Every .wav audio file should map to an annotation loaded from the
    'koumura' Annotation.xml file."""
    koumura_dir = TEST_DATA_DIR.joinpath('koumura', 'Bird0')
    wavpath = koumura_dir.joinpath('Wave')

    transcriber = crowsetta.Transcriber(annot_format='koumura')
    annot_list = transcriber.from_file(
        annot_file=str(koumura_dir.joinpath('Annotation.xml')),
        wavpath=str(wavpath),
    )

    audio_files = [str(wav_path) for wav_path in wavpath.glob('*.wav')]

    source_annot_map = vak.annotation.source_annot_map(
        source_files=audio_files, annot_list=annot_list)

    # iterate over a snapshot, because entries are removed as they are checked
    checked_pairs = list(source_annot_map.items())
    for source, annot in checked_pairs:
        self.assertTrue(source in audio_files)
        self.assertTrue(annot in annot_list)
        source_annot_map.pop(source)

    # if every source file got mapped to an annot, and we mapped all of them,
    # then dictionary should be empty after loop
    self.assertTrue(source_annot_map == {})
def main(train_dur=TRAIN_DUR,
         val_dur=VAL_DUR,
         annot_ext=ANNOT_EXT,
         annot_format=ANNOT_FORMAT,
         subset_dir=SUBSET_DIR,
         labelset=None):
    """Make a data subset of a specified duration by moving files from the
    current directory into a newly-created sub-directory.

    Annotation files in the current directory are visited in sorted order and
    accumulated until their summed duration reaches ``train_dur + val_dur``.
    The duration of each annotation is taken to be its last offset.

    Parameters
    ----------
    train_dur : number
        Target duration of the training set, in the same units as the
        annotation offsets (``offsets_s``).
    val_dur : number
        Target duration of the validation set, same units as ``train_dur``.
    annot_ext : str
        Extension of annotation files; used in the glob pattern
        ``*{annot_ext}``.
    annot_format : str
        Annotation format, passed to ``crowsetta.Transcriber``.
    subset_dir : str
        Directory to create and move the selected files into.
    labelset : iterable, optional
        Labels that an annotation may contain. A single string is treated as
        one label per character. Annotations with any label outside this set
        are skipped. Default is None, in which case nothing is skipped.

    Raises
    ------
    ValueError
        If the annotation files run out before the target duration is reached.
    """
    # None (the default) and empty labelsets both mean "do not filter"
    labelset = set(labelset) if labelset else None

    annot_files = sorted(glob(f'*{annot_ext}'))
    scribe = crowsetta.Transcriber(annot_format=annot_format)
    annots = scribe.from_file(annot_files)

    total_dur = train_dur + val_dur
    dur = 0
    annots_to_use = []
    for annot in annots:
        if dur >= total_dur:
            break
        if labelset and not set(annot.seq.labels).issubset(labelset):
            continue
        # last offset is used as the duration of the annotated file
        dur += annot.seq.offsets_s[-1]
        annots_to_use.append(annot)

    if dur < total_dur:
        raise ValueError(
            f'ran out of annotation files before finding subset of duration {total_dur}'
        )

    os.makedirs(subset_dir)
    for annot in annots_to_use:
        # everything before the first '.' in the annotation file name
        annot_stem = Path(annot.annot_file).name.split('.')[0]
        # move every file that shares the stem (audio, annotation, etc.)
        for file in glob(f'{annot_stem}*'):
            shutil.move(file, subset_dir)
def main(train_dur=TRAIN_DUR,
         val_dur=VAL_DUR,
         annot_ext=ANNOT_EXT,
         voc_format=VOC_FORMAT,
         subset_dir=SUBSET_DIR,
         labelset=None):
    """Make a data subset of a specified duration by moving files from the
    current directory into a newly-created sub-directory.

    Annotation files in the current directory are visited in sorted order and
    accumulated until their summed duration reaches ``train_dur + val_dur``.
    The duration of each sequence is taken to be its last offset.

    Parameters
    ----------
    train_dur : number
        Target duration of the training set, in the same units as the
        sequence offsets (``offsets_s``).
    val_dur : number
        Target duration of the validation set, same units as ``train_dur``.
    annot_ext : str
        Extension of annotation files; used in the glob pattern
        ``*{annot_ext}``.
    voc_format : str
        Annotation format, passed to ``crowsetta.Transcriber``.
    subset_dir : str
        Directory to create and move the selected files into.
    labelset : iterable, optional
        Labels that a sequence may contain. A single string is treated as
        one label per character. Sequences with any label outside this set
        are skipped. Default is None, in which case nothing is skipped.

    Raises
    ------
    ValueError
        If the annotation files run out before the target duration is reached.
    """
    # None (the default) and empty labelsets both mean "do not filter"
    labelset = set(labelset) if labelset else None

    annot_files = sorted(glob(f'*{annot_ext}'))
    scribe = crowsetta.Transcriber(voc_format=voc_format)
    seqs = scribe.to_seq(annot_files)

    total_dur = train_dur + val_dur
    dur = 0
    seqs_to_use = []
    for seq in seqs:
        if dur >= total_dur:
            break
        if labelset and not set(seq.labels).issubset(labelset):
            continue
        # last offset is used as the duration of the annotated file
        dur += seq.offsets_s[-1]
        seqs_to_use.append(seq)

    if dur < total_dur:
        raise ValueError(
            f'ran out of annotation files before finding subset of duration {total_dur}'
        )

    os.makedirs(subset_dir)
    for seq in seqs_to_use:
        seq_stem = Path(seq.file).stem
        # move every file that shares the stem (audio, annotation, etc.)
        for seq_file in glob(f'{seq_stem}*'):
            shutil.move(seq_file, subset_dir)
def from_df(vak_df):
    """Load the annotations that go with each row of a vak DataFrame.

    Parameters
    ----------
    vak_df : DataFrame
        Dataset of vocalizations. This function reads its 'annot_path' and
        'audio_path' columns; the annotation format is obtained with
        ``format_from_df``.

    Returns
    -------
    annots : list or None
        One annotation per row of the dataframe, in row order, as loaded by
        ``crowsetta.Transcriber.from_file``. None if ``format_from_df``
        returns None for the dataframe.

    Raises
    ------
    ValueError
        If all rows share one annotation file but loading it does not yield
        enough annotations to cover the rows, or if the number of unique
        annotation paths is neither 1 nor the number of rows.

    Notes
    -----
    Two layouts are handled. When every row has the same annotation path
    (including the one-row case), that file is loaded once and its
    annotations are matched to rows with ``source_annot_map``. When every row
    has its own annotation path, each file is loaded separately.
    """
    annot_format = format_from_df(vak_df)
    if annot_format is None:
        return None

    scribe = crowsetta.Transcriber(annot_format=annot_format)
    unique_annot_paths = vak_df['annot_path'].unique()
    n_rows = len(vak_df)

    if len(unique_annot_paths) == 1:
        # --> a single annotation file is associated with all rows; either
        # (1) many rows that all point at the same file, or
        # (2) only one row, so only one file (which may still hold
        #     annotations for multiple source files)
        annot_path = unique_annot_paths.item()
        annots = scribe.from_file(annot_file=annot_path)

        got_single = isinstance(annots, crowsetta.Annotation)
        list_covers_rows = isinstance(annots, list) and len(annots) >= n_rows
        single_covers_row = got_single and n_rows == 1
        if not (list_covers_rows or single_covers_row):
            raise ValueError(
                'unable to load labels from dataframe; found a single annotation file associated with all '
                'rows in dataframe, but loading it did not return a list of annotations for each row.\n'
                f'Single annotation file: {annot_path}\n'
                f'Loading it returned a {type(annots)}.')

        if got_single:
            # wrap in list for source_annot_map to iterate over it
            annots = [annots]

        audio_paths = vak_df['audio_path'].values
        audio_annot_map = source_annot_map(audio_paths, annots)
        # return annotations in the same order as the dataframe rows
        return [audio_annot_map[audio_path] for audio_path in audio_paths]

    if len(unique_annot_paths) == n_rows:
        # --> a unique annotation file for each row; load them one by one
        return [
            scribe.from_file(annot_file=annot_path)
            for annot_path in vak_df['annot_path'].values
        ]

    raise ValueError(
        'unable to load labels from dataframe; did not find an annotation file for each row or '
        'a single annotation file associated with all rows.')
def predict(
    csv_path,
    checkpoint_path,
    labelmap_path,
    annot_format,
    to_format_kwargs,
    model_config_map,
    window_size,
    num_workers=2,
    spect_key='s',
    timebins_key='t',
    spect_scaler_path=None,
    device=None,
    logger=None,
):
    """make predictions on dataset with trained model specified in config.toml file.
    Function called by command-line interface.

    Parameters
    ----------
    csv_path : str
        path to where dataset was saved as a csv.
    checkpoint_path : str
        path to directory with checkpoint files saved by Torch, to reload model
    labelmap_path : str or pathlib.Path
        path to 'labelmap.json' file.
    annot_format : str
        format of annotations. Any format that can be used with the
        crowsetta library is valid.
    to_format_kwargs : dict
        keyword arguments for crowsetta `to_format` function.
        Defined in .toml config file as a table.
        An example for the notmat annotation format (as a dictionary) is:
        {'min_syl_dur': 10., 'min_silent_dur': 6., 'threshold': 1500}.
    model_config_map : dict
        where each key-value pair is model name : dict of config parameters
    window_size : int
        size of windows taken from spectrograms, in number of time bins,
        shown to neural networks
    num_workers : int
        Number of processes to use for parallel loading of data.
        Argument to torch.DataLoader. Default is 2.
    spect_key : str
        key for accessing spectrogram in files. Default is 's'.
    timebins_key : str
        key for accessing vector of time bins in files. Default is 't'.
    spect_scaler_path : str
        path to a saved SpectScaler object used to normalize spectrograms.
        If spectrograms were normalized and this is not provided, will give
        incorrect results.
    device : str
        Device on which to work with model + data.
        Default is None, in which case the value returned by
        ``get_default_device`` is used.

    Other Parameters
    ----------------
    logger : logging.Logger
        instance created by vak.logging.get_logger. Default is None.

    Returns
    -------
    None
    """
    if device is None:
        device = get_default_device()

    # ---------------- load data for prediction ------------------------------------------------------------------------
    if spect_scaler_path:
        log_or_print(f'loading SpectScaler from path: {spect_scaler_path}',
                     logger=logger,
                     level='info')
        spect_standardizer = joblib.load(spect_scaler_path)
    else:
        log_or_print('Not loading SpectScaler, no path was specified',
                     logger=logger,
                     level='info')
        spect_standardizer = None

    # second return value (target transform) is not needed for prediction
    transform, _ = transforms.get_defaults(
        'predict',
        spect_standardizer,
        window_size=window_size,
        return_padding_mask=False,
    )

    log_or_print(f'loading dataset to predict from csv path: {csv_path}',
                 logger=logger,
                 level='info')
    pred_dataset = UnannotatedDataset.from_csv(
        csv_path=csv_path,
        split='predict',
        window_size=window_size,
        spect_key=spect_key,
        timebins_key=timebins_key,
        transform=transform,
    )
    pred_data = torch.utils.data.DataLoader(
        dataset=pred_dataset,
        shuffle=False,
        batch_size=1,  # hard coding to make this work for now
        num_workers=num_workers)

    # ---------------- set up to convert predictions to annotation files -----------------------------------------------
    log_or_print(
        f'will convert predictions to specified annotation format: {annot_format}',
        logger=logger,
        level='info')
    log_or_print(
        f'will use following settings for converting to annotation format: {to_format_kwargs}',
        logger=logger,
        level='info')
    scribe = crowsetta.Transcriber(annot_format=annot_format)

    log_or_print(f'loading labelmap from path: {labelmap_path}',
                 logger=logger,
                 level='info')
    # wrap in Path so that a plain string path works as well as a Path object
    with Path(labelmap_path).open('r') as f:
        labelmap = json.load(f)

    dataset_df = pd.read_csv(csv_path)
    timebin_dur = io.dataframe.validate_and_get_timebin_dur(dataset_df)
    log_or_print(f'dataset has timebins with duration: {timebin_dur}',
                 logger=logger,
                 level='info')

    # ---------------- do the actual predicting + converting to annotations --------------------------------------------
    input_shape = pred_dataset.shape
    # if dataset returns spectrogram reshaped into windows,
    # throw out the window dimension; just want to tell network (channels, height, width) shape
    if len(input_shape) == 4:
        input_shape = input_shape[1:]
    log_or_print(
        f'shape of input to networks used for predictions: {input_shape}',
        logger=logger,
        level='info')

    log_or_print(
        f'instantiating models from model-config map:\n{model_config_map}',
        logger=logger,
        level='info')
    models_map = models.from_model_config_map(model_config_map,
                                              num_classes=len(labelmap),
                                              input_shape=input_shape)
    for model_name, model in models_map.items():
        # ---------------- do the actual predicting --------------------------------------------------------------------
        log_or_print(
            f'loading checkpoint for {model_name} from path: {checkpoint_path}',
            logger=logger,
            level='info')
        model.load(checkpoint_path)
        log_or_print(f'running predict method of {model_name}',
                     logger=logger,
                     level='info')
        pred_dict = model.predict(pred_data=pred_data, device=device)

        # ---------------- converting to annotations ------------------------------------------------------------------
        # note use no transforms
        dataset_for_annot = UnannotatedDataset.from_csv(
            csv_path=csv_path,
            split='predict',
            window_size=window_size,
            spect_key=spect_key,
            timebins_key=timebins_key,
        )
        data_for_annot = torch.utils.data.DataLoader(
            dataset=dataset_for_annot,
            shuffle=False,
            batch_size=1,
            num_workers=num_workers)

        # use transform "outside" of Dataset so we can get back crop vec
        pad_to_window = transforms.PadToWindow(window_size,
                                               return_padding_mask=True)

        progress_bar = tqdm(data_for_annot)
        log_or_print('converting predictions to annotation files',
                     logger=logger,
                     level='info')
        for batch in progress_bar:
            # here we don't care about putting on some device outside cpu
            x, y = batch[0], batch[1]
            if len(x.shape) == 3:  # ("batch", freq_bins, time_bins)
                x = x.cpu().numpy().squeeze()
            # only the padding mask is needed, to crop the predictions below
            _, padding_mask = pad_to_window(x)

            # look up the prediction for this file using y as loaded,
            # i.e. before it is unpacked below
            y_pred_ind = pred_dict['y'].index(y)
            y_pred = pred_dict['y_pred'][y_pred_ind]
            y_pred = torch.argmax(y_pred, dim=1)  # assumes class dimension is 1
            y_pred = torch.flatten(y_pred).cpu().numpy()[padding_mask]

            labels, onsets_s, offsets_s = labelfuncs.lbl_tb2segments(
                y_pred, labelmap=labelmap, timebin_dur=timebin_dur)

            # DataLoader wraps strings in a tuple, need to unpack
            if isinstance(y, tuple) and len(y) == 1:
                y = y[0]
            audio_fname = files.spect.find_audio_fname(y)
            audio_filename = Path(y).parent.joinpath(audio_fname)
            # in case function doesn't accept Path
            audio_filename = str(audio_filename)
            scribe.to_format(labels=labels,
                             onsets_s=onsets_s,
                             offsets_s=offsets_s,
                             filename=audio_filename,
                             **to_format_kwargs)
# ---- paths -----------------------------------------------------------------
HERE = os.path.dirname(__file__)
TEST_DATA_DIR = os.path.join(HERE, '..', '..', 'test_data')
SETUP_SCRIPTS_DIR = os.path.join(HERE, '..', '..', 'setup_scripts')

NUM_SAMPLES = 10  # number of times to sample behavior of random-number generator

# ---- cbin audio files with 'notmat' annotations ----------------------------
audio_dir_cbin = os.path.join(TEST_DATA_DIR, 'cbins', 'gy6or6', '032312')
audio_files_cbin = glob(os.path.join(audio_dir_cbin, '*.cbin'))
annot_files_cbin = files_from_dir(annot_dir=audio_dir_cbin,
                                  annot_format='notmat')
scribe_cbin = crowsetta.Transcriber(annot_format='notmat')
annot_list_cbin = scribe_cbin.from_file(annot_file=annot_files_cbin)

labelset_cbin = set('iabcdefghjk')

# durations and labels of only those files whose labels are all in labelset
durs_cbin = []
labels_cbin = []
# NOTE(review): audio files and annotations are paired by position here; this
# assumes glob and files_from_dir return files in matching order -- TODO confirm
for audio_file, annot in zip(audio_files_cbin, annot_list_cbin):
    if set(annot.seq.labels).issubset(labelset_cbin):
        labels_cbin.append(annot.seq.labels)
        fs, data = load_cbin(audio_file)
        # number of samples divided by sampling rate
        durs_cbin.append(data.shape[0] / fs)

# ---- .mat spectrogram files with 'yarden' annotations -----------------------
spect_dir_mat = os.path.join(TEST_DATA_DIR, 'mat', 'llb3', 'spect')
spect_files_mat = glob(os.path.join(spect_dir_mat, '*.mat'))
annot_mat = os.path.join(TEST_DATA_DIR, 'mat', 'llb3',
                         'llb3_annot_subset.mat')
scribe_yarden = crowsetta.Transcriber(annot_format='yarden')
annot_list_mat = scribe_yarden.from_file(annot_mat)
def _check_output(self,
                  data_dir,
                  labelset,
                  audio_format,
                  spect_format,
                  annot_format,
                  annot_file,
                  vds_paths,
                  num_expected_paths,
                  splits=None,
                  specd_durs=None):
    """Assert that the saved dataset files in ``vds_paths`` are what we expect.

    Checks the number of paths; when a single path is expected, checks which
    files from ``data_dir`` ended up in the dataset (taking ``labelset`` into
    account); and when ``splits`` and ``specd_durs`` are given, checks the
    total duration of each split. Returns True if no assertion fails.
    """

    def summed_duration(json_path):
        # load a saved dataset and add up the durations of its vocalizations
        loaded = Dataset.from_json(json_fname=json_path)
        return sum([voc.duration for voc in loaded.voc_list])

    self.assertTrue(len(vds_paths) == num_expected_paths)

    # check that all files from data_dir that should've gone into dataset
    # actually made it into dataset
    if audio_format:
        data_files_from_dir = vak.io.audio.files_from_dir(
            data_dir, audio_format)
    elif spect_format:
        data_files_from_dir = vak.files.files.from_dir(
            data_dir, spect_format)

    if num_expected_paths == 1:
        vds = Dataset.from_json(json_fname=vds_paths[0])
        if audio_format:
            data_files_in_vds = [voc.audio_path for voc in vds.voc_list]
        elif spect_format:
            data_files_in_vds = [voc.spect_path for voc in vds.voc_list]

        if labelset is None:
            self.assertTrue(data_files_from_dir == data_files_in_vds)
        else:
            scribe = crowsetta.Transcriber(voc_format=annot_format)
            if annot_file:
                annot_source = annot_file
            else:
                annot_source = vak.annotation.files_from_dir(
                    annot_dir=data_dir, annot_format=annot_format)
            annot_list = scribe.to_seq(file=annot_source)

            # a file belongs in the dataset exactly when all of its labels
            # are in labelset
            for data_file, annot in zip(data_files_from_dir, annot_list):
                labels_ok = set(annot.labels).issubset(labelset)
                in_dataset = data_file in data_files_in_vds
                self.assertTrue(labels_ok == in_dataset)

    # if we split the dataset, make sure the split worked
    if splits and specd_durs:
        # for a split with duration -1, the split it is compared against
        other_split = {'train': 'test', 'test': 'train'}

        for split, specd_dur in zip(splits, specd_durs):
            matching = [p for p in vds_paths if split in p]
            self.assertTrue(len(matching) == 1)
            split_path = matching[0]

            if specd_dur > 0:
                total_dur = summed_duration(split_path)
                self.assertTrue((total_dur >= specd_dur))
            elif specd_dur == -1:
                total_dur = summed_duration(split_path)
                # the source dataset is the path that is neither split
                source_vds_path = [
                    p for p in vds_paths
                    if 'test' not in p and 'train' not in p
                ][0]
                source_dur = summed_duration(source_vds_path)

                if split in other_split:
                    other_path = [
                        p for p in vds_paths if other_split[split] in p
                    ][0]
                    other_dur = summed_duration(other_path)
                    # this split should hold whatever the other one did not
                    self.assertTrue(
                        isclose(total_dur, source_dur - other_dur))

    return True
def annot_list_koumura(annot_file_koumura):
    """Return the annotations loaded from ``annot_file_koumura`` with a
    'koumura'-format ``crowsetta.Transcriber``."""
    return crowsetta.Transcriber(format="koumura").from_file(
        annot_file_koumura)
def annot_list_notmat(annot_files_notmat):
    """Return the annotations loaded from ``annot_files_notmat`` with a
    'notmat'-format ``crowsetta.Transcriber``."""
    return crowsetta.Transcriber(format="notmat").from_file(
        annot_files_notmat)
def annot_list_yarden(annot_file_yarden):
    """Return the annotations loaded from ``annot_file_yarden`` with a
    'yarden'-format ``crowsetta.Transcriber``."""
    return crowsetta.Transcriber(format="yarden").from_file(
        annot_file_yarden)