def generate_json(row, DT_ID):
    """Write a JSON metadata file describing one woodpecker recording.

    Arguments:
        row -- record exposing ``species`` (key into ``common_names``),
            ``wavloc`` (path-like object providing ``as_posix()`` and
            ``stem``), ``call_type`` and ``origin``
        DT_ID -- identifier used as one directory level of the output path

    The file is written to
    ``DATA_DIR/processed/woodpecker_<species>/<DT_ID>/JSON/<wav stem>.JSON``.
    """
    common_name = common_names[row.species]
    # split the species id at every uppercase letter, re-join with spaces and
    # keep only the first letter capitalised ("AbcDef" -> "Abc def")
    species = " ".join(re.findall("[A-Z][^A-Z]*", row.species)).capitalize()

    # wav info
    sr = get_samplerate(row.wavloc.as_posix())
    wav_duration = librosa.get_duration(filename=row.wavloc)
    fn = row.wavloc.stem

    # the dataset directory is derived per species, so this name is local
    DATASET_ID = "woodpecker_" + species.lower().replace(" ", "_")
    json_out = DATA_DIR / "processed" / DATASET_ID / DT_ID / "JSON" / (fn + ".JSON")

    # make json dictionary (insertion order is the order of keys in the file)
    json_dict = {
        "indvs": {"UNK": {}},
        "species": species,
        "common_name": common_name,
        "wav_loc": row.wavloc.as_posix(),
        "sound_type": row.call_type,
        "origin": row.origin,
        "samplerate_hz": sr,
        "length_s": wav_duration,
    }

    # dump json
    json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)

    # save json; the context manager guarantees the handle is flushed and closed
    avgn.utils.paths.ensure_dir(json_out.as_posix())
    with open(json_out.as_posix(), "w") as json_file:
        print(json_txt, file=json_file)
def generate_json(row, DT_ID):
    """Write a JSON metadata file describing one macaque recording.

    The whole recording is stored as a single "coos" element spanning from
    0.0 to the wav duration.

    Arguments:
        row -- record exposing ``wavloc`` (path-like object providing
            ``as_posix()`` and ``stem``), ``idnum`` and ``indv``
        DT_ID -- identifier used as one directory level of the output path

    ``DATASET_ID`` is read from module scope. The file is written to
    ``DATA_DIR/processed/DATASET_ID/<DT_ID>/JSON/<wav stem>.JSON``.
    """
    # get sr and duration
    sr = get_samplerate(row.wavloc.as_posix())
    wav_duration = librosa.get_duration(filename=row.wavloc)

    # create json (insertion order is the order of keys in the file)
    # NOTE(review): the rhesus macaque's binomial is "Macaca mulatta"; the
    # string below is left untouched because consumers may match on it.
    json_dict = {
        "common_name": "Macaque",
        "species": "Macaque mulatta",
        "samplerate_hz": sr,
        "length_s": wav_duration,
        "wav_loc": row.wavloc.as_posix(),
        "idnum": row.idnum,
        "indvs": {
            row.indv: {
                "coos": {
                    "start_times": NoIndent([0.0]),
                    "end_times": NoIndent([wav_duration]),
                }
            }
        },
    }

    json_out = (
        DATA_DIR / "processed" / DATASET_ID / DT_ID / "JSON" / (row.wavloc.stem + ".JSON")
    )
    json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)

    # save json; the context manager guarantees the handle is flushed and closed
    avgn.utils.paths.ensure_dir(json_out.as_posix())
    with open(json_out.as_posix(), "w") as json_file:
        print(json_txt, file=json_file)
def generate_json(row, DT_ID):
    """Write a JSON metadata file describing one white-rumped munia recording.

    Arguments:
        row -- record exposing ``wavloc`` (path-like object providing
            ``as_posix()``), ``wav_num`` and ``indv`` (a string)
        DT_ID -- identifier used as one directory level of the output path

    ``DATASET_ID`` is read from module scope. The file is written to
    ``DATA_DIR/processed/DATASET_ID/<DT_ID>/JSON/<indv>_<wav_num>.JSON``.

    Raises:
        Whatever ``get_samplerate`` raises for an unreadable file; the path
        and the error are printed first.
    """
    # wav info
    try:
        sr = get_samplerate(row.wavloc.as_posix())
    except Exception as e:
        # Report which file failed, then propagate the real error. Continuing
        # would leave ``sr`` unbound and fail later with an UnboundLocalError
        # that hides the actual cause.
        print(row.wavloc.as_posix(), e)
        raise
    wav_duration = librosa.get_duration(filename=row.wavloc)

    # make json dictionary (insertion order is the order of keys in the file)
    json_dict = {
        "species": "Lonchura striata",
        "common_name": "White rumped munia",
        "wav_loc": row.wavloc.as_posix(),
        "samplerate_hz": sr,
        "length_s": wav_duration,
        "wav_num": row.wav_num,
        # no syllable information is available at this stage
        "indvs": {row.indv: {}},
    }

    # dump json
    json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)

    wav_stem = row.indv + "_" + str(row.wav_num)
    json_out = DATA_DIR / "processed" / DATASET_ID / DT_ID / "JSON" / (wav_stem + ".JSON")

    # save json; the context manager guarantees the handle is flushed and closed
    avgn.utils.paths.ensure_dir(json_out.as_posix())
    with open(json_out.as_posix(), "w") as json_file:
        print(json_txt, file=json_file)
def generate_json(row, DT_ID):
    """Write a JSON metadata file describing one European starling recording.

    Arguments:
        row -- record exposing ``wavdate`` (string formatted as
            ``%Y-%m-%d_%H-%M-%S-%f``), ``wavloc`` (path-like object providing
            ``as_posix()`` and ``stem``) and ``indv``
        DT_ID -- identifier used as one directory level of the output path

    ``DATASET_ID`` is read from module scope. The file is written to
    ``DATA_DIR/processed/DATASET_ID/<DT_ID>/JSON/<wav stem>.JSON``.
    """
    # re-format the recording time, dropping the sub-second part
    datet = datetime.strptime(row.wavdate, "%Y-%m-%d_%H-%M-%S-%f")
    datestr = datet.strftime("%Y-%m-%d_%H-%M-%S")

    sr = get_samplerate(row.wavloc.as_posix())
    wav_duration = librosa.get_duration(filename=row.wavloc.as_posix())

    # general json info (insertion order is the order of keys in the file)
    # "species" carries the scientific name and "common_name" the vernacular
    # one, as in the other dataset writers; the two values were previously
    # swapped.
    json_dict = {
        "species": "Sturnus vulgaris",
        "common_name": "European starling",
        "indvs": {row.indv: {}},
        "datetime": datestr,
        "samplerate_hz": sr,
        "length_s": wav_duration,
        "wav_loc": row.wavloc.as_posix(),
    }

    # generate json
    json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)

    json_out = (
        DATA_DIR / "processed" / DATASET_ID / DT_ID / "JSON" / (row.wavloc.stem + ".JSON")
    )

    # save json; the context manager guarantees the handle is flushed and closed
    ensure_dir(json_out.as_posix())
    with open(json_out.as_posix(), "w") as json_file:
        print(json_txt, file=json_file)
def generate_json(row, DT_ID):
    """Write a JSON metadata file describing one insect recording.

    Arguments:
        row -- record exposing ``species``, ``species_group`` (key into
            ``species_dict`` and ``species_dict_common``) and ``wavloc``
            (path-like object providing ``as_posix()`` and ``stem``)
        DT_ID -- identifier used as one directory level of the output path

    The file is written to
    ``DATA_DIR/processed/insect_dataset_<common name>/<DT_ID>/JSON/<wav stem>.JSON``.
    """
    # wav info
    sr = get_samplerate(row.wavloc.as_posix())
    wav_duration = librosa.get_duration(filename=row.wavloc)

    # make json dictionary (insertion order is the order of keys in the file)
    json_dict = {
        "species_id": row.species,
        "species": species_dict[row.species_group],
        "common_name": species_dict_common[row.species_group],
        "wav_loc": row.wavloc.as_posix(),
        "samplerate_hz": sr,
        "length_s": wav_duration,
        # the species id doubles as the individual key; no syllable info yet
        "indvs": {row.species: {}},
    }

    # the dataset directory is derived per species group, so this name is local
    DATASET_ID = "insect_dataset_" + species_dict_common[row.species_group]

    json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)

    wav_stem = row.wavloc.stem
    json_out = DATA_DIR / "processed" / DATASET_ID / DT_ID / "JSON" / (wav_stem + ".JSON")

    # save json; the context manager guarantees the handle is flushed and closed
    avgn.utils.paths.ensure_dir(json_out.as_posix())
    with open(json_out.as_posix(), "w") as json_file:
        print(json_txt, file=json_file)
def generate_json(row, DT_ID):
    """Write a JSON metadata file describing one house mouse recording.

    Arguments:
        row -- record exposing ``year``, ``month``, ``day``, ``wav_loc``
            (path-like object providing ``as_posix()`` and ``stem``), ``AGE``,
            ``FemaleMouse``, ``SONG``, ``Weight`` and ``indv``
        DT_ID -- identifier used as one directory level of the output path

    ``DATASET_ID`` is read from module scope. The file is written to
    ``DATA_DIR/processed/DATASET_ID/<DT_ID>/JSON/<wav stem>.JSON``.
    """
    # only the calendar date is supplied, so the time part formats as 00-00-00
    wavdate = datetime(year=int(row.year), day=int(row.day), month=int(row.month))
    wav_date = wavdate.strftime("%Y-%m-%d_%H-%M-%S")

    # wav samplerate and duration
    sr = get_samplerate(row.wav_loc.as_posix())
    wav_duration = librosa.get_duration(filename=row.wav_loc)

    # wav general information (insertion order is the order of keys in the file)
    json_dict = {
        "datetime": wav_date,
        "samplerate_hz": sr,
        "length_s": wav_duration,
        "species": "Mus musculus",
        "common_name": "House mouse",
        "wav_loc": row.wav_loc.as_posix(),
        "age": row.AGE,
        "FemaleMouse": row.FemaleMouse,
        "call_type": row.SONG,
        "weight": row.Weight,
        "indvs": {row.indv: {}},
    }

    json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)

    json_out = (
        DATA_DIR / "processed" / DATASET_ID / DT_ID / "JSON" / (row.wav_loc.stem + ".JSON")
    )

    # save json; the context manager guarantees the handle is flushed and closed
    avgn.utils.paths.ensure_dir(json_out.as_posix())
    with open(json_out.as_posix(), "w") as json_file:
        print(json_txt, file=json_file)
def generate_json(row, DT_ID):
    """Write a JSON metadata file describing one zebra finch vocalization.

    The whole recording is stored as a single "elements" entry spanning from
    0.0 to the wav duration.

    Arguments:
        row -- record exposing ``wav_loc`` (path-like object providing
            ``as_posix()`` and ``stem``), ``voc_num``, ``vocalization_type``
            (key into ``call_dict``), ``voc_type_full``, ``age``,
            ``recordingdate`` (object providing ``strftime``) and ``indv``
        DT_ID -- identifier used as one directory level of the output path

    ``DATASET_ID`` is read from module scope. The file is written to
    ``DATA_DIR/processed/DATASET_ID/<DT_ID>/JSON/<wav stem>.JSON``.
    """
    # wav info
    sr = get_samplerate(row.wav_loc.as_posix())
    wav_duration = librosa.get_duration(filename=row.wav_loc)

    # make json dictionary (insertion order is the order of keys in the file)
    json_dict = {
        "species": "Taeniopygia guttata",
        "common_name": "Zebra finch",
        "wav_loc": row.wav_loc.as_posix(),
        "samplerate_hz": sr,
        "length_s": wav_duration,
        "wav_num": row.voc_num,
        "vocalization_type": row.vocalization_type,
        "voc_type_full": row.voc_type_full,
        "voc_type_def": call_dict[row.vocalization_type],
        "age": row.age,
        "datetime": row.recordingdate.strftime("%Y-%m-%d_%H-%M-%S"),
        # add syllable information
        "indvs": {
            row.indv: {
                "elements": {
                    "start_times": [0.0],
                    "end_times": [wav_duration],
                }
            }
        },
    }

    json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)

    wav_stem = row.wav_loc.stem
    json_out = DATA_DIR / "processed" / DATASET_ID / DT_ID / "JSON" / (wav_stem + ".JSON")

    # save json; the context manager guarantees the handle is flushed and closed
    avgn.utils.paths.ensure_dir(json_out.as_posix())
    with open(json_out.as_posix(), "w") as json_file:
        print(json_txt, file=json_file)
def generate_json_custom(wavfile, DT_ID):
    """Write a JSON metadata file for a wav file of a custom dataset.

    The individual and the dataset name are taken from the directory layout:
    the wav's grandparent directory names the individual and the directory
    above that names the dataset. No segmentation is available, so the
    syllable lists are written empty.

    Arguments:
        wavfile -- path-like object providing ``parent``, ``stem`` and
            ``as_posix()``
        DT_ID -- identifier used as one directory level of the output path

    The file is written to
    ``DATA_DIR/processed/<dataset>_<indv>/<DT_ID>/JSON/<wav stem>.JSON``.
    """
    indv = wavfile.parent.parent.stem
    dataset_id = wavfile.parent.parent.parent.stem
    wav_loc = wavfile.as_posix()

    # the current date (not a recording date) is stored under "datetime"
    datestring = datetime.now().strftime("%Y-%m-%d")

    # derived per individual, so this name is local
    DATASET_ID = f'{dataset_id}_{indv}'

    sr = get_samplerate(wav_loc)
    wav_duration = librosa.get_duration(filename=wav_loc)

    # make json dictionary (insertion order is the order of keys in the file)
    json_dict = {
        # the sample rate is stored under both keys, as before
        "sample_rate": sr,
        # the individual's name is what gets stored as the species
        "species": indv,
        "datetime": datestring,
        "wav_loc": wav_loc,
        "samplerate_hz": sr,
        "length_s": wav_duration,
        # no manual segmentation
        "indvs": {
            indv: {
                "syllables": {
                    "start_times": [],
                    "end_times": [],
                    "labels": [],
                }
            }
        },
    }

    # generate json
    json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)

    json_out = (
        DATA_DIR / "processed" / DATASET_ID / DT_ID / "JSON" / (wavfile.stem + ".JSON")
    )

    # save json; the context manager guarantees the handle is flushed and closed
    ensure_dir(json_out.as_posix())
    with open(json_out.as_posix(), "w") as json_file:
        print(json_txt, file=json_file)
def generate_json(row, DT_ID):
    """Write a JSON metadata file describing one "NA_BIRDS" recording.

    The whole recording is stored as a single syllable of the unknown
    individual "UNK", spanning from 0 to the wav duration.

    Arguments:
        row -- record exposing ``species`` (string), ``wavnum`` and ``wavloc``
            (path-like object providing ``as_posix()``)
        DT_ID -- identifier used as one directory level of the output path

    The file is written to
    ``DATA_DIR/processed/NA_BIRDS_<species>/<DT_ID>/JSON/<species>_<wavnum>.JSON``
    where ``<species>`` is lower-cased with underscores and ``<wavnum>`` is
    zero-padded to four characters.
    """
    species = row.species.lstrip().capitalize()
    # shared by the dataset directory name and the json file name
    species_slug = species.lower().replace(" ", "_")
    # derived per species, so this name is local
    DATASET_ID = "NA_BIRDS_" + species_slug

    # sample rate and duration
    sr = get_samplerate(row.wavloc.as_posix())
    wav_duration = librosa.get_duration(filename=row.wavloc)

    # make json dictionary (insertion order is the order of keys in the file)
    json_dict = {
        "indvs": {
            "UNK": {
                "syllables": {
                    "start_times": [0],
                    "end_times": [wav_duration],
                }
            }
        },
        "species": species,
        "common_name": common_names[species],
        "wav_num": int(row.wavnum),
        "wav_loc": row.wavloc.as_posix(),
        "samplerate_hz": sr,
        "length_s": wav_duration,
    }

    # dump json
    json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)

    # save information
    json_name = species_slug + "_" + str(row.wavnum).zfill(4)
    json_out = DATA_DIR / "processed" / DATASET_ID / DT_ID / "JSON" / (json_name + ".JSON")

    # save json; the context manager guarantees the handle is flushed and closed
    avgn.utils.paths.ensure_dir(json_out.as_posix())
    with open(json_out.as_posix(), "w") as json_file:
        print(json_txt, file=json_file)
def gen_wav_json(wf, wav_df, DT_ID, save_wav=False):
    """Generate a JSON of segmental information for one canary wav file.

    If the flag ``save_wav`` is set to true, the wav is also loaded as mono at
    its native sample rate and re-written (normalized) under the processed
    dataset directory, and the JSON points at that copy. Otherwise the JSON
    points at the original wav.

    Arguments:
        wf -- path-like object of the wav file; its ``stem`` names the outputs
        wav_df -- dataframe of the phrases in this wav. The first row supplies
            ``indv``, ``rendition``, ``datetime`` and ``wavloc``; the columns
            ``phrase_start``, ``phrase_end`` and ``phrase_label`` supply the
            segmentation
        DT_ID -- identifier used as one directory level of the output paths

    Keyword Arguments:
        save_wav {bool} -- also write a WAV copy (default: {False})

    ``DATASET_ID`` is read from module scope. The JSON is written to
    ``DATA_DIR/processed/DATASET_ID/<DT_ID>/JSON/<wav stem>.JSON``.
    """
    wav_stem = wf.stem

    # output locations
    if save_wav:
        # load wav file
        bout_wav, sr = librosa.load(wf, mono=True, sr=None)
        wav_out = (
            DATA_DIR / "processed" / DATASET_ID / DT_ID / "WAV" / (wav_stem + ".WAV")
        )
        bout_duration = len(bout_wav) / sr
        # save wav file
        # NOTE(review): ``librosa.output`` was removed in librosa 0.8, so this
        # branch needs an older librosa release.
        ensure_dir(wav_out)
        librosa.output.write_wav(wav_out, y=bout_wav, sr=sr, norm=True)
    else:
        # describe the original file instead of a processed copy
        source_wav = wav_df.iloc[0].wavloc
        sr = get_samplerate(source_wav.as_posix())
        wav_out = source_wav
        bout_duration = librosa.get_duration(filename=source_wav.as_posix())

    json_out = DATA_DIR / "processed" / DATASET_ID / DT_ID / "JSON" / (wav_stem + ".JSON")

    # file-level metadata is taken from the first row of the dataframe
    first_row = wav_df.iloc[0]
    indv = first_row.indv

    # create json dictionary (insertion order is the order of keys in the file)
    json_dict = {
        "indvs": {
            indv: {
                "phrases": {
                    "start_times": NoIndent(list(wav_df.phrase_start.values)),
                    "end_times": NoIndent(list(wav_df.phrase_end.values)),
                    "labels": NoIndent(list(wav_df.phrase_label.values)),
                }
            }
        },
        "rendition": first_row.rendition,
        "datetime": first_row.datetime.strftime("%Y-%m-%d_%H-%M-%S"),
        "original_wav": first_row.wavloc.as_posix(),
        "samplerate_hz": sr,
        "wav_loc": wav_out.as_posix(),
        "length_s": bout_duration,
        "species": "Serinus canaria forma domestica",
        "common_name": "Domestic canary",
    }

    # generate json
    json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)

    # save json; the context manager guarantees the handle is flushed and closed
    ensure_dir(json_out.as_posix())
    with open(json_out.as_posix(), "w") as json_file:
        print(json_txt, file=json_file)
def generate_json(wavfile, DT_ID, song_db):
    """Write a JSON metadata file for one wav file with a TextGrid annotation.

    Every column of the matching ``song_db`` row is copied into the JSON and
    the first tier of the accompanying TextGrid supplies the syllable
    segmentation. Nothing is written when the TextGrid file is missing (its
    path is printed instead).

    Arguments:
        wavfile -- path-like object providing ``parent``, ``stem`` and
            ``as_posix()``; the grandparent directory names the individual and
            the stem is expected to look like ``%Y-%m-%d_%H-%M-%S-%f``
        DT_ID -- identifier used as one directory level of the output path
        song_db -- dataframe with at least the columns ``SubjectName``,
            ``recording_date``, ``recording_time``, ``Species_short_name`` and
            ``Subject_species``

    The file is written to
    ``DATA_DIR/processed/BIRD_DB_<species>/<DT_ID>/JSON/<wav stem>.JSON``.

    Raises:
        IndexError -- when no ``song_db`` row matches the individual, date and
            time (raised by ``.iloc[0]``)
    """
    indv = wavfile.parent.parent.stem
    try:
        dt = datetime.strptime(wavfile.stem, "%Y-%m-%d_%H-%M-%S-%f")
    except ValueError:
        # unparsable file name: fall back to the current time
        dt = datetime.now()
    datestring = dt.strftime("%Y-%m-%d")

    # database row of this recording
    row = song_db[
        (song_db.SubjectName == indv)
        & (song_db.recording_date == datestring)
        & (song_db.recording_time == dt.time())
    ].iloc[0]

    # make json dictionary, starting from every column of the database row
    json_dict = {}
    for key, value in dict(row).items():
        # Exact type comparisons are kept on purpose: isinstance() could match
        # these date/time classes differently through inheritance.
        if type(value) == pd._libs.tslibs.timestamps.Timestamp:
            json_dict[key] = value.strftime("%Y-%m-%d_%H-%M-%S")
        elif type(value) == dtt:
            json_dict[key] = value.strftime("%H:%M:%S")
        elif type(value) == pd._libs.tslibs.nattype.NaTType:
            # missing timestamps are left out of the json
            continue
        else:
            json_dict[key] = value

    species_name = row.Species_short_name.replace(" ", "_")
    common_name = row.Subject_species.replace(" ", "_")
    # derived per species, so this name is local
    DATASET_ID = "BIRD_DB_" + species_name

    json_dict["species"] = species_name
    json_dict["common_name"] = common_name
    json_dict["datetime"] = datestring

    sr = get_samplerate(wavfile.as_posix())
    wav_duration = librosa.get_duration(filename=wavfile.as_posix())
    json_dict["wav_loc"] = wavfile.as_posix()

    # rate and length
    json_dict["samplerate_hz"] = sr
    json_dict["length_s"] = wav_duration

    # the annotation lives in a sibling "TextGrids" directory
    tg = wavfile.parent.parent / "TextGrids" / (wavfile.stem + ".TextGrid")
    if not tg.exists():
        print(tg.as_posix(), 'File does not exist')
        return

    textgrid = tgio.openTextgrid(fnFullPath=tg)
    # only the first tier is used
    tierlist = textgrid.tierDict[textgrid.tierNameList[0]].entryList
    start_times = [entry.start for entry in tierlist]
    end_times = [entry.end for entry in tierlist]
    labels = [entry.label for entry in tierlist]

    json_dict["indvs"] = {
        indv: {
            "syllables": {
                "start_times": NoIndent(start_times),
                "end_times": NoIndent(end_times),
                "labels": NoIndent(labels),
            }
        }
    }

    # generate json
    json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)

    json_out = (
        DATA_DIR / "processed" / DATASET_ID / DT_ID / "JSON" / (wavfile.stem + ".JSON")
    )

    # save json; the context manager guarantees the handle is flushed and closed
    ensure_dir(json_out.as_posix())
    with open(json_out.as_posix(), "w") as json_file:
        print(json_txt, file=json_file)
def generate_noise_and_json(bout_number, fn, DT_ID, wavloc, file_df):
    """Write a noise clip and a JSON metadata file for one humpback whale wav.

    The longest stretch without vocal behavior (as returned by
    ``find_longest_nonvocal_stretch``) is cut out of the recording and saved
    as a normalized WAV; its location is recorded in the JSON next to the
    syllable annotations.

    Arguments:
        bout_number -- stored verbatim under "bout_number"
        fn -- wav file name without extension, formatted as ``%y%m%d-%H%M``
        DT_ID -- identifier used as one directory level of the output paths
        wavloc -- path-like object of the wav file (provides ``as_posix()``)
        file_df -- dataframe of the syllables in this file with the columns
            ``start_time``, ``end_time``, ``high_freq``, ``low_freq`` and
            ``SNR`` (presumably already restricted to ``fn`` and sorted by
            start time -- verify against the caller)

    ``DATASET_ID`` is read from module scope. Outputs:
    ``DATA_DIR/processed/DATASET_ID/<DT_ID>/NOISE/<fn>__<noise start>.WAV`` and
    ``DATA_DIR/processed/DATASET_ID/<DT_ID>/JSON/<fn>.JSON``.
    """
    # wav time
    wavdate = datetime.strptime(fn, "%y%m%d-%H%M")
    wav_date = wavdate.strftime("%Y-%m-%d_%H-%M-%S")

    # wav samplerate and duration
    sr = get_samplerate(wavloc.as_posix())
    wav_duration = librosa.get_duration(filename=wavloc)

    # find the longest stretch of non-vocal behavior in this wav
    noise_start, noise_end = find_longest_nonvocal_stretch(file_df, wav_duration)
    bout_start_string = avgn.utils.general.seconds_to_str(noise_start)

    # determine save locations
    noise_out = (
        DATA_DIR
        / "processed"
        / DATASET_ID
        / DT_ID
        / "NOISE"
        / (fn + "__" + bout_start_string + ".WAV")
    )
    json_out = DATA_DIR / "processed" / DATASET_ID / DT_ID / "JSON" / (fn + ".JSON")

    # wav general information (insertion order is the order of keys in the file)
    # NOTE(review): the accepted spelling of the binomial is "Megaptera
    # novaeangliae"; the string is left untouched because consumers may match
    # on it.
    json_dict = {
        "bout_number": bout_number,
        "species": "Megaptera novaengliae",
        "common_name": "Humpback whale",
        "datetime": wav_date,
        "samplerate_hz": sr,
        "length_s": wav_duration,
        "wav_loc": wavloc.as_posix(),
        "noise_loc": noise_out.as_posix(),
        "indvs": {
            "UNK": {
                "syllables": {
                    "start_times": NoIndent(
                        list(file_df.start_time.values.astype("float"))
                    ),
                    "end_times": NoIndent(list(file_df.end_time.astype("float"))),
                    "high_freq": NoIndent(list(file_df.high_freq.astype("float"))),
                    "low_freq": NoIndent(list(file_df.low_freq.astype("float"))),
                    "SNR": NoIndent(list(file_df.SNR.astype("float"))),
                }
            }
        },
    }

    json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)

    # save wav file: load only the noise segment, at the native sample rate
    # NOTE(review): ``librosa.output`` was removed in librosa 0.8, so this
    # call needs an older librosa release.
    noise_wav, noise_sr = librosa.load(
        wavloc,
        sr=None,
        mono=True,
        offset=noise_start,
        duration=noise_end - noise_start,
    )
    avgn.utils.paths.ensure_dir(noise_out)
    librosa.output.write_wav(noise_out, y=noise_wav, sr=noise_sr, norm=True)

    # save json; the context manager guarantees the handle is flushed and closed
    avgn.utils.paths.ensure_dir(json_out.as_posix())
    with open(json_out.as_posix(), "w") as json_file:
        print(json_txt, file=json_file)
def generate_json(row, DT_ID, noise_indv_df):
    """Generate a json from available wav information for the stowell dataset.

    A background ("bg") recording of the same species, year and ``groundx``
    is looked up in ``noise_indv_df`` and stored as the noise location: the
    one with the same wav number if there is one, otherwise the first
    candidate. When there is no candidate at all, nothing is written.

    Arguments:
        row -- record exposing ``species`` ("chiffchaff", "littleowl" or
            "pipit"), ``indv``, ``year``, ``trntst``, ``wavnum``, ``groundx``
            and ``wavloc`` (path-like object providing ``as_posix()`` and
            ``stem``)
        DT_ID -- identifier used as one directory level of the output path
        noise_indv_df -- dataframe with the columns ``species``, ``year``,
            ``groundx``, ``fgbg``, ``wavnum`` and ``wavloc``

    The file is written to
    ``DATA_DIR/processed/stowell_<species>/<DT_ID>/JSON/<wav stem>.JSON``.

    Raises:
        KeyError -- when ``row.species`` is not one of the three known species
    """
    # derived per species, so this name is local
    DATASET_ID = "stowell_" + row.species

    sr = get_samplerate(row.wavloc.as_posix())
    wav_duration = librosa.get_duration(filename=row.wavloc)

    # scientific names of the species in this dataset
    species = {
        "chiffchaff": "Phylloscopus collybita",
        "littleowl": "Athene noctua",
        "pipit": "Anthus trivialis",
    }

    # make json dictionary (insertion order is the order of keys in the file)
    json_dict = {
        "indvs": {row.indv: {}},
        "species": species[row.species],
        "common_name": row.species,
        "year": row.year,
        # train/test split
        "train": row.trntst,
        "wav_num": int(row.wavnum),
        "wav_loc": row.wavloc.as_posix(),
        "samplerate_hz": sr,
        "length_s": wav_duration,
    }

    # get noise loc: background recordings from the same species/year/groundx
    noise_indv_df = noise_indv_df[(noise_indv_df.species == row.species)]
    noise_indv_df = noise_indv_df[(noise_indv_df.year == row.year)]
    noise_indv_df = noise_indv_df[(noise_indv_df.groundx == row.groundx)]
    noise_indv_df = noise_indv_df[(noise_indv_df.fgbg == 'bg')]

    same_wavnum = noise_indv_df[noise_indv_df.wavnum == row.wavnum]
    if len(same_wavnum) > 0:
        noise_loc = same_wavnum.iloc[0].wavloc.as_posix()
    elif len(noise_indv_df) > 0:
        noise_loc = noise_indv_df.iloc[0].wavloc.as_posix()
    else:
        # no background recording available: skip this file entirely
        return
    json_dict["noise_loc"] = noise_loc

    # dump json
    json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)

    # save information
    json_out = (
        DATA_DIR / "processed" / DATASET_ID / DT_ID / "JSON" / (row.wavloc.stem + ".JSON")
    )

    # save json; the context manager guarantees the handle is flushed and closed
    avgn.utils.paths.ensure_dir(json_out.as_posix())
    with open(json_out.as_posix(), "w") as json_file:
        print(json_txt, file=json_file)
def generate_json(DSLOC, DT_ID, bird, wfn, wfn_df):
    """Write a JSON metadata file describing one Bengalese finch wav file.

    Each row of ``wfn_df`` is treated as one sequence: its note positions and
    lengths are offset by the row's ``Position``, divided by the sample rate,
    and concatenated over all rows together with the note labels and the
    index of the sequence each note came from.

    Arguments:
        DSLOC -- path-like root of the dataset; the wav is read from
            ``DSLOC/<bird>/Wave/<wfn>``
        DT_ID -- identifier used as one directory level of the output path
        bird -- name of the individual (string)
        wfn -- wav file name (string); the part before the first "." is used
            in the json file name
        wfn_df -- dataframe whose rows expose ``NotePositions``,
            ``NoteLengths``, ``NoteLabels`` and ``Position`` (presumably the
            annotation rows of this wav file -- verify against the caller)

    ``DATASET_ID`` is read from module scope. The file is written to
    ``DATA_DIR/processed/DATASET_ID/<DT_ID>/JSON/<bird>_<wav name>.JSON``.
    """
    # wav location
    wav_loc = DSLOC / bird / "Wave" / wfn

    # wav info
    sr = get_samplerate(wav_loc.as_posix())
    wav_duration = librosa.get_duration(filename=wav_loc)

    # make json dictionary
    json_dict = {
        "species": "Lonchura striata domestica",
        "common_name": "Bengalese finch",
        "wav_loc": wav_loc.as_posix(),
        "samplerate_hz": sr,
        "length_s": wav_duration,
    }

    # make a dataframe of wav info: one row per sequence, each cell holding
    # the per-note values of that sequence
    seq_df = pd.DataFrame(
        [
            [
                # sequence index repeated once per note
                list(np.repeat(sequence_num, len(row.NotePositions))),
                list(row.NoteLabels),
                # note onsets: (position within sequence + sequence offset) / sr
                np.array(
                    (np.array(row.NotePositions).astype("int") + int(row.Position))
                    / sr
                ).astype("float64"),
                # note offsets: the onset sample plus the note length, / sr
                np.array(
                    (
                        np.array(row.NotePositions).astype("int")
                        + np.array(row.NoteLengths).astype("int")
                        + int(row.Position)
                    )
                    / sr
                ).astype("float64"),
            ]
            for sequence_num, (_, row) in enumerate(wfn_df.iterrows())
        ],
        columns=["sequence_num", "labels", "start_times", "end_times"],
    )

    # add syllable information, flattened over all sequences
    json_dict["indvs"] = {
        bird: {
            "notes": {
                "start_times": NoIndent(
                    list(np.concatenate(seq_df.start_times.values))
                ),
                "end_times": NoIndent(list(np.concatenate(seq_df.end_times.values))),
                "labels": NoIndent(list(np.concatenate(seq_df.labels.values))),
                "sequence_num": NoIndent(
                    [int(i) for i in np.concatenate(seq_df.sequence_num.values)]
                ),
            }
        }
    }

    # dump json
    json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)

    wav_stem = bird + "_" + wfn.split(".")[0]
    json_out = DATA_DIR / "processed" / DATASET_ID / DT_ID / "JSON" / (wav_stem + ".JSON")

    # save json; the context manager guarantees the handle is flushed and closed
    avgn.utils.paths.ensure_dir(json_out.as_posix())
    with open(json_out.as_posix(), "w") as json_file:
        print(json_txt, file=json_file)
def generate_json(row, DT_ID):
    """Write a JSON metadata file describing one "tachibana" rodent recording.

    The name of the wav's parent directory is split on "_" into a common name
    and, when there are exactly two parts, a condition. The individual id is
    derived from the wav file name with a rule that depends on both. Syllable
    start and end times are read from the first two columns of the headerless
    csv that sits next to the wav.

    Arguments:
        row -- record exposing ``wavloc`` (path-like object providing
            ``parent``, ``stem`` and ``as_posix()``)
        DT_ID -- identifier used as one directory level of the output path

    The file is written to
    ``DATA_DIR/processed/tachibana_<common name>/<DT_ID>/JSON/<wav stem>.JSON``
    and its path is printed.

    Raises:
        ValueError -- when no individual-id rule exists for the common name /
            condition of the recording
    """
    wav = row.wavloc
    cond = wav.parent.stem.split("_")
    if len(cond) == 2:
        common_name, condition = cond
    else:
        common_name = cond[0]
        condition = None

    # individual id, by species (and strain for mice)
    if common_name == "mouse" and condition == "C57BL":
        data_id = wav.stem.split("_")[0]
        indv_id = mouse_id_dict[data_id]
    elif common_name == "mouse" and condition == "BALBc":
        indv_id = wav.stem.split("-")[0]
    elif common_name == "rat":
        indv_id = wav.stem.split("_")[-2]
    elif common_name == "gerbil":
        indv_id = wav.stem
    else:
        # fail with a clear message instead of leaving indv_id unbound
        raise ValueError(
            f"cannot determine individual id for {wav.as_posix()} "
            f"(common_name={common_name!r}, condition={condition!r})"
        )

    # wav info
    sr = get_samplerate(wav.as_posix())
    wav_duration = librosa.get_duration(filename=wav)
    species = species_dict[common_name]

    # get syllable start and end times
    csv = wav.parent / (wav.stem + ".csv")
    voc_df = pd.read_csv(csv, header=None)[[0, 1]]
    voc_df.columns = ["start_time", "end_time"]

    # make json dictionary (insertion order is the order of keys in the file)
    json_dict = {
        "condition": condition,
        "species": species,
        "common_name": common_name,
        "wav_loc": wav.as_posix(),
        "samplerate_hz": sr,
        "length_s": wav_duration,
        # add syllable information
        "indvs": {
            indv_id: {
                "syllables": {
                    "start_times": NoIndent(list(voc_df.start_time.values)),
                    "end_times": NoIndent(list(voc_df.end_time.values)),
                }
            }
        },
    }

    # derived per species, so this name is local
    DATASET_ID = "tachibana_" + common_name

    # dump
    json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)

    wav_stem = wav.stem
    json_out = DATA_DIR / "processed" / DATASET_ID / DT_ID / "JSON" / (wav_stem + ".JSON")
    print(json_out)

    # save json; the context manager guarantees the handle is flushed and closed
    ensure_dir(json_out.as_posix())
    with open(json_out.as_posix(), "w") as json_file:
        print(json_txt, file=json_file)