def make_spec(
    syll_wav,
    fs,
    hparams,
    mel_matrix=None,
    use_tensorflow=False,
    use_mel=True,
    return_tensor=False,
    norm_uint8=False,
):
    """Compute a (optionally mel-scaled) spectrogram of a single syllable.

    Arguments:
        syll_wav -- waveform of the syllable (integer or float samples)
        fs -- sample rate of the waveform
        hparams -- spectrogram hyperparameters

    Keyword Arguments:
        mel_matrix -- mel filterbank to apply when use_mel is True (default: {None})
        use_tensorflow {bool} -- compute the spectrogram with tensorflow (default: {False})
        use_mel {bool} -- project the spectrogram onto the mel basis (default: {True})
        return_tensor {bool} -- return a tf.Tensor instead of a numpy array (default: {False})
        norm_uint8 {bool} -- normalize to [0, 255] and cast to uint8 (default: {False})

    Returns:
        spec -- the spectrogram
    """
    if use_tensorflow:
        import tensorflow as tf
        from avgn.signalprocessing.spectrogramming_tf import spectrogram_tensorflow

    # convert integer samples to float (checking type(syll_wav[0]) == int is
    # never true for numpy integer samples, so check the dtype instead)
    if np.issubdtype(np.asarray(syll_wav).dtype, np.integer):
        syll_wav = int16_to_float32(syll_wav)

    # create spec
    if use_tensorflow:
        spec = spectrogram_tensorflow(syll_wav, fs, hparams)
        if use_mel:
            spec = tf.transpose(tf.tensordot(spec, mel_matrix, 1))
        if not return_tensor:
            spec = spec.numpy()
    else:
        spec = spectrogram(syll_wav, fs, hparams)
        if use_mel:
            spec = np.dot(spec.T, mel_matrix).T

    if norm_uint8:
        spec = (norm(spec) * 255).astype("uint8")

    return spec
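# A minimal usage sketch (not from the source): calling make_spec on a short
# synthetic syllable with use_mel=False so no mel_matrix is needed. _HParams is
# a hypothetical stand-in for the hparams object used throughout; its field
# values are taken from the defaults documented in dynamic_threshold_segmentation
# below.
class _HParams:
    sample_rate = 44100
    n_fft = 1024
    hop_length_ms = 1
    win_length_ms = 5
    ref_level_db = 20
    min_level_db = -80
    preemphasis = 0.97

fs = 44100
syll = np.random.randn(fs // 10).astype("float32")  # 100 ms of noise as a stand-in syllable
spec = make_spec(syll, fs, _HParams(), use_mel=False, norm_uint8=True)
print(spec.shape, spec.dtype)  # (frequency bins, time frames), uint8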
file_current = 'tutor_bl5w5_0017.WAV'
rate, data_loaded = load_wav(mypath + '\\' + file_current)
data = data_loaded
times = np.linspace(0, len(data) / rate, len(data))

# filter data
data = butter_bandpass_filter(data, butter_min, butter_max, rate)
plt.plot(times, data)

hparams.ref_level_db = 90
spec_orig = spectrogram(data, rate, hparams)
plot_spec(
    norm(spec_orig),
    fig=None,
    ax=None,
    rate=None,
    hop_len_ms=None,
    cmap=plt.cm.afmhot,
    show_cbar=True,
    figsize=(20, 6),
)

# segment
results = dynamic_threshold_segmentation(data, hparams)
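# A minimal sketch (not from the source) of inspecting the segmentation result.
# The dict keys come from dynamic_threshold_segmentation's return value below;
# everything else here is illustrative.
if results is not None:
    plt.figure(figsize=(20, 3))
    env_times = np.linspace(0, len(data) / rate, len(results["vocal_envelope"]))
    plt.plot(env_times, results["vocal_envelope"])
    for on, off in zip(results["onsets"], results["offsets"]):
        plt.axvspan(on, off, color="red", alpha=0.3)  # mark each detected syllable
    plt.show()
else:
    print("no acceptable threshold found; clip rejected")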
def dynamic_threshold_segmentation(vocalization,
                                   hparams,
                                   verbose=False,
                                   min_syllable_length_s=0.1,
                                   spectral_range=None):
    """ computes a spectrogram from a waveform by iterating through thresholds
        to ensure a consistent noise level

    Arguments:
        vocalization {[type]} -- waveform of song
        hparams -- hyperparameters, including:
            sample_rate {int} -- sample rate of the data
            min_level_db {int} -- default dB minimum of spectrogram (threshold anything below) (default: {-80})
            min_level_db_floor {int} -- highest value min_level_db is allowed to reach dynamically (default: {-40})
            db_delta {int} -- step size when raising min_level_db (default: {5})
            n_fft {int} -- FFT window size (default: {1024})
            hop_length_ms {int} -- hop between STFT columns, in ms (default: {1})
            win_length_ms {int} -- size of the FFT window, in ms (default: {5})
            ref_level_db {int} -- reference level dB of audio (default: {20})
            preemphasis {float} -- coefficient for the preemphasis filter (default: {0.97})
            min_silence_for_spec {float} -- shortest expected length of silence in a song
                (used to set the dynamic threshold) (default: {0.1})
            silence_threshold {float} -- envelope threshold below which the signal is
                considered silence (default: {0.05})
            max_vocal_for_spec {float} -- longest expected vocalization, in seconds (default: {1.0})

    Keyword Arguments:
        verbose {bool} -- display output (default: {False})
        min_syllable_length_s {float} -- shortest expected length of a syllable (default: {0.1})
        spectral_range {[type]} -- spectral range (Hz) to keep in the spectrogram (default: {None})

    Returns:
        [results] -- dictionary of results (spec, vocal_envelope, min_level_db,
            onsets, offsets), or None if no acceptable threshold is found
    """
    rate = hparams.sample_rate
    n_fft = hparams.n_fft
    hop_length_ms = hparams.hop_length_ms
    win_length_ms = hparams.win_length_ms
    min_level_db = hparams.min_level_db
    min_level_db_floor = hparams.min_level_db_floor
    db_delta = hparams.db_delta
    ref_level_db = hparams.ref_level_db
    pre = hparams.preemphasis
    min_silence_for_spec = hparams.min_silence_for_spec
    max_vocal_for_spec = hparams.max_vocal_for_spec
    silence_threshold = hparams.silence_threshold

    # does the envelope meet the standards necessary to consider this a bout
    envelope_is_good = False

    # make a spectrogram of the original vocalization
    spec_orig = spectrogram(vocalization, rate, hparams)

    # frames per second of the spectrogram
    fft_rate = 1000 / hop_length_ms

    # restrict the spectrogram to the spectral range of interest
    if spectral_range is not None:
        spec_bin_hz = (rate / 2) / np.shape(spec_orig)[0]
        spec_orig = spec_orig[
            int(spectral_range[0] / spec_bin_hz): int(spectral_range[1] / spec_bin_hz), :
        ]

    # loop through possible thresholds, starting at the default min_level_db
    # and raising it toward min_level_db_floor
    for mldb in tqdm(
        np.arange(min_level_db, min_level_db_floor, db_delta),
        leave=False,
        disable=(not verbose),
    ):
        # set the minimum dB threshold
        min_level_db = mldb
        # threshold and normalize the spectrogram (without re-applying the
        # threshold here, every iteration would see the same spectrogram)
        spec = norm(_normalize(spec_orig, min_level_db=min_level_db))
        # subtract the median of each frequency band
        spec = spec - np.median(spec, axis=1).reshape((len(spec), 1))
        spec[spec < 0] = 0

        # get the vocal envelope
        vocal_envelope = np.max(spec, axis=0) * np.sqrt(np.mean(spec, axis=0))
        # normalize envelope
        vocal_envelope = vocal_envelope / np.max(vocal_envelope)

        # look at how much silence exists in the signal
        onsets, offsets = onsets_offsets(vocal_envelope > silence_threshold) / fft_rate
        onsets_sil, offsets_sil = (
            onsets_offsets(vocal_envelope <= silence_threshold) / fft_rate
        )

        # if there is a silence of at least min_silence_for_spec in length,
        # and no vocalization longer than max_vocal_for_spec, the envelope is good
        if len(onsets_sil) > 0:
            # longest silences and periods of vocalization
            max_silence_len = np.max(offsets_sil - onsets_sil)
            max_vocalization_len = np.max(offsets - onsets)
            if verbose:
                print("longest silence", max_silence_len)
                print("longest vocalization", max_vocalization_len)

            if max_silence_len > min_silence_for_spec:
                if max_vocalization_len < max_vocal_for_spec:
                    envelope_is_good = True
                    break
        if verbose:
            print("Current min_level_db: {}".format(min_level_db))

    if not envelope_is_good:
        return None

    onsets, offsets = onsets_offsets(vocal_envelope > silence_threshold) / fft_rate

    # threshold out short syllables
    length_mask = (offsets - onsets) >= min_syllable_length_s

    return {
        "spec": spec,
        "vocal_envelope": vocal_envelope.astype("float32"),
        "min_level_db": min_level_db,
        "onsets": onsets[length_mask],
        "offsets": offsets[length_mask],
    }
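# onsets_offsets is used above but not defined in this excerpt. Below is a
# minimal sketch of what such a helper computes (an assumption, not avgn's
# implementation): the frame indices where a boolean mask switches on and off,
# as a (2, N) array so the result can be divided by fft_rate and unpacked.
def onsets_offsets_sketch(mask):
    """Return a (2, N) array of [onset, offset] indices of True runs in mask."""
    # pad with zeros so runs touching either end still produce paired edges
    padded = np.concatenate([[0], mask.astype(int), [0]])
    edges = np.diff(padded)
    onsets = np.where(edges == 1)[0]    # index where a True run begins
    offsets = np.where(edges == -1)[0]  # index just past where the run ends
    return np.vstack([onsets, offsets])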
def process_bird_wav(
    bird,
    wav_info,
    wav_time,
    params,
    save_to_folder,
    visualize=False,
    skip_created=False,
    seconds_timeout=300,
    save_spectrograms=True,
    verbose=False,
):
    """splits a wav file into periods of silence and periods of sound based on params
    """
    # load up the WAV
    rate, data = load_wav(wav_info)
    if rate is None or data is None:
        return
    params["sample_rate"] = rate

    # bandpass filter
    data = butter_bandpass_filter(
        data.astype("float32"), params["lowcut"], params["highcut"], rate, order=2
    )
    data = float32_to_int16(data)

    # we only want one channel
    if len(np.shape(data)) == 2:
        data = data[:, 0]

    # threshold the (root mean squared of the) audio
    rms_data, sound_threshed = RMS(
        data,
        rate,
        params["rms_stride"],
        params["rms_window"],
        params["rms_padding"],
        params["noise_thresh"],
    )

    # find the onsets/offsets of sound
    onset_sounds, offset_sounds = detect_onsets_offsets(
        np.repeat(sound_threshed, int(params["rms_stride"] * rate)),
        threshold=0,
        min_distance=0,
    )
    # make sure all sound onsets are at least zero (due to downsampling in RMS)
    onset_sounds[onset_sounds < 0] = 0

    # bin width in Hz of the spectrogram
    freq_step_size_Hz = (rate / 2) / params["num_freq"]

    # threshold clips of sound
    for onset_sound, offset_sound in zip(onset_sounds, offset_sounds):
        # segment the clip
        clip = data[onset_sound:offset_sound]

        # if the clip is thresholded as noise, do not save it into the dataset
        bout_spec = threshold_clip(
            clip, rate, freq_step_size_Hz, params, visualize=visualize, verbose=verbose
        )
        if bout_spec is None:
            # visualize the spectrogram of the rejected clip if desired
            if visualize:
                wav_spectrogram = spectrogram(int16_to_float32(clip), params)
                visualize_spec(wav_spectrogram, show=True)
            continue

        # determine the datetime of this clip
        start_time = wav_time + timedelta(seconds=onset_sound / float(rate))
        time_string = start_time.strftime("%Y-%m-%d_%H-%M-%S-%f")

        # create a subfolder for the individual bird if it doesn't already exist
        bird_folder = Path(save_to_folder).resolve() / bird
        ensure_dir(bird_folder)

        # save the clip's waveform
        save_bout_wav(clip, rate, bird_folder, bird, wav_info, time_string, skip_created)

        # save the spectrogram of the clip
        if save_spectrograms:
            save_bout_spec(bird_folder, bout_spec, time_string, skip_created)
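# A sketch of the params dict that process_bird_wav and threshold_clip read from.
# The key names are taken from the code above and below; the values are
# illustrative guesses, not tuned defaults from the source.
example_params = {
    "lowcut": 500,                # bandpass filter lower edge (Hz)
    "highcut": 15000,             # bandpass filter upper edge (Hz)
    "rms_stride": 0.01,           # stride (s) of the RMS envelope
    "rms_window": 0.05,           # window (s) of the RMS envelope
    "rms_padding": 0.1,           # padding (s) around detected sound
    "noise_thresh": 0.05,         # threshold below which power counts as silence
    "num_freq": 512,              # number of frequency bins in the spectrogram
    "min_segment_length_s": 0.5,  # reject clips shorter than this
    "max_segment_length_s": 10.0, # reject clips longer than this
    "vocal_range_Hz": (500, 12000),  # expected frequency range of vocalization
    "min_silence_pct": 0.05,      # minimum fraction of silence within a bout
}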
def threshold_clip(clip, rate, freq_step_size_Hz, params, visualize=False, verbose=False):
    """ determines whether a clip is a bout or noise, based on threshold parameters
    """
    # get the length of the segment
    segment_length = len(clip) / float(rate)

    # return if the clip is the wrong length
    if segment_length <= params["min_segment_length_s"]:
        if verbose:
            print("Segment length {} less than minimum of {}".format(
                segment_length, params["min_segment_length_s"]))
        return
    if segment_length >= params["max_segment_length_s"]:
        if verbose:
            print("Segment length {} greater than maximum of {}".format(
                segment_length, params["max_segment_length_s"]))
        return

    # compute spectrogram of clip
    wav_spectrogram = spectrogram(int16_to_float32(clip), params)

    # determine the power of the spectral envelope
    norm_power = np.mean(wav_spectrogram, axis=0)
    norm_power = (norm_power - np.min(norm_power)) / (
        np.max(norm_power) - np.min(norm_power)
    )

    # get the maximum power region of the frequency envelope
    peak_power_Hz = np.argmax(norm_power) * freq_step_size_Hz

    # threshold on the location of peak power
    if peak_power_Hz < params["vocal_range_Hz"][0]:
        if verbose:
            print("Peak power {} Hz less than minimum of {}".format(
                peak_power_Hz, params["vocal_range_Hz"][0]))
        return

    # threshold based on silence
    vocal_power = zero_one_norm(
        np.sum(
            wav_spectrogram[
                :,
                int(params["vocal_range_Hz"][0] / freq_step_size_Hz):
                int(params["vocal_range_Hz"][1] / freq_step_size_Hz),
            ],
            axis=1,
        )
    )
    # the fraction of the clip below the noise threshold
    pct_silent = np.sum(vocal_power <= params["noise_thresh"]) / float(len(vocal_power))
    if pct_silent < params["min_silence_pct"]:
        if verbose:
            print("Percent silent {} less than minimum of {}".format(
                pct_silent, params["min_silence_pct"]))
        return

    if visualize:
        visualize_spec(wav_spectrogram, show=True)

    return wav_spectrogram
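# A hypothetical smoke test for threshold_clip (a sketch, not from the source):
# a 3 kHz tone embedded in silence should pass the length, peak-power, and
# silence checks. Uses the illustrative example_params above; float32_to_int16
# and the two-argument spectrogram helper are assumed from the surrounding code.
rate = 44100
example_params["sample_rate"] = rate  # normally set by process_bird_wav
t = np.linspace(0, 1.0, rate, endpoint=False)
tone = np.zeros_like(t, dtype="float32")
tone[rate // 4: rate // 2] = np.sin(
    2 * np.pi * 3000.0 * t[rate // 4: rate // 2]
).astype("float32")
clip = float32_to_int16(tone)
freq_step_size_Hz = (rate / 2) / example_params["num_freq"]
bout_spec = threshold_clip(clip, rate, freq_step_size_Hz, example_params, verbose=True)
print("kept as bout" if bout_spec is not None else "rejected as noise")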