Code example #1
def make_spec(
    syll_wav,
    fs,
    hparams,
    mel_matrix=None,
    use_tensorflow=False,
    use_mel=True,
    return_tensor=False,
    norm_uint8=False,
):
    """
    """
    if use_tensorflow:
        import tensorflow as tf
        from avgn.signalprocessing.spectrogramming_tf import spectrogram_tensorflow
    # convert integer (e.g. int16) audio to float32
    if np.issubdtype(type(syll_wav[0]), np.integer):
        syll_wav = int16_to_float32(syll_wav)

    # create spec

    if use_tensorflow:
        spec = spectrogram_tensorflow(syll_wav, fs, hparams)
        if use_mel:
            spec = tf.transpose(tf.tensordot(spec, mel_matrix, 1))
            if not return_tensor:
                spec = spec.numpy()
    else:
        spec = spectrogram(syll_wav, fs, hparams)
        if use_mel:
            spec = np.dot(spec.T, mel_matrix).T
    if norm_uint8:
        spec = (norm(spec) * 255).astype("uint8")

    return spec
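
A minimal usage sketch (not from the source): it reuses the `hparams` object and the `np`/`plt` imports from the surrounding notebook, and assumes `syll_wav` is a syllable waveform at sample rate `fs`. From the dot product above, a supplied `mel_matrix` would need shape (n_freq_bins, n_mels).

# hypothetical call: linear-frequency spectrogram scaled to uint8 for plotting
spec = make_spec(syll_wav, fs, hparams, use_mel=False, norm_uint8=True)
# spec is a (frequency bins x time frames) uint8 array
plt.imshow(spec, origin="lower", aspect="auto", cmap=plt.cm.afmhot)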
Code example #2
file_current = 'tutor_bl5w5_0017.WAV'


rate, data_loaded = load_wav(mypath+'\\'+file_current)
data = data_loaded
times = np.linspace(0, len(data) / rate, len(data))


# filter data
data = butter_bandpass_filter(data, butter_min, butter_max, rate)
plt.plot(times,data)


hparams.ref_level_db = 90
spec_orig = spectrogram(data,
                            rate,
                            hparams)
plot_spec(
    norm(spec_orig),
    fig=None,
    ax=None,
    rate=None,
    hop_len_ms=None,
    cmap=plt.cm.afmhot,
    show_cbar=True,
    figsize=(20, 6),
)

# segment
results = dynamic_threshold_segmentation(data,
                                          hparams)
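
If segmentation succeeds, `dynamic_threshold_segmentation` (shown in code example #3 below) returns a dictionary with the normalized spectrogram, the vocal envelope, and onset/offset times in seconds; otherwise it returns None. A small sketch, under those assumptions, that overlays the detected segments on the envelope:

if results is not None:
    envelope = results["vocal_envelope"]
    # approximate frame times: the envelope has one value per spectrogram column
    env_times = np.linspace(0, len(data) / rate, len(envelope))
    plt.figure(figsize=(20, 3))
    plt.plot(env_times, envelope)
    # shade each detected syllable
    for onset, offset in zip(results["onsets"], results["offsets"]):
        plt.axvspan(onset, offset, alpha=0.3, color="red")
    plt.xlabel("time (s)")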
Code example #3
def dynamic_threshold_segmentation(vocalization,
                                   hparams,
                                   verbose=False,
                                   min_syllable_length_s=0.1,
                                   spectral_range=None):
    """
    computes a spectrogram from a waveform by iterating through thresholds
         to ensure a consistent noise level
    
    Arguments:
        vocalization {[type]} -- waveform of song
        rate {[type]} -- samplerate of datas
    
    Keyword Arguments:
        min_level_db {int} -- default dB minimum of spectrogram (threshold anything below) (default: {-80})
        min_level_db_floor {int} -- highest number min_level_db is allowed to reach dynamically (default: {-40})
        db_delta {int} -- delta in setting min_level_db (default: {5})
        n_fft {int} -- FFT window size (default: {1024})
        hop_length_ms {int} -- number audio of frames in ms between STFT columns (default: {1})
        win_length_ms {int} -- size of fft window (ms) (default: {5})
        ref_level_db {int} -- reference level dB of audio (default: {20})
        pre {float} -- coefficient for preemphasis filter (default: {0.97})
        min_syllable_length_s {float} -- shortest expected length of syllable (default: {0.1})
        min_silence_for_spec {float} -- shortest expected length of silence in a song (used to set dynamic threshold) (default: {0.1})
        silence_threshold {float} -- threshold for spectrogram to consider noise as silence (default: {0.05})
        max_vocal_for_spec {float} -- longest expected vocalization in seconds  (default: {1.0})
        spectral_range {[type]} -- spectral range to care about for spectrogram (default: {None})
        verbose {bool} -- display output (default: {False})
    
    
    Returns:
        [results] -- [dictionary of results]
    """
    rate = hparams.sample_rate
    n_fft = hparams.n_fft
    hop_length_ms = hparams.hop_length_ms
    win_length_ms = hparams.win_length_ms
    min_level_db = hparams.min_level_db
    min_level_db_floor = hparams.min_level_db_floor
    db_delta = hparams.db_delta
    ref_level_db = hparams.ref_level_db
    pre = hparams.preemphasis
    min_silence_for_spec = hparams.min_silence_for_spec
    max_vocal_for_spec = hparams.max_vocal_for_spec
    silence_threshold = hparams.silence_threshold

    # does the envelope meet the standards necessary to consider this a bout
    envelope_is_good = False

    # compute the original spectrogram of the vocalization
    spec_orig = spectrogram(vocalization, rate, hparams)

    fft_rate = 1000 / hop_length_ms  # spectrogram frames per second

    if spectral_range is not None:
        spec_bin_hz = (rate / 2) / np.shape(spec_orig)[0]
        spec_orig = spec_orig[int(spectral_range[0] /
                                  spec_bin_hz):int(spectral_range[1] /
                                                   spec_bin_hz), :, ]

    # loop through possible thresholding configurations,
    # raising min_level_db by db_delta each iteration
    for mldb in tqdm(
            np.arange(min_level_db, min_level_db_floor, db_delta),
            leave=False,
            disable=(not verbose),
    ):
        # set the minimum dB threshold
        min_level_db = mldb
        # normalize the spectrogram
        # spec = norm(_normalize(spec_orig, min_level_db=min_level_db))
        spec = norm(spec_orig)

        # subtract the median
        spec = spec - np.median(spec, axis=1).reshape((len(spec), 1))
        spec[spec < 0] = 0

        # get the vocal envelope
        vocal_envelope = np.max(spec, axis=0) * np.sqrt(np.mean(spec, axis=0))
        # normalize envelope
        vocal_envelope = vocal_envelope / np.max(vocal_envelope)

        # Look at how much silence exists in the signal
        onsets, offsets = onsets_offsets(
            vocal_envelope > silence_threshold) / fft_rate
        onsets_sil, offsets_sil = (
            onsets_offsets(vocal_envelope <= silence_threshold) / fft_rate)

        # if there is a silence of at least min_silence_for_spec length,
        #  and a vocalization of no greater than max_vocal_for_spec length, the env is good
        if len(onsets_sil) > 0:
            # longest silences and periods of vocalization
            max_silence_len = np.max(offsets_sil - onsets_sil)
            max_vocalization_len = np.max(offsets - onsets)
            if verbose:
                print("longest silence", max_silence_len)
                print("longest vocalization", max_vocalization_len)

            if max_silence_len > min_silence_for_spec:
                if max_vocalization_len < max_vocal_for_spec:
                    envelope_is_good = True
                    break
        if verbose:
            print("Current min_level_db: {}".format(min_level_db))

    if not envelope_is_good:
        return None

    onsets, offsets = onsets_offsets(
        vocal_envelope > silence_threshold) / fft_rate

    # threshold out short syllables
    length_mask = (offsets - onsets) >= min_syllable_length_s

    return {
        "spec": spec,
        "vocal_envelope": vocal_envelope.astype("float32"),
        "min_level_db": min_level_db,
        "onsets": onsets[length_mask],
        "offsets": offsets[length_mask],
    }
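
A hedged usage sketch, reusing `data` and `hparams` from code example #2: onsets and offsets are returned in seconds, so converting them to spectrogram columns with the frame rate (1000 / hop_length_ms, as in the function body) cuts out individual syllable spectrograms. Variable names are illustrative.

results = dynamic_threshold_segmentation(data, hparams, verbose=True)
if results is not None:
    fft_rate = 1000 / hparams.hop_length_ms  # spectrogram frames per second
    syllable_specs = [
        results["spec"][:, int(onset * fft_rate):int(offset * fft_rate)]
        for onset, offset in zip(results["onsets"], results["offsets"])
    ]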
Code example #4
def process_bird_wav(
    bird,
    wav_info,
    wav_time,
    params,
    save_to_folder,
    visualize=False,
    skip_created=False,
    seconds_timeout=300,
    save_spectrograms=True,
    verbose=False,
):
    """splits a wav file into periods of silence and periods of sound based on params
    """
    # Load up the WAV
    rate, data = load_wav(wav_info)
    if rate is None or data is None:
        return
    params["sample_rate"] = rate

    # we only want one channel; select it before filtering so the bandpass
    # filter runs over samples rather than across channels
    if len(np.shape(data)) == 2:
        data = data[:, 0]

    # bandpass filter
    data = butter_bandpass_filter(data.astype("float32"),
                                  params["lowcut"],
                                  params["highcut"],
                                  rate,
                                  order=2)
    data = float32_to_int16(data)

    # threshold the (root mean squared of the) audio
    rms_data, sound_threshed = RMS(
        data,
        rate,
        params["rms_stride"],
        params["rms_window"],
        params["rms_padding"],
        params["noise_thresh"],
    )
    # Find the onsets/offsets of sound
    onset_sounds, offset_sounds = detect_onsets_offsets(
        np.repeat(sound_threshed, int(params["rms_stride"] * rate)),
        threshold=0,
        min_distance=0,
    )
    # make sure all onset sounds are at least zero (due to downsampling in RMS)
    onset_sounds[onset_sounds < 0] = 0

    # threshold clips of sound
    for onset_sound, offset_sound in zip(onset_sounds, offset_sounds):

        # segment the clip
        clip = data[onset_sound:offset_sound]
        # if the clip is thresholded as noise, do not save it into the dataset
        # bin width in Hz of spectrogram
        freq_step_size_Hz = (rate / 2) / params["num_freq"]
        bout_spec = threshold_clip(clip,
                                   rate,
                                   freq_step_size_Hz,
                                   params,
                                   visualize=visualize,
                                   verbose=verbose)
        if bout_spec is None:
            # visualize spectrogram if desired
            if visualize:
                # compute spectrogram of clip
                wav_spectrogram = spectrogram(int16_to_float32(clip), params)
                visualize_spec(wav_spectrogram, show=True)
            continue

        # determine the datetime of this clip
        start_time = wav_time + timedelta(seconds=onset_sound / float(rate))
        time_string = start_time.strftime("%Y-%m-%d_%H-%M-%S-%f")

        # create a subfolder for the individual bird if it doesn't already exist
        bird_folder = Path(save_to_folder).resolve() / bird
        ensure_dir(bird_folder)

        # save data
        save_bout_wav(data, rate, bird_folder, bird, wav_info, time_string,
                      skip_created)

        # save the spectrogram of the data
        if save_spectrograms:
            save_bout_spec(bird_folder, bout_spec, time_string, skip_created)
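
A hedged sketch of how this function might be driven. It assumes a `params` dictionary containing the keys read above (sample_rate is filled in by the function itself; lowcut, highcut, rms_stride, rms_window, rms_padding, noise_thresh, num_freq, and the thresholds used by `threshold_clip`), a `data_folder` of raw recordings, and that the recording time can be taken from the file's modification time; all of these are illustrative choices, not the source's pipeline.

from datetime import datetime
from pathlib import Path

wav_files = sorted(Path(data_folder).glob("*.WAV"))  # data_folder is assumed
for wav_path in wav_files:
    # illustrative: take the recording time from the file's modification time
    wav_time = datetime.fromtimestamp(wav_path.stat().st_mtime)
    process_bird_wav(
        "bl5w5",                      # example bird identifier
        wav_path,
        wav_time,
        params,
        save_to_folder="segmented_bouts",
        visualize=False,
        verbose=True,
    )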
Code example #5
def threshold_clip(clip,
                   rate,
                   freq_step_size_Hz,
                   params,
                   visualize=False,
                   verbose=False):
    """ determines if a clip is a bout, or noise based on threshold parameters
    """
    # get the length of the segment
    segment_length = len(clip) / float(rate)

    # return if the clip is the wrong length
    if segment_length <= params["min_segment_length_s"]:
        if verbose:
            print("Segment length {} less than minimum of {}".format(
                segment_length, params["min_segment_length_s"]))
        return
    if segment_length >= params["max_segment_length_s"]:
        if verbose:
            print("Segment length {} greather than maximum of {}".format(
                segment_length, params["max_segment_length_s"]))
        return

    # compute spectrogram of clip
    wav_spectrogram = spectrogram(int16_to_float32(clip), params)
    # determine the power of the spectral envelope
    norm_power = np.mean(wav_spectrogram, axis=0)
    norm_power = (norm_power - np.min(norm_power)) / (np.max(norm_power) -
                                                      np.min(norm_power))

    # get the maximum power region of the frequency envelope
    peak_power_Hz = np.argmax(norm_power) * freq_step_size_Hz

    # threshold for the location of peak power
    if peak_power_Hz < params["vocal_range_Hz"][0]:
        if verbose:
            print("Peak power {} Hz less than minimum of {}".format(
                peak_power_Hz, params["vocal_range_Hz"][0]))
        return

    # threshold based on silence
    vocal_power = zero_one_norm(
        np.sum(
            wav_spectrogram[:,
                            int(params["vocal_range_Hz"][0] / freq_step_size_Hz
                                ):int(params["vocal_range_Hz"][1] /
                                      freq_step_size_Hz), ],
            axis=1,
        ))
    # the percent of the spectrogram below the noise threshold
    pct_silent = np.sum(vocal_power <= params["noise_thresh"]) / float(
        len(vocal_power))
    if pct_silent < params["min_silence_pct"]:
        if verbose:
            print("Percent silent {} /% less than maximum of {}".format(
                pct_silent, params["min_silence_pct"]))
        return

    if visualize:
        visualize_spec(wav_spectrogram, show=True)

    # the clip passed all thresholds; return its spectrogram
    return wav_spectrogram
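
Because the function returns the clip's spectrogram when every threshold passes and None otherwise, callers can test the return value directly, as `process_bird_wav` above does. A hedged call sketch, assuming `clip`, `rate`, and `params` come from that context:

# bin width in Hz of the spectrogram, as computed in process_bird_wav
freq_step_size_Hz = (rate / 2) / params["num_freq"]
bout_spec = threshold_clip(clip, rate, freq_step_size_Hz, params, verbose=True)
if bout_spec is None:
    print("clip rejected as noise or as the wrong length")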