def _get_specs(audio_dirs, seg_dirs, p, n_samples=None, max_len=None):
    """
    Make a bunch of spectrograms.

    Parameters
    ----------
    audio_dirs : list of str
        Directories containing audio files.
    seg_dirs : list of str
        Directories containing segmenting decisions.
    p : dict
        Segmenting parameters. TO DO: ADD REFERENCE!
    n_samples : {int, None}, optional
        Maximum number of spectrograms to collect. If ``None``, one
        spectrogram is made per segment in every file. Defaults to ``None``.
    max_len : {int, None}, optional
        Width (time bins) to zero-pad/truncate every spectrogram to. If
        ``None``, the width of the longest collected spectrogram is used.
        Defaults to ``None``.

    Returns
    -------
    specs : list of numpy.ndarray
        Spectrograms, all with shape ``(n_freq_bins, max_len)``.
    max_len : int
        Number of time bins in each returned spectrogram.
    all_fns : list of str
        Basename of the segment file each spectrogram came from.
    """
    # Get the filenames.
    audio_fns, seg_fns = get_audio_seg_filenames(audio_dirs, seg_dirs)
    # Reproducibly shuffle, then restore a nondeterministic RNG state.
    audio_fns, seg_fns = np.array(audio_fns), np.array(seg_fns)
    np.random.seed(42)
    perm = np.random.permutation(len(audio_fns))
    np.random.seed(None)
    audio_fns, seg_fns = audio_fns[perm], seg_fns[perm]
    # Collect spectrograms.
    specs, all_fns = [], []
    for audio_fn, seg_fn in zip(audio_fns, seg_fns):
        onsets, offsets = _read_onsets_offsets(seg_fn)
        fs, audio = wavfile.read(audio_fn)
        for onset, offset in zip(onsets, offsets):
            i1, i2 = int(onset * fs), int(offset * fs)
            assert i1 >= 0, audio_fn + ", " + seg_fn
            spec, _, _ = get_spec(audio[i1:i2], p)
            specs.append(spec)
            all_fns.append(os.path.split(seg_fn)[-1])
            # BUG FIX: the original compared len(specs) >= n_samples even
            # when n_samples was None (the default), raising a TypeError.
            if n_samples is not None and len(specs) >= n_samples:
                break
        if n_samples is not None and len(specs) >= n_samples:
            break
    # Zero-pad (and truncate) to a common width.
    assert len(specs) > 0, "Found no spectrograms!"
    n_freq_bins = specs[0].shape[0]
    if max_len is None:
        max_len = max(spec.shape[1] for spec in specs)
    for i in range(len(specs)):
        padded = np.zeros((n_freq_bins, max_len))
        # BUG FIX: when a caller-supplied max_len is shorter than a
        # spectrogram, the original assignment had mismatched widths on the
        # two sides and raised a ValueError.
        width = min(specs[i].shape[1], max_len)
        padded[:, :width] = specs[i][:, :width]
        specs[i] = padded
    return specs, max_len, all_fns
def get_onsets_offsets(audio, p, return_traces=False):
    """
    Segment the spectrogram using thresholds on its amplitude.

    Parameters
    ----------
    audio : numpy.ndarray
        Raw audio samples.
    p : dict
        Parameters.
    return_traces : bool, optional
        Whether to return traces. Defaults to `False`.

    Returns
    -------
    onsets : numpy array
        Onset times, in seconds
    offsets : numpy array
        Offset times, in seconds
    traces : list of a single numpy array
        The amplitude trace used in segmenting decisions. Returned if
        return_traces is `True`.
    """
    spec, dt, _ = get_spec(audio, p)
    # Syllable-duration limits expressed in spectrogram time bins.
    min_syll_len = int(np.floor(p['min_dur'] / dt))
    max_syll_len = int(np.ceil(p['max_dur'] / dt))
    th_1, th_2, th_3 = p['th_1'], p['th_2'], p['th_3']  # thresholds
    smoothing_time = p['smoothing_timescale'] / dt  # in time bins
    onsets, offsets = [], []
    too_short, too_long = 0, 0  # NOTE(review): counted but never reported
    # Collapse the spectrogram to a per-time-bin amplitude trace.
    if p['softmax']:
        amps = _softmax(spec, t=p['temperature'])
    else:
        amps = np.sum(spec, axis=0)
    # Smooth.
    amps = gaussian_filter(amps, smoothing_time)
    # Find local maxima greater than th_3.
    local_maxima = []
    for i in range(1, len(amps) - 1, 1):
        if amps[i] > th_3 and amps[i] == np.max(amps[i - 1:i + 2]):
            local_maxima.append(i)
    # Then search to the left and right for onsets and offsets.
    for local_max in local_maxima:
        # Skip maxima that fall inside an already-detected syllable.
        # NOTE(review): `> 1` leaves the first detected syllable unprotected
        # against overlap — possibly should be `> 0`; confirm intent.
        if len(offsets) > 1 and local_max < offsets[-1]:
            continue
        # Walk left for the onset: first bin below th_1, or a local minimum
        # below th_2, whichever comes first.
        i = local_max - 1
        while i > 0:
            if amps[i] < th_1:
                onsets.append(i)
                break
            elif amps[i] < th_2 and amps[i] == np.min(amps[i - 1:i + 2]):
                onsets.append(i)
                break
            i -= 1
        # No onset found: discard this candidate syllable.
        if len(onsets) != len(offsets) + 1:
            onsets = onsets[:len(offsets)]
            continue
        # Walk right analogously for the offset.
        i = local_max + 1
        while i < len(amps):
            if amps[i] < th_1:
                offsets.append(i)
                break
            elif amps[i] < th_2 and amps[i] == np.min(amps[i - 1:i + 2]):
                offsets.append(i)
                break
            i += 1
        # No offset found: discard this candidate syllable.
        if len(onsets) != len(offsets):
            onsets = onsets[:len(offsets)]
            continue
    # Throw away syllables that are too long or too short.
    new_onsets = []
    new_offsets = []
    for i in range(len(offsets)):
        t1, t2 = onsets[i], offsets[i]
        if t2 - t1 + 1 <= max_syll_len and t2 - t1 + 1 >= min_syll_len:
            new_onsets.append(t1 * dt)  # convert time bins -> seconds
            new_offsets.append(t2 * dt)
        elif t2 - t1 + 1 > max_syll_len:
            too_long += 1
        else:
            too_short += 1
    # Return decisions.
    if return_traces:
        return new_onsets, new_offsets, [amps]
    return new_onsets, new_offsets
def get_onsets_offsets(audio, p, return_traces=False):
    """
    Segment the spectrogram using thresholds on its amplitude.

    A syllable is detected if the amplitude trace exceeds `p['th_3']`. An
    offset is then detected if there is a subsequent local minimum in the
    amplitude trace with amplitude less than `p['th_2']`, or when the
    amplitude drops below `p['th_1']`, whichever comes first. Syllable onset
    is determined analogously.

    Note
    ----
    `p['th_1'] <= p['th_2'] <= p['th_3']`

    Parameters
    ----------
    audio : numpy.ndarray
        Raw audio samples.
    p : dict
        Parameters.
    return_traces : bool, optional
        Whether to return traces. Defaults to `False`.

    Returns
    -------
    onsets : numpy array
        Onset times, in seconds
    offsets : numpy array
        Offset times, in seconds
    traces : list of a single numpy array
        The amplitude trace used in segmenting decisions. Returned if
        `return_traces` is `True`.
    """
    # Audio shorter than a single STFT window can't produce a spectrogram.
    if len(audio) < p['nperseg']:
        if return_traces:
            return [], [], None
        return [], []
    spec, dt, _ = get_spec(audio, p)
    # Syllable-duration limits expressed in spectrogram time bins.
    min_syll_len = int(np.floor(p['min_dur'] / dt))
    max_syll_len = int(np.ceil(p['max_dur'] / dt))
    th_1, th_2, th_3 = p['th_1'], p['th_2'], p['th_3']  # thresholds
    onsets, offsets = [], []
    too_short, too_long = 0, 0  # NOTE(review): counted but never reported
    # Calculate amplitude and smooth.
    if p['softmax']:
        amps = softmax(spec, t=p['temperature'])
    else:
        amps = np.sum(spec, axis=0)
    amps = gaussian_filter(amps, p['smoothing_timescale'] / dt)
    # Find local maxima greater than th_3.
    local_maxima = []
    for i in range(1, len(amps) - 1, 1):
        if amps[i] > th_3 and amps[i] == np.max(amps[i - 1:i + 2]):
            local_maxima.append(i)
    # Then search to the left and right for onsets and offsets.
    for local_max in local_maxima:
        # Skip maxima that fall inside an already-detected syllable.
        # NOTE(review): `> 1` leaves the first detected syllable unprotected
        # against overlap — possibly should be `> 0`; confirm intent.
        if len(offsets) > 1 and local_max < offsets[-1]:
            continue
        # Walk left for the onset: first bin below th_1, or a local minimum
        # below th_2, whichever comes first.
        i = local_max - 1
        while i > 0:
            if amps[i] < th_1:
                onsets.append(i)
                break
            elif amps[i] < th_2 and amps[i] == np.min(amps[i - 1:i + 2]):
                onsets.append(i)
                break
            i -= 1
        # No onset found: discard this candidate syllable.
        if len(onsets) != len(offsets) + 1:
            onsets = onsets[:len(offsets)]
            continue
        # Walk right analogously for the offset.
        i = local_max + 1
        while i < len(amps):
            if amps[i] < th_1:
                offsets.append(i)
                break
            elif amps[i] < th_2 and amps[i] == np.min(amps[i - 1:i + 2]):
                offsets.append(i)
                break
            i += 1
        # No offset found: discard this candidate syllable.
        if len(onsets) != len(offsets):
            onsets = onsets[:len(offsets)]
            continue
    # Throw away syllables that are too long or too short.
    new_onsets = []
    new_offsets = []
    for i in range(len(offsets)):
        t1, t2 = onsets[i], offsets[i]
        if t2 - t1 + 1 <= max_syll_len and t2 - t1 + 1 >= min_syll_len:
            new_onsets.append(t1 * dt)  # convert time bins -> seconds
            new_offsets.append(t2 * dt)
        elif t2 - t1 + 1 > max_syll_len:
            too_long += 1
        else:
            too_short += 1
    # Return decisions.
    if return_traces:
        return new_onsets, new_offsets, [amps]
    return new_onsets, new_offsets
def tune_segmenting_params(audio_dirs, p, img_fn='temp.pdf'):
    """
    Tune segmenting parameters by visualizing segmenting decisions.

    Interactively prompts for each numeric parameter in `p`, then repeatedly
    plots segmenting decisions on random audio chunks (saved to `img_fn`)
    until the user stops ('s') or asks to retune ('r').

    Parameters
    ----------
    audio_dirs : list of str
        Directories containing audio files.
    p : dict
        Segmenting parameters. TO DO: ADD REFERENCE!
    img_fn : str, optional
        Where to save segmenting images. Defaults to ``'temp.pdf'``.

    Returns
    -------
    p : dict
        Adjusted segmenting parameters. Returns ``None`` (early) if no audio
        files are found.
    """
    print("Tune segmenting parameters\n---------------------------")
    # Collect filenames.
    filenames = []
    for load_dir in audio_dirs:
        filenames += [os.path.join(load_dir, i) for i in os.listdir(load_dir) \
                if _is_audio_file(i)]
    if len(filenames) == 0:
        warnings.warn("Found no audio files in directories: " + str(audio_dirs))
        return
    # Set the amount of audio to display.
    if 'window_dur' in p:
        window_dur = p['window_dur']
    else:
        window_dur = 2.0 * p['max_dur']
    window_samples = int(window_dur * p['fs'])
    # Main loop: keep tuning parameters...
    while True:
        # Tune the parameters: prompt for each tunable numeric entry.
        for key in p:
            # Skip non-tunable parameters.
            if key in ['num_time_bins', 'num_freq_bins'
                    ] or not _is_number(p[key]):
                continue
            temp = 'not number and not empty'
            while not _is_number_or_empty(temp):
                temp = input('Set value for ' + key + ': [' + str(p[key]) + '] ')
            # Empty input keeps the current value.
            if temp != '':
                p[key] = float(temp)
        # Visualize segmenting decisions until the user enters 's' or 'r'.
        temp = 'not (s or r)'
        iteration = 0
        while temp != 's' and temp != 'r':
            # Get a random audio file.
            file_index = np.random.randint(len(filenames))
            filename = filenames[file_index]
            # Get spectrogram.
            fs, audio = wavfile.read(filename)
            assert fs == p['fs'], 'Found fs=' + str(fs) + ', expected ' + str(
                    p['fs'])
            # Need at least three display windows' worth of audio.
            if len(audio) < 3 * window_samples:
                temp = len(audio) / p['fs']  # duration in seconds, for warning
                warnings.warn( \
                        "Skipping short file: "+filename+" ("+str(temp)+"s)")
                continue
            # Take a random 3-window chunk of the file.
            start_index = np.random.randint(len(audio) - 3 * window_samples)
            stop_index = start_index + 3 * window_samples
            audio = audio[start_index:stop_index]
            spec, dt, f = get_spec(audio, p)
            # Get onsets and offsets from the configured algorithm.
            onsets, offsets, traces = \
                    p['algorithm'](audio, p, return_traces=True)
            onsets = [onset / dt for onset in onsets]  # seconds -> time bins
            offsets = [offset / dt for offset in offsets]
            # Plot the middle window of the chunk.
            i1 = int(window_dur / dt)
            i2 = 2 * i1
            t1, t2 = i1 * dt, i2 * dt
            _, axarr = plt.subplots(2, 1, sharex=True)
            axarr[0].set_title(filename, fontsize=7)
            axarr[0].imshow(spec[:,i1:i2], origin='lower', \
                    aspect='auto', \
                    extent=[t1, t2, f[0]/1e3, f[-1]/1e3])
            axarr[0].set_ylabel('Frequency (kHz)')
            # Mark onsets (blue) and offsets (red) on both panels.
            for j in range(len(onsets)):
                if onsets[j] >= i1 and onsets[j] < i2:
                    time = onsets[j] * dt
                    for k in [0, 1]:
                        axarr[k].axvline(x=time, c='b', lw=0.5)
                if offsets[j] >= i1 and offsets[j] < i2:
                    time = offsets[j] * dt
                    for k in [0, 1]:
                        axarr[k].axvline(x=time, c='r', lw=0.5)
            # Draw threshold lines, if those parameters exist.
            for key in ['th_1', 'th_2', 'th_3']: # NOTE: clean this
                if key in p:
                    axarr[1].axhline(y=p[key], lw=0.5, c='b')
            # Plot the amplitude trace(s) used by the segmenting algorithm.
            xvals = np.linspace(t1, t2, i2 - i1)
            for trace in traces:
                axarr[1].plot(xvals, trace[i1:i2])
            axarr[1].set_xlabel('Time (s)')
            plt.savefig(img_fn)
            plt.close('all')
            # Continue: only prompt when something was detected in the
            # displayed window (or every 20th empty window).
            all_events = [j for j in onsets if j>i1 and j<i2] + \
                    [j for j in offsets if j>i1 and j<i2]
            if len(all_events) > 0 or (iteration + 1) % 20 == 0:
                temp = input(
                        'Continue? [y] or [s]top tuning or [r]etune params: ')
            else:
                iteration += 1
                print("searching")
                temp = 'not (s or r)'
        if temp == 's':
            return p
def _get_specs(audio_dirs, seg_dirs, p, max_num_specs=None, max_len=None, \
        return_segs=False):
    """
    Make a bunch of spectrograms.

    Parameters
    ----------
    audio_dirs : list of str
        Directories containing audio files.
    seg_dirs : list of str
        Directories containing segmenting decisions.
    p : dict
        Segmenting parameters. TO DO: ADD REFERENCE!
    max_num_specs : {int, None}, optional
        Maximum number of spectrograms to collect. Defaults to ``None``.
    max_len : {int, None}, optional
        Maximum number of spectrogram time bins. If ``None``, the width of
        the longest collected spectrogram is used.
    return_segs : bool, optional
        Whether to also return onset/offset times. Defaults to ``False``.

    Returns
    -------
    specs : list of numpy.ndarray
        Spectrograms, all with shape ``(n_freq_bins, max_len)``.
    max_len : int
        Maximum number of spectrogram time bins.
    all_fns : list of str
        Basename of the segment file each spectrogram came from.
    segs : numpy.ndarray
        Onsets and offsets for each spectrogram. Returned if ``return_segs``.
    """
    # Get the filenames.
    audio_fns, seg_fns = get_audio_seg_filenames(audio_dirs, seg_dirs)
    # Reproducibly shuffle, then restore a nondeterministic RNG state.
    audio_fns, seg_fns = np.array(audio_fns), np.array(seg_fns)
    np.random.seed(42)
    perm = np.random.permutation(len(audio_fns))
    np.random.seed(None)
    audio_fns, seg_fns = audio_fns[perm], seg_fns[perm]
    # Collect spectrograms.
    specs, all_fns, segs = [], [], []
    for audio_fn, seg_fn in zip(audio_fns, seg_fns):
        onsets, offsets = _read_onsets_offsets(seg_fn)
        # Suppress chunk-related warnings from scipy's WAV reader.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=WavFileWarning)
            fs, audio = wavfile.read(audio_fn)
        assert len(audio) >= p['nperseg'], "Short audio file: " + audio_fn + \
                ", duration: " + str(len(audio)/fs)
        for onset, offset in zip(onsets, offsets):
            i1, i2 = int(onset * fs), int(offset * fs)
            # Skip segments too short for a single STFT window.
            if i2 - i1 <= p['nperseg']:
                continue
            assert i1 >= 0, audio_fn + ", " + seg_fn
            spec, dt, _ = get_spec(audio[i1:i2], p)
            specs.append(spec)
            all_fns.append(os.path.split(seg_fn)[-1])
            segs.append(np.array([onset, 0.0]))  # Offsets added below.
            if max_num_specs is not None and len(specs) >= max_num_specs:
                break
        if max_num_specs is not None and len(specs) >= max_num_specs:
            break
    # Zero-pad (and truncate) to a common width.
    assert len(specs) > 0, "Found no spectrograms!"
    n_freq_bins = specs[0].shape[0]
    if max_len is None:
        max_len = max(spec.shape[1] for spec in specs)
    for i in range(len(specs)):
        padded = np.zeros((n_freq_bins, max_len))
        # BUG FIX: when a caller-supplied max_len is shorter than a
        # spectrogram, the original assignment had mismatched widths on the
        # two sides and raised a ValueError.
        width = min(specs[i].shape[1], max_len)
        padded[:, :width] = specs[i][:, :width]
        specs[i] = padded
        # NOTE(review): `dt` here is the bin duration of the *last*
        # spectrogram computed in the loop above; this assumes all files
        # share the same dt — confirm. Offsets reflect the padded duration.
        segs[i][1] = segs[i][0] + dt * max_len
    if return_segs:
        segs = np.array(segs)
        return specs, max_len, all_fns, segs
    return specs, max_len, all_fns