Code example #1
0
def _get_specs(audio_dirs, seg_dirs, p, n_samples=None, max_len=None):
	"""
	Make a bunch of syllable spectrograms.

	Parameters
	----------
	audio_dirs : list of str
		Directories containing audio files.
	seg_dirs : list of str
		Directories containing segmenting decisions.
	p : dict
		Segmenting parameters. TO DO: ADD REFERENCE!
	n_samples : {int, None}, optional
		Maximum number of spectrograms to collect. If ``None``, collect a
		spectrogram for every segment. Defaults to ``None``.
	max_len : {int, None}, optional
		Maximum number of spectrogram time bins. If ``None``, the longest
		collected spectrogram sets the length. Defaults to ``None``.

	Returns
	-------
	specs : list of numpy.ndarray
		Zero-padded spectrograms, each of shape ``[n_freq_bins, max_len]``.
	max_len : int
		Maximum number of spectrogram time bins.
	all_fns : list of str
		Basename of the segment file corresponding to each spectrogram.
	"""
	# Get the filenames.
	audio_fns, seg_fns = get_audio_seg_filenames(audio_dirs, seg_dirs)
	# Reproducibly shuffle: fixed seed for the permutation, then reseed from
	# the OS so later random draws elsewhere aren't deterministic.
	audio_fns, seg_fns = np.array(audio_fns), np.array(seg_fns)
	np.random.seed(42)
	perm = np.random.permutation(len(audio_fns))
	np.random.seed(None)
	audio_fns, seg_fns = audio_fns[perm], seg_fns[perm]
	# Collect spectrograms.
	specs, all_fns = [], []
	for audio_fn, seg_fn in zip(audio_fns, seg_fns):
		onsets, offsets = _read_onsets_offsets(seg_fn)
		fs, audio = wavfile.read(audio_fn)
		for onset, offset in zip(onsets, offsets):
			# Convert segment times (seconds) to sample indices.
			i1, i2 = int(onset * fs), int(offset * fs)
			assert i1 >= 0, audio_fn + ", " + seg_fn
			spec, _, _ = get_spec(audio[i1:i2], p)
			specs.append(spec)
			all_fns.append(os.path.split(seg_fn)[-1])
			# BUGFIX: guard the comparison — ``n_samples`` defaults to None,
			# and ``int >= None`` raises TypeError in Python 3.
			if n_samples is not None and len(specs) >= n_samples:
				break
		if n_samples is not None and len(specs) >= n_samples:
			break
	# Zero-pad every spectrogram to a common width of ``max_len`` bins.
	assert len(specs) > 0, "Found no spectrograms!"
	n_freq_bins = specs[0].shape[0]
	if max_len is None:
		max_len = max(spec.shape[1] for spec in specs)
	for i in range(len(specs)):
		spec = np.zeros((n_freq_bins, max_len))
		spec[:,:specs[i].shape[1]] = specs[i][:,:max_len]
		specs[i] = spec
	return specs, max_len, all_fns
def get_onsets_offsets(audio, p, return_traces=False):
    """
	Segment the spectrogram using thresholds on its amplitude.

	Parameters
	----------
	audio : numpy.ndarray
		Raw audio samples.
	p : dict
		Parameters.
	return_traces : bool, optional
		Whether to return traces. Defaults to `False`.

	Returns
	-------
	onsets : list of float
		Onset times, in seconds
	offsets : list of float
		Offset times, in seconds
	traces : list of a single numpy array
		The amplitude trace used in segmenting decisions. Returned if
		return_traces is `True`. ``None`` is returned for audio too short
		to segment.
	"""
    # ROBUSTNESS: bail out on audio shorter than one STFT window, matching
    # the guard in the revised version of this function. ``p.get`` keeps
    # behavior unchanged for parameter dicts without an 'nperseg' entry.
    if len(audio) < p.get('nperseg', 1):
        if return_traces:
            return [], [], None
        return [], []
    spec, dt, _ = get_spec(audio, p)
    # Convert duration limits from seconds to spectrogram time bins.
    min_syll_len = int(np.floor(p['min_dur'] / dt))
    max_syll_len = int(np.ceil(p['max_dur'] / dt))
    th_1, th_2, th_3 = p['th_1'], p['th_2'], p['th_3']  # thresholds
    smoothing_time = p['smoothing_timescale'] / dt
    onsets, offsets = [], []
    too_short, too_long = 0, 0  # rejection counters (tracked, not returned)

    # Amplitude trace: softmax-weighted or plain column-sum of the spectrogram.
    if p['softmax']:
        amps = _softmax(spec, t=p['temperature'])
    else:
        amps = np.sum(spec, axis=0)
    # Smooth.
    amps = gaussian_filter(amps, smoothing_time)

    # Find local maxima greater than th_3.
    local_maxima = []
    for i in range(1, len(amps) - 1, 1):
        if amps[i] > th_3 and amps[i] == np.max(amps[i - 1:i + 2]):
            local_maxima.append(i)

    # Then search to the left and right for onsets and offsets.
    for local_max in local_maxima:
        # Skip maxima that fall inside an already-detected syllable.
        if len(offsets) > 1 and local_max < offsets[-1]:
            continue
        # Search left for the onset: the first bin below th_1, or a local
        # minimum below th_2, whichever comes first.
        i = local_max - 1
        while i > 0:
            if amps[i] < th_1:
                onsets.append(i)
                break
            elif amps[i] < th_2 and amps[i] == np.min(amps[i - 1:i + 2]):
                onsets.append(i)
                break
            i -= 1
        # No onset found: realign the lists and skip this maximum.
        if len(onsets) != len(offsets) + 1:
            onsets = onsets[:len(offsets)]
            continue
        # Search right for the offset, analogously.
        i = local_max + 1
        while i < len(amps):
            if amps[i] < th_1:
                offsets.append(i)
                break
            elif amps[i] < th_2 and amps[i] == np.min(amps[i - 1:i + 2]):
                offsets.append(i)
                break
            i += 1
        # No offset found: discard the onset just added.
        if len(onsets) != len(offsets):
            onsets = onsets[:len(offsets)]
            continue

    # Throw away syllables that are too long or too short; durations are
    # counted in bins (inclusive), then converted to seconds on append.
    new_onsets = []
    new_offsets = []
    for i in range(len(offsets)):
        t1, t2 = onsets[i], offsets[i]
        if t2 - t1 + 1 <= max_syll_len and t2 - t1 + 1 >= min_syll_len:
            new_onsets.append(t1 * dt)
            new_offsets.append(t2 * dt)
        elif t2 - t1 + 1 > max_syll_len:
            too_long += 1
        else:
            too_short += 1

    # Return decisions.
    if return_traces:
        return new_onsets, new_offsets, [amps]
    return new_onsets, new_offsets
Code example #3
0
def get_onsets_offsets(audio, p, return_traces=False):
    """
	Segment the spectrogram using thresholds on its amplitude.

	A syllable is detected if the amplitude trace exceeds `p['th_3']`. An offset
	is then detected if there is a subsequent local minimum in the amplitude
	trace with amplitude less than `p['th_2']`, or when the amplitude drops
	below `p['th_1']`, whichever comes first. Syllable onset is determined
	analogously.

	Note
	----
	`p['th_1'] <= p['th_2'] <= p['th_3']`

	Parameters
	----------
	audio : numpy.ndarray
		Raw audio samples.
	p : dict
		Parameters.
	return_traces : bool, optional
		Whether to return traces. Defaults to `False`.

	Returns
	-------
	onsets : numpy array
		Onset times, in seconds
	offsets : numpy array
		Offset times, in seconds
	traces : list of a single numpy array
		The amplitude trace used in segmenting decisions. Returned if
		`return_traces` is `True`. ``None`` is returned for audio too short
		to segment.
	"""
    # Audio shorter than one STFT window can't produce a spectrogram.
    if len(audio) < p['nperseg']:
        if return_traces:
            return [], [], None
        return [], []
    spec, dt, _ = get_spec(audio, p)
    # Convert duration limits from seconds to spectrogram time bins.
    min_syll_len = int(np.floor(p['min_dur'] / dt))
    max_syll_len = int(np.ceil(p['max_dur'] / dt))
    th_1, th_2, th_3 = p['th_1'], p['th_2'], p['th_3']  # thresholds
    onsets, offsets = [], []
    too_short, too_long = 0, 0  # rejection counters (tracked, not returned)

    # Calculate amplitude and smooth.
    if p['softmax']:
        amps = softmax(spec, t=p['temperature'])
    else:
        amps = np.sum(spec, axis=0)
    amps = gaussian_filter(amps, p['smoothing_timescale'] / dt)

    # Find local maxima greater than th_3.
    local_maxima = []
    for i in range(1, len(amps) - 1, 1):
        if amps[i] > th_3 and amps[i] == np.max(amps[i - 1:i + 2]):
            local_maxima.append(i)

    # Then search to the left and right for onsets and offsets.
    for local_max in local_maxima:
        # Skip maxima inside an already-detected syllable.
        # NOTE(review): `len(offsets) > 1` means the FIRST detected offset
        # does not suppress overlapping maxima — possibly intended `> 0`;
        # confirm against upstream before changing.
        if len(offsets) > 1 and local_max < offsets[-1]:
            continue
        # Search left for the onset: the first bin below th_1, or a local
        # minimum below th_2, whichever comes first.
        i = local_max - 1
        while i > 0:
            if amps[i] < th_1:
                onsets.append(i)
                break
            elif amps[i] < th_2 and amps[i] == np.min(amps[i - 1:i + 2]):
                onsets.append(i)
                break
            i -= 1
        # No onset found: realign the lists and skip this maximum.
        if len(onsets) != len(offsets) + 1:
            onsets = onsets[:len(offsets)]
            continue
        # Search right for the offset, analogously.
        i = local_max + 1
        while i < len(amps):
            if amps[i] < th_1:
                offsets.append(i)
                break
            elif amps[i] < th_2 and amps[i] == np.min(amps[i - 1:i + 2]):
                offsets.append(i)
                break
            i += 1
        # No offset found: discard the onset just added.
        if len(onsets) != len(offsets):
            onsets = onsets[:len(offsets)]
            continue

    # Throw away syllables that are too long or too short; durations are
    # counted in bins (inclusive), then converted to seconds on append.
    new_onsets = []
    new_offsets = []
    for i in range(len(offsets)):
        t1, t2 = onsets[i], offsets[i]
        if t2 - t1 + 1 <= max_syll_len and t2 - t1 + 1 >= min_syll_len:
            new_onsets.append(t1 * dt)
            new_offsets.append(t2 * dt)
        elif t2 - t1 + 1 > max_syll_len:
            too_long += 1
        else:
            too_short += 1

    # Return decisions.
    if return_traces:
        return new_onsets, new_offsets, [amps]
    return new_onsets, new_offsets
Code example #4
0
def tune_segmenting_params(audio_dirs, p, img_fn='temp.pdf'):
    """
	Tune segmenting parameters by visualizing segmenting decisions.

	Interactively prompts for parameter values, then repeatedly plots a
	random window of a random audio file with the resulting segmenting
	decisions overlaid, until the user stops or asks to retune.

	Parameters
	----------
	audio_dirs : list of str
		Directories containing audio files.
	p : dict
		Segmenting parameters. TO DO: ADD REFERENCE!
	img_fn : str, optional
		Where to save segmenting images.

	Returns
	-------
	p : dict
		Adjusted segmenting parameters. ``None`` is returned if no audio
		files are found.
	"""
    print("Tune segmenting parameters\n---------------------------")
    # Collect filenames.
    filenames = []
    for load_dir in audio_dirs:
        filenames += [os.path.join(load_dir, i) for i in os.listdir(load_dir) \
          if _is_audio_file(i)]
    if len(filenames) == 0:
        warnings.warn("Found no audio files in directories: " +
                      str(audio_dirs))
        return
    # Set the amount of audio to display (seconds, then samples).
    if 'window_dur' in p:
        window_dur = p['window_dur']
    else:
        window_dur = 2.0 * p['max_dur']
    window_samples = int(window_dur * p['fs'])

    # Main loop: keep tuning parameters...
    while True:

        # Tune the parameters: prompt for each numeric entry; empty input
        # keeps the current value.
        for key in p:
            # Skip non-tunable parameters.
            if key in ['num_time_bins', 'num_freq_bins'
                       ] or not _is_number(p[key]):
                continue
            temp = 'not number and not empty'
            while not _is_number_or_empty(temp):
                temp = input('Set value for ' + key + ': [' + str(p[key]) +
                             '] ')
            if temp != '':
                p[key] = float(temp)

        # Visualize segmenting decisions until the user enters 's' or 'r'.
        temp = 'not (s or r)'
        iteration = 0
        while temp != 's' and temp != 'r':

            # Get a random audio file.
            file_index = np.random.randint(len(filenames))
            filename = filenames[file_index]

            # Get spectrogram.
            fs, audio = wavfile.read(filename)
            assert fs == p['fs'], 'Found fs=' + str(fs) + ', expected ' + str(
                p['fs'])
            # Need three windows' worth of audio: the middle one is plotted.
            if len(audio) < 3 * window_samples:
                # ``temp`` is reused here as the duration in seconds; any
                # non-'s'/'r' value keeps the loop going.
                temp = len(audio) / p['fs']
                warnings.warn( \
                  "Skipping short file: "+filename+" ("+str(temp)+"s)")
                continue
            start_index = np.random.randint(len(audio) - 3 * window_samples)
            stop_index = start_index + 3 * window_samples
            audio = audio[start_index:stop_index]
            spec, dt, f = get_spec(audio, p)

            # Get onsets and offsets, converted from seconds to time bins.
            onsets, offsets, traces = \
              p['algorithm'](audio, p, return_traces=True)
            onsets = [onset / dt for onset in onsets]
            offsets = [offset / dt for offset in offsets]

            # Plot the middle window, bins [i1, i2), i.e. seconds [t1, t2).
            i1 = int(window_dur / dt)
            i2 = 2 * i1
            t1, t2 = i1 * dt, i2 * dt
            _, axarr = plt.subplots(2, 1, sharex=True)
            axarr[0].set_title(filename, fontsize=7)
            axarr[0].imshow(spec[:,i1:i2], origin='lower', \
              aspect='auto', \
              extent=[t1, t2, f[0]/1e3, f[-1]/1e3])
            axarr[0].set_ylabel('Frequency (kHz)')
            # Mark onsets (blue) and offsets (red) on both panels.
            for j in range(len(onsets)):
                if onsets[j] >= i1 and onsets[j] < i2:
                    time = onsets[j] * dt
                    for k in [0, 1]:
                        axarr[k].axvline(x=time, c='b', lw=0.5)
                if offsets[j] >= i1 and offsets[j] < i2:
                    time = offsets[j] * dt
                    for k in [0, 1]:
                        axarr[k].axvline(x=time, c='r', lw=0.5)
            # Draw the amplitude thresholds, if present.
            for key in ['th_1', 'th_2', 'th_3']:  # NOTE: clean this
                if key in p:
                    axarr[1].axhline(y=p[key], lw=0.5, c='b')
            xvals = np.linspace(t1, t2, i2 - i1)
            for trace in traces:
                axarr[1].plot(xvals, trace[i1:i2])
            axarr[1].set_xlabel('Time (s)')
            plt.savefig(img_fn)
            plt.close('all')

            # Continue: prompt when something was detected in the window, or
            # every 20th empty window; otherwise keep searching silently.
            all_events = [j for j in onsets if j>i1 and j<i2] + \
              [j for j in offsets if j>i1 and j<i2]
            if len(all_events) > 0 or (iteration + 1) % 20 == 0:
                temp = input(
                    'Continue? [y] or [s]top tuning or [r]etune params: ')
            else:
                iteration += 1
                print("searching")
                temp = 'not (s or r)'
            if temp == 's':
                return p
Code example #5
0
def _get_specs(audio_dirs, seg_dirs, p, max_num_specs=None, max_len=None, \
 return_segs=False):
    """
	Make a bunch of spectrograms.

	Parameters
	----------
	audio_dirs : list of str
		Directories containing audio files
	seg_dirs : list of str
		Directories containing segmenting decisions
	p : dict
		Segmenting parameters. TO DO: ADD REFERENCE!
	max_num_specs : {int, None}, optional
		Maximum number of spectrograms to collect; ``None`` collects all.
		Defaults to ``None``.
	max_len : {int, None}, optional
		Maximum number of spectrogram time bins.
	return_segs : bool, optional
		Defaults to ``False``.

	Returns
	-------
	specs : list of numpy.ndarray
		Spectrograms.
	max_len : int
		Maximum number of spectrogram time bins.
	all_fns : list of str
		Basename of the segment file corresponding to each spectrogram.
	segs : numpy.ndarray
		Onsets and offsets for each spectrogram. Returned if ``return_segs``.
	"""
    # Get the filenames.
    audio_fns, seg_fns = get_audio_seg_filenames(audio_dirs, seg_dirs)
    # Reproducibly shuffle: fixed seed for the permutation, then reseed from
    # the OS so later random draws elsewhere aren't deterministic.
    audio_fns, seg_fns = np.array(audio_fns), np.array(seg_fns)
    np.random.seed(42)
    perm = np.random.permutation(len(audio_fns))
    np.random.seed(None)
    audio_fns, seg_fns = audio_fns[perm], seg_fns[perm]
    # Collect spectrograms.
    specs, all_fns, segs = [], [], []
    for audio_fn, seg_fn in zip(audio_fns, seg_fns):
        onsets, offsets = _read_onsets_offsets(seg_fn)
        # Suppress benign WAV metadata warnings while reading.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=WavFileWarning)
            fs, audio = wavfile.read(audio_fn)
        assert len(audio) >= p['nperseg'], "Short audio file: " + audio_fn + \
          ", duration: " + str(len(audio)/fs)
        for onset, offset in zip(onsets, offsets):
            # Convert segment times (seconds) to sample indices.
            i1, i2 = int(onset * fs), int(offset * fs)
            # Skip segments too short for a single STFT window.
            if i2 - i1 <= p['nperseg']:
                continue
            assert i1 >= 0, audio_fn + ", " + seg_fn
            spec, dt, _ = get_spec(audio[i1:i2], p)
            specs.append(spec)
            all_fns.append(os.path.split(seg_fn)[-1])
            segs.append(np.array([onset, 0.0]))  # Offsets added below.
            if max_num_specs is not None and len(specs) >= max_num_specs:
                break
        if max_num_specs is not None and len(specs) >= max_num_specs:
            break
    # Zero-pad every spectrogram to a common width of ``max_len`` bins.
    assert len(specs) > 0, "Found no spectrograms!"
    n_freq_bins = specs[0].shape[0]
    if max_len is None:
        max_len = max(spec.shape[1] for spec in specs)
    for i in range(len(specs)):
        spec = np.zeros((n_freq_bins, max_len))
        spec[:, :specs[i].shape[1]] = specs[i][:, :max_len]
        specs[i] = spec
        # Offset = onset + padded duration. NOTE(review): ``dt`` here is the
        # value from the LAST spectrogram computed above — presumably the
        # time-bin width is constant across segments; confirm in get_spec.
        segs[i][1] = segs[i][0] + dt * max_len
    if return_segs:
        segs = np.array(segs)
        return specs, max_len, all_fns, segs
    return specs, max_len, all_fns