def create_segment_profile(audio_file, duration_frames, filepath, window_len, step_size=1):
    """
    Build sliding-window ("fake") segment profiles for one audio file.

    A binary mask covering every frame of the file is built first: frames that
    fall inside the boundaries of a real syllable (a ``Segment`` row of this
    audio file) are set to 1, all others stay 0. The mask is then sliced by a
    sliding window; each window becomes one profile whose ID is built from the
    audio file's ID and the window's first frame index.

    :param audio_file: object providing ``id``, ``length`` and ``fs``; also used
                       to filter ``Segment`` rows
    :param duration_frames: total number of frames in the file's spectrogram
    :param filepath: stored unchanged in every profile
    :param window_len: length of the sliding window, in frames
    :param step_size: the overlap requested from ``split_segments`` is
                      ``window_len - step_size``
    :return: a dictionary mapping ``'<audio_file.id>_<beg>'`` to the tuple
             ``(filepath, beg, end, mask)``, where ``mask`` is the nested-list
             form of the ``(end - beg, 1)`` slice of the file-wide mask
    """
    overlap = window_len - step_size
    syllable_bounds = Segment.objects.filter(audio_file=audio_file).values_list('start_time_ms', 'end_time_ms')

    # One mask for the whole file; the windows below are plain slices of it
    duration_ms = int(audio_file.length / audio_file.fs * 1000)
    file_mask = np.zeros((duration_frames, 1), dtype=np.float32)

    for start_ms, stop_ms in syllable_bounds:
        # Convert millisecond timestamps to frame indices (truncating)
        first = int(start_ms / duration_ms * duration_frames)
        last = int(stop_ms / duration_ms * duration_frames)
        file_mask[first:last, :] = 1

    _, spans = split_segments(duration_frames, window_len, overlap, incltail=False)

    return {
        '{}_{}'.format(audio_file.id, lo): (filepath, lo, hi, file_mask[lo:hi, :].tolist())
        for lo, hi in spans
    }
def test_segments_without_tail(self):
    """
    With ``incltail=False``, a length-86 range cut into windows of 32 with an
    overlap of 16 must yield exactly the four windows listed below.
    """
    expected = np.array([[0, 32], [16, 48], [32, 64], [48, 80]])
    expected_count = len(expected)

    count, spans = split_segments(86, 32, 16, incltail=False)

    self.assertEqual(count, expected_count)
    self.assertTrue((spans == expected).all())
def run_segmentation(duration_frames, psd, encoder, session, window_len, step_size=1):
    """
    Detect syllable boundaries by voting over sliding-window predictions.

    Every window of ``psd`` (columns ``beg:end``, transposed) is passed to
    ``encoder.predict``. Each prediction is thresholded at 0.5 and added to a
    per-frame vote count. Frames whose count exceeds ``0.3 * window_len`` are
    treated as syllable frames, and runs of such frames are returned.

    A run is opened at frame ``i`` when frames ``i`` and ``i + 1`` are both
    syllable frames, and closed at the first later syllable frame whose
    successor is not. NOTE: a run that is still open when the last frame is
    reached is not emitted.

    :param duration_frames: number of frames (columns of ``psd``) to process
    :param psd: 2-D array indexed as ``psd[:, beg:end]``
    :param encoder: object exposing ``predict(windows, session)``
    :param session: forwarded unchanged to ``encoder.predict``
    :param window_len: length of the sliding window, in frames
    :param step_size: the overlap requested from ``split_segments`` is
                      ``window_len - step_size``
    :return: ``(syllables, None)`` where ``syllables`` is a list of
             ``[first_frame, last_frame]`` pairs
    """
    overlap = window_len - step_size
    _, spans = split_segments(duration_frames, window_len, overlap, incltail=False)

    votes = np.zeros((duration_frames,), dtype=np.float32)
    patches = [psd[:, lo:hi].T for lo, hi in spans]

    outputs = encoder.predict(patches, session)
    for output, (lo, hi) in zip(outputs, spans):
        # Each window contributes one vote per frame it marks as positive
        votes[lo: hi] += output.reshape(window_len) > 0.5

    is_syllable = votes > window_len * 0.3

    syllables = []
    pending = None
    in_run = False
    for idx in range(duration_frames - 1):
        if not is_syllable[idx]:
            continue

        if is_syllable[idx + 1]:
            # Inside a run; open one if none is in progress
            if not in_run:
                in_run = True
                pending = [idx]
        elif in_run:
            # Last positive frame of the current run
            in_run = False
            pending.append(idx)
            syllables.append(pending)
            pending = None

    return syllables, None
def _harmonic_and_pitch(args):
    """
    Compute the harmonic ratio and pitch (F0) of every frame of a signal.

    The signal is cut into frames by ``split_segments`` (without the tail).
    For each frame a normalised autocorrelation ``Gamma`` is computed between
    the first zero crossing of the autocorrelation and a maximum lag of
    ``round(0.016 * fs) - 1`` samples. Among the three lags with the largest
    ``Gamma``, the one giving the lowest frequency is kept.

    :param args: forwarded to ``get_sig`` and ``unroll_args``; must provide
                 ``fs``, ``noverlap`` and ``win_length``
    :return: a pair of arrays ``(harmonic_ratios, f0s)``, one entry per frame
    """
    sig = get_sig(args)
    fs, noverlap, win_length = unroll_args(args, ['fs', 'noverlap', 'win_length'])
    nsegs, segs = split_segments(len(sig), win_length, noverlap, incltail=False)

    ratios = []
    pitches = []

    for idx in range(nsegs):
        start, stop = segs[idx]
        frame = sig[start:stop]
        frame_len = len(frame)

        max_lag = int(np.round(0.016 * fs) - 1)
        autocorr = np.correlate(frame, frame, mode='full')
        zero_lag = autocorr[frame_len - 1]
        autocorr = autocorr[frame_len:-1]

        # The first zero crossing of the autocorrelation gives the lower lag bound
        (crossings,) = np.nonzero(np.diff(np.sign(autocorr)))
        min_lag = crossings[0] if len(crossings) != 0 else len(autocorr) - 1

        if max_lag > len(autocorr):
            max_lag = len(autocorr) - 1

        gamma = np.zeros(max_lag, dtype=np.float64)
        energy = np.cumsum(frame**2)
        norm = np.sqrt((zero_lag * energy[max_lag:min_lag:-1]))
        gamma[min_lag:max_lag] = autocorr[min_lag:max_lag] / (norm + eps)

        if len(gamma) == 0:
            ratios.append(1.0)
            pitches.append(0.0)
            continue

        # Consider the three strongest lags rather than only the maximum,
        # since noise can distort the result
        top_lags = np.argsort(gamma)[-3:][::-1]
        candidates = fs / (top_lags + eps)

        # The fundamental frequency is taken as the smallest candidate
        pick = np.argmin(candidates)
        ratios.append(gamma[top_lags[pick]])
        pitches.append(candidates[pick])

    return np.array(ratios), np.array(pitches)
def my_stft(sig, fs, window, noverlap, nfft):
    """
    Short-time Fourier transform computed frame by frame.

    The signal is cut into frames of ``len(window)`` samples by
    ``split_segments`` (without the tail). Each frame is multiplied by
    ``window``, transformed with an ``nfft``-point FFT, and the first
    ``nfft // 2 + 1`` bins are stored as one column of the result.

    :param sig: the signal to transform
    :param fs: not used by this function
    :param window: window applied to every frame; its length sets the frame size
    :param noverlap: overlap between consecutive frames, passed to ``split_segments``
    :param nfft: FFT length
    :return: complex128 array of shape ``(nfft // 2 + 1, number_of_frames)``
    """
    nbins = nfft // 2 + 1
    nframes, frames = split_segments(len(sig), len(window), noverlap, incltail=False)

    # Left uninitialised on purpose: every column is overwritten below
    spectrogram = np.ndarray((nbins, nframes), dtype=np.complex128)

    for col in range(nframes):
        bounds = frames[col]
        begin = bounds[0]
        finish = bounds[1]
        transformed = fft(sig[begin: finish] * window, nfft)
        spectrogram[:, col] = transformed[:nbins]

    return spectrogram
def lp_coefficients(args):
    """
    Compute linear-prediction coefficients for every frame of a signal.

    The signal is cut into frames by ``split_segments`` (without the tail);
    each windowed frame is passed to ``lp_coefficients_frame`` and the result
    is stored as one column of the output.

    NOTE(review): the default window is built with length ``nfft`` while frames
    come from ``win_length``; the product ``frame * window`` presumably requires
    the two to be compatible - confirm against callers.

    :param args: forwarded to ``get_sig`` and ``unroll_args``; must provide
                 ``nfft``, ``fs``, ``noverlap``, ``win_length`` and ``order``,
                 and may provide ``window``
    :return: float32 array of shape ``(order, number_of_frames)``
    """
    sig = get_sig(args)
    nfft, _fs, noverlap, win_length, order = unroll_args(
        args, ['nfft', 'fs', 'noverlap', 'win_length', 'order'])

    # The Hanning window is only the fallback when args carries no 'window'
    default_window = _cached_get_window('hanning', nfft)
    window = unroll_args(args, [('window', default_window)])

    nframes, frames = split_segments(len(sig), win_length, noverlap, incltail=False)

    coeffs = np.zeros((order, nframes), dtype=np.float32)
    for col in range(nframes):
        start, stop = frames[col]
        windowed = sig[start:stop] * window
        coeffs[:, col] = lp_coefficients_frame(windowed, order)

    return coeffs
def lpc_spectrum(args):
    """
    Compute the log-magnitude LPC spectrum for every frame of a signal.

    The signal is cut into frames by ``split_segments`` (without the tail);
    each windowed frame is passed to ``lpc_spectrum_frame`` and the result is
    stored as one column. The base-10 logarithm of the absolute values is
    returned.

    NOTE(review): the default window is built with length ``nfft`` while frames
    come from ``win_length``; the product ``frame * window`` presumably requires
    the two to be compatible - confirm against callers.

    :param args: forwarded to ``get_sig`` and ``unroll_args``; must provide
                 ``nfft``, ``fs``, ``noverlap``, ``win_length`` and ``order``,
                 and may provide ``window``
    :return: array of shape ``(nfft, number_of_frames)`` holding
             ``log10(|spectrum|)``
    """
    sig = get_sig(args)
    nfft, _fs, noverlap, win_length, order = unroll_args(
        args, ['nfft', 'fs', 'noverlap', 'win_length', 'order'])

    # The Hanning window is only the fallback when args carries no 'window'
    default_window = _cached_get_window('hanning', nfft)
    window = unroll_args(args, [('window', default_window)])

    nframes, frames = split_segments(len(sig), win_length, noverlap, incltail=False)

    spectra = np.zeros((nfft, nframes), dtype=np.complex64)
    for col in range(nframes):
        start, stop = frames[col]
        windowed = sig[start:stop] * window
        spectra[:, col] = lpc_spectrum_frame(windowed, order, nfft)

    magnitude = np.abs(spectra)
    return np.log10(magnitude)
def _harmonic_and_pitch(args):
    """
    Compute the harmonic ratio and pitch (F0) of every frame of a signal.

    The signal is cut into frames by ``split_segments`` (without the tail).
    For each frame a normalised autocorrelation ``Gamma`` is computed between
    the first zero crossing of the autocorrelation and a maximum lag of
    ``round(0.016 * fs) - 1`` samples. Per frame:

    - if the zero-crossing rate of ``Gamma`` (``frame_zcr``) exceeds 0.15,
      both the harmonic ratio and F0 are 0;
    - otherwise the harmonic ratio is the maximum of ``Gamma`` and F0 is
      ``fs`` divided by the lag of that maximum; F0 is reset to 0 when it
      exceeds 5000 or when the harmonic ratio is below 0.1.

    :param args: forwarded to ``get_sig`` and ``unroll_args``; must provide
                 ``fs``, ``noverlap`` and ``win_length``
    :return: a pair of arrays ``(harmonic_ratios, f0s)``, one entry per frame
    """
    sig = get_sig(args)
    fs, noverlap, win_length = unroll_args(args, ['fs', 'noverlap', 'win_length'])
    siglen = len(sig)
    nsegs, segs = split_segments(siglen, win_length, noverlap, incltail=False)

    HRs = []
    F0s = []

    for i in range(nsegs):
        seg_beg, seg_end = segs[i, :]
        frame = sig[seg_beg:seg_end]

        # np.round returns a float; M is used as an array length and as a slice
        # bound below, both of which require an integer
        M = int(np.round(0.016 * fs) - 1)
        R = np.correlate(frame, frame, mode='full')

        # Zero-lag autocorrelation, then keep only the positive lags
        g = R[len(frame) - 1]
        R = R[len(frame):-1]

        # estimate m0 (as the first zero crossing of R)
        [a, ] = np.nonzero(np.diff(np.sign(R)))

        if len(a) == 0:
            m0 = len(R) - 1
        else:
            m0 = a[0]
        if M > len(R):
            M = len(R) - 1

        Gamma = np.zeros(M, dtype=np.float64)
        CSum = np.cumsum(frame ** 2)
        Gamma[m0:M] = R[m0:M] / (np.sqrt((g * CSum[M:m0:-1])) + eps)

        ZCR = frame_zcr(Gamma)

        if ZCR > 0.15:
            HR = 0.0
            f0 = 0.0
        else:
            if len(Gamma) == 0:
                HR = 1.0
                blag = 0.0
                Gamma = np.zeros(M, dtype=np.float64)
            else:
                HR = np.max(Gamma)
                blag = np.argmax(Gamma)

            # Get fundamental frequency:
            f0 = fs / (blag + eps)
            if f0 > 5000:
                f0 = 0.0
            if HR < 0.1:
                f0 = 0.0

        HRs.append(HR)
        F0s.append(f0)

    return np.array(HRs), np.array(F0s)