def decibels(signal, name=None):
    ''' Get the number of decibels (10 * log10(signal)) for a tensor of raw magnitudes,
    with log10(signal) floored at -50 to avoid -inf for zero magnitudes '''
    name = scoping.adapt_name(name, "decibels")
    with tf.name_scope(name):
        return 10 * tf.maximum(tf.log(signal) / np.log(10), -50, name=name)
def magnitude(complex_spec, name=None):
    ''' Get the raw magnitude spectrogram for a complex spectrogram '''
    name = scoping.adapt_name(name, "magnitude")
    with tf.name_scope(name):
        return tf.abs(complex_spec, name=name)
def energy(complex_spec, name=None):
    ''' Get the raw energy spectrogram for a complex spectrogram '''
    name = scoping.adapt_name(name, "energy")
    with tf.name_scope(name):
        # |z|^2 = z * conj(z) is real-valued; the cast keeps the real part as float64
        return tf.cast(complex_spec * tf.conj(complex_spec), tf.float64, name=name)
def apply_filterbank(spec, filter_bank, name=None):
    ''' Apply a filter bank (e.g. mel filters) to a spectrogram.

    Params:
    - spec: BLDC tensor where D is the number of retained FFT bins (the lower half
      of the spectrum), the magnitude or energy spectrum
    - filter_bank: [number of FFT bins, number of filters] tensor, the filter bank
    Returns:
    - BLDC tensor where D is the number of filters, the filter bank features
    '''
    name = scoping.adapt_name(name, "apply_filterbank")
    with tf.name_scope(name):
        shape = tf.shape(spec)
        assert (DEPTH_AXIS == 2)
        spec = tf.transpose(spec, [0, 1, 3, 2])  # BLCD
        two_d = tf.reshape(spec, [-1, shape[DEPTH_AXIS]])  # *D
        feats = tf.matmul(two_d, filter_bank)  # *D'
        feats = tf.reshape(feats, [shape[0], shape[1], shape[3], -1])  # BLCD'
        feats = tf.transpose(feats, [0, 1, 3, 2])  # BLD'C
        return tf.identity(feats, name)
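
# Illustrative sketch (not part of the original module): apply_filterbank is a
# matrix multiply over the depth axis; the same computation can be expressed with
# tf.einsum. This is an alternative formulation for reference, not the module's
# implementation.
def _apply_filterbank_einsum(spec, filter_bank):
    import tensorflow as tf
    # spec: [B, L, D, C], filter_bank: [D, F]  ->  [B, L, F, C]
    return tf.einsum('bldc,df->blfc', spec, filter_bank)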
def sliding_window(inp, frame_length, frame_shift, max_number_frames=None,
                   padding="VALID", name=None):
    ''' Runs a sliding window across a signal (of audio data, for example).

    Params:
    - inp: A signal tensor in BLC format where L is the number of SAMPLES
    - frame_length: The length of each frame in number of samples (a python integer)
    - frame_shift: How many samples to shift the window (a python integer)
    - max_number_frames: If given, the output is clipped to the first
      max_number_frames frames
    - padding: How to pad the ends, can be "SAME" or "VALID"
    Returns:
    - A BLDC signal tensor where L is the number of FRAMES and D is frame_length.
    '''
    assert (len(inp.shape) == 3)  # BLC
    name = scoping.adapt_name(name, "sliding_window")
    with tf.name_scope(name):
        expanded = tf.expand_dims(inp, 3)
        lengths = [1, 1, 1, 1]
        shifts = [1, 1, 1, 1]
        lengths[LENGTH_AXIS] = frame_length
        shifts[LENGTH_AXIS] = frame_shift
        # Window the signal
        frames = tf.extract_image_patches(expanded, lengths, shifts, [1, 1, 1, 1],
                                          padding)
        if max_number_frames is not None:
            # Clip the signal to only be the first max_number_frames frames
            slice_lengths = [
                -1 if i != LENGTH_AXIS else tf.cast(max_number_frames, tf.int32)
                for i in range(4)
            ]
            frames = tf.slice(frames, [0, 0, 0, 0], tf.stack(slice_lengths))
        frames = tf.transpose(frames, [0, 1, 3, 2])  # BLCD --> BLDC
        frames = tf.identity(frames, name=name)
        return frames
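
# Illustrative sketch (not part of the original module): framing a short signal
# with sliding_window. The sample values, frame_length=4, and frame_shift=2 are
# made-up numbers for the example; a TF1-style session is assumed, matching the
# tf.extract_image_patches call above.
def _demo_sliding_window():
    import numpy as np
    import tensorflow as tf
    signal = tf.constant(np.arange(16, dtype=np.float64).reshape(1, 16, 1))  # BLC
    frames = sliding_window(signal, frame_length=4, frame_shift=2)
    with tf.Session() as sess:
        out = sess.run(frames)
    # VALID padding gives (16 - 4) // 2 + 1 = 7 frames of 4 samples each,
    # so out.shape == (1, 7, 4, 1) in BLDC layout.
    return out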
def timeseries_to_spec(frames, frame_length, window_type='hamming', N_fft=None,
                       zero_pad=True, name=None):
    ''' Converts a timeseries to a spectrogram (applying a window function and zero
    padding the frames before the FFT)

    Params:
    - frames: A BLDC tensor where L is the number of frames and D is the frame length
    - frame_length: python integer frame length
    - window_type: the type of window (the same types as supported by get_window)
    - N_fft: the number of FFT points to use (defaults to the next power of 2
      greater than or equal to frame_length)
    - zero_pad: whether to zero pad the frames to the next power of 2 for a more
      efficient FFT
    Returns:
        N_fft, magnitude spec, energy spec, log magnitude spec (decibels),
        log energy spec (decibels)
        Each spec is a BLDC tensor where L is the number of frames and D is the FFT
        bin count. The FFT bin count is (N_fft or the next power of 2 above the
        frame length) // 2 + 1 (to include the Nyquist frequency bin)
    '''
    name = scoping.adapt_name(name, "spec")
    with tf.name_scope(name):
        # The window to multiply each frame with
        window = tf.constant(get_window(window_type,
                                        frame_length).astype(np.float64),
                             name="window")
        frames = tf.multiply(
            frames,
            tf.reshape(window, [1 if i != DEPTH_AXIS else -1 for i in range(4)]),
            name="windowing")

        # Padding/clipping to N_fft
        if zero_pad:
            if N_fft is None:
                N_fft = get_Nfft(frame_length)
            if N_fft > frame_length:
                # Pad the frames to N_fft
                padding = [
                    [0, 0] if i != DEPTH_AXIS else [0, N_fft - frame_length]
                    for i in range(4)
                ]
                frames = tf.pad(frames, padding, "CONSTANT")
            elif N_fft < frame_length:
                # Downsample the frames to N_fft
                assert (DEPTH_AXIS == 2)
                frames = tf.image.resize_images(frames,
                                                [tf.shape(frames)[1], N_fft])

        ## New FFT
        #frames = tf.cast(tf.transpose(frames, [0, 1, 3, 2]), tf.float32)  # BLDC -> BLCD
        #mag_spec = tf.spectral.rfft(frames, fft_length=[N_fft] if N_fft is not None else None)
        #mag_spec = tf.cast(tf.transpose(mag_spec, [0, 1, 3, 2]), tf.float64)  # BLCD -> BLDC

        # FFT
        complex_frames = tf.complex(tf.cast(frames, tf.float32),
                                    tf.zeros(tf.shape(frames)))
        complex_frames = tf.transpose(complex_frames, [0, 1, 3, 2])  # BLDC -> BLCD
        spec = tf.fft(complex_frames)
        # Keep only the lower half of the spectrum (up to and including the Nyquist bin):
        complex_spec = tf.slice(spec,
                                tf.stack([0, 0, 0, 0]),
                                tf.stack([-1, -1, -1, N_fft // 2 + 1]),
                                name=name)
        complex_spec = tf.transpose(complex_spec, [0, 1, 3, 2])  # BLCD -> BLDC
        complex_spec = tf.cast(complex_spec, tf.complex128)

        mag_spec = magnitude(complex_spec, name="magnitude_spec")
        energy_spec = tf.square(mag_spec, name="energy_spec")
        log_mag_spec = decibels(mag_spec, name="log_magnitude_spec")
        # energy = magnitude ** 2, so in decibels the energy is twice the magnitude
        log_energy_spec = 2 * log_mag_spec
        return N_fft, mag_spec, energy_spec, log_mag_spec, log_energy_spec
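
# Illustrative sketch (not part of the original module): running the framing and
# spectrogram ops end to end. A TF1-style session is assumed (consistent with the
# tf.fft / tf.extract_image_patches calls above); the 16 kHz-style numbers are
# example values, not taken from this file, and the expected N_fft of 512 follows
# the get_Nfft behavior described in the docstring above.
def _demo_spectrogram_shapes():
    import numpy as np
    import tensorflow as tf
    signal = tf.constant(np.random.randn(1, 4000, 1))                   # BLC, float64
    frames = sliding_window(signal, frame_length=400, frame_shift=160)  # BLDC
    n_fft, mag, energy_s, log_mag, log_energy = timeseries_to_spec(frames, 400)
    with tf.Session() as sess:
        m = sess.run(mag)
    # n_fft == 512, so the depth axis has 512 // 2 + 1 == 257 bins;
    # m.shape == (1, 23, 257, 1), with 23 frames = (4000 - 400) // 160 + 1.
    return m.shape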
def __init__(self, audioPreprocessing, signal, name=None):
    name = scoping.adapt_name(name, "signal")
    with tf.name_scope(name):
        self.audio = audioPreprocessing
        self.signal = signal

        # Pad:
        with tf.name_scope("padding"):
            self.signal_padded = tf.pad(
                self.signal,
                tf.stack([
                    tf.stack([0, 0]) if i != LENGTH_AXIS else
                    tf.stack([0, self.audio.total_padding]) for i in range(3)
                ]),
                name="signal_padded")

        self.windowed = sliding_window(
            self.signal_padded,
            self.audio.frame_length_py,
            self.audio.frame_shift_py,
            max_number_frames=tf.reduce_max(self.audio.frame_counts),
            name="windowed_frames")

        # Per-frame log energy (natural log, floored at -50)
        self.frame_energy = tf.identity(
            tf.maximum(
                np.float64(-50.0),
                tf.log(
                    tf.reduce_sum(tf.square(self.windowed),
                                  axis=[DEPTH_AXIS]))),  # / tf.log(10.0)),
            name="frame_energy")
        self.frame_energy_db = tf.identity(10.0 * self.frame_energy,
                                           name="frame_energy_db")

        # Spec:
        _, self.magnitude_spectrogram, self.energy_spectrogram, \
            self.log_magnitude_spectrogram, self.log_energy_spectrogram = \
            timeseries_to_spec(self.windowed,
                               self.audio.frame_length_py,
                               window_type='hamming',
                               N_fft=self.audio.N_fft_py,
                               zero_pad=True,
                               name="spectrogram")

        # Fbanks:
        self.mel_fbank_features = apply_filterbank(
            self.energy_spectrogram,
            self.audio.filterbank,
            name="mel_fbank_features")
        self.mel_fbank_features_from_magnitude = apply_filterbank(
            self.magnitude_spectrogram,
            self.audio.filterbank,
            name="mel_fbank_features_from_magnitude")
        self.log_mel_fbank_features = decibels(
            self.mel_fbank_features, name="log_mel_fbank_features")

        # MFCCs:
        with tf.name_scope("mfccs"):
            self.mfscs = tf.maximum(tf.log(self.mel_fbank_features), -50,
                                    name="mfscs")
            mfscs = tf.transpose(self.mfscs, [0, 1, 3, 2])  # BLDC -> BLCD
            mfscs_flat = tf.reshape(mfscs, [-1, self.audio.filterbank_size_py])
            mfccs_flat = tf.matmul(mfscs_flat, self.audio.dct_matrix)
            shape = tf.concat(
                [tf.slice(tf.shape(mfscs), [0], [3]), [self.audio.mfcc_size]],
                axis=0,
                name="mfccs_shape")
            mfccs = tf.reshape(mfccs_flat, shape)
            mfccs = tf.transpose(mfccs, [0, 1, 3, 2])  # BLCD -> BLDC
            mfccs = tf.identity(mfccs, "mfccs")
            self.mfccs = mfccs
def __init__(self,
             raw_waveforms,
             raw_waveform_lengths,
             sample_rate,
             frame_length_ms,
             frame_shift_ms,
             last_sof=None,
             last_sin=None,
             online=True,
             channels=1,
             filterbank_size=23,
             mfcc_size=13,
             preemphasis=0.97,
             max_length=None,
             N_fft=512,
             name=None):
    ''' Preprocess audio, setting up ops to remove the DC offset, window the signal,
    perform the FFT, calculate mel filterbanks and MFCCs, etc.

    Params:
    - raw_waveforms: a BLC signal tensor in float format with values from -1 to 1.
      L is the number of samples.
    - raw_waveform_lengths: a B integer tensor with the length in samples of each
      example in the batch
    - sample_rate: a python integer. The sample rate for the raw_waveforms
    - frame_length_ms: a python integer. The frame length in milliseconds for
      signal processing
    - frame_shift_ms: a python integer. The frame shift in milliseconds for
      signal processing
    - last_sof, last_sin: optional [B, 1, C] tensors carrying over the DC-removal
      filter state (last output and input samples) from a previous chunk, for
      online processing
    - online: whether to remove the DC offset with the online recursive filter
      (True) or by subtracting the per-example mean (False)
    - channels: a python integer. The number of channels for the signal
    - filterbank_size: a python integer. The number of mel filters to use
    - mfcc_size: a python integer. The number of MFCCs to use
    - preemphasis: a python float. The preemphasis factor c (between 0 and 1) to
      apply: y[n] = x[n] - c * x[n - 1]
    - max_length: the max length in samples for the raw_waveforms (the tails will
      be clipped if they exceed it); None for no max length
    - N_fft: the number of FFT points to use
    '''
    name = scoping.adapt_name(name, "audio_preprocessing")
    with tf.name_scope(name):
        with tf.name_scope("params"):

            def convert_ms_to_samples(ms, name="ms_to_samples"):
                x = np.int32(float(ms) / float(ms_per_sec) * float(sample_rate))
                return constantify(x, name)

            self.frame_length_py, self.frame_length = convert_ms_to_samples(
                frame_length_ms, name="frame_length")
            self.frame_shift_py, self.frame_shift = convert_ms_to_samples(
                frame_shift_ms, name="frame_shift")
            if N_fft is None:
                N_fft = get_Nfft(self.frame_length_py)
            self.N_fft_py, self.N_fft = constantify(N_fft, "N_fft")

            self.raw_waveforms = raw_waveforms
            self.raw_waveform_lengths = raw_waveform_lengths
            self.sample_rate_py, self.sample_rate = constantify(
                sample_rate, name="sample_rate")
            self.frame_length_ms_py, self.frame_length_ms = constantify(
                frame_length_ms, "frame_length_ms")
            self.frame_shift_ms_py, self.frame_shift_ms = constantify(
                frame_shift_ms, "frame_shift_ms")
            self.channels_py, self.channels = constantify(channels, "channels")
            self.preemphasis_py, self.preemphasis = constantify(
                np.float64(preemphasis), "preemphasis")
            self.filterbank_size_py, self.filterbank_size = constantify(
                filterbank_size, "filterbank_size")
            self.mfcc_size_py, self.mfcc_size = constantify(
                mfcc_size, "mfcc_size")

            preemphasis, preemphasis_py = variableify(preemphasis,
                                                      name="preemphasis")

        with tf.name_scope("mask"):
            idxes = tf.tile(
                tf.expand_dims(
                    tf.range(0,
                             tf.shape(self.raw_waveforms)[LENGTH_AXIS],
                             1,
                             dtype=tf.int32), 0),
                [tf.shape(self.raw_waveforms)[BATCH_AXIS], 1])
            self.mask = idxes < tf.expand_dims(self.raw_waveform_lengths, 1)
            self.mask = tf.tile(
                tf.expand_dims(self.mask, -1),
                [1, 1, tf.shape(self.raw_waveforms)[-1]])

        self.dc_offset, self.s_of, self.last_sof, self.last_sin = remove_dc(
            self.raw_waveforms,
            self.raw_waveform_lengths,
            self.mask,
            last_sof=last_sof,
            last_sin=last_sin,
            online=online,
            name="remove_dc")

        with tf.name_scope("preemphasis"):
            # Alternative conv1d-based implementation (commented out):
            '''
            kernel = [
                [[-1.0 * self.preemphasis if i == j else 0.0
                  for j in range(channels)] for i in range(channels)],
                [[1.0 if i == j else 0.0
                  for j in range(channels)] for i in range(channels)]]
            self.s_pe = tf.nn.conv1d(
                tf.cast(self.s_of, tf.float32),
                tf.cast(kernel, tf.float32), 1, "SAME")
            self.s_pe = tf.cast(self.s_pe, tf.float64)
            '''
            zeros = tf.zeros([tf.shape(self.s_of)[0], 1, self.channels_py],
                             dtype=tf.float64)
            s_of = tf.concat([self.s_of, zeros], axis=LENGTH_AXIS)
            s_of_R = tf.concat([zeros, self.s_of], axis=LENGTH_AXIS)
            # y[n] = x[n] - c * x[n - 1]
            s_pe = s_of - self.preemphasis * s_of_R
            s_pe = tf.slice(s_pe, [0, 0, 0],
                            tf.stack([-1, tf.shape(s_pe)[1] - 1, -1]))
            self.s_pe = tf.identity(s_pe, name="s_pe")

        with tf.name_scope("padding"):
            self.padding = self.frame_length - tf.mod(
                self.raw_waveform_lengths - self.frame_length, self.frame_shift)
            self.padding = tf.where(tf.equal(self.padding, self.frame_length),
                                    0 * self.padding,
                                    self.padding,
                                    name="padding")
            self.padded_waveform_lengths = tf.add(self.raw_waveform_lengths,
                                                  self.padding,
                                                  name="padded_waveform_lengths")
            self.frame_counts = tf.divide(
                self.padded_waveform_lengths - self.frame_length,
                self.frame_shift,
                name="frame_counts")
            self.max_padded_length = tf.reduce_max(self.padded_waveform_lengths,
                                                   name="max_padded_length")
            self.total_padding = tf.identity(
                self.max_padded_length -
                tf.reduce_max(self.raw_waveform_lengths),
                name="total_padding")

        self.filterbank = tf.constant(mel_filterbank(
            self.N_fft_py, self.sample_rate_py,
            num_bands=self.filterbank_size_py),
                                      name="filterbank")
        self.dct_matrix = tf.constant(dct_matrix(
            self.filterbank_size_py, self.mfcc_size_py).astype(np.float64),
                                      name="dct_matrix")

        self.s_of = SignalPreprocessing(self, self.s_of, name="s_of")
        self.s_pe = SignalPreprocessing(self, self.s_pe, name="s_pe")

        self.aurora_features = tf.concat([
            tf.expand_dims(self.s_of.frame_energy, DEPTH_AXIS), self.s_pe.mfccs
        ],
                                         axis=DEPTH_AXIS,
                                         name="aurora_features")
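
# Illustrative usage sketch (not part of the original module). The enclosing class
# name AudioPreprocessing, the 16 kHz sample rate, and the placeholder shapes below
# are assumptions for the example, not values taken from this file:
#
#   waveforms = tf.placeholder(tf.float64, [None, None, 1])   # BLC
#   lengths = tf.placeholder(tf.int32, [None])                 # B
#   audio = AudioPreprocessing(waveforms, lengths, sample_rate=16000,
#                              frame_length_ms=25, frame_shift_ms=10)
#   features = audio.aurora_features  # frame energy (DC-removed signal) + MFCCs
#                                     # (preemphasized signal)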
def remove_dc(signal, signal_lengths, signal_mask, last_sof=None, last_sin=None,
              online=True, name=None):
    ''' Remove the DC offset from a BLC signal.

    Params:
    - signal: a BLC float64 signal tensor
    - signal_lengths: a B integer tensor with the valid length of each example
    - signal_mask: a BLC boolean tensor, True for valid (unpadded) samples
    - last_sof, last_sin: optional [B, 1, C] tensors with the filter state from a
      previous chunk (only used when online is True)
    - online: if True, use the recursive high-pass filter
      s_of[n] = s_in[n] - s_in[n-1] + 0.999 * s_of[n-1];
      if False, subtract the per-example mean over the valid samples
    Returns:
        (mean, demeaned, last_sof, last_sin); the last two are None when online
        is False
    '''
    name = scoping.adapt_name(name, "remove_dc")
    with tf.variable_scope(name):
        if online:
            batch_size = tf.shape(signal)[0]
            length = tf.shape(signal)[1]
            channels = tf.shape(signal)[-1]
            zeros = tf.zeros([batch_size, 1, channels], dtype=tf.float64)
            if last_sof is None:
                last_sof = zeros
            if last_sin is None:
                last_sin = zeros
            sof = last_sof
            i = tf.constant(0)

            def body(i, sof, last_sin):
                last_sof = tf.slice(sof, [0, i, 0], [-1, 1, -1])
                cur_sin = tf.slice(signal, [0, i, 0], [-1, 1, -1])
                cur_mask = tf.slice(signal_mask, [0, i, 0], [-1, 1, -1])
                # s_of[n] = s_in[n] - s_in[n-1] + 0.999 * s_of[n-1]
                cur_sof = tf.where(cur_mask,
                                   cur_sin - last_sin + 0.999 * last_sof, zeros)
                new_sof = tf.concat([sof, cur_sof], axis=1)
                newi = tf.add(i, 1)
                return newi, new_sof, cur_sin

            _, sof, _ = tf.while_loop(
                lambda i, sof, last_sin: tf.less(i, length),
                body,
                loop_vars=[i, sof, last_sin],
                shape_invariants=[
                    i.get_shape(),
                    tf.TensorShape([None, None, None]),
                    last_sin.get_shape()
                ],
                back_prop=False)

            # Drop the initial state column and restore the original shape
            sof = tf.slice(sof, [0, 1, 0], [-1, -1, -1])
            sof = tf.reshape(sof, tf.shape(signal))
            demeaned = tf.identity(sof, "demeaned")
            mean = tf.identity(signal - demeaned, "mean")
            # Carry the filter state for the next chunk: the last raw input sample
            # and the last filtered output sample
            last_sin = tf.slice(signal, [0, tf.shape(signal)[1] - 1, 0],
                                [-1, 1, -1])
            last_sof = tf.slice(demeaned, [0, tf.shape(demeaned)[1] - 1, 0],
                                [-1, 1, -1])
            return mean, demeaned, last_sof, last_sin
        else:
            # If index < signal_lengths, count it in the mean and remove the mean
            zeros = tf.zeros(tf.shape(signal), dtype=tf.float64)
            mean = tf.reduce_sum(
                tf.where(signal_mask, signal, zeros),
                axis=LENGTH_AXIS,
                keep_dims=True) / tf.cast(
                    tf.reshape(signal_lengths, [-1, 1, 1]), tf.float64)
            mean = tf.identity(mean, "mean")
            demeaned = tf.where(signal_mask, signal - mean, zeros)
            return mean, demeaned, None, None
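
# Illustrative sketch (not part of the original module): a plain NumPy reference
# of the online branch above, for a single mono channel. It applies the same
# recursive high-pass filter, s_of[n] = s_in[n] - s_in[n-1] + 0.999 * s_of[n-1],
# and can be used to sanity-check the graph version on small inputs.
def _reference_remove_dc(x, last_sin=0.0, last_sof=0.0, alpha=0.999):
    import numpy as np
    x = np.asarray(x, dtype=np.float64)
    out = np.zeros_like(x)
    for n in range(len(x)):
        out[n] = x[n] - last_sin + alpha * last_sof
        last_sin, last_sof = x[n], out[n]
    return out, last_sof, last_sin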