Example #1
0
def decibels(signal, name=None):
    '''
    Convert a tensor of raw magnitudes to decibels.

    Computes 10 * max(log10(signal), -50) elementwise. The floor of -50 is
    applied to the log10 value, i.e. before the scaling by 10.
    Params:
      - signal: tensor of raw magnitudes
      - name: optional name, resolved through scoping.adapt_name (defaults to "decibels")
    Returns:
      - tensor of decibel values
    '''
    scope = scoping.adapt_name(name, "decibels")
    with tf.name_scope(scope):
        # Natural log divided by ln(10) gives log10.
        log10_signal = tf.log(signal) / np.log(10)
        floored = tf.maximum(log10_signal, -50, name=scope)
        return 10 * floored
Example #2
0
def magnitude(complex_spec, name=None):
    '''
    Compute the raw magnitude spectrogram of a complex spectrogram.
    Params:
      - complex_spec: complex spectrogram tensor
      - name: optional name, resolved through scoping.adapt_name (defaults to "magnitude")
    Returns:
      - tf.abs of complex_spec
    '''
    scope = scoping.adapt_name(name, "magnitude")
    with tf.name_scope(scope):
        mag = tf.abs(complex_spec, name=scope)
        return mag
Example #3
0
def energy(complex_spec, name=None):
    '''
    Compute the raw energy spectrogram of a complex spectrogram.

    The energy is the spectrogram multiplied by its complex conjugate, cast
    to float64.
    Params:
      - complex_spec: complex spectrogram tensor
      - name: optional name, resolved through scoping.adapt_name (defaults to "energy")
    Returns:
      - float64 tensor of energies
    '''
    scope = scoping.adapt_name(name, "energy")
    with tf.name_scope(scope):
        conjugate = tf.conj(complex_spec)
        squared_modulus = complex_spec * conjugate
        return tf.cast(squared_modulus, tf.float64, name=scope)
Example #4
0
def apply_filterbank(spec, filter_bank, name=None):
    '''
    Project a spectrogram onto a filter bank.
    Params:
      - spec: BLDC where D is half the number of FFT bins (the lower half of the spec), the magnitude spectrum
      - filter_bank: [Half FFT bins, number filters] tensor, the filter bank
      - name: optional name, resolved through scoping.adapt_name (defaults to "apply_filterbank")
    Returns:
      - BLDC tensor where D is the number of filters, the filter bank features
    '''
    scope = scoping.adapt_name(name, "apply_filterbank")
    with tf.name_scope(scope):
        in_shape = tf.shape(spec)
        # The transposes below hard-code the depth axis position.
        assert (DEPTH_AXIS == 2)
        # Move D last so each row of the flattened matrix is one spectrum.
        depth_last = tf.transpose(spec, [0, 1, 3, 2])  # BLCD
        rows = tf.reshape(depth_last, [-1, in_shape[DEPTH_AXIS]])  # *D
        projected = tf.matmul(rows, filter_bank)  # *D'
        # Restore the leading B, L, C dims; -1 picks up the filter count.
        restored_shape = [in_shape[0], in_shape[1], in_shape[3], -1]
        projected = tf.reshape(projected, restored_shape)  # BLCD'
        features = tf.transpose(projected, [0, 1, 3, 2])  # BLD'C
        return tf.identity(features, scope)
Example #5
0
def sliding_window(inp,
                   frame_length,
                   frame_shift,
                   max_number_frames=None,
                   padding="VALID",
                   name=None):
    '''
    Runs a sliding window across a signal (of audio data, for example).
    Params:
      - inp:  A signal tensor in BLC format where L is the number of SAMPLES
      - frame_length:  The length of each frame in number of samples (a python integer)
      - frame_shift:  How many samples to shift the window (a python integer)
      - max_number_frames:  Optional frame limit (python number or scalar tensor, cast to int32).
            When given, only the first max_number_frames frames are kept. None keeps all frames.
      - padding:  How to pad the ends, can be "SAME" or "VALID"
      - name:  optional name, resolved through scoping.adapt_name (defaults to "sliding_window")
    Returns:
      - A BLDC signal tensor where L is the number of FRAMES and D is frame_length.
    '''
    assert (len(inp.shape) == 3)  # BLC
    name = scoping.adapt_name(name, "sliding_window")
    with tf.name_scope(name):
        # Add a trailing unit axis so the signal can be treated as an image.
        expanded = tf.expand_dims(inp, 3)
        lengths = [1, 1, 1, 1]
        shifts = [1, 1, 1, 1]
        lengths[LENGTH_AXIS] = frame_length
        shifts[LENGTH_AXIS] = frame_shift
        # Window the signal
        frames = tf.extract_image_patches(expanded, lengths, shifts,
                                          [1, 1, 1, 1], padding)
        # Identity check: max_number_frames may be a tensor, for which `!=`
        # is not a reliable "was an argument supplied" test.
        if max_number_frames is not None:
            # Clip the signal to only be the first max_number_frames frames
            # (-1 keeps every element along the other axes).
            slice_lengths = [
                -1
                if i != LENGTH_AXIS else tf.cast(max_number_frames, tf.int32)
                for i in range(4)
            ]
            frames = tf.slice(frames, [0, 0, 0, 0], tf.stack(slice_lengths))
        frames = tf.transpose(frames, [0, 1, 3, 2])  # BLCD --> BLDC
        frames = tf.identity(frames, name=name)
    return frames
Example #6
0
def timeseries_to_spec(frames,
                       frame_length,
                       window_type='hamming',
                       N_fft=None,
                       zero_pad=True,
                       name=None):
    '''
    Converts framed timeseries to spectrograms: applies a window function,
        optionally pads (or resizes) each frame to N_fft samples, runs an FFT and keeps
        the lower half of the spectrum.
    Params:
        - frames: A BLDC tensor where L is the number of frames and D is the frame length
        - frame_length: python integer frame_length
        - window_type: the type of window (the same types supported as get_window)
        - N_fft: the number of FFT points to use. When None it defaults to get_Nfft(frame_length)
            if zero_pad is set, and to frame_length otherwise (the frames are then transformed as-is)
        - zero_pad: whether to pad the frames up to N_fft (or resize them down to N_fft when
            N_fft is smaller than frame_length) before the FFT
        - name: optional name, resolved through scoping.adapt_name (defaults to "spec")
    Returns:
      N_fft, magnitude spec, energy spec, log magnitude spec (decibels), log energy spec (decibels)
      The spec is a BLDC tensor where L is the number of frames and D is the FFT bin count.
        FFT bin count is N_fft // 2 + 1 (to include the Nyquist frequency)
    '''
    name = scoping.adapt_name(name, "spec")
    with tf.name_scope(name):
        # The window each frame is multiplied with (broadcast along the depth axis)
        window = tf.constant(get_window(window_type,
                                        frame_length).astype(np.float64),
                             name="window")
        frames = tf.multiply(
            frames,
            tf.reshape(window,
                       [1 if i != DEPTH_AXIS else -1 for i in range(4)]),
            name="windowing")
        # Resolve N_fft up front so the bin count below is always defined.
        # Without padding the FFT runs over the frames as they are, so its
        # length is frame_length.
        if N_fft is None:
            N_fft = get_Nfft(frame_length) if zero_pad else frame_length
        # NOTE(review): with zero_pad=False and an explicit N_fft that differs
        # from frame_length, the frames are not resized and the slice below
        # still uses N_fft -- confirm callers never rely on that combination.
        # Padding/clipping to N_fft
        if zero_pad:
            if N_fft > frame_length:
                # Pad the frames to N_fft
                padding = [
                    [0, 0] if i != DEPTH_AXIS else [0, N_fft - frame_length]
                    for i in range(4)
                ]
                frames = tf.pad(frames, padding, "CONSTANT")
            elif N_fft < frame_length:
                # Downsample the frames to N_fft
                assert (DEPTH_AXIS == 2)
                frames = tf.image.resize_images(frames,
                                                [tf.shape(frames)[1], N_fft])
        ## New FFT
        #frames = tf.cast(tf.transpose(frames, [0, 1, 3, 2]), tf.float32) # BLDC -> BLCD
        #mag_spec = tf.spectral.rfft(frames, fft_length=[N_fft] if N_fft is not None else None)
        #mag_spec = tf.cast(tf.transpose(mag_spec, [0, 1, 3, 2]), tf.float64) # BLCD -> BLDC
        # FFT: build a complex tensor with a zero imaginary part
        complex_frames = tf.complex(tf.cast(frames, tf.float32),
                                    tf.zeros(tf.shape(frames)))
        # tf.fft transforms the innermost axis, so move D last
        complex_frames = tf.transpose(complex_frames,
                                      [0, 1, 3, 2])  # BLDC -> BLCD
        spec = tf.fft(complex_frames)
        # Clip second half of spec:
        complex_spec = tf.slice(spec,
                                tf.stack([0, 0, 0, 0]),
                                tf.stack([-1, -1, -1, N_fft // 2 + 1]),
                                name=name)
        complex_spec = tf.transpose(complex_spec, [0, 1, 3, 2])  # BLCD -> BLDC
        complex_spec = tf.cast(complex_spec, tf.complex128)
        mag_spec = magnitude(complex_spec, name="magnitude_spec")

        energy_spec = tf.square(mag_spec, name="energy_spec")
        log_mag_spec = decibels(mag_spec, name="log_magnetude_spec")
        # log(m^2) == 2 * log(m), so the log energy reuses the log magnitude
        log_energy_spec = 2 * log_mag_spec
        return N_fft, mag_spec, energy_spec, log_mag_spec, log_energy_spec
Example #7
0
 def __init__(self, audioPreprocessing, signal, name=None):
     '''
     Build the per-signal feature ops: padded signal, windowed frames, frame
         energy, spectrograms, mel filter bank features and MFCCs.
     Params:
         - audioPreprocessing: object supplying the shared settings read below (total_padding,
             frame_length_py, frame_shift_py, frame_counts, N_fft_py, filterbank,
             filterbank_size_py, dct_matrix, mfcc_size)
         - signal: the signal tensor to process (padded along LENGTH_AXIS as a rank-3 tensor)
         - name: optional name, resolved through scoping.adapt_name (defaults to "signal")
     '''
     scope = scoping.adapt_name(name, "signal")
     with tf.name_scope(scope):
         self.audio = audioPreprocessing
         self.signal = signal
         # Padding, framing and frame energy all live in the "padding" scope.
         with tf.name_scope("padding"):
             # One [before, after] pair per axis; only the length axis is padded.
             pad_pairs = []
             for axis in range(3):
                 if axis == LENGTH_AXIS:
                     pad_pairs.append(
                         tf.stack([0, self.audio.total_padding]))
                 else:
                     pad_pairs.append(tf.stack([0, 0]))
             self.signal_padded = tf.pad(self.signal,
                                         tf.stack(pad_pairs),
                                         name="signal_padded")
             frame_limit = tf.reduce_max(self.audio.frame_counts)
             self.windowed = sliding_window(self.signal_padded,
                                            self.audio.frame_length_py,
                                            self.audio.frame_shift_py,
                                            max_number_frames=frame_limit,
                                            name="windowed_frames")
             # Natural log of the summed squares per frame, floored at -50
             # (the conversion to base 10 is not applied).
             squared = tf.square(self.windowed)
             summed = tf.reduce_sum(squared, axis=[DEPTH_AXIS])
             log_energy = tf.log(summed)
             floored = tf.maximum(np.float64(-50.0), log_energy)
             self.frame_energy = tf.identity(floored, name="frame_energy")
             scaled_energy = 10.0 * self.frame_energy
             self.frame_energy_db = tf.identity(scaled_energy,
                                                name="frame_energy_db")
         # Spectrograms (the returned N_fft is not needed here)
         spec_outputs = timeseries_to_spec(self.windowed,
                                           self.audio.frame_length_py,
                                           window_type='hamming',
                                           N_fft=self.audio.N_fft_py,
                                           zero_pad=True,
                                           name="spectrogram")
         (_, self.magnitude_spectrogram, self.energy_spectrogram,
          self.log_magnitude_spectrogram,
          self.log_energy_spectrogram) = spec_outputs
         # Filter bank features, from the energy and the magnitude spectrum
         self.mel_fbank_features = apply_filterbank(
             self.energy_spectrogram,
             self.audio.filterbank,
             name="mel_fbank_features")
         self.mel_fbank_features_from_magnitude = apply_filterbank(
             self.magnitude_spectrogram,
             self.audio.filterbank,
             name="mel_fbank_features_from_magnitude")
         self.log_mel_fbank_features = decibels(
             self.mel_fbank_features, name="log_mel_fbank_features")
         # MFCCs: floored log filter bank features times the DCT matrix
         with tf.name_scope("mfccs"):
             log_fbank = tf.log(self.mel_fbank_features)
             self.mfscs = tf.maximum(log_fbank, -50, name="mfscs")
             depth_last = tf.transpose(self.mfscs,
                                       [0, 1, 3, 2])  # BLDC -> BLCD
             flat_in = tf.reshape(depth_last,
                                  [-1, self.audio.filterbank_size_py])
             flat_out = tf.matmul(flat_in, self.audio.dct_matrix)
             # Keep the leading three dims and swap in the MFCC count.
             leading_dims = tf.slice(tf.shape(depth_last), [0], [3])
             out_shape = tf.concat([leading_dims, [self.audio.mfcc_size]],
                                   axis=0,
                                   name="mfccs_shape")
             coefficients = tf.reshape(flat_out, out_shape)
             coefficients = tf.transpose(coefficients,
                                         [0, 1, 3, 2])  # BLCD -> BLDC
             self.mfccs = tf.identity(coefficients, "mfccs")
Example #8
0
    def __init__(self,
                 raw_waveforms,
                 raw_waveform_lengths,
                 sample_rate,
                 frame_length_ms,
                 frame_shift_ms,
                 last_sof=None,
                 last_sin=None,
                 online=True,
                 channels=1,
                 filterbank_size=23,
                 mfcc_size=13,
                 preemphasis=0.97,
                 max_length=None,
                 N_fft=512,
                 name=None):
        '''
        Preprocess audio, setting ops to remove DC offset, window the function, perform FFT, calculate
            mel filterbanks and mfccs, etc.
        Params:
            - raw_waveforms: a BLC signal tensor in float format with values from -1 to 1. L is the number of samples.
            - raw_waveform_lengths: a B integer tensor with the length in samples of each example in the batch
            - sample_rate: a python integer. The sample rate for the raw_waveforms
            - frame_length_ms: a python integer. The frame length in milliseconds for signal processing
            - frame_shift_ms: a python integer. The frame shift in milliseconds for signal processing
            - last_sof: passed through to remove_dc (previous DC-removed sample, or None)
            - last_sin: passed through to remove_dc (previous input sample, or None)
            - online: passed through to remove_dc to select its online or offline mode
            - channels: a python integer: the number of channels for the signal
            - filterbank_size: a python integer. The number of mel filters to use
            - mfcc_size: a python integer. The number of MFCCs to use
            - preemphasis: a python float. The preemphasis factor (between 0 and 1) to apply: c in Y = (1 - Rc)X
            - max_length: the max_length in samples for the raw_waveforms (the tails will be clipped if they exceed it); None for no max length
                NOTE(review): max_length is not read anywhere in this constructor -- confirm whether clipping happens elsewhere
            - N_fft: the number of FFT points to use (None falls back to get_Nfft of the frame length)
            - name: optional name, resolved through scoping.adapt_name (defaults to "audio_preprocessing")
        '''
        name = scoping.adapt_name(name, "audio_preprocessing")
        with tf.name_scope(name):
            with tf.name_scope("params"):

                def convert_ms_to_samples(ms, name="ms_to_samples"):
                    # Sample count for a duration in ms at sample_rate; np.int32 truncates.
                    x = np.int32(
                        float(ms) / float(ms_per_sec) * float(sample_rate))
                    return constantify(x, name)

                # constantify is unpacked as a pair everywhere below: the
                # "_py" attribute holds the first element, the unsuffixed
                # attribute the second.
                self.frame_length_py, self.frame_length = convert_ms_to_samples(
                    frame_length_ms, name="frame_length")
                self.frame_shift_py, self.frame_shift = convert_ms_to_samples(
                    frame_shift_ms, name="frame_shift")
                if N_fft is None:
                    N_fft = get_Nfft(self.frame_length_py)
                self.N_fft_py, self.N_fft = constantify(N_fft, "N_fft")
                self.raw_waveforms = raw_waveforms
                self.raw_waveform_lengths = raw_waveform_lengths
                self.sample_rate_py, self.sample_rate = constantify(
                    sample_rate, name="sample_rate")
                self.frame_length_ms_py, self.frame_length_ms = constantify(
                    frame_length_ms, "frame_length_ms")
                # Unpack like every other parameter so frame_shift_ms is not
                # left holding the whole (python value, tensor) pair.
                self.frame_shift_ms_py, self.frame_shift_ms = constantify(
                    frame_shift_ms, "frame_shift_ms")
                self.channels_py, self.channels = constantify(
                    channels, "channels")
                self.preemphasis_py, self.preemphasis = constantify(
                    np.float64(preemphasis), "preemphasis")
                self.filterbank_size_py, self.filterbank_size = constantify(
                    filterbank_size, "filterbank_size")
                self.mfcc_size_py, self.mfcc_size = constantify(
                    mfcc_size, "mfcc_size")
                # NOTE(review): the results are unused below; the call is kept
                # in case variableify registers graph state -- confirm.
                preemphasis, preemphasis_py = variableify(preemphasis,
                                                          name="preemphasis")
            with tf.name_scope("mask"):
                # [B, L] grid of sample indices, one row per batch element
                idxes = tf.tile(
                    tf.expand_dims(
                        tf.range(0,
                                 tf.shape(self.raw_waveforms)[LENGTH_AXIS],
                                 1,
                                 dtype=tf.int32), 0),
                    [tf.shape(self.raw_waveforms)[BATCH_AXIS], 1])
                # True where the sample index lies inside the example's length
                self.mask = idxes < tf.expand_dims(self.raw_waveform_lengths,
                                                   1)

                # Repeat the mask across the channel axis -> same rank as the signal
                self.mask = tf.tile(
                    tf.expand_dims(self.mask, -1),
                    [1, 1, tf.shape(self.raw_waveforms)[-1]])
            self.dc_offset, self.s_of, self.last_sof, self.last_sin = remove_dc(
                self.raw_waveforms,
                self.raw_waveform_lengths,
                self.mask,
                last_sof=last_sof,
                last_sin=last_sin,
                online=online,
                name="remove_dc")
            with tf.name_scope("preemphasis"):
                # A conv1d-based implementation of this step used to sit here
                # as disabled code; the shift-and-subtract form below is the
                # one in use.
                zeros = tf.zeros([tf.shape(self.s_of)[0], 1, self.channels_py],
                                 dtype=tf.float64)
                # s_of_R is s_of delayed by one sample (zero prepended); the
                # zero appended to s_of keeps both operands the same length.
                s_of = tf.concat([self.s_of, zeros], axis=LENGTH_AXIS)
                s_of_R = tf.concat([zeros, self.s_of], axis=LENGTH_AXIS)
                s_pe = s_of - self.preemphasis * s_of_R
                # Drop the extra trailing sample introduced by the padding above
                s_pe = tf.slice(s_pe, [0, 0, 0],
                                tf.stack([-1, tf.shape(s_pe)[1] - 1, -1]))
                self.s_pe = tf.identity(s_pe, name="s_pe")
            with tf.name_scope("padding"):
                self.padding = self.frame_length - tf.mod(
                    self.raw_waveform_lengths - self.frame_length,
                    self.frame_shift)
                # A remainder of zero yields a full frame_length of padding;
                # replace that with no padding at all.
                self.padding = tf.where(tf.equal(self.padding,
                                                 self.frame_length),
                                        0 * self.padding,
                                        self.padding,
                                        name="padding")
                self.padded_waveform_lengths = tf.add(
                    self.raw_waveform_lengths,
                    self.padding,
                    name="padded_waveform_lengths")
                # NOTE(review): tf.divide is true division, so frame_counts may
                # be fractional -- confirm this is what consumers expect.
                self.frame_counts = tf.divide(self.padded_waveform_lengths -
                                              self.frame_length,
                                              self.frame_shift,
                                              name="frame_counts")
                self.max_padded_length = tf.reduce_max(
                    self.padded_waveform_lengths, name="max_padded_length")
                self.total_padding = tf.identity(
                    self.max_padded_length -
                    tf.reduce_max(self.raw_waveform_lengths),
                    name="total_padding")
            self.filterbank = tf.constant(mel_filterbank(
                self.N_fft_py,
                self.sample_rate_py,
                num_bands=self.filterbank_size_py),
                                          name="filterbank")
            self.dct_matrix = tf.constant(dct_matrix(
                self.filterbank_size_py, self.mfcc_size_py).astype(np.float64),
                                          name="dct_matrix")
            # From here on s_of / s_pe are SignalPreprocessing objects wrapping
            # the tensors computed above, not the tensors themselves.
            self.s_of = SignalPreprocessing(self, self.s_of, name="s_of")
            self.s_pe = SignalPreprocessing(self, self.s_pe, name="s_pe")
            # Frame energy of the DC-removed signal stacked with the MFCCs of
            # the pre-emphasised signal along the depth axis.
            self.aurora_features = tf.concat([
                tf.expand_dims(self.s_of.frame_energy, DEPTH_AXIS),
                self.s_pe.mfccs
            ],
                                             axis=DEPTH_AXIS,
                                             name="aurora_features")
Example #9
0
def remove_dc(signal,
              signal_lengths,
              signal_mask,
              last_sof=None,
              last_sin=None,
              online=True,
              name=None):
    '''
    Remove the DC offset from a batch of signals.

    Online mode runs the recursive filter
        sof[t] = sin[t] - sin[t-1] + 0.999 * sof[t-1]
    sample by sample, writing zeros wherever signal_mask is False.
    Offline mode subtracts each example's mean over its unmasked samples.
    Params:
      - signal: a float64 signal tensor indexed as [batch, sample, channel]
      - signal_lengths: per-example lengths in samples (one value per batch element)
      - signal_mask: boolean tensor shaped like signal, True for valid samples
      - last_sof: [batch, 1, channel] previous filter output to seed online mode (zeros if None)
      - last_sin: [batch, 1, channel] previous input sample to seed online mode (zeros if None)
      - online: python bool selecting online (True) or offline (False) mode
      - name: optional name, resolved through scoping.adapt_name (defaults to "remove_dc")
    Returns:
      (mean, demeaned, last_sof, last_sin) where mean is the removed offset and demeaned
      the offset-free signal. last_sof / last_sin are the final filter output and input
      sample in online mode, and None in offline mode.
    '''
    name = scoping.adapt_name(name, "remove_dc")
    with tf.variable_scope(name):
        if online:
            batch_size = tf.shape(signal)[0]
            length = tf.shape(signal)[1]
            channels = tf.shape(signal)[-1]
            zeros = tf.zeros([batch_size, 1, channels], dtype=tf.float64)
            if last_sof is None:
                last_sof = zeros
            if last_sin is None:
                last_sin = zeros
            # sof accumulates outputs along the sample axis; its first column
            # is the seed value and is stripped again after the loop.
            sof = last_sof
            i = tf.constant(0)

            def body(i, sof, last_sin):
                # Column i of sof is the output for step i - 1 (seed at i == 0)
                last_sof = tf.slice(sof, [0, i, 0], [-1, 1, -1])
                cur_sin = tf.slice(signal, [0, i, 0], [-1, 1, -1])
                cur_mask = tf.slice(signal_mask, [0, i, 0], [-1, 1, -1])
                cur_sof = tf.where(cur_mask,
                                   cur_sin - last_sin + 0.999 * last_sof,
                                   zeros)
                new_sof = tf.concat([sof, cur_sof], axis=1)
                newi = tf.add(i, 1)
                return newi, new_sof, cur_sin

            # The third loop variable ends up holding the last input sample,
            # which is the last_sin to hand to the next call.
            _, sof, final_sin = tf.while_loop(
                lambda i, sof, last_sin: tf.less(i, length),
                body,
                loop_vars=[i, sof, last_sin],
                shape_invariants=[
                    i.get_shape(),
                    # sof grows by one column per iteration
                    tf.TensorShape([None, None, None]),
                    last_sin.get_shape()
                ],
                back_prop=False)
            # Drop the seed column
            sof = tf.slice(sof, [0, 1, 0], [-1, -1, -1])
            sof = tf.reshape(sof, tf.shape(signal))
            demeaned = tf.identity(sof, "demeaned")
            mean = tf.identity(signal - demeaned, "mean")
            # Use the sample carried out of the loop. Slicing the one-step
            # seed tensor at index length - 1 is out of range for any signal
            # longer than one sample.
            last_sin = final_sin
            last_sof = tf.slice(demeaned, [0, tf.shape(demeaned)[1] - 1, 0],
                                [-1, 1, -1])
            return mean, demeaned, last_sof, last_sin
        else:
            # If index < signal_lengths, count in the mean and remove mean
            zeros = tf.zeros(tf.shape(signal), dtype=tf.float64)
            total = tf.reduce_sum(tf.where(signal_mask, signal, zeros),
                                  axis=LENGTH_AXIS,
                                  keep_dims=True)
            # Shape the lengths as [B, 1, 1] so the division broadcasts per
            # example; a [B, 1] divisor would align against the channel axis
            # and produce a [B, B, C] result for batches larger than one.
            lengths = tf.cast(tf.reshape(signal_lengths, [-1, 1, 1]),
                              tf.float64)
            mean = tf.identity(total / lengths, "mean")
            demeaned = tf.where(signal_mask, signal - mean, zeros)
            return mean, demeaned, None, None