def get_features(model_settings):
    if model_settings['preprocess'] == 'micro':
        window_size_ms = (model_settings['window_size_samples'] *
                          1000) / model_settings['sample_rate']
        window_step_ms = (model_settings['window_stride_samples'] *
                          1000) / model_settings['sample_rate']
        int16_input = tf.cast(tf.multiply(input_data, 32768), tf.int16)
        # print(int16_input.shape)

        # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/microfrontend/python/ops/audio_microfrontend_op.py
        micro_frontend = frontend_op.audio_microfrontend(
            int16_input,
            sample_rate=model_settings['sample_rate'],
            window_size=window_size_ms,
            window_step=window_step_ms,
            num_channels=model_settings['fingerprint_width'],
            out_scale=1,
            out_type=tf.float32)
        output = tf.multiply(micro_frontend, (10.0 / 256.0))
        return output

    elif model_settings['preprocess'] == 'mfcc':
        # https://www.tensorflow.org/api_docs/python/tf/raw_ops/AudioSpectrogram
        spectrogram = audio_ops.audio_spectrogram(
                  input_data,
                  window_size=model_settings['window_size_samples'],
                  stride=model_settings['window_stride_samples'],
                  magnitude_squared=True)
        output = audio_ops.mfcc(
                spectrogram,
                model_settings['sample_rate'],
                dct_coefficient_count=model_settings['fingerprint_width'])
        return output[0,:,:] #just return channel 0 as 2D tensor

    elif model_settings['preprocess'] == 'average':
        spectrogram = audio_ops.audio_spectrogram(
                  input_data,
                  window_size=model_settings['window_size_samples'],
                  stride=model_settings['window_stride_samples'],
                  magnitude_squared=True)
        output = tf.nn.pool(
                  input=tf.expand_dims(spectrogram, -1),
                  window_shape=[1, model_settings['average_window_width']],
                  strides=[1, model_settings['average_window_width']],
                  pooling_type='AVG',
                  padding='SAME')
        return output[0,:,:,0] #just return channel 0 as 2D tensor

    else:
        raise ValueError(f'Unknown preprocess mode "{model_settings["preprocess"]}" '
                         '(should be "mfcc", "average", or "micro")')
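
A minimal, self-contained sketch of the 'average' branch above, run eagerly on a random one-second clip; the 480/160 window sizes and the pooling width of 6 are illustrative assumptions, not values taken from model_settings:

import numpy as np
import tensorflow as tf
from tensorflow.python.ops import gen_audio_ops as audio_ops

waveform = tf.constant(np.random.uniform(-1.0, 1.0, (16000, 1)), dtype=tf.float32)
spectrogram = audio_ops.audio_spectrogram(
    waveform, window_size=480, stride=160, magnitude_squared=True)
pooled = tf.nn.pool(
    input=tf.expand_dims(spectrogram, -1),
    window_shape=[1, 6],
    strides=[1, 6],
    pooling_type='AVG',
    padding='SAME')
print(pooled[0, :, :, 0].shape)  # (98, 43): frames x pooled frequency bins
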
def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride,
                   num_mfcc):
    """
    Calculate MFCC(Mel Frequency Cepstral Coefficient) for a given audio signal
    Args:
        audio_signal: Raw audio signal in range [-1, 1]
        audio_sample_rate: sample rate for signal
        window_size: window size in samples for calculating spectrogram
        window_stride: window stride
        num_mfcc: number of mfcc features

    Returns:
        calculated mfcc feature

    """
    spectrogram = audio_ops.audio_spectrogram(input=audio_signal,
                                              window_size=window_size,
                                              stride=window_stride,
                                              magnitude_squared=True)
    mfcc_features = audio_ops.mfcc(spectrogram,
                                   audio_sample_rate,
                                   dct_coefficient_count=num_mfcc)

    # Note: the audio op API changed in TF versions newer than 2.3.
    # TODO: add an alternative implementation for those versions.

    return mfcc_features
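
A hedged usage example for calculate_mfcc: the clip length, window sizes, and coefficient count below are illustrative, and audio_ops is assumed to be tensorflow.python.ops.gen_audio_ops as the snippet implies.

import numpy as np
import tensorflow as tf

signal = tf.constant(np.random.uniform(-1.0, 1.0, (16000, 1)), dtype=tf.float32)
features = calculate_mfcc(signal, audio_sample_rate=16000,
                          window_size=480, window_stride=320, num_mfcc=10)
print(features.shape)  # (1, 49, 10): [channels, frames, num_mfcc]
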
def run_mfcc(input_width=40,
             window_size_samples=480,
             window_stride_samples=320,
             sample_rate=16000):
    """ Build an MFCC graph for a .wav file.

        Args:
            input_width: number of MFCC coefficients per frame
            window_size_samples: spectrogram window size in samples
            window_stride_samples: spectrogram window stride in samples
            sample_rate: expected sample rate of the WAV file
        Returns:
            The MFCC tensor (49 x 40 = 1960 values for a one-second clip with
            the defaults) and the wav filename placeholder to feed.

        """
    wav_filename_placeholder = tf.compat.v1.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(wav_filename_placeholder)
    wav_decoder = tf.audio.decode_wav(wav_loader,
                                      desired_channels=1,
                                      desired_samples=sample_rate)
    #background_clamp = tf.clip_by_value(wav_decoder.audio, -1.0, 1.0)
    spectrogram = audio_ops.audio_spectrogram(wav_decoder.audio,
                                              window_size=window_size_samples,
                                              stride=window_stride_samples,
                                              magnitude_squared=True)

    mfcc = audio_ops.mfcc(spectrogram,
                          wav_decoder.sample_rate,
                          dct_coefficient_count=input_width)

    return mfcc, wav_filename_placeholder
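
Because run_mfcc builds a TF1-style graph around a filename placeholder, it has to be driven through a tf.compat.v1.Session; a hedged sketch assuming the module-level tf/io_ops imports the snippet implies, with 'speech.wav' as a hypothetical path:

tf.compat.v1.disable_eager_execution()
mfcc_node, wav_path_placeholder = run_mfcc()
with tf.compat.v1.Session() as sess:
    mfcc_out = sess.run(mfcc_node,
                        feed_dict={wav_path_placeholder: 'speech.wav'})
print(mfcc_out.shape)  # (1, 49, 40) for a one-second 16 kHz clip
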
Example #4
    def _mfcc_op(self, inputs):
        # MFCC implementation based on TF custom op (supported by TFLite)
        # It reduces model size in comparison to _mfcc_tf
        if (self.mode == modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE
                or self.mode == modes.Modes.STREAM_INTERNAL_STATE_INFERENCE):
            outputs = self.data_frame(inputs)
            # in streaming mode there is only one frame for the FFT calculation;
            # dims will be [batch=1, time=1, frame],
            # but audio_spectrogram requires 2D input data, so we remove the time dim
            outputs = tf.squeeze(outputs, axis=1)
        else:
            outputs = inputs

        # outputs has dims [batch, time]
        # but audio_spectrogram expects [time, channels/batch] so transpose it
        outputs = tf.transpose(outputs, [1, 0])

        # outputs: [time, channels/batch]
        outputs = audio_ops.audio_spectrogram(
            outputs,
            window_size=self.frame_size,
            stride=self.frame_step,
            magnitude_squared=self.params['fft_magnitude_squared'])
        # outputs: [channels/batch, frames, fft_feature]

        outputs = audio_ops.mfcc(
            outputs,
            self.params['sample_rate'],
            upper_frequency_limit=self.params['mel_upper_edge_hertz'],
            lower_frequency_limit=self.params['mel_lower_edge_hertz'],
            filterbank_channel_count=self.params['mel_num_bins'],
            dct_coefficient_count=self.params['dct_num_features'])
        # outputs: [channels/batch, frames, dct_coefficient_count]
        outputs = self.spec_augment(outputs)
        return outputs
Example #5
def get_mfcc(waveform):
    # Run the spectrogram and MFCC ops to get a 2D audio fingerprint: short-time FFTs
    # waveform dims: [time, channels]
    sample_rate = 16000

    spectrogram = audio_ops.audio_spectrogram(waveform,
                                              window_size=320,
                                              stride=160)
    # spectrogram: [channels/batch, frames, fft_feature]

    # extract mfcc features from the spectrogram with audio_ops.mfcc:
    # 1. Input is spectrogram frames.
    # 2. Weight the spectrogram into bands using a triangular mel filterbank.
    # 3. Apply logarithmic scaling.
    # 4. Discrete cosine transform (DCT); keep the lowest dct_coefficient_count.
    mfccs = audio_ops.mfcc(spectrogram=spectrogram,
                           sample_rate=sample_rate,
                           upper_frequency_limit=7600,
                           lower_frequency_limit=60,
                           filterbank_channel_count=40,
                           dct_coefficient_count=20)
    # mfccs: [channels/batch, frames, dct_coefficient_count]
    # (the leading channel/batch dim is kept here)

    return mfccs
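
Downstream code usually drops the leading channel/batch dimension of the result; a short assumed usage (random clip, imports as in the snippets above):

waveform = tf.random.uniform((16000, 1), minval=-1.0, maxval=1.0)
features = tf.squeeze(get_mfcc(waveform), axis=0)
print(features.shape)  # (99, 20): frames x dct_coefficient_count
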
Example #6
def samples_to_mfccs_orig(samples, sample_rate, train_phase=False):
    #tf.print('window_size: ', Config.audio_window_samples, ' stride: ', Config.audio_step_samples)
    spectrogram = contrib_audio.audio_spectrogram(samples,
                                                  window_size=Config.audio_window_samples,
                                                  stride=Config.audio_step_samples,
                                                  magnitude_squared=True)

    # Data Augmentations
    if train_phase:
        if FLAGS.augmentation_spec_dropout_keeprate < 1:
            spectrogram = augment_dropout(spectrogram,
                                          keep_prob=FLAGS.augmentation_spec_dropout_keeprate)

        if FLAGS.augmentation_freq_and_time_masking:
            spectrogram = augment_freq_time_mask(spectrogram,
                                                 frequency_masking_para=FLAGS.augmentation_freq_and_time_masking_freq_mask_range,
                                                 time_masking_para=FLAGS.augmentation_freq_and_time_masking_time_mask_range,
                                                 frequency_mask_num=FLAGS.augmentation_freq_and_time_masking_number_freq_masks,
                                                 time_mask_num=FLAGS.augmentation_freq_and_time_masking_number_time_masks)

        if FLAGS.augmentation_pitch_and_tempo_scaling:
            spectrogram = augment_pitch_and_tempo(spectrogram,
                                                  max_tempo=FLAGS.augmentation_pitch_and_tempo_scaling_max_tempo,
                                                  max_pitch=FLAGS.augmentation_pitch_and_tempo_scaling_max_pitch,
                                                  min_pitch=FLAGS.augmentation_pitch_and_tempo_scaling_min_pitch)

        if FLAGS.augmentation_speed_up_std > 0:
            spectrogram = augment_speed_up(spectrogram, speed_std=FLAGS.augmentation_speed_up_std)

    mfccs = contrib_audio.mfcc(spectrogram, sample_rate, dct_coefficient_count=Config.n_input)
    mfccs = tf.reshape(mfccs, [-1, Config.n_input])
    #tf.print('dct_count: ', Config.n_input)
    return mfccs, tf.shape(input=mfccs)[0]
Example #7
def audio_to_features(audio, sample_rate, transcript=None, clock=0.0, train_phase=False, augmentations=None, sample_id=None):
    if train_phase:
        # We need the lambdas to make TensorFlow happy.
        # pylint: disable=unnecessary-lambda
        tf.cond(tf.math.not_equal(sample_rate, FLAGS.audio_sample_rate),
                lambda: tf.print('WARNING: sample rate of sample', sample_id, '(', sample_rate, ') '
                                 'does not match FLAGS.audio_sample_rate. This can lead to incorrect results.'),
                lambda: tf.no_op(),
                name='matching_sample_rate')

    if train_phase and augmentations:
        audio = apply_graph_augmentations('signal', audio, augmentations, transcript=transcript, clock=clock)

    spectrogram = contrib_audio.audio_spectrogram(audio,
                                                  window_size=Config.audio_window_samples,
                                                  stride=Config.audio_step_samples,
                                                  magnitude_squared=True)

    if train_phase and augmentations:
        spectrogram = apply_graph_augmentations('spectrogram', spectrogram, augmentations, transcript=transcript, clock=clock)

    features = contrib_audio.mfcc(spectrogram=spectrogram,
                                  sample_rate=sample_rate,
                                  dct_coefficient_count=Config.n_input,
                                  upper_frequency_limit=FLAGS.audio_sample_rate / 2)
    features = tf.reshape(features, [-1, Config.n_input])

    if train_phase and augmentations:
        features = apply_graph_augmentations('features', features, augmentations, transcript=transcript, clock=clock)

    return features, tf.shape(input=features)[0]
def get_deepspeech_mfccs(samples, sample_rate=16000):
    decoded = contrib_audio.decode_wav(samples, desired_channels=1)
    spectrogram = contrib_audio.audio_spectrogram(decoded.audio,
                                                  window_size=512,
                                                  stride=320,
                                                  magnitude_squared=True)
    return contrib_audio.mfcc(spectrogram=spectrogram,
                              sample_rate=decoded.sample_rate,
                              dct_coefficient_count=26,
                              upper_frequency_limit=sample_rate / 2)
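
A hedged way to exercise get_deepspeech_mfccs without a file on disk is to encode a random clip to WAV bytes with tf.audio.encode_wav; the values below are illustrative only:

wav_bytes = tf.audio.encode_wav(
    tf.random.uniform((16000, 1), minval=-1.0, maxval=1.0), sample_rate=16000)
mfccs = get_deepspeech_mfccs(wav_bytes)
print(mfccs.shape)  # (1, 49, 26) with the 512/320 window settings above
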
Example #9
def AudioToMfcc(sample_rate, audio, window_size_ms, window_stride_ms,
                num_coefficients):
    window_size_samples = sample_rate * window_size_ms // 1000
    window_stride_samples = sample_rate * window_stride_ms // 1000
    spectrogram = audio_ops.audio_spectrogram(audio,
                                              window_size=window_size_samples,
                                              stride=window_stride_samples,
                                              magnitude_squared=True)
    mfcc = audio_ops.mfcc(spectrogram,
                          sample_rate,
                          dct_coefficient_count=num_coefficients)
    return mfcc
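
The integer divisions above are the usual millisecond-to-sample conversion: at 16 kHz a 30 ms window is 480 samples and a 10 ms stride is 160 samples. A short assumed invocation:

audio = tf.random.uniform((16000, 1), minval=-1.0, maxval=1.0)
mfcc = AudioToMfcc(sample_rate=16000, audio=audio,
                   window_size_ms=30, window_stride_ms=10, num_coefficients=13)
print(mfcc.shape)  # (1, 98, 13): [channels, frames, num_coefficients]
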
Example #10
def samples_to_mfccs(samples, sample_rate):
    spectrogram = contrib_audio.audio_spectrogram(
        samples,
        window_size=Config.audio_window_samples,
        stride=Config.audio_step_samples,
        magnitude_squared=True)
    mfccs = contrib_audio.mfcc(spectrogram,
                               sample_rate,
                               dct_coefficient_count=Config.n_input)
    mfccs = tf.reshape(mfccs, [-1, Config.n_input])

    return mfccs, tf.shape(input=mfccs)[0]
def samples_to_mfccs(samples, sample_rate, train_phase=False, sample_id=None):
    if train_phase:
        # We need the lambdas to make TensorFlow happy.
        # pylint: disable=unnecessary-lambda
        tf.cond(tf.math.not_equal(sample_rate, FLAGS.audio_sample_rate),
                lambda: tf.print('WARNING: sample rate of sample', sample_id, '(', sample_rate, ') '
                                 'does not match FLAGS.audio_sample_rate. This can lead to incorrect results.'),
                lambda: tf.no_op(),
                name='matching_sample_rate')

    spectrogram = contrib_audio.audio_spectrogram(samples,
                                                  window_size=Config.audio_window_samples,
                                                  stride=Config.audio_step_samples,
                                                  magnitude_squared=True)

    # Data Augmentations
    if train_phase:
        if FLAGS.augmentation_spec_dropout_keeprate < 1:
            spectrogram = augment_dropout(spectrogram,
                                          keep_prob=FLAGS.augmentation_spec_dropout_keeprate)

        # sparse warp must come before freq/time masking
        if FLAGS.augmentation_sparse_warp:
            spectrogram = augment_sparse_warp(spectrogram,
                                              time_warping_para=FLAGS.augmentation_sparse_warp_time_warping_para,
                                              interpolation_order=FLAGS.augmentation_sparse_warp_interpolation_order,
                                              regularization_weight=FLAGS.augmentation_sparse_warp_regularization_weight,
                                              num_boundary_points=FLAGS.augmentation_sparse_warp_num_boundary_points,
                                              num_control_points=FLAGS.augmentation_sparse_warp_num_control_points)

        if FLAGS.augmentation_freq_and_time_masking:
            spectrogram = augment_freq_time_mask(spectrogram,
                                                 frequency_masking_para=FLAGS.augmentation_freq_and_time_masking_freq_mask_range,
                                                 time_masking_para=FLAGS.augmentation_freq_and_time_masking_time_mask_range,
                                                 frequency_mask_num=FLAGS.augmentation_freq_and_time_masking_number_freq_masks,
                                                 time_mask_num=FLAGS.augmentation_freq_and_time_masking_number_time_masks)

        if FLAGS.augmentation_pitch_and_tempo_scaling:
            spectrogram = augment_pitch_and_tempo(spectrogram,
                                                  max_tempo=FLAGS.augmentation_pitch_and_tempo_scaling_max_tempo,
                                                  max_pitch=FLAGS.augmentation_pitch_and_tempo_scaling_max_pitch,
                                                  min_pitch=FLAGS.augmentation_pitch_and_tempo_scaling_min_pitch)

        if FLAGS.augmentation_speed_up_std > 0:
            spectrogram = augment_speed_up(spectrogram, speed_std=FLAGS.augmentation_speed_up_std)

    mfccs = contrib_audio.mfcc(spectrogram=spectrogram,
                               sample_rate=sample_rate,
                               dct_coefficient_count=Config.n_input,
                               upper_frequency_limit=FLAGS.audio_sample_rate/2)
    mfccs = tf.reshape(mfccs, [-1, Config.n_input])

    return mfccs, tf.shape(input=mfccs)[0]
    def make_features(self, audio: np.ndarray) -> np.ndarray:
        """ Use TensorFlow audio ops to extract MFCC features from the
        decoded audio. """
        spectrogram = contrib_audio.audio_spectrogram(
            audio.audio,
            window_size=self.window_size,
            stride=self.window_step,
            magnitude_squared=True)
        mfccs = contrib_audio.mfcc(spectrogram=spectrogram,
                                   sample_rate=self.sample_rate,
                                   dct_coefficient_count=self.features_num,
                                   upper_frequency_limit=self.sample_rate // 2)
        return self.standardize(
            mfccs[0]) if self.is_standardization else mfccs[0]
Example #13
def samples_to_mfccs(samples, sample_rate):

    spectrogram = contrib_audio.audio_spectrogram(samples,
                                                  window_size=512,
                                                  stride=320,
                                                  magnitude_squared=True)

    mfccs = contrib_audio.mfcc(spectrogram=spectrogram,
                               sample_rate=sample_rate,
                               dct_coefficient_count=26,
                               upper_frequency_limit=4000)
    mfccs = tf.reshape(mfccs, [-1, 26])

    return mfccs, tf.shape(input=mfccs)[0]
    def prepare_processing_graph(self, model_settings):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
        desired_samples = model_settings['desired_samples']
        channel_count = model_settings['channel_count']
        sample_rate = model_settings['sample_rate']
        self.foreground_data_placeholder_ = tf.placeholder(
            tf.float32, [desired_samples, channel_count])
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
        scaled_foreground = tf.multiply(self.foreground_data_placeholder_,
                                        self.foreground_volume_placeholder_)
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
        self.waveform_ = scaled_foreground
        spectrograms = []
        for ichannel in range(channel_count):
            spectrograms.append(
                audio_ops.audio_spectrogram(
                    tf.slice(scaled_foreground, [0, ichannel], [-1, 1]),
                    window_size=model_settings['window_size_samples'],
                    stride=model_settings['window_stride_samples'],
                    magnitude_squared=True))
        self.spectrogram_ = tf.stack(spectrograms, -1)
        mfccs = []
        for ichannel in range(channel_count):
            mfccs.append(
                audio_ops.mfcc(
                    spectrograms[ichannel],
                    sample_rate,
                    upper_frequency_limit=model_settings['sample_rate'] // 2,
                    filterbank_channel_count=model_settings[
                        'filterbank_channel_count'],
                    dct_coefficient_count=model_settings[
                        'dct_coefficient_count']))
        self.mfcc_ = tf.stack(mfccs, -1)
    def make_features(self, audio: np.ndarray) -> np.ndarray:
        """Use Tensorflow  lib to extract
           log filter banks from the features file. """
        audio = audio[:, np.newaxis]
        spectrogram = contrib_audio.audio_spectrogram(
            audio,
            window_size=self.window_size,
            stride=self.window_step,
            magnitude_squared=True)
        mfccs = contrib_audio.mfcc(spectrogram=spectrogram,
                                   sample_rate=self.sample_rate,
                                   dct_coefficient_count=self.features_num,
                                   upper_frequency_limit=8000)

        # take the first channel only
        return mfccs[0]
Example #16
def samples_to_mfccs(samples, sample_rate):
    # 16000 = default sample rate
    # 32 = default feature extraction audio window length in milliseconds
    audio_window_samples = int(16000 * (32 / 1000))  # 512 samples
    # 20 = default feature extraction window step length in milliseconds
    audio_step_samples = int(16000 * (20 / 1000))  # 320 samples
    spectrogram = contrib_audio.audio_spectrogram(
        samples,
        window_size=audio_window_samples,
        stride=audio_step_samples,
        magnitude_squared=True)

    mfccs = contrib_audio.mfcc(spectrogram,
                               sample_rate,
                               dct_coefficient_count=n_input)
    mfccs = tf.reshape(mfccs, [-1, n_input])

    return mfccs, tf.shape(input=mfccs)[0]
Example #17
def samples_to_mfccs(samples, sample_rate, train_phase=False):
    spectrogram = contrib_audio.audio_spectrogram(samples,
                                                  window_size=Config.audio_window_samples,
                                                  stride=Config.audio_step_samples,
                                                  magnitude_squared=True)

    # Data Augmentations
    if train_phase:
        if FLAGS.augmentation_spec_dropout_keeprate < 1:
            spectrogram = augment_dropout(spectrogram,
                                          keep_prob=FLAGS.augmentation_spec_dropout_keeprate)

        # sparse warp must come before freq/time masking
        if FLAGS.augmentation_sparse_warp:
            spectrogram = augment_sparse_warp(spectrogram,
                                              time_warping_para=FLAGS.augmentation_sparse_warp_time_warping_para,
                                              interpolation_order=FLAGS.augmentation_sparse_warp_interpolation_order,
                                              regularization_weight=FLAGS.augmentation_sparse_warp_regularization_weight,
                                              num_boundary_points=FLAGS.augmentation_sparse_warp_num_boundary_points,
                                              num_control_points=FLAGS.augmentation_sparse_warp_num_control_points)

        if FLAGS.augmentation_freq_and_time_masking:
            spectrogram = augment_freq_time_mask(spectrogram,
                                                 frequency_masking_para=FLAGS.augmentation_freq_and_time_masking_freq_mask_range,
                                                 time_masking_para=FLAGS.augmentation_freq_and_time_masking_time_mask_range,
                                                 frequency_mask_num=FLAGS.augmentation_freq_and_time_masking_number_freq_masks,
                                                 time_mask_num=FLAGS.augmentation_freq_and_time_masking_number_time_masks)

        if FLAGS.augmentation_pitch_and_tempo_scaling:
            spectrogram = augment_pitch_and_tempo(spectrogram,
                                                  max_tempo=FLAGS.augmentation_pitch_and_tempo_scaling_max_tempo,
                                                  max_pitch=FLAGS.augmentation_pitch_and_tempo_scaling_max_pitch,
                                                  min_pitch=FLAGS.augmentation_pitch_and_tempo_scaling_min_pitch)

        if FLAGS.augmentation_speed_up_std > 0:
            spectrogram = augment_speed_up(spectrogram, speed_std=FLAGS.augmentation_speed_up_std)

    mfccs = contrib_audio.mfcc(spectrogram=spectrogram,
                               sample_rate=sample_rate,
                               dct_coefficient_count=Config.n_input,
                               upper_frequency_limit=FLAGS.audio_sample_rate/2)
    mfccs = tf.reshape(mfccs, [-1, Config.n_input])

    return mfccs, tf.shape(input=mfccs)[0]
Example #18
def callback(input_data, frame_count, time_info, flags):
    global samples
    # print("Got audio " + str(frame_count))
    new_samples = np.frombuffer(input_data, np.float32)
    samples = np.concatenate((samples, new_samples))
    samples = samples[-16000:]
    
    if len(samples) == 16000:
        start = time.perf_counter()
        # normalise the samples to zero mean and unit peak
        normalised = samples - np.mean(samples)
        peak = np.max(normalised)
        if peak > 0:
            normalised = normalised / peak

        # create the spectrogram
        spectrogram = audio_ops.audio_spectrogram(
            np.reshape(normalised, (16000, 1)),
            window_size=320,
            stride=160,
            magnitude_squared=True)
        # reduce the number of frequency bins in our spectrogram to a more sensible level
        spectrogram = tf.nn.pool(
            input=tf.expand_dims(spectrogram, -1),
            window_shape=[1, 6],
            strides=[1, 6],
            pooling_type='AVG',
            padding='SAME')
        # remove the leading batch/channel dimension
        spectrogram = tf.squeeze(spectrogram, axis=0)
        spectrogram = np.log10(spectrogram + 1e-6)
        prediction = model.predict(np.reshape(spectrogram, (1, 99, 43, 1)))
        if prediction[0][0] > 0.9:
            print(f"{datetime.now().time()} - Here I am, brain the size of a planet.... {prediction[0][0]}")
        end = time.perf_counter()
        # print((end-start)*1000)

    return input_data, pyaudio.paContinue
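
A hedged sketch of attaching the callback above to a PyAudio input stream; the buffer size is an assumption, and the samples buffer and model global must already exist as in the original script:

import pyaudio

p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paFloat32, channels=1, rate=16000,
                input=True, frames_per_buffer=4000, stream_callback=callback)
stream.start_stream()
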
def get_spectrogram(filename, window_size_samples, window_stride_samples,
                    sess):
    """Create Spectrogram from the PCM-encoded audio data

     Args:
        wav_data: 2D array of float PCM-encoded audio data.
        sess: current session being run
     Returns:
         2-D spectrogram of audio
     
      """
    wav_data = load_wav_file(filename, sess)
    #print(wav_data.shape)
    wav_data_placeholder = tf.compat.v1.placeholder(tf.float32, [None, 1])
    spectrogram = audio_ops.audio_spectrogram(wav_data_placeholder,
                                              window_size=window_size_samples,
                                              stride=window_stride_samples,
                                              magnitude_squared=True)

    spectrogram = sess.run(
        spectrogram,
        feed_dict={wav_data_placeholder: np.reshape(wav_data, (-1, 1))})

    return spectrogram
Example #20
def powspec_feat(samples,
                 sr=8000,
                 nfft=512,
                 winlen=0.025,
                 winstep=0.010,
                 lowfreq=0,
                 highfreq=None,
                 preemph=0.97):
    '''Compute a power spectrogram.

    params:
        samples: [nsample, channels]
    returns:
        powspec: power spectrogram, shape [channels, nframe, nfft / 2 + 1]
    '''
    del nfft
    del lowfreq
    del highfreq
    del preemph

    #pylint: disable=no-member
    feat = audio_ops.audio_spectrogram(samples,
                                       window_size=int(winlen * sr),
                                       stride=int(winstep * sr),
                                       magnitude_squared=True)
    return feat
Example #21
def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride,
                   num_mfcc):
    """Returns Mel Frequency Cepstral Coefficients (MFCC) for a given audio signal.

    Args:
        audio_signal: Raw audio signal in range [-1, 1]
        audio_sample_rate: Audio signal sample rate
        window_size: Window size in samples for calculating spectrogram
        window_stride: Window stride in samples for calculating spectrogram
        num_mfcc: The number of MFCC features wanted.

    Returns:
        Calculated MFCC features.
    """
    spectrogram = audio_ops.audio_spectrogram(input=audio_signal,
                                              window_size=window_size,
                                              stride=window_stride,
                                              magnitude_squared=True)

    mfcc_features = audio_ops.mfcc(spectrogram,
                                   audio_sample_rate,
                                   dct_coefficient_count=num_mfcc)

    return mfcc_features
Example #22
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
    """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: Length of each audio clip to be analyzed, in milliseconds.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    feature_bin_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    preprocess: How the spectrogram is processed to produce features, for
      example 'mfcc', 'average', or 'micro'.

  Returns:
    Input and output tensor objects.

  Raises:
    Exception: If the preprocessing mode isn't recognized.
  """

    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, feature_bin_count, preprocess)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.compat.v1.placeholder(tf.string, [],
                                                    name='wav_data')
    decoded_sample_data = tf.audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = audio_ops.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)

    if preprocess == 'average':
        fingerprint_input = tf.nn.pool(
            input=tf.expand_dims(spectrogram, -1),
            window_shape=[1, model_settings['average_window_width']],
            strides=[1, model_settings['average_window_width']],
            pooling_type='AVG',
            padding='SAME')
    elif preprocess == 'mfcc':
        fingerprint_input = audio_ops.mfcc(
            spectrogram,
            sample_rate,
            dct_coefficient_count=model_settings['fingerprint_width'])
    elif preprocess == 'micro':
        if not frontend_op:
            raise Exception(
                'Micro frontend op is currently not available when running TensorFlow'
                ' directly from Python, you need to build and run through Bazel, for'
                ' example'
                ' `bazel run tensorflow/examples/speech_commands:freeze_graph`'
            )
        sample_rate = model_settings['sample_rate']
        window_size_ms = (model_settings['window_size_samples'] *
                          1000) / sample_rate
        window_step_ms = (model_settings['window_stride_samples'] *
                          1000) / sample_rate
        int16_input = tf.cast(tf.multiply(decoded_sample_data.audio, 32767),
                              tf.int16)
        micro_frontend = frontend_op.audio_microfrontend(
            int16_input,
            sample_rate=sample_rate,
            window_size=window_size_ms,
            window_step=window_step_ms,
            num_channels=model_settings['fingerprint_width'],
            out_scale=1,
            out_type=tf.float32)
        fingerprint_input = tf.multiply(micro_frontend, (10.0 / 256.0))

    elif preprocess == "rune":
        fingerprint_input = np.random.uniform(0, 26, 1960).astype(np.float32)

    else:
        raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
                        ' "average", or "micro")' % (preprocess))

    fingerprint_size = model_settings['fingerprint_size']
    reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])

    logits = models.create_model(reshaped_input,
                                 model_settings,
                                 model_architecture,
                                 is_training=False,
                                 runtime_settings=runtime_settings)

    # Create an output to use for inference.
    softmax = tf.nn.softmax(logits, name='labels_softmax')

    return reshaped_input, softmax
Example #23
    def prepare_processing_graph(self, model_settings, summaries_dir):
        """Builds a TensorFlow graph to apply the input distortions.

        Creates a graph that loads a WAVE file, decodes it, scales the volume,
        shifts it in time, adds in background noise, calculates a spectrogram, and
        then builds an MFCC fingerprint from that.

        This must be called with an active TensorFlow session running, and it
        creates multiple placeholder inputs, and one output:

          - wav_filename_placeholder_: Filename of the WAV to load.
          - foreground_volume_placeholder_: How loud the main clip should be.
          - time_shift_padding_placeholder_: Where to pad the clip.
          - time_shift_offset_placeholder_: How much to move the clip in time.
          - background_data_placeholder_: PCM sample data for background noise.
          - background_volume_placeholder_: Loudness of mixed-in background.
          - output_: Output 2D fingerprint of processed audio.

        Args:
          model_settings: Information about the current model being trained.
          summaries_dir: Path to save training summary information to.

        Raises:
          ValueError: If the preprocessing mode isn't recognized.
          Exception: If the preprocessor wasn't compiled in.
        """
        with tf.compat.v1.get_default_graph().name_scope('data'):
            desired_samples = model_settings['desired_samples']
            self.wav_filename_placeholder_ = tf.compat.v1.placeholder(
                tf.string, [], name='wav_filename')
            wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
            wav_decoder = tf.audio.decode_wav(wav_loader,
                                              desired_channels=1,
                                              desired_samples=desired_samples)
            # Allow the audio sample's volume to be adjusted.
            self.foreground_volume_placeholder_ = tf.compat.v1.placeholder(
                tf.float32, [], name='foreground_volume')
            scaled_foreground = tf.multiply(
                wav_decoder.audio, self.foreground_volume_placeholder_)
            # Shift the sample's start position, and pad any gaps with zeros.
            self.time_shift_padding_placeholder_ = tf.compat.v1.placeholder(
                tf.int32, [2, 2], name='time_shift_padding')
            self.time_shift_offset_placeholder_ = tf.compat.v1.placeholder(
                tf.int32, [2], name='time_shift_offset')
            padded_foreground = tf.pad(
                tensor=scaled_foreground,
                paddings=self.time_shift_padding_placeholder_,
                mode='CONSTANT')
            sliced_foreground = tf.slice(padded_foreground,
                                         self.time_shift_offset_placeholder_,
                                         [desired_samples, -1])
            # Mix in background noise.
            self.background_data_placeholder_ = tf.compat.v1.placeholder(
                tf.float32, [desired_samples, 1], name='background_data')
            self.background_volume_placeholder_ = tf.compat.v1.placeholder(
                tf.float32, [], name='background_volume')
            background_mul = tf.multiply(self.background_data_placeholder_,
                                         self.background_volume_placeholder_)
            background_add = tf.add(background_mul, sliced_foreground)
            background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
            # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
            spectrogram = audio_ops.audio_spectrogram(
                background_clamp,
                window_size=model_settings['window_size_samples'],
                stride=model_settings['window_stride_samples'],
                magnitude_squared=True)
            # remove summary
            # tf.compat.v1.summary.image(
            #     'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)
            # The number of buckets in each FFT row in the spectrogram will depend on
            # how many input samples there are in each window. This can be quite
            # large, with a 160 sample window producing 127 buckets for example. We
            # don't need this level of detail for classification, so we often want to
            # shrink them down to produce a smaller result. That's what this section
            # implements. One method is to use average pooling to merge adjacent
            # buckets, but a more sophisticated approach is to apply the MFCC
            # algorithm to shrink the representation.
            if model_settings['preprocess'] == 'average':
                self.output_ = tf.nn.pool(
                    input=tf.expand_dims(spectrogram, -1),
                    window_shape=[1, model_settings['average_window_width']],
                    strides=[1, model_settings['average_window_width']],
                    pooling_type='AVG',
                    padding='SAME')
                # tf.compat.v1.summary.image('shrunk_spectrogram',
                #                            self.output_,
                #                            max_outputs=1)
            elif model_settings['preprocess'] == 'mfcc':
                self.output_ = audio_ops.mfcc(
                    spectrogram,
                    wav_decoder.sample_rate,
                    dct_coefficient_count=model_settings['fingerprint_width'])
                # tf.compat.v1.summary.image(
                #     'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
            elif model_settings['preprocess'] == 'micro':
                if not frontend_op:
                    raise Exception(
                        'Micro frontend op is currently not available when running'
                        ' TensorFlow directly from Python, you need to build and run'
                        ' through Bazel')
                sample_rate = model_settings['sample_rate']
                window_size_ms = (model_settings['window_size_samples'] *
                                  1000) / sample_rate
                window_step_ms = (model_settings['window_stride_samples'] *
                                  1000) / sample_rate
                int16_input = tf.cast(tf.multiply(background_clamp, 32768),
                                      tf.int16)
                micro_frontend = frontend_op.audio_microfrontend(
                    int16_input,
                    sample_rate=sample_rate,
                    window_size=window_size_ms,
                    window_step=window_step_ms,
                    num_channels=model_settings['fingerprint_width'],
                    out_scale=1,
                    out_type=tf.float32)
                self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
                # tf.compat.v1.summary.image(
                #     'micro',
                #     tf.expand_dims(tf.expand_dims(self.output_, -1), 0),
                #     max_outputs=1)
            else:
                raise ValueError(
                    'Unknown preprocess mode "%s" (should be "mfcc", '
                    ' "average", or "micro")' % (model_settings['preprocess']))
Example #24
        def preprocess(audio, label):
            # If we're time shifting, set up the offset for this sample.
            if time_shift > 0:
                time_shift_amount = tf.random.uniform([],
                                                      -time_shift,
                                                      time_shift,
                                                      dtype=tf.int32)
            else:
                time_shift_amount = 0
            if time_shift_amount > 0:
                time_shift_padding = [[time_shift_amount, 0], [0, 0]]
                time_shift_offset = [0, 0]
            else:
                time_shift_padding = [[0, -time_shift_amount], [0, 0]]
                time_shift_offset = [-time_shift_amount, 0]

            # Choose a section of background noise to mix in.
            if use_background or label == SILENCE_INDEX:
                background_index = tf.random.uniform(
                    [], 0, self.background_data.shape[0], dtype=tf.int32)
                background_samples = self.background_data[background_index]
                background_offset = tf.random.uniform(
                    [],
                    0,
                    tf.shape(background_samples)[0] - desired_samples,
                    dtype=tf.int32)
                background_clipped = background_samples[background_offset:(
                    background_offset + desired_samples)]
                background_data = tf.reshape(background_clipped,
                                             [desired_samples, 1])

                if label == SILENCE_INDEX:
                    background_volume = tf.random.uniform([], 0, 1)
                elif tf.random.uniform([], 0, 1) < background_frequency:
                    background_volume = tf.random.uniform(
                        [], 0, background_volume_range)
                else:
                    background_volume = 0.0
            else:
                background_data = tf.zeros([desired_samples, 1])
                background_volume = 0.0

            # If we want silence, mute out the main sample but leave the background.
            foreground_volume = 0.0 if label == SILENCE_INDEX else 1.0

            # Allow the audio sample's volume to be adjusted.
            scaled_foreground = tf.multiply(audio, foreground_volume)
            # Shift the sample's start position, and pad any gaps with zeros.
            padded_foreground = tf.pad(tensor=scaled_foreground,
                                       paddings=time_shift_padding,
                                       mode='CONSTANT')
            sliced_foreground = tf.slice(padded_foreground, time_shift_offset,
                                         [desired_samples, -1])
            sliced_foreground.set_shape((sliced_foreground.shape[0], 1))

            # Mix in background noise.
            background_volume = tf.cast(background_volume, tf.float32)
            background_mul = tf.multiply(background_data, background_volume)
            background_add = tf.add(background_mul, sliced_foreground)
            background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

            spectrogram = audio_ops.audio_spectrogram(background_clamp,
                                                      window_size=frame_length,
                                                      stride=frame_step,
                                                      magnitude_squared=True)
            x = audio_ops.mfcc(spectrogram,
                               sample_rate,
                               dct_coefficient_count=num_channels,
                               upper_frequency_limit=7500,
                               lower_frequency_limit=20)
            x = tf.reshape(x, (spectrogram_length, num_channels, 1))
            return x, label
Example #25
    def prepare_processing_graph(self, flags):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - foreground_resampling_placeholder_: Controls signal stretching/squeezing
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio or raw audio.

    Args:
      flags: data and model parameters, described at model_train.py

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
        with tf.get_default_graph().name_scope('data'):
            desired_samples = flags.desired_samples
            self.wav_filename_placeholder_ = tf.placeholder(
                tf.string, [], name='wav_filename')
            wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
            wav_decoder = tf.audio.decode_wav(wav_loader,
                                              desired_channels=1,
                                              desired_samples=desired_samples)

            # Allow the audio sample's volume to be adjusted.
            self.foreground_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='foreground_volume')
            # signal resampling to generate more training data
            # it will stretch or squeeze the input signal proportionally to:
            self.foreground_resampling_placeholder_ = tf.placeholder(
                tf.float32, [])

            if self.foreground_resampling_placeholder_ != 1.0:
                image = tf.expand_dims(wav_decoder.audio, 0)
                image = tf.expand_dims(image, 2)
                shape = tf.shape(wav_decoder.audio)
                image_resized = tf.image.resize(
                    images=image,
                    size=(tf.cast((tf.cast(shape[0], tf.float32) *
                                   self.foreground_resampling_placeholder_),
                                  tf.int32), 1),
                    preserve_aspect_ratio=False)
                image_resized_cropped = tf.image.resize_with_crop_or_pad(
                    image_resized,
                    target_height=desired_samples,
                    target_width=1,
                )
                image_resized_cropped = tf.squeeze(image_resized_cropped,
                                                   axis=[0, 3])
                scaled_foreground = tf.multiply(
                    image_resized_cropped, self.foreground_volume_placeholder_)
            else:
                scaled_foreground = tf.multiply(
                    wav_decoder.audio, self.foreground_volume_placeholder_)
            # Shift the sample's start position, and pad any gaps with zeros.
            self.time_shift_padding_placeholder_ = tf.placeholder(
                tf.int32, [2, 2], name='time_shift_padding')
            self.time_shift_offset_placeholder_ = tf.placeholder(
                tf.int32, [2], name='time_shift_offset')
            padded_foreground = tf.pad(
                tensor=scaled_foreground,
                paddings=self.time_shift_padding_placeholder_,
                mode='CONSTANT')
            sliced_foreground = tf.slice(padded_foreground,
                                         self.time_shift_offset_placeholder_,
                                         [desired_samples, -1])
            # Mix in background noise.
            self.background_data_placeholder_ = tf.placeholder(
                tf.float32, [desired_samples, 1], name='background_data')
            self.background_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='background_volume')
            background_mul = tf.multiply(self.background_data_placeholder_,
                                         self.background_volume_placeholder_)
            background_add = tf.add(background_mul, sliced_foreground)
            background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

            if flags.preprocess == 'raw':
                # background_clamp dims: [time, channels]
                # remove channel dim
                self.output_ = tf.squeeze(background_clamp, axis=1)
            # The options below are kept for backward compatibility with the
            # previous version of hotword detection on microcontrollers.
            # In that case audio feature extraction is done separately from the
            # neural net, and the user will have to manage it.
            elif flags.preprocess == 'mfcc':
                # Run the spectrogram and MFCC ops to get a 2D audio fingerprint: short-time FFTs
                # background_clamp dims: [time, channels]
                spectrogram = audio_ops.audio_spectrogram(
                    background_clamp,
                    window_size=flags.window_size_samples,
                    stride=flags.window_stride_samples,
                    magnitude_squared=flags.fft_magnitude_squared)
                # spectrogram: [channels/batch, frames, fft_feature]

                # extract mfcc features from the spectrogram with audio_ops.mfcc:
                # 1. Input is spectrogram frames.
                # 2. Weight the spectrogram into bands using a triangular mel filterbank.
                # 3. Apply logarithmic scaling.
                # 4. Discrete cosine transform (DCT); keep the lowest dct_coefficient_count.
                mfcc = audio_ops.mfcc(
                    spectrogram=spectrogram,
                    sample_rate=flags.sample_rate,
                    upper_frequency_limit=flags.mel_upper_edge_hertz,
                    lower_frequency_limit=flags.mel_lower_edge_hertz,
                    filterbank_channel_count=flags.mel_num_bins,
                    dct_coefficient_count=flags.dct_num_features)
                # mfcc: [channels/batch, frames, dct_coefficient_count]
                # remove channel dim
                self.output_ = tf.squeeze(mfcc, axis=0)
            elif flags.preprocess == 'micro':
                if not frontend_op:
                    raise Exception(
                        'Micro frontend op is currently not available when running'
                        ' TensorFlow directly from Python, you need to build and run'
                        ' through Bazel')
                int16_input = tf.cast(
                    tf.multiply(background_clamp, MAX_ABS_INT16), tf.int16)
                # audio_microfrontend does:
                # 1. A slicing window function of raw audio
                # 2. Short-time FFTs
                # 3. Filterbank calculations
                # 4. Noise reduction
                # 5. PCAN Auto Gain Control
                # 6. Logarithmic scaling

                # int16_input dims: [time, channels]
                micro_frontend = frontend_op.audio_microfrontend(
                    int16_input,
                    sample_rate=flags.sample_rate,
                    window_size=flags.window_size_ms,
                    window_step=flags.window_stride_ms,
                    num_channels=flags.mel_num_bins,
                    upper_band_limit=flags.mel_upper_edge_hertz,
                    lower_band_limit=flags.mel_lower_edge_hertz,
                    out_scale=1,
                    out_type=tf.float32)
                # micro_frontend dims: [frames, num_channels]
                self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
            else:
                raise ValueError(
                    'Unknown preprocess mode "%s" (should be "raw", '
                    ' "mfcc", or "micro")' % (flags.preprocess))
def gen_spectrogram_onnx_test_model(model_path,
                                    window_count,
                                    window_size,
                                    stride,
                                    magnitude_squared=True):

    # Tensor sizes.
    input_length = window_size + (window_count - 1) * stride
    fft_length = int(2**np.ceil(np.log2(window_size)))
    input_shape = [1, input_length]
    spectrogram_length = int(fft_length / 2 + 1)
    spectrogram_shape = [window_count, spectrogram_length]
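    # Worked example (assumed values): with window_size=320, stride=160 and
    # window_count=98, input_length = 320 + 97 * 160 = 15840 samples,
    # fft_length = 2**ceil(log2(320)) = 512, and spectrogram_length = 257.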

    # Generate random input data.
    np.random.seed(1)
    input_data = np.random.randn(*input_shape)

    # ----------------------------------------- COMPUTE TensorFlow REFERENCE -------------------------------------------
    # Define TensorFlow model.
    tf_input = tf.constant(input_data.reshape([input_length, 1]),
                           name='input',
                           dtype=tf.float32)
    tf_spectrogram = audio_ops.audio_spectrogram(
        tf_input,
        window_size=window_size,
        stride=stride,
        magnitude_squared=magnitude_squared)

    # Run TensorFlow model and get reference output.
    with tf.Session() as sess:
        spectrogram_ref = sess.run(tf_spectrogram)
    spectrogram_ref = np.reshape(spectrogram_ref, spectrogram_shape)

    # ---------------------------------------------- NODE DEFINITION  --------------------------------------------------
    # AudioSpectrogram node definition.
    spectrogram_node_def = onnx.helper.make_node(
        'AudioSpectrogram',
        name='audio_spectrogram',
        inputs=['input'],
        outputs=['spectrogram'],
        window_size=int(window_size),
        stride=int(stride),
        magnitude_squared=int(magnitude_squared))

    # Error node definition.
    err_node_def = onnx.helper.make_node(
        'Sub',
        name='error',
        inputs=['spectrogram', 'spectrogram_ref'],
        outputs=['spectrogram_err'])

    # --------------------------------------------- GRAPH DEFINITION  --------------------------------------------------
    graph_input = list()
    graph_init = list()
    graph_output = list()

    # Graph inputs.
    graph_input.append(
        helper.make_tensor_value_info('input', TensorProto.FLOAT, input_shape))
    graph_input.append(
        helper.make_tensor_value_info('spectrogram_ref', TensorProto.FLOAT,
                                      spectrogram_shape))

    # Graph initializers.
    graph_init.append(make_init('input', TensorProto.FLOAT, input_data))
    graph_init.append(
        make_init('spectrogram_ref', TensorProto.FLOAT, spectrogram_ref))

    # Graph outputs.
    graph_output.append(
        helper.make_tensor_value_info('spectrogram_err', TensorProto.FLOAT,
                                      spectrogram_shape))

    # Graph name.
    graph_name = 'audio_spectrogram_test'

    # Define graph (GraphProto).
    graph_def = helper.make_graph([spectrogram_node_def, err_node_def],
                                  graph_name,
                                  inputs=graph_input,
                                  outputs=graph_output)

    # Set initializers.
    graph_def.initializer.extend(graph_init)

    # --------------------------------------------- MODEL DEFINITION  --------------------------------------------------
    # Define model (ModelProto).
    model_def = helper.make_model(graph_def,
                                  producer_name='onnx-audio-spectrogram')

    # Print model.
    with open(model_path, 'w') as f:
        f.write(str(model_def))
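
A hypothetical invocation of the generator above; the output path is illustrative, and the unshown make_init helper plus a TF1 session environment are assumed:

gen_spectrogram_onnx_test_model('audio_spectrogram_test.onnxtxt',
                                window_count=9, window_size=512, stride=256)
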
Example #27
def create_inference_graph(
        wanted_words, sample_rate, nchannels, clip_duration_ms, clip_stride_ms,
        representation, window_size_ms, window_stride_ms, nwindows,
        dct_coefficient_count, filterbank_channel_count, model_architecture,
        filter_counts, filter_sizes, final_filter_len, dropout_prob,
        batch_size, dilate_after_layer, stride_after_layer, connection_type,
        silence_percentage, unknown_percentage):
    """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: Length of each audio clip to analyze, in milliseconds.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of MFCC coefficients to compute per frame.
    model_architecture: Name of the kind of model to generate.
  """

    words_list = input_data.prepare_words_list(wanted_words.split(','),
                                               silence_percentage,
                                               unknown_percentage)
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, nchannels, clip_duration_ms,
        representation, window_size_ms, window_stride_ms, nwindows,
        dct_coefficient_count, filterbank_channel_count, filter_counts,
        filter_sizes, final_filter_len, dropout_prob, batch_size,
        dilate_after_layer, stride_after_layer, connection_type)

    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = audio_ops.decode_wav(
        wav_data_placeholder,
        desired_channels=nchannels,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrograms = []
    for ichannel in range(nchannels):
        spectrograms.append(
            audio_ops.audio_spectrogram(
                decoded_sample_data.audio,
                window_size=model_settings['window_size_samples'],
                stride=model_settings['window_stride_samples'],
                magnitude_squared=True))
    spectrogram = tf.stack(spectrograms, -1)
    mfccs = []
    for ichannel in range(nchannels):
        mfccs.append(
            audio_ops.mfcc(spectrograms[ichannel],
                           decoded_sample_data.sample_rate,
                           upper_frequency_limit=sample_rate // 2,
                           filterbank_channel_count=filterbank_channel_count,
                           dct_coefficient_count=dct_coefficient_count))
    mfcc = tf.stack(mfccs, -1)

    if representation == 'waveform':
        fingerprint_input = decoded_sample_data.audio
    elif representation == 'spectrogram':
        fingerprint_input = spectrogram
    elif representation == 'mel-cepstrum':
        fingerprint_input = mfcc
    else:
        raise ValueError(f'Unknown representation: {representation}')

    reshaped_input = tf.reshape(fingerprint_input,
                                [-1, model_settings['fingerprint_size']])

    hidden_layers, final = models.create_model(
        reshaped_input,
        model_settings,
        model_architecture,
        is_training=False,
        runtime_settings=runtime_settings)

    # Create an output to use for inference.
    for i, hidden_layer in enumerate(hidden_layers):
        tf.identity(hidden_layer, name='hidden_layer' + str(i))
    tf.nn.softmax(final, name='output_layer')
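
# Usage sketch (not from the original project; every value below is an
# illustrative assumption). The call builds the inference nodes in the default
# TF1 graph, which can then be dumped as a text GraphDef for later freezing.
if __name__ == '__main__':
    create_inference_graph(
        wanted_words='yes,no',
        sample_rate=16000,
        nchannels=1,
        clip_duration_ms=1000,
        clip_stride_ms=30,
        representation='mel-cepstrum',
        window_size_ms=30.0,
        window_stride_ms=10.0,
        nwindows=1,
        dct_coefficient_count=40,
        filterbank_channel_count=40,
        model_architecture='conv',
        filter_counts=[64, 64],
        filter_sizes=[3, 3],
        final_filter_len=3,
        dropout_prob=0.5,
        batch_size=1,
        dilate_after_layer=65535,
        stride_after_layer=65535,
        connection_type='plain',
        silence_percentage=10.0,
        unknown_percentage=10.0)
    tf.train.write_graph(tf.get_default_graph().as_graph_def(),
                         '/tmp', 'inference_graph.pbtxt', as_text=True)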
Example #28
def gen_mfcc_onnx_test_model(model_path, window_count, window_size, stride, sample_rate, lower_frequency_limit,
                             upper_frequency_limit, filterbank_channel_count, dct_coefficient_count):

    # Tensor sizes.
    input_length = window_size + (window_count - 1) * stride
    fft_length = int(2 ** np.ceil(np.log2(window_size)))
    input_shape = [1, input_length]
    spectrogram_length = int(fft_length / 2 + 1)
    spectrogram_shape = [window_count, spectrogram_length]
    coefficients_shape = [window_count, dct_coefficient_count]

    # Generate random input data.
    np.random.seed(1)
    input_data = np.random.randn(*input_shape)

    # ----------------------------------------- COMPUTE TensorFlow REFERENCE -------------------------------------------
    # Define TensorFlow model.
    tf_input = tf.constant(input_data.reshape(
        [input_length, 1]), name='input', dtype=tf.float32)
    tf_spectrogram = audio_ops.audio_spectrogram(tf_input,
                                                 window_size=window_size,
                                                 stride=stride,
                                                 magnitude_squared=True)
    tf_mfcc = audio_ops.mfcc(spectrogram=tf_spectrogram,
                             sample_rate=sample_rate,
                             upper_frequency_limit=upper_frequency_limit,
                             lower_frequency_limit=lower_frequency_limit,
                             filterbank_channel_count=filterbank_channel_count,
                             dct_coefficient_count=dct_coefficient_count)

    # Run the TensorFlow model once to get both the spectrogram (the MFCC
    # node's input) and the reference output coefficients.
    with tf.Session() as sess:
        spectrogram, coefficients_ref = sess.run([tf_spectrogram, tf_mfcc])
    spectrogram = np.reshape(spectrogram, spectrogram_shape)
    coefficients_ref = np.reshape(coefficients_ref, coefficients_shape)

    # ---------------------------------------------- NODE DEFINITION  --------------------------------------------------
    # MFCC node definition.
    mfcc_node_def = onnx.helper.make_node(
        'MFCC',
        name='mfcc',
        inputs=['spectrogram'],
        outputs=['coefficients'],
        sample_rate=float(sample_rate),
        lower_frequency_limit=float(lower_frequency_limit),
        upper_frequency_limit=float(upper_frequency_limit),
        filterbank_channel_count=int(filterbank_channel_count),
        dct_coefficient_count=int(dct_coefficient_count)
    )

    # Error node definition.
    err_node_def = onnx.helper.make_node(
        'Sub',
        name='error',
        inputs=['coefficients', 'coefficients_ref'],
        outputs=['coefficients_err']
    )

    # --------------------------------------------- GRAPH DEFINITION  --------------------------------------------------
    graph_input = list()
    graph_init = list()
    graph_output = list()

    # Graph inputs.
    graph_input.append(helper.make_tensor_value_info(
        'spectrogram', TensorProto.FLOAT, spectrogram_shape))
    graph_input.append(helper.make_tensor_value_info(
        'coefficients_ref', TensorProto.FLOAT, coefficients_shape))

    # Graph initializers.
    graph_init.append(make_init('spectrogram', TensorProto.FLOAT, spectrogram))
    graph_init.append(make_init('coefficients_ref',
                                TensorProto.FLOAT, coefficients_ref))

    # Graph outputs.
    graph_output.append(helper.make_tensor_value_info(
        'coefficients_err', TensorProto.FLOAT, coefficients_shape))

    # Graph name.
    graph_name = 'mfcc_test'

    # Define graph (GraphProto).
    graph_def = helper.make_graph(
        [mfcc_node_def, err_node_def], graph_name, inputs=graph_input, outputs=graph_output)

    # Set initializers.
    graph_def.initializer.extend(graph_init)

    # --------------------------------------------- MODEL DEFINITION  --------------------------------------------------
    # Define model (ModelProto).
    model_def = helper.make_model(graph_def, producer_name='onnx-mfcc')

    # Write the model out as a text-format protobuf.
    with open(model_path, 'w') as f:
        f.write(str(model_def))
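
# Usage sketch (not part of the original file): generate a small MFCC test
# model; the output path and all numeric values are illustrative assumptions.
if __name__ == '__main__':
    gen_mfcc_onnx_test_model(
        model_path='mfcc_test.onnxtxt',
        window_count=9,
        window_size=512,
        stride=320,
        sample_rate=16000,
        lower_frequency_limit=20.0,
        upper_frequency_limit=4000.0,
        filterbank_channel_count=40,
        dct_coefficient_count=10)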
Example #29
    def prepare_processing_graph(self, data_settings):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - foreground_resampling_placeholder_: Controls signal stretching/squeezing
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio or raw audio.

    Args:
      data_settings: data and model parameters, described at model_train.py

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
        with tf.get_default_graph().name_scope('data'):
            desired_samples = data_settings.desired_samples
            self.wav_filename_placeholder_ = tf.placeholder(
                tf.string, [], name='wav_filename')
            wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
            wav_decoder = tf.audio.decode_wav(wav_loader,
                                              desired_channels=1,
                                              desired_samples=desired_samples)
            # Allow the audio sample's volume to be adjusted.
            self.foreground_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='foreground_volume')
            # Signal resampling to generate more training data: the signal is
            # stretched or squeezed proportionally to the value fed below.
            self.foreground_resampling_placeholder_ = tf.placeholder(
                tf.float32, [])

            # Note: this check runs at graph-construction time against the
            # placeholder object itself, not against the value fed at run time.
            if self.foreground_resampling_placeholder_ != 1.0:
                image = tf.expand_dims(wav_decoder.audio, 0)
                image = tf.expand_dims(image, 2)
                shape = tf.shape(wav_decoder.audio)
                image_resized = tf.image.resize(
                    images=image,
                    size=(tf.cast((tf.cast(shape[0], tf.float32) *
                                   self.foreground_resampling_placeholder_),
                                  tf.int32), 1),
                    preserve_aspect_ratio=False)
                image_resized_cropped = tf.image.resize_with_crop_or_pad(
                    image_resized,
                    target_height=desired_samples,
                    target_width=1,
                )
                image_resized_cropped = tf.squeeze(image_resized_cropped,
                                                   axis=[0, 3])
                scaled_foreground = tf.multiply(
                    image_resized_cropped, self.foreground_volume_placeholder_)
            else:
                scaled_foreground = tf.multiply(
                    wav_decoder.audio, self.foreground_volume_placeholder_)
            # Shift the sample's start position, and pad any gaps with zeros.
            self.time_shift_padding_placeholder_ = tf.placeholder(
                tf.int32, [2, 2], name='time_shift_padding')
            self.time_shift_offset_placeholder_ = tf.placeholder(
                tf.int32, [2], name='time_shift_offset')
            padded_foreground = tf.pad(
                tensor=scaled_foreground,
                paddings=self.time_shift_padding_placeholder_,
                mode='CONSTANT')
            sliced_foreground = tf.slice(padded_foreground,
                                         self.time_shift_offset_placeholder_,
                                         [desired_samples, -1])
            # Mix in background noise.
            self.background_data_placeholder_ = tf.placeholder(
                tf.float32, [desired_samples, 1], name='background_data')
            self.background_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='background_volume')
            background_mul = tf.multiply(self.background_data_placeholder_,
                                         self.background_volume_placeholder_)
            background_add = tf.add(background_mul, sliced_foreground)
            background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

            if data_settings.preprocess == 'raw':
                # return raw audio
                self.output_ = background_clamp
                tf.summary.image('input_audio',
                                 tf.expand_dims(
                                     tf.expand_dims(background_clamp, -1), -1),
                                 max_outputs=1)
            else:
                # Run the spectrogram and MFCC ops to get a 2D audio 'fingerprint'
                spectrogram = audio_ops.audio_spectrogram(
                    background_clamp,
                    window_size=data_settings.window_size_samples,
                    stride=data_settings.window_stride_samples,
                    magnitude_squared=True)
                tf.summary.image('spectrogram',
                                 tf.expand_dims(spectrogram, -1),
                                 max_outputs=1)
                # The number of buckets in each FFT row in the spectrogram will depend
                # on how many input samples there are in each window. This can be quite
                # large, with a 160 sample window producing 127 buckets for example. We
                # don't need this level of detail for classification, so we often want
                # to shrink them down to produce a smaller result. That's what this
                # section implements. One method is to use average pooling to merge
                # adjacent buckets, but a more sophisticated approach is to apply the
                # MFCC algorithm to shrink the representation.
                if data_settings.preprocess == 'average':
                    self.output_ = tf.nn.pool(
                        input=tf.expand_dims(spectrogram, -1),
                        window_shape=[1, data_settings.average_window_width],
                        strides=[1, data_settings.average_window_width],
                        pooling_type='AVG',
                        padding='SAME')
                    tf.summary.image('shrunk_spectrogram',
                                     self.output_,
                                     max_outputs=1)
                elif data_settings.preprocess == 'mfcc':
                    self.output_ = audio_ops.mfcc(
                        spectrogram,
                        wav_decoder.sample_rate,
                        dct_coefficient_count=data_settings.fingerprint_width)
                    tf.summary.image('mfcc',
                                     tf.expand_dims(self.output_, -1),
                                     max_outputs=1)
                elif data_settings.preprocess == 'micro':
                    if not frontend_op:
                        raise Exception(
                            'Micro frontend op is currently not available when running'
                            ' TensorFlow directly from Python, you need to build and run'
                            ' through Bazel')
                    sample_rate = data_settings.sample_rate
                    window_size_ms = (data_settings.window_size_samples *
                                      1000) / sample_rate
                    window_step_ms = (data_settings.window_stride_samples *
                                      1000) / sample_rate
                    int16_input = tf.cast(tf.multiply(background_clamp, 32768),
                                          tf.int16)
                    micro_frontend = frontend_op.audio_microfrontend(
                        int16_input,
                        sample_rate=sample_rate,
                        window_size=window_size_ms,
                        window_step=window_step_ms,
                        num_channels=data_settings.fingerprint_width,
                        out_scale=1,
                        out_type=tf.float32)
                    self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
                    tf.summary.image('micro',
                                     tf.expand_dims(
                                         tf.expand_dims(self.output_, -1), 0),
                                     max_outputs=1)
                else:
                    raise ValueError(
                        'Unknown preprocess mode "%s" (should be "raw", "mfcc",'
                        ' "average", or "micro")' % (data_settings.preprocess))

            # Merge all the summaries and write them out to /tmp/retrain_logs (by
            # default)
            self.merged_summaries_ = tf.summary.merge_all(scope='data')
            if data_settings.summaries_dir:
                self.summary_writer_ = tf.summary.FileWriter(
                    data_settings.summaries_dir + '/data',
                    tf.get_default_graph())
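
    # Feed-dict sketch (hypothetical, not part of the original class): given an
    # instance `processor` of the enclosing class, an active session `sess`,
    # and the graph above already built, one fingerprint could be computed by
    # feeding neutral values into the placeholders defined above, e.g.:
    #
    #     desired_samples = 16000  # assumed 1 s of 16 kHz audio
    #     feed = {
    #         processor.wav_filename_placeholder_: 'sample.wav',  # assumed path
    #         processor.foreground_volume_placeholder_: 1.0,
    #         processor.foreground_resampling_placeholder_: 1.0,
    #         processor.time_shift_padding_placeholder_: [[0, 0], [0, 0]],
    #         processor.time_shift_offset_placeholder_: [0, 0],
    #         processor.background_data_placeholder_: np.zeros((desired_samples, 1)),
    #         processor.background_volume_placeholder_: 0.0,
    #     }
    #     fingerprint = sess.run(processor.output_, feed_dict=feed)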
Example #30
import numpy as np
import tensorflow.compat.v1 as tf
from tensorflow.python.ops import gen_audio_ops as contrib_audio

tf.disable_eager_execution()  # required for tf.placeholder / tf.Session under compat.v1

signal = tf.placeholder(tf.float32, [None], name='signal')
spectrogram = contrib_audio.audio_spectrogram(tf.expand_dims(signal, 1),
                                              window_size=512,
                                              stride=320,
                                              magnitude_squared=True)
mfccs = contrib_audio.mfcc(spectrogram=spectrogram,
                           sample_rate=16000,
                           dct_coefficient_count=26,
                           upper_frequency_limit=16000 / 2)
mfccs = tf.reshape(mfccs, [-1, 26])

sess = tf.Session()


def audio2mfcc(samples):
    ret = sess.run(mfccs, feed_dict={signal: samples})
    return ret


if __name__ == '__main__':
    # `Audio` is an external helper not defined in this snippet; it is assumed
    # to return the raw samples of test.wav resampled to 16 kHz.
    audio = Audio.read('test.wav', 16000)
    energy = np.abs(audio)
    silence_threshold = np.percentile(energy, 95)
    offsets = np.where(energy > silence_threshold)[0]
    # left_blank_duration_ms = (1000.0 * offsets[0]) // self.sample_rate  # frame_id to duration (ms)