Example #1
 def get_unprocessed_data(self, how_many, model_settings, mode):
   """Gets sample data without transformations."""
   candidates = self.data_index[mode]
   if how_many == -1:
     sample_count = len(candidates)
   else:
     sample_count = how_many
   desired_samples = model_settings['desired_samples']
   words_list = self.words_list
   data = np.zeros((sample_count, desired_samples))
   labels = []
   with tf.Session(graph=tf.Graph()) as sess:
     wav_filename_placeholder = tf.placeholder(tf.string, [], name='filename')
     wav_loader = io_ops.read_file(wav_filename_placeholder)
     wav_decoder = contrib_audio.decode_wav(
         wav_loader, desired_channels=1, desired_samples=desired_samples)
     foreground_volume_placeholder = tf.placeholder(
         tf.float32, [], name='foreground_volume')
     scaled_foreground = tf.multiply(wav_decoder.audio,
                                     foreground_volume_placeholder)
     for i in range(sample_count):
       if how_many == -1:
         sample_index = i
       else:
         sample_index = np.random.randint(len(candidates))
       sample = candidates[sample_index]
       input_dict = {wav_filename_placeholder: sample['file']}
       if sample['label'] == SILENCE_LABEL:
         input_dict[foreground_volume_placeholder] = 0
       else:
         input_dict[foreground_volume_placeholder] = 1
       data[i, :] = sess.run(scaled_foreground, feed_dict=input_dict).flatten()
       label_index = self.word_to_index[sample['label']]
       labels.append(words_list[label_index])
   return data, labels
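
A minimal usage sketch for Example #1, assuming an AudioProcessor-style instance (here called audio_processor) and a model_settings dict from the surrounding speech_commands-style script; the partition name and sample count are illustrative:

# Hypothetical call; 'validation' selects one of the partitions in data_index.
data, labels = audio_processor.get_unprocessed_data(100, model_settings,
                                                    'validation')
print(data.shape)   # (100, model_settings['desired_samples'])
print(labels[:3])   # label strings taken from words_list
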
Example #2
  def prepare_background_data(self):
    """Searches a folder for background noise audio, and loads it into memory.

    It's expected that the background audio samples will be in a subdirectory
    named '_background_noise_' inside the 'data_dir' folder, as .wavs that match
    the sample rate of the training data, but can be much longer in duration.

    If the '_background_noise_' folder doesn't exist at all, this isn't an
    error, it's just taken to mean that no background noise augmentation should
    be used. If the folder does exist, but it's empty, that's treated as an
    error.

    Returns:
      List of raw PCM-encoded audio samples of background noise.

    Raises:
      Exception: If files aren't found in the folder.
    """
    self.background_data = []
    background_dir = os.path.join(self.data_dir, BACKGROUND_NOISE_DIR_NAME)
    if not os.path.exists(background_dir):
      return self.background_data
    with tf.Session(graph=tf.Graph()) as sess:
      wav_filename_placeholder = tf.placeholder(tf.string, [])
      wav_loader = io_ops.read_file(wav_filename_placeholder)
      wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
      search_path = os.path.join(self.data_dir, BACKGROUND_NOISE_DIR_NAME,
                                 '*.wav')
      for wav_path in gfile.Glob(search_path):
        wav_data = sess.run(
            wav_decoder,
            feed_dict={wav_filename_placeholder: wav_path}).audio.flatten()
        self.background_data.append(wav_data)
      if not self.background_data:
        raise Exception('No background wav files were found in ' + search_path)
Example #3
  def prepare_processing_graph(self, model_settings):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=desired_samples)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(
        scaled_foreground,
        self.time_shift_padding_placeholder_,
        mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                       [desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
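
prepare_processing_graph only defines placeholders and the mfcc_ output, so evaluating a fingerprint means filling every placeholder in a feed_dict. A rough sketch, with the file path, shift values and background buffer invented for illustration (ap is the AudioProcessor-like instance, sess an active session):

desired_samples = model_settings['desired_samples']
fingerprint = sess.run(ap.mfcc_, feed_dict={
    ap.wav_filename_placeholder_: '/tmp/example.wav',        # hypothetical path
    ap.foreground_volume_placeholder_: 1.0,                  # full volume
    ap.time_shift_padding_placeholder_: [[100, 0], [0, 0]],  # pad 100 samples in front
    ap.time_shift_offset_placeholder_: [0, 0],
    ap.background_data_placeholder_: np.zeros((desired_samples, 1)),  # silence
    ap.background_volume_placeholder_: 0.0,
})
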
Example #4
def load_wav_file(filename):
  """Loads an audio file and returns a float PCM-encoded array of samples."""
  with tf.Session(graph=tf.Graph()) as sess:
    wav_filename_placeholder = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(wav_filename_placeholder)
    wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
    return sess.run(
        wav_decoder, feed_dict={
            wav_filename_placeholder: filename
        }).audio.flatten()
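
Calling it is a one-liner; the path below is purely illustrative:

samples = load_wav_file('/tmp/speech_dataset/yes/example.wav')  # hypothetical path
print(samples.shape, samples.min(), samples.max())  # float samples in [-1.0, 1.0]
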
Example #5
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
  """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
  """

  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, dct_coefficient_count)
  runtime_settings = {'clip_stride_ms': clip_stride_ms}

  wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
  decoded_sample_data = contrib_audio.decode_wav(
      wav_data_placeholder,
      desired_channels=1,
      desired_samples=model_settings['desired_samples'],
      name='decoded_sample_data')
  spectrogram = contrib_audio.audio_spectrogram(
      decoded_sample_data.audio,
      window_size=model_settings['window_size_samples'],
      stride=model_settings['window_stride_samples'],
      magnitude_squared=True)
  fingerprint_input = contrib_audio.mfcc(
      spectrogram,
      decoded_sample_data.sample_rate,
      dct_coefficient_count=dct_coefficient_count)
  fingerprint_frequency_size = model_settings['dct_coefficient_count']
  fingerprint_time_size = model_settings['spectrogram_length']
  reshaped_input = tf.reshape(fingerprint_input, [
      -1, fingerprint_time_size * fingerprint_frequency_size
  ])

  logits = models.create_model(
      reshaped_input, model_settings, model_architecture, is_training=False,
      runtime_settings=runtime_settings)

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
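
The function only adds nodes to the default graph; to get a deployable model you still need to restore trained weights and freeze the result. One possible follow-up in TF 1.x, with the checkpoint and output paths as placeholders and the argument values purely illustrative:

from tensorflow.python.framework import graph_util

with tf.Session() as sess:
  create_inference_graph('yes,no', 16000, 1000, 30, 30.0, 10.0, 40, 'conv')
  models.load_variables_from_checkpoint(sess, '/tmp/speech_commands.ckpt')
  frozen_graph_def = graph_util.convert_variables_to_constants(
      sess, sess.graph_def, ['labels_softmax'])
  tf.train.write_graph(frozen_graph_def, '/tmp', 'frozen_graph.pb',
                       as_text=False)
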
Example #6
 def prepare_processing_graph(self, model_settings):
   """Builds a TensorFlow graph to apply the input distortions"""
   desired_samples = model_settings['desired_samples']
   self.wav_filename_placeholder_ = tf.placeholder(
       tf.string, [], name='filename')
   wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
   wav_decoder = contrib_audio.decode_wav(
       wav_loader, desired_channels=1, desired_samples=desired_samples)
   # Allow the audio sample's volume to be adjusted.
   self.foreground_volume_placeholder_ = tf.placeholder(
       tf.float32, [], name='foreground_volume')
   scaled_foreground = tf.multiply(wav_decoder.audio,
                                   self.foreground_volume_placeholder_)
   # Shift the sample's start position, and pad any gaps with zeros.
   self.time_shift_placeholder_ = tf.placeholder(tf.int32, name='timeshift')
   shifted_foreground = tf_roll(scaled_foreground,
                                self.time_shift_placeholder_)
   # Mix in background noise.
   self.background_data_placeholder_ = tf.placeholder(
       tf.float32, [desired_samples, 1], name='background_data')
   self.background_volume_placeholder_ = tf.placeholder(
       tf.float32, [], name='background_volume')
   background_mul = tf.multiply(self.background_data_placeholder_,
                                self.background_volume_placeholder_)
   background_add = tf.add(background_mul, shifted_foreground)
   # removed clipping: tf.clip_by_value(background_add, -1.0, 1.0)
   self.background_clamp_ = background_add
   self.background_clamp_ = tf.reshape(self.background_clamp_,
                                       (1, model_settings['desired_samples']))
   # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
   stfts = tf.contrib.signal.stft(
       self.background_clamp_,
       frame_length=model_settings['window_size_samples'],
       frame_step=model_settings['window_stride_samples'],
       fft_length=None)
   self.spectrogram_ = tf.abs(stfts)
   num_spectrogram_bins = self.spectrogram_.shape[-1].value
   lower_edge_hertz, upper_edge_hertz = 80.0, 7600.0
   linear_to_mel_weight_matrix = \
       tf.contrib.signal.linear_to_mel_weight_matrix(
           model_settings['dct_coefficient_count'],
           num_spectrogram_bins, model_settings['sample_rate'],
           lower_edge_hertz, upper_edge_hertz)
   mel_spectrograms = tf.tensordot(self.spectrogram_,
                                   linear_to_mel_weight_matrix, 1)
   mel_spectrograms.set_shape(self.spectrogram_.shape[:-1].concatenate(
       linear_to_mel_weight_matrix.shape[-1:]))
   log_mel_spectrograms = tf.log(mel_spectrograms + 1e-6)
   self.mfcc_ = tf.contrib.signal.mfccs_from_log_mel_spectrograms(
       log_mel_spectrograms)[:, :, :
                             model_settings['num_log_mel_features']]  # :13
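
tf_roll is not defined in this snippet: instead of the pad-and-slice shift used in Example #3, the clip is shifted with a circular roll. A minimal sketch of what such a helper might look like (an assumed stand-in using tf.roll, called tf.manip.roll in older 1.x releases, not the original implementation):

import tensorflow as tf

def tf_roll(samples, shift):
  # Assumed stand-in: circularly shift the [desired_samples, 1] audio tensor
  # along the time axis; a positive shift moves the clip later in time.
  return tf.roll(samples, shift=shift, axis=0)
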
Example #7
def load_wav_file(filename):
  """Loads an audio file and returns a float PCM-encoded array of samples.

  Args:
    filename: Path to the .wav file to load.

  Returns:
    Numpy array holding the sample data as floats between -1.0 and 1.0.
  """
  with tf.Session(graph=tf.Graph()) as sess:
    wav_filename_placeholder = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(wav_filename_placeholder)
    wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
    return sess.run(
        wav_decoder,
        feed_dict={wav_filename_placeholder: filename}).audio.flatten()
Example #8
  def get_unprocessed_data(self, how_many, model_settings, mode):
    """Retrieve sample data for the given partition, with no transformations.

    Args:
      how_many: Desired number of samples to return. -1 means the entire
        contents of this partition.
      model_settings: Information about the current model being trained.
      mode: Which partition to use, must be 'training', 'validation', or
        'testing'.

    Returns:
      List of sample data for the samples, and list of labels in one-hot form.
    """
    candidates = self.data_index[mode]
    if how_many == -1:
      sample_count = len(candidates)
    else:
      sample_count = how_many
    desired_samples = model_settings['desired_samples']
    words_list = self.words_list
    data = np.zeros((sample_count, desired_samples))
    labels = []
    with tf.Session(graph=tf.Graph()) as sess:
      wav_filename_placeholder = tf.placeholder(tf.string, [])
      wav_loader = io_ops.read_file(wav_filename_placeholder)
      wav_decoder = contrib_audio.decode_wav(
          wav_loader, desired_channels=1, desired_samples=desired_samples)
      foreground_volume_placeholder = tf.placeholder(tf.float32, [])
      scaled_foreground = tf.multiply(wav_decoder.audio,
                                      foreground_volume_placeholder)
      for i in range(sample_count):
        if how_many == -1:
          sample_index = i
        else:
          sample_index = np.random.randint(len(candidates))
        sample = candidates[sample_index]
        input_dict = {wav_filename_placeholder: sample['file']}
        if sample['label'] == SILENCE_LABEL:
          input_dict[foreground_volume_placeholder] = 0
        else:
          input_dict[foreground_volume_placeholder] = 1
        data[i, :] = sess.run(scaled_foreground, feed_dict=input_dict).flatten()
        label_index = self.word_to_index[sample['label']]
        labels.append(words_list[label_index])
    return data, labels
Example #9
def clip_to_waveform(clip, clip_dir=None):
  """Decodes a WAV clip into a waveform tensor."""
  # Equivalent librosa-based loading, for reference:
  #   data, sampling_rate = librosa.load('data/sound.wav', sr=SAMPLE_RATE)
  #   # for use in tensorflow
  #   data_tensor = tf.convert_to_tensor(data)

  # Decode the WAV-format clip into a waveform tensor where
  # the values lie in [-1, +1].
  clip_path = tf.string_join([clip_dir, clip], separator=os.sep)
  clip_data = tf.read_file(clip_path)
  waveform, sr = tf_audio.decode_wav(clip_data)
  # Assert that the clip has the expected sample rate; SAMPLE_RATE is assumed
  # to be a module-level constant, as in the librosa snippet above.
  check_sr = tf.assert_equal(sr, SAMPLE_RATE)
  # and check that it is mono.
  check_channels = tf.assert_equal(tf.shape(waveform)[1], 1)
  with tf.control_dependencies([tf.group(check_sr, check_channels)]):
    return tf.squeeze(waveform)
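
A usage sketch; the directory and filename are illustrative, and SAMPLE_RATE is assumed to be defined alongside this function:

waveform_op = clip_to_waveform(tf.constant('example.wav'),
                               clip_dir=tf.constant('/tmp/clips'))
with tf.Session() as sess:
  waveform = sess.run(waveform_op)  # 1-D float waveform in [-1, +1]
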
Example #10
 def prepare_background_data(self):
   """Searches a folder for background noise audio, and loads it into memory"""
   self.background_data = []
   background_dir = os.path.join(self.data_dirs[0], BACKGROUND_NOISE_DIR_NAME)
   if not os.path.exists(background_dir):
     return self.background_data
   with tf.Session(graph=tf.Graph()) as sess:
     wav_filename_placeholder = tf.placeholder(tf.string, [])
     wav_loader = io_ops.read_file(wav_filename_placeholder)
     wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
     search_path = os.path.join(self.data_dirs[0], BACKGROUND_NOISE_DIR_NAME,
                                '*.wav')
     for wav_path in gfile.Glob(search_path):
       wav_data = sess.run(
           wav_decoder, feed_dict={
               wav_filename_placeholder: wav_path
           }).audio.flatten()
       self.background_data.append(wav_data)
     if not self.background_data:
       raise Exception('No background wav files were found in ' + search_path)
Example #11
 def process_wav():
     #Assumes module-level globals from the original script: sess (an active
     #tf.Session), numbertotrain, inputSize, the output list x, and an rfft
     #import (e.g. from scipy.fftpack).
     #Each file gets its own two-dimensional array.
     fileIdx = 0
     for file in iglob('audio_wav' + '/*.wav'):
         #So this doesn't take too long.
         if fileIdx < numbertotrain:
             audio_binary = tf.read_file(file)
             wav_decoder = audio_ops.decode_wav(audio_binary,
                                                desired_channels=2)
             sample_rate, audio = sess.run(
                 [wav_decoder.sample_rate, wav_decoder.audio])
             fileAudio = np.array(audio)
             #Only use sounds of the same length; this length seems to match most.
             if len(fileAudio) == 5294592:
                 #Audio is split into two channels; cut each one into inputSize-sized
                 #chunks and store them sequentially.
                 #Use discrete Fourier transforms to map from the time domain into
                 #the frequency domain.
                 leftAudio = rfft(audio[:, 0])
                 rightAudio = rfft(audio[:, 1])
                 #Split both arrays into subarrays of length inputSize.
                 lower = 0
                 upper = inputSize
                 #Sliding window.
                 while upper < len(leftAudio):
                     leftAudioSection = leftAudio[lower:upper]
                     rightAudioSection = rightAudio[lower:upper]
                     #Add them in sequential order.
                     x.append(leftAudioSection)
                     x.append(rightAudioSection)
                     lower += inputSize
                     upper += inputSize
                 #Now x contains the subarrays in the order
                 #[[left1], [right1], [left2], [right2], ..., [leftn], [rightn]].
             print("preprocessed file: " + str(file) + ", Number: " +
                   str(fileIdx))
         fileIdx += 1
Example #12
    def get_unprocessed_data(self, how_many, model_settings, mode):
        """Retrieves sample data for the given partition, with no transformations."""
        candidates = self.data_index[mode]
        if how_many == -1:
            sample_count = len(candidates)
        else:
            sample_count = how_many
        desired_samples = model_settings['desired_samples']
        words_list = self.words_list
        data = np.zeros((sample_count, desired_samples))
        labels = []
        with tf.Session(graph=tf.Graph()) as sess:
            wav_filename_placeholder = tf.placeholder(tf.string, [])
            wav_loader = io_ops.read_file(wav_filename_placeholder)
            wav_decoder = contrib_audio.decode_wav(
                wav_loader,
                desired_channels=1,
                desired_samples=desired_samples)
            foreground_volume_placeholder = tf.placeholder(tf.float32, [])
            scaled_foreground = tf.multiply(wav_decoder.audio,
                                            foreground_volume_placeholder)
            for i in range(sample_count):
                if how_many == -1:
                    sample_index = i
                else:
                    sample_index = np.random.randint(len(candidates))
                sample = candidates[sample_index]
                input_dict = {wav_filename_placeholder: sample['file']}
                if sample['label'] == SILENCE_LABEL:
                    input_dict[foreground_volume_placeholder] = 0
                else:
                    input_dict[foreground_volume_placeholder] = 1
                data[i, :] = sess.run(scaled_foreground,
                                      feed_dict=input_dict).flatten()
                label_index = self.word_to_index[sample['label']]
                labels.append(words_list[label_index])
        return data, labels
Example #13
  def prepare_background_data(self):
    """Searches a folder for background noise audio, and loads it into memory.

    It's expected that the background audio samples will be in a subdirectory
    named '_background_noise_' inside the 'data_dir' folder, as .wavs that match
    the sample rate of the training data, but can be much longer in duration.

    If the '_background_noise_' folder doesn't exist at all, this isn't an
    error, it's just taken to mean that no background noise augmentation should
    be used. If the folder does exist, but it's empty, that's treated as an
    error.

    Returns:
      List of raw PCM-encoded audio samples of background noise.

    Raises:
      Exception: If files aren't found in the folder.
    """
    self.background_data = []
    background_dir = os.path.join(self.data_dir, BACKGROUND_NOISE_DIR_NAME)
    if not os.path.exists(background_dir):
      return self.background_data
    with tf.Session(graph=tf.Graph()) as sess:
      wav_filename_placeholder = tf.placeholder(tf.string, [])

      wav_loader = io_ops.read_file(wav_filename_placeholder)
      wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
      search_path = os.path.join(self.data_dir, BACKGROUND_NOISE_DIR_NAME,
                                 '*.wav')
      for wav_path in gfile.Glob(search_path):
        print(wav_path)
        wav_data = sess.run(
            wav_decoder,
            feed_dict={wav_filename_placeholder: wav_path}).audio.flatten()
        self.background_data.append(wav_data)
      if not self.background_data:
        raise Exception('No background wav files were found in ' + search_path)
Example #14
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        decoded_sample_data.sample_rate,
        dct_coefficient_count=dct_coefficient_count)
    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size * fingerprint_frequency_size])

    logits = models.create_model(reshaped_input,
                                 model_settings,
                                 model_architecture,
                                 is_training=False,
                                 runtime_settings=runtime_settings)
    prediction_node = tf.argmax(logits, axis=-1)
    return wav_data_placeholder, prediction_node
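
Because this variant returns the wav_data placeholder and an argmax node instead of naming a softmax output, predictions can be run directly; the argument values, checkpoint and file path below are illustrative:

wav_placeholder, prediction = create_inference_graph(
    'yes,no', 16000, 1000, 30, 30.0, 10.0, 40, 'conv')
with tf.Session() as sess:
    models.load_variables_from_checkpoint(sess, '/tmp/model.ckpt')  # assumed checkpoint
    with open('/tmp/example.wav', 'rb') as f:
        wav_data = f.read()
    predicted_index = sess.run(prediction, feed_dict={wav_placeholder: wav_data})
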
Example #15
    def __init__(self, FLAGS):
        model_settings = prepare_settings(
            FLAGS.num_classes, FLAGS.sample_rate,
            FLAGS.clip_duration_ms, FLAGS.window_size_ms,
            FLAGS.window_stride_ms, FLAGS.dct_coefficient_count
        )
        runtime_settings = {'clip_stride_ms': FLAGS.clip_stride_ms}

        # Perform preprocessing
        self.wav_data_placeholder = tf.placeholder(tf.string, [], name='wav')
        wav_data = io_ops.read_file(self.wav_data_placeholder)
        decoded_sample_data = contrib_audio.decode_wav(
            wav_data,
            desired_channels=1,
            desired_samples=model_settings['desired_samples'],
            name='decoded_sample_data'
        )
        spectrogram = contrib_audio.audio_spectrogram(
            decoded_sample_data.audio,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True
        )
        fingerprint_input = contrib_audio.mfcc(
            spectrogram,
            decoded_sample_data.sample_rate,
            dct_coefficient_count=FLAGS.dct_coefficient_count
        )
        fingerprint_frequency_size = model_settings['dct_coefficient_count']
        fingerprint_time_size = model_settings['spectrogram_length']

        # Add channel dimension
        self.reshaped_input = tf.reshape(
            fingerprint_input,
            [fingerprint_time_size, fingerprint_frequency_size, 1]
        )
Example #16
def build_preproc_graph_for_cnn():
    model_settings = init_cnn_model_settings()
    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    audio_binary = tf.read_file(wav_data_placeholder)
    decoded_sample_data = contrib_audio.decode_wav(
        audio_binary,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    mfcc_tensor = contrib_audio.mfcc(
        spectrogram,
        decoded_sample_data.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    mfcc_tensor_flatten = tf.reshape(mfcc_tensor, [
        -1, fingerprint_time_size * fingerprint_frequency_size
    ])
    return mfcc_tensor_flatten, mfcc_tensor
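
A sketch of evaluating the two returned tensors for one file; init_cnn_model_settings comes from outside this snippet, the WAV path is illustrative, and the placeholder can be fed by its 'wav_data:0' name:

mfcc_flat, mfcc_2d = build_preproc_graph_for_cnn()
with tf.Session() as sess:
    with open('/tmp/example.wav', 'rb') as f:
        wav_bytes = f.read()
    flat, full = sess.run([mfcc_flat, mfcc_2d],
                          feed_dict={'wav_data:0': wav_bytes})
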
Example #17
def prepare_processing_graph(file, window_size_samples, window_stride_samples,
                             dct_coefficient_count):
    desired_samples = 16000
    wav_filename_placeholder_ = file
    wav_loader = io_ops.read_file(wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=desired_samples)
    # Clamp the decoded audio to the valid [-1.0, 1.0] range.
    wave_input = tf.clip_by_value(wav_decoder.audio, -1.0, 1.0)
    ######################  M F C C #################################
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrogram = contrib_audio.audio_spectrogram(
        wave_input,
        window_size=window_size_samples,
        stride=window_stride_samples,
        magnitude_squared=True)
    mfcc_ = contrib_audio.mfcc(spectrogram,
                               wav_decoder.sample_rate,
                               dct_coefficient_count=dct_coefficient_count)
    return mfcc_
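
Here the filename is baked into the graph rather than fed through a placeholder, so one graph is built per file. A small usage sketch with illustrative parameters (30 ms windows, 10 ms stride at 16 kHz):

mfcc_node = prepare_processing_graph('/tmp/example.wav',  # hypothetical path
                                     window_size_samples=480,
                                     window_stride_samples=160,
                                     dct_coefficient_count=40)
with tf.Session() as sess:
    mfcc = sess.run(mfcc_node)  # shape [1, num_frames, 40]
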
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture,
                           input_type, model_size_info):
    """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    input_type: Feature frontend to use, either 'log-mel' or 'MFCC'.
    model_size_info: Architecture-specific size parameters passed on to
      models.create_model.
  """

    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count, 100)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    #input_spectrogram = tf.placeholder(tf.float32, shape=[49,513], name='speech_signal')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    #spectrogram = input_spectrogram
    if (input_type == 'log-mel'):
        print("log-mel energies")
        # Warp the linear-scale, magnitude spectrograms into the mel-scale.
        num_spectrogram_bins = spectrogram.shape[
            -1].value  # magnitude_spectrograms.shape[-1].value
        lower_edge_hertz, upper_edge_hertz, num_mel_bins = 20.0, 4000.0, model_settings[
            'dct_coefficient_count']
        linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
            num_mel_bins, num_spectrogram_bins, model_settings['sample_rate'],
            lower_edge_hertz, upper_edge_hertz)
        mel_spectrograms = tf.tensordot(spectrogram,
                                        linear_to_mel_weight_matrix, 1)
        # Note: Shape inference for `tf.tensordot` does not currently handle this case.
        mel_spectrograms.set_shape(spectrogram.shape[:-1].concatenate(
            linear_to_mel_weight_matrix.shape[-1:]))
        log_offset = 1e-6
        log_mel_spectrograms = tf.log(mel_spectrograms + log_offset)
        fingerprint_input = log_mel_spectrograms
    elif (input_type == 'MFCC'):
        print('MFCC-features')
        fingerprint_input = contrib_audio.mfcc(
            spectrogram,
            decoded_sample_data.sample_rate,
            dct_coefficient_count=model_settings['dct_coefficient_count'])
    #fingerprint_input = tf.placeholder(tf.float32,shape=[49,20],name='fingerprint')
    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size * fingerprint_frequency_size])

    logits, dropout_prob = models.create_model(
        reshaped_input,
        model_settings,
        model_architecture,
        model_size_info,
        is_training=True,
        runtime_settings=runtime_settings)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
def create_inference_graph_batched(wanted_words,
                                   sample_rate,
                                   clip_duration_ms,
                                   window_size_ms,
                                   window_stride_ms,
                                   dct_coefficient_count,
                                   model_architecture,
                                   model_size_info=None):
    """Creates an audio model with the nodes needed for inference.
    Uses the supplied arguments to create a model, and inserts the input and
    output nodes that are needed to use the graph for inference.
    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
      model_size_info: Architecture-specific size parameters passed on to
        models.create_model.
    """

    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)
    if (model_architecture == 'dnc'):
        model_settings['batch_size'] = 1000
    fingerprint_size = model_settings['fingerprint_size']
    #Wav Data Placeholder
    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    mfcc_output = contrib_audio.mfcc(
        spectrogram,
        decoded_sample_data.sample_rate,
        dct_coefficient_count=dct_coefficient_count,
        name='mfcc')
    #Batched Input Placeholder
    fingerprint_input = tf.placeholder(tf.float32, [None, fingerprint_size],
                                       name='fingerprint_input')

    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']

    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size * fingerprint_frequency_size])

    logits = models.create_model(reshaped_input,
                                 model_settings,
                                 model_architecture,
                                 model_size_info=model_size_info,
                                 is_training=False)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
def main(_):

    print(FLAGS.model_size_info)
    reg_conv_bits = FLAGS.bit_widths[0]
    dw_conv_bits = FLAGS.bit_widths[1]
    pw_conv_bits = FLAGS.bit_widths[2]
    fc_bits = FLAGS.bit_widths[3]
    activations_bits = FLAGS.bit_widths[4]

    print("Regular Conv-weights bit width: " + str(reg_conv_bits))
    print("Depthwise Conv-weights bit width: " + str(dw_conv_bits))
    print("Pointwise Conv-weights bit width: " + str(pw_conv_bits))
    print("FC-weights bit width: " + str(fc_bits))
    print("Activations bit width: " + str(activations_bits))
    # We want to see all the logging messages for this tutorial.
    tf.logging.set_verbosity(tf.logging.INFO)

    # Start a new TensorFlow session.
    sess = tf.InteractiveSession()
    words_list = input_data.prepare_words_list(FLAGS.wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), FLAGS.sample_rate, FLAGS.clip_duration_ms,
        FLAGS.window_size_ms, FLAGS.window_stride_ms,
        FLAGS.dct_coefficient_count, 100)
    clip_stride_ms = 260
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    # input_spectrogram = tf.placeholder(tf.float32, shape=[49,513], name='speech_signal')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    # spectrogram = input_spectrogram
    if (FLAGS.input_type == 'log-mel'):
        print("log-mel energies")
        # Warp the linear-scale, magnitude spectrograms into the mel-scale.
        num_spectrogram_bins = spectrogram.shape[
            -1].value  # magnitude_spectrograms.shape[-1].value
        lower_edge_hertz, upper_edge_hertz, num_mel_bins = 20.0, 4000.0, model_settings[
            'dct_coefficient_count']
        linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
            num_mel_bins, num_spectrogram_bins, model_settings['sample_rate'],
            lower_edge_hertz, upper_edge_hertz)
        mel_spectrograms = tf.tensordot(spectrogram,
                                        linear_to_mel_weight_matrix, 1)
        # Note: Shape inference for `tf.tensordot` does not currently handle this case.
        mel_spectrograms.set_shape(spectrogram.shape[:-1].concatenate(
            linear_to_mel_weight_matrix.shape[-1:]))
        log_offset = 1e-6
        log_mel_spectrograms = tf.log(mel_spectrograms + log_offset)
        fingerprint_input = log_mel_spectrograms
    elif (FLAGS.input_type == 'MFCC'):
        print('MFCC-features')
        fingerprint_input = contrib_audio.mfcc(
            spectrogram,
            decoded_sample_data.sample_rate,
            dct_coefficient_count=model_settings['dct_coefficient_count'])
    # fingerprint_input = tf.placeholder(tf.float32,shape=[49,20],name='fingerprint')
    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size * fingerprint_frequency_size])

    training = tf.placeholder(tf.bool, name='training')

    logits, net_c1 = models.create_model(reshaped_input,
                                         model_settings,
                                         FLAGS.model_architecture,
                                         FLAGS.model_size_info,
                                         is_training=True,
                                         runtime_settings=runtime_settings)
    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')

    saver = tf.train.Saver(tf.global_variables())

    tf.global_variables_initializer().run()

    start_step = 1

    if FLAGS.start_checkpoint:
        models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
    for v in tf.trainable_variables():
        print(v.name)
    v_backup = tf.trainable_variables()
    eps = 0.001
    # Layer information [weights, biases, channel means, channel variances, input fractional bits, output fractional bits, name for .h file]
    conv_1 = [
        'DS-CNN/conv_1/weights', 'DS-CNN/conv_1/biases',
        'DS-CNN/conv_1/batch_norm/moving_mean',
        'DS-CNN/conv_1/batch_norm/moving_variance', 2, 5, 'CONV1',
        'DS-CNN/conv_1/batch_norm/beta'
    ]
    dw_conv_1 = [
        'DS-CNN/conv_ds_1/depthwise_conv/depthwise_weights',
        'DS-CNN/conv_ds_1/depthwise_conv/biases',
        'DS-CNN/conv_ds_1/dw_batch_norm/moving_mean',
        'DS-CNN/conv_ds_1/dw_batch_norm/moving_variance', 5, 5, 'DW_CONV1',
        'DS-CNN/conv_ds_1/dw_batch_norm/beta'
    ]
    pw_conv_1 = [
        'DS-CNN/conv_ds_1/pointwise_conv/weights',
        'DS-CNN/conv_ds_1/pointwise_conv/biases',
        'DS-CNN/conv_ds_1/pw_batch_norm/moving_mean',
        'DS-CNN/conv_ds_1/pw_batch_norm/moving_variance', 5, 5, 'PW_CONV1',
        'DS-CNN/conv_ds_1/pw_batch_norm/beta'
    ]
    dw_conv_2 = [
        'DS-CNN/conv_ds_2/depthwise_conv/depthwise_weights',
        'DS-CNN/conv_ds_2/depthwise_conv/biases',
        'DS-CNN/conv_ds_2/dw_batch_norm/moving_mean',
        'DS-CNN/conv_ds_2/dw_batch_norm/moving_variance', 5, 5, 'DW_CONV2',
        'DS-CNN/conv_ds_2/dw_batch_norm/beta'
    ]
    pw_conv_2 = [
        'DS-CNN/conv_ds_2/pointwise_conv/weights',
        'DS-CNN/conv_ds_2/pointwise_conv/biases',
        'DS-CNN/conv_ds_2/pw_batch_norm/moving_mean',
        'DS-CNN/conv_ds_2/pw_batch_norm/moving_variance', 5, 5, 'PW_CONV2',
        'DS-CNN/conv_ds_2/pw_batch_norm/beta'
    ]
    dw_conv_3 = [
        'DS-CNN/conv_ds_3/depthwise_conv/depthwise_weights',
        'DS-CNN/conv_ds_3/depthwise_conv/biases',
        'DS-CNN/conv_ds_3/dw_batch_norm/moving_mean',
        'DS-CNN/conv_ds_3/dw_batch_norm/moving_variance', 5, 5, 'DW_CONV3',
        'DS-CNN/conv_ds_3/dw_batch_norm/beta'
    ]
    pw_conv_3 = [
        'DS-CNN/conv_ds_3/pointwise_conv/weights',
        'DS-CNN/conv_ds_3/pointwise_conv/biases',
        'DS-CNN/conv_ds_3/pw_batch_norm/moving_mean',
        'DS-CNN/conv_ds_3/pw_batch_norm/moving_variance', 5, 5, 'PW_CONV3',
        'DS-CNN/conv_ds_3/pw_batch_norm/beta'
    ]
    dw_conv_4 = [
        'DS-CNN/conv_ds_4/depthwise_conv/depthwise_weights',
        'DS-CNN/conv_ds_4/depthwise_conv/biases',
        'DS-CNN/conv_ds_4/dw_batch_norm/moving_mean',
        'DS-CNN/conv_ds_4/dw_batch_norm/moving_variance', 5, 5, 'DW_CONV4',
        'DS-CNN/conv_ds_4/dw_batch_norm/beta'
    ]
    pw_conv_4 = [
        'DS-CNN/conv_ds_4/pointwise_conv/weights',
        'DS-CNN/conv_ds_4/pointwise_conv/biases',
        'DS-CNN/conv_ds_4/pw_batch_norm/moving_mean',
        'DS-CNN/conv_ds_4/pw_batch_norm/moving_variance', 5, 5, 'PW_CONV4',
        'DS-CNN/conv_ds_4/pw_batch_norm/beta'
    ]
    dw_conv_5 = [
        'DS-CNN/conv_ds_5/depthwise_conv/depthwise_weights',
        'DS-CNN/conv_ds_5/depthwise_conv/biases',
        'DS-CNN/conv_ds_5/dw_batch_norm/moving_mean',
        'DS-CNN/conv_ds_5/dw_batch_norm/moving_variance', 5, 5, 'DW_CONV5',
        'DS-CNN/conv_ds_5/dw_batch_norm/beta'
    ]
    pw_conv_5 = [
        'DS-CNN/conv_ds_5/pointwise_conv/weights',
        'DS-CNN/conv_ds_5/pointwise_conv/biases',
        'DS-CNN/conv_ds_5/pw_batch_norm/moving_mean',
        'DS-CNN/conv_ds_5/pw_batch_norm/moving_variance', 5, 5, 'PW_CONV5',
        'DS-CNN/conv_ds_5/pw_batch_norm/beta'
    ]
    dw_conv_6 = [
        'DS-CNN/conv_ds_6/depthwise_conv/depthwise_weights',
        'DS-CNN/conv_ds_6/depthwise_conv/biases',
        'DS-CNN/conv_ds_6/dw_batch_norm/moving_mean',
        'DS-CNN/conv_ds_6/dw_batch_norm/moving_variance', 5, 5, 'DW_CONV6',
        'DS-CNN/conv_ds_6/dw_batch_norm/beta'
    ]
    pw_conv_6 = [
        'DS-CNN/conv_ds_6/pointwise_conv/weights',
        'DS-CNN/conv_ds_6/pointwise_conv/biases',
        'DS-CNN/conv_ds_6/pw_batch_norm/moving_mean',
        'DS-CNN/conv_ds_6/pw_batch_norm/moving_variance', 5, 5, 'PW_CONV6',
        'DS-CNN/conv_ds_6/pw_batch_norm/beta'
    ]
    layer_list = [
        conv_1, dw_conv_1, pw_conv_1, dw_conv_2, pw_conv_2, dw_conv_3,
        pw_conv_3, dw_conv_4, pw_conv_4, dw_conv_5, pw_conv_5, dw_conv_6,
        pw_conv_6
    ]
    n_filters = 76
    for layer in layer_list:
        bit_width = reg_conv_bits
        layer_name = layer[6]
        PW = False
        if (layer_name[0:2] == 'PW'):
            PW = True
            bit_width = pw_conv_bits
        DW = False
        if (layer_name[0:2] == 'DW'):
            DW = True
            bit_width = dw_conv_bits
        print("Name of node - " + layer[6])
        for v in tf.trainable_variables():
            if v.name == layer[0] + ':0':
                v_weights = v
            if v.name == layer[1] + ':0':
                v_bias = v
            if v.name == layer[7] + ':0':
                v_beta = v
        for v in tf.global_variables():
            if v.name == layer[2] + ':0':
                v_mean = v
            if v.name == layer[3] + ':0':
                v_var = v
        weights = sess.run(v_weights)
        bias = sess.run(v_bias)
        beta = sess.run(v_beta)
        mean = sess.run(v_mean)
        var = sess.run(v_var)
        #print("Weights shape: " + str(weights.shape))
        #print("Bias shape: " + str(bias.shape))
        #print("Var shape: " + str(var.shape))
        #print("Mean shape: " + str(mean.shape))
        #print("Beta shape: " + str(beta.shape))

        w_shape = weights.shape
        b_shape = bias.shape
        weights = weights.squeeze()
        weights_t1 = np.zeros(weights.shape)
        bias_t1 = np.zeros((1, n_filters))
        for i in range(0, len(bias)):
            if (PW):
                filter = weights[:, i]
            else:
                filter = weights[:, :, i]
            bias_temp = bias[i]
            mean_temp = mean[i]
            var_temp = var[i]
            beta_temp = beta[i]
            new_filter = filter / math.sqrt(var_temp + eps)
            new_bias = beta_temp + (bias_temp -
                                    mean_temp) / (math.sqrt(var_temp + eps))
            if (PW):
                weights_t1[:, i] = new_filter
            else:
                weights_t1[:, :, i] = new_filter
            bias_t1[0, i] = new_bias
            #if (i == 0):
            #print('filters : ' + str(filter))
            #print('Bias : ' + str(bias_temp))
            #print('Mean : ' + str(mean_temp))
            #print('Variance : ' + str(var_temp))
            #print("New filter : " + str(new_filter))
            #print("New Bias : " + str(new_bias))
        min_value = weights_t1.min()
        max_value = weights_t1.max()
        int_bits = int(np.ceil(np.log2(max(abs(min_value), abs(max_value)))))

        dec_bits_weight = min((bit_width - 1) - int_bits, 111)
        weights_quant = np.round(weights_t1 * 2**dec_bits_weight)
        weights_quant = weights_quant / (2**dec_bits_weight)
        weights_quant = weights_quant.reshape(w_shape)
        #print("input fractional bits: " + str(layer[4]))
        #print("Weights min value: " + str(min_value))
        #print("Weights max value: " + str(max_value))
        #print("Weights fractional bits: " + str(dec_bits_weight))
        min_value = bias_t1.min()
        max_value = bias_t1.max()
        int_bits = int(np.ceil(np.log2(max(abs(min_value), abs(max_value)))))
        dec_bits_bias = min((bit_width - 1) - int_bits, 10000)
        bias_quant = np.round(bias_t1 * 2**dec_bits_bias)
        bias_quant = bias_quant / (2**dec_bits_bias)
        bias_quant = bias_quant.reshape(b_shape)
        bias_left_shift = layer[4] + dec_bits_weight - dec_bits_bias
        #print("Bias min value: " + str(min_value))
        #print("Bias max value: " + str(max_value))
        #print("Bias fractional bits: " + str(dec_bits_bias))

        # update the weights in tensorflow graph for quantizing the activations
        updated_weights = sess.run(tf.assign(v_weights, weights_quant))
        updated_bias = sess.run(tf.assign(v_bias, bias_quant))

    fc_layer = ['DS-CNN/fc1/weights', 'DS-CNN/fc1/biases', 5, 3, 'FC']
    for v in tf.trainable_variables():
        if v.name == fc_layer[0] + ':0':
            v_fc_weights = v
        if v.name == fc_layer[1] + ':0':
            v_fc_bias = v
    weights = sess.run(v_fc_weights)
    bias = sess.run(v_fc_bias)
    w_shape = weights.shape
    b_shape = bias.shape
    #print("FC weights : " + str(weights.shape))
    #print(weights)
    #print("FC bias : " + str(bias.shape))
    #print(bias)
    min_value = weights.min()
    max_value = weights.max()
    int_bits = int(np.ceil(np.log2(max(abs(min_value), abs(max_value)))))
    dec_bits_weight = min((fc_bits - 1) - int_bits, 111)
    weights_quant = np.round(weights * 2**dec_bits_weight)
    weights_quant = weights_quant / (2**dec_bits_weight)
    weights_quant = weights_quant.reshape(w_shape)
    #print("input fractional bits: " + str(fc_layer[2]))
    #print("Weights min value: " + str(min_value))
    #print("Weights max value: " + str(max_value))
    #print("Weights fractional bits: " + str(dec_bits_weight))
    min_value = bias.min()
    max_value = bias.max()
    int_bits = int(np.ceil(np.log2(max(abs(min_value), abs(max_value)))))
    dec_bits_bias = min((fc_bits - 1) - int_bits, 10000)
    bias_quant = np.round(bias * 2**dec_bits_bias)
    #print("Bias min value: " + str(min_value))
    #print("Bias max value: " + str(max_value))
    #print("Bias fractional bits: " + str(dec_bits_bias))
    bias_quant = bias_quant / (2**dec_bits_bias)
    bias_quant = bias_quant.reshape(b_shape)
    #print("Quantized weights: " + str(weights_quant))
    #print("Quantized bias: " +str(bias_quant))
    updated_weights = sess.run(tf.assign(v_fc_weights, weights_quant))
    updated_bias = sess.run(tf.assign(v_fc_bias, bias_quant))
    #print("bias[0] : " + str(bias[0]))
    #print("bias_quant[0] : " + str(bias_quant[0]))

    training_step = 30000
    checkpoint_path = os.path.join(FLAGS.train_dir, 'quant',
                                   FLAGS.model_architecture + '.ckpt')
    tf.logging.info('Saving best model to "%s-%d"', checkpoint_path,
                    training_step)
    saver.save(sess, checkpoint_path, global_step=training_step)
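
The per-layer loop above folds the batch-norm statistics into the weights and then quantizes them to a fixed-point format: int_bits = ceil(log2(max|w|)), dec_bits = (bit_width - 1) - int_bits, and every value is rounded to the nearest multiple of 2**-dec_bits. A small self-contained illustration of that arithmetic with made-up values:

import numpy as np

weights = np.array([0.73, -0.41, 0.05, -1.20])
bit_width = 8
int_bits = int(np.ceil(np.log2(np.max(np.abs(weights)))))  # 1, since max |w| = 1.2
dec_bits = (bit_width - 1) - int_bits                       # 6 fractional bits (Q1.6)
quantized = np.round(weights * 2**dec_bits) / 2**dec_bits
print(quantized)  # [ 0.734375 -0.40625  0.046875 -1.203125]
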
def audiofile_to_features(wav_filename):
    samples = tf.io.read_file(wav_filename)
    decoded = contrib_audio.decode_wav(samples, desired_channels=1)
    features, features_len = samples_to_mfccs(decoded.audio, decoded.sample_rate)

    return features, features_len
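
samples_to_mfccs is defined elsewhere in that project (the snippet follows the DeepSpeech feature pipeline); a sketch of evaluating the features for one file, with the path illustrative:

features_op, features_len_op = audiofile_to_features(tf.constant('/tmp/example.wav'))
with tf.Session() as sess:
    features, features_len = sess.run([features_op, features_len_op])
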
Example #22
  def prepare_processing_graph(self, model_settings, summaries_dir):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
      summaries_dir: Path to save training summary information to.

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
    """
    with tf.get_default_graph().name_scope('data'):
      desired_samples = model_settings['desired_samples']
      self.wav_filename_placeholder_ = tf.placeholder(
          tf.string, [], name='wav_filename')
      wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
      wav_decoder = contrib_audio.decode_wav(
          wav_loader, desired_channels=1, desired_samples=desired_samples)
      # Allow the audio sample's volume to be adjusted.
      self.foreground_volume_placeholder_ = tf.placeholder(
          tf.float32, [], name='foreground_volume')
      scaled_foreground = tf.multiply(wav_decoder.audio,
                                      self.foreground_volume_placeholder_)
      # Shift the sample's start position, and pad any gaps with zeros.
      self.time_shift_padding_placeholder_ = tf.placeholder(
          tf.int32, [2, 2], name='time_shift_padding')
      self.time_shift_offset_placeholder_ = tf.placeholder(
          tf.int32, [2], name='time_shift_offset')
      padded_foreground = tf.pad(
          scaled_foreground,
          self.time_shift_padding_placeholder_,
          mode='CONSTANT')
      sliced_foreground = tf.slice(padded_foreground,
                                   self.time_shift_offset_placeholder_,
                                   [desired_samples, -1])
      # Mix in background noise.
      self.background_data_placeholder_ = tf.placeholder(
          tf.float32, [desired_samples, 1], name='background_data')
      self.background_volume_placeholder_ = tf.placeholder(
          tf.float32, [], name='background_volume')
      background_mul = tf.multiply(self.background_data_placeholder_,
                                   self.background_volume_placeholder_)
      background_add = tf.add(background_mul, sliced_foreground)
      background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
      # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
      spectrogram = contrib_audio.audio_spectrogram(
          background_clamp,
          window_size=model_settings['window_size_samples'],
          stride=model_settings['window_stride_samples'],
          magnitude_squared=True)
      tf.summary.image(
          'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)
      # The number of buckets in each FFT row in the spectrogram will depend on
      # how many input samples there are in each window. This can be quite
      # large, with a 160 sample window producing 127 buckets for example. We
      # don't need this level of detail for classification, so we often want to
      # shrink them down to produce a smaller result. That's what this section
      # implements. One method is to use average pooling to merge adjacent
      # buckets, but a more sophisticated approach is to apply the MFCC
      # algorithm to shrink the representation.
      if model_settings['preprocess'] == 'average':
        self.output_ = tf.nn.pool(
            tf.expand_dims(spectrogram, -1),
            window_shape=[1, model_settings['average_window_width']],
            strides=[1, model_settings['average_window_width']],
            pooling_type='AVG',
            padding='SAME')
        tf.summary.image('shrunk_spectrogram', self.output_, max_outputs=1)
      elif model_settings['preprocess'] == 'mfcc':
        self.output_ = contrib_audio.mfcc(
            spectrogram,
            wav_decoder.sample_rate,
            dct_coefficient_count=model_settings['fingerprint_width'])
        tf.summary.image(
            'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
      else:
        raise ValueError('Unknown preprocess mode "%s" (should be "mfcc" or'
                         ' "average")' % (model_settings['preprocess']))

      # Merge all the summaries and write them out to /tmp/retrain_logs (by
      # default)
      self.merged_summaries_ = tf.summary.merge_all(scope='data')
      self.summary_writer_ = tf.summary.FileWriter(summaries_dir + '/data',
                                                   tf.get_default_graph())
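
As in Example #3, evaluating the output means feeding every placeholder; the additions here are the 'data' name scope and the merged summaries. A rough sketch with illustrative values (ap is the AudioProcessor-like instance, sess an active session):

fingerprint, summary = sess.run(
    [ap.output_, ap.merged_summaries_],
    feed_dict={
        ap.wav_filename_placeholder_: '/tmp/example.wav',  # hypothetical path
        ap.foreground_volume_placeholder_: 1.0,
        ap.time_shift_padding_placeholder_: [[0, 0], [0, 0]],
        ap.time_shift_offset_placeholder_: [0, 0],
        ap.background_data_placeholder_: np.zeros(
            (model_settings['desired_samples'], 1)),
        ap.background_volume_placeholder_: 0.0,
    })
ap.summary_writer_.add_summary(summary)
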
Example #23
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
    """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    feature_bin_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    preprocess: How the spectrogram is processed to produce features, for
      example 'mfcc' or 'average'.

  Raises:
    Exception: If the preprocessing mode isn't recognized.
  """

    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, feature_bin_count, preprocess)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    daudio = tf.identity(decoded_sample_data.audio, name='dao')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)

    if preprocess == 'average':
        fingerprint_input = tf.nn.pool(
            tf.expand_dims(spectrogram, -1),
            window_shape=[1, model_settings['average_window_width']],
            strides=[1, model_settings['average_window_width']],
            pooling_type='AVG',
            padding='SAME')
    elif preprocess == 'mfcc':
        fingerprint_input = contrib_audio.mfcc(
            spectrogram,
            sample_rate,
            dct_coefficient_count=model_settings['fingerprint_width'])
    else:
        raise Exception('Unknown preprocess mode "%s" (should be "mfcc" or'
                        ' "average")' % (preprocess))

    fingerprint_size = model_settings['fingerprint_size']
    reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])

    logits = models.create_model(reshaped_input,
                                 model_settings,
                                 model_architecture,
                                 is_training=False,
                                 runtime_settings=runtime_settings)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
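For context, here is a minimal sketch of how a graph built by create_inference_graph is typically frozen for deployment; the checkpoint path and argument values are hypothetical placeholders, and create_inference_graph is assumed to be importable from this module.

# Hedged sketch: build and freeze an inference graph. The checkpoint path and
# argument values below are hypothetical placeholders.
import tensorflow as tf
from tensorflow.python.framework import graph_util

with tf.Session() as sess:
  create_inference_graph(
      wanted_words='yes,no', sample_rate=16000, clip_duration_ms=1000,
      clip_stride_ms=30, window_size_ms=30.0, window_stride_ms=10.0,
      feature_bin_count=40, model_architecture='conv', preprocess='mfcc')
  # Restore the trained weights into the freshly built graph.
  tf.train.Saver(tf.global_variables()).restore(
      sess, '/tmp/speech_commands_train/conv.ckpt-18000')
  # Convert variables to constants so the graph is self-contained.
  frozen_graph_def = graph_util.convert_variables_to_constants(
      sess, sess.graph_def, ['labels_softmax'])
  tf.train.write_graph(frozen_graph_def, '/tmp', 'frozen_graph.pb',
                       as_text=False)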
Beispiel #24
0
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
    """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: Length of each audio clip to be analyzed, in milliseconds.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
  """

    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    # The spectrogram/MFCC feature branch below is disabled; the raw decoded
    # audio samples are fed to the model instead.
    if False:
        spectrogram = contrib_audio.audio_spectrogram(
            decoded_sample_data.audio,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        fingerprint_input = contrib_audio.mfcc(
            spectrogram,
            decoded_sample_data.sample_rate,
            dct_coefficient_count=dct_coefficient_count)
        fingerprint_frequency_size = model_settings['dct_coefficient_count']
        fingerprint_time_size = model_settings['spectrogram_length']
        reshaped_input = tf.reshape(
            fingerprint_input,
            [-1, fingerprint_time_size * fingerprint_frequency_size])
    else:
        audio_size = model_settings['desired_samples']
        reshaped_input = tf.reshape(decoded_sample_data.audio,
                                    [-1, audio_size])

    logits = models.create_model(reshaped_input,
                                 model_settings,
                                 model_architecture,
                                 is_training=False,
                                 norm_binw=FLAGS.norm_binw,
                                 downsample=FLAGS.downsample,
                                 add_prefilter_bias=FLAGS.prefilter_bias,
                                 use_down_avgfilt=FLAGS.use_down_avgfilt,
                                 runtime_settings=runtime_settings)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
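Once such a graph has been frozen, inference amounts to feeding raw WAV bytes into the 'wav_data' placeholder and reading the 'labels_softmax' output. A minimal sketch with hypothetical file paths:

# Hedged sketch: run a frozen speech-commands graph on one WAV file.
# 'frozen_graph.pb' and 'sample.wav' are hypothetical placeholder paths.
import tensorflow as tf

graph_def = tf.GraphDef()
with tf.gfile.GFile('frozen_graph.pb', 'rb') as f:
  graph_def.ParseFromString(f.read())
tf.import_graph_def(graph_def, name='')

with open('sample.wav', 'rb') as wav_file:
  wav_data = wav_file.read()

with tf.Session() as sess:
  softmax = sess.graph.get_tensor_by_name('labels_softmax:0')
  scores = sess.run(softmax, feed_dict={'wav_data:0': wav_data})
  print(scores)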
Beispiel #25
0
    def prepare_processing_graph(self, model_settings):
        """Builds a TensorFlow graph to apply the input distortions.
        Creates a graph that loads a WAVE file, decodes it, scales the volume,
        shifts it in time, adds in background noise, calculates a spectrogram, and
        then builds an MFCC fingerprint from that.
        This must be called with an active TensorFlow session running, and it
        creates multiple placeholder inputs, and one output:
          - wav_filename_placeholder_: Filename of the WAV to load.
          - foreground_volume_placeholder_: How loud the main clip should be.
          - time_shift_padding_placeholder_: Where to pad the clip.
          - time_shift_offset_placeholder_: How much to move the clip in time.
          - background_data_placeholder_: PCM sample data for background noise.
          - background_volume_placeholder_: Loudness of mixed-in background.
          - mfcc_: Output 2D fingerprint of processed audio.
        Args:
          model_settings: Information about the current model being trained.
        """
        desired_samples = model_settings['desired_samples']
        self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = contrib_audio.decode_wav(
            wav_loader, desired_channels=1, desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
        self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
        padded_foreground = tf.pad(
            scaled_foreground,
            self.time_shift_padding_placeholder_,
            mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [desired_samples, -1])
        # Mix in background noise.
        self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                           [desired_samples, 1])
        self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, sliced_foreground)
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.

        # Alternative (disabled) feature path: compute log-mel features with an
        # explicit STFT and mel filterbank instead of the audio_spectrogram /
        # mfcc ops used below.
        # mel_bias_ = linear_to_mel_weight_matrix(
        #     num_mel_bins=model_settings['dct_coefficient_count'],
        #     num_spectrogram_bins=int(2048 / 2 + 1),
        #     sample_rate=model_settings['sample_rate'],
        #     lower_edge_hertz=100,
        #     upper_edge_hertz=4800)
        # spectrogram = tf.abs(tf.contrib.signal.stft(
        #     tf.transpose(background_clamp),
        #     model_settings['window_size_samples'],
        #     model_settings['window_stride_samples'],
        #     fft_length=2048,
        #     window_fn=tf.contrib.signal.hann_window,
        #     pad_end=False))
        # self.mfcc_ = tf.matmul(
        #     tf.reshape(tf.pow(spectrogram, 2), [-1, 1025]), mel_bias_)
        # self.mfcc_ = tf.log(tf.maximum(self.mfcc_, 1e-7))

        spectrogram = contrib_audio.audio_spectrogram(
           background_clamp,
           window_size=model_settings['window_size_samples'],
           stride=model_settings['window_stride_samples'],
           magnitude_squared=True)
        self.mfcc_ = contrib_audio.mfcc(
           spectrogram,
           wav_decoder.sample_rate,
           dct_coefficient_count=model_settings['dct_coefficient_count'])
Beispiel #26
0
import sys

import numpy as np
import tensorflow as tf
from tensorflow.python.ops import io_ops
from tensorflow.contrib.framework.python.ops import audio_ops

if len(sys.argv) < 3:
    raise ValueError("give me a path to model and to a file float32 .dat")
else:
    model_path = sys.argv[1]
    file_path = sys.argv[2]

FRAME_SIZE = 640
FRAME_STRIDE = 320
SAMPLE_RATE = DESIRED_SAMPLES = 16000
NUM_CEP = 10

wav_loader = io_ops.read_file(file_path)
wav_decoder = audio_ops.decode_wav(wav_loader,
                                   desired_channels=1,
                                   desired_samples=DESIRED_SAMPLES)
# Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
spectrograms_power = audio_ops.audio_spectrogram(wav_decoder.audio,
                                                 window_size=FRAME_SIZE,
                                                 stride=FRAME_STRIDE,
                                                 magnitude_squared=True)
USE_POWER = True
if USE_POWER:
    # Warp the linear scale spectrograms into the mel-scale.
    num_spectrogram_bins = spectrograms_power.shape[-1].value
    lower_edge_hertz, upper_edge_hertz, num_mel_bins = 20.0, 4000.0, 40
    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins, num_spectrogram_bins, 16000.0, lower_edge_hertz,
        upper_edge_hertz)
    mel_spectrograms = tf.tensordot(spectrograms_power,
                                    linear_to_mel_weight_matrix, 1)
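The snippet is cut off at this point. Below is a hedged sketch (written at top level for clarity, not part of the original script) of how the remaining steps typically look; it mirrors the log-mel/MFCC code in Beispiel #29 further down, with NUM_CEP standing in for the number of coefficients to keep.

# Hedged continuation sketch; mirrors the log-mel/MFCC code in Beispiel #29.
mel_spectrograms.set_shape(spectrograms_power.shape[:-1].concatenate(
    linear_to_mel_weight_matrix.shape[-1:]))

# Stabilized log to get log-magnitude mel-scale spectrograms.
log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)

# MFCCs from the log-mel spectrograms, keeping the first NUM_CEP coefficients.
mfccs = tf.signal.mfccs_from_log_mel_spectrograms(
    log_mel_spectrograms)[..., :NUM_CEP]

with tf.Session() as sess:
    features = sess.run(mfccs)
    print(features.shape)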
Beispiel #27
0
    cond_fp = tf.placeholder(tf.string, [])
    cond_dataset = tf.data.TextLineDataset(
        [cond_fp])  # Multiple conditional texts per audio file
    cond_texts_iter = cond_dataset.make_initializable_iterator()
    cond_text = cond_texts_iter.get_next()

    # Conditional text embedding
    embed = hub.Module("https://tfhub.dev/google/elmo/2",
                       trainable=False,
                       name='embed')
    cond_texts_batch = tf.placeholder(tf.string, [None])
    cond_text_embeds = embed(cond_texts_batch)

    audio_fp = tf.placeholder(tf.string, [])
    audio_bin = tf.read_file(audio_fp)
    samps = contrib_audio.decode_wav(audio_bin, 1).audio[:, 0]

    if slice_len_samps is not None:
        if args.first_only:
            pad_end = True
        else:
            pad_end = False

        slices = tf.contrib.signal.frame(samps,
                                         slice_len_samps,
                                         slice_len_samps,
                                         axis=0,
                                         pad_end=pad_end)

        if args.nrg_top_k:
            nsecs = tf.cast(tf.shape(samps)[0], tf.float32) / args.fs
Beispiel #28
0
    def prepare_processing_graph(self, model_settings, summaries_dir):
        """
    建立张量流图以应用输入失真。
    创建一个图形,加载一个WAVE文件,对其进行解码、缩放体积、平移,
    添加背景噪声,计算一个声谱图,然后从中生成MFCC特征。
    必须在TensorFlow会话运行时调用它,它会创建多个占位符输入和一个输出::

      - wav_filename_placeholder_: 音频文件名
      - foreground_volume_placeholder_: 主剪辑的声音应该有多大
      - time_shift_padding_placeholder_: 在哪个位置剪辑
      - time_shift_offset_placeholder_: 在剪辑上移动多少
      - background_data_placeholder_: 背景噪声的PCM采样数据
      - background_volume_placeholder_: 背景中混音的响度
      - output_: 经过处理后的二维输出

    Args:
      model_settings: 正在训练的当前模型信息
      summaries_dir: 保存训练摘要信息的路径
      
    """
        with tf.get_default_graph().name_scope('data'):
            desired_samples = model_settings['desired_samples']
            self.wav_filename_placeholder_ = tf.placeholder(
                tf.string, [], name='wav_filename')
            wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
            wav_decoder = contrib_audio.decode_wav(
                wav_loader,
                desired_channels=1,
                desired_samples=desired_samples)

            # Allow the audio sample's volume to be adjusted.

            self.foreground_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='foreground_volume')
            scaled_foreground = tf.multiply(
                wav_decoder.audio, self.foreground_volume_placeholder_)

            # Shift the sample's start position, and pad any gaps with zeros.

            self.time_shift_padding_placeholder_ = tf.placeholder(
                tf.int32, [2, 2], name='time_shift_padding')
            self.time_shift_offset_placeholder_ = tf.placeholder(
                tf.int32, [2], name='time_shift_offset')
            padded_foreground = tf.pad(scaled_foreground,
                                       self.time_shift_padding_placeholder_,
                                       mode='CONSTANT')
            sliced_foreground = tf.slice(padded_foreground,
                                         self.time_shift_offset_placeholder_,
                                         [desired_samples, -1])
            # Mix in background noise.
            self.background_data_placeholder_ = tf.placeholder(
                tf.float32, [desired_samples, 1], name='background_data')
            self.background_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='background_volume')
            background_mul = tf.multiply(self.background_data_placeholder_,
                                         self.background_volume_placeholder_)
            background_add = tf.add(background_mul, sliced_foreground)
            background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

            # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.

            spectrogram = contrib_audio.audio_spectrogram(
                background_clamp,
                window_size=model_settings['window_size_samples'],
                stride=model_settings['window_stride_samples'],
                magnitude_squared=True)
            tf.summary.image('spectrogram',
                             tf.expand_dims(spectrogram, -1),
                             max_outputs=1)

            # The number of buckets in each FFT row of the spectrogram depends on
            # how many input samples each window contains. We don't need that level
            # of detail for classification, so we shrink the representation: either
            # by average-pooling adjacent buckets, or with the more sophisticated
            # MFCC algorithm.

            if model_settings['preprocess'] == 'average':
                self.output_ = tf.nn.pool(
                    tf.expand_dims(spectrogram, -1),
                    window_shape=[1, model_settings['average_window_width']],
                    strides=[1, model_settings['average_window_width']],
                    pooling_type='AVG',
                    padding='SAME')
                tf.summary.image('shrunk_spectrogram',
                                 self.output_,
                                 max_outputs=1)
            elif model_settings['preprocess'] == 'mfcc':
                self.output_ = contrib_audio.mfcc(
                    spectrogram,
                    wav_decoder.sample_rate,
                    dct_coefficient_count=model_settings['fingerprint_width'])
                tf.summary.image('mfcc',
                                 tf.expand_dims(self.output_, -1),
                                 max_outputs=1)
            else:
                raise ValueError(
                    'Unknown preprocess mode "%s" (should be "mfcc" or'
                    ' "average")' % (model_settings['preprocess']))

            # Merge all the summaries and write them out to /tmp/retrain_logs (by default).

            self.merged_summaries_ = tf.summary.merge_all(scope='data')
            self.summary_writer_ = tf.summary.FileWriter(
                summaries_dir + '/data', tf.get_default_graph())
Beispiel #29
0
  def prepare_processing_graph(self, model_settings):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=desired_samples)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(
        scaled_foreground,
        self.time_shift_padding_placeholder_,
        mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                       [desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrograms_power = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)

    if USE_POWER:
      # Warp the linear scale spectrograms into the mel-scale.
      num_spectrogram_bins = spectrograms_power.shape[-1].value
      lower_edge_hertz, upper_edge_hertz, num_mel_bins = 20.0, 4000.0, 40
      linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins, num_spectrogram_bins, 16000.0, lower_edge_hertz,
        upper_edge_hertz)
      mel_spectrograms = tf.tensordot(
        spectrograms_power, linear_to_mel_weight_matrix, 1)
      mel_spectrograms.set_shape(spectrograms_power.shape[:-1].concatenate(
        linear_to_mel_weight_matrix.shape[-1:]))

      # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
      log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)

      # Compute MFCCs from log_mel_spectrograms and take the first NDCT.
      mfccs = tf.signal.mfccs_from_log_mel_spectrograms(
        log_mel_spectrograms)[..., :model_settings['dct_coefficient_count']]
      self.mfcc_ = tf.expand_dims(mfccs, axis=0)
    else:
      self.mfcc_ = contrib_audio.mfcc(
          spectrograms_power,
          wav_decoder.sample_rate,
          dct_coefficient_count=model_settings['dct_coefficient_count'])

  def set_size(self, mode):
    """Calculates the number of samples in the dataset partition.

    Args:
      mode: Which partition, must be 'training', 'validation', or 'testing'.

    Returns:
      Number of samples in the partition.
    """
    return len(self.data_index[mode])

  def get_data(self, how_many, offset, model_settings, background_frequency,
               background_volume_range, time_shift, mode, sess):
    """Gather samples from the data set, applying transformations as needed.

    When the mode is 'training', a random selection of samples will be returned,
    otherwise the first N clips in the partition will be used. This ensures that
    validation always uses the same samples, reducing noise in the metrics.

    Args:
      how_many: Desired number of samples to return. -1 means the entire
        contents of this partition.
      offset: Where to start when fetching deterministically.
      model_settings: Information about the current model being trained.
      background_frequency: How many clips will have background noise, 0.0 to
        1.0.
      background_volume_range: How loud the background noise will be.
      time_shift: How much to randomly shift the clips by in time.
      mode: Which partition to use, must be 'training', 'validation', or
        'testing'.
      sess: TensorFlow session that was active when processor was created.

    Returns:
      List of sample data for the transformed samples, and list of labels in
      one-hot form.
    """
    # Pick one of the partitions to choose samples from.
    candidates = self.data_index[mode]
    if how_many == -1:
      sample_count = len(candidates)
    else:
      sample_count = max(0, min(how_many, len(candidates) - offset))
    # Data and labels will be populated and returned.
    data = np.zeros((sample_count, model_settings['fingerprint_size']))
    labels = np.zeros((sample_count, model_settings['label_count']))
    desired_samples = model_settings['desired_samples']
    use_background = self.background_data and (mode == 'training')
    pick_deterministically = (mode != 'training')
    # Use the processing graph we created earlier to repeatedly generate the
    # final output sample data we'll use in training.
    for i in xrange(offset, offset + sample_count):
      # Pick which audio sample to use.
      if how_many == -1 or pick_deterministically:
        sample_index = i
      else:
        sample_index = np.random.randint(len(candidates))
      sample = candidates[sample_index]
      # If we're time shifting, set up the offset for this sample.
      if time_shift > 0:
        time_shift_amount = np.random.randint(-time_shift, time_shift)
      else:
        time_shift_amount = 0
      if time_shift_amount > 0:
        time_shift_padding = [[time_shift_amount, 0], [0, 0]]
        time_shift_offset = [0, 0]
      else:
        time_shift_padding = [[0, -time_shift_amount], [0, 0]]
        time_shift_offset = [-time_shift_amount, 0]
      input_dict = {
          self.wav_filename_placeholder_: sample['file'],
          self.time_shift_padding_placeholder_: time_shift_padding,
          self.time_shift_offset_placeholder_: time_shift_offset,
      }
      # Choose a section of background noise to mix in.
      if use_background:
        background_index = np.random.randint(len(self.background_data))
        background_samples = self.background_data[background_index]
        background_offset = np.random.randint(
            0, len(background_samples) - model_settings['desired_samples'])
        background_clipped = background_samples[background_offset:(
            background_offset + desired_samples)]
        background_reshaped = background_clipped.reshape([desired_samples, 1])
        if np.random.uniform(0, 1) < background_frequency:
          background_volume = np.random.uniform(0, background_volume_range)
        else:
          background_volume = 0
      else:
        background_reshaped = np.zeros([desired_samples, 1])
        background_volume = 0
      input_dict[self.background_data_placeholder_] = background_reshaped
      input_dict[self.background_volume_placeholder_] = background_volume
      # If we want silence, mute out the main sample but leave the background.
      if sample['label'] == SILENCE_LABEL:
        input_dict[self.foreground_volume_placeholder_] = 0
      else:
        input_dict[self.foreground_volume_placeholder_] = 1
      # Run the graph to produce the output feature data for this sample.
      data[i - offset, :] = sess.run(self.mfcc_, feed_dict=input_dict).flatten()
      label_index = self.word_to_index[sample['label']]
      labels[i - offset, label_index] = 1
    return data, labels

  def get_wav_files(self, how_many, offset, model_settings, mode):
    """Return wav_file names and labels from train/val/test sets.
    """
    # Pick one of the partitions to choose samples from.
    candidates = self.data_index[mode]
    if how_many == -1:
      sample_count = len(candidates)
    else:
      sample_count = max(0, min(how_many, len(candidates) - offset))
    pick_deterministically = (mode != 'training')
    wav_files = []
    labels = np.zeros((sample_count, model_settings['label_count']))
    for i in xrange(offset, offset + sample_count):
      # Pick which audio sample to use.
      if how_many == -1 or pick_deterministically:
        sample_index = i
      else:
        sample_index = np.random.randint(len(candidates))
      sample = candidates[sample_index]
      if sample['label'] == SILENCE_LABEL:
        wav_files.append('silence.wav')
      else:
        wav_files.append(sample['file'])
      label_index = self.word_to_index[sample['label']]
      labels[i - offset, label_index] = 1
    return wav_files, labels


  def get_unprocessed_data(self, how_many, model_settings, mode):
    """Retrieve sample data for the given partition, with no transformations.

    Args:
      how_many: Desired number of samples to return. -1 means the entire
        contents of this partition.
      model_settings: Information about the current model being trained.
      mode: Which partition to use, must be 'training', 'validation', or
        'testing'.

    Returns:
      List of sample data for the samples, and list of labels in one-hot form.
    """
    candidates = self.data_index[mode]
    if how_many == -1:
      sample_count = len(candidates)
    else:
      sample_count = how_many
    desired_samples = model_settings['desired_samples']
    words_list = self.words_list
    data = np.zeros((sample_count, desired_samples))
    labels = []
    with tf.Session(graph=tf.Graph()) as sess:
      wav_filename_placeholder = tf.placeholder(tf.string, [])
      wav_loader = io_ops.read_file(wav_filename_placeholder)
      wav_decoder = contrib_audio.decode_wav(
          wav_loader, desired_channels=1, desired_samples=desired_samples)
      foreground_volume_placeholder = tf.placeholder(tf.float32, [])
      scaled_foreground = tf.multiply(wav_decoder.audio,
                                      foreground_volume_placeholder)
      for i in range(sample_count):
        if how_many == -1:
          sample_index = i
        else:
          sample_index = np.random.randint(len(candidates))
        sample = candidates[sample_index]
        input_dict = {wav_filename_placeholder: sample['file']}
        if sample['label'] == SILENCE_LABEL:
          input_dict[foreground_volume_placeholder] = 0
        else:
          input_dict[foreground_volume_placeholder] = 1
        data[i, :] = sess.run(scaled_foreground, feed_dict=input_dict).flatten()
        label_index = self.word_to_index[sample['label']]
        labels.append(words_list[label_index])
    return data, labels
Beispiel #30
0
    def prepare_processing_graph(self, model_settings, summaries_dir):
        """Builds a TensorFlow graph to apply the input distortions.
    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.
    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:
      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio.
    Args:
      model_settings: Information about the current model being trained.
      summaries_dir: Path to save training summary information to.
    Raises:
      ValueError: If the preprocessing mode isn't recognized.
    """
        with tf.get_default_graph().name_scope('data'):
            desired_samples = model_settings['desired_samples']
            self.wav_filename_placeholder_ = tf.placeholder(
                tf.string, [], name='wav_filename')
            wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
            wav_decoder = contrib_audio.decode_wav(
                wav_loader,
                desired_channels=1,
                desired_samples=desired_samples)
            # Allow the audio sample's volume to be adjusted.
            self.foreground_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='foreground_volume')
            scaled_foreground = tf.multiply(
                wav_decoder.audio, self.foreground_volume_placeholder_)
            # Shift the sample's start position, and pad any gaps with zeros.
            self.time_shift_padding_placeholder_ = tf.placeholder(
                tf.int32, [2, 2], name='time_shift_padding')
            self.time_shift_offset_placeholder_ = tf.placeholder(
                tf.int32, [2], name='time_shift_offset')
            padded_foreground = tf.pad(scaled_foreground,
                                       self.time_shift_padding_placeholder_,
                                       mode='CONSTANT')
            sliced_foreground = tf.slice(padded_foreground,
                                         self.time_shift_offset_placeholder_,
                                         [desired_samples, -1])
            # Mix in background noise.
            self.background_data_placeholder_ = tf.placeholder(
                tf.float32, [desired_samples, 1], name='background_data')
            self.background_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='background_volume')
            background_mul = tf.multiply(self.background_data_placeholder_,
                                         self.background_volume_placeholder_)
            background_add = tf.add(background_mul, sliced_foreground)
            background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
            # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
            spectrogram = contrib_audio.audio_spectrogram(
                background_clamp,
                window_size=model_settings['window_size_samples'],
                stride=model_settings['window_stride_samples'],
                magnitude_squared=True)
            tf.summary.image('spectrogram',
                             tf.expand_dims(spectrogram, -1),
                             max_outputs=1)
            # The number of buckets in each FFT row in the spectrogram will depend on
            # how many input samples there are in each window. This can be quite
            # large, with a 160 sample window producing 127 buckets for example. We
            # don't need this level of detail for classification, so we often want to
            # shrink them down to produce a smaller result. That's what this section
            # implements. One method is to use average pooling to merge adjacent
            # buckets, but a more sophisticated approach is to apply the MFCC
            # algorithm to shrink the representation.
            if model_settings['preprocess'] == 'average':
                self.output_ = tf.nn.pool(
                    tf.expand_dims(spectrogram, -1),
                    window_shape=[1, model_settings['average_window_width']],
                    strides=[1, model_settings['average_window_width']],
                    pooling_type='AVG',
                    padding='SAME')
                tf.summary.image('shrunk_spectrogram',
                                 self.output_,
                                 max_outputs=1)
            elif model_settings['preprocess'] == 'mfcc':
                self.output_ = contrib_audio.mfcc(
                    spectrogram,
                    wav_decoder.sample_rate,
                    dct_coefficient_count=model_settings['fingerprint_width'])
                tf.summary.image('mfcc',
                                 tf.expand_dims(self.output_, -1),
                                 max_outputs=1)
            else:
                raise ValueError(
                    'Unknown preprocess mode "%s" (should be "mfcc" or'
                    ' "average")' % (model_settings['preprocess']))

            # Merge all the summaries and write them out to /tmp/retrain_logs (by
            # default)
            self.merged_summaries_ = tf.summary.merge_all(scope='data')
            if summaries_dir:
                self.summary_writer_ = tf.summary.FileWriter(
                    summaries_dir + '/data', tf.get_default_graph())
Beispiel #31
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_tensor',
                        type=str,
                        default='decoded_sample_data',
                        help="""\
      Input data tensor name. Leave as is for the
      competition.\
      """)
    parser.add_argument('--final_tensor',
                        type=str,
                        default='labels_softmax',
                        help="""\
      Name of the softmax output tensor. Leave as is for the
      competition.\
      """)
    parser.add_argument('--frozen_path',
                        type=str,
                        default='tf_files/frozen.pb',
                        help="""\
      The frozen graph's filename.\
      """)
    parser.add_argument('--checkpoint_path',
                        type=str,
                        default='checkpoints_106/ep-062-vl-0.1815.hdf5',
                        help="""\
      Path to the hdf5 checkpoint that you want to freeze.\
      """)
    args, unparsed = parser.parse_known_args()
    custom_objects = {
        'relu6': relu6,
        'DepthwiseConv2D': DepthwiseConv2D,
        'overlapping_time_slice_stack': overlapping_time_slice_stack,
        'softmax': softmax,
        '<lambda>': smooth_categorical_crossentropy
    }

    model = load_model(args.checkpoint_path, custom_objects=custom_objects)

    # rename placeholders for special prize:
    # https://www.kaggle.com/c/tensorflow-speech-recognition-challenge#Prizes
    # decoded_sample_data:0, taking a [16000, 1] float tensor as input,
    # representing the audio PCM-encoded data.
    # `decode_wav` will produce two outputs. tf names them: 'name:0', 'name:1'.
    wav_filename_placeholder_ = tf.placeholder(tf.string, [], name='wav_fn')
    wav_loader = io_ops.read_file(wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(wav_loader,
                                           desired_channels=1,
                                           desired_samples=16000,
                                           name=args.data_tensor)

    # add batch dimension and remove last one
    # keras model wants (None, 16000)
    data_reshaped = tf.reshape(wav_decoder.audio, (1, -1))
    # call keras model
    softmax_probs = model(data_reshaped)
    # remove batch dimension
    softmax_probs = tf.reshape(softmax_probs, (-1, ), name=args.final_tensor)

    # Grab the session that holds the Keras model's variables. This assumes
    # load_model above comes from tf.keras; with standalone Keras, use
    # keras.backend.get_session() instead.
    sess = tf.keras.backend.get_session()
    frozen_graph_def = graph_util.convert_variables_to_constants(
        sess, sess.graph.as_graph_def(), [args.final_tensor])

    with gfile.FastGFile(args.frozen_path, 'wb') as f:
        f.write(frozen_graph_def.SerializeToString())

    print("Wrote frozen graph to: %s" % args.frozen_path)
Beispiel #32
0
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
  """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: Length of each audio clip to be analyzed, in milliseconds.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    feature_bin_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    preprocess: How the spectrogram is processed to produce features, for
      example 'mfcc' or 'average'.

  Raises:
    Exception: If the preprocessing mode isn't recognized.
  """

  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, feature_bin_count, preprocess)
  runtime_settings = {'clip_stride_ms': clip_stride_ms}

  wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
  decoded_sample_data = contrib_audio.decode_wav(
      wav_data_placeholder,
      desired_channels=1,
      desired_samples=model_settings['desired_samples'],
      name='decoded_sample_data')
  spectrogram = contrib_audio.audio_spectrogram(
      decoded_sample_data.audio,
      window_size=model_settings['window_size_samples'],
      stride=model_settings['window_stride_samples'],
      magnitude_squared=True)

  if preprocess == 'average':
    fingerprint_input = tf.nn.pool(
        tf.expand_dims(spectrogram, -1),
        window_shape=[1, model_settings['average_window_width']],
        strides=[1, model_settings['average_window_width']],
        pooling_type='AVG',
        padding='SAME')
  elif preprocess == 'mfcc':
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        sample_rate,
        dct_coefficient_count=model_settings['fingerprint_width'])
  else:
    raise Exception('Unknown preprocess mode "%s" (should be "mfcc" or'
                    ' "average")' % (preprocess))

  fingerprint_size = model_settings['fingerprint_size']
  reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])

  logits = models.create_model(
      reshaped_input, model_settings, model_architecture, is_training=False,
      runtime_settings=runtime_settings)

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
Beispiel #33
0
# Taken from Mauri de Souza Nunes at https://mauri870.github.io/blog/posts/audio-spectrograms-in-tensorflow/

import tensorflow as tf
from tensorflow.contrib.framework.python.ops import audio_ops

# Wav file name
wav_file = tf.placeholder(tf.string)

# Read the wav file
audio_binary = tf.read_file(wav_file)

# Decode the wav mono into a 2D tensor with time in dimension 0
# and channel along dimension 1
waveform = audio_ops.decode_wav(audio_binary,
                                desired_channels=1)

# Compute the spectrogram
spectrogram = audio_ops.audio_spectrogram(waveform.audio,
                                          window_size=1024,
                                          stride=64)

# Custom brightness
brightness = tf.placeholder(tf.float32, shape=[])
mul = tf.multiply(spectrogram, brightness)

# Normalize pixels
min_const = tf.constant(255.)
minimum = tf.minimum(mul, min_const)

# Expand dims so we get the proper shape
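The example is truncated here. A minimal sketch of one way to finish it, turning the clipped spectrogram into a PNG image; the brightness value and file paths are hypothetical, and the resize/flip steps of the original post are omitted.

# Hedged sketch: finish the pipeline by writing the clipped spectrogram out as
# a PNG. The brightness value and file paths are hypothetical.
image = tf.expand_dims(minimum, -1)   # [1, time, freq] -> [1, time, freq, 1]
image = tf.squeeze(image, axis=0)     # -> [time, freq, 1] for encode_png
png = tf.image.encode_png(tf.cast(image, tf.uint8))

with tf.Session() as sess:
    png_bytes = sess.run(png, feed_dict={wav_file: 'sample.wav',
                                         brightness: 100.0})
with open('spectrogram.png', 'wb') as out:
    out.write(png_bytes)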
Beispiel #34
0
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
  """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: Length of each audio clip to be analyzed, in milliseconds.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    feature_bin_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    preprocess: How the spectrogram is processed to produce features, for
      example 'mfcc', 'average', or 'micro'.

  Raises:
    Exception: If the preprocessing mode isn't recognized.
  """

  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, feature_bin_count, preprocess)
  runtime_settings = {'clip_stride_ms': clip_stride_ms}

  wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
  decoded_sample_data = contrib_audio.decode_wav(
      wav_data_placeholder,
      desired_channels=1,
      desired_samples=model_settings['desired_samples'],
      name='decoded_sample_data')
  spectrogram = contrib_audio.audio_spectrogram(
      decoded_sample_data.audio,
      window_size=model_settings['window_size_samples'],
      stride=model_settings['window_stride_samples'],
      magnitude_squared=True)

  if preprocess == 'average':
    fingerprint_input = tf.nn.pool(
        tf.expand_dims(spectrogram, -1),
        window_shape=[1, model_settings['average_window_width']],
        strides=[1, model_settings['average_window_width']],
        pooling_type='AVG',
        padding='SAME')
  elif preprocess == 'mfcc':
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        sample_rate,
        dct_coefficient_count=model_settings['fingerprint_width'])
  elif preprocess == 'micro':
    if not frontend_op:
      raise Exception(
          'Micro frontend op is currently not available when running TensorFlow'
          ' directly from Python, you need to build and run through Bazel, for'
          ' example'
          ' `bazel run tensorflow/examples/speech_commands:freeze_graph`'
      )
    sample_rate = model_settings['sample_rate']
    window_size_ms = (model_settings['window_size_samples'] *
                      1000) / sample_rate
    window_step_ms = (model_settings['window_stride_samples'] *
                      1000) / sample_rate
    int16_input = tf.cast(
        tf.multiply(decoded_sample_data.audio, 32767), tf.int16)
    micro_frontend = frontend_op.audio_microfrontend(
        int16_input,
        sample_rate=sample_rate,
        window_size=window_size_ms,
        window_step=window_step_ms,
        num_channels=model_settings['fingerprint_width'],
        out_scale=1,
        out_type=tf.float32)
    fingerprint_input = tf.multiply(micro_frontend, (10.0 / 256.0))
  else:
    raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
                    ' "average", or "micro")' % (preprocess))

  fingerprint_size = model_settings['fingerprint_size']
  reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])

  logits = models.create_model(
      reshaped_input, model_settings, model_architecture, is_training=False,
      runtime_settings=runtime_settings)

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
Beispiel #35
0
import tensorflow as tf
import numpy as np
from tensorflow.python.ops import io_ops
from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
from scipy.io import wavfile

filename = '/Users/tsingh1/Developer/kaggle/speech/data/train/audio/bed/d78858d9_nohash_1.wav'
filenameTensor = tf.constant(filename)
with tf.Session() as sess:
    wav_loader = io_ops.read_file(filenameTensor)
    wav_decoder = contrib_audio.decode_wav(wav_loader,
                                           desired_channels=1,
                                           desired_samples=5)
    x = wav_decoder.audio.eval().flatten()
    print('x1', x)
    print('x1', x.shape)

_, wav = wavfile.read(filename)
wav1 = wav.astype(np.float32) / np.iinfo(np.int16).max
print('w', wav)
print('w1', wav1)
print('w1', wav1.shape)
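The two printouts should agree up to the int16-to-float scaling: decode_wav is assumed here to normalize by 1/32768, while the line above divides by np.iinfo(np.int16).max (32767). A hedged check under that assumption:

# Hedged check: the first five decode_wav samples should match the scipy
# values very closely, differing only by the 32767 vs 32768 scaling.
np.testing.assert_allclose(x, wav1[:5], rtol=1e-3, atol=1e-6)
print('decode_wav and scipy.io.wavfile agree up to the scaling difference')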
Beispiel #36
0
def parse_files_function(example):
    from tensorflow.contrib.framework.python.ops import audio_ops
    wav_loader = tf.read_file(example)
    wav_tensor = audio_ops.decode_wav(wav_loader)

    return wav_tensor
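A short usage sketch for this parse function with the tf.data API; the file names are hypothetical placeholders.

# Hedged usage sketch: apply parse_files_function with the tf.data API.
import tensorflow as tf

filenames = ['clip_000.wav', 'clip_001.wav']  # hypothetical files
dataset = tf.data.Dataset.from_tensor_slices(filenames)
dataset = dataset.map(parse_files_function)
iterator = dataset.make_one_shot_iterator()
next_wav = iterator.get_next()

with tf.Session() as sess:
    audio, sample_rate = sess.run([next_wav.audio, next_wav.sample_rate])
    print(audio.shape, sample_rate)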
Beispiel #37
0
print(f"wav_test_data_path_start: {params.wav_test_data_path_start}")
print("+--------------+")
for f in iglob(params.wav_test_data_path_start + "*.wav"):
    print(f"File: {f}")
print("+--------------+")

sess = tf.Session()

file_number = 0
section_size = params.section_size
for f in file_arr:
    ch1_song = np.array([]).astype(float)
    ch2_song = np.array([]).astype(float)

    audio_binary = tf.read_file(f)
    wav_decoder = decode_wav(audio_binary, desired_channels=2)

    sample_rate, audio = sess.run([wav_decoder.sample_rate, wav_decoder.audio])
    audio = np.array(audio)

    print(len(audio[:, 0]))
    print(audio.shape)

    a0 = audio[:, 0]
    a1 = audio[:, 1]
    a0 = normalize(a0)
    a1 = normalize(a1)

    if params.overlap_sections:
        s_a0 = segment(a0, params.overlap_section_size, section_size)
        s_a1 = segment(a1, params.overlap_section_size, section_size)
Beispiel #38
0
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture,
                           model_size_info, use_mfcc):
    """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: Length of each audio clip to be analyzed, in milliseconds.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    model_size_info: Model-specific size parameters passed through to
      models.create_model.
    use_mfcc: Whether to compute MFCC features (True) or log-mel filterbank
      features (False).
  """

    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count, use_mfcc)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    if model_settings['use_mfcc']:
        fingerprint_input = contrib_audio.mfcc(
            spectrogram,
            decoded_sample_data.sample_rate,
            dct_coefficient_count=dct_coefficient_count)
    else:
        linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
            num_mel_bins=model_settings['dct_coefficient_count'],
            num_spectrogram_bins=spectrogram.shape[-1].value,
            sample_rate=model_settings['sample_rate'],
            upper_edge_hertz=7600.0,
            lower_edge_hertz=80.0)
        fingerprint_input = tf.tensordot(spectrogram,
                                         linear_to_mel_weight_matrix, 1)
        fingerprint_input.set_shape(spectrogram.shape[:-1].concatenate(
            linear_to_mel_weight_matrix.shape[-1:]))
    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size * fingerprint_frequency_size])

    logits = models.create_model(reshaped_input,
                                 model_settings,
                                 model_architecture,
                                 model_size_info,
                                 is_training=False,
                                 runtime_settings=runtime_settings)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
Beispiel #39
0
    def prepare_processing_graph(self, model_settings, input_type,
                                 volume_scale):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
      input_type: Which features to compute, either 'log-mel' or 'MFCC'.
      volume_scale: Extra gain applied to the noisy clip before feature
        extraction.
    """
        desired_samples = model_settings['desired_samples']
        self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = contrib_audio.decode_wav(wav_loader,
                                               desired_channels=1,
                                               desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
        self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
        padded_foreground = tf.pad(scaled_foreground,
                                   self.time_shift_padding_placeholder_,
                                   mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [desired_samples, -1])
        # Mix in background noise.
        self.background_data_placeholder_ = tf.placeholder(
            tf.float32, [desired_samples, 1])
        self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(
            background_mul,
            sliced_foreground)  # Noise is added to clean speech signal
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
        background_clamp = tf.multiply(background_clamp, volume_scale)
        background_clamp = tf.clip_by_value(background_clamp, -1.0, 1.0)
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
        spectrogram = contrib_audio.audio_spectrogram(
            background_clamp,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        if (input_type == 'log-mel'):
            print("log-mel energies")
            # Warp the linear-scale, magnitude spectrograms into the mel-scale.
            num_spectrogram_bins = spectrogram.shape[
                -1].value  #magnitude_spectrograms.shape[-1].value
            lower_edge_hertz, upper_edge_hertz, num_mel_bins = 20.0, 4000.0, model_settings[
                'dct_coefficient_count']
            linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
                num_mel_bins, num_spectrogram_bins,
                model_settings['sample_rate'], lower_edge_hertz,
                upper_edge_hertz)
            mel_spectrograms = tf.tensordot(spectrogram,
                                            linear_to_mel_weight_matrix, 1)
            # Note: Shape inference for `tf.tensordot` does not currently handle this case.
            mel_spectrograms.set_shape(spectrogram.shape[:-1].concatenate(
                linear_to_mel_weight_matrix.shape[-1:]))
            log_offset = 1e-6
            log_mel_spectrograms = tf.log(mel_spectrograms + log_offset)
            self.mfcc_ = log_mel_spectrograms
        elif (input_type == 'MFCC'):
            print('MFCC-features')
            self.mfcc_ = contrib_audio.mfcc(
                spectrogram,
                wav_decoder.sample_rate,
                dct_coefficient_count=model_settings['dct_coefficient_count'])
Beispiel #40
0
    def prepare_processing_graph(self, model_settings, summaries_dir):
        with tf.get_default_graph().name_scope('data'):
            desired_samples = model_settings['desired_samples']
            self.wav_filename_placeholder_ = tf.placeholder(
                tf.string, [], name='wav_filename')
            wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
            wav_decoder = contrib_audio.decode_wav(
                wav_loader,
                desired_channels=1,
                desired_samples=desired_samples)

            self.foreground_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='foreground_volume')
            scaled_foreground = tf.multiply(
                wav_decoder.audio, self.foreground_volume_placeholder_)

            self.time_shift_padding_placeholder_ = tf.placeholder(
                tf.int32, [2, 2], name='time_shift_padding')
            self.time_shift_offset_placeholder_ = tf.placeholder(
                tf.int32, [2], name='time_shift_offset')

            padded_foreground = tf.pad(scaled_foreground,
                                       self.time_shift_padding_placeholder_,
                                       mode='CONSTANT')
            sliced_foreground = tf.slice(padded_foreground,
                                         self.time_shift_offset_placeholder_,
                                         [desired_samples, -1])

            self.background_data_placeholder_ = tf.placeholder(
                tf.float32, [desired_samples, 1], name='background_data')
            self.background_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='background_volume')
            background_mul = tf.multiply(self.background_data_placeholder_,
                                         self.background_volume_placeholder_)
            background_add = tf.add(background_mul, sliced_foreground)
            background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

            spectrogram = contrib_audio.audio_spectrogram(
                background_clamp,
                window_size=model_settings['window_size_samples'],
                stride=model_settings['window_stride_samples'],
                magnitude_squared=True)
            tf.summary.image('spectrogram',
                             tf.expand_dims(spectrogram, -1),
                             max_outputs=1)

            if model_settings['preprocess'] == 'average':
                self.output_ = tf.nn.pool(
                    tf.expand_dims(spectrogram, -1),
                    window_shape=[1, model_settings['average_window_width']],
                    strides=[1, model_settings['average_window_width']],
                    pooling_type='AVG',
                    padding='SAME')
                tf.summary.image('shrunk_spectrogram',
                                 self.output_,
                                 max_outputs=1)
            elif model_settings['preprocess'] == 'mfcc':
                self.output_ = contrib_audio.mfcc(
                    spectrogram,
                    wav_decoder.sample_rate,
                    dct_coefficient_count=model_settings['fingerprint_width'])
                tf.summary.image('mfcc',
                                 tf.expand_dims(self.output_, -1),
                                 max_outputs=1)
            else:
                raise ValueError(
                    'Unknown preprocess mode "%s" (should be "mfcc" or "average")'
                    % (model_settings['preprocess']))

            self.merged_summaries_ = tf.summary.merge_all(scope='data')
            self.summary_writer_ = tf.summary.FileWriter(
                summaries_dir + '/data', tf.get_default_graph())
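
The graph above only does something once every placeholder is fed. The following is a hedged usage sketch, not part of the original example: `processor` stands for an instance of the class whose prepare_processing_graph built the graph, and the shift and volume values are made up for illustration.

import numpy as np


def run_one_clip(sess, processor, model_settings, wav_path, time_shift=100):
    """Hypothetical helper: feeds one clip through the 'data' graph above."""
    desired_samples = model_settings['desired_samples']
    feed_dict = {
        processor.wav_filename_placeholder_: wav_path,
        processor.foreground_volume_placeholder_: 1.0,
        # A positive shift pads the start of the clip and slices from offset 0.
        processor.time_shift_padding_placeholder_: [[time_shift, 0], [0, 0]],
        processor.time_shift_offset_placeholder_: [0, 0],
        # Feed silence instead of real background noise to keep the sketch
        # self-contained.
        processor.background_data_placeholder_: np.zeros(
            (desired_samples, 1), dtype=np.float32),
        processor.background_volume_placeholder_: 0.0,
    }
    summary, fingerprint = sess.run(
        [processor.merged_summaries_, processor.output_], feed_dict=feed_dict)
    processor.summary_writer_.add_summary(summary)
    return fingerprint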
Example #41
    def prepare_processing_graph(self, model_settings):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
        desired_samples = model_settings['desired_samples']
        self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = contrib_audio.decode_wav(wav_loader,
                                               desired_channels=1,
                                               desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
        self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
        padded_foreground = tf.pad(scaled_foreground,
                                   self.time_shift_padding_placeholder_,
                                   mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [desired_samples, -1])
        # Mix in background noise.
        self.background_data_placeholder_ = tf.placeholder(
            tf.float32, [desired_samples, 1])
        self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, sliced_foreground)
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the
        # audio. The original contrib_audio.audio_spectrogram / contrib_audio.mfcc
        # pair (shown in the next example) has been swapped out here for the
        # equivalent tf.contrib.signal pipeline: STFT, mel warp, log, then DCT.
        background_clamp = tf.reshape(background_clamp, [1, -1])
        self.background_clamp = background_clamp
        stfts = tf.contrib.signal.stft(
            background_clamp,
            frame_length=model_settings['window_size_samples'],
            frame_step=model_settings['window_stride_samples'],
            fft_length=512,
            window_fn=None)
        spectrograms = tf.abs(stfts)
        # Warp the linear-scale spectrograms onto the mel scale.
        num_spectrogram_bins = stfts.shape[-1].value
        lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 3800.0, 26
        linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
            num_mel_bins, num_spectrogram_bins, 16000, lower_edge_hertz,
            upper_edge_hertz)
        self.linear_to_mel_weight_matrix = linear_to_mel_weight_matrix
        self.spectrograms = spectrograms
        self.num_spectrogram_bins = num_spectrogram_bins
        mel_spectrograms = tf.tensordot(spectrograms,
                                        linear_to_mel_weight_matrix, 1)
        mel_spectrograms.set_shape(spectrograms.shape[:-1].concatenate(
            linear_to_mel_weight_matrix.shape[-1:]))

        # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
        log_mel_spectrograms = tf.log(mel_spectrograms + 1e-6)

        # Compute MFCCs from the log-mel spectrograms and keep the first
        # dct_coefficient_count coefficients.
        self.mfcc_ = tf.contrib.signal.mfccs_from_log_mel_spectrograms(
            log_mel_spectrograms)[
                ..., :model_settings['dct_coefficient_count']]
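
For context on the fft_length=512 choice above: tf.contrib.signal.stft yields fft_length // 2 + 1 frequency bins per frame and, with its default pad_end=False, one frame per full window. Below is a standalone sketch of the resulting shapes; the 16 kHz / 30 ms / 10 ms values are hypothetical, not taken from model_settings.

sample_rate = 16000          # samples per second (assumed)
clip_samples = sample_rate   # a one-second clip
frame_length = 480           # 30 ms window (assumed window_size_samples)
frame_step = 160             # 10 ms stride (assumed window_stride_samples)
fft_length = 512

num_frames = 1 + (clip_samples - frame_length) // frame_step  # 98 frames
num_spectrogram_bins = fft_length // 2 + 1                    # 257 frequency bins
num_mel_bins = 26                                             # as in the code above
print(num_frames, num_spectrogram_bins, num_mel_bins)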
Example #42
    def prepare_processing_graph(self, model_settings):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
        desired_samples = model_settings['desired_samples']
        self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = contrib_audio.decode_wav(wav_loader,
                                               desired_channels=1,
                                               desired_samples=desired_samples)

        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)

        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
        self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
        padded_foreground = tf.pad(scaled_foreground,
                                   self.time_shift_padding_placeholder_,
                                   mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [desired_samples, -1])

        # Mix in background noise.
        self.background_data_placeholder_ = tf.placeholder(
            tf.float32, [desired_samples, 1])
        self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, sliced_foreground)
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
        spectrogram = contrib_audio.audio_spectrogram(
            background_clamp,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        self.mfcc_ = contrib_audio.mfcc(
            spectrogram,
            wav_decoder.sample_rate,
            dct_coefficient_count=model_settings['dct_coefficient_count'])
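
Every variant in this section reads the same handful of model_settings fields. Below is a hedged sketch of how such a dictionary is typically derived from millisecond-level settings; the helper name and the default values are illustrative, not from the original code.

def make_model_settings(sample_rate=16000, clip_duration_ms=1000,
                        window_size_ms=30.0, window_stride_ms=10.0,
                        dct_coefficient_count=40):
    """Hypothetical builder for the model_settings dict used above."""
    desired_samples = int(sample_rate * clip_duration_ms / 1000)
    return {
        'desired_samples': desired_samples,
        'sample_rate': sample_rate,
        'window_size_samples': int(sample_rate * window_size_ms / 1000),
        'window_stride_samples': int(sample_rate * window_stride_ms / 1000),
        'dct_coefficient_count': dct_coefficient_count,
    }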
Example #43
def audiofile_to_features(wav_filename):
    samples = tf.read_file(wav_filename)
    decoded = contrib_audio.decode_wav(samples, desired_channels=1)
    features, features_len = samples_to_mfccs(decoded.audio, decoded.sample_rate)

    return features, features_len
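
samples_to_mfccs is defined elsewhere in the source project. As a minimal sketch of what such a helper might look like, assuming it follows the same contrib_audio spectrogram/MFCC pattern as the earlier examples; the window, stride, and coefficient counts are illustrative defaults only:

import tensorflow as tf
from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio


def samples_to_mfccs(samples, sample_rate, window_size_samples=512,
                     window_stride_samples=320, n_mfcc=26):
    """Hypothetical stand-in for the helper referenced above."""
    spectrogram = contrib_audio.audio_spectrogram(
        samples,
        window_size=window_size_samples,
        stride=window_stride_samples,
        magnitude_squared=True)
    mfccs = contrib_audio.mfcc(
        spectrogram, sample_rate, dct_coefficient_count=n_mfcc)
    # Drop the leading channel dimension so each row is one frame of MFCCs.
    mfccs = tf.reshape(mfccs, [-1, n_mfcc])
    return mfccs, tf.shape(mfccs)[0]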