def __init__(self, numFeats=10):
        self.sess = tf.Session(graph=tf.get_default_graph())
        data_url = 'http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz'
        data_dir = '/tmp/speech_dataset/'
        wanted_words = 'yes,no,up,down,left,right,on,off,stop,go'
        self.sample_rate = 16000
        self.background_frequency = 0.8
        self.background_volume = 0.1
        self.time_shift_ms = 100.0
        clip_duration_ms = 1000.0
        window_size_ms = 32.0
        window_stride_ms = 16.0
        dct_coefficient_count = numFeats
        unknown_percentage = 10.0
        silence_percentage = 10.0
        validation_percentage = 10.0
        testing_percentage = 10.0

        self.model_settings = models_gd.prepare_model_settings(
            len(input_data.prepare_words_list(wanted_words.split(','))),
            self.sample_rate, clip_duration_ms, window_size_ms,
            window_stride_ms, dct_coefficient_count)
        self.audio_processor = input_data.AudioProcessor(
            data_url, data_dir, silence_percentage, unknown_percentage,
            wanted_words.split(','), validation_percentage, testing_percentage,
            self.model_settings)
Example #2
def get_int2label(wanted_only=False, extend_reversed=False):
    classes = get_classes(wanted_only=wanted_only,
                          extend_reversed=extend_reversed)
    classes = prepare_words_list(classes)
    int2label = {i: l for i, l in enumerate(classes)}
    int2label = OrderedDict(sorted(int2label.items(), key=lambda x: x[0]))
    return int2label
Example #3
def get_label2int(wanted_only=False, extend_reversed=False):
    classes = get_classes(wanted_only=wanted_only,
                          extend_reversed=extend_reversed)
    classes = prepare_words_list(classes)
    label2int = {l: i for i, l in enumerate(classes)}
    label2int = OrderedDict(sorted(label2int.items(), key=lambda x: x[1]))
    return label2int
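The two helpers above are inverses of each other by construction. A minimal sanity check (a sketch only, assuming the same get_classes and prepare_words_list helpers used above are importable) could look like this:

int2label = get_int2label(wanted_only=True)
label2int = get_label2int(wanted_only=True)
# Every index maps back to the label that maps to it.
assert all(label2int[label] == idx for idx, label in int2label.items())
print(int2label)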
Example #4
def KWS_data_loader(FLAGS, sess):
    #sess = tf.Session()
    model_settings = models.prepare_model_settings(
        len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
        FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
        FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
    audio_processor = input_data.AudioProcessor(FLAGS.data_url, FLAGS.data_dir,
                                                FLAGS.silence_percentage,
                                                FLAGS.unknown_percentage,
                                                FLAGS.wanted_words.split(','),
                                                FLAGS.validation_percentage,
                                                FLAGS.testing_percentage,
                                                model_settings)
    fingerprint_size = model_settings['fingerprint_size']
    label_count = model_settings['label_count']
    time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)
    training_steps_list = list(
        map(int, FLAGS.how_many_training_steps.split(',')))
    learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
    if len(training_steps_list) != len(learning_rates_list):
        raise Exception(
            '--how_many_training_steps and --learning_rate must be equal length '
            'lists, but are %d and %d long instead' %
            (len(training_steps_list), len(learning_rates_list)))

    validation_fingerprints, validation_ground_truth = (
        audio_processor.get_data(-1, 0, model_settings, 0.0, 0.0, 0,
                                 'validation', sess))
    input_frequency_size = model_settings['dct_coefficient_count']
    input_time_size = model_settings['spectrogram_length']
    return audio_processor, training_steps_list, learning_rates_list, model_settings, time_shift_samples, validation_fingerprints, validation_ground_truth
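A hypothetical call site for the loader above; FLAGS is assumed to be an argparse/absl namespace carrying the fields the function reads (sample_rate, clip_duration_ms, wanted_words, and so on):

sess = tf.InteractiveSession()
(audio_processor, training_steps_list, learning_rates_list, model_settings,
 time_shift_samples, validation_fingerprints,
 validation_ground_truth) = KWS_data_loader(FLAGS, sess)
print(model_settings['fingerprint_size'], model_settings['label_count'])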
Example #5
def main(_):
    # We want to see all the logging messages for this tutorial.
    tf.logging.set_verbosity(tf.logging.INFO)

    # Start a new TensorFlow session.
    sess = tf.InteractiveSession()

    # Begin by making sure we have the training data we need. If you already have
    # training data of your own, use `--data_url= ` on the command line to avoid
    # downloading.
    model_settings = models.prepare_model_settings(
        len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
        FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
        FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
    audio_processor = input_data.AudioProcessor(FLAGS.data_url, FLAGS.data_dir,
                                                FLAGS.silence_percentage,
                                                FLAGS.unknown_percentage,
                                                FLAGS.wanted_words.split(','),
                                                FLAGS.validation_percentage,
                                                FLAGS.testing_percentage,
                                                model_settings)

    fingerprint_size = model_settings['fingerprint_size']
    print(fingerprint_size)
    # Pull a batch of training data from the audio_processor.
    label_count = model_settings['label_count']
    time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)
    print(time_shift_samples)
    train_fingerprints, train_ground_truth, _ = audio_processor.get_data(
        FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency,
        FLAGS.background_volume, time_shift_samples, 'training', sess)
Example #6
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
    """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: Length of each audio clip to be analyzed, in milliseconds.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
  """

    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list),
        sample_rate,
        clip_duration_ms,
        window_size_ms,
        window_stride_ms,
        dct_coefficient_count,
    )
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        decoded_sample_data.sample_rate,
        dct_coefficient_count=dct_coefficient_count)
    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size * fingerprint_frequency_size])

    logits = models.create_model(reshaped_input,
                                 model_settings,
                                 model_architecture,
                                 is_training=False,
                                 runtime_settings=runtime_settings)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
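A hedged sketch of how a graph built this way is typically frozen for deployment, in the spirit of the TensorFlow speech_commands freeze script. The architecture name 'conv' and the checkpoint path are illustrative assumptions, and models.load_variables_from_checkpoint is assumed to exist as in the other examples on this page:

sess = tf.InteractiveSession()
# Build the inference graph into the default graph.
create_inference_graph('yes,no,up,down,left,right,on,off,stop,go',
                       16000, 1000, 30, 30.0, 10.0, 40, 'conv')
# Restore trained weights (checkpoint path is a placeholder).
models.load_variables_from_checkpoint(sess, '/tmp/speech_commands_train/conv.ckpt-18000')
# Fold variables into constants and write a frozen GraphDef.
frozen_graph_def = tf.graph_util.convert_variables_to_constants(
    sess, sess.graph_def, ['labels_softmax'])
tf.train.write_graph(frozen_graph_def, '/tmp', 'frozen_graph.pb', as_text=False)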
Example #7
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
  """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: Length of each audio clip to be analyzed, in milliseconds.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
  """

  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, dct_coefficient_count)
  runtime_settings = {'clip_stride_ms': clip_stride_ms}

  wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
  decoded_sample_data = contrib_audio.decode_wav(
      wav_data_placeholder,
      desired_channels=1,
      desired_samples=model_settings['desired_samples'],
      name='decoded_sample_data')
  spectrogram = contrib_audio.audio_spectrogram(
      decoded_sample_data.audio,
      window_size=model_settings['window_size_samples'],
      stride=model_settings['window_stride_samples'],
      magnitude_squared=True)
  fingerprint_input = contrib_audio.mfcc(
      spectrogram,
      decoded_sample_data.sample_rate,
      dct_coefficient_count=dct_coefficient_count)
  fingerprint_frequency_size = model_settings['dct_coefficient_count']
  fingerprint_time_size = model_settings['spectrogram_length']
  reshaped_input = tf.reshape(fingerprint_input, [
      -1, fingerprint_time_size * fingerprint_frequency_size
  ])

  logits = models.create_model(
      reshaped_input, model_settings, model_architecture, is_training=False,
      runtime_settings=runtime_settings)

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
Example #8
def get_set(set_type):
    wanted_words = 'yes,no,up,down,left,right,on,off,stop,go'
    sample_rate = 16000
    clip_duration_ms = 1000
    window_size_ms = 30.0
    window_stride_ms = 10.0
    dct_coefficient_count = 40

    data_url = ''
    data_dir = '/tmp/speech_dataset/'
    silence_percentage = 0
    unknown_percentage = 0
    validation_percentage = 1
    testing_percentage = 1

    # Start a new TensorFlow session.
    sess = tf.InteractiveSession()

    # Begin by making sure we have the training data we need. If you already have
    # training data of your own, use `--data_url= ` on the command line to avoid
    # downloading.
    model_settings = models.prepare_model_settings(
        len(input_data.prepare_words_list(wanted_words.split(','))),
        sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)
    audio_processor = input_data.AudioProcessor(
        data_url, data_dir, silence_percentage,
        unknown_percentage,
        wanted_words.split(','), validation_percentage,
        testing_percentage, model_settings)

    data, labels = audio_processor.get_unprocessed_data(-1, model_settings, 'testing')
    # print('CREATE ANNOTATION SUBSET: Printing data then labels.')
    # print(data)
    # print(labels)
    size = audio_processor.set_size(set_type)
    print('CREATE ANNOTATION SUBSET: Printing annotation set size')
    print(size)

    annotation_listing = audio_processor.data_index[set_type]
    print('CREATE ANNOTATION SUBSET: Printing annotation set names')
    print(annotation_listing)

    return annotation_listing
Example #9
def main(_):

    tf.logging.set_verbosity(tf.logging.INFO)
    sess = tf.InteractiveSession()
    model_settings = models.prepare_model_settings(
        len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
        FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
        FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)

    audio_processor = input_data.AudioProcessor(FLAGS.data_url, FLAGS.data_dir,
                                                FLAGS.silence_percentage,
                                                FLAGS.unknown_percentage,
                                                FLAGS.wanted_words.split(','),
                                                FLAGS.validation_percentage,
                                                FLAGS.testing_percentage,
                                                model_settings)
    print(FLAGS.data_url)
    print(FLAGS.data_dir)
    print(model_settings)
Example #10
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):

    graph = tf.Graph()
    with graph.as_default():
        words_list = input_data.prepare_words_list(wanted_words.split(','))
        model_settings = models.prepare_model_settings(
            len(words_list), sample_rate, clip_duration_ms, window_size_ms,
            window_stride_ms, dct_coefficient_count)
        runtime_settings = {'clip_stride_ms': clip_stride_ms}

        wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
        decoded_sample_data = contrib_audio.decode_wav(
            wav_data_placeholder,
            desired_channels=1,
            desired_samples=model_settings['desired_samples'],
            name='decoded_sample_data')
        spectrogram = contrib_audio.audio_spectrogram(
            decoded_sample_data.audio,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        fingerprint_input = contrib_audio.mfcc(
            spectrogram,
            decoded_sample_data.sample_rate,
            dct_coefficient_count=dct_coefficient_count)
        fingerprint_frequency_size = model_settings['dct_coefficient_count']
        fingerprint_time_size = model_settings['spectrogram_length']
        reshaped_input = tf.reshape(
            fingerprint_input,
            [-1, fingerprint_time_size * fingerprint_frequency_size])

        logits = models.create_model(reshaped_input,
                                     model_settings,
                                     model_architecture,
                                     is_training=False,
                                     runtime_settings=runtime_settings)

        # Create an output to use for inference.
        tf.nn.softmax(logits, name='labels_softmax')
    return graph
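Because this variant returns its own tf.Graph, one possible way to use it (a sketch only; the architecture name, checkpoint path, and WAV path are placeholders) is to restore weights inside that graph and feed raw WAV bytes to the 'wav_data:0' placeholder:

graph = create_inference_graph('yes,no,up,down,left,right,on,off,stop,go',
                               16000, 1000, 30, 30.0, 10.0, 40, 'conv')
with graph.as_default(), tf.Session(graph=graph) as sess:
    # Assumes the same models.load_variables_from_checkpoint helper used elsewhere on this page.
    models.load_variables_from_checkpoint(sess, '/tmp/speech_commands_train/conv.ckpt-18000')
    with open('/tmp/speech_dataset/yes/0a7c2a8d_nohash_0.wav', 'rb') as wav_file:
        wav_data = wav_file.read()
    # Fetch the softmax scores by tensor name.
    scores = sess.run('labels_softmax:0', feed_dict={'wav_data:0': wav_data})
    print(scores)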
Example #11
def run_inference(wanted_words, sample_rate, clip_duration_ms, window_size_ms,
                  window_stride_ms, dct_coefficient_count, model_architecture,
                  model_size_info):

    tf.logging.set_verbosity(tf.logging.INFO)
    sess = tf.InteractiveSession()
    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)

    audio_processor = input_data.AudioProcessor(FLAGS.data_url, FLAGS.data_dir,
                                                FLAGS.silence_percentage,
                                                FLAGS.unknown_percentage,
                                                FLAGS.wanted_words.split(','),
                                                FLAGS.validation_percentage,
                                                FLAGS.testing_percentage,
                                                model_settings)

    # test set
    set_size = audio_processor.set_size('testing')
    tf.logging.info('set_size=%d', set_size)

    test_fingerprints, test_ground_truth = audio_processor.get_data(
        set_size,
        0,
        model_settings,
        0.0,
        0.0,
        0,
        'testing',
        sess,
        debugging=True,
        wav_path="speech_dataset\\up\\0a2b400e_nohash_0.wav")
    #for ii in range(set_size):
    #  np.savetxt('test_data/'+str(ii)+'.txt',test_fingerprints[ii], newline=' ', header=str(np.argmax(test_ground_truth[ii])))

    print(test_fingerprints)
Example #12
def main(_):
    # We want to see all the logging messages for this tutorial.
    tf.logging.set_verbosity(tf.logging.INFO)

    # Start a new TensorFlow session.
    sess = tf.InteractiveSession()

    # Begin by making sure we have the training data we need. If you already have
    # training data of your own, use `--data_url= ` on the command line to avoid
    # downloading.
    model_settings = models.prepare_model_settings(
        len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
        FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
        FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
    audio_processor = input_data.AudioProcessor(FLAGS.data_url, FLAGS.data_dir,
                                                FLAGS.silence_percentage,
                                                FLAGS.unknown_percentage,
                                                FLAGS.wanted_words.split(','),
                                                FLAGS.validation_percentage,
                                                FLAGS.testing_percentage,
                                                model_settings)
    fingerprint_size = model_settings['fingerprint_size']
    print(fingerprint_size)
Example #13
def get_dataset(num_of_samples):
    import input_data
    import models

    wanted_words = 'yes,no,up,down,left,right,on,off,stop,go'
    model_settings = models.prepare_model_settings(
        label_count=len(input_data.prepare_words_list(wanted_words.split(','))),
        sample_rate=16000,
        clip_duration_ms=1000,
        window_size_ms=40.0,
        window_stride_ms=20.0,
        dct_coefficient_count=10
      )
    audio_processor = input_data.AudioProcessor(
        data_url='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
        data_dir='/tmp/speech_dataset/',
        silence_percentage=10.0,
        unknown_percentage=10.0,
        wanted_words=wanted_words.split(','),
        validation_percentage=10,
        testing_percentage=10,
        model_settings=model_settings
        )

    print(audio_processor)
    set_size = audio_processor.set_size('testing')
    batch_size = num_of_samples
    sess = tf.InteractiveSession()

    tf.logging.info('set_size=%d', set_size)
    total_accuracy = 0
    total_conf_matrix = None
    data, label = audio_processor.get_data(
        batch_size, 0, model_settings, 0.0, 0.0, 0, 'testing', sess)
    
    return data, label
Example #14
def run_quant_inference(wanted_words, sample_rate, clip_duration_ms,
                        window_size_ms, window_stride_ms,
                        dct_coefficient_count, model_architecture,
                        model_size_info, act_max, data_url, data_dir,
                        silence_percentage, unknown_percentage, checkpoint,
                        batch_size, include_silence, lower_frequency_limit,
                        upper_frequency_limit, filterbank_channel_count,
                        is_bg_volume_constant, feature_extraction):
    """Creates an audio model with the nodes needed for inference.

    Uses the supplied arguments to create a model, and inserts the input and
    output nodes that are needed to use the graph for inference.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
      model_size_info: Model dimensions : different lengths for different models
    """
    tf.reset_default_graph()
    tf.logging.set_verbosity(tf.logging.INFO)
    sess = tf.InteractiveSession()
    words_list = input_data.prepare_words_list(wanted_words.split(','),
                                               silence_percentage != 0)
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count, lower_frequency_limit,
        upper_frequency_limit, filterbank_channel_count)

    audio_processor = input_data.AudioProcessor(data_url, data_dir,
                                                silence_percentage,
                                                unknown_percentage,
                                                wanted_words.split(','), 0,
                                                100, model_settings)

    label_count = model_settings['label_count']
    fingerprint_size = model_settings['fingerprint_size']

    fingerprint_input = tf.placeholder(tf.float32, [None, fingerprint_size],
                                       name='fingerprint_input')

    logits = models.create_model(fingerprint_input,
                                 model_settings,
                                 model_architecture,
                                 model_size_info,
                                 act_max,
                                 is_training=False)

    ground_truth_input = tf.placeholder(tf.float32, [None, label_count],
                                        name='groundtruth_input')

    predicted_indices = tf.argmax(logits, 1)
    expected_indices = tf.argmax(ground_truth_input, 1)
    correct_prediction = tf.equal(predicted_indices, expected_indices)
    confusion_matrix = tf.confusion_matrix(expected_indices,
                                           predicted_indices,
                                           num_classes=label_count)
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    models.load_variables_from_checkpoint(sess, checkpoint)

    for v in tf.trainable_variables():
        var_name = str(v.name)
        var_values = sess.run(v)
        min_value = var_values.min()
        max_value = var_values.max()
        int_bits = int(np.ceil(np.log2(max(abs(min_value), abs(max_value)))))
        dec_bits = 7 - int_bits
        # convert to [-128,128) or int8
        var_values = np.round(var_values * 2**dec_bits)
        var_values = var_values / (2**dec_bits)
        # update the weights in tensorflow graph for quantizing the activations
        var_values = sess.run(tf.assign(v, var_values))

    # test set
    set_size = audio_processor.set_size('testing')
    total_accuracy = 0
    total_conf_matrix = None
    for i in range(0, set_size, batch_size):
        test_fingerprints, test_ground_truth = audio_processor.get_data(
            batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess,
            is_bg_volume_constant, feature_extraction)
        test_accuracy, conf_matrix = sess.run(
            [evaluation_step, confusion_matrix],
            feed_dict={
                fingerprint_input: test_fingerprints,
                ground_truth_input: test_ground_truth,
            })
        batch_size = min(batch_size, set_size - i)
        total_accuracy += (test_accuracy * batch_size) / set_size
        if total_conf_matrix is None:
            total_conf_matrix = conf_matrix
        else:
            total_conf_matrix += conf_matrix

    tf.reset_default_graph()
    sess.close()
    return total_accuracy
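A worked numeric sketch of the fixed-point step used in the weight loop above: with a largest absolute weight of 2.7, int_bits = ceil(log2(2.7)) = 2, so dec_bits = 7 - 2 = 5 and the weights are rounded onto a grid of 1/32. The weight values here are made up purely for illustration:

import numpy as np

w = np.array([-1.3, 0.04, 2.7])
int_bits = int(np.ceil(np.log2(max(abs(w.min()), abs(w.max())))))  # 2
dec_bits = 7 - int_bits                                            # 5
w_q7 = np.round(w * 2 ** dec_bits)       # [-42., 1., 86.] -- fits in int8
w_dequant = w_q7 / (2 ** dec_bits)       # [-1.3125, 0.03125, 2.6875]
print(int_bits, dec_bits, w_q7, w_dequant)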
Example #15
def run_full_quant_inference(
        wanted_words, sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count, model_architecture,
        model_size_info, act_max, data_url, data_dir, silence_percentage,
        unknown_percentage, validation_percentage, testing_percentage,
        checkpoint, batch_size, lower_frequency_limit, upper_frequency_limit,
        filterbank_channel_count, is_bg_volume_constant, feature_extraction):
    """Creates an audio model with the nodes needed for inference.

    Uses the supplied arguments to create a model, and inserts the input and
    output nodes that are needed to use the graph for inference.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
      model_size_info: Model dimensions : different lengths for different models
    """

    tf.logging.set_verbosity(tf.logging.INFO)
    sess = tf.InteractiveSession()
    words_list = input_data.prepare_words_list(wanted_words.split(','),
                                               silence_percentage != 0)
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count, lower_frequency_limit,
        upper_frequency_limit, filterbank_channel_count)

    audio_processor = input_data.AudioProcessor(
        data_url, data_dir, silence_percentage, unknown_percentage,
        wanted_words.split(','), validation_percentage, testing_percentage,
        model_settings)

    label_count = model_settings['label_count']
    fingerprint_size = model_settings['fingerprint_size']

    fingerprint_input = tf.placeholder(tf.float32, [None, fingerprint_size],
                                       name='fingerprint_input')

    logits = models.create_model(fingerprint_input,
                                 model_settings,
                                 model_architecture,
                                 model_size_info,
                                 act_max,
                                 is_training=False)

    ground_truth_input = tf.placeholder(tf.float32, [None, label_count],
                                        name='groundtruth_input')

    predicted_indices = tf.argmax(logits, 1)
    expected_indices = tf.argmax(ground_truth_input, 1)
    correct_prediction = tf.equal(predicted_indices, expected_indices)
    confusion_matrix = tf.confusion_matrix(expected_indices,
                                           predicted_indices,
                                           num_classes=label_count)
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    models.load_variables_from_checkpoint(sess, checkpoint)

    num_layers = model_size_info[0]
    helper.write_ds_cnn_cpp_file('ds_cnn.cpp', num_layers)

    ds_cnn_h_fname = "ds_cnn.h"
    weights_h_fname = "ds_cnn_weights.h"

    f = open(ds_cnn_h_fname, 'wb')
    f.close()

    with open(ds_cnn_h_fname, 'a') as f:
        helper.write_ds_cnn_h_beginning(f, wanted_words, sample_rate,
                                        clip_duration_ms, window_size_ms,
                                        window_stride_ms,
                                        dct_coefficient_count, model_size_info,
                                        act_max)

    # Quantize weights to 8-bits using (min,max) and write to file
    f = open(weights_h_fname, 'wb')
    f.close()

    total_layers = len(act_max)
    layer_no = 1
    weights_dec_bits = 0
    for v in tf.trainable_variables():
        var_name = str(v.name)
        var_values = sess.run(v)
        min_value = var_values.min()
        max_value = var_values.max()
        int_bits = int(np.ceil(np.log2(max(abs(min_value), abs(max_value)))))
        dec_bits = 7 - int_bits
        # convert to [-128,128) or int8
        var_values = np.round(var_values * 2**dec_bits)
        var_name = var_name.replace('/', '_')
        var_name = var_name.replace(':', '_')

        if (len(var_values.shape) > 2):  # convolution layer weights
            transposed_wts = np.transpose(var_values, (3, 0, 1, 2))
        else:  # fully connected layer weights or biases of any layer
            transposed_wts = np.transpose(var_values)

        # convert back original range but quantized to 8-bits or 256 levels
        var_values = var_values / (2**dec_bits)
        # update the weights in tensorflow graph for quantizing the activations
        var_values = sess.run(tf.assign(v, var_values))
        print(var_name + ' number of wts/bias: ' + str(var_values.shape) + \
              ' dec bits: ' + str(dec_bits) + \
              ' max: (' + str(var_values.max()) + ',' + str(max_value) + ')' + \
              ' min: (' + str(var_values.min()) + ',' + str(min_value) + ')')

        conv_layer_no = layer_no // 2 + 1

        wt_or_bias = 'BIAS'
        if 'weights' in var_name:
            wt_or_bias = 'WT'

        with open(weights_h_fname, 'a') as f:
            if conv_layer_no == 1:
                f.write('#define CONV1_{} {{'.format(wt_or_bias))
            elif conv_layer_no <= num_layers:
                if layer_no % 2 == 0:
                    f.write('#define CONV{}_DS_{} {{'.format(
                        conv_layer_no, wt_or_bias))
                else:
                    f.write('#define CONV{}_PW_{} {{'.format(
                        conv_layer_no, wt_or_bias))
            else:
                f.write('#define FINAL_FC_{} {{'.format(wt_or_bias))

            transposed_wts.tofile(f, sep=", ", format="%d")
            f.write('}\n')

        if 'weights' in var_name:
            weights_dec_bits = dec_bits

        if 'biases' in var_name:
            if layer_no == total_layers - 2:  # if average pool layer, skip to the next one
                layer_no += 1
            input_dec_bits = 7 - np.log2(act_max[layer_no - 1])
            output_dec_bits = 7 - np.log2(act_max[layer_no])
            weights_x_input_dec_bits = input_dec_bits + weights_dec_bits
            bias_lshift = int(weights_x_input_dec_bits - dec_bits)
            output_rshift = int(weights_x_input_dec_bits - output_dec_bits)
            print(
                "Layer no: {} | Bias Lshift: {} | Output Rshift: {}\n".format(
                    layer_no, bias_lshift, output_rshift))
            with open('ds_cnn.h', 'a') as f:
                if conv_layer_no == 1:
                    f.write(
                        "#define CONV1_BIAS_LSHIFT {}\n".format(bias_lshift))
                    f.write(
                        "#define CONV1_OUT_RSHIFT {}\n".format(output_rshift))
                elif conv_layer_no <= num_layers:
                    if layer_no % 2 == 0:
                        f.write("#define CONV{}_DS_BIAS_LSHIFT {}\n".format(
                            conv_layer_no, bias_lshift))
                        f.write("#define CONV{}_DS_OUT_RSHIFT {}\n".format(
                            conv_layer_no, output_rshift))

                    else:
                        f.write("#define CONV{}_PW_BIAS_LSHIFT {}\n".format(
                            conv_layer_no, bias_lshift))
                        f.write("#define CONV{}_PW_OUT_RSHIFT {}\n".format(
                            conv_layer_no, output_rshift))
                else:
                    f.write("#define FINAL_FC_BIAS_LSHIFT {}\n".format(
                        bias_lshift))
                    f.write("#define FINAL_FC_OUT_RSHIFT {}\n".format(
                        output_rshift))

            layer_no += 1
    input_dec_bits = 7 - np.log2(act_max[len(act_max) - 3])
    output_dec_bits = 7 - np.log2(act_max[len(act_max) - 2])
    with open(ds_cnn_h_fname, 'a') as f:
        f.write("#define AVG_POOL_OUT_LSHIFT {}\n\n".format(
            int(output_dec_bits - input_dec_bits)))
        helper.write_ds_cnn_h_end(f, num_layers)

    # Evaluate result after quantization on testing set
    set_size = audio_processor.set_size('testing')
    tf.logging.info('set_size=%d', set_size)
    total_accuracy = 0
    total_conf_matrix = None

    for i in range(0, set_size, batch_size):
        test_fingerprints, test_ground_truth = audio_processor.get_data(
            batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess,
            is_bg_volume_constant, feature_extraction)
        test_accuracy, conf_matrix, predictions, true_labels = sess.run(
            [
                evaluation_step, confusion_matrix, predicted_indices,
                expected_indices
            ],
            feed_dict={
                fingerprint_input: test_fingerprints,
                ground_truth_input: test_ground_truth,
            })

        batch_size = min(batch_size, set_size - i)
        total_accuracy += (test_accuracy * batch_size) / set_size
        if total_conf_matrix is None:
            total_conf_matrix = conf_matrix
        else:
            total_conf_matrix += conf_matrix

    tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
    tf.logging.info('Test accuracy = %.2f%% (N=%d)' %
                    (total_accuracy * 100, set_size))
    sess.close()
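To make the bias_lshift/output_rshift bookkeeping above concrete, here is a small standalone check with invented values (act_max of 4.0 and 8.0 for the previous and current layers, weight and bias dec_bits of 5 and 6); it simply mirrors the arithmetic in the 'biases' branch:

import numpy as np

act_max_prev, act_max_curr = 4.0, 8.0    # hypothetical activation ranges
weights_dec_bits, bias_dec_bits = 5, 6   # hypothetical quantization results

input_dec_bits = 7 - np.log2(act_max_prev)                        # 5.0
output_dec_bits = 7 - np.log2(act_max_curr)                       # 4.0
weights_x_input_dec_bits = input_dec_bits + weights_dec_bits      # 10.0
bias_lshift = int(weights_x_input_dec_bits - bias_dec_bits)       # 4
output_rshift = int(weights_x_input_dec_bits - output_dec_bits)   # 6
print(bias_lshift, output_rshift)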
Example #16
def main():
    parser = create_parser()
    argcomplete.autocomplete(parser)
    args = parser.parse_args()
    print_outputs = args.print_outputs

    sess = tf.InteractiveSession()

    model_settings = models.prepare_model_settings(
        len(input_data.prepare_words_list(args.wanted_words.split(','))),
        args.sample_rate, args.clip_duration_ms, args.window_size_ms,
        args.window_stride_ms, args.dct_coefficient_count)
    # Build the audio processing graph + prepare data from dataset
    audio_processor = input_data.AudioProcessor(args.data_url, args.data_dir,
                                                args.silence_percentage,
                                                args.unknown_percentage,
                                                args.wanted_words.split(','),
                                                args.validation_percentage,
                                                args.testing_percentage,
                                                model_settings)

    label_count = model_settings['label_count']
    fingerprint_size = model_settings['fingerprint_size']
    fingerprint_input = tf.placeholder(tf.float32, [None, fingerprint_size],
                                       name='fingerprint_input')

    # Build the NN graph
    if print_outputs:
        logits, first_conv_val, first_weights, first_bias, second_weights, second_bias, third_weights, second_conv_val = models.create_model(
            fingerprint_input,
            model_settings,
            args.model_architecture,
            is_training=False,
            print_outputs=True)
    else:

        logits = models.create_model(fingerprint_input,
                                     model_settings,
                                     args.model_architecture,
                                     is_training=False,
                                     print_outputs=False)

    # load weights/biases from checkpoint
    models.load_variables_from_checkpoint(sess, args.start_checkpoint)

    # Define loss and optimizer
    ground_truth_input = tf.placeholder(tf.float32, [None, label_count],
                                        name='groundtruth_input')

    predicted_indices = tf.argmax(logits, 1)
    expected_indices = tf.argmax(ground_truth_input, 1)
    correct_prediction = tf.equal(predicted_indices, expected_indices)
    confusion_matrix = tf.confusion_matrix(expected_indices, predicted_indices)
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', evaluation_step)

    #generate test outputs
    set_size = audio_processor.set_size('testing')
    tf.logging.info('set_size=%d', set_size)

    batch_size = args.batch_size
    directory = args.directory
    test_fingerprints, test_ground_truth = audio_processor.get_data(
        batch_size, 0, model_settings, 0.0, 0.0, 0, 'testing', sess)

    # Run evaluation on a batch
    if print_outputs:
        outfc, outconv1, weights1, bias1, weights2, bias2, weights3, outconv2, test_accuracy, expected, predicted = sess.run(
            [
                logits, first_conv_val, first_weights, first_bias,
                second_weights, second_bias, third_weights, second_conv_val,
                evaluation_step, expected_indices, predicted_indices
            ],
            feed_dict={
                fingerprint_input: test_fingerprints,
                ground_truth_input: test_ground_truth
                #dropout_prob: 1.0
            })
    else:
        outfc, test_accuracy, expected, predicted = sess.run(
            [logits, evaluation_step, expected_indices, predicted_indices],
            feed_dict={
                fingerprint_input: test_fingerprints,
                ground_truth_input: test_ground_truth
                #dropout_prob: 1.0
            })

    print("expected/predicted")
    print(expected)
    print(predicted)

    # print image in a .h file, to be include
    for img_num in range(batch_size):
        # print(test_fingerprints[img_num] * 64)
        in_feat = np.reshape(test_fingerprints[img_num] * 64 + 0.5, (98 * 40))
        in_feat = np.floor(in_feat)
        in_feat_int = in_feat.astype(int)

        format = '%d'
        np.savetxt("./data/in_feat_{}_{}.txt".format(img_num,
                                                     expected[img_num]),
                   in_feat_int,
                   delimiter=", ",
                   newline=",\n",
                   fmt=format)

    if print_outputs:
        #outconv1_2D = np.reshape(outconv1,(batch_size*32,79*33))
        outconv1_2D = np.reshape(outconv1, (batch_size * 32, 39 * 16))
        first_weights_2D = np.reshape(weights1, (32, 8 * 20))
        weights2 = np.reshape(weights2, (10 * 4, 32, 32))
        second_weights_2D = np.reshape(weights2.transpose(), (32 * 32, 4 * 10))
        weights3 = np.reshape(weights3, (13 * 30, 32, 12))
        print("SHAPE WEIGHTS3")
        print(np.shape(weights3))
        third_weights_2D = np.reshape(weights3.transpose(), (12, 32 * 13 * 30))
        outconv2_2D = np.reshape(outconv2, (batch_size * 32, 30 * 13))
        print("SHAPE WEIGHTS2")
        print(np.shape(weights2))

        np.savetxt("./data/outFC.txt", outfc, delimiter=",")
        np.savetxt("./data/outConv1.txt", outconv1_2D, delimiter=",")
        np.savetxt("./data/weights1.txt", first_weights_2D, delimiter=",")
        np.savetxt("./data/bias1.txt", bias1, delimiter=",")
        np.savetxt("./data/weights2.txt",
                   second_weights_2D * 1024 * 32,
                   delimiter=",")
        np.savetxt("./data/bias2.txt", bias2, delimiter=",")
        np.savetxt("./data/outConv2.txt", outconv2_2D, delimiter=",")
        tf.logging.info('test accuracy = %.1f%% (N=%d)' %
                        (test_accuracy, batch_size))
        np.savetxt("./data/weights3.txt",
                   third_weights_2D * 1024 * 32,
                   delimiter=",\n")

    # Dump each feature map as a 40x98 PGM image with 16-bit pixels.

    strout = ''
    for i in range(batch_size):
        s_16b = np.floor(test_fingerprints[i] * 64 +
                         0.5)  # Q10.6 found in nntool
        s_8b = np.floor(test_fingerprints[i] / 2.40380199 +
                        0.5)  # Scale found in nntool
        #print(s_16b)
        test_fingerprints[i].tofile("./images/features_float_{}.dat".format(
            str(i)))
        with open(
                os.path.join(
                    directory,
                    "features_q16_{}_{}_{}.pgm".format(expected[i],
                                                       predicted[i], i)),
                'wb') as f:
            hdr = 'P5' + '\n' + str(40) + '  ' + str(98) + '  ' + str(
                65535) + '\n'
            f.write(hdr.encode())
            np.int16(s_16b).tofile(f)
        with open(
                os.path.join(
                    directory,
                    "features_q8_{}_{}_{}.pgm".format(expected[i],
                                                      predicted[i], i)),
                'wb') as f:
            hdr = 'P5' + '\n' + str(40) + '  ' + str(98) + '  ' + str(
                255) + '\n'
            f.write(hdr.encode())
            np.int8(s_8b).tofile(f)
        strout += 'Input:\t./images/features_float_{}.dat\tExpected:\t{}\tPredicted:\t{}\t({})\n'.format(
            str(i), expected[i], predicted[i], outfc[i])
    with open(os.path.join(directory, "output_expected_predicted.txt"),
              'w') as f:
        f.write(strout)

    print("finished: test accuracy = %.1f%%" % (test_accuracy * 100))
Example #17
def run_quant_inference(wanted_words, sample_rate, clip_duration_ms,
                           window_size_ms, window_stride_ms, dct_coefficient_count, 
                           model_architecture, model_size_info):
  """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    model_size_info: Model dimensions : different lengths for different models
  """
  
  tf.logging.set_verbosity(tf.logging.INFO)
  sess = tf.InteractiveSession()
  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, dct_coefficient_count)

  audio_processor = input_data.AudioProcessor(
      FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
      FLAGS.unknown_percentage,
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings)
  
  label_count = model_settings['label_count']
  fingerprint_size = model_settings['fingerprint_size']

  fingerprint_input = tf.placeholder(
      tf.float32, [None, fingerprint_size], name='fingerprint_input')

  logits = models.create_model(
      fingerprint_input,
      model_settings,
      FLAGS.model_architecture,
      FLAGS.model_size_info,
      FLAGS.act_max,
      is_training=False)

  ground_truth_input = tf.placeholder(
      tf.float32, [None, label_count], name='groundtruth_input')

  predicted_indices = tf.argmax(logits, 1)
  expected_indices = tf.argmax(ground_truth_input, 1)
  correct_prediction = tf.equal(predicted_indices, expected_indices)
  confusion_matrix = tf.confusion_matrix(
      expected_indices, predicted_indices, num_classes=label_count)
  evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
  models.load_variables_from_checkpoint(sess, FLAGS.checkpoint)


  # Quantize weights to 8-bits using (min,max) and write to file
  f = open('weights.h','wb')
  f.close()

  for v in tf.trainable_variables():
    var_name = str(v.name)
    var_values = sess.run(v)
    min_value = var_values.min()
    max_value = var_values.max()
    int_bits = int(np.ceil(np.log2(max(abs(min_value),abs(max_value)))))
    dec_bits = 7-int_bits
    # convert to [-128,128) or int8
    var_values = np.round(var_values*2**dec_bits)
    var_name = var_name.replace('/','_')
    var_name = var_name.replace(':','_')
    with open('weights.h','a') as f:
      f.write('#define '+var_name+' {')
    if(len(var_values.shape)>2): #convolution layer weights
      transposed_wts = np.transpose(var_values,(3,0,1,2))
    else: #fully connected layer weights or biases of any layer
      transposed_wts = np.transpose(var_values)
    with open('weights.h','a') as f:
      transposed_wts.tofile(f,sep=", ",format="%d")
      f.write('}\n')
    # convert back original range but quantized to 8-bits or 256 levels
    var_values = var_values/(2**dec_bits)
    # update the weights in tensorflow graph for quantizing the activations
    var_values = sess.run(tf.assign(v,var_values))
    print(var_name+' number of wts/bias: '+str(var_values.shape)+\
            ' dec bits: '+str(dec_bits)+\
            ' max: ('+str(var_values.max())+','+str(max_value)+')'+\
            ' min: ('+str(var_values.min())+','+str(min_value)+')')
  
  # training set
  set_size = audio_processor.set_size('training')
  tf.logging.info('set_size=%d', set_size)
  total_accuracy = 0
  total_conf_matrix = None
  for i in range(0, set_size, FLAGS.batch_size):
    training_fingerprints, training_ground_truth = (
        audio_processor.get_data(FLAGS.batch_size, i, model_settings, 0.0,
                                 0.0, 0, 'training', sess))
    training_accuracy, conf_matrix = sess.run(
        [evaluation_step, confusion_matrix],
        feed_dict={
            fingerprint_input: training_fingerprints,
            ground_truth_input: training_ground_truth,
        })
    batch_size = min(FLAGS.batch_size, set_size - i)
    total_accuracy += (training_accuracy * batch_size) / set_size
    if total_conf_matrix is None:
      total_conf_matrix = conf_matrix
    else:
      total_conf_matrix += conf_matrix
  tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
  tf.logging.info('Training accuracy = %.2f%% (N=%d)' %
                  (total_accuracy * 100, set_size))

  # validation set
  set_size = audio_processor.set_size('validation')
  tf.logging.info('set_size=%d', set_size)
  total_accuracy = 0
  total_conf_matrix = None
  for i in range(0, set_size, FLAGS.batch_size):
    validation_fingerprints, validation_ground_truth = (
        audio_processor.get_data(FLAGS.batch_size, i, model_settings, 0.0,
                                 0.0, 0, 'validation', sess))
    validation_accuracy, conf_matrix = sess.run(
        [evaluation_step, confusion_matrix],
        feed_dict={
            fingerprint_input: validation_fingerprints,
            ground_truth_input: validation_ground_truth,
        })
    batch_size = min(FLAGS.batch_size, set_size - i)
    total_accuracy += (validation_accuracy * batch_size) / set_size
    if total_conf_matrix is None:
      total_conf_matrix = conf_matrix
    else:
      total_conf_matrix += conf_matrix
  tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
  tf.logging.info('Validation accuracy = %.2f%% (N=%d)' %
                  (total_accuracy * 100, set_size))
  
  # test set
  set_size = audio_processor.set_size('testing')
  tf.logging.info('set_size=%d', set_size)
  total_accuracy = 0
  total_conf_matrix = None
  for i in range(0, set_size, FLAGS.batch_size):
    test_fingerprints, test_ground_truth = audio_processor.get_data(
        FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess)
    test_accuracy, conf_matrix = sess.run(
        [evaluation_step, confusion_matrix],
        feed_dict={
            fingerprint_input: test_fingerprints,
            ground_truth_input: test_ground_truth,
        })
    batch_size = min(FLAGS.batch_size, set_size - i)
    total_accuracy += (test_accuracy * batch_size) / set_size
    if total_conf_matrix is None:
      total_conf_matrix = conf_matrix
    else:
      total_conf_matrix += conf_matrix
  tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
  tf.logging.info('Test accuracy = %.2f%% (N=%d)' % (total_accuracy * 100,
                                                           set_size))
Example #18
def main(_):

    tf.logging.set_verbosity(tf.logging.INFO)
    sess = tf.InteractiveSession()
    words_list = input_data.prepare_words_list(FLAGS.wanted_words.split(','))
    model_settings = prepare_model_settings(len(words_list), FLAGS.sample_rate,
                                            FLAGS.clip_duration_ms,
                                            FLAGS.window_size_ms,
                                            FLAGS.window_stride_ms,
                                            FLAGS.dct_coefficient_count,
                                            FLAGS.preprocess,
                                            bool(FLAGS.use_power_spectrogram))
    print(model_settings)

    audio_processor = input_data.AudioProcessor(FLAGS.data_url, FLAGS.data_dir,
                                                FLAGS.silence_percentage,
                                                FLAGS.unknown_percentage,
                                                FLAGS.wanted_words.split(','),
                                                FLAGS.validation_percentage,
                                                FLAGS.testing_percentage,
                                                model_settings)

    interpreter = tf.lite.Interpreter(model_path=FLAGS.tflite_model)
    interpreter.allocate_tensors()

    # Get input and output tensors.
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    print('Input details: ', input_details)
    print('Output details: ', output_details)
    scale, zero_point = input_details[0]['quantization']

    # validation set
    set_size = audio_processor.set_size('validation')
    tf.logging.info('Validation set size:%d', set_size)
    total_accuracy = 0
    total_conf_matrix = None
    corrects = 0
    for i in range(0, set_size):
        validation_fingerprint, validation_ground_truth = audio_processor.get_data(
            1, i, model_settings, 0.0, 0, 0, 'validation', sess)
        if scale != 0.0:
            input_array = np.array(
                np.floor(validation_fingerprint / scale + 0.5) +
                zero_point).astype(np.uint8)
        else:
            input_array = np.array(validation_fingerprint).astype(np.float32)
        interpreter.set_tensor(input_details[0]['index'],
                               input_array.reshape(input_details[0]['shape']))
        interpreter.invoke()
        output = interpreter.get_tensor(output_details[0]['index'])
        predicted_class = np.argmax(output)
        gt_class = np.argmax(validation_ground_truth)
        corrects += 1 if predicted_class == gt_class else 0
        conf_matrix = sklearn.metrics.confusion_matrix(
            [words_list[gt_class]], [words_list[predicted_class]],
            labels=words_list)
        if total_conf_matrix is None:
            total_conf_matrix = conf_matrix
        else:
            total_conf_matrix += conf_matrix
        if not (i % 100) and i > 0:
            print("Pred/Tot: {}/{} Accuracy: {}%".format(
                corrects, i, corrects / i * 100))
    print("Pred/Tot: {}/{} Accuracy: {}%\n".format(corrects, i,
                                                   corrects / i * 100))
    print("Confusion matrix:\n{}".format(total_conf_matrix))

    # test set
    set_size = audio_processor.set_size('testing')
    tf.logging.info('Test set size:%d', set_size)
    total_accuracy = 0
    total_conf_matrix = None
    corrects = 0
    for i in range(0, set_size):
        testing_fingerprint, testing_ground_truth = audio_processor.get_data(
            1, i, model_settings, 0.0, 0, 0, 'testing', sess)
        if scale != 0.0:
            input_array = np.array(
                np.floor(testing_fingerprint / scale + 0.5) +
                zero_point).astype(np.uint8)
        else:
            input_array = np.array(testing_fingerprint).astype(np.float32)
        interpreter.set_tensor(input_details[0]['index'],
                               input_array.reshape(input_details[0]['shape']))
        interpreter.invoke()
        output = interpreter.get_tensor(output_details[0]['index'])
        predicted_class = np.argmax(output)
        gt_class = np.argmax(testing_ground_truth)
        corrects += 1 if predicted_class == gt_class else 0
        conf_matrix = sklearn.metrics.confusion_matrix(
            [words_list[gt_class]], [words_list[predicted_class]],
            labels=words_list)
        if total_conf_matrix is None:
            total_conf_matrix = conf_matrix
        else:
            total_conf_matrix += conf_matrix
        if not (i % 100) and i > 0:
            print("Pred/Tot: {}/{} Accuracy: {}%".format(
                corrects, i, corrects / i * 100))
    print("Pred/Tot: {}/{} Accuracy: {}%\n".format(corrects, i,
                                                   corrects / i * 100))
    print("Confusion matrix:\n{}".format(total_conf_matrix))
Example #19
def main(_):
  words_list = input_data.prepare_words_list(FLAGS.wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), FLAGS.sample_rate, FLAGS.clip_duration_ms,
      FLAGS.window_size_ms, FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
  audio_processor = input_data.AudioProcessor(
      '', FLAGS.data_dir, FLAGS.silence_percentage, 10,
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings)

  output_audio_sample_count = FLAGS.sample_rate * FLAGS.test_duration_seconds
  output_audio = np.zeros((output_audio_sample_count,), dtype=np.float32)

  # Set up background audio.
  background_crossover_ms = 500
  background_segment_duration_ms = (
      FLAGS.clip_duration_ms + background_crossover_ms)
  background_segment_duration_samples = int(
      (background_segment_duration_ms * FLAGS.sample_rate) / 1000)
  background_segment_stride_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  background_ramp_samples = int(
      ((background_crossover_ms / 2) * FLAGS.sample_rate) / 1000)

  # Mix the background audio into the main track.
  how_many_backgrounds = int(
      math.ceil(output_audio_sample_count / background_segment_stride_samples))
  for i in range(how_many_backgrounds):
    output_offset = int(i * background_segment_stride_samples)
    background_index = np.random.randint(len(audio_processor.background_data))
    background_samples = audio_processor.background_data[background_index]
    background_offset = np.random.randint(
        0, len(background_samples) - model_settings['desired_samples'])
    background_volume = np.random.uniform(0, FLAGS.background_volume)
    mix_in_audio_sample(output_audio, output_offset, background_samples,
                        background_offset, background_segment_duration_samples,
                        background_volume, background_ramp_samples,
                        background_ramp_samples)

  # Mix the words into the main track, noting their labels and positions.
  output_labels = []
  word_stride_ms = FLAGS.clip_duration_ms + FLAGS.word_gap_ms
  word_stride_samples = int((word_stride_ms * FLAGS.sample_rate) / 1000)
  clip_duration_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  word_gap_samples = int((FLAGS.word_gap_ms * FLAGS.sample_rate) / 1000)
  how_many_words = int(
      math.floor(output_audio_sample_count / word_stride_samples))
  all_test_data, all_test_labels = audio_processor.get_unprocessed_data(
      -1, model_settings, 'testing')
  for i in range(how_many_words):
    output_offset = (
        int(i * word_stride_samples) + np.random.randint(word_gap_samples))
    output_offset_ms = (output_offset * 1000) / FLAGS.sample_rate
    is_unknown = np.random.randint(100) < FLAGS.unknown_percentage
    if is_unknown:
      wanted_label = input_data.UNKNOWN_WORD_LABEL
    else:
      wanted_label = words_list[2 + np.random.randint(len(words_list) - 2)]
    test_data_start = np.random.randint(len(all_test_data))
    found_sample_data = None
    index_lookup = np.arange(len(all_test_data), dtype=np.int32)
    np.random.shuffle(index_lookup)
    for test_data_offset in range(len(all_test_data)):
      test_data_index = index_lookup[(
          test_data_start + test_data_offset) % len(all_test_data)]
      current_label = all_test_labels[test_data_index]
      if current_label == wanted_label:
        found_sample_data = all_test_data[test_data_index]
        break
    mix_in_audio_sample(output_audio, output_offset, found_sample_data, 0,
                        clip_duration_samples, 1.0, 500, 500)
    output_labels.append({'label': wanted_label, 'time': output_offset_ms})

  input_data.save_wav_file(FLAGS.output_audio_file, output_audio,
                           FLAGS.sample_rate)
  tf.logging.info('Saved streaming test wav to %s', FLAGS.output_audio_file)

  with open(FLAGS.output_labels_file, 'w') as f:
    for output_label in output_labels:
      f.write('%s, %f\n' % (output_label['label'], output_label['time']))
  tf.logging.info('Saved streaming test labels to %s', FLAGS.output_labels_file)
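The mix_in_audio_sample helper called above is not shown in this listing. As a rough sketch only (assuming a linear fade-in/fade-out envelope; the actual helper may differ), it adds a scaled clip into the output track like this:

# Illustrative sketch of a crossfading mixer, not the exact implementation of
# the mix_in_audio_sample used above (assumption: linear ramps).
def mix_in_audio_sample_sketch(track, track_offset, sample, sample_offset,
                               clip_duration, volume, ramp_in, ramp_out):
  for i in range(clip_duration):
    if track_offset + i >= len(track) or sample_offset + i >= len(sample):
      break
    if i < ramp_in:
      envelope = float(i) / ramp_in                    # fade in
    elif i > clip_duration - ramp_out:
      envelope = float(clip_duration - i) / ramp_out   # fade out
    else:
      envelope = 1.0
    track[track_offset + i] += sample[sample_offset + i] * volume * envelope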
Example #20
0
def main(_):
  # We want to see all the logging messages for this tutorial.
  tf.logging.set_verbosity(tf.logging.INFO)

  # Start a new TensorFlow session.
  sess = tf.InteractiveSession()

  # Begin by making sure we have the training data we need. If you already have
  # training data of your own, use `--data_url= ` on the command line to avoid
  # downloading.
  model_settings = models.prepare_model_settings(
      len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
      FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
      FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
  audio_processor = input_data.AudioProcessor(
      FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
      FLAGS.unknown_percentage,
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings)
  fingerprint_size = model_settings['fingerprint_size']
  label_count = model_settings['label_count']
  time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)
  # Figure out the learning rates for each training phase. Since it's often
  # effective to have high learning rates at the start of training, followed by
  # lower levels towards the end, the number of steps and learning rates can be
  # specified as comma-separated lists to define the rate at each stage. For
  # example --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
  # will run 13,000 training loops in total, with a rate of 0.001 for the first
  # 10,000, and 0.0001 for the final 3,000.
  training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
  learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
  if len(training_steps_list) != len(learning_rates_list):
    raise Exception(
        '--how_many_training_steps and --learning_rate must be equal length '
        'lists, but are %d and %d long instead' % (len(training_steps_list),
                                                   len(learning_rates_list)))

  fingerprint_input = tf.placeholder(
      tf.float32, [None, fingerprint_size], name='fingerprint_input')

  logits, dropout_prob = models.create_model(
      fingerprint_input,
      model_settings,
      FLAGS.model_architecture,
      is_training=True)

  # Define loss and optimizer
  ground_truth_input = tf.placeholder(
      tf.float32, [None, label_count], name='groundtruth_input')

  # Optionally we can add runtime checks to spot when NaNs or other symptoms of
  # numerical errors start occurring during training.
  control_dependencies = []
  if FLAGS.check_nans:
    checks = tf.add_check_numerics_ops()
    control_dependencies = [checks]

  # Create the back propagation and training evaluation machinery in the graph.
  with tf.name_scope('cross_entropy'):
    cross_entropy_mean = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            labels=ground_truth_input, logits=logits))
  tf.summary.scalar('cross_entropy', cross_entropy_mean)
  with tf.name_scope('train'), tf.control_dependencies(control_dependencies):
    learning_rate_input = tf.placeholder(
        tf.float32, [], name='learning_rate_input')
    train_step = tf.train.GradientDescentOptimizer(
        learning_rate_input).minimize(cross_entropy_mean)
  predicted_indices = tf.argmax(logits, 1)
  expected_indices = tf.argmax(ground_truth_input, 1)
  correct_prediction = tf.equal(predicted_indices, expected_indices)
  confusion_matrix = tf.confusion_matrix(
      expected_indices, predicted_indices, num_classes=label_count)
  evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
  tf.summary.scalar('accuracy', evaluation_step)

  global_step = tf.train.get_or_create_global_step()
  increment_global_step = tf.assign(global_step, global_step + 1)

  saver = tf.train.Saver(tf.global_variables())

  # Merge all the summaries and write them out to /tmp/retrain_logs (by default)
  merged_summaries = tf.summary.merge_all()
  train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                       sess.graph)
  validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/validation')

  tf.global_variables_initializer().run()

  start_step = 1

  if FLAGS.start_checkpoint:
    models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
    start_step = global_step.eval(session=sess)

  tf.logging.info('Training from step: %d ', start_step)

  # Save graph.pbtxt.
  tf.train.write_graph(sess.graph_def, FLAGS.train_dir,
                       FLAGS.model_architecture + '.pbtxt')

  # Save list of words.
  with gfile.GFile(
      os.path.join(FLAGS.train_dir, FLAGS.model_architecture + '_labels.txt'),
      'w') as f:
    f.write('\n'.join(audio_processor.words_list))

  # Training loop.
  training_steps_max = np.sum(training_steps_list)
  for training_step in xrange(start_step, training_steps_max + 1):
    # Figure out what the current learning rate is.
    training_steps_sum = 0
    for i in range(len(training_steps_list)):
      training_steps_sum += training_steps_list[i]
      if training_step <= training_steps_sum:
        learning_rate_value = learning_rates_list[i]
        break
    # Pull the audio samples we'll use for training.
    train_fingerprints, train_ground_truth = audio_processor.get_data(
        FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency,
        FLAGS.background_volume, time_shift_samples, 'training', sess)
    # Run the graph with this batch of training data.
    train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
        [
            merged_summaries, evaluation_step, cross_entropy_mean, train_step,
            increment_global_step
        ],
        feed_dict={
            fingerprint_input: train_fingerprints,
            ground_truth_input: train_ground_truth,
            learning_rate_input: learning_rate_value,
            dropout_prob: 0.5
        })
    train_writer.add_summary(train_summary, training_step)
    tf.logging.info('Step #%d: rate %f, accuracy %.1f%%, cross entropy %f' %
                    (training_step, learning_rate_value, train_accuracy * 100,
                     cross_entropy_value))
    is_last_step = (training_step == training_steps_max)
    if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step:
      set_size = audio_processor.set_size('validation')
      total_accuracy = 0
      total_conf_matrix = None
      for i in xrange(0, set_size, FLAGS.batch_size):
        validation_fingerprints, validation_ground_truth = (
            audio_processor.get_data(FLAGS.batch_size, i, model_settings, 0.0,
                                     0.0, 0, 'validation', sess))
        # Run a validation step and capture training summaries for TensorBoard
        # with the `merged` op.
        validation_summary, validation_accuracy, conf_matrix = sess.run(
            [merged_summaries, evaluation_step, confusion_matrix],
            feed_dict={
                fingerprint_input: validation_fingerprints,
                ground_truth_input: validation_ground_truth,
                dropout_prob: 1.0
            })
        validation_writer.add_summary(validation_summary, training_step)
        batch_size = min(FLAGS.batch_size, set_size - i)
        total_accuracy += (validation_accuracy * batch_size) / set_size
        if total_conf_matrix is None:
          total_conf_matrix = conf_matrix
        else:
          total_conf_matrix += conf_matrix
      tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
      tf.logging.info('Step %d: Validation accuracy = %.1f%% (N=%d)' %
                      (training_step, total_accuracy * 100, set_size))

    # Save the model checkpoint periodically.
    if (training_step % FLAGS.save_step_interval == 0 or
        training_step == training_steps_max):
      checkpoint_path = os.path.join(FLAGS.train_dir,
                                     FLAGS.model_architecture + '.ckpt')
      tf.logging.info('Saving to "%s-%d"', checkpoint_path, training_step)
      saver.save(sess, checkpoint_path, global_step=training_step)

  set_size = audio_processor.set_size('testing')
  tf.logging.info('set_size=%d', set_size)
  total_accuracy = 0
  total_conf_matrix = None
  for i in xrange(0, set_size, FLAGS.batch_size):
    test_fingerprints, test_ground_truth = audio_processor.get_data(
        FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess)
    test_accuracy, conf_matrix = sess.run(
        [evaluation_step, confusion_matrix],
        feed_dict={
            fingerprint_input: test_fingerprints,
            ground_truth_input: test_ground_truth,
            dropout_prob: 1.0
        })
    batch_size = min(FLAGS.batch_size, set_size - i)
    total_accuracy += (test_accuracy * batch_size) / set_size
    if total_conf_matrix is None:
      total_conf_matrix = conf_matrix
    else:
      total_conf_matrix += conf_matrix
  tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
  tf.logging.info('Final test accuracy = %.1f%% (N=%d)' % (total_accuracy * 100,
                                                           set_size))
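The staged learning-rate comment above (comma-separated step counts and rates) can be illustrated in isolation. A minimal standalone sketch with example values, not the flags used in this script:

# Standalone illustration of the staged learning-rate schedule described above
# (example values; not taken from the FLAGS defaults).
training_steps_list = [10000, 3000]      # --how_many_training_steps=10000,3000
learning_rates_list = [0.001, 0.0001]    # --learning_rate=0.001,0.0001

def rate_for_step(step):
  steps_sum = 0
  for steps, rate in zip(training_steps_list, learning_rates_list):
    steps_sum += steps
    if step <= steps_sum:
      return rate
  return learning_rates_list[-1]

print(rate_for_step(500))    # 0.001
print(rate_for_step(12000))  # 0.0001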
Example #21
0
def main(_):
  # We want to see all the logging messages for this tutorial.
  tf.logging.set_verbosity(tf.logging.INFO)

  # Start a new TensorFlow session.
  sess = tf.InteractiveSession()

  # Begin by making sure we have the training data we need. If you already have
  # training data of your own, use `--data_url= ` on the command line to avoid
  # downloading.
  model_settings = models.prepare_model_settings(
      len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
      FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
      FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
  audio_processor = input_data.AudioProcessor(
      FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
      FLAGS.unknown_percentage,
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings)
  fingerprint_size = model_settings['fingerprint_size']
  label_count = model_settings['label_count']
  time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)

  # Debug: print the feature-extraction settings being used.
  print(FLAGS.sample_rate)
  print(FLAGS.clip_duration_ms)
  print(FLAGS.window_size_ms)
  print(FLAGS.window_stride_ms)
  print(FLAGS.dct_coefficient_count)

  # Get a set of decoded audio waves (in PCM format) from the dataset.
  train_fingerprints_unproc, train_ground_truth_unproc = (
      audio_processor.get_unprocessed_data(2, model_settings, 'training'))


  print(train_fingerprints_unproc[1:2,:])
#  f = open("wave.txt","w")
#  np.savetxt("wave.txt",train_fingerprints_unproc[1], delimiter=",")
#  f.close()
  
#  return

  # Figure out the learning rates for each training phase. Since it's often
  # effective to have high learning rates at the start of training, followed by
  # lower levels towards the end, the number of steps and learning rates can be
  # specified as comma-separated lists to define the rate at each stage. For
  # example --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
  # will run 13,000 training loops in total, with a rate of 0.001 for the first
  # 10,000, and 0.0001 for the final 3,000.
  training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
  learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
  if len(training_steps_list) != len(learning_rates_list):
    raise Exception(
        '--how_many_training_steps and --learning_rate must be equal length '
        'lists, but are %d and %d long instead' % (len(training_steps_list),
                                                   len(learning_rates_list)))

  fingerprint_input = tf.placeholder(
      tf.float32, [None, fingerprint_size], name='fingerprint_input')

  (logits, dropout_prob, max_pool_value, first_conv_val, second_conv_val,
   first_bias_val, first_weights_val, second_bias_val, second_weights_val,
   final_fc_bias_val, final_fc_weights_val) = models.create_model(
       fingerprint_input,
       model_settings,
       FLAGS.model_architecture,
       is_training=True)

  # Define loss and optimizer
  ground_truth_input = tf.placeholder(
      tf.float32, [None, label_count], name='groundtruth_input')

  # Optionally we can add runtime checks to spot when NaNs or other symptoms of
  # numerical errors start occurring during training.
  control_dependencies = []
  if FLAGS.check_nans:
    checks = tf.add_check_numerics_ops()
    control_dependencies = [checks]

  # Create the back propagation and training evaluation machinery in the graph.
  with tf.name_scope('cross_entropy'):
    cross_entropy_mean = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            labels=ground_truth_input, logits=logits))
  tf.summary.scalar('cross_entropy', cross_entropy_mean)
  with tf.name_scope('train'), tf.control_dependencies(control_dependencies):
    learning_rate_input = tf.placeholder(
        tf.float32, [], name='learning_rate_input')
    train_step = tf.train.GradientDescentOptimizer(
        learning_rate_input).minimize(cross_entropy_mean)
  predicted_indices = tf.argmax(logits, 1)
  expected_indices = tf.argmax(ground_truth_input, 1)
  correct_prediction = tf.equal(predicted_indices, expected_indices)
  confusion_matrix = tf.confusion_matrix(expected_indices, predicted_indices)
  evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
  tf.summary.scalar('accuracy', evaluation_step)

  global_step = tf.contrib.framework.get_or_create_global_step()
  increment_global_step = tf.assign(global_step, global_step + 1)

  saver = tf.train.Saver(tf.global_variables())

  # Merge all the summaries and write them out to /tmp/retrain_logs (by default)
  merged_summaries = tf.summary.merge_all()
  train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                       sess.graph)
  validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/validation')

  tf.global_variables_initializer().run()

  start_step = 1

  if FLAGS.start_checkpoint:
    models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
    start_step = global_step.eval(session=sess)

  tf.logging.info('Training from step: %d ', start_step)

  # Save graph.pbtxt.
  tf.train.write_graph(sess.graph.as_graph_def(add_shapes=True), FLAGS.train_dir,
                       FLAGS.model_architecture + '.pbtxt')

  # Save list of words.
  with gfile.GFile(
      os.path.join(FLAGS.train_dir, FLAGS.model_architecture + '_labels.txt'),
      'w') as f:
    f.write('\n'.join(audio_processor.words_list))
  # Initialize the maxima of the conv2d output tensors to zero.
  max2_conv1 = 0
  max2_conv2 = 0
  maxF = open("max.log", "w")
  # Training loop.
  training_steps_max = np.sum(training_steps_list)

  # Bypass training when only generating the layer outputs.
  if FLAGS.save_layers:
    training_steps_max = -1

  for training_step in xrange(start_step, training_steps_max + 1):
    # Figure out what the current learning rate is.
    training_steps_sum = 0
    for i in range(len(training_steps_list)):
      training_steps_sum += training_steps_list[i]
      if training_step <= training_steps_sum:
        learning_rate_value = learning_rates_list[i]
        break
    # Pull the audio samples we'll use for training.
    train_fingerprints, train_ground_truth = audio_processor.get_data(
        FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency,
        FLAGS.background_volume, time_shift_samples, 'training', sess)
    # Run the graph with this batch of training data.
    (train_summary, train_accuracy, cross_entropy_value, maxpool_summary,
     first_conv_max, second_conv_max, first_bias_max, first_weights_max,
     second_bias_max, second_weights_max, final_fc_bias_max,
     final_fc_weights_max, _, _) = sess.run(
        [
            merged_summaries, evaluation_step, cross_entropy_mean,
            max_pool_value, first_conv_val, second_conv_val, first_bias_val,
            first_weights_val, second_bias_val, second_weights_val,
            final_fc_bias_val, final_fc_weights_val, train_step,
            increment_global_step
        ],
        feed_dict={
            fingerprint_input: train_fingerprints,
            ground_truth_input: train_ground_truth,
            learning_rate_input: learning_rate_value,
            dropout_prob: 0.5
        })
    #    if (training_step == start_step):
    #      for i in range(0,39):
    #        print ("********** printing file " + "maxpool" + str(i) + ".txt")
    #       np.savetxt("maxpool" + str(i) + ".txt",maxpool_summary[0,i,:,:])
    #     print("*********************")
    # Just keep the running max of max_conv1 and max_conv2.
    max2_conv1 = max(max2_conv1, first_conv_max)
    max2_conv2 = max(max2_conv2, second_conv_max)
    train_writer.add_summary(train_summary, training_step)
    tf.logging.info('Step #%d: rate %f, accuracy %.1f%%, cross entropy %f' %
                    (training_step, learning_rate_value, train_accuracy * 100,
                     cross_entropy_value))
    is_last_step = (training_step == training_steps_max)
    if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step:
      set_size = audio_processor.set_size('validation')
      total_accuracy = 0
      total_conf_matrix = None
      for i in xrange(0, set_size, FLAGS.batch_size):
        validation_fingerprints, validation_ground_truth = (
            audio_processor.get_data(FLAGS.batch_size, i, model_settings, 0.0,
                                     0.0, 0, 'validation', sess))
        # Run a validation step and capture training summaries for TensorBoard
        # with the `merged` op.
        validation_summary, validation_accuracy, conf_matrix = sess.run(
            [merged_summaries, evaluation_step, confusion_matrix],
            feed_dict={
                fingerprint_input: validation_fingerprints,
                ground_truth_input: validation_ground_truth,
                dropout_prob: 1.0
            })
        validation_writer.add_summary(validation_summary, training_step)
        batch_size = min(FLAGS.batch_size, set_size - i)
        total_accuracy += (validation_accuracy * batch_size) / set_size
        if total_conf_matrix is None:
          total_conf_matrix = conf_matrix
        else:
          total_conf_matrix += conf_matrix
      tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
      tf.logging.info('Step %d: Validation accuracy = %.1f%% (N=%d)' %
                      (training_step, total_accuracy * 100, set_size))

    # Save the model checkpoint periodically.
    if (training_step % FLAGS.save_step_interval == 0 or
        training_step == training_steps_max):
      checkpoint_path = os.path.join(FLAGS.train_dir,
                                     FLAGS.model_architecture + '.ckpt')
      tf.logging.info('Saving to "%s-%d"', checkpoint_path, training_step)
      saver.save(sess, checkpoint_path, global_step=training_step)
    # Track the bias & weight tensor maxima so they can be written to a file
    # on the last training step.
    max_bias1 = first_bias_max
    max_weights1 = first_weights_max
    max_bias2 = second_bias_max
    max_weights2 = second_weights_max
    max_fc_bias = final_fc_bias_max
    max_fc_weights = final_fc_weights_max
    if (training_step == training_steps_max):
      maxF.write(str(max_bias1) + " \n")
      maxF.write(str(max_weights1) + " \n")
      maxF.write(str(max_bias2) + " \n")
      maxF.write(str(max_weights2) + " \n")
      maxF.write(str(max_fc_bias) + " \n")
      maxF.write(str(max_fc_weights) + " \n")

  # End of training loop.

  # Now save the maxima of the conv2d output tensors and close the file.
  maxF.write(str(max2_conv1) + " \n")
  maxF.write(str(max2_conv2) + " \n")
  maxF.close()

  set_size = audio_processor.set_size('testing')
  tf.logging.info('set_size=%d', set_size)
  total_accuracy = 0
  total_conf_matrix = None
  if FLAGS.save_layers:
    set_size = 1

  print("set_size", set_size)
  for i in xrange(0, set_size, FLAGS.batch_size):
    test_fingerprints, test_ground_truth = audio_processor.get_data(
        FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess, FLAGS.save_layers)
    outfc, test_accuracy, conf_matrix = sess.run(
        [logits, evaluation_step, confusion_matrix],
        feed_dict={
            fingerprint_input: test_fingerprints,
            ground_truth_input: test_ground_truth,
            dropout_prob: 1.0
        })
    batch_size = min(FLAGS.batch_size, set_size - i)
    total_accuracy += (test_accuracy * batch_size) / set_size

    if FLAGS.save_layers:
      np.savetxt(os.path.join("./data", "outFC_{}.txt".format(i)), outfc,
                 delimiter=",")

    if total_conf_matrix is None:
      total_conf_matrix = conf_matrix
    else:
      total_conf_matrix += conf_matrix

  tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
  tf.logging.info('Final test accuracy = %.1f%% (N=%d)' % (total_accuracy * 100,
                                                           set_size))
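When --save_layers is set, the script above dumps the final fully-connected outputs to ./data/outFC_<i>.txt. A small read-back illustration (assuming the comma-separated layout written by the np.savetxt call above):

# Illustrative read-back of one dumped batch of logits (assumption: the file
# was produced by the np.savetxt call above, one row per test example).
import numpy as np

logits = np.loadtxt("./data/outFC_0.txt", delimiter=",")
predicted = np.argmax(np.atleast_2d(logits), axis=1)
print(predicted)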
Example #22
0
def main(_):

    # We want to see all the logging messages for this tutorial.
    tf.logging.set_verbosity(tf.logging.INFO)

    # Start a new TensorFlow session.
    sess = tf.InteractiveSession()

    model_settings = models.prepare_model_settings(
        len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
        FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
        FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)

    audio_processor = input_data.AudioProcessor(FLAGS.data_url, FLAGS.data_dir,
                                                FLAGS.silence_percentage,
                                                FLAGS.unknown_percentage,
                                                FLAGS.wanted_words.split(','),
                                                FLAGS.validation_percentage,
                                                FLAGS.testing_percentage,
                                                model_settings)

    fingerprint_size = model_settings['fingerprint_size']
    label_count = model_settings['label_count']
    time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)

    training_steps = FLAGS.how_many_training_steps
    learning_rate = FLAGS.learning_rate

    # -----------------------------------------------------------------------
    # -----------------------------Placeholder-------------------------------
    # -----------------------------------------------------------------------

    fingerprint_input = tf.placeholder(tf.float32, [None, fingerprint_size],
                                       name='fingerprint_input')

    logits, dropout_prob, w_conv1, w_conv2 = models.create_model(
        fingerprint_input,
        model_settings,
        FLAGS.model_architecture,
        is_training=True)

    # Define loss and optimizer
    ground_truth_input = tf.placeholder(tf.int64, [None],
                                        name='groundtruth_input')

    # Optionally we can add runtime checks to spot when NaNs or other symptoms of
    # numerical errors start occurring during training.
    control_dependencies = []
    if FLAGS.check_nans:
        checks = tf.add_check_numerics_ops()
        control_dependencies = [checks]

    # -----------------------------------------------------------------------
    # -----------------Back propagation and training evaluation--------------
    # -----------------------------------------------------------------------

    reg_constant = 0.01

    # Create the back propagation and training evaluation machinery in the graph.
    with tf.name_scope('cross_entropy'):
        # L2 regularization on the convolutional weight tensors.
        l2_reg = tf.reduce_sum(
            [tf.nn.l2_loss(w_conv1),
             tf.nn.l2_loss(w_conv2)])
        cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(
            labels=ground_truth_input, logits=logits)
        loss = cross_entropy_mean + reg_constant * l2_reg

    tf.summary.scalar('cross_entropy', cross_entropy_mean)

    with tf.name_scope('train'), tf.control_dependencies(control_dependencies):
        # Adam optimizer minimizing the regularized loss.
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    predicted_indices = tf.argmax(logits, 1)
    correct_prediction = tf.equal(predicted_indices, ground_truth_input)
    confusion_matrix = tf.confusion_matrix(ground_truth_input,
                                           predicted_indices,
                                           num_classes=label_count)
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', evaluation_step)

    global_step = tf.train.get_or_create_global_step()
    increment_global_step = tf.assign(global_step, global_step + 1)

    saver = tf.train.Saver(tf.global_variables())

    # Merge all the summaries and write them out to /tmp/retrain_logs (by default)
    merged_summaries = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                         sess.graph)
    validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir +
                                              '/validation')

    tf.global_variables_initializer().run()

    start_step = 1

    if FLAGS.start_checkpoint:
        models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
        start_step = global_step.eval(session=sess)

    tf.logging.info('Training from step: %d ', start_step)

    # Save graph.pbtxt.
    tf.train.write_graph(sess.graph_def, FLAGS.train_dir,
                         FLAGS.model_architecture + '.pbtxt')

    # Save list of words.
    with gfile.GFile(
            os.path.join(FLAGS.train_dir,
                         FLAGS.model_architecture + '_labels.txt'), 'w') as f:
        f.write('\n'.join(audio_processor.words_list))

    # -----------------------------------------------------------------------
    # -----------------Training and validation-------------------------------
    # -----------------------------------------------------------------------

    # Training loop.
    training_steps_max = training_steps

    # Print the local time of beginning training
    beg_time = datetime.datetime.now()
    print("Beginning time : " + str(beg_time))

    for training_step in xrange(start_step, training_steps_max + 1):

        # Pull the audio samples we'll use for training.
        train_fingerprints, train_ground_truth = audio_processor.get_data(
            FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency,
            FLAGS.background_volume, time_shift_samples, 'training', sess)

        # Run the graph with this batch of training data.
        train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
            [
                merged_summaries, evaluation_step, cross_entropy_mean,
                train_step, increment_global_step
            ],
            feed_dict={
                fingerprint_input: train_fingerprints,
                ground_truth_input: train_ground_truth,
                dropout_prob: 0.5
            })

        train_writer.add_summary(train_summary, training_step)
        tf.logging.info(
            'Step #%d: rate %f, accuracy %.1f%%, cross entropy %f' %
            (training_step, learning_rate, train_accuracy * 100,
             cross_entropy_value))
        is_last_step = (training_step == training_steps_max)

        # Validation
        if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step:

            set_size = audio_processor.set_size('validation')
            total_accuracy = 0
            total_conf_matrix = None

            for i in xrange(0, set_size, FLAGS.batch_size):

                validation_fingerprints, validation_ground_truth = (
                    audio_processor.get_data(FLAGS.batch_size, i,
                                             model_settings, 0.0, 0.0, 0,
                                             'validation', sess))

                # Run a validation step and capture training summaries for TensorBoard
                # with the `merged` op.
                validation_summary, validation_accuracy, conf_matrix = sess.run(
                    [merged_summaries, evaluation_step, confusion_matrix],
                    feed_dict={
                        fingerprint_input: validation_fingerprints,
                        ground_truth_input: validation_ground_truth,
                        dropout_prob: 1.0
                    })

                validation_writer.add_summary(validation_summary,
                                              training_step)
                batch_size = min(FLAGS.batch_size, set_size - i)
                total_accuracy += (validation_accuracy * batch_size) / set_size

                if total_conf_matrix is None:
                    total_conf_matrix = conf_matrix
                else:
                    total_conf_matrix += conf_matrix

            tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
            tf.logging.info('Step %d: Validation accuracy = %.1f%% (N=%d)' %
                            (training_step, total_accuracy * 100, set_size))

        # Save the model checkpoint periodically.
        if (training_step % FLAGS.save_step_interval == 0
                or training_step == training_steps_max):
            checkpoint_path = os.path.join(FLAGS.train_dir,
                                           FLAGS.model_architecture + '.ckpt')

            tf.logging.info('Saving to "%s-%d"', checkpoint_path,
                            training_step)
            saver.save(sess, checkpoint_path, global_step=training_step)

    # Print the local time of ending training
    print("Beginning time : " + str(beg_time))
    print("Ending time : " + str(datetime.datetime.now()))

    # -----------------------------------------------------------------------
    # ------------------------------Test-------------------------------------
    # -----------------------------------------------------------------------

    set_size = audio_processor.set_size('testing')
    tf.logging.info('set_size=%d', set_size)
    total_accuracy = 0
    total_conf_matrix = None

    for i in xrange(0, set_size, FLAGS.batch_size):

        test_fingerprints, test_ground_truth = audio_processor.get_data(
            FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess)

        test_accuracy, conf_matrix = sess.run(
            [evaluation_step, confusion_matrix],
            feed_dict={
                fingerprint_input: test_fingerprints,
                ground_truth_input: test_ground_truth,
                dropout_prob: 1.0
            })

        batch_size = min(FLAGS.batch_size, set_size - i)
        total_accuracy += (test_accuracy * batch_size) / set_size

        if total_conf_matrix is None:
            total_conf_matrix = conf_matrix
        else:
            total_conf_matrix += conf_matrix

    tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
    tf.logging.info('Final test accuracy = %.1f%% (N=%d)' %
                    (total_accuracy * 100, set_size))
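A numeric aside on the L2 penalty used above (illustration only): tf.nn.l2_loss(w) computes sum(w ** 2) / 2, so with the 0.01 constant the extra term added to the loss is 0.01 * (sum(w_conv1 ** 2) + sum(w_conv2 ** 2)) / 2. A tiny NumPy check with made-up weights:

# Tiny NumPy check of the L2 penalty (made-up example weights).
import numpy as np

w_conv1 = np.array([0.5, -0.5])
w_conv2 = np.array([1.0])
l2_reg = (np.sum(w_conv1 ** 2) + np.sum(w_conv2 ** 2)) / 2  # (0.25 + 0.25 + 1.0) / 2 = 0.75
penalty = 0.01 * l2_reg                                     # 0.0075
print(l2_reg, penalty)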
Example #23
0
parser.add_argument(
    '--wanted_words',
    type=str,
    default='yes,no,up,down,left,right,on,off,stop,go',
    help='Words to use (others will be added to an unknown label)', )

parser.add_argument(
    '--model_size_info',
    type=int,
    nargs="+",
    default=[1, 100],
    help='Model dimensions - different for various models')

args = parser.parse_args()

model_settings = models.prepare_model_settings(
    len(input_data.prepare_words_list(args.wanted_words.split(','))),
    args.sample_rate, args.clip_duration_ms, args.window_size_ms,
    args.window_stride_ms, args.dct_coefficient_count)

print(model_settings)

model = models.create_model(model_settings, args.arch, args.model_size_info)
model.cuda()

model_path = os.path.join(args.save_dir, args.load_model_name)
print(model_path)
model.load_state_dict(torch.load(model_path)["state_dict"],
                      strict=False)  # modified here: load non-strictly

# Print each parameter's name and count its distinct weight values.
for name, weight in model.named_parameters():
    print(name)
    unique, counts = np.unique(weight.cpu().detach().numpy().flatten(),
                               return_counts=True)
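The unique/counts pair above is computed but not summarized in this fragment. A follow-up sketch (illustrative only, not from the original script) that reports how many distinct values and how many zeros each tensor holds, as a quick check for weight clustering or pruning:

# Illustrative follow-up: summarize distinct values and sparsity per tensor
# (assumes `model` is the network loaded above).
def summarize_weights(model):
    for name, weight in model.named_parameters():
        values = weight.detach().cpu().numpy().flatten()
        unique, counts = np.unique(values, return_counts=True)
        zero_fraction = float((values == 0).mean())
        print("{}: {} distinct values, {:.1%} zeros".format(
            name, len(unique), zero_fraction))

summarize_weights(model)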
Example #24
0
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
  """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    feature_bin_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    preprocess: How the spectrogram is processed to produce features, for
      example 'mfcc' or 'average'.

  Raises:
    Exception: If the preprocessing mode isn't recognized.
  """

  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, feature_bin_count, preprocess)
  runtime_settings = {'clip_stride_ms': clip_stride_ms}

  wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
  decoded_sample_data = contrib_audio.decode_wav(
      wav_data_placeholder,
      desired_channels=1,
      desired_samples=model_settings['desired_samples'],
      name='decoded_sample_data')
  spectrogram = contrib_audio.audio_spectrogram(
      decoded_sample_data.audio,
      window_size=model_settings['window_size_samples'],
      stride=model_settings['window_stride_samples'],
      magnitude_squared=True)

  if preprocess == 'average':
    fingerprint_input = tf.nn.pool(
        tf.expand_dims(spectrogram, -1),
        window_shape=[1, model_settings['average_window_width']],
        strides=[1, model_settings['average_window_width']],
        pooling_type='AVG',
        padding='SAME')
  elif preprocess == 'mfcc':
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        sample_rate,
        dct_coefficient_count=model_settings['fingerprint_width'])
  else:
    raise Exception('Unknown preprocess mode "%s" (should be "mfcc" or'
                    ' "average")' % (preprocess))

  fingerprint_size = model_settings['fingerprint_size']
  reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])

  logits = models.create_model(
      reshaped_input, model_settings, model_architecture, is_training=False,
      runtime_settings=runtime_settings)

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
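As a usage illustration only (not part of this example): with TF 1.x, the graph built by create_inference_graph is typically restored from a checkpoint and then frozen into a single GraphDef. A sketch, where the word list, feature settings, and checkpoint path are placeholder values:

# Sketch of freezing the inference graph (assumptions: TF 1.x, the models
# module from this listing, and a hypothetical checkpoint path).
sess = tf.InteractiveSession()
create_inference_graph('yes,no,up,down,left,right,on,off,stop,go',
                       16000, 1000, 30, 30.0, 10.0, 40, 'conv', 'mfcc')
models.load_variables_from_checkpoint(
    sess, '/tmp/speech_commands_train/conv.ckpt-18000')
frozen_graph_def = tf.graph_util.convert_variables_to_constants(
    sess, sess.graph_def, ['labels_softmax'])
tf.train.write_graph(frozen_graph_def, '/tmp', 'my_frozen_graph.pb',
                     as_text=False)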
Example #25
0
def run_inference_pb(wanted_words, sample_rate, clip_duration_ms,
                     window_size_ms, window_stride_ms, dct_coefficient_count,
                     model_architecture, model_size_info):
  """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    model_size_info: Model dimensions : different lengths for different models
  """
  
  tf.logging.set_verbosity(tf.logging.INFO)
  sess = tf.InteractiveSession()
  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, dct_coefficient_count)

  load_graph(FLAGS.graph)

  audio_processor = input_data.AudioProcessor(
      FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
      FLAGS.unknown_percentage,
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings)

  softmax = sess.graph.get_tensor_by_name("labels_softmax:0")

  label_count = model_settings['label_count']

  ground_truth_input = tf.placeholder(
      tf.float32, [None, label_count], name='groundtruth_input')

  predicted_indices = tf.argmax(softmax, 1)
  expected_indices = tf.argmax(ground_truth_input, 1)
  correct_prediction = tf.equal(predicted_indices, expected_indices)
  confusion_matrix = tf.confusion_matrix(
      expected_indices, predicted_indices, num_classes=label_count)
  evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

  if FLAGS.training:
    # training set
    set_size = audio_processor.set_size('training')
    tf.logging.info('Training set size:%d', set_size)
    total_accuracy = 0
    total_conf_matrix = None
    for i in range(0, set_size):
      training_file, training_ground_truth = (
          audio_processor.get_wav_files(1, i, model_settings, 'training'))
      with open(training_file[0], 'rb') as wav_file:
        training_data = wav_file.read()
      training_accuracy, conf_matrix = sess.run(
          [evaluation_step, confusion_matrix],
          feed_dict={
              'wav_data:0': training_data,
              ground_truth_input: training_ground_truth,
          })
      total_accuracy += (training_accuracy) / set_size
      if total_conf_matrix is None:
        total_conf_matrix = conf_matrix
      else:
        total_conf_matrix += conf_matrix
    tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
    tf.logging.info('Training accuracy = %.2f%% (N=%d)' % (total_accuracy * 100,
                                                             set_size))

  # validation set
  set_size = audio_processor.set_size('validation')
  tf.logging.info('Validation set size:%d', set_size)
  total_accuracy = 0
  total_conf_matrix = None
  for i in range(0, set_size):
    validation_file, validation_ground_truth = (
        audio_processor.get_wav_files(1, i, model_settings, 'validation'))
    with open(validation_file[0], 'rb') as wav_file:
      validation_data = wav_file.read()
    validation_accuracy, conf_matrix = sess.run(
        [evaluation_step, confusion_matrix],
        feed_dict={
            'wav_data:0': validation_data,
            ground_truth_input: validation_ground_truth,
        })
    total_accuracy += (validation_accuracy) / set_size
    if total_conf_matrix is None:
      total_conf_matrix = conf_matrix
    else:
      total_conf_matrix += conf_matrix
  tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
  tf.logging.info('Validation accuracy = %.2f%% (N=%d)' %
                  (total_accuracy * 100, set_size))
  # test set
  set_size = audio_processor.set_size('testing')
  tf.logging.info('Test set size:%d', set_size)
  total_accuracy = 0
  total_conf_matrix = None
  for i in range(0, set_size):
    test_file, test_ground_truth = (
        audio_processor.get_wav_files(1, i, model_settings, 'testing'))
    with open(test_file[0], 'rb') as wav_file:
      test_data = wav_file.read()
    test_accuracy, conf_matrix = sess.run(
        [evaluation_step, confusion_matrix],
        feed_dict={
            'wav_data:0': test_data,
            ground_truth_input: test_ground_truth,
        })
    total_accuracy += (test_accuracy) / set_size
    if total_conf_matrix is None:
      total_conf_matrix = conf_matrix
    else:
      total_conf_matrix += conf_matrix
  tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
  tf.logging.info('Test accuracy = %.2f%% (N=%d)' % (total_accuracy * 100,
                                                           set_size))
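The load_graph helper called above is not shown in this listing. A minimal sketch of what such a helper typically does, assuming FLAGS.graph points to a frozen GraphDef (.pb) file:

# Minimal sketch (assumption: `filename` is a frozen GraphDef produced by a
# freeze step like the one in Example #24).
def load_graph(filename):
  """Unpersists a frozen GraphDef into the current default graph."""
  with tf.gfile.GFile(filename, 'rb') as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())
    tf.import_graph_def(graph_def, name='')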
def main(_):
  words_list = input_data.prepare_words_list(FLAGS.wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), FLAGS.sample_rate, FLAGS.clip_duration_ms,
      FLAGS.window_size_ms, FLAGS.window_stride_ms, FLAGS.feature_bin_count,
      'mfcc')
  audio_processor = input_data.AudioProcessor(
      '', FLAGS.data_dir, FLAGS.silence_percentage, 10,
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings, FLAGS.data_dir)

  output_audio_sample_count = FLAGS.sample_rate * FLAGS.test_duration_seconds
  output_audio = np.zeros((output_audio_sample_count,), dtype=np.float32)

  # Set up background audio.
  background_crossover_ms = 500
  background_segment_duration_ms = (
      FLAGS.clip_duration_ms + background_crossover_ms)
  background_segment_duration_samples = int(
      (background_segment_duration_ms * FLAGS.sample_rate) / 1000)
  background_segment_stride_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  background_ramp_samples = int(
      ((background_crossover_ms / 2) * FLAGS.sample_rate) / 1000)

  # Mix the background audio into the main track.
  how_many_backgrounds = int(
      math.ceil(output_audio_sample_count / background_segment_stride_samples))
  for i in range(how_many_backgrounds):
    output_offset = int(i * background_segment_stride_samples)
    background_index = np.random.randint(len(audio_processor.background_data))
    background_samples = audio_processor.background_data[background_index]
    background_offset = np.random.randint(
        0, len(background_samples) - model_settings['desired_samples'])
    background_volume = np.random.uniform(0, FLAGS.background_volume)
    mix_in_audio_sample(output_audio, output_offset, background_samples,
                        background_offset, background_segment_duration_samples,
                        background_volume, background_ramp_samples,
                        background_ramp_samples)

  # Mix the words into the main track, noting their labels and positions.
  output_labels = []
  word_stride_ms = FLAGS.clip_duration_ms + FLAGS.word_gap_ms
  word_stride_samples = int((word_stride_ms * FLAGS.sample_rate) / 1000)
  clip_duration_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  word_gap_samples = int((FLAGS.word_gap_ms * FLAGS.sample_rate) / 1000)
  how_many_words = int(
      math.floor(output_audio_sample_count / word_stride_samples))
  all_test_data, all_test_labels = audio_processor.get_unprocessed_data(
      -1, model_settings, 'testing')
  for i in range(how_many_words):
    output_offset = (
        int(i * word_stride_samples) + np.random.randint(word_gap_samples))
    output_offset_ms = (output_offset * 1000) / FLAGS.sample_rate
    is_unknown = np.random.randint(100) < FLAGS.unknown_percentage
    if is_unknown:
      wanted_label = input_data.UNKNOWN_WORD_LABEL
    else:
      wanted_label = words_list[2 + np.random.randint(len(words_list) - 2)]
    test_data_start = np.random.randint(len(all_test_data))
    found_sample_data = None
    index_lookup = np.arange(len(all_test_data), dtype=np.int32)
    np.random.shuffle(index_lookup)
    for test_data_offset in range(len(all_test_data)):
      test_data_index = index_lookup[(
          test_data_start + test_data_offset) % len(all_test_data)]
      current_label = all_test_labels[test_data_index]
      if current_label == wanted_label:
        found_sample_data = all_test_data[test_data_index]
        break
    mix_in_audio_sample(output_audio, output_offset, found_sample_data, 0,
                        clip_duration_samples, 1.0, 500, 500)
    output_labels.append({'label': wanted_label, 'time': output_offset_ms})

  input_data.save_wav_file(FLAGS.output_audio_file, output_audio,
                           FLAGS.sample_rate)
  tf.logging.info('Saved streaming test wav to %s', FLAGS.output_audio_file)

  with open(FLAGS.output_labels_file, 'w') as f:
    for output_label in output_labels:
      f.write('%s, %f\n' % (output_label['label'], output_label['time']))
  tf.logging.info('Saved streaming test labels to %s', FLAGS.output_labels_file)
Example #27
0
def run_quant_inference(wanted_words, sample_rate, clip_duration_ms,
                        window_size_ms, window_stride_ms,
                        dct_coefficient_count, model_architecture,
                        model_size_info):
    """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    model_size_info: Model dimensions : different lengths for different models
  """

    tf.logging.set_verbosity(tf.logging.INFO)
    sess = tf.InteractiveSession()
    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)

    audio_processor = input_data.AudioProcessor(FLAGS.data_url, FLAGS.data_dir,
                                                FLAGS.silence_percentage,
                                                FLAGS.unknown_percentage,
                                                FLAGS.wanted_words.split(','),
                                                FLAGS.validation_percentage,
                                                FLAGS.testing_percentage,
                                                model_settings)

    label_count = model_settings['label_count']
    fingerprint_size = model_settings['fingerprint_size']
    time_shift_samples = int((100.0 * FLAGS.sample_rate) / 1000)

    fingerprint_input = tf.placeholder(tf.float32, [None, fingerprint_size],
                                       name='fingerprint_input')

    logits = models.create_model(fingerprint_input,
                                 model_settings,
                                 FLAGS.model_architecture,
                                 FLAGS.model_size_info,
                                 FLAGS.act_max,
                                 is_training=False)
    ground_truth_input = tf.placeholder(tf.float32, [None, label_count],
                                        name='groundtruth_input')

    if FLAGS.if_retrain:
        with tf.name_scope('cross_entropy'):
            cross_entropy_mean = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(
                    labels=ground_truth_input, logits=logits))
        tf.summary.scalar('cross_entropy', cross_entropy_mean)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.name_scope('train'), tf.control_dependencies(update_ops):
            train_op = tf.train.AdamOptimizer(learning_rate=0.0001)
            train_step = tf.contrib.slim.learning.create_train_op(
                cross_entropy_mean, train_op)

    saver = tf.train.Saver(tf.global_variables())
    merged = tf.summary.merge_all()
    test_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/test',
                                        sess.graph)
    train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train')
    validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir +
                                              '/validation')
    tf.global_variables_initializer().run()

    predicted_indices = tf.argmax(logits, 1)
    expected_indices = tf.argmax(ground_truth_input, 1)
    correct_prediction = tf.equal(predicted_indices, expected_indices)
    confusion_matrix = tf.confusion_matrix(expected_indices,
                                           predicted_indices,
                                           num_classes=label_count)
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    models.load_variables_from_checkpoint(sess, FLAGS.checkpoint)

    for v in tf.trainable_variables():
        var_name = str(v.name)
        var_values = sess.run(v)
        min_value = var_values.min()
        max_value = var_values.max()
        int_bits = int(np.ceil(np.log2(max(abs(min_value), abs(max_value)))))
        # 8-bit fixed point (ap_fixed<8,...>): 1 sign bit, int_bits integer
        # bits, and the remaining dec_bits used for the fraction.
        dec_bits = 7 - int_bits
        # dec_bits = min(7, 7-int_bits)
        # convert to [-128,128) or int8
        # var_values = np.round(var_values*2**dec_bits)
        # convert back original range but quantized to 8-bits or 256 levels
        # var_values = var_values/(2**dec_bits)
        if FLAGS.update_weights:
            # define datatypes
            # f = open('weights/parameters.h','wb')
            # f.close()
            from save_data import prepare_header, prepare_lstm_headers
            var_name_split = var_name.split(':')
            if var_name_split[0].startswith('W_o'):
                os.makedirs('weights/fc', exist_ok=True)
                c_var_name = 'Wy[' + str(var_values.shape[1]) + '][' + str(
                    var_values.shape[0]) + ']'  # transposed
                np.savetxt('weights/fc/Wy.h',
                           np.transpose(var_values),
                           delimiter=',',
                           newline=',\n')
                prepare_header('weights/fc/Wy.h', 'Wy_t ' + c_var_name)
            elif var_name_split[0].startswith('b_o'):
                c_var_name = 'by[' + str(var_values.shape[0]) + ']'
                np.savetxt('weights/fc/by.h', var_values[None], delimiter=',')
                prepare_header('weights/fc/by.h', 'by_t ' + c_var_name)

            elif var_name_split[0].startswith('lstm'):
                lstm_name = var_name_split[0].split('/')
                param_name = lstm_name[-1]
                # if (lstm_name[0] == 'lstm0'):
                #   prepare_lstm_headers('weights/' + lstm_name[0], var_values,input_size = FLAGS.dct_coefficient_count, param_name=param_name)
                # else:
                #   state_size = FLAGS.model_size_info[0] # TODO
                #   prepare_lstm_headers('weights/' + lstm_name[0], var_values,input_size = state_size, param_name=param_name)

                # for lstmp
                if (lstm_name[-2] == 'projection'):
                    param_name = 'projection'
                if (lstm_name[1] == 'lstm0'):
                    prepare_lstm_headers(
                        'weights/' + lstm_name[1],
                        var_values,
                        input_size=FLAGS.dct_coefficient_count,
                        param_name=param_name)
                else:
                    state_size = FLAGS.model_size_info[0]  # TODO
                    prepare_lstm_headers('weights/' + lstm_name[1],
                                         var_values,
                                         input_size=state_size,
                                         param_name=param_name)

        # update the weights in tensorflow graph for quantizing the activations
        var_values = sess.run(tf.assign(v, var_values))
        print(var_name + ' number of wts/bias: ' + str(var_values.shape) +
              ' dec bits: ' + str(dec_bits) +
              ' max: (' + str(var_values.max()) + ',' + str(max_value) + ')' +
              ' min: (' + str(var_values.min()) + ',' + str(min_value) + ')')
    if FLAGS.if_retrain:
        best_accuracy = 0
        for training_step in range(FLAGS.retrain_steps):
            # Pull the audio samples we'll use for training.
            train_fingerprints, train_ground_truth = audio_processor.get_data(
                FLAGS.batch_size, 0, model_settings, 0.8, 0.1,
                time_shift_samples, 'training', sess)
            # Run the graph with this batch of training data.
            train_summary, train_accuracy, cross_entropy_value, _ = sess.run(
                [merged, evaluation_step, cross_entropy_mean, train_step],
                feed_dict={
                    fingerprint_input: train_fingerprints,
                    ground_truth_input: train_ground_truth
                })
            train_writer.add_summary(train_summary, training_step)
            tf.logging.info(
                'Step #%d: accuracy %.2f%%, cross entropy %f' %
                (training_step, train_accuracy * 100, cross_entropy_value))
            is_last_step = (training_step == FLAGS.retrain_steps)
            if (training_step % 200) == 0 or is_last_step:
                set_size = audio_processor.set_size('validation')
                total_accuracy = 0
                total_conf_matrix = None
                for i in range(0, set_size, FLAGS.batch_size):
                    validation_fingerprints, validation_ground_truth = (
                        audio_processor.get_data(FLAGS.batch_size, i,
                                                 model_settings, 0.0, 0.0, 0,
                                                 'validation', sess))

                    # Run a validation step and capture training summaries for TensorBoard
                    # with the `merged` op.
                    validation_summary, validation_accuracy, conf_matrix = sess.run(
                        [merged, evaluation_step, confusion_matrix],
                        feed_dict={
                            fingerprint_input: validation_fingerprints,
                            ground_truth_input: validation_ground_truth
                        })
                    validation_writer.add_summary(validation_summary,
                                                  training_step)
                    batch_size = min(FLAGS.batch_size, set_size - i)
                    total_accuracy += (validation_accuracy *
                                       batch_size) / set_size
                    if total_conf_matrix is None:
                        total_conf_matrix = conf_matrix
                    else:
                        total_conf_matrix += conf_matrix
                tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
                tf.logging.info(
                    'Step %d: Validation accuracy = %.2f%% (N=%d)' %
                    (training_step, total_accuracy * 100, set_size))

                # Save the model checkpoint when validation accuracy improves
                if total_accuracy > best_accuracy:
                    best_accuracy = total_accuracy
                    checkpoint_path = os.path.join(
                        FLAGS.new_checkpoint, FLAGS.model_architecture + '_' +
                        str(int(best_accuracy * 10000)) + '.ckpt')
                    tf.logging.info('Saving best model to "%s-%d"',
                                    checkpoint_path, training_step)
                    saver.save(sess,
                               checkpoint_path,
                               global_step=training_step)
                tf.logging.info(
                    'So far the best validation accuracy is %.2f%%' %
                    (best_accuracy * 100))

    # validation set
    set_size = audio_processor.set_size('validation')
    tf.logging.info('set_size=%d', set_size)
    total_accuracy = 0
    total_conf_matrix = None
    for i in range(0, set_size, FLAGS.batch_size):
        validation_fingerprints, validation_ground_truth = (
            audio_processor.get_wav_files(FLAGS.batch_size, i, model_settings,
                                          'validation'))

        validation_accuracy, conf_matrix = sess.run(
            [evaluation_step, confusion_matrix],
            feed_dict={
                fingerprint_input: validation_fingerprints,
                ground_truth_input: validation_ground_truth,
            })
        batch_size = min(FLAGS.batch_size, set_size - i)
        total_accuracy += (validation_accuracy * batch_size) / set_size
        if total_conf_matrix is None:
            total_conf_matrix = conf_matrix
        else:
            total_conf_matrix += conf_matrix
    tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
    tf.logging.info('Validation accuracy = %.2f%% (N=%d)' %
                    (total_accuracy * 100, set_size))
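    # Note on the weighted average above: each batch contributes its accuracy
    # scaled by batch_size / set_size, so a short final batch is not
    # over-weighted. With hypothetical numbers, for set_size=250 and
    # FLAGS.batch_size=100, batches of 100, 100 and 50 clips scoring 0.90,
    # 0.80 and 0.60 give 0.90*100/250 + 0.80*100/250 + 0.60*50/250 = 0.80,
    # i.e. the plain per-clip accuracy over the whole validation set.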

    # test set
    set_size = audio_processor.set_size('testing')
    tf.logging.info('set_size=%d', set_size)
    total_accuracy = 0
    total_conf_matrix = None
    for i in range(0, set_size, FLAGS.batch_size):
        test_fingerprints, test_ground_truth = audio_processor.get_wav_files(
            FLAGS.batch_size, i, model_settings, 'testing')

        test_accuracy, conf_matrix = sess.run(
            [evaluation_step, confusion_matrix],
            feed_dict={
                fingerprint_input: test_fingerprints,
                ground_truth_input: test_ground_truth,
            })

        batch_size = min(FLAGS.batch_size, set_size - i)
        total_accuracy += (test_accuracy * batch_size) / set_size
        if total_conf_matrix is None:
            total_conf_matrix = conf_matrix
        else:
            total_conf_matrix += conf_matrix

    tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
    tf.logging.info('Test accuracy = %.2f%% (N=%d)' %
                    (total_accuracy * 100, set_size))
Example #28
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
  """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    feature_bin_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    preprocess: How the spectrogram is processed to produce features, for
      example 'mfcc', 'average', or 'micro'.

  Raises:
    Exception: If the preprocessing mode isn't recognized.
  """

  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, feature_bin_count, preprocess)
  runtime_settings = {'clip_stride_ms': clip_stride_ms}

  wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
  decoded_sample_data = contrib_audio.decode_wav(
      wav_data_placeholder,
      desired_channels=1,
      desired_samples=model_settings['desired_samples'],
      name='decoded_sample_data')
  spectrogram = contrib_audio.audio_spectrogram(
      decoded_sample_data.audio,
      window_size=model_settings['window_size_samples'],
      stride=model_settings['window_stride_samples'],
      magnitude_squared=True)

  if preprocess == 'average':
    fingerprint_input = tf.nn.pool(
        tf.expand_dims(spectrogram, -1),
        window_shape=[1, model_settings['average_window_width']],
        strides=[1, model_settings['average_window_width']],
        pooling_type='AVG',
        padding='SAME')
  elif preprocess == 'mfcc':
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        sample_rate,
        dct_coefficient_count=model_settings['fingerprint_width'])
  elif preprocess == 'micro':
    if not frontend_op:
      raise Exception(
          'Micro frontend op is currently not available when running TensorFlow'
          ' directly from Python, you need to build and run through Bazel, for'
          ' example'
          ' `bazel run tensorflow/examples/speech_commands:freeze_graph`'
      )
    sample_rate = model_settings['sample_rate']
    window_size_ms = (model_settings['window_size_samples'] *
                      1000) / sample_rate
    window_step_ms = (model_settings['window_stride_samples'] *
                      1000) / sample_rate
    int16_input = tf.cast(
        tf.multiply(decoded_sample_data.audio, 32767), tf.int16)
    micro_frontend = frontend_op.audio_microfrontend(
        int16_input,
        sample_rate=sample_rate,
        window_size=window_size_ms,
        window_step=window_step_ms,
        num_channels=model_settings['fingerprint_width'],
        out_scale=1,
        out_type=tf.float32)
    fingerprint_input = tf.multiply(micro_frontend, (10.0 / 256.0))
  else:
    raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
                    ' "average", or "micro")' % (preprocess))

  fingerprint_size = model_settings['fingerprint_size']
  reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])

  logits = models.create_model(
      reshaped_input, model_settings, model_architecture, is_training=False,
      runtime_settings=runtime_settings)

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
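# A minimal sketch (not part of the example above) of how a graph built by
# create_inference_graph might be frozen into a single .pb file. It assumes
# the `models` helper module is importable as in the snippets above; the flag
# values, checkpoint path and output file name are illustrative only.
import tensorflow as tf


def freeze_to_pb(checkpoint_path, output_pb):
  sess = tf.InteractiveSession()
  # Build the inference graph with representative settings.
  create_inference_graph(
      wanted_words='yes,no,up,down,left,right,on,off,stop,go',
      sample_rate=16000, clip_duration_ms=1000, clip_stride_ms=30,
      window_size_ms=30, window_stride_ms=10, feature_bin_count=40,
      model_architecture='conv', preprocess='mfcc')
  # Restore the trained weights and bake them into graph constants.
  models.load_variables_from_checkpoint(sess, checkpoint_path)
  frozen_graph_def = tf.graph_util.convert_variables_to_constants(
      sess, sess.graph_def, ['labels_softmax'])
  tf.train.write_graph(frozen_graph_def, '.', output_pb, as_text=False)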
Example #29
def main(_):
    # Set the verbosity based on flags (default is INFO, so we see all messages)
    tf.compat.v1.logging.set_verbosity(FLAGS.verbosity)

    # Start a new TensorFlow session.
    sess = tf.compat.v1.InteractiveSession()

    summaries_dir = os.path.join(FLAGS.train_dir, 'summaries')

    # Begin by making sure we have the training data we need. If you already have
    # training data of your own, use `--data_url= ` on the command line to avoid
    # downloading.
    model_settings = models.prepare_model_settings(
        len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
        FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
        FLAGS.window_stride_ms, FLAGS.feature_bin_count, FLAGS.preprocess)
    audio_processor = input_data.AudioProcessor(FLAGS.data_url, FLAGS.data_dir,
                                                FLAGS.silence_percentage,
                                                FLAGS.unknown_percentage,
                                                FLAGS.wanted_words.split(','),
                                                FLAGS.validation_percentage,
                                                FLAGS.testing_percentage,
                                                model_settings, summaries_dir)

    wav_file = FLAGS.wav

    fingerprint_size = model_settings['fingerprint_size']

    # Figure out the learning rates for each training phase. Since it's often
    # effective to have high learning rates at the start of training, followed by
    # lower levels towards the end, the number of steps and learning rates can be
    # specified as comma-separated lists to define the rate at each stage. For
    # example --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
    # will run 13,000 training loops in total, with a rate of 0.001 for the first
    # 10,000, and 0.0001 for the final 3,000.
    training_steps_list = list(
        map(int, FLAGS.how_many_training_steps.split(',')))
    learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
    if len(training_steps_list) != len(learning_rates_list):
        raise Exception(
            '--how_many_training_steps and --learning_rate must be equal length '
            'lists, but are %d and %d long instead' %
            (len(training_steps_list), len(learning_rates_list)))
    input_placeholder = tf.compat.v1.placeholder(tf.float32,
                                                 [None, fingerprint_size],
                                                 name='fingerprint_input')
    if FLAGS.quantize:
        fingerprint_min, fingerprint_max = input_data.get_features_range(
            model_settings)
        fingerprint_input = tf.quantization.fake_quant_with_min_max_args(
            input_placeholder, fingerprint_min, fingerprint_max)
    else:
        fingerprint_input = input_placeholder

    print('fingerprint input:', fingerprint_input)

    logits, dropout_prob = models.create_model(
        fingerprint_input,
        model_settings,
        FLAGS.model_architecture,
        is_training=True,
    )

    # Define loss and optimizer
    ground_truth_input = tf.compat.v1.placeholder(tf.int64, [None],
                                                  name='groundtruth_input')

    # Create the back propagation and training evaluation machinery in the graph.
    with tf.compat.v1.name_scope('cross_entropy'):
        cross_entropy_mean = tf.compat.v1.losses.sparse_softmax_cross_entropy(
            labels=ground_truth_input, logits=logits)
    if FLAGS.quantize:
        tf.contrib.quantize.create_training_graph(quant_delay=0)
    predicted_indices = tf.argmax(input=logits, axis=1)
    correct_prediction = tf.equal(predicted_indices, ground_truth_input)
    evaluation_step = tf.reduce_mean(
        input_tensor=tf.cast(correct_prediction, tf.float32))
    with tf.compat.v1.get_default_graph().name_scope('eval'):
        tf.compat.v1.summary.scalar('cross_entropy', cross_entropy_mean)
        tf.compat.v1.summary.scalar('accuracy', evaluation_step)

    global_step = tf.compat.v1.train.get_or_create_global_step()

    tf.compat.v1.global_variables_initializer().run()

    start_step = 1

    if FLAGS.checkpoint:
        models.load_variables_from_checkpoint(sess, FLAGS.checkpoint)
        start_step = global_step.eval(session=sess)
        tf.compat.v1.logging.info('Checkpoint: {}'.format(FLAGS.checkpoint))

    tf.compat.v1.logging.info(
        'Recovering checkpoint from step: {}'.format(start_step))

    input_features = audio_processor.get_features_for_wav(
        wav_file, model_settings, sess)
    print('features:', input_features)
    print('features:', len(input_features))
    input_features = input_features[0]
    print('features:', input_features.shape)
    input_features = np.expand_dims(input_features.flatten(), 0)

    y_pred = sess.run(predicted_indices,
                      feed_dict={
                          fingerprint_input: input_features,
                          dropout_prob: 1.0
                      })

    print('Predict:', y_pred)
    print('Label:', audio_processor.words_list[y_pred[0]])
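# The staged learning-rate schedule described in the comment above can be
# written as a small helper. A minimal sketch only; the function name is an
# illustrative assumption and is not used by the examples.
def learning_rate_for_step(training_step, training_steps_list,
                           learning_rates_list):
    # Walk the cumulative stage boundaries and return the rate of the stage
    # that contains `training_step` (1-based, as in the training loops below).
    training_steps_sum = 0
    for steps, rate in zip(training_steps_list, learning_rates_list):
        training_steps_sum += steps
        if training_step <= training_steps_sum:
            return rate
    # Past the final boundary: keep using the last rate.
    return learning_rates_list[-1]


# For example, with --how_many_training_steps=10000,3000 and
# --learning_rate=0.001,0.0001:
#   learning_rate_for_step(10000, [10000, 3000], [0.001, 0.0001]) -> 0.001
#   learning_rate_for_step(10001, [10000, 3000], [0.001, 0.0001]) -> 0.0001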
Example #30
def main(_):

    tf.logging.set_verbosity(tf.logging.INFO)

    sess = tf.InteractiveSession()

    # Begin by making sure we have the training data we need. If you already have
    # training data of your own, use `--data_url= ` on the command line to avoid
    # downloading.
    ####################################################################

    model_settings = models.prepare_model_settings(
        len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
        FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
        FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
    audio_processor = input_data.AudioProcessor(FLAGS.data_url, FLAGS.data_dir,
                                                FLAGS.silence_percentage,
                                                FLAGS.unknown_percentage,
                                                FLAGS.wanted_words.split(','),
                                                FLAGS.validation_percentage,
                                                FLAGS.testing_percentage,
                                                model_settings)

    #######################################################################
    fingerprint_size = model_settings['fingerprint_size']
    label_count = model_settings['label_count']
    time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)

    ##################################################################

    training_steps_list = list(
        map(int, FLAGS.how_many_training_steps.split(',')))
    learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
    if len(training_steps_list) != len(learning_rates_list):
        raise Exception(
            '--how_many_training_steps and --learning_rate must be equal length '
            'lists, but are %d and %d long instead' %
            (len(training_steps_list), len(learning_rates_list)))

    fingerprint_input = tf.placeholder(tf.float32, [None, fingerprint_size],
                                       name='fingerprint_input')

    logits, dropout_prob = models.create_model(fingerprint_input,
                                               model_settings,
                                               FLAGS.model_architecture,
                                               is_training=True)

    # Define loss and optimizer
    ground_truth_input = tf.placeholder(tf.float32, [None, label_count],
                                        name='groundtruth_input')

    # Optionally we can add runtime checks to spot when NaNs or other symptoms of
    # numerical errors start occurring during training.
    control_dependencies = []
    if FLAGS.check_nans:
        checks = tf.add_check_numerics_ops()
        control_dependencies = [checks]

###################################################################

    # Back propagation: define the cross-entropy loss and the gradient descent
    # training op.
    with tf.name_scope('cross_entropy'):
        cross_entropy_mean = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=ground_truth_input,
                                                    logits=logits))
    tf.summary.scalar('cross_entropy', cross_entropy_mean)
    with tf.name_scope('train'), tf.control_dependencies(control_dependencies):
        learning_rate_input = tf.placeholder(tf.float32, [],
                                             name='learning_rate_input')
        train_step = tf.train.GradientDescentOptimizer(
            learning_rate_input).minimize(cross_entropy_mean)
    predicted_indices = tf.argmax(logits, 1)
    expected_indices = tf.argmax(ground_truth_input, 1)
    correct_prediction = tf.equal(predicted_indices, expected_indices)
    confusion_matrix = tf.confusion_matrix(expected_indices, predicted_indices)
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', evaluation_step)

    global_step = tf.contrib.framework.get_or_create_global_step()
    increment_global_step = tf.assign(global_step, global_step + 1)

    saver = tf.train.Saver(tf.global_variables())

    # Merge all the summaries and write them out to /tmp/retrain_logs (by default)
    merged_summaries = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                         sess.graph)
    validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir +
                                              '/validation')

    tf.global_variables_initializer().run()

    start_step = 1

    if FLAGS.start_checkpoint:
        models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
        start_step = global_step.eval(session=sess)

    tf.logging.info('step number: %d ', start_step)

    # Save graph.pbtxt.
    tf.train.write_graph(sess.graph_def, FLAGS.train_dir,
                         FLAGS.model_architecture + '.pbtxt')

    # Save list of words.
    with gfile.GFile(
            os.path.join(FLAGS.train_dir,
                         FLAGS.model_architecture + '_labels.txt'), 'w') as f:
        f.write('\n'.join(audio_processor.words_list))

####################################################################
# Training loop.
    training_steps_max = np.sum(training_steps_list)
    for training_step in xrange(start_step, training_steps_max + 1):
        # Figure out what the current learning rate is.
        training_steps_sum = 0
        for i in range(len(training_steps_list)):
            training_steps_sum += training_steps_list[i]
            if training_step <= training_steps_sum:
                learning_rate_value = learning_rates_list[i]
                break
        # Pull the audio samples we'll use for training.
        train_fingerprints, train_ground_truth = audio_processor.get_data(
            FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency,
            FLAGS.background_volume, time_shift_samples, 'training', sess)
        # Run the graph with this batch of training data.

        ###################################################################
        train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
            [
                merged_summaries, evaluation_step, cross_entropy_mean,
                train_step, increment_global_step
            ],
            feed_dict={
                fingerprint_input: train_fingerprints,
                ground_truth_input: train_ground_truth,
                learning_rate_input: learning_rate_value,
                dropout_prob: 0.5
            })
        train_writer.add_summary(train_summary, training_step)
        tf.logging.info(
            'Step number #%d: learning rate %f, model accuracy %.1f%%, model cross entropy %f'
            % (training_step, learning_rate_value, train_accuracy * 100,
               cross_entropy_value))
        is_last_step = (training_step == training_steps_max)
        if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step:
            set_size = audio_processor.set_size('validation')
            total_accuracy = 0
            total_conf_matrix = None
            for i in xrange(0, set_size, FLAGS.batch_size):
                validation_fingerprints, validation_ground_truth = (
                    audio_processor.get_data(FLAGS.batch_size, i,
                                             model_settings, 0.0, 0.0, 0,
                                             'validation', sess))
                # Summaries for this validation step are written out for
                # visualization in TensorBoard.

                #############################################################
                validation_summary, validation_accuracy, conf_matrix = sess.run(
                    [merged_summaries, evaluation_step, confusion_matrix],
                    feed_dict={
                        fingerprint_input: validation_fingerprints,
                        ground_truth_input: validation_ground_truth,
                        dropout_prob: 1.0
                    })
                validation_writer.add_summary(validation_summary,
                                              training_step)

                batch_size = min(FLAGS.batch_size, set_size - i)
                total_accuracy += (validation_accuracy * batch_size) / set_size
                if total_conf_matrix is None:
                    total_conf_matrix = conf_matrix
                else:
                    total_conf_matrix += conf_matrix
            tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
            tf.logging.info(
                'Step number %d: Validation Model accuracy = %.1f%% (N=%d)' %
                (training_step, total_accuracy * 100, set_size))

        # Save the model checkpoint periodically.

###############################################################
        if (training_step % FLAGS.save_step_interval == 0
                or training_step == training_steps_max):
            checkpoint_path = os.path.join(FLAGS.train_dir,
                                           FLAGS.model_architecture + '.ckpt')
            tf.logging.info('Saving to "%s-%d"', checkpoint_path,
                            training_step)
            saver.save(sess, checkpoint_path, global_step=training_step)


############################################################

    set_size = audio_processor.set_size('testing')
    tf.logging.info('set_size=%d', set_size)
    total_accuracy = 0
    total_conf_matrix = None
    for i in xrange(0, set_size, FLAGS.batch_size):
        test_fingerprints, test_ground_truth = audio_processor.get_data(
            FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess)
        test_accuracy, conf_matrix = sess.run(
            [evaluation_step, confusion_matrix],
            feed_dict={
                fingerprint_input: test_fingerprints,
                ground_truth_input: test_ground_truth,
                dropout_prob: 1.0
            })
        batch_size = min(FLAGS.batch_size, set_size - i)
        total_accuracy += (test_accuracy * batch_size) / set_size
        if total_conf_matrix is None:
            total_conf_matrix = conf_matrix
        else:
            total_conf_matrix += conf_matrix
    tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
    tf.logging.info('Final model accuracy = %.1f%% (N=%d)' %
                    (total_accuracy * 100, set_size))
Example #31
def fold_batch_norm(wanted_words,
                    sample_rate,
                    clip_duration_ms,
                    window_size_ms,
                    window_stride_ms,
                    dct_coefficient_count,
                    model_architecture,
                    model_size_info,
                    checkpoint,
                    include_silence=True,
                    lower_frequency_limit=20,
                    upper_frequency_limit=4000,
                    filterbank_channel_count=40):
    """Creates an audio model with the nodes needed for inference.

    Uses the supplied arguments to create a model, and inserts the input and
    output nodes that are needed to use the graph for inference.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
    """
    tf.reset_default_graph()
    tf.logging.set_verbosity(tf.logging.INFO)
    sess = tf.InteractiveSession()
    words_list = input_data.prepare_words_list(wanted_words.split(','),
                                               include_silence)
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count, lower_frequency_limit,
        upper_frequency_limit, filterbank_channel_count)

    fingerprint_input = tf.placeholder(
        tf.float32, [None, model_settings['fingerprint_size']],
        name='fingerprint_input')

    logits = models.create_model(fingerprint_input,
                                 model_settings,
                                 model_architecture,
                                 model_size_info,
                                 is_training=False)

    ground_truth_input = tf.placeholder(tf.float32,
                                        [None, model_settings['label_count']],
                                        name='groundtruth_input')

    predicted_indices = tf.argmax(logits, 1)
    expected_indices = tf.argmax(ground_truth_input, 1)
    correct_prediction = tf.equal(predicted_indices, expected_indices)
    confusion_matrix = tf.confusion_matrix(expected_indices, predicted_indices)
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    models.load_variables_from_checkpoint(sess, checkpoint)
    saver = tf.train.Saver(tf.global_variables())

    tf.logging.info(
        'Folding batch normalization layer parameters to preceding layer weights/biases'
    )
    # epsilon added to variance to avoid division by zero
    epsilon = 1e-3  # default epsilon for tf.slim.batch_norm
    # get batch_norm mean
    mean_variables = [
        v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        if 'moving_mean' in v.name
    ]
    for mean_var in mean_variables:
        mean_name = mean_var.name
        mean_values = sess.run(mean_var)
        variance_name = mean_name.replace('moving_mean', 'moving_variance')
        variance_var = [
            v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
            if v.name == variance_name
        ][0]
        variance_values = sess.run(variance_var)
        beta_name = mean_name.replace('moving_mean', 'beta')
        beta_var = [
            v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
            if v.name == beta_name
        ][0]
        beta_values = sess.run(beta_var)
        bias_name = mean_name.replace('batch_norm/moving_mean', 'biases')
        bias_var = [
            v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
            if v.name == bias_name
        ][0]
        bias_values = sess.run(bias_var)
        wt_name = mean_name.replace('batch_norm/moving_mean:0', '')
        wt_var = [
            v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
            if (wt_name in v.name and 'weights' in v.name)
        ][0]
        wt_values = sess.run(wt_var)
        wt_name = wt_var.name

        # Update weights
        tf.logging.info('Updating ' + wt_name)
        for l in range(wt_values.shape[3]):
            for k in range(wt_values.shape[2]):
                for j in range(wt_values.shape[1]):
                    for i in range(wt_values.shape[0]):
                        if "depthwise" in wt_name:  # depthwise batchnorm params are ordered differently
                            wt_values[i][j][k][l] *= 1.0 / np.sqrt(
                                variance_values[k] +
                                epsilon)  # gamma (scale factor) is 1.0
                        else:
                            wt_values[i][j][k][l] *= 1.0 / np.sqrt(
                                variance_values[l] +
                                epsilon)  # gamma (scale factor) is 1.0
        wt_values = sess.run(tf.assign(wt_var, wt_values))
        # Update biases
        tf.logging.info('Updating ' + bias_name)
        if "depthwise" in wt_name:
            depth_dim = wt_values.shape[2]
        else:
            depth_dim = wt_values.shape[3]
        for l in range(depth_dim):
            bias_values[l] = (1.0 * (bias_values[l] - mean_values[l]) / np.sqrt(variance_values[l] + epsilon)) + \
                             beta_values[l]
        bias_values = sess.run(tf.assign(bias_var, bias_values))

    # Write fused weights to ckpt file
    tf.logging.info('Saving new checkpoint at ' + checkpoint + '_bnfused')
    saver.save(sess, checkpoint + '_bnfused')
    tf.reset_default_graph()
    sess.close()
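# The nested loops above apply the usual batch-norm folding identities with
# gamma fixed at 1.0. A vectorized NumPy sketch of the same arithmetic, shown
# only to make the math explicit; it is not called by the example, and the
# function name is an illustrative assumption.
import numpy as np


def fold_batch_norm_params(weights, biases, moving_mean, moving_variance,
                           beta, epsilon=1e-3, depthwise=False):
    # Per-channel scale factor: 1 / sqrt(variance + epsilon).
    scale = 1.0 / np.sqrt(moving_variance + epsilon)
    # Depthwise kernels keep their batch-norm channel on axis 2 of the
    # [height, width, in_channels, multiplier] weight tensor; standard
    # convolutions keep it on axis 3.
    axis = 2 if depthwise else 3
    broadcast_shape = [1, 1, 1, 1]
    broadcast_shape[axis] = -1
    folded_weights = weights * scale.reshape(broadcast_shape)
    folded_biases = (biases - moving_mean) * scale + beta
    return folded_weights, folded_biases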
Example #32
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
    """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    feature_bin_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    preprocess: How the spectrogram is processed to produce features, for
      example 'mfcc' or 'average'.

  Raises:
    Exception: If the preprocessing mode isn't recognized.
  """

    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, feature_bin_count, preprocess)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    daudio = tf.identity(decoded_sample_data.audio, name='dao')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)

    if preprocess == 'average':
        fingerprint_input = tf.nn.pool(
            tf.expand_dims(spectrogram, -1),
            window_shape=[1, model_settings['average_window_width']],
            strides=[1, model_settings['average_window_width']],
            pooling_type='AVG',
            padding='SAME')
    elif preprocess == 'mfcc':
        fingerprint_input = contrib_audio.mfcc(
            spectrogram,
            sample_rate,
            dct_coefficient_count=model_settings['fingerprint_width'])
    else:
        raise Exception('Unknown preprocess mode "%s" (should be "mfcc" or'
                        ' "average")' % (preprocess))

    fingerprint_size = model_settings['fingerprint_size']
    reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])

    logits = models.create_model(reshaped_input,
                                 model_settings,
                                 model_architecture,
                                 is_training=False,
                                 runtime_settings=runtime_settings)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
Example #33
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
    """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    feature_bin_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    preprocess: How the spectrogram is processed to produce features, for
      example 'mfcc', 'average', or 'micro'.

  Returns:
    Input and output tensor objects.

  Raises:
    Exception: If the preprocessing mode isn't recognized.
  """

    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, feature_bin_count, preprocess)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.compat.v1.placeholder(tf.string, [],
                                                    name='wav_data')
    decoded_sample_data = tf.audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = audio_ops.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)

    if preprocess == 'average':
        fingerprint_input = tf.nn.pool(
            input=tf.expand_dims(spectrogram, -1),
            window_shape=[1, model_settings['average_window_width']],
            strides=[1, model_settings['average_window_width']],
            pooling_type='AVG',
            padding='SAME')
    elif preprocess == 'mfcc':
        fingerprint_input = audio_ops.mfcc(
            spectrogram,
            sample_rate,
            dct_coefficient_count=model_settings['fingerprint_width'])
    elif preprocess == 'micro':
        if not frontend_op:
            raise Exception(
                'Micro frontend op is currently not available when running TensorFlow'
                ' directly from Python, you need to build and run through Bazel, for'
                ' example'
                ' `bazel run tensorflow/examples/speech_commands:freeze_graph`'
            )
        sample_rate = model_settings['sample_rate']
        window_size_ms = (model_settings['window_size_samples'] *
                          1000) / sample_rate
        window_step_ms = (model_settings['window_stride_samples'] *
                          1000) / sample_rate
        int16_input = tf.cast(tf.multiply(decoded_sample_data.audio, 32767),
                              tf.int16)
        micro_frontend = frontend_op.audio_microfrontend(
            int16_input,
            sample_rate=sample_rate,
            window_size=window_size_ms,
            window_step=window_step_ms,
            num_channels=model_settings['fingerprint_width'],
            out_scale=1,
            out_type=tf.float32)
        fingerprint_input = tf.multiply(micro_frontend, (10.0 / 256.0))

    elif preprocess == "rune":
        fingerprint_input = np.random.uniform(0, 26, 1960).astype(np.float32)

    else:
        raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
                        ' "average", or "micro")' % (preprocess))

    fingerprint_size = model_settings['fingerprint_size']
    reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])

    logits = models.create_model(reshaped_input,
                                 model_settings,
                                 model_architecture,
                                 is_training=False,
                                 runtime_settings=runtime_settings)

    # Create an output to use for inference.
    softmax = tf.nn.softmax(logits, name='labels_softmax')

    return reshaped_input, softmax
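# A minimal usage sketch (not part of the example above) that runs the graph
# returned by create_inference_graph on one WAV file. It assumes the `models`
# module is in scope as in the other snippets; the flag values, checkpoint and
# file paths are illustrative only.
def predict_wav(wav_path, checkpoint_path):
    sess = tf.compat.v1.InteractiveSession()
    _, softmax = create_inference_graph(
        wanted_words='yes,no,up,down,left,right,on,off,stop,go',
        sample_rate=16000, clip_duration_ms=1000, clip_stride_ms=30,
        window_size_ms=30, window_stride_ms=10, feature_bin_count=40,
        model_architecture='conv', preprocess='mfcc')
    models.load_variables_from_checkpoint(sess, checkpoint_path)
    with open(wav_path, 'rb') as f:
        wav_data = f.read()
    # Feed the raw WAV bytes through the 'wav_data' placeholder created above
    # and read back the per-label scores.
    return sess.run(softmax, feed_dict={'wav_data:0': wav_data})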
Example #34
def run_inference(wanted_words, sample_rate, clip_duration_ms,
                  window_size_ms, window_stride_ms, dct_coefficient_count,
                  model_architecture, model_size_info, use_mfcc,
                  csv_writer):
  """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    model_size_info: Model dimensions : different lengths for different models
  """
  
  tf.logging.set_verbosity(tf.logging.INFO)
  sess = tf.InteractiveSession()
  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, dct_coefficient_count, use_mfcc)

  # audio_processor = input_data.AudioProcessor(
  audio_processor = AudioProcessor(
      FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
      FLAGS.unknown_percentage,
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings)
  
  label_count = model_settings['label_count']
  fingerprint_size = model_settings['fingerprint_size']

  fingerprint_input = tf.placeholder(
      tf.float32, [None, fingerprint_size], name='fingerprint_input')

  logits = models.create_model(
      fingerprint_input,
      model_settings,
      FLAGS.model_architecture,
      FLAGS.model_size_info,
      is_training=False)

  # ground_truth_input = tf.placeholder(
  #     tf.float32, [None, label_count], name='groundtruth_input')

  predicted_indices = tf.argmax(logits, 1)
  # expected_indices = tf.argmax(ground_truth_input, 1)
  # correct_prediction = tf.equal(predicted_indices, expected_indices)
  # confusion_matrix = tf.confusion_matrix(
  #     expected_indices, predicted_indices, num_classes=label_count)
  # evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
  models.load_variables_from_checkpoint(sess, FLAGS.checkpoint)

  # # training set
  # set_size = audio_processor.set_size('training')
  # tf.logging.info('set_size=%d', set_size)
  # total_accuracy = 0
  # total_conf_matrix = None
  # for i in xrange(0, set_size, FLAGS.batch_size):
  #   training_fingerprints, training_ground_truth = (
  #       audio_processor.get_data(FLAGS.batch_size, i, model_settings, 0.0,
  #                                0.0, 0, 'training', sess))
  #   training_accuracy, conf_matrix = sess.run(
  #       [evaluation_step, confusion_matrix],
  #       feed_dict={
  #           fingerprint_input: training_fingerprints,
  #           ground_truth_input: training_ground_truth,
  #       })
  #   batch_size = min(FLAGS.batch_size, set_size - i)
  #   total_accuracy += (training_accuracy * batch_size) / set_size
  #   if total_conf_matrix is None:
  #     total_conf_matrix = conf_matrix
  #   else:
  #     total_conf_matrix += conf_matrix
  # tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
  # tf.logging.info('Training accuracy = %.2f%% (N=%d)' %
  #                 (total_accuracy * 100, set_size))
  #
  #
  # # validation set
  # set_size = audio_processor.set_size('validation')
  # tf.logging.info('set_size=%d', set_size)
  # total_accuracy = 0
  # total_conf_matrix = None
  # for i in xrange(0, set_size, FLAGS.batch_size):
  #   validation_fingerprints, validation_ground_truth = (
  #       audio_processor.get_data(FLAGS.batch_size, i, model_settings, 0.0,
  #                                0.0, 0, 'validation', sess))
  #   validation_accuracy, conf_matrix = sess.run(
  #       [evaluation_step, confusion_matrix],
  #       feed_dict={
  #           fingerprint_input: validation_fingerprints,
  #           ground_truth_input: validation_ground_truth,
  #       })
  #   batch_size = min(FLAGS.batch_size, set_size - i)
  #   total_accuracy += (validation_accuracy * batch_size) / set_size
  #   if total_conf_matrix is None:
  #     total_conf_matrix = conf_matrix
  #   else:
  #     total_conf_matrix += conf_matrix
  # tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
  # tf.logging.info('Validation accuracy = %.2f%% (N=%d)' %
  #                 (total_accuracy * 100, set_size))
  
  # test set
  set_size = audio_processor.set_size('testing')
  tf.logging.info('set_size=%d', set_size)
  # total_accuracy = 0
  # total_conf_matrix = None
  expected_classes = []
  for i in xrange(0, set_size, FLAGS.batch_size):
    # test_fingerprints, test_ground_truth = audio_processor.get_data(
    #     FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess)
    # test_accuracy, conf_matrix = sess.run(
    #     [evaluation_step, confusion_matrix],
    #     feed_dict={
    #         fingerprint_input: test_fingerprints,
    #         ground_truth_input: test_ground_truth,
    #     })
    test_fingerprints, test_fnames = audio_processor.get_data(
        FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess)
    expected_classes = sess.run(
        predicted_indices,
        feed_dict={
            fingerprint_input: test_fingerprints,
        })
    # batch_size = min(FLAGS.batch_size, set_size - i)
    # print ("i, len(expeceted_classes), len(test_fnames)=", i, len(expected_classes), len(test_fnames))
    for j in range(min(FLAGS.batch_size, set_size - i)):
      csv_writer.writerow([test_fnames[j], class_labels[expected_classes[j]]])
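# A minimal sketch (not part of the example above) of how the csv_writer and
# the module-level class_labels that run_inference relies on might be set up
# by the caller. It assumes FLAGS has been populated as in the surrounding
# examples; the output file name and the use_mfcc flag are illustrative
# assumptions.
import csv

if __name__ == '__main__':
  class_labels = input_data.prepare_words_list(FLAGS.wanted_words.split(','))
  with open('test_predictions.csv', 'w') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['fname', 'label'])
    run_inference(FLAGS.wanted_words, FLAGS.sample_rate,
                  FLAGS.clip_duration_ms, FLAGS.window_size_ms,
                  FLAGS.window_stride_ms, FLAGS.dct_coefficient_count,
                  FLAGS.model_architecture, FLAGS.model_size_info,
                  FLAGS.use_mfcc, csv_writer)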
Example #35
def main(_):
  # We want to see all the logging messages for this tutorial.
  tf.logging.set_verbosity(tf.logging.INFO)

  # Start a new TensorFlow session.
  sess = tf.InteractiveSession()

  # Begin by making sure we have the training data we need. If you already have
  # training data of your own, use `--data_url= ` on the command line to avoid
  # downloading.
  model_settings = models.prepare_model_settings(
      len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
      FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
      FLAGS.window_stride_ms, FLAGS.feature_bin_count, FLAGS.preprocess)
  audio_processor = input_data.AudioProcessor(
      FLAGS.data_url, FLAGS.data_dir,
      FLAGS.silence_percentage, FLAGS.unknown_percentage,
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings, FLAGS.summaries_dir)
  fingerprint_size = model_settings['fingerprint_size']
  label_count = model_settings['label_count']
  time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)
  # Figure out the learning rates for each training phase. Since it's often
  # effective to have high learning rates at the start of training, followed by
  # lower levels towards the end, the number of steps and learning rates can be
  # specified as comma-separated lists to define the rate at each stage. For
  # example --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
  # will run 13,000 training loops in total, with a rate of 0.001 for the first
  # 10,000, and 0.0001 for the final 3,000.
  training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
  learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
  if len(training_steps_list) != len(learning_rates_list):
    raise Exception(
        '--how_many_training_steps and --learning_rate must be equal length '
        'lists, but are %d and %d long instead' % (len(training_steps_list),
                                                   len(learning_rates_list)))

  input_placeholder = tf.placeholder(
      tf.float32, [None, fingerprint_size], name='fingerprint_input')
  if FLAGS.quantize:
    fingerprint_min, fingerprint_max = input_data.get_features_range(
        model_settings)
    fingerprint_input = tf.fake_quant_with_min_max_args(
        input_placeholder, fingerprint_min, fingerprint_max)
  else:
    fingerprint_input = input_placeholder

  logits, dropout_prob = models.create_model(
      fingerprint_input,
      model_settings,
      FLAGS.model_architecture,
      is_training=True)

  # Define loss and optimizer
  ground_truth_input = tf.placeholder(
      tf.int64, [None], name='groundtruth_input')

  # Optionally we can add runtime checks to spot when NaNs or other symptoms of
  # numerical errors start occurring during training.
  control_dependencies = []
  if FLAGS.check_nans:
    checks = tf.add_check_numerics_ops()
    control_dependencies = [checks]

  # Create the back propagation and training evaluation machinery in the graph.
  with tf.name_scope('cross_entropy'):
    cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(
        labels=ground_truth_input, logits=logits)
  if FLAGS.quantize:
    tf.contrib.quantize.create_training_graph(quant_delay=0)
  with tf.name_scope('train'), tf.control_dependencies(control_dependencies):
    learning_rate_input = tf.placeholder(
        tf.float32, [], name='learning_rate_input')
    train_step = tf.train.GradientDescentOptimizer(
        learning_rate_input).minimize(cross_entropy_mean)
  predicted_indices = tf.argmax(logits, 1)
  correct_prediction = tf.equal(predicted_indices, ground_truth_input)
  confusion_matrix = tf.confusion_matrix(
      ground_truth_input, predicted_indices, num_classes=label_count)
  evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
  with tf.get_default_graph().name_scope('eval'):
    tf.summary.scalar('cross_entropy', cross_entropy_mean)
    tf.summary.scalar('accuracy', evaluation_step)

  global_step = tf.train.get_or_create_global_step()
  increment_global_step = tf.assign(global_step, global_step + 1)

  saver = tf.train.Saver(tf.global_variables())

  # Merge all the summaries and write them out to /tmp/retrain_logs (by default)
  merged_summaries = tf.summary.merge_all(scope='eval')
  train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                       sess.graph)
  validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/validation')

  tf.global_variables_initializer().run()

  start_step = 1

  if FLAGS.start_checkpoint:
    models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
    start_step = global_step.eval(session=sess)

  tf.logging.info('Training from step: %d ', start_step)

  # Save graph.pbtxt.
  tf.train.write_graph(sess.graph_def, FLAGS.train_dir,
                       FLAGS.model_architecture + '.pbtxt')

  # Save list of words.
  with gfile.GFile(
      os.path.join(FLAGS.train_dir, FLAGS.model_architecture + '_labels.txt'),
      'w') as f:
    f.write('\n'.join(audio_processor.words_list))

  # Training loop.
  training_steps_max = np.sum(training_steps_list)
  for training_step in xrange(start_step, training_steps_max + 1):
    # Figure out what the current learning rate is.
    training_steps_sum = 0
    for i in range(len(training_steps_list)):
      training_steps_sum += training_steps_list[i]
      if training_step <= training_steps_sum:
        learning_rate_value = learning_rates_list[i]
        break
    # Pull the audio samples we'll use for training.
    train_fingerprints, train_ground_truth = audio_processor.get_data(
        FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency,
        FLAGS.background_volume, time_shift_samples, 'training', sess)
    # Run the graph with this batch of training data.
    train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
        [
            merged_summaries,
            evaluation_step,
            cross_entropy_mean,
            train_step,
            increment_global_step,
        ],
        feed_dict={
            fingerprint_input: train_fingerprints,
            ground_truth_input: train_ground_truth,
            learning_rate_input: learning_rate_value,
            dropout_prob: 0.5
        })
    train_writer.add_summary(train_summary, training_step)
    tf.logging.info('Step #%d: rate %f, accuracy %.1f%%, cross entropy %f' %
                    (training_step, learning_rate_value, train_accuracy * 100,
                     cross_entropy_value))
    is_last_step = (training_step == training_steps_max)
    if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step:
      set_size = audio_processor.set_size('validation')
      total_accuracy = 0
      total_conf_matrix = None
      for i in xrange(0, set_size, FLAGS.batch_size):
        validation_fingerprints, validation_ground_truth = (
            audio_processor.get_data(FLAGS.batch_size, i, model_settings, 0.0,
                                     0.0, 0, 'validation', sess))
        # Run a validation step and capture training summaries for TensorBoard
        # with the `merged` op.
        validation_summary, validation_accuracy, conf_matrix = sess.run(
            [merged_summaries, evaluation_step, confusion_matrix],
            feed_dict={
                fingerprint_input: validation_fingerprints,
                ground_truth_input: validation_ground_truth,
                dropout_prob: 1.0
            })
        validation_writer.add_summary(validation_summary, training_step)
        batch_size = min(FLAGS.batch_size, set_size - i)
        total_accuracy += (validation_accuracy * batch_size) / set_size
        if total_conf_matrix is None:
          total_conf_matrix = conf_matrix
        else:
          total_conf_matrix += conf_matrix
      tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
      tf.logging.info('Step %d: Validation accuracy = %.1f%% (N=%d)' %
                      (training_step, total_accuracy * 100, set_size))

    # Save the model checkpoint periodically.
    if (training_step % FLAGS.save_step_interval == 0 or
        training_step == training_steps_max):
      checkpoint_path = os.path.join(FLAGS.train_dir,
                                     FLAGS.model_architecture + '.ckpt')
      tf.logging.info('Saving to "%s-%d"', checkpoint_path, training_step)
      saver.save(sess, checkpoint_path, global_step=training_step)

  set_size = audio_processor.set_size('testing')
  tf.logging.info('set_size=%d', set_size)
  total_accuracy = 0
  total_conf_matrix = None
  for i in xrange(0, set_size, FLAGS.batch_size):
    test_fingerprints, test_ground_truth = audio_processor.get_data(
        FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess)
    test_accuracy, conf_matrix = sess.run(
        [evaluation_step, confusion_matrix],
        feed_dict={
            fingerprint_input: test_fingerprints,
            ground_truth_input: test_ground_truth,
            dropout_prob: 1.0
        })
    batch_size = min(FLAGS.batch_size, set_size - i)
    total_accuracy += (test_accuracy * batch_size) / set_size
    if total_conf_matrix is None:
      total_conf_matrix = conf_matrix
    else:
      total_conf_matrix += conf_matrix
  tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
  tf.logging.info('Final test accuracy = %.1f%% (N=%d)' % (total_accuracy * 100,
                                                           set_size))
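# The accumulated confusion matrix above has one row per true label and one
# column per predicted label. A small sketch (not part of the example above)
# that turns it into per-class recall with NumPy; `labels` would typically be
# audio_processor.words_list.
import numpy as np


def per_class_recall(conf_matrix, labels):
  conf = np.asarray(conf_matrix, dtype=np.float64)
  # Diagonal entries are correct predictions; row sums are the true counts.
  correct = np.diag(conf)
  totals = conf.sum(axis=1)
  recalls = np.where(totals > 0, correct / np.maximum(totals, 1), 0.0)
  return dict(zip(labels, recalls))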
Example #36
def run_inference(wanted_words, sample_rate, clip_duration_ms,
                  window_size_ms, window_stride_ms, dct_coefficient_count,
                  model_architecture, model_size_info):
  """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    model_size_info: Model dimensions : different lengths for different models
  """
  
  tf.logging.set_verbosity(tf.logging.INFO)
  sess = tf.InteractiveSession()
  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, dct_coefficient_count)

  audio_processor = input_data.AudioProcessor(
      FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
      FLAGS.unknown_percentage,
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings)
  
  label_count = model_settings['label_count']
  fingerprint_size = model_settings['fingerprint_size']

  fingerprint_input = tf.placeholder(
      tf.float32, [None, fingerprint_size], name='fingerprint_input')

  logits = models.create_model(
      fingerprint_input,
      model_settings,
      FLAGS.model_architecture,
      FLAGS.model_size_info,
      is_training=False)

  ground_truth_input = tf.placeholder(
      tf.float32, [None, label_count], name='groundtruth_input')

  predicted_indices = tf.argmax(logits, 1)
  expected_indices = tf.argmax(ground_truth_input, 1)
  correct_prediction = tf.equal(predicted_indices, expected_indices)
  confusion_matrix = tf.confusion_matrix(
      expected_indices, predicted_indices, num_classes=label_count)
  evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
  models.load_variables_from_checkpoint(sess, FLAGS.checkpoint)

  # training set
  set_size = audio_processor.set_size('training')
  tf.logging.info('set_size=%d', set_size)
  total_accuracy = 0
  total_conf_matrix = None
  for i in xrange(0, set_size, FLAGS.batch_size):
    training_fingerprints, training_ground_truth = (
        audio_processor.get_data(FLAGS.batch_size, i, model_settings, 0.0,
                                 0.0, 0, 'training', sess))
    training_accuracy, conf_matrix = sess.run(
        [evaluation_step, confusion_matrix],
        feed_dict={
            fingerprint_input: training_fingerprints,
            ground_truth_input: training_ground_truth,
        })
    batch_size = min(FLAGS.batch_size, set_size - i)
    total_accuracy += (training_accuracy * batch_size) / set_size
    if total_conf_matrix is None:
      total_conf_matrix = conf_matrix
    else:
      total_conf_matrix += conf_matrix
  tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
  tf.logging.info('Training accuracy = %.2f%% (N=%d)' %
                  (total_accuracy * 100, set_size))


  # validation set
  set_size = audio_processor.set_size('validation')
  tf.logging.info('set_size=%d', set_size)
  total_accuracy = 0
  total_conf_matrix = None
  for i in xrange(0, set_size, FLAGS.batch_size):
    validation_fingerprints, validation_ground_truth = (
        audio_processor.get_data(FLAGS.batch_size, i, model_settings, 0.0,
                                 0.0, 0, 'validation', sess))
    validation_accuracy, conf_matrix = sess.run(
        [evaluation_step, confusion_matrix],
        feed_dict={
            fingerprint_input: validation_fingerprints,
            ground_truth_input: validation_ground_truth,
        })
    batch_size = min(FLAGS.batch_size, set_size - i)
    total_accuracy += (validation_accuracy * batch_size) / set_size
    if total_conf_matrix is None:
      total_conf_matrix = conf_matrix
    else:
      total_conf_matrix += conf_matrix
  tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
  tf.logging.info('Validation accuracy = %.2f%% (N=%d)' %
                  (total_accuracy * 100, set_size))
  
  # test set
  set_size = audio_processor.set_size('testing')
  tf.logging.info('set_size=%d', set_size)
  total_accuracy = 0
  total_conf_matrix = None
  for i in xrange(0, set_size, FLAGS.batch_size):
    test_fingerprints, test_ground_truth = audio_processor.get_data(
        FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess)
    test_accuracy, conf_matrix = sess.run(
        [evaluation_step, confusion_matrix],
        feed_dict={
            fingerprint_input: test_fingerprints,
            ground_truth_input: test_ground_truth,
        })
    batch_size = min(FLAGS.batch_size, set_size - i)
    total_accuracy += (test_accuracy * batch_size) / set_size
    if total_conf_matrix is None:
      total_conf_matrix = conf_matrix
    else:
      total_conf_matrix += conf_matrix
  tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
  tf.logging.info('Test accuracy = %.2f%% (N=%d)' % (total_accuracy * 100,
                                                           set_size))