def run_quant_inference(wanted_words, sample_rate, clip_duration_ms,
                        window_size_ms, window_stride_ms,
                        dct_coefficient_count, model_architecture,
                        model_size_info):
  """Evaluates a checkpointed model, optionally exporting weights and retraining.

  Builds the inference graph, restores `FLAGS.checkpoint` and visits every
  trainable variable, printing its value range and the number of fractional
  bits an 8-bit fixed-point representation would leave for it. The values
  themselves are written back unchanged (no rounding is applied here).

  When `FLAGS.update_weights` is set, variables whose name starts with `W_o`,
  `b_o` or `lstm` are exported as C headers below `weights/`.
  When `FLAGS.if_retrain` is set, the model is trained for
  `FLAGS.retrain_steps` steps; every 200 steps and on the last step the
  validation split is evaluated and a checkpoint is saved whenever the
  validation accuracy improves. Finally the validation and testing splits are
  evaluated and the results are logged.

  Besides the arguments, the following flags are read: `data_url`, `data_dir`,
  `silence_percentage`, `unknown_percentage`, `validation_percentage`,
  `testing_percentage`, `act_max`, `checkpoint`, `batch_size`,
  `update_weights`, `if_retrain`, `retrain_steps`, `summaries_dir` and
  `new_checkpoint`.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze. Also used as
      the input size when exporting the `lstm0` layer.
    model_architecture: Name of the kind of model to generate.
    model_size_info: Model dimensions : different lengths for different
      models. Element 0 is used as the input size of every LSTM layer other
      than `lstm0` when exporting headers.
  """
  tf.logging.set_verbosity(tf.logging.INFO)
  sess = tf.InteractiveSession()
  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, dct_coefficient_count)
  audio_processor = input_data.AudioProcessor(
      FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
      FLAGS.unknown_percentage, wanted_words.split(','),
      FLAGS.validation_percentage, FLAGS.testing_percentage, model_settings)

  label_count = model_settings['label_count']
  fingerprint_size = model_settings['fingerprint_size']
  # 100 ms worth of samples; only used for the retraining batches.
  time_shift_samples = int((100.0 * sample_rate) / 1000)

  fingerprint_input = tf.placeholder(
      tf.float32, [None, fingerprint_size], name='fingerprint_input')
  logits = models.create_model(
      fingerprint_input, model_settings, model_architecture, model_size_info,
      FLAGS.act_max, is_training=False)
  ground_truth_input = tf.placeholder(
      tf.float32, [None, label_count], name='groundtruth_input')

  if FLAGS.if_retrain:
    with tf.name_scope('cross_entropy'):
      cross_entropy_mean = tf.reduce_mean(
          tf.nn.softmax_cross_entropy_with_logits(
              labels=ground_truth_input, logits=logits))
    tf.summary.scalar('cross_entropy', cross_entropy_mean)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.name_scope('train'), tf.control_dependencies(update_ops):
      train_op = tf.train.AdamOptimizer(learning_rate=0.0001)
      train_step = tf.contrib.slim.learning.create_train_op(
          cross_entropy_mean, train_op)
    saver = tf.train.Saver(tf.global_variables())
    merged = tf.summary.merge_all()
    # The test writer is never written to again; creating it records the
    # graph under <summaries_dir>/test.
    test_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/test',
                                        sess.graph)
    train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train')
    validation_writer = tf.summary.FileWriter(
        FLAGS.summaries_dir + '/validation')
    # Initialise everything (including optimizer variables) before the
    # checkpoint restore below.
    tf.global_variables_initializer().run()

  predicted_indices = tf.argmax(logits, 1)
  expected_indices = tf.argmax(ground_truth_input, 1)
  correct_prediction = tf.equal(predicted_indices, expected_indices)
  confusion_matrix = tf.confusion_matrix(
      expected_indices, predicted_indices, num_classes=label_count)
  evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
  models.load_variables_from_checkpoint(sess, FLAGS.checkpoint)

  if FLAGS.update_weights:
    # Imported lazily so the dependency is only needed when exporting.
    from save_data import prepare_header, prepare_lstm_headers

  for v in tf.trainable_variables():
    var_name = str(v.name)
    var_values = sess.run(v)
    min_value = var_values.min()
    max_value = var_values.max()
    int_bits = int(np.ceil(np.log2(max(abs(min_value), abs(max_value)))))
    # ap_fixed<8,1> uses 7 decimal bits and 1 bit for sign.
    # dec_bits is only reported; rounding to 8 bits is not applied here.
    dec_bits = 7 - int_bits

    if FLAGS.update_weights:
      scope_name = var_name.split(':')[0]
      if scope_name.startswith('W_o'):
        os.makedirs('weights/fc', exist_ok=True)
        # The matrix is written transposed, hence the swapped dimensions.
        c_var_name = ('Wy[' + str(var_values.shape[1]) + '][' +
                      str(var_values.shape[0]) + ']')
        np.savetxt('weights/fc/Wy.h', np.transpose(var_values),
                   delimiter=',', newline=',\n')
        prepare_header('weights/fc/Wy.h', 'Wy_t ' + c_var_name)
      elif scope_name.startswith('b_o'):
        # The bias may be visited before the weight matrix, so make sure the
        # output directory exists here as well.
        os.makedirs('weights/fc', exist_ok=True)
        c_var_name = 'by[' + str(var_values.shape[0]) + ']'
        np.savetxt('weights/fc/by.h', var_values[None], delimiter=',')
        prepare_header('weights/fc/by.h', 'by_t ' + c_var_name)
      elif scope_name.startswith('lstm'):
        lstm_name = scope_name.split('/')
        param_name = lstm_name[-1]
        # for lstmp
        if lstm_name[-2] == 'projection':
          param_name = 'projection'
        if lstm_name[1] == 'lstm0':
          layer_input_size = dct_coefficient_count
        else:
          layer_input_size = model_size_info[0]  # TODO
        prepare_lstm_headers('weights/' + lstm_name[1], var_values,
                             input_size=layer_input_size,
                             param_name=param_name)

    # update the weights in tensorflow graph for quantizing the activations
    var_values = sess.run(tf.assign(v, var_values))
    print(var_name + ' number of wts/bias: ' + str(var_values.shape) +
          ' dec bits: ' + str(dec_bits) +
          ' max: (' + str(var_values.max()) + ',' + str(max_value) + ')' +
          ' min: (' + str(var_values.min()) + ',' + str(min_value) + ')')

  if FLAGS.if_retrain:
    best_accuracy = 0
    for training_step in range(FLAGS.retrain_steps):
      # Pull the audio samples we'll use for training.
      train_fingerprints, train_ground_truth = audio_processor.get_data(
          FLAGS.batch_size, 0, model_settings, 0.8, 0.1, time_shift_samples,
          'training', sess)
      # Run the graph with this batch of training data.
      train_summary, train_accuracy, cross_entropy_value, _ = sess.run(
          [merged, evaluation_step, cross_entropy_mean, train_step],
          feed_dict={
              fingerprint_input: train_fingerprints,
              ground_truth_input: train_ground_truth
          })
      train_writer.add_summary(train_summary, training_step)
      tf.logging.info(
          'Step #%d: accuracy %.2f%%, cross entropy %f' %
          (training_step, train_accuracy * 100, cross_entropy_value))

      # range() stops at retrain_steps - 1, so that is the last step.
      is_last_step = (training_step == FLAGS.retrain_steps - 1)
      if (training_step % 200) == 0 or is_last_step:
        set_size = audio_processor.set_size('validation')
        total_accuracy = 0
        total_conf_matrix = None
        for i in range(0, set_size, FLAGS.batch_size):
          validation_fingerprints, validation_ground_truth = (
              audio_processor.get_data(FLAGS.batch_size, i, model_settings,
                                       0.0, 0.0, 0, 'validation', sess))
          # Run a validation step and capture training summaries for
          # TensorBoard with the `merged` op.
          validation_summary, validation_accuracy, conf_matrix = sess.run(
              [merged, evaluation_step, confusion_matrix],
              feed_dict={
                  fingerprint_input: validation_fingerprints,
                  ground_truth_input: validation_ground_truth
              })
          validation_writer.add_summary(validation_summary, training_step)
          batch_size = min(FLAGS.batch_size, set_size - i)
          total_accuracy += (validation_accuracy * batch_size) / set_size
          if total_conf_matrix is None:
            total_conf_matrix = conf_matrix
          else:
            total_conf_matrix += conf_matrix
        tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
        tf.logging.info(
            'Step %d: Validation accuracy = %.2f%% (N=%d)' %
            (training_step, total_accuracy * 100, set_size))

        # Save the model checkpoint when validation accuracy improves
        if total_accuracy > best_accuracy:
          best_accuracy = total_accuracy
          checkpoint_path = os.path.join(
              FLAGS.new_checkpoint,
              model_architecture + '_' + str(int(best_accuracy * 10000)) +
              '.ckpt')
          tf.logging.info('Saving best model to "%s-%d"', checkpoint_path,
                          training_step)
          saver.save(sess, checkpoint_path, global_step=training_step)
        tf.logging.info('So far the best validation accuracy is %.2f%%' %
                        (best_accuracy * 100))

  # Final evaluation on the validation and testing splits.
  for split, label in (('validation', 'Validation'), ('testing', 'Test')):
    set_size = audio_processor.set_size(split)
    tf.logging.info('set_size=%d', set_size)
    total_accuracy = 0
    total_conf_matrix = None
    for i in range(0, set_size, FLAGS.batch_size):
      fingerprints, ground_truth = audio_processor.get_wav_files(
          FLAGS.batch_size, i, model_settings, split)
      accuracy, conf_matrix = sess.run(
          [evaluation_step, confusion_matrix],
          feed_dict={
              fingerprint_input: fingerprints,
              ground_truth_input: ground_truth,
          })
      # The last batch may be smaller; weight each batch by its real size.
      batch_size = min(FLAGS.batch_size, set_size - i)
      total_accuracy += (accuracy * batch_size) / set_size
      if total_conf_matrix is None:
        total_conf_matrix = conf_matrix
      else:
        total_conf_matrix += conf_matrix
    tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
    tf.logging.info('%s accuracy = %.2f%% (N=%d)' %
                    (label, total_accuracy * 100, set_size))
def run_full_quant_inference(
    wanted_words, sample_rate, clip_duration_ms, window_size_ms,
    window_stride_ms, dct_coefficient_count, model_architecture,
    model_size_info, act_max, data_url, data_dir, silence_percentage,
    unknown_percentage, validation_percentage, testing_percentage, checkpoint,
    batch_size, lower_frequency_limit, upper_frequency_limit,
    filterbank_channel_count, is_bg_volume_constant, feature_extraction):
  """Quantizes a checkpointed model to 8 bits and emits C sources for it.

  Restores `checkpoint`, rounds every trainable variable to 8-bit fixed point
  and writes the integer values as `#define` macros to `ds_cnn_weights.h`.
  The per-layer bias left-shift / output right-shift macros go to `ds_cnn.h`,
  and `ds_cnn.cpp` is generated through `helper`. All three files are written
  to the current working directory, and both headers are overwritten. The
  rounded values are assigned back into the graph, and the accuracy of the
  quantized model on the testing split is logged.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    model_size_info: Model dimensions : different lengths for different
      models. Element 0 is used as the number of convolution layers.
    act_max: Sequence indexed by layer number. Passed to
      `models.create_model`, and used here as `7 - log2(act_max[i])` to get
      the fractional bits of each layer's activations.
    data_url: Forwarded to `input_data.AudioProcessor`.
    data_dir: Forwarded to `input_data.AudioProcessor`.
    silence_percentage: Forwarded to `input_data.AudioProcessor`; a non-zero
      value is also passed as a truthy flag to
      `input_data.prepare_words_list`.
    unknown_percentage: Forwarded to `input_data.AudioProcessor`.
    validation_percentage: Forwarded to `input_data.AudioProcessor`.
    testing_percentage: Forwarded to `input_data.AudioProcessor`.
    checkpoint: Checkpoint to restore the variables from.
    batch_size: Number of samples requested per evaluation batch.
    lower_frequency_limit: Forwarded to `models.prepare_model_settings`.
    upper_frequency_limit: Forwarded to `models.prepare_model_settings`.
    filterbank_channel_count: Forwarded to `models.prepare_model_settings`.
    is_bg_volume_constant: Forwarded to `AudioProcessor.get_data`.
    feature_extraction: Forwarded to `AudioProcessor.get_data`.
  """
  tf.logging.set_verbosity(tf.logging.INFO)
  sess = tf.InteractiveSession()
  # try/finally so the session is released even if export or evaluation fails.
  try:
    words_list = input_data.prepare_words_list(
        wanted_words.split(','), silence_percentage != 0)
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count, lower_frequency_limit,
        upper_frequency_limit, filterbank_channel_count)
    audio_processor = input_data.AudioProcessor(
        data_url, data_dir, silence_percentage, unknown_percentage,
        wanted_words.split(','), validation_percentage, testing_percentage,
        model_settings)

    label_count = model_settings['label_count']
    fingerprint_size = model_settings['fingerprint_size']
    fingerprint_input = tf.placeholder(
        tf.float32, [None, fingerprint_size], name='fingerprint_input')
    logits = models.create_model(
        fingerprint_input, model_settings, model_architecture,
        model_size_info, act_max, is_training=False)
    ground_truth_input = tf.placeholder(
        tf.float32, [None, label_count], name='groundtruth_input')

    predicted_indices = tf.argmax(logits, 1)
    expected_indices = tf.argmax(ground_truth_input, 1)
    correct_prediction = tf.equal(predicted_indices, expected_indices)
    confusion_matrix = tf.confusion_matrix(
        expected_indices, predicted_indices, num_classes=label_count)
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    models.load_variables_from_checkpoint(sess, checkpoint)

    num_layers = model_size_info[0]
    helper.write_ds_cnn_cpp_file('ds_cnn.cpp', num_layers)

    ds_cnn_h_fname = "ds_cnn.h"
    weights_h_fname = "ds_cnn_weights.h"

    # 'w' truncates any previous header before the preamble is written.
    with open(ds_cnn_h_fname, 'w') as f:
      helper.write_ds_cnn_h_beginning(f, wanted_words, sample_rate,
                                      clip_duration_ms, window_size_ms,
                                      window_stride_ms, dct_coefficient_count,
                                      model_size_info, act_max)
    # Start from an empty weights header; it is appended to per variable.
    with open(weights_h_fname, 'w'):
      pass

    def macro_prefix(conv_no, layer):
      """Returns the macro name stem for a layer (CONV1, CONVn_DS/PW, FINAL_FC)."""
      if conv_no == 1:
        return 'CONV1'
      if conv_no <= num_layers:
        return 'CONV{}_{}'.format(conv_no, 'DS' if layer % 2 == 0 else 'PW')
      return 'FINAL_FC'

    # Quantize weights to 8-bits using (min,max) and write to file
    total_layers = len(act_max)
    layer_no = 1
    weights_dec_bits = 0
    for v in tf.trainable_variables():
      var_name = str(v.name)
      var_values = sess.run(v)
      min_value = var_values.min()
      max_value = var_values.max()
      int_bits = int(np.ceil(np.log2(max(abs(min_value), abs(max_value)))))
      dec_bits = 7 - int_bits
      # convert to [-128,128) or int8
      var_values = np.round(var_values * 2**dec_bits)
      var_name = var_name.replace('/', '_').replace(':', '_')
      if len(var_values.shape) > 2:
        # convolution layer weights
        transposed_wts = np.transpose(var_values, (3, 0, 1, 2))
      else:
        # fully connected layer weights or biases of any layer
        transposed_wts = np.transpose(var_values)
      # convert back original range but quantized to 8-bits or 256 levels
      var_values = var_values / (2**dec_bits)
      # update the weights in tensorflow graph for quantizing the activations
      var_values = sess.run(tf.assign(v, var_values))
      print(var_name + ' number of wts/bias: ' + str(var_values.shape) +
            ' dec bits: ' + str(dec_bits) +
            ' max: (' + str(var_values.max()) + ',' + str(max_value) + ')' +
            ' min: (' + str(var_values.min()) + ',' + str(min_value) + ')')

      conv_layer_no = layer_no // 2 + 1
      wt_or_bias = 'WT' if 'weights' in var_name else 'BIAS'
      with open(weights_h_fname, 'a') as f:
        f.write('#define {}_{} {{'.format(
            macro_prefix(conv_layer_no, layer_no), wt_or_bias))
        transposed_wts.tofile(f, sep=", ", format="%d")
        f.write('}\n')

      if 'weights' in var_name:
        # Remembered for the bias variable of the same layer.
        weights_dec_bits = dec_bits
      if 'biases' in var_name:
        if layer_no == total_layers - 2:
          # if average pool layer, go to the next one
          layer_no += 1
        input_dec_bits = 7 - np.log2(act_max[layer_no - 1])
        output_dec_bits = 7 - np.log2(act_max[layer_no])
        weights_x_input_dec_bits = input_dec_bits + weights_dec_bits
        bias_lshift = int(weights_x_input_dec_bits - dec_bits)
        output_rshift = int(weights_x_input_dec_bits - output_dec_bits)
        print("Layer no: {} | Bias Lshift: {} | Output Rshift: {}\n".format(
            layer_no, bias_lshift, output_rshift))
        # The stem uses layer_no after the possible average-pool skip above.
        shift_prefix = macro_prefix(conv_layer_no, layer_no)
        with open(ds_cnn_h_fname, 'a') as f:
          f.write("#define {}_BIAS_LSHIFT {}\n".format(
              shift_prefix, bias_lshift))
          f.write("#define {}_OUT_RSHIFT {}\n".format(
              shift_prefix, output_rshift))
        layer_no += 1

    input_dec_bits = 7 - np.log2(act_max[len(act_max) - 3])
    output_dec_bits = 7 - np.log2(act_max[len(act_max) - 2])
    with open(ds_cnn_h_fname, 'a') as f:
      f.write("#define AVG_POOL_OUT_LSHIFT {}\n\n".format(
          int(output_dec_bits - input_dec_bits)))
      helper.write_ds_cnn_h_end(f, num_layers)

    # Evaluate result after quantization on testing set
    set_size = audio_processor.set_size('testing')
    tf.logging.info('set_size=%d', set_size)
    total_accuracy = 0
    total_conf_matrix = None
    for i in range(0, set_size, batch_size):
      test_fingerprints, test_ground_truth = audio_processor.get_data(
          batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess,
          is_bg_volume_constant, feature_extraction)
      test_accuracy, conf_matrix = sess.run(
          [evaluation_step, confusion_matrix],
          feed_dict={
              fingerprint_input: test_fingerprints,
              ground_truth_input: test_ground_truth,
          })
      # The last batch may be smaller. Use a separate name so the
      # `batch_size` argument is not clobbered.
      current_batch_size = min(batch_size, set_size - i)
      total_accuracy += (test_accuracy * current_batch_size) / set_size
      if total_conf_matrix is None:
        total_conf_matrix = conf_matrix
      else:
        total_conf_matrix += conf_matrix
    tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
    tf.logging.info('Test accuracy = %.2f%% (N=%d)' %
                    (total_accuracy * 100, set_size))
  finally:
    sess.close()
def run_quant_inference(wanted_words, sample_rate, clip_duration_ms,
                        window_size_ms, window_stride_ms,
                        dct_coefficient_count, model_architecture,
                        model_size_info, act_max, data_url, data_dir,
                        silence_percentage, unknown_percentage, checkpoint,
                        batch_size, include_silence, lower_frequency_limit,
                        upper_frequency_limit, filterbank_channel_count,
                        is_bg_volume_constant, feature_extraction):
  """Returns the testing-split accuracy of a model after 8-bit weight rounding.

  Works on a fresh default graph: builds the model, restores `checkpoint`,
  rounds every trainable variable to 8-bit fixed point (7 bits shared between
  integer and fractional part, chosen from the variable's largest magnitude)
  and evaluates the result on the testing split. The session is closed and the
  default graph reset before returning, including when an error is raised.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    model_size_info: Model dimensions : different lengths for different models
    act_max: Forwarded to `models.create_model`.
    data_url: Forwarded to `input_data.AudioProcessor`.
    data_dir: Forwarded to `input_data.AudioProcessor`.
    silence_percentage: Forwarded to `input_data.AudioProcessor`; a non-zero
      value is also passed as a truthy flag to
      `input_data.prepare_words_list`.
    unknown_percentage: Forwarded to `input_data.AudioProcessor`.
    checkpoint: Checkpoint to restore the variables from.
    batch_size: Number of samples requested per evaluation batch.
    include_silence: Unused; kept for interface compatibility.
    lower_frequency_limit: Forwarded to `models.prepare_model_settings`.
    upper_frequency_limit: Forwarded to `models.prepare_model_settings`.
    filterbank_channel_count: Forwarded to `models.prepare_model_settings`.
    is_bg_volume_constant: Forwarded to `AudioProcessor.get_data`.
    feature_extraction: Forwarded to `AudioProcessor.get_data`.

  Returns:
    The fraction of correctly classified testing samples, with each batch
    weighted by its size.
  """
  tf.reset_default_graph()
  tf.logging.set_verbosity(tf.logging.INFO)
  sess = tf.InteractiveSession()
  try:
    words_list = input_data.prepare_words_list(
        wanted_words.split(','), silence_percentage != 0)
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count, lower_frequency_limit,
        upper_frequency_limit, filterbank_channel_count)
    # The two percentages are passed as 0 and 100.
    # NOTE(review): presumably validation=0 / testing=100, so the whole
    # dataset lands in 'testing' - confirm against AudioProcessor.
    audio_processor = input_data.AudioProcessor(
        data_url, data_dir, silence_percentage, unknown_percentage,
        wanted_words.split(','), 0, 100, model_settings)

    label_count = model_settings['label_count']
    fingerprint_size = model_settings['fingerprint_size']
    fingerprint_input = tf.placeholder(
        tf.float32, [None, fingerprint_size], name='fingerprint_input')
    logits = models.create_model(
        fingerprint_input, model_settings, model_architecture,
        model_size_info, act_max, is_training=False)
    ground_truth_input = tf.placeholder(
        tf.float32, [None, label_count], name='groundtruth_input')

    predicted_indices = tf.argmax(logits, 1)
    expected_indices = tf.argmax(ground_truth_input, 1)
    correct_prediction = tf.equal(predicted_indices, expected_indices)
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    models.load_variables_from_checkpoint(sess, checkpoint)

    for v in tf.trainable_variables():
      var_values = sess.run(v)
      min_value = var_values.min()
      max_value = var_values.max()
      int_bits = int(np.ceil(np.log2(max(abs(min_value), abs(max_value)))))
      dec_bits = 7 - int_bits
      # convert to [-128,128) or int8, then back to the original range
      var_values = np.round(var_values * 2**dec_bits) / (2**dec_bits)
      # update the weights in tensorflow graph for quantizing the activations
      sess.run(tf.assign(v, var_values))

    # test set
    set_size = audio_processor.set_size('testing')
    total_accuracy = 0
    for i in range(0, set_size, batch_size):
      test_fingerprints, test_ground_truth = audio_processor.get_data(
          batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess,
          is_bg_volume_constant, feature_extraction)
      test_accuracy = sess.run(
          evaluation_step,
          feed_dict={
              fingerprint_input: test_fingerprints,
              ground_truth_input: test_ground_truth,
          })
      # The last batch may be smaller. Use a separate name so the
      # `batch_size` argument is not clobbered.
      current_batch_size = min(batch_size, set_size - i)
      total_accuracy += (test_accuracy * current_batch_size) / set_size
    return total_accuracy
  finally:
    # Close the session first: resetting the default graph while a session
    # is still active is documented as undefined behaviour.
    sess.close()
    tf.reset_default_graph()
def run_quant_inference(wanted_words, sample_rate, clip_duration_ms,
                        window_size_ms, window_stride_ms,
                        dct_coefficient_count, model_architecture,
                        model_size_info):
  """Quantizes a checkpointed model to 8 bits, dumps `weights.h` and evaluates.

  Restores `FLAGS.checkpoint`, rounds every trainable variable to 8-bit fixed
  point, writes the integer values to `weights.h` in the current directory
  (one `#define <variable name> {...}` per variable, overwriting any existing
  file) and assigns the rounded values back into the graph. The accuracy and
  confusion matrix of the quantized model are then logged for the training,
  validation and testing splits.

  Besides the arguments, the following flags are read: `data_url`, `data_dir`,
  `silence_percentage`, `unknown_percentage`, `validation_percentage`,
  `testing_percentage`, `act_max`, `checkpoint` and `batch_size`.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    model_size_info: Model dimensions : different lengths for different models
  """
  tf.logging.set_verbosity(tf.logging.INFO)
  sess = tf.InteractiveSession()
  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, dct_coefficient_count)
  audio_processor = input_data.AudioProcessor(
      FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
      FLAGS.unknown_percentage, wanted_words.split(','),
      FLAGS.validation_percentage, FLAGS.testing_percentage, model_settings)

  label_count = model_settings['label_count']
  fingerprint_size = model_settings['fingerprint_size']
  fingerprint_input = tf.placeholder(
      tf.float32, [None, fingerprint_size], name='fingerprint_input')
  logits = models.create_model(
      fingerprint_input, model_settings, model_architecture, model_size_info,
      FLAGS.act_max, is_training=False)
  ground_truth_input = tf.placeholder(
      tf.float32, [None, label_count], name='groundtruth_input')

  predicted_indices = tf.argmax(logits, 1)
  expected_indices = tf.argmax(ground_truth_input, 1)
  correct_prediction = tf.equal(predicted_indices, expected_indices)
  confusion_matrix = tf.confusion_matrix(
      expected_indices, predicted_indices, num_classes=label_count)
  evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
  models.load_variables_from_checkpoint(sess, FLAGS.checkpoint)

  # Quantize weights to 8-bits using (min,max) and write to file
  with open('weights.h', 'w'):
    pass  # start from an empty header; variables are appended below
  for v in tf.trainable_variables():
    var_name = str(v.name)
    var_values = sess.run(v)
    min_value = var_values.min()
    max_value = var_values.max()
    int_bits = int(np.ceil(np.log2(max(abs(min_value), abs(max_value)))))
    dec_bits = 7 - int_bits
    # convert to [-128,128) or int8
    var_values = np.round(var_values * 2**dec_bits)
    var_name = var_name.replace('/', '_').replace(':', '_')
    if len(var_values.shape) > 2:
      # convolution layer weights
      transposed_wts = np.transpose(var_values, (3, 0, 1, 2))
    else:
      # fully connected layer weights or biases of any layer
      transposed_wts = np.transpose(var_values)
    with open('weights.h', 'a') as f:
      f.write('#define ' + var_name + ' {')
      transposed_wts.tofile(f, sep=", ", format="%d")
      f.write('}\n')
    # convert back original range but quantized to 8-bits or 256 levels
    var_values = var_values / (2**dec_bits)
    # update the weights in tensorflow graph for quantizing the activations
    var_values = sess.run(tf.assign(v, var_values))
    print(var_name + ' number of wts/bias: ' + str(var_values.shape) +
          ' dec bits: ' + str(dec_bits) +
          ' max: (' + str(var_values.max()) + ',' + str(max_value) + ')' +
          ' min: (' + str(var_values.min()) + ',' + str(min_value) + ')')

  # Evaluate the quantized model on each split in turn.
  for split, label in (('training', 'Training'),
                       ('validation', 'Validation'),
                       ('testing', 'Test')):
    set_size = audio_processor.set_size(split)
    tf.logging.info('set_size=%d', set_size)
    total_accuracy = 0
    total_conf_matrix = None
    for i in range(0, set_size, FLAGS.batch_size):
      fingerprints, ground_truth = audio_processor.get_data(
          FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, split, sess)
      accuracy, conf_matrix = sess.run(
          [evaluation_step, confusion_matrix],
          feed_dict={
              fingerprint_input: fingerprints,
              ground_truth_input: ground_truth,
          })
      # The last batch may be smaller; weight each batch by its real size.
      batch_size = min(FLAGS.batch_size, set_size - i)
      total_accuracy += (accuracy * batch_size) / set_size
      if total_conf_matrix is None:
        total_conf_matrix = conf_matrix
      else:
        total_conf_matrix += conf_matrix
    tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
    tf.logging.info('%s accuracy = %.2f%% (N=%d)' %
                    (label, total_accuracy * 100, set_size))
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture,
                           model_size_info):
  """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model in the default graph. The
  graph takes already-computed features through the `fingerprint_input`
  placeholder (the WAV decoding / spectrogram / MFCC front end is not part of
  this graph) and exposes its predictions through the `labels_softmax` op.
  Nothing is returned; callers look the nodes up by name.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Currently unused; kept for
      interface compatibility.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    model_size_info: Model dimensions : different lengths for different models
  """
  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = quant_models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, dct_coefficient_count)

  fingerprint_frequency_size = model_settings['dct_coefficient_count']
  fingerprint_time_size = model_settings['spectrogram_length']
  flat_fingerprint_size = fingerprint_time_size * fingerprint_frequency_size

  fingerprint_input = tf.placeholder(
      tf.float32, [None, flat_fingerprint_size], name='fingerprint_input')
  # This reshape is not consumed below; it is kept so the set of ops added
  # to the graph stays the same.
  tf.reshape(fingerprint_input, [-1, flat_fingerprint_size])

  # Activation ranges still come from FLAGS; there is no argument for them.
  logits = quant_models.create_model(
      fingerprint_input, model_settings, model_architecture, model_size_info,
      FLAGS.act_max, is_training=False)

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
def run_quant_inference(act_max):
  """Returns the validation accuracy of the 8-bit model for given activation ranges.

  All model and data settings are read from `FLAGS` (`wanted_words`,
  `sample_rate`, `clip_duration_ms`, `window_size_ms`, `window_stride_ms`,
  `dct_coefficient_count`, `model_architecture`, `model_size_info`,
  `data_url`, `data_dir`, `validation_dir`, `silence_percentage`,
  `unknown_percentage`, `checkpoint`, `batch_size`). If
  `FLAGS.validation_dir` is None it is set to `FLAGS.data_dir`.

  Works on a fresh default graph: restores `FLAGS.checkpoint`, rounds every
  trainable variable to 8-bit fixed point, and checks that the bias left-shift
  and output right-shift implied by `act_max` are non-negative for every
  layer. It then evaluates the model on the validation split. The session is
  closed and the default graph reset before returning, including when an
  error is raised.

  Args:
    act_max: Sequence indexed by layer number. Passed to
      `models.create_model`, and used here as `7 - log2(act_max[i])` to get
      the fractional bits of each layer's activations. The shift check is
      skipped for a layer when its own entry or the preceding entry is 0.

  Returns:
    The fraction of correctly classified validation samples, with each batch
    weighted by its size. Returns -1 if any layer would need a negative bias
    or output shift.
  """
  wanted_words = FLAGS.wanted_words
  sample_rate = FLAGS.sample_rate
  clip_duration_ms = FLAGS.clip_duration_ms
  window_size_ms = FLAGS.window_size_ms
  window_stride_ms = FLAGS.window_stride_ms
  dct_coefficient_count = FLAGS.dct_coefficient_count
  model_architecture = FLAGS.model_architecture
  model_size_info = FLAGS.model_size_info

  total_layers = len(act_max)
  layer_no = 1
  weights_dec_bits = 0

  tf.reset_default_graph()
  tf.logging.set_verbosity(tf.logging.INFO)
  sess = tf.InteractiveSession()
  try:
    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)

    if FLAGS.validation_dir is None:
      FLAGS.validation_dir = FLAGS.data_dir
    # The two percentages are passed as 100 and 0.
    # NOTE(review): presumably validation=100 / testing=0, so the whole
    # directory lands in 'validation' - confirm against AudioProcessor.
    validation_audio_processor = input_data.AudioProcessor(
        FLAGS.data_url, FLAGS.validation_dir, FLAGS.silence_percentage,
        FLAGS.unknown_percentage, FLAGS.wanted_words.split(','), 100, 0,
        model_settings)

    label_count = model_settings['label_count']
    fingerprint_size = model_settings['fingerprint_size']
    fingerprint_input = tf.placeholder(
        tf.float32, [None, fingerprint_size], name='fingerprint_input')
    logits = models.create_model(
        fingerprint_input, model_settings, model_architecture,
        model_size_info, act_max, is_training=False)
    ground_truth_input = tf.placeholder(
        tf.float32, [None, label_count], name='groundtruth_input')

    predicted_indices = tf.argmax(logits, 1)
    expected_indices = tf.argmax(ground_truth_input, 1)
    correct_prediction = tf.equal(predicted_indices, expected_indices)
    confusion_matrix = tf.confusion_matrix(
        expected_indices, predicted_indices, num_classes=label_count)
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    models.load_variables_from_checkpoint(sess, FLAGS.checkpoint)

    # Quantize weights to 8-bits using (min,max)
    for v in tf.trainable_variables():
      var_name = str(v.name)
      var_values = sess.run(v)
      min_value = var_values.min()
      max_value = var_values.max()
      int_bits = int(np.ceil(np.log2(max(abs(min_value), abs(max_value)))))
      dec_bits = 7 - int_bits
      # convert to [-128,128) or int8, then back to the original range
      var_values = np.round(var_values * 2**dec_bits) / (2**dec_bits)
      # update the weights in tensorflow graph for quantizing the activations
      sess.run(tf.assign(v, var_values))

      if 'weights' in var_name:
        # Remembered for the bias variable of the same layer.
        weights_dec_bits = dec_bits
      if 'biases' in var_name:
        # if average pool layer, go to the next one
        if layer_no == total_layers - 2:
          layer_no += 1
        if act_max[layer_no] != 0 and act_max[layer_no - 1] != 0:
          input_dec_bits = 7 - np.log2(act_max[layer_no - 1])
          output_dec_bits = 7 - np.log2(act_max[layer_no])
          weights_x_input_dec_bits = input_dec_bits + weights_dec_bits
          bias_lshift = int(weights_x_input_dec_bits - dec_bits)
          output_rshift = int(weights_x_input_dec_bits - output_dec_bits)
          if bias_lshift < 0 or output_rshift < 0:
            print("CMSIS-5 NN doesn't support negative shift now!")
            return -1
        layer_no += 1

    # validation set
    set_size = validation_audio_processor.set_size('validation')
    tf.logging.info('set_size=%d', set_size)
    total_accuracy = 0
    total_conf_matrix = None
    for i in range(0, set_size, FLAGS.batch_size):
      validation_fingerprints, validation_ground_truth = (
          validation_audio_processor.get_data(
              FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'validation',
              sess))
      validation_accuracy, conf_matrix = sess.run(
          [evaluation_step, confusion_matrix],
          feed_dict={
              fingerprint_input: validation_fingerprints,
              ground_truth_input: validation_ground_truth,
          })
      # The last batch may be smaller; weight each batch by its real size.
      batch_size = min(FLAGS.batch_size, set_size - i)
      total_accuracy += (validation_accuracy * batch_size) / set_size
      if total_conf_matrix is None:
        total_conf_matrix = conf_matrix
      else:
        total_conf_matrix += conf_matrix
    tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
    tf.logging.info('Validation accuracy = %.2f%% (N=%d)' %
                    (total_accuracy * 100, set_size))
    return total_accuracy
  finally:
    # Runs on every exit path. Close the session first: resetting the
    # default graph while a session is still active is documented as
    # undefined behaviour.
    sess.close()
    tf.reset_default_graph()
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture,
                           model_size_info, preprocess):
    """Builds the WAV-to-softmax graph used when freezing a model.

    Adds to the default TensorFlow graph: a string placeholder named
    'wav_data', WAV decoding, a spectrogram, the feature extraction selected
    by `preprocess`, the model itself, and two output nodes named
    'labels_softmax' and 'identity_out'.

    Args:
      wanted_words: Comma-separated list of the words we're trying to
        recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      clip_stride_ms: How often to run recognition. Useful for models with
        cache.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
      model_size_info: Model size description, forwarded unchanged to
        `create_model`.
      preprocess: How the spectrogram is processed to produce features, for
        example 'mfcc', 'average', or 'micro'.

    Returns:
      A `(reshaped_input, identity)` tuple. `reshaped_input` is the feature
      tensor reshaped to `[-1, fingerprint_size]`; `identity` is the
      'identity_out' node wrapping the softmax output.

    Raises:
      Exception: If the preprocessing mode isn't recognized, or if it is
        'micro' and the micro frontend op is not available.
    """
    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count, preprocess)
    print(model_settings)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.compat.v1.placeholder(
        tf.string, [], name='wav_data')
    decoded_sample_data = tf.audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = audio_ops.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)

    if preprocess == 'average':
        fingerprint_input = tf.nn.pool(
            input=tf.expand_dims(spectrogram, -1),
            window_shape=[1, model_settings['average_window_width']],
            strides=[1, model_settings['average_window_width']],
            pooling_type='AVG',
            padding='SAME')
    elif preprocess == 'mfcc':
        fingerprint_input = audio_ops.mfcc(
            spectrogram,
            sample_rate,
            dct_coefficient_count=model_settings['fingerprint_width'])
    elif preprocess == 'micro':
        if not frontend_op:
            raise Exception(
                'Micro frontend op is currently not available when running TensorFlow'
                ' directly from Python, you need to build and run through Bazel, for'
                ' example'
                ' `bazel run tensorflow/examples/speech_commands:freeze_graph`'
            )
        # Derive the frontend timing from the prepared settings. Local names
        # are used so the caller-supplied arguments are not silently rebound.
        frontend_sample_rate = model_settings['sample_rate']
        frontend_window_size_ms = (
            model_settings['window_size_samples'] * 1000) / frontend_sample_rate
        frontend_window_step_ms = (
            model_settings['window_stride_samples'] * 1000) / frontend_sample_rate
        int16_input = tf.cast(
            tf.multiply(decoded_sample_data.audio, 32767), tf.int16)
        micro_frontend = frontend_op.audio_microfrontend(
            int16_input,
            sample_rate=frontend_sample_rate,
            window_size=frontend_window_size_ms,
            window_step=frontend_window_step_ms,
            num_channels=model_settings['fingerprint_width'],
            out_scale=1,
            out_type=tf.float32)
        fingerprint_input = tf.multiply(micro_frontend, (10.0 / 256.0))
    else:
        raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
                        ' "average", or "micro")' % (preprocess))

    fingerprint_size = model_settings['fingerprint_size']
    reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])

    # The model is fed the un-reshaped features; `reshaped_input` is only
    # returned to the caller.
    # NOTE(review): confirm this asymmetry is intended.
    model_module = quant_models if FLAGS.freeze_bn_folded_model else models
    logits = model_module.create_model(
        fingerprint_input,
        model_settings,
        model_architecture,
        model_size_info,
        is_training=False,
        runtime_settings=runtime_settings)

    # Create an output to use for inference.
    softmax = tf.nn.softmax(logits, name='labels_softmax')
    identity = tf.identity(softmax, name='identity_out')
    return reshaped_input, identity
def run_quant_inference(wanted_words, sample_rate, clip_duration_ms,
                        window_size_ms, window_stride_ms,
                        dct_coefficient_count, model_architecture,
                        model_size_info):
    """Quantizes a checkpoint's weights to 8 bits and exports them as C macros.

    Builds the model graph, restores `FLAGS.checkpoint`, and then for every
    trainable variable:
      * picks `dec_bits = 7 - ceil(log2(max(|min|, |max|)))`,
      * rounds `value * 2**dec_bits` and appends it to 'weights.h' as a
        `#define`,
      * assigns the de-scaled (quantized) values back into the graph.

    For the "ds_cnn" architecture it additionally writes 'ds_cnn.c',
    'ds_cnn.h' and 'ds_cnn_weights.h' through `helper`, including per-layer
    bias-left-shift / output-right-shift macros derived from `act_max`.

    If any entry of `FLAGS.act_max` is 0, `quant_act_max.get_best_act_max`
    is called to obtain the activation maxima.

    Args:
      wanted_words: Comma-separated list of the words we're trying to
        recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
      model_size_info: Model dimensions : different lengths for different
        models. For "ds_cnn", element 0 is used as the number of layers.
    """
    act_max = FLAGS.act_max
    # NOTE(review): the loop keeps iterating the original list after
    # `act_max` is rebound, so get_best_act_max is invoked once per zero
    # entry - confirm a single call is not what was intended.
    for maxium in act_max:
        if maxium == 0:
            print('Calling quant_act_max.py to get best act_max')
            quant_act_max.FLAGS = FLAGS
            act_max = quant_act_max.get_best_act_max(act_max)

    tf.logging.set_verbosity(tf.logging.INFO)
    sess = tf.InteractiveSession()
    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)

    label_count = model_settings['label_count']
    fingerprint_size = model_settings['fingerprint_size']

    fingerprint_input = tf.placeholder(tf.float32, [None, fingerprint_size],
                                       name='fingerprint_input')

    logits = models.create_model(fingerprint_input,
                                 model_settings,
                                 model_architecture,
                                 model_size_info,
                                 act_max,
                                 is_training=False)

    ground_truth_input = tf.placeholder(tf.float32, [None, label_count],
                                        name='groundtruth_input')

    # Evaluation ops (accuracy and confusion matrix); they are only defined
    # in the visible part of this function, not run.
    predicted_indices = tf.argmax(logits, 1)
    expected_indices = tf.argmax(ground_truth_input, 1)
    correct_prediction = tf.equal(predicted_indices, expected_indices)
    confusion_matrix = tf.confusion_matrix(expected_indices,
                                           predicted_indices,
                                           num_classes=label_count)
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    models.load_variables_from_checkpoint(sess, FLAGS.checkpoint)

    # Quantize weights to 8-bits using (min,max) and write to file.
    # Opening in 'wb' mode and closing immediately creates/truncates the
    # file so the later append-mode writes start from an empty file.
    f = open('weights.h', 'wb')
    f.close()

    if model_architecture == "ds_cnn":
        num_layers = model_size_info[0]
        helper.write_ds_cnn_c_file('ds_cnn.c', num_layers)

        ds_cnn_h_fname = "ds_cnn.h"
        weights_h_fname = "ds_cnn_weights.h"

        # Truncate the header, then append its preamble.
        f = open(ds_cnn_h_fname, 'wb')
        f.close()

        with open(ds_cnn_h_fname, 'a') as f:
            helper.write_ds_cnn_h_beginning(f, wanted_words, sample_rate,
                                            clip_duration_ms, window_size_ms,
                                            window_stride_ms,
                                            dct_coefficient_count,
                                            model_size_info, act_max)

        # Quantize weights to 8-bits using (min,max) and write to file
        f = open(weights_h_fname, 'wb')
        f.close()

    total_layers = len(act_max)
    # layer_no indexes act_max; it is advanced once per 'biases' variable.
    layer_no = 1
    weights_dec_bits = 0
    for v in tf.trainable_variables():
        var_name = str(v.name)
        var_values = sess.run(v)
        min_value = var_values.min()
        max_value = var_values.max()
        # Bits needed for the integer part of the largest magnitude; the
        # remaining bits of the 7 non-sign bits hold the fraction.
        int_bits = int(np.ceil(np.log2(max(abs(min_value), abs(max_value)))))
        dec_bits = 7 - int_bits
        # convert to [-128,128) or int8
        var_values = np.round(var_values * 2**dec_bits)
        # Make the TensorFlow variable name usable as a C macro name.
        var_name = var_name.replace('/', '_')
        var_name = var_name.replace(':', '_')
        with open('weights.h', 'a') as f:
            f.write('#define ' + var_name + ' {')
        if (len(var_values.shape) > 2):  # convolution layer weights
            # Move the last axis to the front before flattening to text.
            transposed_wts = np.transpose(var_values, (3, 0, 1, 2))
        else:  # fully connected layer weights or biases of any layer
            transposed_wts = np.transpose(var_values)
        with open('weights.h', 'a') as f:
            transposed_wts.tofile(f, sep=", ", format="%d")
            f.write('}\n')
        # convert back original range but quantized to 8-bits or 256 levels
        var_values = var_values / (2**dec_bits)
        # update the weights in tensorflow graph for quantizing the activations
        var_values = sess.run(tf.assign(v, var_values))
        # Report quantized vs. original extrema for this variable.
        print(var_name+' number of wts/bias: '+str(var_values.shape)+\
                ' dec bits: '+str(dec_bits)+\
                ' max: ('+str(var_values.max())+','+str(max_value)+')'+\
                ' min: ('+str(var_values.min())+','+str(min_value)+')')

        if model_architecture == "ds_cnn":
            # Layer 1 maps to CONV1; after that each pair of layers maps to
            # one CONVn block (even layer_no -> DS, odd layer_no -> PW).
            conv_layer_no = layer_no // 2 + 1

            wt_or_bias = 'BIAS'
            if 'weights' in var_name:
                wt_or_bias = 'WT'

            with open(weights_h_fname, 'a') as f:
                if conv_layer_no == 1:
                    f.write('#define CONV1_{} {{'.format(wt_or_bias))
                elif conv_layer_no <= num_layers:
                    if layer_no % 2 == 0:
                        f.write('#define CONV{}_DS_{} {{'.format(
                            conv_layer_no, wt_or_bias))
                    else:
                        f.write('#define CONV{}_PW_{} {{'.format(
                            conv_layer_no, wt_or_bias))
                else:
                    f.write('#define FINAL_FC_{} {{'.format(wt_or_bias))

                transposed_wts.tofile(f, sep=", ", format="%d")
                f.write('}\n')

        if 'weights' in var_name:
            # Remembered so the matching bias can compute its shift.
            weights_dec_bits = dec_bits

        if 'biases' in var_name:
            # if average pool layer, go to the next one
            if layer_no == total_layers - 2:
                layer_no += 1
            input_dec_bits = 7 - np.log2(act_max[layer_no - 1])
            output_dec_bits = 7 - np.log2(act_max[layer_no])
            weights_x_input_dec_bits = input_dec_bits + weights_dec_bits
            bias_lshift = int(weights_x_input_dec_bits - dec_bits)
            output_rshift = int(weights_x_input_dec_bits - output_dec_bits)
            print("Layer no: {} | Bias Lshift: {} | Output Rshift: {}\n".
                  format(layer_no, bias_lshift, output_rshift))
            # NOTE(review): conv_layer_no and num_layers are only assigned
            # when model_architecture == "ds_cnn", and the file name is
            # hard-coded here rather than using ds_cnn_h_fname - confirm
            # this section is only reached for ds_cnn.
            with open('ds_cnn.h', 'a') as f:
                if conv_layer_no == 1:
                    f.write("#define CONV1_BIAS_LSHIFT {}\n".format(
                        bias_lshift))
                    f.write("#define CONV1_OUT_RSHIFT {}\n".format(
                        output_rshift))
                elif conv_layer_no <= num_layers:
                    if layer_no % 2 == 0:
                        f.write(
                            "#define CONV{}_DS_BIAS_LSHIFT {}\n".format(
                                conv_layer_no, bias_lshift))
                        f.write("#define CONV{}_DS_OUT_RSHIFT {}\n".format(
                            conv_layer_no, output_rshift))
                    else:
                        f.write(
                            "#define CONV{}_PW_BIAS_LSHIFT {}\n".format(
                                conv_layer_no, bias_lshift))
                        f.write("#define CONV{}_PW_OUT_RSHIFT {}\n".format(
                            conv_layer_no, output_rshift))
                else:
                    f.write("#define FINAL_FC_BIAS_LSHIFT {}\n".format(
                        bias_lshift))
                    f.write("#define FINAL_FC_OUT_RSHIFT {}\n".format(
                        output_rshift))

            layer_no += 1

    if model_architecture == "ds_cnn":
        # Shift for the average-pool output, from the third-last and
        # second-last act_max entries; clamped so it is never negative.
        input_dec_bits = 7 - np.log2(act_max[len(act_max) - 3])
        output_dec_bits = 7 - np.log2(act_max[len(act_max) - 2])

        if input_dec_bits > output_dec_bits:
            output_dec_bits = input_dec_bits

        with open(ds_cnn_h_fname, 'a') as f:
            f.write("#define AVG_POOL_OUT_LSHIFT {}\n\n".format(
                int(output_dec_bits - input_dec_bits)))
            helper.write_ds_cnn_h_end(f, num_layers)