def main(_):
  best_acc = 0
  best_step = 0
  best_acc_istrain = 0
  best_step_istrain = 0
  # We want to see all the logging messages for this tutorial.
  tf.logging.set_verbosity(tf.logging.INFO)

  # Start a new TensorFlow session.
  sess = tf.InteractiveSession()

  # Begin by making sure we have the training data we need. If you already
  # have training data of your own, use `--data_url= ` on the command line to
  # avoid downloading.
  model_settings = models.prepare_model_settings(
      len(input_data_filler.prepare_words_list_my(FLAGS.wanted_words.split(','))),
      FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
      FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
  audio_processor = input_data_filler.AudioProcessor(
      FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
      FLAGS.unknown_percentage, FLAGS.wanted_words.split(','),
      FLAGS.validation_percentage, FLAGS.testing_percentage, model_settings)
  fingerprint_size = model_settings['fingerprint_size']
  label_count = model_settings['label_count']
  time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)

  # Figure out the learning rates for each training phase. Since it's often
  # effective to have high learning rates at the start of training, followed
  # by lower levels towards the end, the number of steps and learning rates
  # can be specified as comma-separated lists to define the rate at each
  # stage. For example --how_many_training_steps=10000,3000
  # --learning_rate=0.001,0.0001 will run 13,000 training loops in total, with
  # a rate of 0.001 for the first 10,000, and 0.0001 for the final 3,000.
  # (An illustrative lookup for this piecewise schedule appears as a sketch
  # further below.)
  training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
  learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
  if len(training_steps_list) != len(learning_rates_list):
    raise Exception(
        '--how_many_training_steps and --learning_rate must be equal length '
        'lists, but are %d and %d long instead' % (len(training_steps_list),
                                                   len(learning_rates_list)))

  ##############################################
  ########### TensorFlow graph setup ###########
  fingerprint_input = tf.placeholder(
      tf.float32, [None, fingerprint_size], name='fingerprint_input')

  ########### Model creation ###########
  istrain = tf.placeholder(tf.bool, name='istrain')
  logits = models.create_model(
      fingerprint_input,
      model_settings,
      FLAGS.model_architecture,
      is_training=istrain)
  # Earlier variant that baked the training mode into the graph:
  # logits, dropout_prob = models.create_model(
  #     fingerprint_input,
  #     model_settings,
  #     FLAGS.model_architecture,
  #     is_training=True)

  # Define loss and optimizer.
  ########### Ground truth ###########
  ground_truth_input = tf.placeholder(
      tf.float32, [None, label_count], name='groundtruth_input')

  # Optionally we can add runtime checks to spot when NaNs or other symptoms
  # of numerical errors start occurring during training.
  control_dependencies = []
  if FLAGS.check_nans:
    checks = tf.add_check_numerics_ops()
    control_dependencies = [checks]
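  # A minimal sketch (illustrative only; the training loop below swaps in a
  # fixed exponential decay instead) of the piecewise-constant schedule the
  # comma-separated flags above describe: e.g. steps of [10000, 3000] with
  # rates of [0.001, 0.0001] give 0.001 for steps 1..10000 and 0.0001 for
  # steps 10001..13000.
  def piecewise_learning_rate(step):
    steps_so_far = 0
    for phase_steps, phase_rate in zip(training_steps_list,
                                       learning_rates_list):
      steps_so_far += phase_steps
      if step <= steps_so_far:
        return phase_rate
    return learning_rates_list[-1]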
  # Create the back propagation and training evaluation machinery in the graph.
  ########### Cross-entropy calculation ###########
  # Earlier variant with an extra weight-norm penalty:
  # with tf.name_scope('cross_entropy'):
  #   cross_entropy_mean = tf.reduce_mean(
  #       tf.nn.softmax_cross_entropy_with_logits(
  #           labels=ground_truth_input, logits=logits)) + beta * loss_norm
  with tf.name_scope('cross_entropy'):
    cross_entropy_mean = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            labels=ground_truth_input, logits=logits))
  tf.summary.scalar('cross_entropy', cross_entropy_mean)

  ##### Learning rate, accuracy, and confusion matrix #####
  # learning_rate_input    learning-rate input (tf.placeholder)
  # train_step             training op (optimizer)
  # predicted_indices      predicted output indices
  # expected_indices       expected (ground-truth) output indices
  # correct_prediction     per-example correctness
  # confusion_matrix       confusion matrix
  # evaluation_step        classification accuracy (per batch)
  # global_step            global training step
  # increment_global_step  global training step increment
  learning_rate_input = tf.placeholder(
      tf.float32, [], name='learning_rate_input')
  # Wrap the optimizer in the UPDATE_OPS dependencies so that batch-norm
  # moving averages are refreshed on every training step.
  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  with tf.control_dependencies(update_ops):
    train_step = tf.train.AdamOptimizer(
        learning_rate_input).minimize(cross_entropy_mean)
  # Earlier variant scoped under 'train' with the NaN-check dependencies:
  # with tf.name_scope('train'), tf.control_dependencies(control_dependencies):
  #   learning_rate_input = tf.placeholder(
  #       tf.float32, [], name='learning_rate_input')
  #   # train_step = tf.train.GradientDescentOptimizer(
  #   #     learning_rate_input).minimize(cross_entropy_mean)
  #   with tf.control_dependencies(update_ops):
  #     train_step = tf.train.AdamOptimizer(
  #         learning_rate_input).minimize(cross_entropy_mean)
  predicted_indices = tf.argmax(logits, 1)
  expected_indices = tf.argmax(ground_truth_input, 1)
  correct_prediction = tf.equal(predicted_indices, expected_indices)
  confusion_matrix = tf.confusion_matrix(
      expected_indices, predicted_indices, num_classes=label_count)
  evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
  acc = tf.summary.scalar('accuracy', evaluation_step)
  global_step = tf.train.get_or_create_global_step()
  increment_global_step = tf.assign(global_step, global_step + 1)

  # max_to_keep=None keeps every checkpoint (the default keeps only 5).
  saver = tf.train.Saver(tf.global_variables(), max_to_keep=None)

  # Merge all the summaries and write them out to /tmp/retrain_logs (by default)
  merged_summaries = tf.summary.merge_all()
  validation_merged_summaries = tf.summary.merge(
      [tf.get_collection(tf.GraphKeys.SUMMARIES, 'accuracy'),
       tf.get_collection(tf.GraphKeys.SUMMARIES, 'cross_entropy')])
  test_summaries = tf.summary.merge([acc])
  test_summaries_istrain = tf.summary.merge([acc])
  train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                       sess.graph)
  validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/validation')
  test_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/test')
  test_istrain_writer = tf.summary.FileWriter(FLAGS.summaries_dir +
                                              '/test_istrain')

  tf.global_variables_initializer().run()

  start_step = 1
  if FLAGS.start_checkpoint:
    models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
    start_step = global_step.eval(session=sess)

  tf.logging.info('Training from step: %d ', start_step)

  # Save graph.pbtxt.
  tf.train.write_graph(sess.graph_def, FLAGS.train_dir,
                       FLAGS.model_architecture + '.pbtxt')

  # Save list of words.
  with gfile.GFile(
      os.path.join(FLAGS.train_dir, FLAGS.model_architecture + '_labels.txt'),
      'w') as f:
    f.write('\n'.join(audio_processor.words_list))

  # Rough sizes of the candidate architectures:
  # model1: fc
  # model2: conv: ~940k parameters
  # model3: low_latency_conv: comparable to model1
  # model4: ~750k parameters

  # Training loop.
  #############################################
  ############### Main loop ###################
  #############################################
  training_steps_max = np.sum(training_steps_list)
  for training_step in xrange(start_step, training_steps_max + 1):
    # Figure out what the current learning rate is.
    ####### Automatic learning-rate decay #######
    # Exponential decay from learning_rates_list[0] down to 2% of that value
    # over the first 12,000 steps, then constant. Float division keeps the
    # decay smooth (integer division would hold the exponent at 0 for the
    # whole ramp).
    if training_step < 12000 + 1:
      learning_rate_value = (
          learning_rates_list[0] * 0.02**(training_step / 12000.0))
    else:
      learning_rate_value = learning_rates_list[0] * 0.02  # 0.015 12000
    # Original piecewise-constant schedule from the flag lists:
    # training_steps_sum = 0
    # for i in range(len(training_steps_list)):
    #   training_steps_sum += training_steps_list[i]
    #   if training_step <= training_steps_sum:
    #     learning_rate_value = learning_rates_list[i]
    #     break

    # Pull the audio samples we'll use for training.
    ####### Fetch a batch from the audio processor #######
    # get_data(self, how_many, offset, model_settings, background_frequency,
    #          background_volume_range, time_shift, mode, sess)
    ########################################################################
    train_fingerprints, train_ground_truth = audio_processor.get_data_my(
        FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency,
        FLAGS.background_volume, time_shift_samples, 'training', sess)
    # Earlier rescaling experiment:
    # mid = np.abs(np.max(train_fingerprints) + np.min(train_fingerprints)) / 2
    # half = np.max(train_fingerprints) - np.min(train_fingerprints)
    # train_fingerprints = ((train_fingerprints + mid) / half * 255).astype(int)
    #### Input normalization ####
    # train_fingerprints = input_normalization(train_fingerprints)

    # Quantize the features, then run the graph with this batch of training
    # data (see the standalone sketch after this function).
    train_fingerprints = np.round(train_fingerprints)
    train_fingerprints = np.clip(train_fingerprints, -100, 100)
    train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
        [
            merged_summaries, evaluation_step, cross_entropy_mean, train_step,
            increment_global_step
        ],
        feed_dict={
            fingerprint_input: train_fingerprints,
            ground_truth_input: train_ground_truth,
            learning_rate_input: learning_rate_value,
            istrain: True
        })
    train_writer.add_summary(train_summary, training_step)
    tf.logging.info('Step #%d: rate %f, accuracy %.1f%%, cross entropy %f' %
                    (training_step, learning_rate_value, train_accuracy * 100,
                     cross_entropy_value))
    is_last_step = (training_step == training_steps_max)
    if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step:
      set_size = audio_processor.set_size('validation')
      total_accuracy = 0
      total_conf_matrix = None
      ###################################################################
      ## Accumulate accuracy and the confusion matrix over the validation set
      for i in xrange(0, set_size, FLAGS.batch_size):
        validation_fingerprints, validation_ground_truth = (
            audio_processor.get_data_my(FLAGS.batch_size, i, model_settings,
                                        0.0, 0.0, 0, 'validation', sess))
        # Earlier rescaling experiment:
        # mid = np.abs(np.max(validation_fingerprints) +
        #              np.min(validation_fingerprints)) / 2
        # half = np.max(validation_fingerprints) - np.min(validation_fingerprints)
        # validation_fingerprints = ((validation_fingerprints + mid) / half
        #                            * 255).astype(int)
        #### Input normalization ####
        # validation_fingerprints = input_normalization(validation_fingerprints)

        # Run a validation step and capture training summaries for TensorBoard
        # with the `merged` op.
        validation_fingerprints = np.round(validation_fingerprints)
        validation_fingerprints = np.clip(validation_fingerprints, -100, 100)
        validation_summaries, validation_accuracy, conf_matrix = sess.run(
            [validation_merged_summaries, evaluation_step, confusion_matrix],
            feed_dict={
                fingerprint_input: validation_fingerprints,
                ground_truth_input: validation_ground_truth,
                istrain: True
            })
        validation_writer.add_summary(validation_summaries, training_step)
        batch_size = min(FLAGS.batch_size, set_size - i)
        total_accuracy += (validation_accuracy * batch_size) / set_size
        if total_conf_matrix is None:
          total_conf_matrix = conf_matrix
        else:
          total_conf_matrix += conf_matrix
      tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
      tf.logging.info('Step %d: Validation accuracy = %.1f%% (N=%d)' %
                      (training_step, total_accuracy * 100, set_size))

      ###################################################################
      ## Accuracy and confusion matrix over the test set ##
      set_size = audio_processor.set_size('testing')
      tf.logging.info('set_size=%d', set_size)
      test_fingerprints, test_ground_truth = audio_processor.get_data_my(
          -1, 0, model_settings, 0.0, 0.0, 0, 'testing', sess)
      # Earlier rescaling experiment:
      # mid = np.abs(np.max(test_fingerprints) + np.min(test_fingerprints)) / 2
      # half = np.max(test_fingerprints) - np.min(test_fingerprints)
      # test_fingerprints = ((test_fingerprints + mid) / half * 255).astype(int)
      test_fingerprints = np.round(test_fingerprints)
      test_fingerprints = np.clip(test_fingerprints, -100, 100)
      final_summary, test_accuracy, conf_matrix = sess.run(
          [test_summaries, evaluation_step, confusion_matrix],
          feed_dict={
              fingerprint_input: test_fingerprints,
              ground_truth_input: test_ground_truth,
              istrain: False
          })
      final_summary_istrain, test_accuracy_istrain = sess.run(
          [test_summaries_istrain, evaluation_step],
          feed_dict={
              fingerprint_input: test_fingerprints,
              ground_truth_input: test_ground_truth,
              istrain: True
          })
      if test_accuracy > best_acc:
        best_acc = test_accuracy
        best_step = training_step
      if test_accuracy_istrain > best_acc_istrain:
        best_acc_istrain = test_accuracy_istrain
        best_step_istrain = training_step
      test_writer.add_summary(final_summary, training_step)
      test_istrain_writer.add_summary(final_summary_istrain, training_step)
      tf.logging.info('Confusion Matrix:\n %s' % (conf_matrix))
      tf.logging.info('test accuracy = %.1f%% (N=%d)' %
                      (test_accuracy * 100, set_size))
      tf.logging.info('test_istrain accuracy = %.1f%% (N=%d)' %
                      (test_accuracy_istrain * 100, set_size))
      tf.logging.info('Best test accuracy so far = %.1f%% (N=%d) at step %d' %
                      (best_acc * 100, set_size, best_step))
      tf.logging.info(
          'Best test_istrain accuracy so far = %.1f%% (N=%d) at step %d' %
          (best_acc_istrain * 100, set_size, best_step_istrain))

    # Save the model checkpoint periodically.
    if (training_step % FLAGS.save_step_interval == 0 or
        training_step == training_steps_max):
      checkpoint_path = os.path.join(
          FLAGS.train_dir + '/' + FLAGS.model_architecture,
          FLAGS.model_architecture + '.ckpt')
      tf.logging.info('Saving to "%s-%d"', checkpoint_path, training_step)
      saver.save(sess, checkpoint_path, global_step=training_step)
    # On the final step, record the best results seen during the run.
    if training_step == training_steps_max:
      print_line = (
          'Best test accuracy = %.1f%% (N=%d) at step %d\n'
          % (best_acc * 100, set_size, best_step) +
          'Best test_istrain accuracy = %.1f%% (N=%d) at step %d'
          % (best_acc_istrain * 100, set_size, best_step_istrain))
      with open(FLAGS.train_dir + '/' + FLAGS.model_architecture +
                '/details.txt', 'w') as f:
        f.write(print_line)
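# A minimal standalone sketch of the feature quantization used in the
# training, validation, and test branches above: fingerprints are rounded to
# integers and saturated to [-100, 100] before being fed to the network. The
# helper name is illustrative, not part of the original pipeline.
def quantize_fingerprints(fingerprints, clip_min=-100, clip_max=100):
  """Rounds features to integers and clips them to a fixed range."""
  return np.clip(np.round(fingerprints), clip_min, clip_max)

# Example: quantize_fingerprints(np.array([-250.7, 0.26, 312.1]))
# -> array([-100., 0., 100.])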
def main(_):
  num = 0
  words_list = input_data_filler.prepare_words_list_my(FLAGS.wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), FLAGS.sample_rate, FLAGS.clip_duration_ms,
      FLAGS.window_size_ms, FLAGS.window_stride_ms,
      FLAGS.dct_coefficient_count)
  audio_processor = input_data_filler.AudioProcessor(
      '', FLAGS.data_dir, FLAGS.silence_percentage,
      10,  # unknown percentage (hard-coded instead of FLAGS.unknown_percentage)
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings)

  # Total length of the generated track, e.g. 600 s at the default duration.
  output_audio_sample_count = FLAGS.sample_rate * FLAGS.test_duration_seconds
  output_audio = np.zeros((output_audio_sample_count,), dtype=np.float32)

  # Set up background audio: consecutive segments overlap by
  # background_crossover_ms, and the volume ramps up/down over half of that
  # crossover on each side.
  background_crossover_ms = 500
  background_segment_duration_ms = (
      FLAGS.clip_duration_ms + background_crossover_ms)
  background_segment_duration_samples = int(
      (background_segment_duration_ms * FLAGS.sample_rate) / 1000)
  background_segment_stride_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  background_ramp_samples = int(
      ((background_crossover_ms / 2) * FLAGS.sample_rate) / 1000)

  # Mix the background audio into the main track.
  how_many_backgrounds = int(
      math.ceil(output_audio_sample_count / background_segment_stride_samples))
  for i in range(how_many_backgrounds):
    output_offset = int(i * background_segment_stride_samples)
    background_index = np.random.randint(len(audio_processor.background_data))
    background_samples = audio_processor.background_data[background_index]
    background_offset = np.random.randint(
        0, len(background_samples) - model_settings['desired_samples'])
    background_volume = np.random.uniform(0, FLAGS.background_volume)
    # mix_in_audio_sample(track_data, track_offset, sample_data, sample_offset,
    #                     clip_duration, sample_volume, ramp_in, ramp_out)
    mix_in_audio_sample(output_audio, output_offset, background_samples,
                        background_offset, background_segment_duration_samples,
                        background_volume, background_ramp_samples,
                        background_ramp_samples)

  # Mix the words into the main track, noting their labels and positions.
  output_labels = []
  word_stride_ms = FLAGS.clip_duration_ms + FLAGS.word_gap_ms
  word_stride_samples = int((word_stride_ms * FLAGS.sample_rate) / 1000)
  clip_duration_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  word_gap_samples = int((FLAGS.word_gap_ms * FLAGS.sample_rate) / 1000)
  how_many_words = int(
      math.floor(output_audio_sample_count / word_stride_samples))
  all_test_data, all_test_labels = audio_processor.get_unprocessed_data(
      -1, model_settings, 'testing')
  for i in range(how_many_words):
    output_offset = (
        int(i * word_stride_samples) + np.random.randint(word_gap_samples))
    # Earlier variant without the random jitter:
    # output_offset = int(i * word_stride_samples)
    output_offset_ms = (output_offset * 1000) / FLAGS.sample_rate
    is_unknown = np.random.randint(100) < FLAGS.unknown_percentage
    if is_unknown:
      wanted_label = input_data_filler.UNKNOWN_WORD_LABEL
      # wanted_label = 'unknown'
      num = num + 1
      print("is unknown " + str(num))
    else:
      wanted_label = words_list[1 + np.random.randint(len(words_list) - 1)]
      # wanted_label = words_list[2 + np.random.randint(len(words_list) - 2)]
    # Walk the shuffled test set from a random start until a clip with the
    # wanted label is found.
    test_data_start = np.random.randint(len(all_test_data))
    found_sample_data = None
    index_lookup = np.arange(len(all_test_data), dtype=np.int32)
    np.random.shuffle(index_lookup)
    for test_data_offset in range(len(all_test_data)):
      test_data_index = index_lookup[(
          test_data_start + test_data_offset) % len(all_test_data)]
      current_label = all_test_labels[test_data_index]
      if current_label == wanted_label:
        found_sample_data = all_test_data[test_data_index]
        break
    # mix_in_audio_sample(track_data, track_offset, sample_data, sample_offset,
    #                     clip_duration, sample_volume, ramp_in, ramp_out)
    # (see the illustrative cross-fade sketch after this function)
    mix_in_audio_sample(output_audio, output_offset, found_sample_data, 0,
                        clip_duration_samples, 1.0, 325, 325)
    # Earlier variant with a much shorter ramp:
    # mix_in_audio_sample(output_audio, output_offset, found_sample_data, 0,
    #                     clip_duration_samples, 1.0, 5, 5)
    # if not is_unknown:
    #   output_labels.append({'label': wanted_label, 'time': output_offset_ms})

  input_data_filler.save_wav_file(FLAGS.output_audio_file, output_audio,
                                  FLAGS.sample_rate)
  tf.logging.info('Saved streaming test wav to %s', FLAGS.output_audio_file)

  # Note: with the append above commented out, output_labels stays empty, so
  # the labels file is written with no entries.
  with open(FLAGS.output_labels_file, 'w') as f:
    for output_label in output_labels:
      f.write('%s, %f\n' % (output_label['label'], output_label['time']))
  tf.logging.info('Saved streaming test labels to %s',
                  FLAGS.output_labels_file)
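# A minimal sketch of what a mixer like mix_in_audio_sample (imported from the
# pipeline, not defined in this file) is assumed to do: additively blend
# clip_duration samples into the track at track_offset, with linear volume
# ramps over the first ramp_in and last ramp_out samples. The name and body
# here are illustrative, not the original implementation.
def mix_in_audio_sample_sketch(track_data, track_offset, sample_data,
                               sample_offset, clip_duration, sample_volume,
                               ramp_in, ramp_out):
  for i in range(clip_duration):
    output_index = track_offset + i
    input_index = sample_offset + i
    if output_index >= len(track_data) or input_index >= len(sample_data):
      break
    # Linear fade-in at the start and fade-out at the end of the clip.
    envelope = 1.0
    if i < ramp_in:
      envelope = float(i) / ramp_in
    if clip_duration - i < ramp_out:
      envelope = min(envelope, float(clip_duration - i) / ramp_out)
    track_data[output_index] += (
        sample_data[input_index] * sample_volume * envelope)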
# We want to see all the logging messages for this tutorial.
tf.logging.set_verbosity(tf.logging.INFO)

# Start a new TensorFlow session.
sess = tf.InteractiveSession()

model_settings = models.prepare_model_settings(
    len(input_data_filler.prepare_words_list_my(wanted_words.split(','))),
    sample_rate, clip_duration_ms, window_size_ms, window_stride_ms,
    dct_coefficient_count)
audio_processor = input_data_filler.AudioProcessor(
    data_url, data_dir, silence_percentage, unknown_percentage,
    wanted_words.split(','), validation_percentage, testing_percentage,
    model_settings)
time_shift_samples = int((time_shift_ms * sample_rate) / 1000)
fingerprint_size = model_settings['fingerprint_size']
label_count = model_settings['label_count']

# *********************************************************************
print(" ***************** audio processor ********************")
training_datas = len(audio_processor.data_index['training']) + len(
    audio_processor.unknown_index['training'])
validation_datas = len(audio_processor.data_index['validation']) + len(
    audio_processor.unknown_index['validation'])
testing_datas = len(audio_processor.data_index['testing']) + len(
    audio_processor.unknown_index['testing'])
print("* total samples : " +
      str(training_datas + validation_datas + testing_datas))
print("* training samples : " +
      str(len(audio_processor.data_index['training'])) + ' + ' +
      str(len(audio_processor.unknown_index['training'])) + '(unknowns)' +
      ' = ' + str(training_datas))
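# For reference, a sketch of how fingerprint_size is conventionally derived in
# the TensorFlow speech_commands pipeline that models.prepare_model_settings
# appears to follow: the number of analysis windows that fit in a clip, times
# the DCT coefficients kept per window. (Assumption: this mirrors the actual
# implementation in models.py; the helper name is illustrative.)
def sketch_fingerprint_size(sample_rate, clip_duration_ms, window_size_ms,
                            window_stride_ms, dct_coefficient_count):
  desired_samples = int(sample_rate * clip_duration_ms / 1000)
  window_size_samples = int(sample_rate * window_size_ms / 1000)
  window_stride_samples = int(sample_rate * window_stride_ms / 1000)
  length_minus_window = desired_samples - window_size_samples
  if length_minus_window < 0:
    spectrogram_length = 0
  else:
    spectrogram_length = 1 + int(length_minus_window / window_stride_samples)
  return dct_coefficient_count * spectrogram_length

# e.g. 16 kHz audio, 1000 ms clips, 30 ms windows with a 10 ms stride, and 40
# DCT coefficients give 98 windows * 40 = 3920 features per fingerprint.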