def run_test():
    # Get all ckpt names in log dir (without meta ext)
    meta_list = get_checkpoints(FLAGS.log_dir)

    # GPU/CPU Flag
    if FLAGS.gpu is not None:
        compute_string = '/gpu:' + str(FLAGS.gpu)
    else:
        compute_string = '/cpu:0'

    # Iterate through the checkpoints
    for ckpt_path in meta_list:
        tf.reset_default_graph()

        ####################
        # Setup Data Queue #
        ####################
        with tf.device("/cpu:0"):
            with tf.variable_scope('test') as scope:
                data_pipeline = DataPipeline(augment=False,
                                             num_epochs=1,
                                             shuffle=False)
                validate_x, validate_y, ids = data_pipeline.batch_ops()

        with tf.device(compute_string):
            ##########################
            # Declare Validate Graph #
            ##########################
            # Sets train/test mode; currently only used for BatchNormalization
            # True: train, False: test
            phase = tf.placeholder(tf.bool, name='phase')
            validate_model = model(validate_x, validate_y, phase)
            # Delete extraneous info when done debugging
            validate_pred = validate_model.inference()
            pool5 = validate_model.fc2

        init = tf.group(tf.global_variables_initializer(),
                        tf.local_variables_initializer())

        ids_file = open(os.path.join(FLAGS.log_dir, 'ids.txt'), 'w')
        predictions_file = open(os.path.join(FLAGS.log_dir, 'predictions.txt'), 'w')

        session_config = tf.ConfigProto(allow_soft_placement=True)
        with tf.Session(config=session_config) as sess:
            sess.run(init)

            # Coordinator handles data fetching threads
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)

            optimistic_restore(sess, ckpt_path)

            try:
                step = 0
                cum_time = 0
                while True:
                    if coord.should_stop():
                        break
                    step += 1

                    start_time = time()
                    prediction_value, pool5_value, ids_value = sess.run(
                        [validate_pred, pool5, ids],
                        feed_dict={phase: False})
                    duration_time = time() - start_time
                    cum_time += duration_time

                    # Save the flattened feature tensor for this batch
                    feature_file = os.path.join(FLAGS.log_dir, "feature_%d" % step)
                    #pool5_value = np.sum(pool5_value, (1, 2))  # spatial average
                    pool5_value = pool5_value.reshape(FLAGS.batch_size, -1)
                    np.save(feature_file, pool5_value)

                    for example_id in ids_value:
                        ids_file.write("%s\n" % example_id)

                    # Save prediction and ground truth info
                    predictions_file.write(np.array_str(
                        prediction_value,
                        max_line_width=1e3,
                        precision=10,
                        suppress_small=True))
                    predictions_file.write('\n')
                    predictions_file.flush()
            except tf.errors.OutOfRangeError:
                # Input queue exhausted; the last increment never ran
                step -= 1
            except Exception as e:
                print("Exception encountered: ", e)
                step -= 1

            # Stop queueing data, we're done!
            coord.request_stop()
            coord.join(threads)

        ids_file.close()
        predictions_file.close()
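# `optimistic_restore` and `get_checkpoints` are helpers defined elsewhere in
# this repository. As a point of reference, a minimal sketch of an
# "optimistic" restore -- loading only those checkpoint variables whose names
# and shapes match the current graph -- could look like the hypothetical
# helper below. It illustrates the technique; it is not the project's actual
# implementation.
def _optimistic_restore_sketch(sess, ckpt_path):
    reader = tf.train.NewCheckpointReader(ckpt_path)
    ckpt_shapes = reader.get_variable_to_shape_map()
    # Keep only graph variables present in the checkpoint with a matching shape
    restorable = [v for v in tf.global_variables()
                  if v.name.split(':')[0] in ckpt_shapes and
                  v.get_shape().as_list() == ckpt_shapes[v.name.split(':')[0]]]
    tf.train.Saver(restorable).restore(sess, ckpt_path)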
def run_training():
    ''' Run Training Loop '''
    # GPU/CPU Flag
    if FLAGS.gpu is not None:
        compute_string = '/gpu:' + str(FLAGS.gpu)
    else:
        compute_string = '/cpu:0'

    #####################
    # Setup Data Queues #
    #####################
    with tf.device("/cpu:0"):
        with tf.variable_scope('train'):
            data_pipeline = DataPipeline(augment=True)
            train_x, train_y = data_pipeline.batch_ops()

    #######################
    # Declare train graph #
    #######################
    with tf.device(compute_string):
        # Sets train/test mode; currently only used for BatchNormalization
        # True: train, False: test
        phase = tf.placeholder(tf.bool, name='phase')
        train_model = model(train_x, train_y, phase)
        train_predictions = train_model.inference()
        train_acc = train_model.evaluate()
        train_loss, gt_y = train_model.loss()
        train_op = train_model.optimize()
        global_step = train_model.get_global_step()
        tf.summary.scalar('train_loss', train_loss)
        tf.summary.scalar('train_acc', train_acc)

    #############################
    # Setup Summaries and Saver #
    #############################
    # Collect summaries for TensorBoard
    summary = tf.summary.merge_all()

    # Create variable initializer op
    init = tf.group(tf.global_variables_initializer(),
                    tf.local_variables_initializer())

    # Create checkpoint saver
    saver = tf.train.Saver(max_to_keep=100)

    # Begin TensorFlow Session
    session_config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=session_config) as sess:
        # Run the Variable Initializer Op
        sess.run(init)

        # Resume training from the latest checkpoint if requested,
        # otherwise fall back to pretrained weights
        resume_status = False
        if FLAGS.resume:
            try:
                meta_list = get_checkpoints(FLAGS.log_dir)
                optimistic_restore(sess, meta_list[-1])
                resume_status = True
            except Exception:
                print('Checkpoint Load Failed')
                print('Training from scratch')
        if not resume_status:
            try:
                train_model.load_pretrained_weights(sess)
            except Exception:
                print('Failed to load pretrained weights.')
                print('Training from scratch')
        sys.stdout.flush()

        # Coordinator handles data fetching threads
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)

        # Instantiate a summary writer to output summaries and the Graph.
        summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

        # Actually begin the training process
        try:
            for step in xrange(FLAGS.max_steps):
                if coord.should_stop():
                    break

                start_time = time()

                # Run one step of the model.
                _, loss_value, acc = sess.run(
                    [train_op, train_loss, train_acc],
                    feed_dict={phase: True})
                global_step_value = global_step.eval()
                duration_time = time() - start_time

                # Debug profiler on step 3
                # open timeline.json in chrome://tracing/
                if FLAGS.profile and step == 3:
                    run_metadata = tf.RunMetadata()
                    # The phase placeholder must also be fed for the traced run
                    _, loss, acc = sess.run(
                        [train_op, train_loss, train_acc],
                        feed_dict={phase: True},
                        options=tf.RunOptions(
                            trace_level=tf.RunOptions.FULL_TRACE),
                        run_metadata=run_metadata)
                    tl = timeline.Timeline(run_metadata.step_stats)
                    ctf = tl.generate_chrome_trace_format()
                    with open('timeline.json', 'w') as f:
                        f.write(ctf)

                # Display progress
                if global_step_value % 1 == 0:
                    # Print progress to stdout
                    print('Step %d: loss = %.2f, acc = %.2f (%.3f sec)' %
                          (global_step_value, loss_value, acc, duration_time))
                    sys.stdout.flush()

                # Write the summaries
                if global_step_value % 20 == 0:
                    # Update the summary file
                    summary_str = sess.run(summary, feed_dict={phase: False})
                    summary_writer.add_summary(summary_str, global_step_value)
                    summary_writer.flush()

                # Save Model Checkpoint
                if global_step_value % FLAGS.checkpoint_freq == 0 or \
                        (global_step_value + 1) == FLAGS.max_steps:
                    checkpoint_path = os.path.join(FLAGS.log_dir, 'model')
                    saver.save(sess, checkpoint_path, global_step=global_step)

                #loop_time = time() - start_time
                #print('Total Loop Time: %.3f' % loop_time)
        except tf.errors.OutOfRangeError:
            print('Done Training -- Epoch limit reached.')
            sys.stdout.flush()
        except Exception as e:
            print("Exception encountered: ", e)
            sys.stdout.flush()

        # Stop queueing data, we're done!
        coord.request_stop()
        coord.join(threads)
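# The `model` class and `DataPipeline` used above are imported from elsewhere
# in the repository. The interface these routines rely on, inferred from how
# they are called in this file, is roughly:
#   model(inputs, labels, phase)   -- constructor; `phase` toggles train/test mode
#   .inference()                   -- prediction tensor
#   .evaluate()                    -- accuracy tensor
#   .loss()                        -- (loss tensor, ground-truth tensor)
#   .optimize()                    -- training op
#   .get_global_step()             -- global step variable
#   .load_pretrained_weights(sess) -- initialize from pretrained weights
#   .fc2                           -- feature tensor saved to disk by run_test()
#   DataPipeline(augment, num_epochs, shuffle).batch_ops()
#                                  -- batched (inputs, labels[, ids]) tensors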
def run_validate():
    # Get all ckpt names in log dir (without meta ext)
    meta_list = get_checkpoints(FLAGS.log_dir)

    # GPU/CPU Flag
    if FLAGS.gpu is not None:
        compute_string = '/gpu:' + str(FLAGS.gpu)
    else:
        compute_string = '/cpu:0'

    # Iterate through the checkpoints
    val_loss = []
    val_acc = []
    val_itr = []
    for ckpt_path in meta_list:
        tf.reset_default_graph()

        ####################
        # Setup Data Queue #
        ####################
        with tf.device("/cpu:0"):
            with tf.variable_scope('validate') as scope:
                data_pipeline = DataPipeline(augment=False,
                                             num_epochs=1,
                                             shuffle=False)
                validate_x, validate_y, ids = data_pipeline.batch_ops()

        with tf.device(compute_string):
            ##########################
            # Declare Validate Graph #
            ##########################
            # Sets train/test mode; currently only used for BatchNormalization
            # True: train, False: test
            phase = tf.placeholder(tf.bool, name='phase')
            validate_model = model(validate_x, validate_y, phase)
            # Delete extraneous info when done debugging
            validate_pred = validate_model.inference()
            validate_acc = validate_model.evaluate()
            validate_loss, gt_y = validate_model.loss()
            global_step = validate_model.get_global_step()

        summary = tf.summary.merge_all()
        init = tf.group(tf.global_variables_initializer(),
                        tf.local_variables_initializer())

        session_config = tf.ConfigProto(allow_soft_placement=True)
        with tf.Session(config=session_config) as sess:
            sess.run(init)
            summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

            # Coordinator handles data fetching threads
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)

            optimistic_restore(sess, ckpt_path)
            global_step_value = global_step.eval()

            try:
                step = 0
                cum_loss = 0
                cum_acc = 0
                cum_time = 0
                while True:
                    if coord.should_stop():
                        break
                    step += 1

                    start_time = time()
                    loss_value, acc_value, prediction_value, gt_value, ids_value = sess.run(
                        [validate_loss, validate_acc, validate_pred, gt_y, ids],
                        feed_dict={phase: False})
                    duration_time = time() - start_time

                    cum_loss += loss_value
                    cum_acc += acc_value
                    cum_time += duration_time

                    if step % 1 == 0:
                        # Print progress to stdout
                        if FLAGS.print_pred:
                            print('Step %d: loss = %.4f acc = %.4f (%.3f sec)' %
                                  (step, loss_value, acc_value, duration_time))
                            print('Prediction:{}'.format(prediction_value))
                            print('GT:{}'.format(gt_value))
                            sys.stdout.flush()

                    # Write the summaries
                    if step % 25 == 0:
                        # Update the summary file
                        summary_str = sess.run(summary, feed_dict={phase: False})
                        summary_writer.add_summary(summary_str, global_step_value)
                        summary_writer.flush()
            except tf.errors.OutOfRangeError:
                # Input queue exhausted; the last increment never ran
                step -= 1
            except Exception as e:
                print("Exception encountered: ", e)
                step -= 1

            # Stop queueing data, we're done!
            coord.request_stop()
            coord.join(threads)

        # Average the per-batch statistics for this checkpoint
        avg_loss = cum_loss / step
        avg_acc = cum_acc / step
        avg_time = cum_time / step

        val_loss.append(float(avg_loss))
        val_acc.append(float(avg_acc))
        val_itr.append(int(global_step_value))

        print('Results For Load File: %s' % ckpt_path)
        print('Average_Loss = %.4f' % avg_loss)
        print('Average_Acc = %.4f' % avg_acc)
        print('Run Time: %.2f' % cum_time)
        sys.stdout.flush()

    val_loss = np.asarray(val_loss)
    val_acc = np.asarray(val_acc)
    val_itr = np.asarray(val_itr)

    best_loss = np.amin(val_loss)
    best_acc = np.amax(val_acc)
    best_itr = val_itr[np.argmax(val_acc)]

    print('Overall Results')
    print('Minimum Loss: %.4f' % best_loss)
    print('Maximum Acc: %.4f' % best_acc)
    print('Best Checkpoint: %d' % best_itr)

    save_path = os.path.join(FLAGS.log_dir, 'validation_results.npz')
    np.savez(save_path, val_loss=val_loss, val_acc=val_acc, val_itr=val_itr)
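# `get_checkpoints` is expected to return the checkpoint path prefixes in
# FLAGS.log_dir (the '.meta' extension stripped), ordered by global step so
# that meta_list[-1] is the most recent checkpoint. A hypothetical sketch,
# assuming checkpoints are written as 'model-<step>' by run_training(); this
# is an illustration, not the repository's actual helper.
def _get_checkpoints_sketch(log_dir):
    import glob  # imported locally to keep this sketch self-contained
    meta_files = glob.glob(os.path.join(log_dir, 'model-*.meta'))
    prefixes = [m[:-len('.meta')] for m in meta_files]
    # Sort numerically by the global step encoded in the filename
    return sorted(prefixes, key=lambda p: int(p.rsplit('-', 1)[-1]))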