def evaluate(): """Evaluate.""" assert FLAGS.dataset == 'KITTI', \ 'Currently only supports KITTI dataset' os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu with tf.Graph().as_default() as g: assert FLAGS.net == 'squeezeSeg', \ 'Selected neural net architecture not supported: {}'.format(FLAGS.net) if FLAGS.net == 'squeezeSeg': mc = kitti_squeezeSeg_config() mc.LOAD_PRETRAINED_MODEL = False mc.DATA_AUGMENTATION = False mc.BATCH_SIZE = 1 # TODO(bichen): fix this hard-coded batch size. model = SqueezeSeg(mc) imdb = kitti(FLAGS.image_set, FLAGS.data_path, mc) eval_summary_ops = [] eval_summary_phs = {} eval_summary_names = [ 'Timing/read', 'Timing/detect', ] for i in range(1, mc.NUM_CLASS): eval_summary_names.append('Pixel_seg_accuracy/' + mc.CLASSES[i] + '_iou') eval_summary_names.append('Pixel_seg_accuracy/' + mc.CLASSES[i] + '_precision') eval_summary_names.append('Pixel_seg_accuracy/' + mc.CLASSES[i] + '_recall') for sm in eval_summary_names: ph = tf.placeholder(tf.float32) eval_summary_phs[sm] = ph eval_summary_ops.append(tf.summary.scalar(sm, ph)) saver = tf.train.Saver(model.model_params) summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g) ckpts = set() for step in range(0, 32000, 100): ckpt_path = '/rscratch/xuanyu/SqueezeSeg_v2/log/train/model.ckpt-' + str( step) eval_once(saver, ckpt_path, summary_writer, eval_summary_ops, eval_summary_phs, imdb, model) """
def evaluate(): """Evaluate.""" assert FLAGS.dataset == 'KITTI', \ 'Currently only supports KITTI dataset' os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu with tf.Graph().as_default() as g: assert FLAGS.net == 'squeezeSeg', \ 'Selected neural net architecture not supported: {}'.format(FLAGS.net) if FLAGS.net == 'squeezeSeg': mc = kitti_squeezeSeg_config() mc.LOAD_PRETRAINED_MODEL = False mc.BATCH_SIZE = 1 # TODO(bichen): fix this hard-coded batch size. model = SqueezeSeg(mc) imdb = kitti(FLAGS.image_set, FLAGS.data_path, mc) eval_summary_ops = [] eval_summary_phs = {} eval_summary_names = [ 'Timing/read', 'Timing/detect', ] for i in range(1, mc.NUM_CLASS): eval_summary_names.append('Pixel_seg_accuracy/' + mc.CLASSES[i] + '_iou') eval_summary_names.append('Pixel_seg_accuracy/' + mc.CLASSES[i] + '_precision') eval_summary_names.append('Pixel_seg_accuracy/' + mc.CLASSES[i] + '_recall') for sm in eval_summary_names: ph = tf.placeholder(tf.float32) eval_summary_phs[sm] = ph eval_summary_ops.append(tf.summary.scalar(sm, ph)) saver = tf.train.Saver(model.model_params) summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g) ckpts = set() while True: if FLAGS.run_once: # When run_once is true, checkpoint_path should point to the exact # checkpoint file. eval_once(saver, FLAGS.checkpoint_path, summary_writer, eval_summary_ops, eval_summary_phs, imdb, model) return else: # When run_once is false, checkpoint_path should point to the directory # that stores checkpoint files. ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_path) if ckpt and ckpt.model_checkpoint_path: if ckpt.model_checkpoint_path in ckpts: # Do not evaluate on the same checkpoint print( 'Wait {:d}s for new checkpoints to be saved ... '. format(FLAGS.eval_interval_secs)) time.sleep(FLAGS.eval_interval_secs) else: ckpts.add(ckpt.model_checkpoint_path) print('Evaluating {}...'.format( ckpt.model_checkpoint_path)) eval_once(saver, ckpt.model_checkpoint_path, summary_writer, eval_summary_ops, eval_summary_phs, imdb, model) else: print('No checkpoint file found') if not FLAGS.run_once: print( 'Wait {:d}s for new checkpoints to be saved ... '. format(FLAGS.eval_interval_secs)) time.sleep(FLAGS.eval_interval_secs)
def train(): """Train SqueezeSeg model""" assert FLAGS.dataset == 'KITTI', \ 'Currently only support KITTI dataset' os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu with tf.Graph().as_default(): assert FLAGS.net == 'squeezeSeg', \ 'Selected neural net architecture not supported: {}'.format(FLAGS.net) if FLAGS.net == 'squeezeSeg': mc = kitti_squeezeSeg_config() # ed: SqueezeSeg을 본격적으로 training 하기 전에 전이학습을 위해 SqueezeNet의 pretrained_model을 불러오는듯 mc.PRETRAINED_MODEL_PATH = FLAGS.pretrained_model_path model = SqueezeSeg(mc) imdb = kitti(FLAGS.image_set, FLAGS.data_path, mc) # ed: Model의 여러 정보를 저장하기 위한 코드 # save model size, flops, activations by layers with open(os.path.join(FLAGS.train_dir, 'model_metrics.txt'), 'w') as f: f.write('Number of parameter by layer:\n') # ed: parameter size를 기록하는 코드 count = 0 for c in model.model_size_counter: f.write('\t{}: {}\n'.format(c[0], c[1])) count += c[1] f.write('\ttotal: {}\n'.format(count)) # ed: output activation이 정확히 뭔지 모르겠지만 그걸 기록해주는 코드 count = 0 f.write('\nActivation size by layer:\n') for c in model.activation_counter: f.write('\t{}: {}\n'.format(c[0], c[1])) count += c[1] f.write('\ttotal: {}\n'.format(count)) # ed: Flop Count를 기록해주는 코드 count = 0 f.write('\nNumber of flops by layer:\n') for c in model.flop_counter: f.write('\t{}: {}\n'.format(c[0], c[1])) count += c[1] f.write('\ttotal: {}\n'.format(count)) f.close() print('Model statistics saved to {}.'.format( os.path.join(FLAGS.train_dir, 'model_metrics.txt'))) def enqueue(sess, coord): with coord.stop_on_exception(): while not coord.should_stop(): # ed: 여기가 Input (.npy) 파일들을 처리하는 코드인듯 # read batch input lidar_per_batch, lidar_mask_per_batch, label_per_batch,\ weight_per_batch = imdb.read_batch() feed_dict = { model.ph_keep_prob: mc.KEEP_PROB, model.ph_lidar_input: lidar_per_batch, model.ph_lidar_mask: lidar_mask_per_batch, model.ph_label: label_per_batch, model.ph_loss_weight: weight_per_batch, } # ed: placeholder에 데이터를 넣어주는 코드 # FIFOQueue라는 함수를 사용해서 여러 input들을 병렬적으로 처리하는듯 sess.run(model.enqueue_op, feed_dict=feed_dict) saver = tf.train.Saver(tf.all_variables()) summary_op = tf.summary.merge_all() init = tf.initialize_all_variables() # ed: sess 초기화 sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) sess.run(init) summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph) coord = tf.train.Coordinator() enq_threads = [] for _ in range(mc.NUM_ENQUEUE_THREAD): eqth = threading.Thread(target=enqueue, args=[sess, coord]) eqth.start() enq_threads.append(eqth) # ed: 특정 시간이상 연산이 초과되면 assertion을 내주기 위한 코드인듯 run_options = tf.RunOptions(timeout_in_ms=60000) try: # ed: 학습하는 코드 for step in xrange(FLAGS.max_steps): start_time = time.time() # ed: 50번 마다 실행되고 마지막 step에서 실행되는 제어문 if step % FLAGS.summary_step == 0 or step == FLAGS.max_steps - 1: op_list = [ model.lidar_input, model.lidar_mask, model.label, model.train_op, model.loss, model.pred_cls, summary_op ] # ed: 50번 step과 마지막 step에만 실행되는 코드, 학습이 끝나고 성능을 알아보기 위해 실행하는 듯하다 # 이런식으로 Queue를 사용해서 일괄적으로 placeholder들에 feeding을 할 수 있는듯하다 lidar_per_batch, lidar_mask_per_batch, label_per_batch, \ _, loss_value, pred_cls, summary_str = sess.run(op_list, options=run_options) # ed: label, pred_cls에 의해서 class가 정해진 곳에 colorize를 해주는 코드 label_image = visualize_seg(label_per_batch[:6, :, :], mc) pred_image = visualize_seg(pred_cls[:6, :, :], mc) # ed: IOU를 계산하는 코드 # Run evaluation on the batch ious, _, _, _ = evaluate_iou( label_per_batch, pred_cls * np.squeeze(lidar_mask_per_batch), mc.NUM_CLASS) feed_dict = {} # Assume that class-0 is the background 
class for i in range(1, mc.NUM_CLASS): feed_dict[model.iou_summary_placeholders[i]] = ious[i] iou_summary_list = sess.run(model.iou_summary_ops[1:], feed_dict) # ed: 여기서 summary 형식으로 visualize 해주는건 뭘까? ==> tensorboard를 위한 코드 # Run visualization viz_op_list = [ model.show_label, model.show_depth_img, model.show_pred ] viz_summary_list = sess.run(viz_op_list, feed_dict={ model.depth_image_to_show: lidar_per_batch[:6, :, :, [4]], model.label_to_show: label_image, model.pred_image_to_show: pred_image, }) # Add summaries summary_writer.add_summary(summary_str, step) for sum_str in iou_summary_list: summary_writer.add_summary(sum_str, step) for viz_sum in viz_summary_list: summary_writer.add_summary(viz_sum, step) # force tensorflow to synchronise summaries summary_writer.flush() else: # ed: 실제 학습을 하는 코드 _, loss_value = sess.run([model.train_op, model.loss], options=run_options) # ed: 알고리즘 수행시간 체크 duration = time.time() - start_time # ed: 여러 loss value 중 invalid한 값이 없어야 한다 assert not np.isnan(loss_value), \ 'Model diverged. Total loss: {}, conf_loss: {}, bbox_loss: {}, ' \ 'class_loss: {}'.format(loss_value, conf_loss, bbox_loss, class_loss) # ed: 10번에 한번씩 print 해주는 코드 if step % 10 == 0: num_images_per_step = mc.BATCH_SIZE images_per_sec = num_images_per_step / duration sec_per_batch = float(duration) format_str = ( '%s: step %d, loss = %.2f (%.1f images/sec; %.3f ' 'sec/batch)') print(format_str % (datetime.now(), step, loss_value, images_per_sec, sec_per_batch)) sys.stdout.flush() # ed: default=1000 번에 한번씩 model의 가중치를 저장한다 # Save the model checkpoint periodically. if step % FLAGS.checkpoint_step == 0 or step == FLAGS.max_steps - 1: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) except Exception, e: coord.request_stop(e) finally:
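# evaluate_iou() is imported from the repo's utils and not shown here. A
# minimal numpy sketch of a per-class IoU computation with the same call shape
# (returning IoU plus per-class TP/FN/FP counts); the real helper's exact
# return order and arguments may differ:
import numpy as np

def evaluate_iou_sketch(label, pred, n_class, epsilon=1e-12):
  """Per-class IoU for dense segmentation maps of identical shape."""
  ious = np.zeros(n_class)
  tps = np.zeros(n_class)
  fns = np.zeros(n_class)
  fps = np.zeros(n_class)
  for cls_id in range(n_class):
    tp = np.sum((pred == cls_id) & (label == cls_id))
    fp = np.sum((pred == cls_id) & (label != cls_id))
    fn = np.sum((pred != cls_id) & (label == cls_id))
    ious[cls_id] = tp / (tp + fn + fp + epsilon)
    tps[cls_id], fns[cls_id], fps[cls_id] = tp, fn, fp
  return ious, tps, fns, fps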
def train(): """Train SqueezeSeg model""" assert FLAGS.dataset == 'KITTI', \ 'Currently only support KITTI dataset' os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu with tf.Graph().as_default(): assert FLAGS.net == 'squeezeSeg', \ 'Selected neural net architecture not supported: {}'.format(FLAGS.net) if FLAGS.net == 'squeezeSeg': mc = df_squeezeSeg_config() mc.PRETRAINED_MODEL_PATH = FLAGS.pretrained_model_path model = SqueezeSeg(mc) imdb = kitti(FLAGS.image_set, FLAGS.data_path, mc) # save model size, flops, activations by layers with open(os.path.join(FLAGS.train_dir, 'model_metrics.txt'), 'w') as f: f.write('Number of parameter by layer:\n') count = 0 for c in model.model_size_counter: f.write('\t{}: {}\n'.format(c[0], c[1])) count += c[1] f.write('\ttotal: {}\n'.format(count)) count = 0 f.write('\nActivation size by layer:\n') for c in model.activation_counter: f.write('\t{}: {}\n'.format(c[0], c[1])) count += c[1] f.write('\ttotal: {}\n'.format(count)) count = 0 f.write('\nNumber of flops by layer:\n') for c in model.flop_counter: f.write('\t{}: {}\n'.format(c[0], c[1])) count += c[1] f.write('\ttotal: {}\n'.format(count)) f.close() print('Model statistics saved to {}.'.format( os.path.join(FLAGS.train_dir, 'model_metrics.txt'))) def enqueue(sess, coord): with coord.stop_on_exception(): while not coord.should_stop(): # read batch input lidar_per_batch, lidar_mask_per_batch, label_per_batch,\ weight_per_batch = imdb.read_batch() feed_dict = { model.ph_keep_prob: mc.KEEP_PROB, model.ph_lidar_input: lidar_per_batch, model.ph_lidar_mask: lidar_mask_per_batch, model.ph_label: label_per_batch, model.ph_loss_weight: weight_per_batch, } sess.run(model.enqueue_op, feed_dict=feed_dict) saver = tf.train.Saver(tf.all_variables()) summary_op = tf.summary.merge_all() init = tf.initialize_all_variables() config = tf.ConfigProto(allow_soft_placement=True) config.gpu_options.per_process_gpu_memory_fraction = 0.85 #sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) sess = tf.Session(config=config) sess.run(init) summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph) coord = tf.train.Coordinator() enq_threads = [] for _ in range(mc.NUM_ENQUEUE_THREAD): eqth = threading.Thread(target=enqueue, args=[sess, coord]) eqth.start() enq_threads.append(eqth) run_options = tf.RunOptions(timeout_in_ms=60000) try: for step in xrange(FLAGS.max_steps): start_time = time.time() if step % FLAGS.summary_step == 0 or step == FLAGS.max_steps - 1: op_list = [ model.lidar_input, model.lidar_mask, model.label, model.train_op, model.loss, model.pred_cls, summary_op ] lidar_per_batch, lidar_mask_per_batch, label_per_batch, \ _, loss_value, pred_cls, summary_str = sess.run(op_list, options=run_options) label_image = visualize_seg(label_per_batch[:6, :, :], mc) pred_image = visualize_seg(pred_cls[:6, :, :], mc) # Run evaluation on the batch ious, _, _, _ = evaluate_iou( label_per_batch, pred_cls * np.squeeze(lidar_mask_per_batch), mc.NUM_CLASS) feed_dict = {} # Assume that class-0 is the background class for i in range(1, mc.NUM_CLASS): feed_dict[model.iou_summary_placeholders[i]] = ious[i] iou_summary_list = sess.run(model.iou_summary_ops[1:], feed_dict) # Run visualization viz_op_list = [ model.show_label, model.show_depth_img, model.show_pred ] viz_summary_list = sess.run(viz_op_list, feed_dict={ model.depth_image_to_show: lidar_per_batch[:6, :, :, [4]], model.label_to_show: label_image, model.pred_image_to_show: pred_image, }) # Add summaries summary_writer.add_summary(summary_str, step) for 
sum_str in iou_summary_list: summary_writer.add_summary(sum_str, step) for viz_sum in viz_summary_list: summary_writer.add_summary(viz_sum, step) # force tensorflow to synchronise summaries summary_writer.flush() else: _, loss_value = sess.run([model.train_op, model.loss], options=run_options) duration = time.time() - start_time assert not np.isnan(loss_value), \ 'Model diverged. Total loss: {}, conf_loss: {}, bbox_loss: {}, ' \ 'class_loss: {}'.format(loss_value, conf_loss, bbox_loss, class_loss) if step % 10 == 0: num_images_per_step = mc.BATCH_SIZE images_per_sec = num_images_per_step / duration sec_per_batch = float(duration) format_str = ( '%s: step %d, loss = %.2f (%.1f images/sec; %.3f ' 'sec/batch)') print(format_str % (datetime.now(), step, loss_value, images_per_sec, sec_per_batch)) sys.stdout.flush() # Save the model checkpoint periodically. if step % FLAGS.checkpoint_step == 0 or step == FLAGS.max_steps - 1: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) except Exception, e: coord.request_stop(e) finally:
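# The enqueue() threads above push feed_dicts through model.enqueue_op, which
# in this codebase is backed by a TF1 FIFOQueue. A minimal self-contained
# sketch of that producer/consumer pattern -- the queue capacity, shapes and
# names below are illustrative, not the repo's actual values:
import threading
import numpy as np
import tensorflow as tf

ph_x = tf.placeholder(tf.float32, [None, 64, 512, 5])
ph_y = tf.placeholder(tf.int32, [None, 64, 512])
queue = tf.FIFOQueue(capacity=20, dtypes=[tf.float32, tf.int32])
enqueue_op = queue.enqueue([ph_x, ph_y])
batch_x, batch_y = queue.dequeue()  # the model graph would consume these tensors

def producer(sess, coord):
  while not coord.should_stop():
    # stand-ins for imdb.read_batch()
    x = np.random.rand(1, 64, 512, 5).astype(np.float32)
    y = np.zeros((1, 64, 512), np.int32)
    try:
      sess.run(enqueue_op, feed_dict={ph_x: x, ph_y: y})
    except tf.errors.CancelledError:
      break  # queue was closed while this enqueue was blocked

with tf.Session() as sess:
  coord = tf.train.Coordinator()
  t = threading.Thread(target=producer, args=(sess, coord))
  t.start()
  print(sess.run(batch_x).shape)  # consumer side pulls one element
  coord.request_stop()
  sess.run(queue.close(cancel_pending_enqueues=True))
  coord.join([t])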
def train(): """Train SqueezeSeg model""" assert FLAGS.dataset == 'KITTI', \ 'Currently only support KITTI dataset' #os.environ['CUDA_VISIBLE_DEVICES'] = "" # Check for eager execution. Source: https://mlfromscratch.com/tensorflow-2/#/ if (tf.executing_eagerly()): print('Eager execution is enabled (running operations immediately)\n') print(('Turn eager execution off by running: \n{0}\n{1}').format( '' 'from tensorflow.python.framework.ops import disable_eager_execution', 'disable_eager_execution()')) else: print( 'You are not running eager execution. TensorFlow version >= 2.0.0' 'has eager execution enabled by default.') print(('Turn on eager execution by running: \n\n{0}\n\nOr upgrade ' 'your tensorflow version by running:\n\n{1}').format( 'tf.compat.v1.enable_eager_execution()', '!pip install --upgrade tensorflow\n' '!pip install --upgrade tensorflow-gpu')) with tf.Graph().as_default(): assert FLAGS.net == 'squeezeSeg', \ 'Selected neural net architecture not supported: {}'.format( FLAGS.net) if FLAGS.net == 'squeezeSeg': mc = kitti_squeezeSeg_config() mc.PRETRAINED_MODEL_PATH = FLAGS.pretrained_model_path print("Preloaded model: ", mc.LOAD_PRETRAINED_MODEL) print("Creating model ... ") model = SqueezeSeg(mc) print("Model created") imdb = kitti(FLAGS.image_set, FLAGS.data_path, mc) # save model size, flops, activations by layers with open(os.path.join(FLAGS.train_dir, 'model_metrics.txt'), 'w') as f: f.write('Number of parameter by layer:\n') count = 0 for c in model.model_size_counter: f.write('\t{}: {}\n'.format(c[0], c[1])) count += c[1] f.write('\ttotal: {}\n'.format(count)) count = 0 f.write('\nActivation size by layer:\n') for c in model.activation_counter: f.write('\t{}: {}\n'.format(c[0], c[1])) count += c[1] f.write('\ttotal: {}\n'.format(count)) count = 0 f.write('\nNumber of flops by layer:\n') for c in model.flop_counter: f.write('\t{}: {}\n'.format(c[0], c[1])) count += c[1] f.write('\ttotal: {}\n'.format(count)) f.close() print('Model statistics saved to {}.'.format( os.path.join(FLAGS.train_dir, 'model_metrics.txt'))) def enqueue(sess, coord): with coord.stop_on_exception(): while not coord.should_stop(): # read batch input lidar_per_batch, lidar_mask_per_batch, label_per_batch,\ weight_per_batch = imdb.read_batch() feed_dict = { model.ph_keep_prob: mc.KEEP_PROB, model.ph_lidar_input: lidar_per_batch, model.ph_lidar_mask: lidar_mask_per_batch, model.ph_label: label_per_batch, model.ph_loss_weight: weight_per_batch, } sess.run(model.enqueue_op, feed_dict=feed_dict) saver = tf.compat.v1.train.Saver(tf.compat.v1.all_variables()) summary_op = tf.compat.v1.summary.merge_all() init = tf.compat.v1.initialize_all_variables() sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto( allow_soft_placement=True, device_count={'GPU': 1})) sess.run(init) summary_writer = tf.compat.v1.summary.FileWriter( FLAGS.train_dir, sess.graph) coord = tf.train.Coordinator() enq_threads = [] for _ in range(mc.NUM_ENQUEUE_THREAD): eqth = threading.Thread(target=enqueue, args=[sess, coord]) eqth.start() enq_threads.append(eqth) run_options = tf.compat.v1.RunOptions(timeout_in_ms=600000) try: for step in xrange(FLAGS.max_steps): print("Starting step {}".format(step)) start_time = time.time() if step % FLAGS.summary_step == 0 or step == FLAGS.max_steps - 1: print("Running summary step") op_list = [ model.lidar_input, model.lidar_mask, model.label, model.train_op, model.loss, model.pred_cls, summary_op ] lidar_per_batch, lidar_mask_per_batch, label_per_batch, \ _, loss_value, pred_cls, summary_str = 
sess.run(op_list, options=run_options) print("Run") label_image = visualize_seg(label_per_batch[:6, :, :], mc) pred_image = visualize_seg(pred_cls[:6, :, :], mc) print("Run evaluation") # Run evaluation on the batch ious, _, _, _ = evaluate_iou( label_per_batch, pred_cls * np.squeeze(lidar_mask_per_batch), mc.NUM_CLASS) feed_dict = {} # Assume that class-0 is the background class for i in range(1, mc.NUM_CLASS): feed_dict[model.iou_summary_placeholders[i]] = ious[i] iou_summary_list = sess.run(model.iou_summary_ops[1:], feed_dict) # Run visualization viz_op_list = [ model.show_label, model.show_depth_img, model.show_pred ] viz_summary_list = sess.run(viz_op_list, feed_dict={ model.depth_image_to_show: lidar_per_batch[:6, :, :, [4]], model.label_to_show: label_image, model.pred_image_to_show: pred_image, }) # Add summaries summary_writer.add_summary(summary_str, step) for sum_str in iou_summary_list: summary_writer.add_summary(sum_str, step) for viz_sum in viz_summary_list: summary_writer.add_summary(viz_sum, step) # force tensorflow to synchronise summaries summary_writer.flush() else: print("Running Session") _, loss_value = sess.run([model.train_op, model.loss], options=run_options) print("Completed Session") duration = time.time() - start_time assert not np.isnan(loss_value), \ 'Model diverged. Total loss: {}, conf_loss: {}, bbox_loss: {}, ' \ 'class_loss: {}'.format(loss_value, conf_loss, bbox_loss, class_loss) if step % 10 == 0: num_images_per_step = mc.BATCH_SIZE images_per_sec = num_images_per_step / duration sec_per_batch = float(duration) format_str = ( '%s: step %d, loss = %.5f (%.1f images/sec; %.3f ' 'sec/batch)') print(format_str % (datetime.now(), step, loss_value, images_per_sec, sec_per_batch)) sys.stdout.flush() # Save the model checkpoint periodically. if step % FLAGS.checkpoint_step == 0 or step == FLAGS.max_steps - 1: print("Saving checkpoint") checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) print("Checkpoint saved at {}".format(checkpoint_path)) except Exception, e: coord.request_stop(e) finally:
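# visualize_seg() is imported from the repo's utils and not shown here. A
# minimal sketch of such a colorizer, assuming the model config carries a
# per-class RGB color table (called CLS_COLOR_MAP here, shape [NUM_CLASS, 3]):
import numpy as np

def visualize_seg_sketch(label_map, mc):
  """Maps a [batch, H, W] integer class map to a [batch, H, W, 3] color image."""
  color_map = np.asarray(mc.CLS_COLOR_MAP, dtype=np.float32)
  return color_map[label_map]  # fancy indexing broadcasts over batch and pixels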
def evaluate(): """Evaluate.""" assert FLAGS.dataset == 'KITTI', \ 'Currently only supports KITTI dataset' if FLAGS.gpu == '0': os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu else: os.environ['CUDA_VISIBLE_DEVICES'] = "" # Train only with CPU with tf.Graph().as_default() as g: assert FLAGS.net == 'squeezeSeg' or FLAGS.net == 'squeezeSeg32' or FLAGS.net == 'squeezeSeg16', \ 'Selected neural net architecture not supported: {}'.format(FLAGS.net) if FLAGS.net == 'squeezeSeg': if FLAGS.classes == 'ext': mc = kitti_squeezeSeg_config_ext() # Added ground class else: mc = kitti_squeezeSeg_config() # Original training set mc.LOAD_PRETRAINED_MODEL = False mc.BATCH_SIZE = 1 # TODO(bichen): fix this hard-coded batch size. model = SqueezeSeg(mc) elif FLAGS.net == 'squeezeSeg32': if FLAGS.classes == 'ext': mc = kitti_squeezeSeg32_config_ext() # Added ground class else: mc = kitti_squeezeSeg32_config() # Original training set mc.LOAD_PRETRAINED_MODEL = False mc.BATCH_SIZE = 1 # TODO(bichen): fix this hard-coded batch size. if FLAGS.crf == '1': model = SqueezeSeg32(mc) else: model = SqueezeSeg32x(mc) elif FLAGS.net == 'squeezeSeg16': if FLAGS.classes == 'ext': mc = kitti_squeezeSeg16_config_ext() # Added ground class else: mc = kitti_squeezeSeg16_config() # Original training set mc.LOAD_PRETRAINED_MODEL = False mc.BATCH_SIZE = 1 # TODO(bichen): fix this hard-coded batch size. if FLAGS.crf == '1': # Using conditional random fields (CRF) model = SqueezeSeg16(mc) else: # Disable CRF model = SqueezeSeg16x(mc) imdb = kitti(FLAGS.image_set, FLAGS.data_path, mc) eval_summary_ops = [] eval_summary_phs = {} eval_summary_names = [ 'Timing/read', 'Timing/detect', ] for i in range(1, mc.NUM_CLASS): eval_summary_names.append('Pixel_seg_accuracy/' + mc.CLASSES[i] + '_iou') eval_summary_names.append('Pixel_seg_accuracy/' + mc.CLASSES[i] + '_precision') eval_summary_names.append('Pixel_seg_accuracy/' + mc.CLASSES[i] + '_recall') for sm in eval_summary_names: ph = tf.placeholder(tf.float32) eval_summary_phs[sm] = ph eval_summary_ops.append(tf.summary.scalar(sm, ph)) saver = tf.train.Saver(model.model_params) summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g) ckpts = set() max_steps = int(FLAGS.max_steps) checkpoint_step = int(FLAGS.ckpt_step) count = 0 while True: if count > max_steps: return file = open(FLAGS.checkpoint_path + 'checkpoint', 'w') file.write("model_checkpoint_path: \"model.ckpt-" + str(count) + "\"") file.close() ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_path) count += checkpoint_step eval_once(saver, ckpt.model_checkpoint_path, summary_writer, eval_summary_ops, eval_summary_phs, imdb, model) print(count) time.sleep(FLAGS.eval_interval_secs)
def train(): """Train SqueezeSeg model""" assert FLAGS.dataset == 'KITTI', \ 'Currently only support KITTI dataset' if FLAGS.gpu == '0': os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu else: os.environ['CUDA_VISIBLE_DEVICES'] = "" # Train only with CPU with tf.Graph().as_default(): assert FLAGS.net == 'squeezeSeg' or FLAGS.net == 'squeezeSeg32' or FLAGS.net == 'squeezeSeg16', \ 'Selected neural net architecture not supported: {}'.format(FLAGS.net) if FLAGS.net == 'squeezeSeg': if FLAGS.classes == 'ext': mc = kitti_squeezeSeg_config_ext() # Added ground class else: mc = kitti_squeezeSeg_config() # Original training set mc.PRETRAINED_MODEL_PATH = FLAGS.pretrained_model_path model = SqueezeSeg(mc) elif FLAGS.net == 'squeezeSeg32': if FLAGS.classes == 'ext': mc = kitti_squeezeSeg32_config_ext() # Added ground class else: mc = kitti_squeezeSeg32_config() # Original training set mc.PRETRAINED_MODEL_PATH = FLAGS.pretrained_model_path if FLAGS.crf == '1': # Using conditional random fields (CRF) model = SqueezeSeg32(mc) else: # Disable CRF model = SqueezeSeg32x(mc) elif FLAGS.net == 'squeezeSeg16': if FLAGS.classes == 'ext': mc = kitti_squeezeSeg16_config_ext() # Added ground class else: mc = kitti_squeezeSeg16_config() # Original training set mc.PRETRAINED_MODEL_PATH = FLAGS.pretrained_model_path if FLAGS.crf == '1': # Using conditional random fields (CRF) model = SqueezeSeg16(mc) else: # Disable CRF model = SqueezeSeg16x(mc) imdb = kitti(FLAGS.image_set, FLAGS.data_path, mc) # save model size, flops, activations by layers with open(os.path.join(FLAGS.train_dir, 'model_metrics.txt'), 'w') as f: f.write('Number of parameter by layer:\n') count = 0 for c in model.model_size_counter: f.write('\t{}: {}\n'.format(c[0], c[1])) count += c[1] f.write('\ttotal: {}\n'.format(count)) count = 0 f.write('\nActivation size by layer:\n') for c in model.activation_counter: f.write('\t{}: {}\n'.format(c[0], c[1])) count += c[1] f.write('\ttotal: {}\n'.format(count)) count = 0 f.write('\nNumber of flops by layer:\n') for c in model.flop_counter: f.write('\t{}: {}\n'.format(c[0], c[1])) count += c[1] f.write('\ttotal: {}\n'.format(count)) f.close() print('Model statistics saved to {}.'.format( os.path.join(FLAGS.train_dir, 'model_metrics.txt'))) def enqueue(sess, coord): with coord.stop_on_exception(): while not coord.should_stop(): # read batch input lidar_per_batch, lidar_mask_per_batch, label_per_batch,\ weight_per_batch = imdb.read_batch() feed_dict = { model.ph_keep_prob: mc.KEEP_PROB, model.ph_lidar_input: lidar_per_batch, model.ph_lidar_mask: lidar_mask_per_batch, model.ph_label: label_per_batch, model.ph_loss_weight: weight_per_batch, } sess.run(model.enqueue_op, feed_dict=feed_dict) # Train form checkpoint ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) if ckpt == None: '''Creating a new Checkpoint''' saver = tf.train.Saver(tf.all_variables(), max_to_keep=None) summary_op = tf.summary.merge_all() init = tf.initialize_all_variables() sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) sess.run(init) global_step = 0 else: '''Restoring Checkpoint ''' var_list = tf.all_variables() new_var_list = [ variable for variable in var_list if "recurrent_crf" not in variable.name and "conv14_prob" not in variable.name ] try: '''Restoring all variables ''' check_point_path = ckpt.model_checkpoint_path global_step = int( float(check_point_path.split('/')[-1].split('-')[-1])) saver = tf.train.Saver(tf.all_variables(), max_to_keep=None) summary_op = tf.summary.merge_all() config = 
tf.ConfigProto(allow_soft_placement=True) sess = tf.Session(config=config) saver.restore(sess, check_point_path) except tf.errors.InvalidArgumentError: '''Restoring only variables with matching shapes, other variables are randomly initialized''' print( "###########Number of output channels/labels different from checkpoint. Not restoring the Recurrent CRF Layer and conv14 layer###########" ) check_point_path = ckpt.model_checkpoint_path global_step = int( float(check_point_path.split('/')[-1].split('-')[-1])) saver = tf.train.Saver(new_var_list, max_to_keep=None) summary_op = tf.summary.merge_all() config = tf.ConfigProto(allow_soft_placement=True) sess = tf.Session(config=config) saver.restore(sess, check_point_path) '''initializing CRF parameters and conv14 layer''' r_crf_var_list = [ variable for variable in var_list if "recurrent_crf" in variable.name or "conv14_prob" in variable.name ] init_new_vars_op = tf.initialize_variables(r_crf_var_list) sess.run(init_new_vars_op) '''Setting up global saver''' saver = tf.train.Saver(tf.all_variables(), max_to_keep=None) summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph) coord = tf.train.Coordinator() enq_threads = [] for _ in range(mc.NUM_ENQUEUE_THREAD): eqth = threading.Thread(target=enqueue, args=[sess, coord]) eqth.start() enq_threads.append(eqth) run_options = tf.RunOptions(timeout_in_ms=60000) try: for step in xrange(FLAGS.max_steps): start_time = time.time() if step % FLAGS.summary_step == 0 or step == FLAGS.max_steps - 1: op_list = [ model.lidar_input, model.lidar_mask, model.label, model.train_op, model.loss, model.pred_cls, summary_op ] lidar_per_batch, lidar_mask_per_batch, label_per_batch, \ _, loss_value, pred_cls, summary_str = sess.run(op_list, options=run_options) label_image = visualize_seg(label_per_batch[:6, :, :], mc) pred_image = visualize_seg(pred_cls[:6, :, :], mc) # Run evaluation on the batch ious, _, _, _ = evaluate_iou( label_per_batch, pred_cls * np.squeeze(lidar_mask_per_batch), mc.NUM_CLASS) feed_dict = {} # Assume that class-0 is the background class for i in range(1, mc.NUM_CLASS): feed_dict[model.iou_summary_placeholders[i]] = ious[i] iou_summary_list = sess.run(model.iou_summary_ops[1:], feed_dict) # Run visualization viz_op_list = [ model.show_label, model.show_depth_img, model.show_pred ] viz_summary_list = sess.run(viz_op_list, feed_dict={ model.depth_image_to_show: lidar_per_batch[:6, :, :, [4]], model.label_to_show: label_image, model.pred_image_to_show: pred_image, }) # Add summaries summary_writer.add_summary(summary_str, step) for sum_str in iou_summary_list: summary_writer.add_summary(sum_str, step) for viz_sum in viz_summary_list: summary_writer.add_summary(viz_sum, step) # force tensorflow to synchronise summaries summary_writer.flush() else: _, loss_value = sess.run([model.train_op, model.loss], options=run_options) duration = time.time() - start_time assert not np.isnan(loss_value), \ 'Model diverged. Total loss: {}, conf_loss: {}, bbox_loss: {}, ' \ 'class_loss: {}'.format(loss_value, conf_loss, bbox_loss, class_loss) if step % 10 == 0: num_images_per_step = mc.BATCH_SIZE images_per_sec = num_images_per_step / duration sec_per_batch = float(duration) format_str = ( '%s: step %d, loss = %.2f (%.1f images/sec; %.3f ' 'sec/batch)') print(format_str % (datetime.now(), step, loss_value, images_per_sec, sec_per_batch)) sys.stdout.flush() # Save the model checkpoint periodically. 
if step % FLAGS.checkpoint_step == 0 or step == FLAGS.max_steps - 1: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) except Exception, e: coord.request_stop(e) finally:
def train():
  """Train RSnet model"""
  assert FLAGS.dataset == 'KITTI', \
      'Currently only support KITTI dataset'

  os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

  with tf.Graph().as_default():
    assert FLAGS.net == 'RSnet', \
        'Selected neural net architecture not supported: {}'.format(FLAGS.net)
    if FLAGS.net == 'RSnet':
      mc = kitti_RSnet_config()
      mc.PRETRAINED_MODEL_PATH = FLAGS.pretrained_model_path
      model = RSnet(mc)

    imdb = kitti(FLAGS.image_set, FLAGS.data_path, mc)

    # save model size, flops, activations by layers
    with open(os.path.join(FLAGS.train_dir, 'model_metrics.txt'), 'w') as f:
      f.write('Number of parameter by layer:\n')
      count = 0
      for c in model.model_size_counter:
        f.write('\t{}: {}\n'.format(c[0], c[1]))
        count += c[1]
      f.write('\ttotal: {}\n'.format(count))

      count = 0
      f.write('\nActivation size by layer:\n')
      for c in model.activation_counter:
        f.write('\t{}: {}\n'.format(c[0], c[1]))
        count += c[1]
      f.write('\ttotal: {}\n'.format(count))

      count = 0
      f.write('\nNumber of flops by layer:\n')
      for c in model.flop_counter:
        f.write('\t{}: {}\n'.format(c[0], c[1]))
        count += c[1]
      f.write('\ttotal: {}\n'.format(count))
      f.close()
    print('Model statistics saved to {}.'.format(
        os.path.join(FLAGS.train_dir, 'model_metrics.txt')))

    def enqueue(sess, coord):
      with coord.stop_on_exception():
        while not coord.should_stop():
          # read batch input
          lidar_per_batch, lidar_mask_per_batch, label_per_batch, \
              weight_per_batch = imdb.read_batch()
          feed_dict = {
              model.ph_keep_prob: mc.KEEP_PROB,
              model.ph_lidar_input: lidar_per_batch,
              model.ph_lidar_mask: lidar_mask_per_batch,
              model.ph_label: label_per_batch,
              model.ph_loss_weight: weight_per_batch,
          }
          sess.run(model.enqueue_op, feed_dict)

    saver = tf.train.Saver(tf.all_variables())
    # merge all summaries so they can be shown in TensorBoard
    summary_op = tf.summary.merge_all()
    init = tf.initialize_all_variables()

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    sess.run(init)

    summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

    # threads
    coord = tf.train.Coordinator()
    enq_threads = []
    for _ in range(mc.NUM_ENQUEUE_THREAD):
      eqth = threading.Thread(target=enqueue, args=[sess, coord])
      eqth.start()
      enq_threads.append(eqth)

    run_options = tf.RunOptions(timeout_in_ms=60000)  # TODO

    try:
      for step in xrange(FLAGS.max_steps):
        start_time = time.time()

        if step % FLAGS.summary_step == 0 or step == FLAGS.max_steps - 1:
          op_list = [
              model.lidar_input, model.lidar_mask, model.label, model.train_op,
              model.loss, model.pred_cls, summary_op
          ]
          lidar_per_batch, lidar_mask_per_batch, label_per_batch, \
              _, loss_value, pred_cls, summary_str = sess.run(
                  op_list, options=run_options)