def _restore_chkpt(self):
    # Load the latest checkpoints available
    trainer_utils.load_checkpoints(self.checkpoint_dir, self._saver)
    num_checkpoints = len(self._saver.last_checkpoints)

    # Select the last checkpoint
    ckpt_idx = num_checkpoints - 1
    return self._saver.last_checkpoints[ckpt_idx]
def run_latest_checkpoints(self):
    """Evaluation function for evaluating all the existing checkpoints.

    This function just runs through all the existing checkpoints.

    Raises:
        ValueError: if model.checkpoint_dir doesn't have at least one
            element.
    """
    if not os.path.exists(self.checkpoint_dir):
        raise ValueError(
            '{} must have at least one checkpoint entry.'.format(
                self.checkpoint_dir))

    # Load the latest checkpoints available
    trainer_utils.load_checkpoints(self.checkpoint_dir, self._saver)

    num_checkpoints = len(self._saver.last_checkpoints)

    if self.skip_evaluated_checkpoints:
        already_evaluated_ckpts = self.get_evaluated_ckpts(
            self.model_config, self.model_name)
    else:
        # Avoid a NameError below when no checkpoints are skipped
        already_evaluated_ckpts = []

    ckpt_indices = np.asarray(self.eval_config.ckpt_indices)
    if ckpt_indices is not None:
        if ckpt_indices[0] == -1:
            # Restore the most recent checkpoint
            ckpt_idx = num_checkpoints - 1
            ckpt_indices = [ckpt_idx]
            print(ckpt_idx, num_checkpoints, ckpt_indices)
        for ckpt_idx in ckpt_indices:
            checkpoint_to_restore = self._saver.last_checkpoints[ckpt_idx]
            self.run_checkpoint_once(checkpoint_to_restore)
    else:
        last_checkpoint_id = -1
        number_of_evaluations = 0
        # Go through all existing checkpoints
        for ckpt_idx in range(num_checkpoints):
            checkpoint_to_restore = self._saver.last_checkpoints[ckpt_idx]
            ckpt_id = evaluator_utils.strip_checkpoint_id(
                checkpoint_to_restore)

            # Check if checkpoint has been evaluated already
            already_evaluated = ckpt_id in already_evaluated_ckpts
            if already_evaluated or ckpt_id <= last_checkpoint_id:
                number_of_evaluations = max(
                    (ckpt_idx + 1, number_of_evaluations))
                continue

            self.run_checkpoint_once(checkpoint_to_restore)
            number_of_evaluations += 1

            # Save the id of the latest evaluated checkpoint
            last_checkpoint_id = ckpt_id
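# Usage sketch for the evaluator above. This is an illustration only: the
# wiring mirrors the commented-out calls in the test script further down
# (Evaluator(model, dataset_config, eval_config)); the exact constructor
# arguments and import paths depend on your checkout.
#
#   with tf.Graph().as_default():
#       model = RpnModel(model_config,
#                        train_val_test=eval_config.eval_mode,
#                        dataset=dataset)
#       model_evaluator = Evaluator(model, dataset_config, eval_config)
#       # Evaluates the checkpoints selected by eval_config.ckpt_indices
#       # ([-1] means only the most recent checkpoint).
#       model_evaluator.run_latest_checkpoints()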
def train(model, train_config):
    """Training function for detection models.

    Args:
        model: The detection model object.
        train_config: a train_*pb2 protobuf.
    """
    model = model
    train_config = train_config

    # Get model configurations
    model_config = model.model_config

    # Create a variable tensor to hold the global step
    global_step_tensor = tf.Variable(
        0, trainable=False, name='global_step')

    #############################
    # Get training configurations
    #############################
    max_iterations = train_config.max_iterations
    summary_interval = train_config.summary_interval
    checkpoint_interval = train_config.checkpoint_interval
    max_checkpoints = train_config.max_checkpoints_to_keep

    paths_config = model_config.paths_config
    logdir = paths_config.logdir
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    checkpoint_dir = paths_config.checkpoint_dir
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    checkpoint_path = checkpoint_dir + '/' + model_config.checkpoint_name

    global_summaries = set([])

    # The model should return a dictionary of predictions
    prediction_dict = model.build()

    # WZN: for debug only
    # img_input_debug = model._rpn_model._img_preprocessed
    # bev_input_debug = model._rpn_model._bev_preprocessed
    # bev_pooled_debug = model._rpn_model.bev_input_pooled
    # img_pooled_debug = model._rpn_model.img_input_pooled
    # import numpy as np
    # import matplotlib.pyplot as plt

    summary_histograms = train_config.summary_histograms
    summary_img_images = train_config.summary_img_images
    summary_bev_images = train_config.summary_bev_images

    ##############################
    # Setup loss
    ##############################
    losses_dict, total_loss = model.loss(prediction_dict)

    # Optimizer
    training_optimizer = optimizer_builder.build(
        train_config.optimizer, global_summaries, global_step_tensor)

    # Create the train op
    with tf.variable_scope('train_op'):
        train_op = slim.learning.create_train_op(
            total_loss,
            training_optimizer,
            clip_gradient_norm=1.0,
            global_step=global_step_tensor)

    # Save checkpoints regularly.
    saver = tf.train.Saver(max_to_keep=max_checkpoints, pad_step_number=True)

    # Add the result of the train_op to the summary
    tf.summary.scalar("training_loss", train_op)

    # Add maximum memory usage summary op
    # This op can only be run on a device with a gpu,
    # so it's skipped on travis
    is_travis = 'TRAVIS' in os.environ
    if not is_travis:
        # tf.summary.scalar('bytes_in_use',
        #                   tf.contrib.memory_stats.BytesInUse())
        tf.summary.scalar('max_bytes',
                          tf.contrib.memory_stats.MaxBytesInUse())

    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
    summary_merged = summary_utils.summaries_to_keep(
        summaries,
        global_summaries,
        histograms=summary_histograms,
        input_imgs=summary_img_images,
        input_bevs=summary_bev_images)

    allow_gpu_mem_growth = train_config.allow_gpu_mem_growth
    if allow_gpu_mem_growth:
        # GPU memory config
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = allow_gpu_mem_growth
        sess = tf.Session(config=config)
    else:
        sess = tf.Session()

    # Create unique folder name using datetime for summary writer
    datetime_str = str(datetime.datetime.now())
    logdir = logdir + '/train'
    train_writer = tf.summary.FileWriter(logdir + '/' + datetime_str,
                                         sess.graph)

    # Create init op
    init = tf.global_variables_initializer()

    # Continue from last saved checkpoint
    if not train_config.overwrite_checkpoints:
        trainer_utils.load_checkpoints(checkpoint_dir, saver)
        if len(saver.last_checkpoints) > 0:
            checkpoint_to_restore = saver.last_checkpoints[-1]
            saver.restore(sess, checkpoint_to_restore)
        else:
            # Initialize the variables
            sess.run(init)
    else:
        # Initialize the variables
        sess.run(init)

    # Read the global step if restored
    global_step = tf.train.global_step(sess, global_step_tensor)
    print('Starting from step {} / {}'.format(global_step, max_iterations))

    # Main Training Loop
    last_time = time.time()
    for step in range(global_step, max_iterations + 1):

        # Save checkpoint
        if step % checkpoint_interval == 0:
            global_step = tf.train.global_step(sess, global_step_tensor)
            saver.save(sess,
                       save_path=checkpoint_path,
                       global_step=global_step)
            print('Step {} / {}, Checkpoint saved to {}-{:08d}'.format(
                step, max_iterations, checkpoint_path, global_step))

        # Create feed_dict for inferencing
        feed_dict = model.create_feed_dict()

        # WZN: only run for debug
        # bev_p, img_p = sess.run([bev_pooled_debug, img_pooled_debug],
        #                         feed_dict)
        # bev_p = np.squeeze(bev_p)
        # img_p = np.squeeze(img_p)
        # import pdb
        # pdb.set_trace()

        # Write summaries and train op
        if step % summary_interval == 0:
            current_time = time.time()
            time_elapsed = current_time - last_time
            last_time = current_time

            train_op_loss, summary_out = sess.run(
                [train_op, summary_merged], feed_dict=feed_dict)

            print('Step {}, Total Loss {:0.3f}, '
                  'Time Elapsed {:0.3f} s'.format(
                      step, train_op_loss, time_elapsed))
            train_writer.add_summary(summary_out, step)
        else:
            # Run the train op only
            sess.run(train_op, feed_dict)

    # Close the summary writers
    train_writer.close()
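# Usage sketch for train(). This assumes model_config, train_config, and a
# dataset object have already been parsed; the call pattern mirrors
# test_load_model_weights at the end of this section, and the class/import
# names are taken from elsewhere in this code, not defined here.
#
#   with tf.Graph().as_default():
#       model = RpnModel(model_config,
#                        train_val_test='train',
#                        dataset=dataset)
#       train(model, train_config)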
def repeated_checkpoint_run(self):
    """Periodically evaluates the checkpoints inside the `checkpoint_dir`.

    This function evaluates all the existing checkpoints as they are being
    generated. If there are none, it sleeps until new checkpoints become
    available. Since there is no synchronization guarantee for the trainer
    and evaluator, at each iteration it reloads all the checkpoints and
    searches for the last checkpoint to continue from. This is meant to be
    called in parallel to the trainer to evaluate the models regularly.

    Raises:
        ValueError: if model.checkpoint_dir doesn't have at least one
            element.
    """
    if not os.path.exists(self.checkpoint_dir):
        raise ValueError(
            '{} must have at least one checkpoint entry.'.format(
                self.checkpoint_dir))

    # Copy kitti native eval code into the predictions folder
    if self.do_kitti_native_eval:
        evaluator_utils.copy_kitti_native_code(
            self.model_config.checkpoint_name)

    if self.skip_evaluated_checkpoints:
        already_evaluated_ckpts = self.get_evaluated_ckpts(
            self.model_config, self.full_model)
    else:
        # Avoid a NameError below when no checkpoints are skipped
        already_evaluated_ckpts = []

    tf.logging.info('Starting evaluation at ' +
                    time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))

    last_checkpoint_id = -1
    number_of_evaluations = 0
    while True:
        # Load current checkpoints available
        trainer_utils.load_checkpoints(self.checkpoint_dir, self._saver)
        num_checkpoints = len(self._saver.last_checkpoints)

        start = time.time()

        if number_of_evaluations >= num_checkpoints:
            tf.logging.info(
                'No new checkpoints found in %s. '
                'Will try again in %d seconds',
                self.checkpoint_dir,
                self.eval_wait_interval)
        else:
            for ckpt_idx in range(num_checkpoints):
                checkpoint_to_restore = \
                    self._saver.last_checkpoints[ckpt_idx]
                ckpt_id = evaluator_utils.strip_checkpoint_id(
                    checkpoint_to_restore)

                # Check if checkpoint has been evaluated already
                already_evaluated = ckpt_id in already_evaluated_ckpts
                if already_evaluated or ckpt_id <= last_checkpoint_id:
                    number_of_evaluations = max(
                        (ckpt_idx + 1, number_of_evaluations))
                    continue

                self.run_checkpoint_once(checkpoint_to_restore)
                number_of_evaluations += 1

                # Save the id of the latest evaluated checkpoint
                last_checkpoint_id = ckpt_id

        time_to_next_eval = start + self.eval_wait_interval - time.time()
        if time_to_next_eval > 0:
            time.sleep(time_to_next_eval)
def run_profiler(pipeline_config_path, run_mode, data_split, ckpt_index):
    avod_top_dir = avod.top_dir()
    # Timeline results logfile
    file_name = avod_top_dir + '/scripts/profilers/tf_profiler/' + \
        'tf_timeline_output.json'

    with tf.Session() as sess:
        if run_mode == 'train':
            # In train mode, data_split should not be 'test' as the test
            # split does not have gt.
            if data_split == 'test':
                raise ValueError('Data split can only be train or val '
                                 'in train mode.')
            model, train_op = set_up_model_train_mode(pipeline_config_path,
                                                      data_split)
            init = tf.global_variables_initializer()
            sess.run(init)
        elif run_mode == 'test':
            model, model_config = set_up_model_test_mode(
                pipeline_config_path, data_split)
            paths_config = model_config.paths_config
            checkpoint_dir = paths_config.checkpoint_dir

            prediction_dict = model.build()

            # Load the weights
            saver = tf.train.Saver()
            trainer_utils.load_checkpoints(checkpoint_dir, saver)
            if not saver.last_checkpoints:
                raise ValueError('Need existing checkpoints to run '
                                 'in test_mode')
            checkpoint_to_restore = saver.last_checkpoints[ckpt_index]
            saver.restore(sess, checkpoint_to_restore)
        else:
            raise ValueError('Invalid run_mode {}'.format(run_mode))

        feed_dict = model.create_feed_dict()

        ############################################
        # Parameters and Shapes
        ############################################
        graph = tf.get_default_graph()

        # Print trainable variable parameter statistics to stdout.
        ProfileOptionBuilder = tf.profiler.ProfileOptionBuilder

        # Gives the total number of trainable parameters
        param_stats = tf.profiler.profile(
            graph,
            options=ProfileOptionBuilder.trainable_variables_parameter())

        # Gives the FLOPS for the ops
        tf.profiler.profile(
            graph,
            options=tf.profiler.ProfileOptionBuilder.float_operation())

        run_metadata = tf.RunMetadata()
        if run_mode == 'train':
            sess.run(
                [train_op],
                options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
                run_metadata=run_metadata,
                feed_dict=feed_dict)
        else:
            # Run in test mode
            sess.run(
                prediction_dict,
                options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
                run_metadata=run_metadata,
                feed_dict=feed_dict)

        # The profiler gives us rounded FLOP counts,
        # so instead query it directly and count the total
        op_missing_shape = 0
        # op_missing_shape_names = []
        total_flops = 0
        for op in graph.get_operations():
            try:
                stats = ops.get_stats_for_node_def(graph, op.node_def,
                                                   'flops')
                if stats.value:
                    total_flops += stats.value
            except ValueError:
                op_missing_shape += 1
                # op_missing_shape_names.append(op.name)

        print('=============================================================')
        print('Number of ops with missing shape: ', op_missing_shape)
        print('=============================================================')

        ############################################
        # Log Time and Memory
        ############################################
        # Log the analysis to file
        # 'code' view organizes profile using Python call stack
        opts = ProfileOptionBuilder(
            ProfileOptionBuilder.time_and_memory()).with_timeline_output(
                file_name).build()

        tf.profiler.profile(graph,
                            run_meta=run_metadata,
                            cmd='code',
                            options=opts)

        ############################################
        # Show Time and Memory on the console
        ############################################
        tf.profiler.profile(
            graph,
            run_meta=run_metadata,
            cmd='op',
            options=tf.profiler.ProfileOptionBuilder.time_and_memory())

        # Print the total number of parameters
        print('Total params: %d' % param_stats.total_parameters)
        print('Total FLOPs: ', total_flops)
        print('=============================================================')
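# Usage sketch for run_profiler(). The config path below is a placeholder;
# the argument names come from the function definition above. ckpt_index is
# only used in 'test' mode, where -1 selects the most recent checkpoint.
#
#   pipeline_config_path = 'path/to/pipeline.config'  # hypothetical path
#
#   # Profile a single training step:
#   run_profiler(pipeline_config_path, run_mode='train',
#                data_split='train', ckpt_index=-1)
#
#   # Profile inference on the latest checkpoint:
#   run_profiler(pipeline_config_path, run_mode='test',
#                data_split='val', ckpt_index=-1)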
def train(model, train_config):
    """Training function for detection models.

    Args:
        model: The detection model object.
        train_config: a train_*pb2 protobuf.
    """
    model = model
    train_config = train_config

    # Get model configurations
    model_config = model.model_config

    # Create a variable tensor to hold the global step
    global_step_tensor = tf.Variable(0, trainable=False, name='global_step')

    #############################
    # Get training configurations
    #############################
    max_iterations = train_config.max_iterations
    summary_interval = train_config.summary_interval
    checkpoint_interval = train_config.checkpoint_interval
    max_checkpoints = train_config.max_checkpoints_to_keep

    paths_config = model_config.paths_config
    logdir = paths_config.logdir
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    checkpoint_dir = paths_config.checkpoint_dir
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    checkpoint_path = checkpoint_dir + '/' + model_config.checkpoint_name

    global_summaries = set([])

    # The model should return a dictionary of predictions
    prediction_dict = model.build()

    summary_histograms = train_config.summary_histograms
    summary_img_images = train_config.summary_img_images
    summary_bev_images = train_config.summary_bev_images

    ##############################
    # Setup loss
    ##############################
    loss_dict, total_loss, rpn_score_2d_loss, \
        rpn_acc_score_neg, rpn_acc_score_pos, \
        rpn_class_loss, rpn_reg_loss, rpn_acc_all, \
        rpn_acc_pos, refine_class_loss, refine_reg_loss, \
        avod_acc_all, avod_acc_pos = model.loss(prediction_dict)

    # Optimizer
    training_optimizer = optimizer_builder.build(train_config.optimizer,
                                                 global_summaries,
                                                 global_step_tensor)

    # Create the train op
    with tf.variable_scope('train_op'):
        train_op = slim.learning.create_train_op(
            total_loss,
            training_optimizer,
            clip_gradient_norm=1.0,
            global_step=global_step_tensor)

    # Save checkpoints regularly.
    saver = tf.train.Saver(max_to_keep=max_checkpoints, pad_step_number=True)

    # Add the result of the train_op to the summary
    tf.summary.scalar("training_loss", train_op)

    # Add maximum memory usage summary op
    # This op can only be run on a device with a gpu,
    # so it's skipped on travis
    is_travis = 'TRAVIS' in os.environ
    if not is_travis:
        # tf.summary.scalar('bytes_in_use',
        #                   tf.contrib.memory_stats.BytesInUse())
        tf.summary.scalar('max_bytes',
                          tf.contrib.memory_stats.MaxBytesInUse())

    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
    summary_merged = summary_utils.summaries_to_keep(
        summaries,
        global_summaries,
        histograms=summary_histograms,
        input_imgs=summary_img_images,
        input_bevs=summary_bev_images)

    allow_gpu_mem_growth = train_config.allow_gpu_mem_growth
    if allow_gpu_mem_growth:
        # GPU memory config
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = allow_gpu_mem_growth
        sess = tf.Session(config=config)
    else:
        sess = tf.Session()

    # Create unique folder name using datetime for summary writer
    datetime_str = str(datetime.datetime.now())
    logdir = logdir + '/train'
    train_writer = tf.summary.FileWriter(logdir + '/' + datetime_str,
                                         sess.graph)

    # Create init op
    init = tf.global_variables_initializer()

    # Offset between the checkpoint's global step and this run's step counter
    step_base = 0

    # Continue from last saved checkpoint
    if not train_config.overwrite_checkpoints:
        trainer_utils.load_checkpoints(checkpoint_dir, saver)

        all_variables = tf.get_collection_ref(tf.GraphKeys.GLOBAL_VARIABLES)
        sess.run(tf.variables_initializer(all_variables))

        # Restore everything except optimizer / moving average variables
        var_list = [
            var for var in all_variables
            if "beta" not in var.name and 'Adam' not in var.name
            and 'Average' not in var.name
        ]
        saver_part = tf.train.Saver(var_list=var_list)

        if train_config.use_pretrained:
            saver_part.restore(sess, train_config.pretrained)
            print('Model loaded from: {}'.format(train_config.pretrained))
            step_base = tf.train.global_step(sess, global_step_tensor)
        elif len(saver.last_checkpoints) > 0:
            checkpoint_to_restore = saver.last_checkpoints[-1]
            saver_part.restore(sess, checkpoint_to_restore)
            step_base = 0
        else:
            # Initialize the variables
            sess.run(init)
    else:
        # Initialize the variables
        sess.run(init)

    global_step = tf.train.global_step(sess, global_step_tensor) - step_base
    print('Starting from step {} / {}'.format(global_step, max_iterations))

    # Main Training Loop
    last_time = time.time()
    for step in range(global_step, max_iterations + 1):

        # Save checkpoint
        if step % checkpoint_interval == 0:
            global_step = tf.train.global_step(sess,
                                               global_step_tensor) - step_base
            saver.save(sess,
                       save_path=checkpoint_path,
                       global_step=global_step)
            print('Step {} / {}, Checkpoint saved to {}-{:08d}'.format(
                step, max_iterations, checkpoint_path, global_step))

        # Create feed_dict for inferencing
        feed_dict = model.create_feed_dict()

        # Write summaries and train op
        if step % summary_interval == 0:
            current_time = time.time()
            time_elapsed = current_time - last_time
            last_time = current_time

            # train_op_loss, summary_out = sess.run(
            #     [train_op, summary_merged], feed_dict=feed_dict)
            train_op_loss, summary_out, \
                rpn_score_2d_loss_np, \
                rpn_acc_score_neg_np, \
                rpn_acc_score_pos_np, \
                rpn_class_loss_np, rpn_reg_loss_np, \
                rpn_acc_all_np, rpn_acc_pos_np, \
                refine_class_loss_np, refine_reg_loss_np, \
                avod_acc_all_np, avod_acc_pos_np = sess.run(
                    [train_op, summary_merged,
                     rpn_score_2d_loss,
                     rpn_acc_score_neg, rpn_acc_score_pos,
                     rpn_class_loss, rpn_reg_loss,
                     rpn_acc_all, rpn_acc_pos,
                     refine_class_loss, refine_reg_loss,
                     avod_acc_all, avod_acc_pos],
                    feed_dict=feed_dict)

            print('Step {}, Total Loss {:0.3f} | '
                  'Score {:0.3f}, Acc {:0.2f} {:0.2f} | '
                  'RPN Class {:0.3f}, Reg {:0.3f}, Acc {:0.2f} {:0.2f} | '
                  'Final Class {:0.3f}, Reg {:0.3f}, '
                  'Acc {:0.2f} {:0.2f}'.format(
                      step, train_op_loss, rpn_score_2d_loss_np,
                      rpn_acc_score_neg_np * 100, rpn_acc_score_pos_np * 100,
                      rpn_class_loss_np, rpn_reg_loss_np,
                      rpn_acc_all_np * 100, rpn_acc_pos_np * 100,
                      refine_class_loss_np, refine_reg_loss_np,
                      avod_acc_all_np * 100, avod_acc_pos_np * 100))
            train_writer.add_summary(summary_out, step)
        else:
            # Run the train op only
            sess.run(train_op, feed_dict)

    # Close the summary writers
    train_writer.close()
def test(model_config, eval_config, dataset_config, data_split, ckpt_indices):

    # Overwrite the defaults
    dataset_config = config_builder.proto_to_obj(dataset_config)

    dataset_config.data_split = data_split
    dataset_config.data_split_dir = 'training'
    if data_split == 'test':
        dataset_config.data_split_dir = 'testing'

    eval_config.eval_mode = 'test'
    eval_config.evaluate_repeatedly = False

    dataset_config.has_labels = False

    # Enable this to see the actual memory being used
    eval_config.allow_gpu_mem_growth = True

    eval_config = config_builder.proto_to_obj(eval_config)
    # Grab the checkpoint indices to evaluate
    eval_config.ckpt_indices = ckpt_indices

    # Remove augmentation during evaluation in test mode
    dataset_config.aug_list = []

    # Build the dataset object
    dataset = DatasetBuilder.build_kitti_dataset(dataset_config,
                                                 use_defaults=False)

    # Setup the model
    model_name = model_config.model_name
    # Overwrite repeated field
    model_config = config_builder.proto_to_obj(model_config)
    # Switch path drop off during evaluation
    model_config.path_drop_probabilities = [1.0, 1.0]

    with tf.Graph().as_default():
        if model_name == 'avod_model':
            model = AvodModel(model_config,
                              train_val_test=eval_config.eval_mode,
                              dataset=dataset)
        elif model_name == 'rpn_model':
            model = RpnModel(model_config,
                             train_val_test=eval_config.eval_mode,
                             dataset=dataset)
        else:
            raise ValueError('Invalid model name {}'.format(model_name))

        # model_evaluator = Evaluator(model, dataset_config, eval_config)
        # model_evaluator.run_latest_checkpoints()

        # Create a variable tensor to hold the global step
        global_step_tensor = tf.Variable(0,
                                         trainable=False,
                                         name='global_step')

        allow_gpu_mem_growth = eval_config.allow_gpu_mem_growth
        if allow_gpu_mem_growth:
            # GPU memory config
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = allow_gpu_mem_growth
            _sess = tf.Session(config=config)
        else:
            _sess = tf.Session()

        _prediction_dict = model.build()
        _saver = tf.train.Saver()

        trainer_utils.load_checkpoints(
            model_config.paths_config.checkpoint_dir, _saver)
        num_checkpoints = len(_saver.last_checkpoints)
        print("test:", num_checkpoints)
        checkpoint_to_restore = _saver.last_checkpoints[num_checkpoints - 1]

        _saver.restore(_sess, checkpoint_to_restore)

        num_samples = model.dataset.num_samples
        num_valid_samples = 0

        current_epoch = model.dataset.epochs_completed
        while current_epoch == model.dataset.epochs_completed:
            # Keep track of feed_dict speed
            start_time = time.time()
            feed_dict = model.create_feed_dict()
            feed_dict_time = time.time() - start_time

            # Get sample name from model
            sample_name = model.sample_info['sample_name']

            num_valid_samples += 1
            print("Step: {} / {}, Inference on sample {}".format(
                num_valid_samples, num_samples, sample_name))

            print("test mode")
            inference_start_time = time.time()
            # Don't calculate loss or run summaries for test
            predictions = _sess.run(_prediction_dict, feed_dict=feed_dict)
            inference_time = time.time() - inference_start_time
            print("inference time:", inference_time)

            predictions_and_scores = \
                get_avod_predicted_boxes_3d_and_scores(predictions)

            # print(predictions_and_scores)
            # im_path = os.path.join(
            #     dataset_dir, 'training/image_2/{:06d}.png'.format(img_idx))
            # im = cv2.imread(im_path)
            # cv2.imshow('result', im)
            # cv2.waitKey(30)

            prediction_boxes_3d = predictions_and_scores[:, 0:7]
            prediction_scores = predictions_and_scores[:, 7]
            prediction_class_indices = predictions_and_scores[:, 8]
            gt_classes = ['Car']
            fig_size = (10, 6.1)

            avod_score_threshold = 0.1
            if len(prediction_boxes_3d) > 0:
                # Apply score mask
                avod_score_mask = prediction_scores >= avod_score_threshold
                prediction_boxes_3d = prediction_boxes_3d[avod_score_mask]
                prediction_scores = prediction_scores[avod_score_mask]
                prediction_class_indices = \
                    prediction_class_indices[avod_score_mask]

            if len(prediction_boxes_3d) > 0:
                dataset_dir = model.dataset.dataset_dir
                sample_name = (model.dataset.sample_names[
                    model.dataset._index_in_epoch - 1])
                img_idx = int(sample_name)
                print("frame_index", img_idx)
                image_path = model.dataset.get_rgb_image_path(sample_name)
                image = Image.open(image_path)
                image_size = image.size

                if model.dataset.has_labels:
                    gt_objects = obj_utils.read_labels(dataset.label_dir,
                                                       img_idx)
                else:
                    gt_objects = []

                filtered_gt_objs = model.dataset.kitti_utils.filter_labels(
                    gt_objects, classes=gt_classes)

                stereo_calib = calib_utils.read_calibration(dataset.calib_dir,
                                                            img_idx)
                calib_p2 = stereo_calib.p2

                # Project the 3D box predictions to image space
                image_filter = []
                final_boxes_2d = []
                for i in range(len(prediction_boxes_3d)):
                    box_3d = prediction_boxes_3d[i, 0:7]
                    img_box = box_3d_projector.project_to_image_space(
                        box_3d,
                        calib_p2,
                        truncate=True,
                        image_size=image_size,
                        discard_before_truncation=False)
                    if img_box is not None:
                        image_filter.append(True)
                        final_boxes_2d.append(img_box)
                    else:
                        image_filter.append(False)
                final_boxes_2d = np.asarray(final_boxes_2d)
                final_prediction_boxes_3d = prediction_boxes_3d[image_filter]
                final_scores = prediction_scores[image_filter]
                final_class_indices = prediction_class_indices[image_filter]

                num_of_predictions = final_boxes_2d.shape[0]

                # Convert to objs
                final_prediction_objs = \
                    [box_3d_encoder.box_3d_to_object_label(
                        prediction, obj_type='Prediction')
                     for prediction in final_prediction_boxes_3d]
                for (obj, score) in zip(final_prediction_objs, final_scores):
                    obj.score = score

                pred_fig, pred_2d_axes, pred_3d_axes = \
                    vis_utils.visualization(dataset.rgb_image_dir,
                                            img_idx,
                                            display=False,
                                            fig_size=fig_size)

                draw_predictions(filtered_gt_objs,
                                 calib_p2,
                                 num_of_predictions,
                                 final_prediction_objs,
                                 final_class_indices,
                                 final_boxes_2d,
                                 pred_2d_axes,
                                 pred_3d_axes,
                                 True,
                                 True,
                                 gt_classes,
                                 False)

                # cv2.imshow('result', pred_fig)
                print(type(pred_fig))
                pred_fig.canvas.draw()
                img = np.fromstring(pred_fig.canvas.tostring_rgb(),
                                    dtype=np.uint8, sep='')
                img = img.reshape(
                    pred_fig.canvas.get_width_height()[::-1] + (3,))
                cv2.imshow('result', img)

                # Draw bird's eye view
                kitti_utils = model.dataset.kitti_utils
                print(img.shape[0:2])
                point_cloud = kitti_utils.get_point_cloud(
                    'lidar', img_idx, (370, 1242))
                ground_plane = kitti_utils.get_ground_plane(sample_name)
                bev_images = kitti_utils.create_bev_maps(point_cloud,
                                                         ground_plane)
                density_map = np.array(bev_images.get("density_map"))
                _, box_points_norm = box_3d_projector.project_to_bev(
                    final_prediction_boxes_3d, [[-40, 40], [0, 70]])
                density_map = draw_boxes(density_map, box_points_norm)
                cv2.imshow('lidar', density_map)
                cv2.waitKey(-1)
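# Usage sketch for test(). The configs normally come from a pipeline .config
# file; the helper name (config_builder.get_configs_from_pipeline_file) and
# the file path below are assumptions, so substitute whatever your checkout
# uses to produce model_config, eval_config, and dataset_config.
#
#   model_config, _, eval_config, dataset_config = \
#       config_builder.get_configs_from_pipeline_file(  # assumed helper
#           'path/to/pipeline.config', is_training=False)
#   test(model_config, eval_config, dataset_config,
#        data_split='val', ckpt_indices=[-1])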
def repeated_checkpoint_run(self):
    """Periodically evaluates the checkpoints inside the `checkpoint_dir`.

    This function evaluates all the existing checkpoints as they are being
    generated. If there are none, it sleeps until new checkpoints become
    available. Since there is no synchronization guarantee for the trainer
    and evaluator, at each iteration it reloads all the checkpoints and
    searches for the last checkpoint to continue from. This is meant to be
    called in parallel to the trainer to evaluate the models regularly.

    Raises:
        ValueError: if model.checkpoint_dir doesn't have at least one
            element.
    """
    if not os.path.exists(self.checkpoint_dir):
        raise ValueError(
            '{} must have at least one checkpoint entry.'.format(
                self.checkpoint_dir))

    # Copy kitti native eval code into the predictions folder
    if self.do_kitti_native_eval:
        evaluator_utils.copy_kitti_native_code(
            self.model_config.checkpoint_name)

    if self.skip_evaluated_checkpoints:
        already_evaluated_ckpts = self.get_evaluated_ckpts(
            self.model_config)
    else:
        already_evaluated_ckpts = []

    tf.logging.info('Starting evaluation at ' +
                    time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))

    last_checkpoint_id = -1
    number_of_evaluations = 0

    # No need to add summaries (for model inference at each sample) during
    # repeated evaluation; only the average loss at each checkpoint step
    # matters.
    # self.summary_merged = None

    evaluated_ckpts = [ckpt for ckpt in already_evaluated_ckpts]
    while True:
        # Load current checkpoints available
        trainer_utils.load_checkpoints(self.checkpoint_dir, self._saver)
        num_checkpoints = len(self._saver.last_checkpoints)
        no_newckpts = True
        evaluated_ckpts.sort()

        start = time.time()
        for ckpt_idx in range(num_checkpoints):
            checkpoint_to_restore = \
                self._saver.last_checkpoints[ckpt_idx]
            ckpt_id = evaluator_utils.strip_checkpoint_id(
                checkpoint_to_restore)

            # Check if checkpoint has been evaluated already
            if ckpt_id == 0 or ckpt_id in evaluated_ckpts:
                continue
            else:
                no_newckpts = False
                print('evaluated ckpts: ', evaluated_ckpts)
                print('processing ckpt id: ', ckpt_id)

            self.run_checkpoint_once(checkpoint_to_restore)
            evaluated_ckpts.append(ckpt_id)

        time_to_next_eval = start + self.eval_wait_interval - time.time()
        if no_newckpts:
            tf.logging.info(
                'No new checkpoints found in %s. '
                'Will try again in %d seconds',
                self.checkpoint_dir,
                self.eval_wait_interval)
            if time_to_next_eval > 0:
                time.sleep(time_to_next_eval)
def train(model, train_config):
    """Training function for detection models.

    Args:
        model: The detection model object.
        train_config: a train_*pb2 protobuf.
    """
    model = model
    train_config = train_config

    # Get model configurations
    model_config = model.model_config

    # Create a variable tensor to hold the global step
    global_step_tensor = tf.Variable(0, trainable=False, name='global_step')

    #############################
    # Get training configurations
    #############################
    max_iterations = train_config.max_iterations
    summary_interval = train_config.summary_interval
    checkpoint_interval = train_config.checkpoint_interval
    max_checkpoints = train_config.max_checkpoints_to_keep

    paths_config = model_config.paths_config
    logdir = paths_config.logdir
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    checkpoint_dir = paths_config.checkpoint_dir
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    checkpoint_path = checkpoint_dir + '/' + model_config.checkpoint_name

    global_summaries = set([])

    # The model should return a dictionary of predictions
    prediction_dict = model.build()

    summary_histograms = train_config.summary_histograms
    summary_img_images = train_config.summary_img_images
    summary_bev_images = train_config.summary_bev_images

    ##############################
    # Setup loss
    ##############################
    losses_dict, total_loss = model.loss(prediction_dict)

    ###########################################################################
    # Select the trainable variables of the original AVOD model so their
    # gradients can be set to 0 (var0)
    var_moe = [
        var for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope='mix_of_experts')
    ]
    var0 = [var for var in tf.trainable_variables()]
    var_all_but_var_moe = [
        var for var in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    ]
    for var in var_moe:
        var0.remove(var)
        var_all_but_var_moe.remove(var)

    ###########################################################################
    # Create an optimizer with 0 gradient for the AVOD model and an
    # optimizer for the MoE
    training_optimizer0 = tf.train.GradientDescentOptimizer(0.0)
    training_optimizer1 = optimizer_builder.build(train_config.optimizer,
                                                  global_summaries,
                                                  global_step_tensor)

    ###########################################################################
    # Create the train op. train_op1 is for MoE and train_op0 is for AVOD
    with tf.variable_scope('train_op'):
        # Create training operations
        train_op1 = slim.learning.create_train_op(
            total_loss,
            training_optimizer1,
            variables_to_train=var_moe,
            clip_gradient_norm=1.0,
            global_step=global_step_tensor)
        train_op0 = slim.learning.create_train_op(
            total_loss,
            training_optimizer0,
            variables_to_train=var0,
            clip_gradient_norm=1.0,
            global_step=global_step_tensor)
        train_op = tf.group(train_op1, train_op0)
    ###########################################################################

    # Save checkpoints regularly.
    saver = tf.train.Saver(max_to_keep=max_checkpoints, pad_step_number=True)

    # Add the result of the train_op to the summary
    tf.summary.scalar("training_loss", train_op1)

    # Add maximum memory usage summary op
    # This op can only be run on a device with a gpu,
    # so it's skipped on travis
    is_travis = 'TRAVIS' in os.environ
    if not is_travis:
        # tf.summary.scalar('bytes_in_use',
        #                   tf.contrib.memory_stats.BytesInUse())
        tf.summary.scalar('max_bytes',
                          tf.contrib.memory_stats.MaxBytesInUse())

    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
    summary_merged = summary_utils.summaries_to_keep(
        summaries,
        global_summaries,
        histograms=summary_histograms,
        input_imgs=summary_img_images,
        input_bevs=summary_bev_images)

    allow_gpu_mem_growth = train_config.allow_gpu_mem_growth
    if allow_gpu_mem_growth:
        # GPU memory config
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = allow_gpu_mem_growth
        sess = tf.Session(config=config)
    else:
        sess = tf.Session()

    # Create unique folder name using datetime for summary writer
    datetime_str = str(datetime.datetime.now())
    logdir = logdir + '/train'
    train_writer = tf.summary.FileWriter(logdir + '/' + datetime_str,
                                         sess.graph)

    # Create init op
    init = tf.global_variables_initializer()

    # Continue from last saved checkpoint
    if not train_config.overwrite_checkpoints:
        trainer_utils.load_checkpoints(checkpoint_dir, saver)
        if len(saver.last_checkpoints) > 0:
            checkpoint_to_restore = saver.last_checkpoints[-1]
            saver.restore(sess, checkpoint_to_restore)
        else:
            # Initialize the variables, then restore the non-MoE weights
            # from the original AVOD model. Give the correct path to restore.
            checkpoint_path_start = \
                train_config.moe_config.initial_avod_checkpoint_path
            variables_to_restore = dict()
            for var in var_all_but_var_moe:
                variables_to_restore[var.op.name] = slim.get_unique_variable(
                    var.op.name)
            init_assign_op, init_feed_dict = slim.assign_from_checkpoint(
                checkpoint_path_start, variables_to_restore)
            sess.run(init)
            sess.run(init_assign_op, init_feed_dict)
    ###########################################################################
    else:
        # Initialize the variables
        sess.run(init)

    # Read the global step if restored
    global_step = tf.train.global_step(sess, global_step_tensor)
    print('Starting from step {} / {}'.format(global_step, max_iterations))

    # Main Training Loop
    last_time = time.time()
    for step in range(global_step, max_iterations + 1):

        # Save checkpoint
        if step % checkpoint_interval == 0:
            global_step = tf.train.global_step(sess, global_step_tensor)
            saver.save(sess,
                       save_path=checkpoint_path,
                       global_step=global_step)
            print('Step {} / {}, Checkpoint saved to {}-{:08d}'.format(
                step, max_iterations, checkpoint_path, global_step))

        # Create feed_dict for inferencing
        feed_dict = model.create_feed_dict()

        # Write summaries and train op
        if step % summary_interval == 0:
            current_time = time.time()
            time_elapsed = current_time - last_time
            last_time = current_time

            train_op_loss, summary_out = sess.run([train_op1, summary_merged],
                                                  feed_dict=feed_dict)
            print(train_op_loss)
            print('Step {}, Total Loss {:0.3f}, Time Elapsed {:0.3f} s'.format(
                step, train_op_loss, time_elapsed))
            train_writer.add_summary(summary_out, step)
        else:
            # Run the train op only
            sess.run(train_op1, feed_dict)

    # Close the summary writers
    train_writer.close()
def test_load_model_weights(self):
    # Tests loading weights

    train_val_test = 'train'

    # Overwrite the training iterations
    self.train_config.max_iterations = 1
    self.train_config.overwrite_checkpoints = True

    with tf.Graph().as_default():
        model = RpnModel(self.model_config,
                         train_val_test=train_val_test,
                         dataset=self.dataset)
        trainer.train(model, self.train_config)

        paths_config = self.model_config.paths_config
        rpn_checkpoint_dir = paths_config.checkpoint_dir

        # Load the weights back in
        init_op = tf.global_variables_initializer()

        saver = tf.train.Saver()
        with tf.Session() as sess:
            sess.run(init_op)

            trainer_utils.load_checkpoints(rpn_checkpoint_dir, saver)
            checkpoint_to_restore = saver.last_checkpoints[-1]
            trainer_utils.load_model_weights(sess, checkpoint_to_restore)

            rpn_vars = slim.get_model_variables()
            rpn_weights = sess.run(rpn_vars)
            self.assertGreater(len(rpn_weights), 0,
                               msg='Loaded RPN weights are empty')

    with tf.Graph().as_default():
        model = AvodModel(self.model_config,
                          train_val_test=train_val_test,
                          dataset=self.dataset)
        model.build()

        # Load the weights back in
        init_op = tf.global_variables_initializer()

        saver = tf.train.Saver()
        with tf.Session() as sess:
            sess.run(init_op)

            trainer_utils.load_checkpoints(rpn_checkpoint_dir, saver)
            checkpoint_to_restore = saver.last_checkpoints[-1]
            trainer_utils.load_model_weights(sess, checkpoint_to_restore)

            avod_vars = slim.get_model_variables()
            avod_weights = sess.run(avod_vars)

            # AVOD weights should include both RPN + AVOD weights
            self.assertGreater(len(avod_weights),
                               len(rpn_weights),
                               msg='Expected more weights for AVOD')

            # Grab weights corresponding to RPN by index
            # since the model variables are ordered
            rpn_len = len(rpn_weights)
            loaded_rpn_vars = avod_vars[0:rpn_len]
            rpn_weights_reload = sess.run(loaded_rpn_vars)

            # Make sure the reloaded weights match the originally
            # loaded weights
            for i in range(rpn_len):
                np.testing.assert_array_equal(rpn_weights_reload[i],
                                              rpn_weights[i])