def run_latest_checkpoints(self):
    """Evaluation function for evaluating all the existing checkpoints.

    This function just runs through all the existing checkpoints.

    Raises:
        ValueError: if model.checkpoint_dir doesn't have at least one
            element.
    """
    if not os.path.exists(self.checkpoint_dir):
        raise ValueError('{} must have at least one checkpoint entry.'
                         .format(self.checkpoint_dir))

    # Load the latest checkpoints available
    trainer_utils.load_checkpoints(self.checkpoint_dir, self._saver)

    num_checkpoints = len(self._saver.last_checkpoints)

    if self.skip_evaluated_checkpoints:
        already_evaluated_ckpts = self.get_evaluated_ckpts(
            self.model_config, self.model_name)
    else:
        # Make sure the membership check below is always defined
        already_evaluated_ckpts = []

    ckpt_indices = np.asarray(self.eval_config.ckpt_indices)
    # Fall back to evaluating all checkpoints when no indices are given
    if ckpt_indices is not None and ckpt_indices.size > 0:
        if ckpt_indices[0] == -1:
            # Restore the most recent checkpoint
            ckpt_idx = num_checkpoints - 1
            ckpt_indices = [ckpt_idx]
            print('Restoring the most recent checkpoint, ckpt_idx = ',
                  ckpt_idx)
        for ckpt_idx in ckpt_indices:
            checkpoint_to_restore = self._saver.last_checkpoints[ckpt_idx]
            self.run_checkpoint_once(checkpoint_to_restore)

    else:
        last_checkpoint_id = -1
        number_of_evaluations = 0
        # Go through all existing checkpoints
        for ckpt_idx in range(num_checkpoints):
            checkpoint_to_restore = self._saver.last_checkpoints[ckpt_idx]
            ckpt_id = evaluator_utils.strip_checkpoint_id(
                checkpoint_to_restore)

            # Check if checkpoint has been evaluated already
            already_evaluated = ckpt_id in already_evaluated_ckpts
            if already_evaluated or ckpt_id <= last_checkpoint_id:
                number_of_evaluations = max((ckpt_idx + 1,
                                             number_of_evaluations))
                continue

            self.run_checkpoint_once(checkpoint_to_restore)
            number_of_evaluations += 1

            # Save the id of the latest evaluated checkpoint
            last_checkpoint_id = ckpt_id
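# A minimal, hypothetical sketch (not part of the original source) of the
# checkpoint bookkeeping that run_latest_checkpoints() relies on through
# trainer_utils.load_checkpoints(): tf.train keeps an index file in the
# checkpoint directory listing every tracked checkpoint. The helper name and
# directory path below are placeholders.
def _example_list_checkpoints(checkpoint_dir='/tmp/avod_checkpoints'):
    import tensorflow as tf

    # Reads the 'checkpoint' index file, if one exists, and returns the
    # CheckpointState proto that tracks all saved checkpoint paths.
    ckpt_state = tf.train.get_checkpoint_state(checkpoint_dir)
    if ckpt_state is None:
        return []

    # These paths are what saver.last_checkpoints would contain after
    # saver.recover_last_checkpoints() is called on them.
    return list(ckpt_state.all_model_checkpoint_paths)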
def repeated_checkpoint_run(self):
    """Periodically evaluates the checkpoints inside the `checkpoint_dir`.

    This function evaluates all the existing checkpoints as they are being
    generated. If there are none, it sleeps until new checkpoints become
    available. Since there is no synchronization guarantee for the trainer
    and evaluator, at each iteration it reloads all the checkpoints and
    searches for the last checkpoint to continue from.

    This is meant to be called in parallel to the trainer to evaluate the
    models regularly.

    Raises:
        ValueError: if model.checkpoint_dir doesn't have at least one
            element.
    """
    if not os.path.exists(self.checkpoint_dir):
        raise ValueError('{} must have at least one checkpoint entry.'
                         .format(self.checkpoint_dir))

    # Copy panoptic native eval code into the predictions folder
    if self.do_panoptic_native_eval:
        evaluator_panoptic_utils.copy_panoptic_native_code(
            self.model_config.checkpoint_name)

    if self.skip_evaluated_checkpoints:
        already_evaluated_ckpts = self.get_evaluated_ckpts(
            self.model_config, self.full_model)
    else:
        # Make sure the membership check below is always defined
        already_evaluated_ckpts = []

    tf.logging.info('Starting evaluation at ' +
                    time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))

    last_checkpoint_id = -1
    number_of_evaluations = 0
    while True:
        # Load current checkpoints available
        trainer_utils.load_checkpoints(self.checkpoint_dir, self._saver)
        num_checkpoints = len(self._saver.last_checkpoints)

        start = time.time()

        if number_of_evaluations >= num_checkpoints:
            tf.logging.info('No new checkpoints found in %s. '
                            'Will try again in %d seconds',
                            self.checkpoint_dir,
                            self.eval_wait_interval)
        else:
            for ckpt_idx in range(num_checkpoints):
                checkpoint_to_restore = \
                    self._saver.last_checkpoints[ckpt_idx]
                ckpt_id = evaluator_panoptic_utils.strip_checkpoint_id(
                    checkpoint_to_restore)

                # Check if checkpoint has been evaluated already
                already_evaluated = ckpt_id in already_evaluated_ckpts
                if already_evaluated or ckpt_id <= last_checkpoint_id:
                    number_of_evaluations = max((ckpt_idx + 1,
                                                 number_of_evaluations))
                    continue

                self.run_checkpoint_once(checkpoint_to_restore)
                number_of_evaluations += 1

                # Save the id of the latest evaluated checkpoint
                last_checkpoint_id = ckpt_id

        time_to_next_eval = start + self.eval_wait_interval - time.time()
        if time_to_next_eval > 0:
            time.sleep(time_to_next_eval)
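# A minimal sketch (not in the original source) of the fixed-interval polling
# pattern used by repeated_checkpoint_run(): do the work, then sleep only for
# whatever remains of the interval so slow evaluations do not stretch the
# wait. The function name and interval below are illustrative placeholders.
def _example_poll_loop(work_fn, wait_interval_s=30, max_polls=3):
    import time

    for _ in range(max_polls):
        start = time.time()
        work_fn()

        # Sleep only for the remainder of the interval, mirroring the
        # time_to_next_eval computation above.
        time_to_next = start + wait_interval_s - time.time()
        if time_to_next > 0:
            time.sleep(time_to_next)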
def test_load_model_weights(self):
    # Tests loading weights

    train_val_test = 'train'

    # Overwrite the training iterations
    self.train_config.max_iterations = 1
    self.train_config.overwrite_checkpoints = True

    with tf.Graph().as_default():
        model = RpnModel(self.model_config,
                         train_val_test=train_val_test,
                         dataset=self.dataset)
        trainer.train(model, self.train_config)

        paths_config = self.model_config.paths_config
        rpn_checkpoint_dir = paths_config.checkpoint_dir

        # Load the weights back in
        init_op = tf.global_variables_initializer()

        saver = tf.train.Saver()
        with tf.Session() as sess:
            sess.run(init_op)

            trainer_utils.load_checkpoints(rpn_checkpoint_dir, saver)
            checkpoint_to_restore = saver.last_checkpoints[-1]
            trainer_utils.load_model_weights(sess, checkpoint_to_restore)

            rpn_vars = slim.get_model_variables()
            rpn_weights = sess.run(rpn_vars)
            self.assertGreater(len(rpn_weights), 0,
                               msg='Loaded RPN weights are empty')

    with tf.Graph().as_default():
        model = AvodModel(self.model_config,
                          train_val_test=train_val_test,
                          dataset=self.dataset)
        model.build()

        # Load the weights back in
        init_op = tf.global_variables_initializer()

        saver = tf.train.Saver()
        with tf.Session() as sess:
            sess.run(init_op)

            trainer_utils.load_checkpoints(rpn_checkpoint_dir, saver)
            checkpoint_to_restore = saver.last_checkpoints[-1]
            trainer_utils.load_model_weights(sess, checkpoint_to_restore)

            avod_vars = slim.get_model_variables()
            avod_weights = sess.run(avod_vars)

            # AVOD weights should include both RPN + AVOD weights
            self.assertGreater(len(avod_weights),
                               len(rpn_weights),
                               msg='Expected more weights for AVOD')

            # Grab the weights corresponding to the RPN by index,
            # since the model variables are ordered
            rpn_len = len(rpn_weights)
            loaded_rpn_vars = avod_vars[0:rpn_len]
            rpn_weights_reload = sess.run(loaded_rpn_vars)

            # Make sure the reloaded weights match the originally
            # loaded weights
            for i in range(rpn_len):
                np.testing.assert_array_equal(rpn_weights_reload[i],
                                              rpn_weights[i])
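# A minimal sketch (not in the original source) of name-based weight
# restoration, which is roughly what trainer_utils.load_model_weights() is
# used for in the test above: only variables whose names also appear in the
# checkpoint are restored, so RPN weights can be loaded into the larger AVOD
# graph. Assumes a TF1-style graph and session; the helper name is
# hypothetical.
def _example_restore_by_name(sess, checkpoint_path):
    import tensorflow as tf

    # Inspect the checkpoint to find which variable names it contains.
    reader = tf.train.NewCheckpointReader(checkpoint_path)
    ckpt_var_names = set(reader.get_variable_to_shape_map().keys())

    # Restore only the graph variables that have a matching entry.
    restorable = [v for v in tf.global_variables()
                  if v.op.name in ckpt_var_names]
    tf.train.Saver(var_list=restorable).restore(sess, checkpoint_path)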
def train(model, train_config):
    """Training function for detection models.

    Args:
        model: The detection model object.
        train_config: a train_*pb2 protobuf.
    """
    # Get model configurations
    model_config = model.model_config

    # Create a variable tensor to hold the global step
    global_step_tensor = tf.Variable(
        0, trainable=False, name='global_step')

    #############################
    # Get training configurations
    #############################
    max_iterations = train_config.max_iterations
    summary_interval = train_config.summary_interval
    checkpoint_interval = train_config.checkpoint_interval
    max_checkpoints = train_config.max_checkpoints_to_keep

    paths_config = model_config.paths_config
    logdir = paths_config.logdir
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    checkpoint_dir = paths_config.checkpoint_dir
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    checkpoint_path = checkpoint_dir + '/' + model_config.checkpoint_name
    print('checkpoint_path = ', checkpoint_path)

    global_summaries = set([])

    # The model should return a dictionary of predictions
    prediction_dict = model.build()

    summary_histograms = train_config.summary_histograms
    summary_img_images = train_config.summary_img_images
    summary_bev_images = train_config.summary_bev_images

    ##############################
    # Setup loss
    ##############################
    losses_dict, total_loss = model.loss(prediction_dict)

    # Optimizer
    training_optimizer = optimizer_builder.build(
        train_config.optimizer,
        global_summaries,
        global_step_tensor)

    # Create the train op
    with tf.variable_scope('train_op'):
        train_op = slim.learning.create_train_op(
            total_loss,
            training_optimizer,
            clip_gradient_norm=1.0,
            global_step=global_step_tensor)

    # Save checkpoints regularly.
    saver = tf.train.Saver(max_to_keep=max_checkpoints,
                           pad_step_number=True)

    # Add maximum memory usage summary op.
    # This op can only be run on a device with a GPU, so it's skipped
    # on Travis.
    is_travis = 'TRAVIS' in os.environ
    if not is_travis:
        tf.summary.scalar('max_bytes',
                          tf.contrib.memory_stats.MaxBytesInUse())

    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
    summary_merged = summary_utils.summaries_to_keep(
        summaries,
        global_summaries,
        histograms=summary_histograms,
        input_imgs=summary_img_images,
        input_bevs=summary_bev_images)

    allow_gpu_mem_growth = train_config.allow_gpu_mem_growth
    if allow_gpu_mem_growth:
        # GPU memory config
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = allow_gpu_mem_growth
        sess = tf.Session(config=config)
    else:
        sess = tf.Session()

    # Create unique folder name using datetime for summary writer
    datetime_str = str(datetime.datetime.now())
    logdir = logdir + '/train'
    train_writer = tf.summary.FileWriter(logdir + '/' + datetime_str,
                                         sess.graph)

    # Create init op
    init = tf.global_variables_initializer()

    # Continue from last saved checkpoint
    if not train_config.overwrite_checkpoints:
        trainer_utils.load_checkpoints(checkpoint_dir, saver)
        if len(saver.last_checkpoints) > 0:
            checkpoint_to_restore = saver.last_checkpoints[-1]
            print('Restoring from checkpoint: ', checkpoint_to_restore)
            saver.restore(sess, checkpoint_to_restore)
        else:
            # Initialize the variables
            sess.run(init)
    else:
        # Initialize the variables
        sess.run(init)

    # Read the global step if restored
    global_step = tf.train.global_step(sess, global_step_tensor)
    print('Starting from step {} / {}'.format(global_step, max_iterations))

    # Main Training Loop
    last_time = time.time()
    skipped_counter = 0
    for step in range(global_step, max_iterations + 1):

        # Save checkpoint
        if step % checkpoint_interval == 0:
            global_step = tf.train.global_step(sess, global_step_tensor)
            saver.save(sess,
                       save_path=checkpoint_path,
                       global_step=global_step)
            print('Step {} / {}, Checkpoint saved to {}-{:08d}'.format(
                step, max_iterations, checkpoint_path, global_step))

        # Create feed_dict for inferencing
        feed_dict = model.create_feed_dict()
        if not feed_dict:
            print('No useful minibatch for step ', step, '\n')
            continue

        # Write summaries and run the train op
        if step % summary_interval == 0:
            current_time = time.time()
            time_elapsed = current_time - last_time
            last_time = current_time

            try:
                train_op_loss, summary_out = sess.run(
                    [train_op, summary_merged], feed_dict=feed_dict)

                print('Step {}, Total Loss {:0.3f}, '
                      'Time Elapsed {:0.3f} s'.format(
                          step, train_op_loss, time_elapsed))
                train_writer.add_summary(summary_out, step)

            # Invalid PNG files; the error only appears when running
            # on the server
            except tf.errors.InvalidArgumentError:
                skipped_counter += 1
                print('[INFO InvalidArgumentError] Step {} failed, '
                      'batch skipped. Total skipped: {}'.format(
                          step, skipped_counter))

            # Unknown input/output files; the error only appears when
            # running on the server
            except tf.errors.UnknownError:
                skipped_counter += 1
                print('[INFO UnknownError] Step {} failed, '
                      'batch skipped. Total skipped: {}'.format(
                          step, skipped_counter))
        else:
            try:
                sess.run(train_op, feed_dict)

            # Invalid PNG files; the error only appears when running
            # on the server
            except tf.errors.InvalidArgumentError:
                skipped_counter += 1
                print('[INFO InvalidArgumentError] Step {} failed, '
                      'batch skipped. Total skipped: {}'.format(
                          step, skipped_counter))

            # Unknown input/output files; the error only appears when
            # running on the server
            except tf.errors.UnknownError:
                skipped_counter += 1
                print('[INFO UnknownError] Step {} failed, '
                      'batch skipped. Total skipped: {}'.format(
                          step, skipped_counter))

    # Close the summary writers
    train_writer.close()