def test_path_drop_weights(self):
    """Checks that path drop freezes the image feature maps.

    Runs a minimal training session with the image path always dropped
    (path_drop_probabilities = [0.0, 0.8]) and verifies that the image
    feature maps are identical before and after running the 'train_op'.
    """
    data_split = 'train'

    # Keep the run short and avoid restoring stale checkpoints.
    self.train_config.max_iterations = 2
    self.train_config.overwrite_checkpoints = True

    # Work on a copy of the model config so the shared fixture is
    # untouched, then force the image path to be dropped every sample.
    model_config = config_builder.proto_to_obj(self.model_config)
    model_config.path_drop_probabilities = [0.0, 0.8]

    with tf.Graph().as_default():
        # Fix the graph-level seed so the run is reproducible.
        tf.set_random_seed(1245)
        model = RpnModel(model_config,
                         train_val_test=data_split,
                         dataset=self.dataset)
        prediction_dict = model.build()
        _, total_loss = model.loss(prediction_dict)

        summaries = set([])

        # Optimizer and train op for the minimal training step.
        optimizer = optimizer_builder.build(
            self.train_config.optimizer, summaries)
        train_op = slim.learning.create_train_op(total_loss, optimizer)

        init_op = tf.global_variables_initializer()

        with tf.Session() as sess:
            sess.run(init_op)
            for step in range(1, self.train_config.max_iterations):
                feed_dict = model.create_feed_dict()

                if step == 1:
                    # Snapshot the image feature maps before any update.
                    expected_maps = sess.run(model.img_feature_maps,
                                             feed_dict=feed_dict)

                loss_value = sess.run(train_op, feed_dict=feed_dict)
                print('Step {}, Total Loss {:0.3f} '.
                      format(step, loss_value))

                observed_maps = sess.run(model.img_feature_maps,
                                         feed_dict=feed_dict)

                # Since the image path was dropped, training must not
                # have changed the image feature maps.
                np.testing.assert_array_almost_equal(
                    observed_maps, expected_maps, decimal=4)
def set_up_model_train_mode(pipeline_config_path, data_split):
    """Returns the model and its train_op."""
    model_config, train_config, _, dataset_config = \
        config_builder.get_configs_from_pipeline_file(
            pipeline_config_path, is_training=True)

    dataset = DatasetBuilder.build_kitti_dataset(dataset_config,
                                                 use_defaults=False)

    # Dispatch table instead of an if/elif chain over the model name.
    model_classes = {
        'rpn_model': RpnModel,
        'avod_model': AvodModel,
        'avod_ssd_model': AvodSSDModel,
    }
    model_name = model_config.model_name
    if model_name not in model_classes:
        raise ValueError('Invalid model_name')
    model = model_classes[model_name](model_config,
                                      train_val_test=data_split,
                                      dataset=dataset)

    prediction_dict = model.build()
    _, total_loss = model.loss(prediction_dict)

    # These parameters are required to set up the optimizer
    global_summaries = set([])
    global_step_tensor = tf.Variable(0, trainable=False)
    training_optimizer = optimizer_builder.build(train_config.optimizer,
                                                 global_summaries,
                                                 global_step_tensor)

    # Set up the train op
    train_op = slim.learning.create_train_op(total_loss, training_optimizer)

    return model, train_op
def train(model, train_config):
    """Training function for detection models.

    Builds the loss and train op for the given model, restores the
    latest checkpoint when available, and runs the main training loop
    with periodic checkpoint saving and summary writing.

    Args:
        model: The detection model object; its build(), loss() and
            create_feed_dict() methods are used here.
        train_config: a train_*pb2 protobuf with the training settings
            (iterations, intervals, optimizer, checkpoint options).
    """
    # NOTE(review): these two self-assignments are no-ops.
    model = model
    train_config = train_config

    # Get model configurations
    model_config = model.model_config

    # Create a variable tensor to hold the global step
    global_step_tensor = tf.Variable(
        0, trainable=False, name='global_step')

    #############################
    # Get training configurations
    #############################
    max_iterations = train_config.max_iterations
    summary_interval = train_config.summary_interval
    checkpoint_interval = \
        train_config.checkpoint_interval
    max_checkpoints = train_config.max_checkpoints_to_keep

    paths_config = model_config.paths_config
    logdir = paths_config.logdir
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    checkpoint_dir = paths_config.checkpoint_dir
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    # Checkpoint prefix; the saver appends the step number to it.
    checkpoint_path = checkpoint_dir + '/' + \
        model_config.checkpoint_name

    global_summaries = set([])

    # The model should return a dictionary of predictions
    prediction_dict = model.build()

    # WZN: for debug only
    #img_input_debug = model._rpn_model._img_preprocessed
    #bev_input_debug = model._rpn_model._bev_preprocessed
    #bev_pooled_debug = model._rpn_model.bev_input_pooled
    #img_pooled_debug = model._rpn_model.img_input_pooled
    #import numpy as np
    #import matplotlib.pyplot as plt

    summary_histograms = train_config.summary_histograms
    summary_img_images = train_config.summary_img_images
    summary_bev_images = train_config.summary_bev_images

    ##############################
    # Setup loss
    ##############################
    losses_dict, total_loss = model.loss(prediction_dict)

    # Optimizer
    training_optimizer = optimizer_builder.build(
        train_config.optimizer, global_summaries, global_step_tensor)

    # Create the train op
    with tf.variable_scope('train_op'):
        # create_train_op returns a tensor that, when evaluated, runs
        # the update and yields the total loss value.
        train_op = slim.learning.create_train_op(
            total_loss, training_optimizer,
            clip_gradient_norm=1.0, global_step=global_step_tensor)

    # Save checkpoints regularly.
    saver = tf.train.Saver(max_to_keep=max_checkpoints,
                           pad_step_number=True)

    # Add the result of the train_op to the summary
    tf.summary.scalar("training_loss", train_op)

    # Add maximum memory usage summary op
    # This op can only be run on device with gpu
    # so it's skipped on travis
    is_travis = 'TRAVIS' in os.environ
    if not is_travis:
        # tf.summary.scalar('bytes_in_use',
        #                   tf.contrib.memory_stats.BytesInUse())
        tf.summary.scalar('max_bytes',
                          tf.contrib.memory_stats.MaxBytesInUse())

    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
    # Filter summaries according to the configured verbosity.
    summary_merged = summary_utils.summaries_to_keep(
        summaries,
        global_summaries,
        histograms=summary_histograms,
        input_imgs=summary_img_images,
        input_bevs=summary_bev_images
    )

    allow_gpu_mem_growth = train_config.allow_gpu_mem_growth
    if allow_gpu_mem_growth:
        # GPU memory config
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = allow_gpu_mem_growth
        sess = tf.Session(config=config)
    else:
        sess = tf.Session()

    # Create unique folder name using datetime for summary writer
    datetime_str = str(datetime.datetime.now())
    logdir = logdir + '/train'
    train_writer = tf.summary.FileWriter(logdir + '/' + datetime_str,
                                         sess.graph)

    # Create init op
    init = tf.global_variables_initializer()

    # Continue from last saved checkpoint
    if not train_config.overwrite_checkpoints:
        trainer_utils.load_checkpoints(checkpoint_dir, saver)
        if len(saver.last_checkpoints) > 0:
            # Resume from the most recent checkpoint.
            checkpoint_to_restore = saver.last_checkpoints[-1]
            saver.restore(sess, checkpoint_to_restore)
        else:
            # Initialize the variables
            sess.run(init)
    else:
        # Initialize the variables
        sess.run(init)

    # Read the global step if restored
    global_step = tf.train.global_step(sess, global_step_tensor)
    print('Starting from step {} / {}'.format(
        global_step, max_iterations))

    # Main Training Loop
    last_time = time.time()
    for step in range(global_step, max_iterations + 1):

        # Save checkpoint
        if step % checkpoint_interval == 0:
            global_step = tf.train.global_step(sess,
                                               global_step_tensor)
            saver.save(sess,
                       save_path=checkpoint_path,
                       global_step=global_step)
            print('Step {} / {}, Checkpoint saved to {}-{:08d}'.format(
                step, max_iterations,
                checkpoint_path, global_step))

        # Create feed_dict for inferencing
        feed_dict = model.create_feed_dict()

        # WZN: only run for debug
        #bev_p,img_p = sess.run([bev_pooled_debug,img_pooled_debug],feed_dict)
        #bev_p = np.squeeze(bev_p)
        #img_p = np.squeeze(img_p)
        #import pdb
        #pdb.set_trace()

        # Write summaries and train op
        if step % summary_interval == 0:
            current_time = time.time()
            time_elapsed = current_time - last_time
            last_time = current_time

            train_op_loss, summary_out = sess.run(
                [train_op, summary_merged], feed_dict=feed_dict)

            print('Step {}, Total Loss {:0.3f}, Time Elapsed {:0.3f} s'.format(
                step, train_op_loss, time_elapsed))
            train_writer.add_summary(summary_out, step)

        else:
            # Run the train op only
            sess.run(train_op, feed_dict)

    # Close the summary writers
    train_writer.close()
def train(model, train_config):
    """Training function for detection models.

    Builds the loss and train op for the given model, optionally
    restores weights (a pretrained model or the latest saved
    checkpoint), then runs the main training loop with periodic
    checkpoint saving and summary/metric reporting.

    Args:
        model: The detection model object; its build(), loss() and
            create_feed_dict() methods are used here.  Its loss() must
            return the total loss plus the individual RPN/refinement
            loss and accuracy tensors unpacked below.
        train_config: a train_*pb2 protobuf with the training settings
            (iterations, intervals, optimizer, pretrained/checkpoint
            options).
    """
    # Get model configurations
    model_config = model.model_config

    # Create a variable tensor to hold the global step
    global_step_tensor = tf.Variable(0, trainable=False, name='global_step')

    #############################
    # Get training configurations
    #############################
    max_iterations = train_config.max_iterations
    summary_interval = train_config.summary_interval
    checkpoint_interval = \
        train_config.checkpoint_interval
    max_checkpoints = train_config.max_checkpoints_to_keep

    paths_config = model_config.paths_config
    logdir = paths_config.logdir
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    checkpoint_dir = paths_config.checkpoint_dir
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    # Checkpoint prefix; the saver appends the step number to it.
    checkpoint_path = checkpoint_dir + '/' + \
        model_config.checkpoint_name

    global_summaries = set([])

    # The model should return a dictionary of predictions
    prediction_dict = model.build()

    summary_histograms = train_config.summary_histograms
    summary_img_images = train_config.summary_img_images
    summary_bev_images = train_config.summary_bev_images

    ##############################
    # Setup loss
    ##############################
    # Total loss plus the individual loss/accuracy tensors that are
    # fetched for progress reporting in the training loop below.
    loss_dict, total_loss, rpn_score_2d_loss, \
        rpn_acc_score_neg, rpn_acc_score_pos, \
        rpn_class_loss, rpn_reg_loss, rpn_acc_all, \
        rpn_acc_pos, refine_class_loss, refine_reg_loss, \
        avod_acc_all, avod_acc_pos = model.loss(prediction_dict)

    # Optimizer
    training_optimizer = optimizer_builder.build(train_config.optimizer,
                                                 global_summaries,
                                                 global_step_tensor)

    # Create the train op
    with tf.variable_scope('train_op'):
        # create_train_op returns a tensor that, when evaluated, runs
        # the update and yields the total loss value.
        train_op = slim.learning.create_train_op(
            total_loss,
            training_optimizer,
            clip_gradient_norm=1.0,
            global_step=global_step_tensor)

    # Save checkpoints regularly.
    saver = tf.train.Saver(max_to_keep=max_checkpoints,
                           pad_step_number=True)

    # Add the result of the train_op to the summary
    tf.summary.scalar("training_loss", train_op)

    # Add maximum memory usage summary op
    # This op can only be run on device with gpu
    # so it's skipped on travis
    is_travis = 'TRAVIS' in os.environ
    if not is_travis:
        tf.summary.scalar('max_bytes',
                          tf.contrib.memory_stats.MaxBytesInUse())

    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
    # Filter summaries according to the configured verbosity.
    summary_merged = summary_utils.summaries_to_keep(
        summaries,
        global_summaries,
        histograms=summary_histograms,
        input_imgs=summary_img_images,
        input_bevs=summary_bev_images)

    allow_gpu_mem_growth = train_config.allow_gpu_mem_growth
    if allow_gpu_mem_growth:
        # GPU memory config
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = allow_gpu_mem_growth
        sess = tf.Session(config=config)
    else:
        sess = tf.Session()

    # Create unique folder name using datetime for summary writer
    datetime_str = str(datetime.datetime.now())
    logdir = logdir + '/train'
    train_writer = tf.summary.FileWriter(logdir + '/' + datetime_str,
                                         sess.graph)

    # Create init op
    init = tf.global_variables_initializer()

    # Offset subtracted from the raw global step so that, when a
    # pretrained model is restored, step counting restarts at 0.
    # BUG FIX: previously step_base was only assigned inside some of
    # the restore branches, so 'global_step - step_base' below raised
    # NameError when no checkpoint was restored or when
    # overwrite_checkpoints was set.  Default it to 0 here.
    step_base = 0

    # Continue from last saved checkpoint
    if not train_config.overwrite_checkpoints:
        trainer_utils.load_checkpoints(checkpoint_dir, saver)

        # Initialize everything first; a (partial) restore below then
        # overwrites the restored subset of variables.
        all_variables = tf.get_collection_ref(tf.GraphKeys.GLOBAL_VARIABLES)
        sess.run(tf.variables_initializer(all_variables))

        # Skip optimizer slot / moving-average variables when restoring
        # partial weights.  NOTE(review): the "beta" filter would also
        # exclude any batch-norm beta variables — confirm intended.
        var_list = [
            var for var in all_variables
            if "beta" not in var.name and 'Adam' not in var.name
            and 'Average' not in var.name
        ]
        saver_part = tf.train.Saver(var_list=var_list)

        if train_config.use_pretrained:
            saver_part.restore(sess, train_config.pretrained)
            print('Model loaded from: {}'.format(train_config.pretrained))
            # Remember the restored step so training restarts from 0.
            step_base = tf.train.global_step(sess, global_step_tensor)
        elif len(saver.last_checkpoints) > 0:
            checkpoint_to_restore = saver.last_checkpoints[-1]
            saver_part.restore(sess, checkpoint_to_restore)
        else:
            # Initialize the variables
            sess.run(init)
    else:
        # Initialize the variables
        sess.run(init)

    # Read the (offset) global step if restored
    global_step = tf.train.global_step(sess, global_step_tensor) - step_base
    print('Starting from step {} / {}'.format(global_step, max_iterations))

    # Main Training Loop
    last_time = time.time()
    for step in range(global_step, max_iterations + 1):

        # Save checkpoint
        if step % checkpoint_interval == 0:
            global_step = tf.train.global_step(sess,
                                               global_step_tensor) - step_base
            saver.save(sess,
                       save_path=checkpoint_path,
                       global_step=global_step)
            print('Step {} / {}, Checkpoint saved to {}-{:08d}'.format(
                step, max_iterations, checkpoint_path, global_step))

        # Create feed_dict for inferencing
        feed_dict = model.create_feed_dict()

        # Write summaries and train op
        if step % summary_interval == 0:
            current_time = time.time()
            time_elapsed = current_time - last_time
            last_time = current_time

            # Fetch every reported metric in a single run call.
            train_op_loss, summary_out, \
                rpn_score_2d_loss_np, \
                rpn_acc_score_neg_np, \
                rpn_acc_score_pos_np, \
                rpn_class_loss_np, rpn_reg_loss_np, \
                rpn_acc_all_np, rpn_acc_pos_np, \
                refine_class_loss_np, refine_reg_loss_np, \
                avod_acc_all_np, avod_acc_pos_np = sess.run(
                    [train_op, summary_merged, rpn_score_2d_loss,
                     rpn_acc_score_neg, rpn_acc_score_pos,
                     rpn_class_loss, rpn_reg_loss, rpn_acc_all,
                     rpn_acc_pos, refine_class_loss, refine_reg_loss,
                     avod_acc_all, avod_acc_pos],
                    feed_dict=feed_dict)

            print('Step {}, Total Loss {:0.3f} | Score {:0.3f}, Acc {:0.2f} {:0.2f} | RPN Class {:0.3f}, Reg {:0.3f}, Acc {:0.2f} {:0.2f} | Final Class {:0.3f}, Reg {:0.3f}, Acc {:0.2f} {:0.2f}'.format(
                step, train_op_loss, rpn_score_2d_loss_np,
                rpn_acc_score_neg_np * 100, rpn_acc_score_pos_np * 100,
                rpn_class_loss_np, rpn_reg_loss_np, rpn_acc_all_np * 100,
                rpn_acc_pos_np * 100, refine_class_loss_np,
                refine_reg_loss_np, avod_acc_all_np * 100,
                avod_acc_pos_np * 100))
            train_writer.add_summary(summary_out, step)

        else:
            # Run the train op only
            sess.run(train_op, feed_dict)

    # Close the summary writers
    train_writer.close()
def train(model, train_config):
    """Training function for detection models.

    Mixture-of-experts (MoE) variant: only the 'mix_of_experts'
    variables are trained; the original AVOD variables receive a
    zero-learning-rate optimizer.  Supports initializing the AVOD
    weights from a previously trained AVOD checkpoint.

    Args:
        model: The detection model object; its build(), loss() and
            create_feed_dict() methods are used here.
        train_config: a train_*pb2 protobuf with the training settings
            (iterations, intervals, optimizer, moe_config with the
            initial AVOD checkpoint path).
    """
    # NOTE(review): these two self-assignments are no-ops.
    model = model
    train_config = train_config

    # Get model configurations
    model_config = model.model_config

    # Create a variable tensor to hold the global step
    global_step_tensor = tf.Variable(0, trainable=False, name='global_step')

    #############################
    # Get training configurations
    #############################
    max_iterations = train_config.max_iterations
    summary_interval = train_config.summary_interval
    checkpoint_interval = \
        train_config.checkpoint_interval
    max_checkpoints = train_config.max_checkpoints_to_keep

    paths_config = model_config.paths_config
    logdir = paths_config.logdir
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    checkpoint_dir = paths_config.checkpoint_dir
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    # Checkpoint prefix; the saver appends the step number to it.
    checkpoint_path = checkpoint_dir + '/' + \
        model_config.checkpoint_name

    global_summaries = set([])

    # The model should return a dictionary of predictions
    prediction_dict = model.build()

    summary_histograms = train_config.summary_histograms
    summary_img_images = train_config.summary_img_images
    summary_bev_images = train_config.summary_bev_images

    ##############################
    # Setup loss
    ##############################
    losses_dict, total_loss = model.loss(prediction_dict)

    ##############################################################################################
    # Select trainable variables of the original AVOD model to set gradient to 0(var0)
    # var_moe: the MoE variables (actually trained).
    # var0: every other trainable variable (trained with lr 0 below).
    # var_all_but_var_moe: all global variables except the MoE ones,
    #   used for restoring from a plain AVOD checkpoint.
    var_moe = [
        var for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope='mix_of_experts')
    ]
    var0 = [var for var in tf.trainable_variables()]
    var_all_but_var_moe = [
        var for var in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    ]
    for var in var_moe:
        var0.remove(var)
        var_all_but_var_moe.remove(var)
    ##############################################################################################
    # Create optimizer with 0 gradient for the AVOD model and an optimizer for the MoE
    training_optimizer0 = tf.train.GradientDescentOptimizer(0.0)
    training_optimizer1 = optimizer_builder.build(train_config.optimizer,
                                                  global_summaries,
                                                  global_step_tensor)
    ##############################################################################################
    # Create the train op. train_op1 is for MoE and train_op0 is for AVOD
    with tf.variable_scope('train_op'):
        # Create training operations
        # NOTE(review): both train ops pass the same global_step, so
        # running the grouped op would increment the step twice; the
        # loop below only ever runs train_op1 — confirm intended.
        train_op1 = slim.learning.create_train_op(
            total_loss,
            training_optimizer1,
            variables_to_train=var_moe,
            clip_gradient_norm=1.0,
            global_step=global_step_tensor)
        train_op0 = slim.learning.create_train_op(
            total_loss,
            training_optimizer0,
            variables_to_train=var0,
            clip_gradient_norm=1.0,
            global_step=global_step_tensor)
        train_op = tf.group(train_op1, train_op0)
    ##############################################################################################
    # Save checkpoints regularly.
    saver = tf.train.Saver(max_to_keep=max_checkpoints,
                           pad_step_number=True)

    # Add the result of the train_op to the summary
    tf.summary.scalar("training_loss", train_op1)

    # Add maximum memory usage summary op
    # This op can only be run on device with gpu
    # so it's skipped on travis
    is_travis = 'TRAVIS' in os.environ
    if not is_travis:
        # tf.summary.scalar('bytes_in_use',
        #                   tf.contrib.memory_stats.BytesInUse())
        tf.summary.scalar('max_bytes',
                          tf.contrib.memory_stats.MaxBytesInUse())

    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
    # Filter summaries according to the configured verbosity.
    summary_merged = summary_utils.summaries_to_keep(
        summaries,
        global_summaries,
        histograms=summary_histograms,
        input_imgs=summary_img_images,
        input_bevs=summary_bev_images)

    allow_gpu_mem_growth = train_config.allow_gpu_mem_growth
    if allow_gpu_mem_growth:
        # GPU memory config
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = allow_gpu_mem_growth
        sess = tf.Session(config=config)
    else:
        sess = tf.Session()

    # Create unique folder name using datetime for summary writer
    datetime_str = str(datetime.datetime.now())
    logdir = logdir + '/train'
    train_writer = tf.summary.FileWriter(logdir + '/' + datetime_str,
                                         sess.graph)

    # Create init op
    init = tf.global_variables_initializer()

    # Continue from last saved checkpoint
    if not train_config.overwrite_checkpoints:
        trainer_utils.load_checkpoints(checkpoint_dir, saver)
        if len(saver.last_checkpoints) > 0:
            # Resume this MoE training run from its latest checkpoint.
            checkpoint_to_restore = saver.last_checkpoints[-1]
            saver.restore(sess, checkpoint_to_restore)
        else:
            # Initialize the variables
            # Restore checkpoints from original avod model.
            # Give the correct path to restore
            checkpoint_path_start = \
                train_config.moe_config.initial_avod_checkpoint_path
            variables_to_restore = dict()
            for var in var_all_but_var_moe:
                variables_to_restore[var.op.name] = slim.get_unique_variable(
                    var.op.name)
            init_assign_op, init_feed_dict = slim.assign_from_checkpoint(
                checkpoint_path_start, variables_to_restore)
            # Initialize everything, then overwrite the AVOD subset
            # with the pretrained checkpoint values.
            sess.run(init)
            sess.run(init_assign_op, init_feed_dict)
    ##############################################################################################
    else:
        # Initialize the variables
        sess.run(init)

    # Read the global step if restored
    global_step = tf.train.global_step(sess, global_step_tensor)
    print('Starting from step {} / {}'.format(global_step, max_iterations))

    # Main Training Loop
    last_time = time.time()
    for step in range(global_step, max_iterations + 1):

        # Save checkpoint
        if step % checkpoint_interval == 0:
            global_step = tf.train.global_step(sess, global_step_tensor)
            saver.save(sess,
                       save_path=checkpoint_path,
                       global_step=global_step)
            print('Step {} / {}, Checkpoint saved to {}-{:08d}'.format(
                step, max_iterations, checkpoint_path, global_step))

        # Create feed_dict for inferencing
        feed_dict = model.create_feed_dict()

        # Write summaries and train op
        if step % summary_interval == 0:
            current_time = time.time()
            time_elapsed = current_time - last_time
            last_time = current_time

            # Only train_op1 (the MoE update) is run; train_op0 has a
            # zero learning rate and is never executed here.
            train_op_loss, summary_out = sess.run([train_op1, summary_merged],
                                                  feed_dict=feed_dict)
            print(train_op_loss)
            print('Step {}, Total Loss {:0.3f}, Time Elapsed {:0.3f} s'.format(
                step, train_op_loss, time_elapsed))
            train_writer.add_summary(summary_out, step)

        else:
            # Run the train op only
            sess.run(train_op1, feed_dict)

    # Close the summary writers
    train_writer.close()
def test_vgg_layers_build(self):
    """Builds the BEV VGG classifier and runs one training step.

    Asserts that a single train_op step on real data produces a loss
    below 1.
    """
    train_config_text_proto = """
    optimizer {
      gradient_descent {
        learning_rate {
          constant_learning_rate {
            learning_rate: 0.1
          }
        }
      }
    }
    """
    train_config = train_pb2.TrainConfig()
    text_format.Merge(train_config_text_proto, train_config)

    global_summaries = set([])
    batch_size = 1

    with tf.Graph().as_default():
        with tf.name_scope('input'):
            # BEV image placeholder
            image_placeholder = tf.placeholder(tf.float32,
                                               (None, 700, 800, 6))
            image_summary = tf.expand_dims(image_placeholder, axis=0)
            tf.summary.image("image", image_summary, max_outputs=5)

        # Check invalid BEV shape
        bev_shape = (300, 300)
        processed_image = self.bev_vgg_cls.preprocess_input(
            image_placeholder, bev_shape)

        predictions, end_points = self.bev_vgg_cls.build(
            processed_image, num_classes=1, is_training=True)

        feed_dict, label_pl = fill_feed_dict(self.dataset,
                                             image_placeholder,
                                             batch_size)

        ###########################
        # Loss Function
        ###########################
        cross_entropy = tf.nn.weighted_cross_entropy_with_logits(
            label_pl, predictions, 1.0)
        loss_tensor = tf.reduce_mean(cross_entropy)

        ###########################
        # Optimizer
        ###########################
        training_optimizer = optimizer_builder.build(
            train_config.optimizer, global_summaries)

        ###########################
        # Train-op
        ###########################
        train_op = slim.learning.create_train_op(loss_tensor,
                                                 training_optimizer)

        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        # One optimization step; train_op evaluates to the loss value.
        loss = sess.run(train_op, feed_dict=feed_dict)
        self.assertLess(loss, 1)
        print('Loss ', loss)