def main(_):
    """Build the MCNN training graph and run the TF-Slim training loop.

    Pulls one batch from the dataset's one-shot iterator, reshapes the image
    and density-map tensors to fixed sizes, builds the model and its loss,
    registers summaries, and trains with Adam via ``slim.learning.train``.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``--dataset_dir`` is unset or empty.
    """
    # Reject the empty string as well as None, matching the other training
    # entry points in this file.
    if not FLAGS.dataset_dir:
        raise ValueError(
            'you must supply the dataset directory with --dataset_dir=')

    tf.logging.set_verbosity(tf.logging.DEBUG)
    with tf.Graph().as_default():
        global_step = tf.train.create_global_step()

        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_dir, FLAGS)
        iterator = dataset.make_one_shot_iterator()

        model = mcnn.McnnModel()
        h, w = RESIZED_IMAGE_SHAPE
        # The density map is reshaped to the model's feature-map size, not
        # to the input image size.
        f_h = model.get_feature_map_height(h)
        f_w = model.get_feature_map_width(w)

        b_img, b_dmap = iterator.get_next()
        b_img = tf.reshape(b_img, [FLAGS.batch_size, h, w, CHANNELS])
        b_dmap = tf.reshape(b_dmap, [FLAGS.batch_size, f_h, f_w])

        pred_dmap = model.net(b_img)
        # The return value is not needed here; the total loss is obtained
        # from slim below.
        model.losses(pred_dmap, b_dmap)
        tf.summary.image('image', b_img)

        # Add summaries for losses.
        total_loss = slim.losses.get_total_loss()
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
        for loss in tf.get_collection(tf.GraphKeys.LOSSES):
            summaries.add(tf.summary.scalar(loss.op.name, loss))
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        learning_rate = tf_utils.create_learning_rate(FLAGS, DATA_SIZE,
                                                      global_step)
        summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        # NOTE(review): epsilon=1.0 is an unusually large Adam epsilon --
        # confirm it is intentional.
        optimizer = tf.train.AdamOptimizer(learning_rate,
                                           beta1=0.9,
                                           beta2=0.999,
                                           epsilon=1.0)

        variables_to_train = tf_utils.get_variables_to_train(FLAGS)
        train_op = slim.learning.create_train_op(
            total_loss, optimizer, variables_to_train=variables_to_train)

        saver = tf.train.Saver(max_to_keep=5,
                               keep_checkpoint_every_n_hours=1.0,
                               write_version=2,
                               pad_step_number=False)
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        slim.learning.train(train_op,
                            FLAGS.train_dir,
                            init_fn=tf_utils.get_init_fn(FLAGS),
                            summary_op=summary_op,
                            save_summaries_secs=FLAGS.save_summaries_secs,
                            log_every_n_steps=FLAGS.log_every_n_steps,
                            number_of_steps=FLAGS.max_number_steps,
                            saver=saver)
def main(_):
    """Assemble the SSD training graph on one or more clones and train it.

    Graph-construction order: deployment config and global step; dataset,
    network and preprocessing selection; input pipeline (provider,
    preprocessing, ground-truth encoding, batching, prefetch queue); clone
    creation; summaries; optional moving averages; optimizer and train op;
    and finally the ``slim.learning.train`` loop.

    Args:
        _: Ignored.

    Raises:
        ValueError: If ``FLAGS.dataset_dir`` is falsy.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.DEBUG)
    with tf.Graph().as_default():
        # Replica settings are fixed to a single replica; only the clone
        # count and CPU placement come from flags.
        deployment = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=0,
            num_replicas=1,
            num_ps_tasks=0)

        with tf.device(deployment.variables_device()):
            step = slim.create_global_step()

        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

        # Network class -> parameters (class count overridden) -> instance
        # -> input shape -> anchors.
        net_class = nets_factory.get_network(FLAGS.model_name)
        net_params = net_class.default_params._replace(
            num_classes=FLAGS.num_classes)
        detector = net_class(net_params)
        input_shape = detector.params.img_shape
        anchors = detector.anchors(input_shape)

        # Preprocessing falls back to the model name when no explicit
        # preprocessing name is supplied.
        preprocess_fn = preprocessing_factory.get_preprocessing(
            FLAGS.preprocessing_name or FLAGS.model_name, is_training=True)

        tf_utils.print_configuration(
            FLAGS.__flags, net_params, dataset.data_sources, FLAGS.train_dir)

        # ---------------------------------------------------------------
        # Input pipeline.
        # ---------------------------------------------------------------
        with tf.device(deployment.inputs_device()):
            with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
                provider = slim.dataset_data_provider.DatasetDataProvider(
                    dataset,
                    num_readers=FLAGS.num_readers,
                    common_queue_capacity=20 * FLAGS.batch_size,
                    common_queue_min=10 * FLAGS.batch_size,
                    shuffle=True)

            image, _shape, labels, boxes = provider.get(
                ['image', 'shape', 'object/label', 'object/bbox'])

            image, labels, boxes = preprocess_fn(
                image, labels, boxes,
                out_shape=input_shape,
                data_format=DATA_FORMAT)

            enc_classes, enc_locs, enc_scores = detector.bboxes_encode(
                labels, boxes, anchors)

            # Group sizes used to flatten / restore the tensor lists: one
            # image followed by three groups of len(anchors) tensors.
            group_sizes = [1] + [len(anchors)] * 3

            batched = tf.train.batch(
                tf_utils.reshape_list(
                    [image, enc_classes, enc_locs, enc_scores]),
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size)
            b_image, b_classes, b_locs, b_scores = tf_utils.reshape_list(
                batched, group_sizes)

            # A single prefetch queue feeds every clone.
            prefetch = slim.prefetch_queue.prefetch_queue(
                tf_utils.reshape_list(
                    [b_image, b_classes, b_locs, b_scores]),
                capacity=2 * deployment.num_clones)

        # ---------------------------------------------------------------
        # Per-clone model definition.
        # ---------------------------------------------------------------
        def clone_fn(queue):
            """Build one network clone fed from `queue` and add its losses."""
            images, classes, locs, scores = tf_utils.reshape_list(
                queue.dequeue(), group_sizes)

            scope = detector.arg_scope(weight_decay=FLAGS.weight_decay,
                                       data_format=DATA_FORMAT)
            with slim.arg_scope(scope):
                _predictions, pred_locs, pred_logits, end_points = \
                    detector.net(images, is_training=True)

            detector.losses(pred_logits, pred_locs,
                            classes, locs, scores,
                            match_threshold=FLAGS.match_threshold,
                            negative_ratio=FLAGS.negative_ratio,
                            alpha=FLAGS.loss_alpha,
                            label_smoothing=FLAGS.label_smoothing)
            return end_points

        # Summaries that already exist before any clone is built.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        clones = model_deploy.create_clones(deployment, clone_fn, [prefetch])
        first_scope = deployment.clone_scope(0)

        # Only the first clone's update ops are collected.
        pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                            first_scope)

        # Activation histogram and sparsity for each end point.
        end_points = clones[0].outputs
        for name in end_points:
            activation = end_points[name]
            summaries.update([
                tf.summary.histogram('activations/' + name, activation),
                tf.summary.scalar('sparsity/' + name,
                                  tf.nn.zero_fraction(activation)),
            ])

        # Loss scalars: regular losses first, then the extra ones.
        summaries.update(
            tf.summary.scalar(loss.op.name, loss)
            for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_scope))
        summaries.update(
            tf.summary.scalar(loss.op.name, loss)
            for loss in tf.get_collection('EXTRA_LOSSES', first_scope))

        # One histogram per model variable.
        summaries.update(
            tf.summary.histogram(var.op.name, var)
            for var in slim.get_model_variables())

        # ---------------------------------------------------------------
        # Optional moving averages.
        # ---------------------------------------------------------------
        averaged_vars, averager = None, None
        if FLAGS.moving_average_decay:
            averaged_vars = slim.get_model_variables()
            averager = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, step)

        # ---------------------------------------------------------------
        # Optimization.
        # ---------------------------------------------------------------
        with tf.device(deployment.optimizer_device()):
            learning_rate = tf_utils.configure_learning_rate(
                FLAGS, dataset.num_samples, step)
            optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        if FLAGS.moving_average_decay:
            pending_updates.append(averager.apply(averaged_vars))

        trainable = tf_utils.get_variables_to_train(FLAGS)

        total_loss, clone_grads = model_deploy.optimize_clones(
            clones, optimizer, var_list=trainable)
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        pending_updates.append(
            optimizer.apply_gradients(clone_grads, global_step=step))
        run_updates = tf.group(*pending_updates)
        # Evaluating the train tensor runs every update, then yields the loss.
        train_tensor = control_flow_ops.with_dependencies(
            [run_updates], total_loss, name='train_op')

        # Pick up summaries registered under the first clone's scope.
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_scope))
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        # ---------------------------------------------------------------
        # Run training.
        # ---------------------------------------------------------------
        session_config = tf.ConfigProto(
            log_device_placement=False,
            gpu_options=tf.GPUOptions(
                per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction))
        saver = tf.train.Saver(max_to_keep=5,
                               keep_checkpoint_every_n_hours=1.0,
                               write_version=2,
                               pad_step_number=False)
        slim.learning.train(
            train_tensor,
            logdir=FLAGS.train_dir,
            master='',
            is_chief=True,
            init_fn=tf_utils.get_init_fn(FLAGS),
            summary_op=summary_op,
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            saver=saver,
            save_interval_secs=FLAGS.save_interval_secs,
            session_config=session_config,
            sync_optimizer=None)
def main(_):
    """Build the multi-clone training graph and start TF-Slim training.

    Configures model_deploy from the task/replica flags, builds the network
    and its anchors, loads batches through ``load_batch.get_batch``, flattens
    the per-anchor ground-truth tensors, creates the clones and summaries,
    sets up the optimizer (with optional moving averages) and runs
    ``slim.learning.train``.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``--dataset_dir`` is unset or empty.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.DEBUG)
    with tf.Graph().as_default():
        # ----------------------------------------------------------------
        # Configure model_deploy.
        # ----------------------------------------------------------------
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=FLAGS.task,
            num_replicas=FLAGS.worker_replicas,
            num_ps_tasks=FLAGS.num_ps_tasks)

        # Create global_step.
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        network_fn = nets_factory.get_network(FLAGS.model_name)
        params = network_fn.default_params
        params = params._replace(match_threshold=FLAGS.match_threshold)
        # Initialize the net.
        net = network_fn(params)
        out_shape = net.params.img_shape
        anchors = net.anchors(out_shape)

        # Create the batched dataset.
        with tf.device(deploy_config.inputs_device()):
            # NOTE(review): the keyword is spelled `shuffe` here; presumably
            # that matches load_batch.get_batch's signature -- verify there
            # before renaming.
            b_image, b_glocalisations, b_gscores = load_batch.get_batch(
                FLAGS.dataset_dir,
                FLAGS.num_readers,
                FLAGS.batch_size,
                out_shape,
                net,
                anchors,
                FLAGS,
                file_pattern=FLAGS.file_pattern,
                is_training=True,
                shuffe=FLAGS.shuffle_data)

            # Flatten the per-anchor-layer ground truth and concatenate it
            # into one scores tensor and one [-1, 4] localisations tensor.
            allgscores = []
            allglocalization = []
            for i in range(len(anchors)):
                allgscores.append(tf.reshape(b_gscores[i], [-1]))
                allglocalization.append(
                    tf.reshape(b_glocalisations[i], [-1, 4]))

            b_gscores = tf.concat(allgscores, 0)
            b_glocalisations = tf.concat(allglocalization, 0)

            batch_queue = slim.prefetch_queue.prefetch_queue(
                tf_utils.reshape_list(
                    [b_image, b_glocalisations, b_gscores]),
                num_threads=8,
                capacity=16 * deploy_config.num_clones)

        # =============================================================== #
        # Define the model running on every GPU.
        # =============================================================== #
        def clone_fn(batch_queue):
            """Allows data parallelism by creating clones of network_fn."""
            # Dequeue batch: three groups of one tensor each.
            batch_shape = [1] * 3
            b_image, b_glocalisations, b_gscores = \
                tf_utils.reshape_list(batch_queue.dequeue(), batch_shape)

            # Construct the network.
            arg_scope = net.arg_scope(weight_decay=FLAGS.weight_decay,
                                      data_format=FLAGS.data_format)
            with slim.arg_scope(arg_scope):
                localisations, logits, end_points = \
                    net.net(b_image,
                            is_training=True,
                            use_batch=FLAGS.use_batch)

            # Add loss function.
            net.losses(logits, localisations,
                       b_glocalisations, b_gscores,
                       negative_ratio=FLAGS.negative_ratio,
                       use_hard_neg=FLAGS.use_hard_neg,
                       alpha=FLAGS.loss_alpha,
                       label_smoothing=FLAGS.label_smoothing)
            return end_points

        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        clones = model_deploy.create_clones(deploy_config, clone_fn,
                                            [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)

        # Only the first clone's update ops are collected.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       first_clone_scope)

        end_points = clones[0].outputs
        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.summary.histogram('activations/' + end_point, x))

        for loss in tf.get_collection('EXTRA_LOSSES', first_clone_scope):
            summaries.add(tf.summary.scalar(loss.op.name, loss))

        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        # ----------------------------------------------------------------
        # Configure the moving averages.
        # ----------------------------------------------------------------
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        # ----------------------------------------------------------------
        # Configure the optimization procedure.
        # ----------------------------------------------------------------
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = tf_utils.configure_learning_rate(
                FLAGS, FLAGS.num_samples, global_step)
            optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        if FLAGS.fine_tune:
            # Use a context manager so the file handle is closed after
            # loading instead of being leaked.
            # NOTE(security): pickle.load can execute arbitrary code; only
            # load this file from a trusted source.
            with open('nets/multiplier_300.pkl', 'rb') as multiplier_file:
                gradient_multipliers = pickle.load(multiplier_file)
        else:
            gradient_multipliers = None
        # NOTE(review): gradient_multipliers is not applied anywhere below;
        # the gradients are used unscaled. Confirm whether the multipliers
        # should be applied to clones_gradients.

        if FLAGS.moving_average_decay:
            # Update ops executed locally by trainer.
            update_ops.append(
                variable_averages.apply(moving_average_variables))

        # Variables to train.
        variables_to_train = tf_utils.get_variables_to_train(FLAGS)

        # Total loss and per-variable gradients across all clones.
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones, optimizer, var_list=variables_to_train)
        # Add total_loss to summary.
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # Create gradient updates.
        grad_updates = optimizer.apply_gradients(clones_gradients,
                                                 global_step=global_step)
        update_ops.append(grad_updates)

        update_op = tf.group(*update_ops)
        # Evaluating train_tensor runs every update op, then returns the loss.
        train_tensor = control_flow_ops.with_dependencies(
            [update_op], total_loss, name='train_op')

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or
        # _gather_clone_loss().
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        # =============================================================== #
        # Kicks off the training.
        # =============================================================== #
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction,
            allocator_type="BFC")
        config = tf.ConfigProto(
            gpu_options=gpu_options,
            log_device_placement=False,
            allow_soft_placement=True,
            inter_op_parallelism_threads=0,
            intra_op_parallelism_threads=1,
        )
        saver = tf.train.Saver(max_to_keep=5,
                               keep_checkpoint_every_n_hours=1.0,
                               write_version=2,
                               pad_step_number=False)
        slim.learning.train(train_tensor,
                            logdir=FLAGS.train_dir,
                            master='',
                            is_chief=True,
                            init_fn=tf_utils.get_init_fn(FLAGS),
                            summary_op=summary_op,
                            number_of_steps=FLAGS.max_number_of_steps,
                            log_every_n_steps=FLAGS.log_every_n_steps,
                            save_summaries_secs=FLAGS.save_summaries_secs,
                            saver=saver,
                            save_interval_secs=FLAGS.save_interval_secs,
                            session_config=config,
                            sync_optimizer=None)
def main(_):
    """Train SSD-300 (VGG) on the Pascal VOC 2012 'train' split.

    Single-device variant: builds the input pipeline, the network and its
    losses directly in the default graph (no model_deploy clones), optimizes
    with Adam and runs ``slim.learning.train``.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``--dataset_dir`` is unset or empty.
    """
    # Fail early with a clear message, like the other training entry points
    # in this file, instead of failing later inside the dataset loader.
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.DEBUG)
    with tf.Graph().as_default():
        # Create global_step.
        global_step = slim.create_global_step()

        # Select the dataset.
        dataset = pascalvoc_2012.get_split('train', FLAGS.dataset_dir)

        # Get the SSD network and its anchors.
        ssd_class = ssd_vgg_300.SSDNet
        ssd_params = ssd_class.default_params._replace(
            num_classes=FLAGS.num_classes)
        ssd_net = ssd_class(ssd_params)
        ssd_shape = ssd_net.params.img_shape
        # Compute the position and size of every prior (anchor) box
        # [anchor=(x, y, h, w), ...].
        ssd_anchors = ssd_net.anchors(ssd_shape)

        # =============================================================== #
        # Create a dataset provider and batches.
        # =============================================================== #
        with tf.name_scope('pascalvoc_2012_data_provider'):
            provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=FLAGS.num_readers,
                common_queue_capacity=20 * FLAGS.batch_size,
                common_queue_min=10 * FLAGS.batch_size,
                shuffle=True)

        # Get for SSD network: image, labels, bboxes.
        [image, _shape, glabels, gbboxes] = provider.get(
            ['image', 'shape', 'object/label', 'object/bbox'])

        # Pre-processing image, labels and bboxes.
        image, glabels, gbboxes = \
            ssd_vgg_preprocessing.preprocess_image(image, glabels, gbboxes,
                                                   out_shape=ssd_shape,
                                                   data_format=DATA_FORMAT,
                                                   is_training=True)

        # Encode groundtruth labels and bboxes.
        gclasses, glocalisations, gscores = \
            ssd_net.bboxes_encode(glabels, gbboxes, ssd_anchors)
        # Group sizes for flattening / restoring the tensor lists: one image
        # followed by three groups of len(ssd_anchors) tensors.
        batch_shape = [1] + [len(ssd_anchors)] * 3

        # Training batches and queue.
        r = tf.train.batch(
            tf_utils.reshape_list(
                [image, gclasses, glocalisations, gscores]),
            batch_size=FLAGS.batch_size,
            num_threads=FLAGS.num_preprocessing_threads,
            capacity=5 * FLAGS.batch_size)
        b_image, b_gclasses, b_glocalisations, b_gscores = \
            tf_utils.reshape_list(r, batch_shape)

        # Intermediate queueing.
        batch_queue = slim.prefetch_queue.prefetch_queue(
            tf_utils.reshape_list(
                [b_image, b_gclasses, b_glocalisations, b_gscores]),
            capacity=2)

        # Dequeue batch.
        b_image, b_gclasses, b_glocalisations, b_gscores = \
            tf_utils.reshape_list(batch_queue.dequeue(), batch_shape)

        # Construct SSD network.
        # Read the network's default arguments.
        arg_scope = ssd_net.arg_scope(weight_decay=FLAGS.weight_decay,
                                      data_format=DATA_FORMAT)
        with slim.arg_scope(arg_scope):
            _predictions, localisations, logits, end_points = \
                ssd_net.net(b_image, is_training=True)

        # Add loss function.
        ssd_net.losses(logits, localisations,
                       b_gclasses, b_glocalisations, b_gscores,
                       match_threshold=FLAGS.match_threshold,
                       negative_ratio=FLAGS.negative_ratio,
                       alpha=FLAGS.loss_alpha,
                       label_smoothing=0.0)

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # =============================================================== #
        # Add summaries.
        # =============================================================== #
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        # Add summaries for end_points.
        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.summary.histogram('activations/' + end_point, x))
            summaries.add(
                tf.summary.scalar('sparsity/' + end_point,
                                  tf.nn.zero_fraction(x)))

        # Add summaries for losses and extra losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES):
            summaries.add(tf.summary.scalar(loss.op.name, loss))
        for loss in tf.get_collection('EXTRA_LOSSES'):
            summaries.add(tf.summary.scalar(loss.op.name, loss))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        # =============================================================== #
        # Configure the optimization procedure.
        # =============================================================== #
        learning_rate = tf_utils.configure_learning_rate(
            FLAGS, dataset.num_samples, global_step)
        # NOTE(review): epsilon=1.0 is an unusually large Adam epsilon --
        # confirm it is intentional.
        optimizer = tf.train.AdamOptimizer(learning_rate,
                                           beta1=0.9,
                                           beta2=0.999,
                                           epsilon=1.0)
        summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        # Variables to train.
        variables_to_train = tf_utils.get_variables_to_train(FLAGS)

        # Total loss is the sum of everything in the LOSSES collection.
        total_loss = tf.add_n(tf.get_collection(tf.GraphKeys.LOSSES))
        gradients = optimizer.compute_gradients(total_loss,
                                                var_list=variables_to_train)
        # Add total_loss to summary.
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # Create gradient updates.
        grad_updates = optimizer.apply_gradients(gradients,
                                                 global_step=global_step)
        update_ops.append(grad_updates)

        # Group all update ops into a single operation.
        update_op = tf.group(*update_ops)
        # Ensure total_loss is only fetched after every update op has run.
        train_tensor = control_flow_ops.with_dependencies(
            [update_op], total_loss, name='train_op')

        # Pick up any summaries registered since the initial gather.
        summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES))
        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        saver = tf.train.Saver(max_to_keep=5,
                               keep_checkpoint_every_n_hours=1.0,
                               write_version=2,
                               pad_step_number=False)
        slim.learning.train(train_tensor,
                            logdir=FLAGS.train_dir,
                            init_fn=tf_utils.get_init_fn(FLAGS),
                            summary_op=summary_op,
                            number_of_steps=FLAGS.max_number_of_steps,
                            log_every_n_steps=FLAGS.log_every_n_steps,
                            save_summaries_secs=FLAGS.save_summaries_secs,
                            saver=saver,
                            save_interval_secs=FLAGS.save_interval_secs)
def main(_):
    """Train an SSD detector with TF-Slim using the model_deploy clone API.

    Builds, in order: the deployment config and global step, the dataset,
    network and preprocessing function, the queued input pipeline, the
    network clones with their losses, the summaries, the optional moving
    averages and the optimizer, then hands the train tensor to
    ``slim.learning.train``.

    Args:
        _: Ignored.

    Raises:
        ValueError: If ``FLAGS.dataset_dir`` is falsy.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')
    tf.logging.set_verbosity(tf.logging.DEBUG)

    graph = tf.Graph()
    with graph.as_default():
        config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=0,
            num_replicas=1,
            num_ps_tasks=0)
        with tf.device(config.variables_device()):
            global_step = slim.create_global_step()

        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)

        # Network setup: class, parameters, instance, image shape, anchors.
        network_class = nets_factory.get_network(FLAGS.model_name)
        network_params = network_class.default_params._replace(
            num_classes=FLAGS.num_classes)
        network = network_class(network_params)
        image_shape = network.params.img_shape
        anchor_layers = network.anchors(image_shape)

        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        preprocess = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        tf_utils.print_configuration(FLAGS.__flags, network_params,
                                     dataset.data_sources, FLAGS.train_dir)

        # ---- Input pipeline -------------------------------------------
        with tf.device(config.inputs_device()):
            with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
                provider = slim.dataset_data_provider.DatasetDataProvider(
                    dataset,
                    num_readers=FLAGS.num_readers,
                    common_queue_capacity=20 * FLAGS.batch_size,
                    common_queue_min=10 * FLAGS.batch_size,
                    shuffle=True)

            fields = ['image', 'shape', 'object/label', 'object/bbox']
            image, _shape, gt_labels, gt_boxes = provider.get(fields)

            image, gt_labels, gt_boxes = preprocess(image, gt_labels,
                                                    gt_boxes,
                                                    out_shape=image_shape,
                                                    data_format=DATA_FORMAT)
            gt_classes, gt_locs, gt_scores = network.bboxes_encode(
                gt_labels, gt_boxes, anchor_layers)

            # One image, then three groups of len(anchor_layers) tensors.
            layout = [1] + [len(anchor_layers)] * 3

            flat_example = tf_utils.reshape_list(
                [image, gt_classes, gt_locs, gt_scores])
            flat_batch = tf.train.batch(
                flat_example,
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size)
            batch_groups = tf_utils.reshape_list(flat_batch, layout)
            batch_image, batch_classes, batch_locs, batch_scores = \
                batch_groups

            batch_queue = slim.prefetch_queue.prefetch_queue(
                tf_utils.reshape_list(
                    [batch_image, batch_classes, batch_locs, batch_scores]),
                capacity=2 * config.num_clones)

        # ---- Model replicated on every clone --------------------------
        def clone_fn(source_queue):
            """Dequeue a batch, build the network and register its losses."""
            dequeued = tf_utils.reshape_list(source_queue.dequeue(), layout)
            in_image, in_classes, in_locs, in_scores = dequeued

            with slim.arg_scope(
                    network.arg_scope(weight_decay=FLAGS.weight_decay,
                                      data_format=DATA_FORMAT)):
                outputs = network.net(in_image, is_training=True)
            _predictions, out_locs, out_logits, out_end_points = outputs

            network.losses(out_logits, out_locs,
                           in_classes, in_locs, in_scores,
                           match_threshold=FLAGS.match_threshold,
                           negative_ratio=FLAGS.negative_ratio,
                           alpha=FLAGS.loss_alpha,
                           label_smoothing=FLAGS.label_smoothing)
            return out_end_points

        # ---- Summaries ------------------------------------------------
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
        track = summaries.add
        scalar = tf.summary.scalar
        histogram = tf.summary.histogram

        clones = model_deploy.create_clones(config, clone_fn, [batch_queue])
        scope0 = config.clone_scope(0)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope0)

        end_points = clones[0].outputs
        for key in end_points:
            tensor = end_points[key]
            track(histogram('activations/' + key, tensor))
            track(scalar('sparsity/' + key, tf.nn.zero_fraction(tensor)))

        for collection in (tf.GraphKeys.LOSSES, 'EXTRA_LOSSES'):
            for loss in tf.get_collection(collection, scope0):
                track(scalar(loss.op.name, loss))

        for model_var in slim.get_model_variables():
            track(histogram(model_var.op.name, model_var))

        # ---- Moving averages (only when a decay is configured) --------
        use_averages = bool(FLAGS.moving_average_decay)
        if use_averages:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables = None
            variable_averages = None

        # ---- Optimizer ------------------------------------------------
        with tf.device(config.optimizer_device()):
            learning_rate = tf_utils.configure_learning_rate(
                FLAGS, dataset.num_samples, global_step)
            optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate)
            track(scalar('learning_rate', learning_rate))

        if use_averages:
            update_ops.append(
                variable_averages.apply(moving_average_variables))

        total_loss, gradients = model_deploy.optimize_clones(
            clones,
            optimizer,
            var_list=tf_utils.get_variables_to_train(FLAGS))
        track(scalar('total_loss', total_loss))

        update_ops.append(
            optimizer.apply_gradients(gradients, global_step=global_step))
        # The train tensor yields total_loss only after all updates ran.
        train_tensor = control_flow_ops.with_dependencies(
            [tf.group(*update_ops)], total_loss, name='train_op')

        summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES, scope0))
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        # ---- Training loop --------------------------------------------
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
        session_config = tf.ConfigProto(log_device_placement=False,
                                        gpu_options=gpu_options)
        saver = tf.train.Saver(max_to_keep=5,
                               keep_checkpoint_every_n_hours=1.0,
                               write_version=2,
                               pad_step_number=False)

        train_kwargs = dict(
            logdir=FLAGS.train_dir,
            master='',
            is_chief=True,
            init_fn=tf_utils.get_init_fn(FLAGS),
            summary_op=summary_op,
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            saver=saver,
            save_interval_secs=FLAGS.save_interval_secs,
            session_config=session_config,
            sync_optimizer=None)
        slim.learning.train(train_tensor, **train_kwargs)
def main(_):
    """Build a single-device detection training graph and train it.

    Creates the dataset provider, the model and its anchors, the
    preprocessing / ground-truth encoding / batching pipeline, the network
    and its loss, the summaries and the optimizer update, then runs
    ``slim.learning.train``.

    Args:
        _: Ignored.

    Raises:
        ValueError: If ``FLAGS.dataset_dir`` is falsy.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.DEBUG)
    with tf.Graph().as_default():
        step = slim.create_global_step()

        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

        # Preprocessing falls back to the model name when no explicit
        # preprocessing name is supplied.
        preprocess_fn = preprocessing_factory.get_preprocessing(
            FLAGS.preprocessing_name or FLAGS.model_name, is_training=True)

        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            num_readers=FLAGS.num_readers,
            common_queue_capacity=20 * FLAGS.batch_size,
            common_queue_min=10 * FLAGS.batch_size,
            shuffle=True)

        model_class = nets_factory.get_network(FLAGS.model_name)
        model = model_class()
        anchors = model.anchors()
        input_shape = model.input_shape

        image, _shape, labels, boxes = provider.get(
            ['image', 'shape', 'object/label', 'object/bbox'])

        # Preprocess the example (image, labels and boxes together).
        image, labels, boxes = preprocess_fn(
            image, labels, boxes,
            out_shape=input_shape,
            data_format=DATA_FORMAT)

        # Encode the ground truth against the anchors.
        enc_classes, enc_locs, enc_scores = model.bboxes_encode(
            labels, boxes, anchors)

        b_image, b_classes, b_locs, b_scores = tf.train.batch(
            [image, enc_classes, enc_locs, enc_scores],
            batch_size=FLAGS.batch_size,
            num_threads=FLAGS.num_preprocessing_threads,
            capacity=5 * FLAGS.batch_size)

        # Summaries are gathered before the network is built.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        _predictions, pred_locs, pred_logits, _end_points = model.TF_net(
            b_image, is_training=True)

        loss_total = model.losses(
            pred_logits, pred_locs,
            b_classes, b_locs, b_scores,
            match_threshold=FLAGS.match_threshold,
            negative_ratio=FLAGS.negative_ratio,
            alpha=FLAGS.loss_alpha,
            label_smoothing=FLAGS.label_smoothing)
        summaries.add(tf.summary.scalar('total_loss', loss_total))

        # Scalar per loss (regular, then extra) and histogram per variable.
        summaries.update(
            tf.summary.scalar(item.op.name, item)
            for item in tf.get_collection(tf.GraphKeys.LOSSES))
        summaries.update(
            tf.summary.scalar(item.op.name, item)
            for item in tf.get_collection('EXTRA_LOSSES'))
        summaries.update(
            tf.summary.histogram(var.op.name, var)
            for var in slim.get_model_variables())

        # Moving averages are only set up when a decay is configured.
        averaged_vars, averager = None, None
        if FLAGS.moving_average_decay:
            averaged_vars = slim.get_model_variables()
            averager = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, step)

        learning_rate = tf_utils.configure_learning_rate(
            FLAGS, dataset.num_samples, step)
        summaries.add(tf.summary.scalar('learning_rate', learning_rate))
        optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate)

        trainable = tf_utils.get_variables_to_train(FLAGS)
        grads_and_vars = optimizer.compute_gradients(loss_total, trainable)
        apply_step = optimizer.apply_gradients(grads_and_vars,
                                               global_step=step)

        # Collected update ops, then moving averages, then the gradient step.
        pending = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if FLAGS.moving_average_decay:
            pending.append(averager.apply(averaged_vars))
        pending.append(apply_step)

        # The train tensor yields the loss only after all updates have run.
        train_tensor = control_flow_ops.with_dependencies(
            [tf.group(*pending)], loss_total, name='train_op')

        session_config = tf.ConfigProto(
            log_device_placement=False,
            gpu_options=tf.GPUOptions(
                per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction))
        saver = tf.train.Saver(max_to_keep=5,
                               keep_checkpoint_every_n_hours=1.0,
                               write_version=2,
                               pad_step_number=False)
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        slim.learning.train(
            train_tensor,
            logdir=FLAGS.train_dir,
            master='',
            is_chief=True,
            init_fn=tf_utils.get_init_fn(FLAGS),
            summary_op=summary_op,
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            saver=saver,
            save_interval_secs=FLAGS.save_interval_secs,
            session_config=session_config,
            sync_optimizer=None)
def main(_):
    """Build the SSD training graph with model_deploy and run slim training.

    The graph is assembled in this order: deployment config and global step,
    dataset and SSD network/anchors, input pipeline (provider, preprocessing,
    ground-truth encoding, batching, prefetch queue), one network clone per
    configured device, summaries, optional moving averages, optimizer and
    gradient updates, and finally ``slim.learning.train``.

    Args:
        _: unused positional argument.

    Raises:
        ValueError: if ``FLAGS.dataset_dir`` is empty.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.DEBUG)
    with tf.Graph().as_default():
        # Config model_deploy. Keep TF Slim Models structure.
        # Useful if multiple GPUs and/or servers are needed in the future.
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,  # author's note: 1
            clone_on_cpu=FLAGS.clone_on_cpu,  # author's note: False
            replica_id=0,
            num_replicas=1,
            num_ps_tasks=0)
        # Create global_step.
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        # Select the dataset.
        # Author's note: e.g. 'pascalvoc_2012', 'train', and the directory
        # that holds the TFRecord files; files are presumably named
        # 'voc_2012_%s_*.tfrecord' with %s being 'train' or 'test'.
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)

        # Get the SSD network and its anchors.
        ssd_class = nets_factory.get_network(FLAGS.model_name)  # e.g. 'ssd_300_vgg'
        # Override num_classes in the default parameters
        # (author's note: 21 in the original setup).
        ssd_params = ssd_class.default_params._replace(
            num_classes=FLAGS.num_classes)
        ssd_net = ssd_class(ssd_params)  # instantiate the network wrapper
        ssd_shape = ssd_net.params.img_shape  # author's note: (300, 300)
        ssd_anchors = ssd_net.anchors(ssd_shape)  # build the anchor boxes

        # Select the preprocessing function.
        # Falls back to the model name when no preprocessing name is given.
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        tf_utils.print_configuration(FLAGS.__flags, ssd_params,
                                     dataset.data_sources, FLAGS.train_dir)
        # =================================================================== #
        # Create a dataset provider and batches.
        # =================================================================== #
        # Author's note: the inputs device is '/job:ps/device:CPU:0' or
        # '/device:CPU:0'.
        # NOTE(review): scope extents in this function were reconstructed from
        # a whitespace-collapsed source; confirm against the original layout.
        with tf.device(deploy_config.inputs_device()):
            with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
                provider = slim.dataset_data_provider.DatasetDataProvider(
                    dataset,  # author's note: expects a slim.dataset.Dataset
                    num_readers=FLAGS.num_readers,
                    common_queue_capacity=20 * FLAGS.batch_size,
                    common_queue_min=10 * FLAGS.batch_size,
                    shuffle=True)
            # Get for SSD network: image, labels, bboxes.
            # Items are requested from the provider by their record field names.
            [image, shape, glabels, gbboxes] = provider.get(
                ['image', 'shape', 'object/label', 'object/bbox'])
            # Pre-processing image, labels and bboxes.
            # Author's note on results: image 'CHW', labels (n,), bboxes (n, 4)
            # -- TODO confirm.
            image, glabels, gbboxes = \
                image_preprocessing_fn(image, glabels, gbboxes,
                                       out_shape=ssd_shape,  # author's note: (300, 300)
                                       data_format=DATA_FORMAT)  # author's note: 'NCHW'
            # Encode groundtruth labels and bboxes.
            # Author's note: each result is a list with one entry per SSD
            # feature layer, shaped (m, m, k), (m, m, k, 4) and (m, m, k);
            # i.e. class ids, encoded box coordinates and IoU scores.
            gclasses, glocalisations, gscores = \
                ssd_net.bboxes_encode(glabels, gbboxes, ssd_anchors)
            # Group sizes used to flatten/regroup the tensor lists:
            # one image followed by three lists of len(ssd_anchors) tensors.
            batch_shape = [1] + [len(ssd_anchors)] * 3

            # Training batches and queue.
            # Batched items: image, classes, box targets, scores.
            r = tf.train.batch(
                tf_utils.reshape_list(
                    [image, gclasses, glocalisations, gscores]),
                batch_size=FLAGS.batch_size,  # author's note: 32
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size)
            # pp.pprint([image, gclasses, glocalisations, gscores])
            # pp.pprint(r)
            b_image, b_gclasses, b_glocalisations, b_gscores = \
                tf_utils.reshape_list(r, batch_shape)
            # pp.pprint([b_image, b_gclasses, b_glocalisations, b_gscores])

            # Intermediate queueing: unique batch computation pipeline for all
            # GPUs running the training.
            batch_queue = slim.prefetch_queue.prefetch_queue(
                tf_utils.reshape_list(
                    [b_image, b_gclasses, b_glocalisations, b_gscores]),
                capacity=2 * deploy_config.num_clones)

        # =================================================================== #
        # Define the model running on every GPU.
        # =================================================================== #
        def clone_fn(batch_queue):
            """Allows data parallelism by creating multiple
            clones of network_fn."""
            # Dequeue batch and regroup the flat tensor list.
            b_image, b_gclasses, b_glocalisations, b_gscores = \
                tf_utils.reshape_list(batch_queue.dequeue(), batch_shape)

            # Construct SSD network.
            # The network's own arg_scope, with weight decay and data format
            # taken from the flags / module constant.
            arg_scope = ssd_net.arg_scope(weight_decay=FLAGS.weight_decay,
                                          data_format=DATA_FORMAT)
            with slim.arg_scope(arg_scope):
                # Author's note on shapes (TODO confirm):
                # predictions: (BS, H, W, 4, 21)
                # localisations: (BS, H, W, 4, 4)
                # logits: (BS, H, W, 4, 21)
                predictions, localisations, logits, end_points = \
                    ssd_net.net(b_image, is_training=True)
            # Add loss function. The return value is not used here.
            ssd_net.losses(
                logits, localisations,
                b_gclasses, b_glocalisations, b_gscores,
                match_threshold=FLAGS.match_threshold,  # author's note: .5
                negative_ratio=FLAGS.negative_ratio,  # author's note: 3
                alpha=FLAGS.loss_alpha,  # author's note: 1
                label_smoothing=FLAGS.label_smoothing)  # author's note: .0
            return end_points

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # =================================================================== #
        # Add summaries from first clone.
        # =================================================================== #
        clones = model_deploy.create_clones(deploy_config, clone_fn,
                                            [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)
        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by network_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       first_clone_scope)

        # Add summaries for end_points.
        end_points = clones[0].outputs
        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.summary.histogram('activations/' + end_point, x))
            summaries.add(
                tf.summary.scalar('sparsity/' + end_point,
                                  tf.nn.zero_fraction(x)))
        # Add summaries for losses and extra losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
            summaries.add(tf.summary.scalar(loss.op.name, loss))
        for loss in tf.get_collection('EXTRA_LOSSES', first_clone_scope):
            summaries.add(tf.summary.scalar(loss.op.name, loss))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        # =================================================================== #
        # Configure the moving averages.
        # =================================================================== #
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        # =================================================================== #
        # Configure the optimization procedure.
        # =================================================================== #
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = tf_utils.configure_learning_rate(
                FLAGS, dataset.num_samples, global_step)
            optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        if FLAGS.moving_average_decay:
            # Update ops executed locally by trainer.
            update_ops.append(
                variable_averages.apply(moving_average_variables))

        # Variables to train.
        variables_to_train = tf_utils.get_variables_to_train(FLAGS)

        # optimize_clones returns the total loss and the clone gradients.
        # Author's note: total_loss is not itself optimised here, it is only
        # reported (summary below and value of the train tensor).
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones,
            optimizer,
            var_list=variables_to_train)
        # Add total_loss to summary.
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # Create gradient updates.
        grad_updates = optimizer.apply_gradients(clones_gradients,
                                                 global_step=global_step)
        update_ops.append(grad_updates)
        update_op = tf.group(*update_ops)
        # Evaluating train_tensor runs every update op, then yields total_loss.
        train_tensor = control_flow_ops.with_dependencies([update_op],
                                                          total_loss,
                                                          name='train_op')

        # Add the summaries from the first clone (those registered while the
        # clone was being built).
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))
        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        # =================================================================== #
        # Kicks off the training.
        # =================================================================== #
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
        config = tf.ConfigProto(log_device_placement=False,
                                gpu_options=gpu_options)
        saver = tf.train.Saver(max_to_keep=5,
                               keep_checkpoint_every_n_hours=1.0,
                               write_version=2,
                               pad_step_number=False)
        slim.learning.train(
            train_tensor,
            logdir=FLAGS.train_dir,
            master='',
            is_chief=True,
            init_fn=tf_utils.get_init_fn(FLAGS),  # see tf_utils.get_init_fn
            summary_op=summary_op,  # merged summary op built above
            number_of_steps=FLAGS.max_number_of_steps,  # training step limit
            log_every_n_steps=FLAGS.log_every_n_steps,  # logging frequency in steps
            save_summaries_secs=FLAGS.save_summaries_secs,  # summary interval
            saver=saver,  # the tf.train.Saver built above
            save_interval_secs=FLAGS.save_interval_secs,  # checkpoint interval
            session_config=config,  # session options
            sync_optimizer=None)
def main(_):
    """Build the RON detection training graph and run slim training.

    The input pipeline (dataset provider, difficult-object filtering,
    preprocessing, ground-truth encoding, batching) is built first, then the
    RON network and its losses, then summaries, optional moving averages, a
    piecewise-constant learning-rate schedule, the optimizer and the train
    tensor, which is handed to ``slim.learning.train``.

    Args:
        _: unused positional argument.

    Raises:
        ValueError: if ``FLAGS.data_dir`` is empty.
    """
    # NOTE(review): this function was reconstructed from a whitespace-collapsed
    # source. The extents of the '/cpu:0' and '/gpu:0' device scopes below are
    # a best guess and must be confirmed against the original layout.
    if not FLAGS.data_dir:
        raise ValueError('You must supply the dataset directory with --data_dir')

    tf.logging.set_verbosity(tf.logging.DEBUG)
    # The author disabled the explicit `with tf.Graph().as_default():` context
    # here, so ops are added to the default graph, pinned to the CPU unless an
    # inner device scope overrides it.
    with tf.device('/cpu:0'):
        global_step = slim.create_global_step()

        # Select the dataset.
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.data_dir)

        # Get the RON network and its anchors.
        ron_class = nets_factory.get_network(FLAGS.model_name)
        ron_params = ron_class.default_params._replace(num_classes=FLAGS.num_classes)
        ron_net = ron_class(ron_params)
        ron_shape = ron_net.params.img_shape
        ron_anchors = ron_net.anchors(ron_shape)

        tf_utils.print_configuration(FLAGS.__flags, ron_params,
                                     dataset.data_sources, FLAGS.model_dir)
        # =================================================================== #
        # Create a dataset provider and batches.
        # =================================================================== #
        with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
            provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=FLAGS.num_readers,
                common_queue_capacity=120 * FLAGS.batch_size,
                common_queue_min=80 * FLAGS.batch_size,
                shuffle=True)
        # Get for RON network: image, labels, bboxes.
        # Author's note: gbboxes are (ymin, xmin, ymax, xmax).
        [image, shape, glabels, gbboxes, isdifficult] = provider.get(
            ['image', 'shape', 'object/label', 'object/bbox', 'object/difficult'])
        # Build a boolean mask over the objects. The predicate counts entries
        # whose `isdifficult` value differs from 1. When there are none (every
        # entry equals 1) only the first object is kept; otherwise the objects
        # with `isdifficult < 1` are kept.
        # Presumably 1 marks a "difficult" object -- confirm against the
        # dataset definition.
        isdifficult_mask = tf.cond(
            tf.reduce_sum(tf.cast(tf.logical_not(tf.equal(tf.ones_like(isdifficult), isdifficult)), tf.float32)) < 1.,
            lambda: tf.one_hot(0, tf.shape(isdifficult)[0], on_value=True, off_value=False, dtype=tf.bool),
            lambda: isdifficult < tf.ones_like(isdifficult))

        glabels = tf.boolean_mask(glabels, isdifficult_mask)
        gbboxes = tf.boolean_mask(gbboxes, isdifficult_mask)
        # (Debug hooks based on tf.Print were disabled here by the author.)

        # Select the preprocessing function; falls back to the model name
        # when no preprocessing name is given.
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        # Pre-processing image, labels and bboxes.
        image, glabels, gbboxes = image_preprocessing_fn(image, glabels, gbboxes,
                                                         out_shape=ron_shape,
                                                         data_format=DATA_FORMAT)
        # (Debug hooks based on tf.Print / tf.py_func(save_image_with_bbox, ...)
        # were disabled here by the author.)

        # Encode groundtruth labels and bboxes.
        # Author's notes: glocalisations is the regression target, gclasses is
        # the ground-truth label, gscores is the Jaccard score with the
        # ground truth.
        gclasses, glocalisations, gscores, gbboxes = \
            ron_net.bboxes_encode(glabels, gbboxes, ron_anchors,
                                  positive_threshold=FLAGS.match_threshold,
                                  ignore_threshold=FLAGS.neg_threshold)
        # (More disabled save_image_with_bbox debug hooks were here.)

        # Group sizes of one batch element: one image followed by three lists
        # (gclasses, glocalisations, gscores) of len(ron_anchors) tensors each.
        batch_shape = [1] + [len(ron_anchors)] * 3

        # Training batches and queue.
        r = tf.train.batch(
            tf_utils.reshape_list([image, gclasses, glocalisations, gscores]),
            batch_size=FLAGS.batch_size,
            num_threads=FLAGS.num_preprocessing_threads,
            capacity=120 * FLAGS.batch_size)
        b_image, b_gclasses, b_glocalisations, b_gscores = \
            tf_utils.reshape_list(r, batch_shape)

        with tf.device('/gpu:0'):
            # Construct RON network.
            arg_scope = ron_net.arg_scope(weight_decay=FLAGS.weight_decay,
                                          data_format=DATA_FORMAT)
            with slim.arg_scope(arg_scope):
                predictions, logits, objness_pred, objness_logits, localisations, end_points = \
                    ron_net.net(b_image, is_training=True)
            # Add loss function. The return value is not used; the total loss
            # is fetched later through tf.losses.get_total_loss().
            ron_net.losses(logits, localisations, objness_logits, objness_pred,
                           b_gclasses, b_glocalisations, b_gscores,
                           match_threshold=FLAGS.match_threshold,
                           neg_threshold=FLAGS.neg_threshold,
                           objness_threshold=FLAGS.objectness_thres,
                           negative_ratio=FLAGS.negative_ratio,
                           alpha=FLAGS.loss_alpha,
                           beta=FLAGS.loss_beta,
                           label_smoothing=FLAGS.label_smoothing)
        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        # Add summaries for losses and extra losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES):
            summaries.add(tf.summary.scalar(loss.op.name, loss))
        for loss in tf.get_collection('EXTRA_LOSSES'):
            summaries.add(tf.summary.scalar(loss.op.name, loss))

        # =================================================================== #
        # Configure the moving averages.
        # =================================================================== #
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        # =================================================================== #
        # Configure the optimization procedure.
        # =================================================================== #
        # The tf_utils.configure_learning_rate helper was disabled by the
        # author in favour of the explicit schedule below: the base rate is
        # scaled by 1, 0.1 and 0.001 with boundaries at steps 90000 and 115000,
        # and never drops below FLAGS.end_learning_rate.
        # NOTE(review): the last factor skips from 0.1 to 0.001 -- confirm
        # that 0.01 was not intended.
        lr_values = [FLAGS.learning_rate * decay for decay in [1., 0.1, 0.001]]
        learning_rate_ = tf.train.piecewise_constant(tf.cast(global_step, tf.int32),
                                                     [90000, 115000],
                                                     lr_values)
        learning_rate = tf.maximum(learning_rate_, tf.constant(FLAGS.end_learning_rate, dtype=learning_rate_.dtype))

        optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate)

        if FLAGS.moving_average_decay:
            # Update ops executed locally by trainer.
            update_ops.append(variable_averages.apply(moving_average_variables))

        summaries.add(tf.summary.scalar('learning_rate', learning_rate))
        # Variables to train.
        variables_to_train = tf_utils.get_variables_to_train(FLAGS)

        # Total loss as aggregated by tf.losses.
        total_loss = tf.losses.get_total_loss()
        # Add total_loss to summary.
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # Create gradient updates.
        grads = optimizer.compute_gradients(
            total_loss,
            variables_to_train)
        grad_updates = optimizer.apply_gradients(grads,
                                                 global_step=global_step)
        update_ops.append(grad_updates)
        update_op = tf.group(*update_ops)
        # Evaluating train_tensor runs every update op, then yields total_loss.
        train_tensor = control_flow_ops.with_dependencies([update_op], total_loss,
                                                          name='train_op')
        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        # =================================================================== #
        # Kicks off the training.
        # =================================================================== #
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False,
                                intra_op_parallelism_threads=FLAGS.num_cpu_threads,
                                inter_op_parallelism_threads=FLAGS.num_cpu_threads,
                                gpu_options=gpu_options)
        # The "keep" period is the checkpoint save interval expressed in hours.
        saver = tf.train.Saver(max_to_keep=5,
                               keep_checkpoint_every_n_hours=FLAGS.save_interval_secs/3600.,
                               write_version=2,
                               pad_step_number=False)

        def wrapper_debug(sess):
            # Wraps the session in the tfdbg CLI with an inf/nan tensor filter.
            # Currently unused: session_wrapper=None is passed below.
            sess = tf_debug.LocalCLIDebugWrapperSession(sess, thread_name_filter="MainThread$")
            sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
            return sess

        slim.learning.train(
            train_tensor,
            logdir=FLAGS.model_dir,
            master='',
            is_chief=True,
            # A checkpoint path under the data directory is handed to
            # get_init_fn; presumably the pretrained VGG16 backbone -- confirm.
            init_fn=tf_utils.get_init_fn(FLAGS, os.path.join(FLAGS.data_dir, 'vgg_model/vgg16_reducedfc.ckpt')),
            summary_op=summary_op,
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            saver=saver,
            save_interval_secs=FLAGS.save_interval_secs,
            session_config=config,
            session_wrapper=None,  # set to wrapper_debug to enable tfdbg
            sync_optimizer=None)
def main(_):
    """Build the segmentation training graph with model_deploy and train it.

    The batch loader is chosen by ``FLAGS.model`` ('unet', 'patch' or 'cmc').
    One clone of ``model_cmc.Model`` is built per configured device and
    gradients are optionally scaled (``FLAGS.fine_tune``) and clipped
    (``FLAGS.clip_gradient_norm``). Training runs through
    ``slim.learning.train`` with a custom step function that prints the mean
    IoU every ``FLAGS.validation_check`` steps.

    Args:
        _: unused positional argument.

    Raises:
        ValueError: if ``FLAGS.dataset_dir`` is empty, or if ``FLAGS.model``
            is not one of the supported loader names.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.DEBUG)
    with tf.Graph().as_default():
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=FLAGS.task,
            num_replicas=FLAGS.worker_replicas,
            num_ps_tasks=FLAGS.num_ps_tasks)
        # Create global_step
        with tf.device(deploy_config.variables_device()):
            global_step = tf.train.get_or_create_global_step()

        net = model_cmc.Model()
        with tf.device(deploy_config.inputs_device()):
            # All loaders are called with the same arguments; only the module
            # differs.
            # NOTE(review): the keyword is spelled `shuffe` in the original
            # call; it is kept as-is because it must match the loaders'
            # signatures, which are not visible here.
            batch_args = (FLAGS.dataset_dir, FLAGS.num_readers,
                          FLAGS.batch_size, None, FLAGS)
            batch_kwargs = dict(file_pattern=FLAGS.file_pattern,
                                is_training=True,
                                shuffe=FLAGS.shuffle_data)
            if FLAGS.model == 'unet':
                batch_queue = load_batch.get_batch(*batch_args, **batch_kwargs)
            elif FLAGS.model == 'patch':
                batch_queue = load_batch_patch.get_batch(*batch_args,
                                                         **batch_kwargs)
            elif FLAGS.model == 'cmc':
                batch_queue = load_batch_cmc.get_batch(*batch_args,
                                                       **batch_kwargs)
            else:
                # Fail here with a clear message instead of a NameError on
                # `batch_queue` further down.
                raise ValueError(
                    "Unsupported --model value %r; expected 'unet', 'patch' "
                    "or 'cmc'" % (FLAGS.model,))

        # =================================================================== #
        # Define the model running on every GPU.
        # =================================================================== #
        print("Batch_loading_successful")

        def clone_fn(batch_queue):
            """Build one network clone; return its end points and mean IoU."""
            b_image, label = batch_queue
            logits, end_points = net.net(b_image)
            # Add loss function. Only the metric is returned; the loss value
            # itself is not used here.
            _, mean_iou = net.weighted_losses(logits, label)
            return end_points, mean_iou

        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        clones = model_deploy.create_clones(deploy_config, clone_fn,
                                            [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       first_clone_scope)

        end_points, mean_iou = clones[0].outputs
        # mean_iou is indexed as a pair: element 1 is run with the other update
        # ops on every step, element 0 is read in train_step_fn (presumably
        # the (value, update_op) pair of a streaming metric -- confirm).
        update_ops.append(mean_iou[1])

        for loss in tf.get_collection('EXTRA_LOSSES', first_clone_scope):
            summaries.add(tf.summary.scalar(loss.op.name, loss))

        #################################
        # Configure the moving averages #
        #################################
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        #########################################
        # Configure the optimization procedure. #
        #########################################
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = tf_utils.configure_learning_rate(
                FLAGS, FLAGS.num_samples, global_step)
            optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        if FLAGS.fine_tune:
            # Close the file deterministically instead of leaking the handle.
            # NOTE: pickle can execute arbitrary code on load; this file must
            # come from a trusted source.
            with open('nets/multiplier_300.pkl', 'rb') as multiplier_file:
                gradient_multipliers = pickle.load(multiplier_file)
        else:
            gradient_multipliers = None

        if FLAGS.moving_average_decay:
            # Update ops executed locally by trainer.
            update_ops.append(
                variable_averages.apply(moving_average_variables))

        # Variables to train.
        variables_to_train = tf_utils.get_variables_to_train(FLAGS)

        # optimize_clones returns the total loss and the clone gradients.
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones,
            optimizer,
            var_list=variables_to_train)
        # Add total_loss to summary.
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        if gradient_multipliers:
            with ops.name_scope('multiply_grads'):
                clones_gradients = slim.learning.multiply_gradients(
                    clones_gradients, gradient_multipliers)

        if FLAGS.clip_gradient_norm > 0:
            with ops.name_scope('clip_grads'):
                clones_gradients = slim.learning.clip_gradient_norms(
                    clones_gradients, FLAGS.clip_gradient_norm)

        # Create gradient updates.
        grad_updates = optimizer.apply_gradients(clones_gradients,
                                                 global_step=global_step)
        update_ops.append(grad_updates)

        update_op = tf.group(*update_ops)
        # Evaluating train_tensor runs every update op, then yields total_loss.
        train_tensor = control_flow_ops.with_dependencies([update_op],
                                                          total_loss,
                                                          name='train_op')

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or
        # _gather_clone_loss().
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))
        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        # =================================================================== #
        # Kicks off the training.
        # =================================================================== #
        def train_step_fn(session, *args, **kwargs):
            """Run one training step, periodically printing loss and IoU."""
            total_loss, should_stop = train_step(session, *args, **kwargs)

            if train_step_fn.step % FLAGS.validation_check == 0:
                _mean_iou = session.run(train_step_fn.mean_iou)
                # NOTE(review): the value is printed with a '%' suffix as-is;
                # confirm whether it is already expressed in percent.
                print('evaluation step %d - loss = %.4f mean_iou = %.2f%%' %
                      (train_step_fn.step, total_loss, _mean_iou))

            train_step_fn.step += 1
            return [total_loss, should_stop]

        # State for the step function is stored as function attributes.
        train_step_fn.step = 0
        train_step_fn.end_points = end_points
        train_step_fn.mean_iou = mean_iou[0]

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction,
            allocator_type="BFC")
        config = tf.ConfigProto(
            gpu_options=gpu_options,
            log_device_placement=False,
            allow_soft_placement=True,
            inter_op_parallelism_threads=0,
            intra_op_parallelism_threads=1,
        )
        saver = tf.train.Saver(max_to_keep=5,
                               keep_checkpoint_every_n_hours=1.0,
                               write_version=2,
                               pad_step_number=False)

        slim.learning.train(train_tensor,
                            logdir=FLAGS.train_dir,
                            master='',
                            is_chief=True,
                            train_step_fn=train_step_fn,
                            init_fn=tf_utils.get_init_fn(FLAGS),
                            summary_op=summary_op,
                            number_of_steps=FLAGS.max_number_of_steps,
                            log_every_n_steps=FLAGS.log_every_n_steps,
                            save_summaries_secs=FLAGS.save_summaries_secs,
                            saver=saver,
                            save_interval_secs=FLAGS.save_interval_secs,
                            session_config=config,
                            sync_optimizer=None)
def main():
    """Train an image classifier with TF-Slim using the module-level config.

    Builds the input pipeline, the network, a softmax cross-entropy loss,
    activation/variable/loss summaries and the optimizer, then runs
    ``slim.learning.train`` and prints the last batch loss and the
    checkpoint directory.
    """
    # NOTE(review): the extent of the '/gpu:0' device scope was reconstructed
    # from a whitespace-collapsed source; confirm against the original layout.
    # Show TensorFlow log output at DEBUG level.
    tf.logging.set_verbosity(tf.logging.DEBUG)

    # Everything below is added to a fresh default graph.
    with tf.Graph().as_default():
        # Dataset and its provider.
        dataset = dataset_factory.get_dataset(
            config.dataset_name,
            config.dataset_split_name,
            config.dataset_dir)
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            num_readers=config.num_readers,
            common_queue_capacity=20 * config.batch_size,
            common_queue_min=10 * config.batch_size,
            shuffle=True)
        raw_image, label = provider.get(['image', 'label'])

        # Network constructor.
        network_fn = nets_factory.get_network_fn(
            config.model_name,
            7,
            is_training=True,
            dropout_keep_prob=config.dropout_keep_prob)

        # Preprocessing function.
        preprocess = preprocessing_factory.get_preprocessing(
            config.preprocessing_name, is_training=True)

        with tf.device('/gpu:0'):
            # Global step counter.
            global_step = slim.create_global_step()

            # Preprocess a single example, then batch it.
            processed_image = preprocess(raw_image, 299, 299)
            batched = tf.train.batch(
                [processed_image, raw_image, label],
                batch_size=config.batch_size,
                num_threads=config.num_preprocessing_threads,
                capacity=5 * config.batch_size)
            images, _, labels = batched

            # Instantiate the model on the batch.
            logits, end_points = network_fn(
                images, num_classes=dataset.num_classes, is_training=True)

            # Print the end points, then the model variables.
            line_format = 'name = {}, shape = {}'
            for tensor in end_points.values():
                print(line_format.format(tensor.name, tensor.get_shape()))
            print("\n")
            print("Parameters")
            for model_var in slim.get_model_variables():
                print(line_format.format(model_var.name,
                                         model_var.get_shape()))

            # Loss.
            slim.losses.softmax_cross_entropy(logits, labels)
            total_loss = slim.losses.get_total_loss()

            # Summaries: activations, total loss, variables.
            summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
            for point_name, activation in end_points.items():
                summaries.add(tf.summary.histogram(
                    'activations/' + point_name, activation))
                summaries.add(tf.summary.scalar(
                    'sparsity/' + point_name,
                    tf.nn.zero_fraction(activation)))
            summaries.add(tf.summary.scalar('total_loss', total_loss))
            for model_var in slim.get_model_variables():
                summaries.add(
                    tf.summary.histogram(model_var.op.name, model_var))

            # Optimizer.
            learning_rate = tf_utils.configure_learning_rate(
                config, dataset.num_samples, global_step)
            optimizer = tf_utils.configure_optimizer(config, learning_rate)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

            # Merge the summaries.
            summary_op = tf.summary.merge(list(summaries), name='summary_op')

            # Training op.
            train_op = slim.learning.create_train_op(total_loss, optimizer)

        # Session / GPU options.
        session_config = tf.ConfigProto(
            log_device_placement=False,
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(
                per_process_gpu_memory_fraction=config.gpu_memory_fraction))

        # Saver used for checkpoints (and resuming).
        saver = tf.train.Saver(max_to_keep=5,
                               keep_checkpoint_every_n_hours=1.0,
                               write_version=2,
                               pad_step_number=False)

        # Run training.
        train_kwargs = dict(
            logdir=config.train_dir,
            init_fn=tf_utils.get_init_fn(config),
            summary_op=summary_op,
            number_of_steps=config.max_number_of_steps,
            log_every_n_steps=config.log_every_n_steps,
            save_summaries_secs=config.save_summaries_secs,
            saver=saver,
            save_interval_secs=config.save_interval_secs,
            session_config=session_config,
            sync_optimizer=None)
        final_loss = slim.learning.train(train_op, **train_kwargs)

        print("Finished training. Last batch loss:", final_loss)
        print("Checkpoint saved in %s" % config.train_dir)
def main(_):
    """Train SSD-300 (VGG) on the device named by ``FLAGS.gpu_train``.

    Batches come from ``load_batch.get_batch``; the network and loss are
    built on the training device, summaries are registered for variables,
    extra losses and the learning rate, and training is run by
    ``slim.learning.train``.

    Args:
        _: unused positional argument.

    Raises:
        ValueError: if ``FLAGS.dataset_dir`` is empty.
    """
    # NOTE(review): the device scope extents were reconstructed from a
    # whitespace-collapsed source; confirm against the original layout.
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.DEBUG)

    with tf.Graph().as_default():
        global_step = slim.create_global_step()

        # SSD network with its default parameters, plus its anchors.
        ssd_net = ssd_vgg_300.SSDNet()
        input_shape = ssd_net.params.img_shape
        anchors = ssd_net.anchors(input_shape)

        batch = load_batch.get_batch(FLAGS.dataset_dir,
                                     FLAGS.num_readers,
                                     FLAGS.batch_size,
                                     input_shape,
                                     ssd_net,
                                     anchors,
                                     FLAGS.num_preprocessing_threads,
                                     is_training=True)
        b_image, b_gclasses, b_glocalisations, b_gscores = batch

        with tf.device(FLAGS.gpu_train):
            net_scope = ssd_net.arg_scope(weight_decay=FLAGS.weight_decay)
            with slim.arg_scope(net_scope):
                net_outputs = ssd_net.net(b_image, is_training=True)
            # Only the localisations and logits feed the loss.
            _, localisations, logits, _ = net_outputs

            # Loss function.
            total_loss = ssd_net.losses(
                logits, localisations,
                b_gclasses, b_glocalisations, b_gscores,
                match_threshold=FLAGS.match_threshold,
                negative_ratio=FLAGS.negative_ratio,
                alpha=FLAGS.loss_alpha,
                label_smoothing=FLAGS.label_smoothing)

        # Summaries: start from what is already registered, then add variable
        # histograms and extra-loss scalars. This local set is not handed to
        # slim.learning.train below (no summary_op argument is passed).
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
        summaries.update(
            tf.summary.histogram(model_var.op.name, model_var)
            for model_var in slim.get_model_variables())
        summaries.update(
            tf.summary.scalar(extra_loss.op.name, extra_loss)
            for extra_loss in tf.get_collection('EXTRA_LOSSES'))

        with tf.device(FLAGS.gpu_train):
            # Optimization procedure.
            learning_rate = tf_utils.configure_learning_rate(
                FLAGS, FLAGS.num_samples, global_step)
            optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

            # Training op.
            train_op = slim.learning.create_train_op(total_loss, optimizer)

        # =================================================================== #
        # Kicks off the training.
        # =================================================================== #
        session_config = tf.ConfigProto(
            gpu_options=tf.GPUOptions(
                per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction),
            log_device_placement=False,
            allow_soft_placement=True)

        saver = tf.train.Saver(max_to_keep=1,
                               keep_checkpoint_every_n_hours=1.0,
                               write_version=2,
                               pad_step_number=False)

        train_kwargs = dict(
            logdir=FLAGS.train_dir,
            master='',
            is_chief=True,
            init_fn=tf_utils.get_init_fn(FLAGS),
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            saver=saver,
            save_interval_secs=FLAGS.save_interval_secs,
            session_config=session_config,
            sync_optimizer=None)
        slim.learning.train(train_op, **train_kwargs)
def main(_):
    """Train an SSD network using the slim ``model_deploy`` clone machinery.

    All settings are read from the module-level ``FLAGS``; the positional
    argument is ignored. The graph is assembled in this order: deployment
    config, dataset and network, input queues, per-clone model function,
    summaries, moving averages, optimizer, and finally the training loop.

    Raises:
        ValueError: if ``FLAGS.dataset_dir`` is empty or unset.
    """
    if not FLAGS.dataset_dir:
        raise ValueError('You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.DEBUG)

    with tf.Graph().as_default():
        # One replica, no parameter servers; only the clone count and the
        # CPU/GPU choice come from the flags.
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=0,
            num_replicas=1,
            num_ps_tasks=0)

        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

        # Network class, its parameters with the class count overridden,
        # and the anchors for its input shape.
        ssd_class = nets_factory.get_network(FLAGS.model_name)
        ssd_params = ssd_class.default_params._replace(
            num_classes=FLAGS.num_classes)
        ssd_net = ssd_class(ssd_params)
        ssd_shape = ssd_net.params.img_shape
        ssd_anchors = ssd_net.anchors(ssd_shape)

        # The preprocessing defaults to the one named after the model.
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        tf_utils.print_configuration(
            FLAGS.__flags, ssd_params, dataset.data_sources, FLAGS.train_dir)

        # ------------------------------------------------------------------
        # Input pipeline: provider -> preprocessing -> encoding -> batching.
        # ------------------------------------------------------------------
        with tf.device(deploy_config.inputs_device()):
            with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
                provider = slim.dataset_data_provider.DatasetDataProvider(
                    dataset,
                    num_readers=FLAGS.num_readers,
                    common_queue_capacity=20 * FLAGS.batch_size,
                    common_queue_min=10 * FLAGS.batch_size,
                    shuffle=True)

            wanted_items = ['image', 'shape', 'object/label', 'object/bbox']
            image, shape, glabels, gbboxes = provider.get(wanted_items)

            image, glabels, gbboxes = image_preprocessing_fn(
                image, glabels, gbboxes,
                out_shape=ssd_shape,
                data_format=DATA_FORMAT)

            gclasses, glocalisations, gscores = ssd_net.bboxes_encode(
                glabels, gbboxes, ssd_anchors)

            # Layout used to flatten/unflatten the nested tensor lists: one
            # image followed by three groups of len(ssd_anchors) tensors.
            batch_shape = [1] + [len(ssd_anchors)] * 3

            flat_batch = tf.train.batch(
                tf_utils.reshape_list(
                    [image, gclasses, glocalisations, gscores]),
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size)
            b_image, b_gclasses, b_glocalisations, b_gscores = (
                tf_utils.reshape_list(flat_batch, batch_shape))

            # A single prefetch queue feeds every clone.
            batch_queue = slim.prefetch_queue.prefetch_queue(
                tf_utils.reshape_list(
                    [b_image, b_gclasses, b_glocalisations, b_gscores]),
                capacity=2 * deploy_config.num_clones)

        # ------------------------------------------------------------------
        # Model function executed once per clone.
        # ------------------------------------------------------------------
        def clone_fn(queue):
            """Dequeue one batch, build the network and register its losses.

            Returns the network's end points.
            """
            images, classes, locs, scores = tf_utils.reshape_list(
                queue.dequeue(), batch_shape)

            net_arg_scope = ssd_net.arg_scope(
                weight_decay=FLAGS.weight_decay, data_format=DATA_FORMAT)
            with slim.arg_scope(net_arg_scope):
                _, localisations, logits, end_points = ssd_net.net(
                    images, is_training=True)

            # The return value is not used here; the total loss is obtained
            # later from model_deploy.optimize_clones.
            ssd_net.losses(
                logits, localisations,
                classes, locs, scores,
                match_threshold=FLAGS.match_threshold,
                negative_ratio=FLAGS.negative_ratio,
                alpha=FLAGS.loss_alpha,
                label_smoothing=FLAGS.label_smoothing)
            return end_points

        # Summaries that exist before any clone is created.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # ------------------------------------------------------------------
        # Clones, and summaries taken from the first clone.
        # ------------------------------------------------------------------
        clones = model_deploy.create_clones(
            deploy_config, clone_fn, [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)

        # Update ops registered under the first clone's scope only.
        update_ops = tf.get_collection(
            tf.GraphKeys.UPDATE_OPS, first_clone_scope)

        # Activation histogram and sparsity for each end point.
        end_points = clones[0].outputs
        for name in end_points:
            activation = end_points[name]
            summaries.add(
                tf.summary.histogram('activations/' + name, activation))
            summaries.add(
                tf.summary.scalar('sparsity/' + name,
                                  tf.nn.zero_fraction(activation)))

        # Losses and extra losses of the first clone.
        summaries.update(
            tf.summary.scalar(item.op.name, item)
            for item in tf.get_collection(
                tf.GraphKeys.LOSSES, first_clone_scope))
        summaries.update(
            tf.summary.scalar(item.op.name, item)
            for item in tf.get_collection('EXTRA_LOSSES', first_clone_scope))

        # One histogram per model variable.
        summaries.update(
            tf.summary.histogram(var.op.name, var)
            for var in slim.get_model_variables())

        # ------------------------------------------------------------------
        # Moving averages (optional).
        # ------------------------------------------------------------------
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        # ------------------------------------------------------------------
        # Optimizer.
        # ------------------------------------------------------------------
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = tf_utils.configure_learning_rate(
                FLAGS, dataset.num_samples, global_step)
            optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        if FLAGS.moving_average_decay:
            # The averaging op runs together with the other update ops.
            update_ops.append(
                variable_averages.apply(moving_average_variables))

        variables_to_train = tf_utils.get_variables_to_train(FLAGS)

        # Combined loss and gradients across all clones.
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones, optimizer, var_list=variables_to_train)
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # The train tensor yields total_loss once every update op has run.
        grad_updates = optimizer.apply_gradients(
            clones_gradients, global_step=global_step)
        update_ops.append(grad_updates)
        update_op = tf.group(*update_ops)
        train_tensor = control_flow_ops.with_dependencies(
            [update_op], total_loss, name='train_op')

        # Pick up any summaries registered under the first clone's scope,
        # then merge everything into a single op.
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        # ------------------------------------------------------------------
        # Session configuration and training loop.
        # ------------------------------------------------------------------
        session_config = tf.ConfigProto(
            log_device_placement=False,
            gpu_options=tf.GPUOptions(
                per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction))

        saver = tf.train.Saver(
            max_to_keep=5,
            keep_checkpoint_every_n_hours=1.0,
            write_version=2,
            pad_step_number=False)

        # Built after the saver so graph ops are created in the same order
        # as when the call was inlined in the argument list.
        init_fn = tf_utils.get_init_fn(FLAGS)

        slim.learning.train(
            train_tensor,
            logdir=FLAGS.train_dir,
            master='',
            is_chief=True,
            init_fn=init_fn,
            summary_op=summary_op,
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            saver=saver,
            save_interval_secs=FLAGS.save_interval_secs,
            session_config=session_config,
            sync_optimizer=None)
def main(_):
    """Build a multi-GPU RON training graph and start the slim training loop.

    All settings are read from the module-level ``FLAGS``; the positional
    argument is ignored. One tower is built per GPU on its own slice of the
    batch; per-tower gradients are combined with ``average_gradients`` and
    applied once.

    Raises:
        ValueError: if ``FLAGS.data_dir`` is empty or unset.
    """
    if not FLAGS.data_dir:
        raise ValueError('You must supply the dataset directory with --data_dir')

    # Always build at least one tower.
    num_gpus = max(FLAGS.num_gpus, 1)

    tf.logging.set_verbosity(tf.logging.DEBUG)

    # NOTE(review): the whole graph is assumed to be built under this CPU
    # device scope, with each tower overriding it below -- confirm.
    with tf.device('/cpu:0'):
        global_step = slim.create_global_step()

        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.data_dir)

        # Network class, its parameters with the class count overridden,
        # and the anchors for its input shape.
        ron_class = nets_factory.get_network(FLAGS.model_name)
        ron_params = ron_class.default_params._replace(
            num_classes=FLAGS.num_classes)
        ron_net = ron_class(ron_params)
        ron_shape = ron_net.params.img_shape
        ron_anchors = ron_net.anchors(ron_shape)

        # ------------------------------------------------------------------
        # Input pipeline.
        # ------------------------------------------------------------------
        with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
            provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=FLAGS.num_readers,
                common_queue_capacity=120 * FLAGS.batch_size * num_gpus,
                common_queue_min=80 * FLAGS.batch_size * num_gpus,
                shuffle=True)

        # gbboxes are laid out as (ymin, xmin, ymax, xmax).
        [image, shape, glabels, gbboxes, isdifficult] = provider.get(
            ['image', 'shape', 'object/label', 'object/bbox',
             'object/difficult'])

        # Drop objects flagged as difficult (flag == 1). When every object
        # is flagged, keep the first one so the image still has a box.
        num_not_flagged = tf.reduce_sum(
            tf.cast(
                tf.logical_not(
                    tf.equal(tf.ones_like(isdifficult), isdifficult)),
                tf.float32))
        isdifficult_mask = tf.cond(
            num_not_flagged < 1.,
            lambda: tf.one_hot(0, tf.shape(isdifficult)[0],
                               on_value=True, off_value=False,
                               dtype=tf.bool),
            lambda: isdifficult < tf.ones_like(isdifficult))
        glabels = tf.boolean_mask(glabels, isdifficult_mask)
        gbboxes = tf.boolean_mask(gbboxes, isdifficult_mask)

        # The preprocessing defaults to the one named after the model.
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        image, glabels, gbboxes = image_preprocessing_fn(
            image, glabels, gbboxes,
            out_shape=ron_shape,
            data_format=DATA_FORMAT)

        # Encode the ground truth against the anchors:
        #   gclasses       -- ground-truth labels,
        #   glocalisations -- regression targets,
        #   gscores        -- jaccard scores with the ground truth.
        gclasses, glocalisations, gscores = ron_net.bboxes_encode(
            glabels, gbboxes, ron_anchors,
            positive_threshold=FLAGS.match_threshold,
            ignore_threshold=FLAGS.neg_threshold)

        # Layout used to flatten/unflatten the nested tensor lists: one
        # image followed by three groups of len(ron_anchors) tensors.
        batch_shape = [1] + [len(ron_anchors)] * 3

        # One large batch covering all GPUs; it is split per tower below.
        flat_batch = tf.train.batch(
            tf_utils.reshape_list([image, gclasses, glocalisations, gscores]),
            batch_size=FLAGS.batch_size * num_gpus,
            num_threads=FLAGS.num_preprocessing_threads,
            capacity=120 * FLAGS.batch_size * num_gpus)
        all_batch = tf_utils.reshape_list(flat_batch, batch_shape)

        def split_per_tower(tensors):
            """Split each tensor along axis 0; entry i holds tower i's parts."""
            pieces = [tf.split(t, num_or_size_splits=num_gpus, axis=0)
                      for t in tensors]
            return list(zip(*pieces))

        b_image = tf.split(all_batch[0], num_or_size_splits=num_gpus, axis=0)
        b_gclasses = split_per_tower(all_batch[1])
        b_glocalisations = split_per_tower(all_batch[2])
        b_gscores = split_per_tower(all_batch[3])

        # Summaries that exist before the towers are built.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # ------------------------------------------------------------------
        # Optimizer.
        # ------------------------------------------------------------------
        learning_rate = tf_utils.configure_learning_rate(
            FLAGS, dataset.num_samples, global_step)
        optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate)
        summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        # ------------------------------------------------------------------
        # One tower per GPU; variables are reused after the first tower.
        # ------------------------------------------------------------------
        arg_scope = ron_net.arg_scope(weight_decay=FLAGS.weight_decay,
                                      data_format=DATA_FORMAT)
        reuse_variables = False
        tower_grads = []
        loss_list = []
        with slim.arg_scope(arg_scope):
            for index in range(num_gpus):
                with tf.device('/gpu:%d' % index):
                    (predictions, logits, objness_pred, objness_logits,
                     localisations, end_points) = ron_net.net(
                         b_image[index], is_training=True,
                         reuse=reuse_variables)

                    ron_net.losses(
                        logits, localisations,
                        objness_logits, objness_pred,
                        b_gclasses[index],
                        b_glocalisations[index],
                        b_gscores[index],
                        match_threshold=FLAGS.match_threshold,
                        neg_threshold=FLAGS.neg_threshold,
                        objness_threshold=FLAGS.objectness_thres,
                        negative_ratio=FLAGS.negative_ratio,
                        alpha=FLAGS.loss_alpha,
                        beta=FLAGS.loss_beta,
                        label_smoothing=FLAGS.label_smoothing)
                    reuse_variables = True

                    # NOTE(review): get_total_loss() is not scoped to this
                    # tower, so for index > 0 it presumably also sums the
                    # losses registered by earlier towers -- confirm whether
                    # per-tower scoping is intended.
                    loss = tf.losses.get_total_loss()
                    loss_list.append(loss)

                    # Resolved inside the loop, after the tower exists;
                    # presumably the helper reads the graph's variable
                    # collections -- verify before hoisting.
                    variables_to_train = tf_utils.get_variables_to_train(
                        FLAGS)

                    grads = optimizer.compute_gradients(
                        loss, variables_to_train)
                    tower_grads.append(grads)

        # Collect update ops only now: tf.get_collection returns a snapshot,
        # so gathering them before the towers are built would miss every op
        # the network registers.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        reduce_grads = average_gradients(tower_grads)
        total_loss = tf.reduce_mean(tf.stack(loss_list, axis=0), axis=0)
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # ------------------------------------------------------------------
        # Moving averages (optional).
        # ------------------------------------------------------------------
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
            # The averaging op runs together with the other update ops.
            update_ops.append(
                variable_averages.apply(moving_average_variables))
        else:
            moving_average_variables, variable_averages = None, None

        # The train tensor yields total_loss once every update op has run.
        grad_updates = optimizer.apply_gradients(
            reduce_grads, global_step=global_step)
        update_ops.append(grad_updates)
        update_op = tf.group(*update_ops)
        train_tensor = control_flow_ops.with_dependencies(
            [update_op], total_loss, name='train_op')

        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        # ------------------------------------------------------------------
        # Session configuration and training loop.
        # ------------------------------------------------------------------
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)

        # The keep-forever interval is the save interval expressed in hours.
        saver = tf.train.Saver(
            max_to_keep=5,
            keep_checkpoint_every_n_hours=FLAGS.save_interval_secs / 3600.,
            write_version=2,
            pad_step_number=False)

        # Initialisation checkpoint path is <data_dir>/vgg_16.ckpt.
        init_fn = tf_utils.get_init_fn(
            FLAGS, os.path.join(FLAGS.data_dir, 'vgg_16.ckpt'))

        slim.learning.train(
            train_tensor,
            logdir=FLAGS.model_dir,
            master='',
            is_chief=True,
            init_fn=init_fn,
            summary_op=summary_op,
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            saver=saver,
            save_interval_secs=FLAGS.save_interval_secs,
            session_config=config,
            session_wrapper=None,
            sync_optimizer=None)
def main(_):
    """Build the detection training graph and start the slim training loop.

    All settings are read from the module-level ``FLAGS``; the positional
    argument is ignored. Gradients are computed and applied explicitly, and
    the resulting train op yields the total loss after all update ops ran.

    Raises:
        ValueError: if ``FLAGS.dataset_dir`` is empty or unset.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.DEBUG)

    with tf.Graph().as_default():
        # Network class and parameters, with the match threshold overridden
        # from the flags.
        network_fn = nets_factory.get_network(FLAGS.model_name)
        params = network_fn.default_params._replace(
            match_threshold=FLAGS.match_threshold)
        net = network_fn(params)
        out_shape = net.params.img_shape
        anchors = net.anchors(out_shape)

        global_step = slim.create_global_step()

        # Input pipeline: batched images and encoded ground truth.
        # NOTE(review): the keyword is spelled `shuffe`; it has to match the
        # parameter name declared by load_batch.get_batch -- confirm before
        # renaming.
        batch = load_batch.get_batch(
            FLAGS.dataset_dir,
            FLAGS.num_readers,
            FLAGS.batch_size,
            out_shape,
            net,
            anchors,
            FLAGS,
            file_pattern=FLAGS.file_pattern,
            is_training=True,
            shuffe=FLAGS.shuffle_data)
        b_image, b_glocalisations, b_gscores = batch

        # Forward pass and loss on the training device.
        with tf.device(FLAGS.gpu_train):
            net_arg_scope = net.arg_scope(weight_decay=FLAGS.weight_decay)
            with slim.arg_scope(net_arg_scope):
                localisations, logits, end_points = net.net(
                    b_image, is_training=True, use_batch=FLAGS.use_batch)

            total_loss = net.losses(
                logits,
                localisations,
                b_glocalisations,
                b_gscores,
                negative_ratio=FLAGS.negative_ratio,
                use_hard_neg=FLAGS.use_hard_neg,
                alpha=FLAGS.loss_alpha,
                label_smoothing=FLAGS.label_smoothing)

        # Summary ops. The set only accumulates the handles; it is not read
        # again in this function.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # Disabled: per-end-point activation and sparsity summaries, and
        # scalars for the LOSSES collection.
        # for end_point in end_points:
        #     x = end_points[end_point]
        #     summaries.add(tf.summary.histogram('activations/' + end_point, x))
        #     summaries.add(tf.summary.scalar('sparsity/' + end_point,
        #                                     tf.nn.zero_fraction(x)))
        # for loss in tf.get_collection(tf.GraphKeys.LOSSES):
        #     summaries.add(tf.summary.scalar(loss.op.name, loss))

        summaries.update(
            tf.summary.scalar(extra.op.name, extra)
            for extra in tf.get_collection('EXTRA_LOSSES'))

        # Disabled: one histogram per model variable.
        # for variable in slim.get_model_variables():
        #     summaries.add(tf.summary.histogram(variable.op.name, variable))

        # Taken after the network is built, so it sees the ops registered
        # during construction.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        # ------------------------------------------------------------------
        # Moving averages (optional).
        # ------------------------------------------------------------------
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        # NOTE(review): everything up to the train op is assumed to be
        # created inside this device scope -- confirm.
        with tf.device(FLAGS.gpu_train):
            learning_rate = tf_utils.configure_learning_rate(
                FLAGS, FLAGS.num_samples, global_step)
            optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate)
            # Disabled: learning-rate summary.
            # summaries.add(tf.summary.scalar('learning_rate', learning_rate))

            # Disabled: total loss rebuilt from the LOSSES collection.
            # loss = tf.get_collection(tf.GraphKeys.LOSSES)
            # total_loss = tf.add_n(loss)

            # Disabled: per-variable gradient multipliers for fine-tuning.
            # if FLAGS.fine_tune:
            #     gradient_multipliers = pickle.load(
            #         open('nets/multiplier_300.pkl', 'rb'))
            # else:
            #     gradient_multipliers = None

            if FLAGS.moving_average_decay:
                # The averaging op runs together with the other update ops.
                update_ops.append(
                    variable_averages.apply(moving_average_variables))

            variables_to_train = tf_utils.get_variables_to_train(FLAGS)

            # Explicit compute/apply; the train op yields total_loss once
            # every update op has run.
            vars_grad = optimizer.compute_gradients(
                total_loss, variables_to_train)
            grad_updates = optimizer.apply_gradients(
                vars_grad, global_step=global_step)
            update_ops.append(grad_updates)
            update_op = tf.group(*update_ops)
            train_op = control_flow_ops.with_dependencies(
                [update_op], total_loss, name='train_op')
            # Disabled alternative:
            # train_op = slim.learning.create_train_op(
            #     total_loss, optimizer,
            #     gradient_multipliers=gradient_multipliers)

        # ------------------------------------------------------------------
        # Session configuration and training loop.
        # ------------------------------------------------------------------
        session_config = tf.ConfigProto(
            gpu_options=tf.GPUOptions(
                per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction),
            log_device_placement=False,
            allow_soft_placement=True)

        saver = tf.train.Saver(
            max_to_keep=5,
            keep_checkpoint_every_n_hours=1.0,
            write_version=2,
            pad_step_number=False)

        # Built after the saver so graph ops are created in the same order
        # as when the call was inlined in the argument list.
        init_fn = tf_utils.get_init_fn(FLAGS)

        slim.learning.train(
            train_op,
            logdir=FLAGS.train_dir,
            master='',
            is_chief=True,
            init_fn=init_fn,
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            saver=saver,
            save_interval_secs=FLAGS.save_interval_secs,
            session_config=session_config,
            sync_optimizer=None)