def imagenet_input(is_training):
    """Build the batched ImageNet input tensors.

    Reads ImageNet examples, applies the 'mobilenet_v1' preprocessing to each
    image and groups the results into batches of FLAGS.batch_size.

    Args:
      is_training: bool; True reads the 'train' split with shuffling enabled,
        False reads the 'validation' split without shuffling.

    Returns:
      A batch of images and labels.
    """
    # One factory call; only the split name depends on the mode.
    split_name = 'train' if is_training else 'validation'
    dataset = dataset_factory.get_dataset(
        'imagenet', split_name, FLAGS.dataset_dir)

    provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        shuffle=is_training,
        common_queue_capacity=2 * FLAGS.batch_size,
        common_queue_min=FLAGS.batch_size)
    raw_image, raw_label = provider.get(['image', 'label'])

    preprocess = preprocessing_factory.get_preprocessing(
        'mobilenet_v1', is_training=is_training)
    processed_image = preprocess(
        raw_image, FLAGS.image_size, FLAGS.image_size)

    batched_images, batched_labels = tf.train.batch(
        tensors=[processed_image, raw_label],
        batch_size=FLAGS.batch_size,
        num_threads=4,
        capacity=5 * FLAGS.batch_size)
    return batched_images, batched_labels
def config_initialization():
    """Set up logging, the global `config` module and the training dataset.

    Returns:
      The dataset returned by dataset_factory.get_dataset for
      FLAGS.dataset_name / FLAGS.dataset_split_name / FLAGS.dataset_dir.

    Raises:
      ValueError: if --dataset_dir was not supplied.
    """
    # Image shape (height, width); also used to name the log file.
    image_shape = (FLAGS.train_image_height, FLAGS.train_image_width)

    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.DEBUG)
    util.init_logger(
        log_file='log_train_seglink_%d_%d.log' % image_shape,
        log_path=FLAGS.train_dir,
        stdout=False,
        mode='a')

    init_kwargs = dict(
        batch_size=FLAGS.batch_size,
        weight_decay=FLAGS.weight_decay,
        num_gpus=FLAGS.num_gpus,
        train_with_ignored=FLAGS.train_with_ignored,
        seg_loc_loss_weight=FLAGS.seg_loc_loss_weight,
        link_cls_loss_weight=FLAGS.link_cls_loss_weight,
    )
    config.init_config(image_shape, **init_kwargs)

    # Export the effective batch sizes computed by config.init_config.
    total_batch = config.batch_size
    per_gpu_batch = config.batch_size_per_gpu
    tf.summary.scalar('batch_size', total_batch)
    tf.summary.scalar('batch_size_per_gpu', per_gpu_batch)

    util.proc.set_proc_name('_'.join((FLAGS.model_name, FLAGS.dataset_name)))

    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
    config.print_config(FLAGS, dataset)
    return dataset
def main():
    """Fetch one preprocessed training batch and return it as evaluated data.

    Builds the slim input pipeline (dataset provider -> preprocessing ->
    batching -> prefetch queue) in a fresh graph, runs it once in a session
    and returns the dequeued batch.

    Configuration is read from names defined outside this function
    (dataset_name, dataset_split_name, dataset_dir, batch_size, num_clones,
    clone_on_cpu, task, worker_replicas, num_ps_tasks, num_readers,
    labels_offset, preprocessing_name, num_preprocessing_threads).

    Returns:
      A tuple (image_data, label_data, fp_data): the evaluated image batch,
      the one-hot encoded labels and the 'filepath' items of the batch.

    Raises:
      ValueError: if dataset_dir is empty.
    """
    with tf.Graph().as_default():
        if not dataset_dir:
            raise ValueError(
                'You must supply the dataset directory with --dataset_dir')

        deploy_config = model_deploy.DeploymentConfig(
            num_clones=num_clones,
            clone_on_cpu=clone_on_cpu,
            replica_id=task,
            num_replicas=worker_replicas,
            num_ps_tasks=num_ps_tasks)

        dataset = dataset_factory.get_dataset(
            dataset_name, dataset_split_name, dataset_dir)

        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        with tf.device(deploy_config.inputs_device()):
            with tf.name_scope('inputs'):
                provider = slim.dataset_data_provider.DatasetDataProvider(
                    dataset,
                    num_readers=num_readers,
                    common_queue_capacity=20 * batch_size,
                    common_queue_min=10 * batch_size)
                [image, label, fp] = provider.get(
                    ['image', 'label', 'filepath'])
                label -= labels_offset

                train_image_size = 224
                image = image_preprocessing_fn(
                    image, train_image_size, train_image_size)

                images, labels, fps = tf.train.batch(
                    [image, label, fp],
                    batch_size=batch_size,
                    num_threads=num_preprocessing_threads,
                    capacity=5 * batch_size)
                tf.image_summary('image', images, max_images=5)

                labels = slim.one_hot_encoding(
                    labels, dataset.num_classes - labels_offset)
                batch_queue = slim.prefetch_queue.prefetch_queue(
                    [images, labels, fps],
                    capacity=2 * deploy_config.num_clones)
                images, labels, fps = batch_queue.dequeue()

        # The context manager closes the session even when fetching fails.
        with tf.Session() as sess:
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess, coord)
            try:
                image_data, label_data, fp_data = sess.run(
                    [images, labels, fps])
            finally:
                # Always stop and join the queue-runner threads; otherwise a
                # failed run leaves them alive.
                coord.request_stop()
                coord.join(threads)
        return image_data, label_data, fp_data
def config_initialization():
    """Initialise the global `config` module and load the dataset.

    Returns:
      The dataset returned by dataset_factory.get_dataset for
      FLAGS.dataset_name / FLAGS.dataset_split_name / FLAGS.dataset_dir.

    Raises:
      ValueError: if --dataset_dir was not supplied.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.DEBUG)

    # Image shape (height, width) handed to config.init_config.
    height, width = FLAGS.train_image_height, FLAGS.train_image_width
    config.init_config((height, width), batch_size=FLAGS.batch_size)

    proc_name = '_'.join((FLAGS.model_name, FLAGS.dataset_name))
    util.proc.set_proc_name(proc_name)

    return dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
def main(_):
    """Build the inference graph of FLAGS.model_name and save its GraphDef.

    The network is applied to a float32 placeholder named 'input' of shape
    [1, image_size, image_size, 3]; the serialized GraphDef is written in
    binary form to FLAGS.output_file.

    Args:
      _: unused positional argument.

    Raises:
      ValueError: if --output_file was not supplied.
    """
    output_path = FLAGS.output_file
    if not output_path:
        raise ValueError(
            'You must supply the path to save to with --output_file')

    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default() as graph:
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, 'train', FLAGS.dataset_dir)
        num_classes = dataset.num_classes - FLAGS.labels_offset
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=num_classes,
            is_training=FLAGS.is_training)

        # Fall back to the network's own default when no size is given.
        side = FLAGS.image_size or network_fn.default_image_size
        input_tensor = tf.placeholder(
            name='input', dtype=tf.float32, shape=[1, side, side, 3])
        network_fn(input_tensor)

        serialized = graph.as_graph_def().SerializeToString()
        with gfile.GFile(FLAGS.output_file, 'wb') as out:
            out.write(serialized)
def config_initialization():
    """Initialise the global `config` module for evaluation and load the dataset.

    Evaluation always uses batch_size=1.

    Returns:
      The dataset returned by dataset_factory.get_dataset for
      FLAGS.dataset_name / FLAGS.dataset_split_name / FLAGS.dataset_dir.

    Raises:
      ValueError: if --dataset_dir was not supplied.
    """
    # Image shape (height, width) used during evaluation.
    image_shape = (FLAGS.eval_image_height, FLAGS.eval_image_width)

    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.DEBUG)

    eval_kwargs = dict(
        batch_size=1,
        seg_conf_threshold=FLAGS.seg_conf_threshold,
        link_conf_threshold=FLAGS.link_conf_threshold,
        train_with_ignored=FLAGS.train_with_ignored,
        seg_loc_loss_weight=FLAGS.seg_loc_loss_weight,
        link_cls_loss_weight=FLAGS.link_cls_loss_weight,
    )
    config.init_config(image_shape, **eval_kwargs)

    util.proc.set_proc_name(
        '_'.join(('eval', FLAGS.model_name, FLAGS.dataset_name)))

    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
    config.print_config(FLAGS, dataset, print_to_file=False)
    return dataset
def main(_):
    """Build the inference graph of FLAGS.model_name and save its GraphDef.

    The network is applied to a float32 placeholder named 'input'. For video
    models the placeholder has an extra FLAGS.num_frames dimension after the
    batch dimension. With --quantize the graph is rewritten by
    tf.contrib.quantize.create_eval_graph() before export. The GraphDef is
    written as text when --write_text_graphdef is set, otherwise as a
    serialized binary proto.

    Args:
      _: unused positional argument.

    Raises:
      ValueError: if --output_file is missing, or if --is_video_model is set
        without --num_frames.
    """
    if not FLAGS.output_file:
        raise ValueError(
            'You must supply the path to save to with --output_file')
    if FLAGS.is_video_model and not FLAGS.num_frames:
        raise ValueError(
            'Number of frames must be specified for video models with --num_frames')

    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default() as graph:
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, 'train', FLAGS.dataset_dir)
        num_classes = dataset.num_classes - FLAGS.labels_offset
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=num_classes,
            is_training=FLAGS.is_training)

        side = FLAGS.image_size or network_fn.default_image_size
        # Spatial/channel dimensions shared by both input layouts.
        frame_dims = [side, side, 3]
        if FLAGS.is_video_model:
            input_shape = [FLAGS.batch_size, FLAGS.num_frames] + frame_dims
        else:
            input_shape = [FLAGS.batch_size] + frame_dims

        input_tensor = tf.placeholder(
            name='input', dtype=tf.float32, shape=input_shape)
        network_fn(input_tensor)

        if FLAGS.quantize:
            tf.contrib.quantize.create_eval_graph()

        graph_def = graph.as_graph_def()
        if not FLAGS.write_text_graphdef:
            with gfile.GFile(FLAGS.output_file, 'wb') as out:
                out.write(graph_def.SerializeToString())
        else:
            out_dir, out_name = os.path.split(FLAGS.output_file)
            tf.io.write_graph(graph_def, out_dir, out_name, as_text=True)
def main(_):
    """Evaluate one checkpoint of FLAGS.model_name on a dataset split.

    Builds the evaluation graph (dataset provider -> preprocessing -> batching
    -> network), defines streaming 'Accuracy' and 'Recall_5' metrics, and runs
    slim.evaluation.evaluate_once over FLAGS.max_num_batches batches, or over
    one full pass of the dataset when that flag is unset.

    Args:
      _: unused positional argument.

    Raises:
      ValueError: if --dataset_dir was not supplied.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    with tf.Graph().as_default():
        tf_global_step = slim.get_or_create_global_step()

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

        ####################
        # Select the model #
        ####################
        # One [n_hash, vector_length] pair per entry of the two flags.
        # Indexing (not zip) keeps the failure loud if the vector-length flag
        # has fewer entries than the hashing-function flag.
        n_hash = FLAGS.number_hashing_functions
        L_vec = FLAGS.neuron_vector_length
        quant_params = [
            [int(n_hash[i]), int(L_vec[i])] for i in range(len(n_hash))
        ]

        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            quant_params=quant_params,
            is_training=False)

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            shuffle=False,
            common_queue_capacity=2 * FLAGS.batch_size,
            common_queue_min=FLAGS.batch_size)
        [image, label] = provider.get(['image', 'label'])
        label -= FLAGS.labels_offset

        #####################################
        # Select the preprocessing function #
        #####################################
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=False)

        eval_image_size = (
            FLAGS.eval_image_size or network_fn.default_image_size)
        image = image_preprocessing_fn(
            image, eval_image_size, eval_image_size)

        images, labels = tf.compat.v1.train.batch(
            [image, label],
            batch_size=FLAGS.batch_size,
            num_threads=FLAGS.num_preprocessing_threads,
            capacity=5 * FLAGS.batch_size)

        ####################
        # Define the model #
        ####################
        logits, _ = network_fn(images)

        if FLAGS.moving_average_decay:
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, tf_global_step)
            variables_to_restore = variable_averages.variables_to_restore(
                slim.get_model_variables())
            variables_to_restore[tf_global_step.op.name] = tf_global_step
        else:
            variables_to_restore = slim.get_variables_to_restore()

        predictions = tf.argmax(input=logits, axis=1)
        labels = tf.squeeze(labels)

        # Define the metrics. Every entry must be a (value_op, update_op)
        # pair. 'Recall_5' used to be the bare tuple (logits, labels, 5)
        # because the metric function name had been dropped.
        names_to_values, names_to_updates = aggregate_metric_map({
            'Accuracy': tf.compat.v1.metrics.accuracy(labels, predictions),
            # recall_at_k documents int64 labels, hence the cast.
            'Recall_5': tf.compat.v1.metrics.recall_at_k(
                tf.cast(labels, tf.int64), logits, 5),
        })

        # Print the summaries to screen.
        for name, value in names_to_values.items():
            summary_name = 'eval/%s' % name
            op = tf.compat.v1.summary.scalar(
                summary_name, value, collections=[])
            op = tf.compat.v1.Print(op, [value], summary_name)
            tf.compat.v1.add_to_collection(
                tf.compat.v1.GraphKeys.SUMMARIES, op)

        # TODO(sguada) use num_epochs=1
        if FLAGS.max_num_batches:
            num_batches = FLAGS.max_num_batches
        else:
            # This ensures that we make a single pass over all of the data.
            num_batches = math.ceil(
                dataset.num_samples / float(FLAGS.batch_size))

        # Accept either a checkpoint directory or a concrete checkpoint file.
        if tf.io.gfile.isdir(FLAGS.checkpoint_path):
            checkpoint_path = tf.train.latest_checkpoint(
                FLAGS.checkpoint_path)
        else:
            checkpoint_path = FLAGS.checkpoint_path

        tf.compat.v1.logging.info('Evaluating %s' % checkpoint_path)

        config = tf.compat.v1.ConfigProto()
        config.gpu_options.allow_growth = True

        slim.evaluation.evaluate_once(
            master=FLAGS.master,
            checkpoint_path=checkpoint_path,
            logdir=FLAGS.eval_dir,
            num_evals=num_batches,
            eval_op=list(names_to_updates.values()),
            session_config=config,
            variables_to_restore=variables_to_restore)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from datasets import dataset_factory
from deployment import model_deploy
from nets import nets_factory
from preprocessing import preprocessing_factory

slim = tf.contrib.slim

# Input-pipeline configuration for the 'tianchi' training split.
dataset_name = 'tianchi'
dataset_split_name = 'train'
dataset_dir = '/home/fangsh/tianchi/tianchi_dataset/tfrecord'
# Was misspelled `batcg_size`, which left `batch_size` undefined and made the
# provider construction below fail with NameError.
batch_size = 32

dataset = dataset_factory.get_dataset(
    dataset_name, dataset_split_name, dataset_dir)

# Queue capacities are sized relative to the batch size.
provider = slim.dataset_data_provider.DatasetDataProvider(
    dataset,
    num_readers=4,
    common_queue_capacity=20 * batch_size,
    common_queue_min=10 * batch_size)
[image, label] = provider.get(['image_raw', 'label'])
def main(_):
    """Build the SSD training graph and run slim.learning.train on it.

    Steps, in graph-construction order: deployment config and global step,
    dataset and SSD network/anchors, input pipeline (provider ->
    preprocessing -> ground-truth encoding -> batching -> prefetch queue),
    model clones with losses, summaries, optional moving averages, optimizer
    and gradient updates, and finally the training loop.

    Args:
      _: unused positional argument.

    Raises:
      ValueError: if --dataset_dir was not supplied.
    """
    # NOTE(review): block nesting below was reconstructed from a
    # whitespace-collapsed source -- confirm the extent of each `with` scope.
    if not FLAGS.dataset_dir:
        raise ValueError('You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.DEBUG)
    with tf.Graph().as_default():
        # Config model_deploy. Keep TF Slim Models structure.
        # Useful if want to need multiple GPUs and/or servers in the future.
        # Replica settings are fixed to a single, non-distributed worker.
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=0,
            num_replicas=1,
            num_ps_tasks=0)
        # Create global_step.
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        # Select the dataset.
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

        # Get the SSD network and its anchors.
        ssd_class = nets_factory.get_network(FLAGS.model_name)
        # Default network parameters with only num_classes overridden.
        ssd_params = ssd_class.default_params._replace(num_classes=FLAGS.num_classes)
        ssd_net = ssd_class(ssd_params)
        ssd_shape = ssd_net.params.img_shape
        ssd_anchors = ssd_net.anchors(ssd_shape)

        # Select the preprocessing function.
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        tf_utils.print_configuration(FLAGS.__flags, ssd_params,
                                     dataset.data_sources, FLAGS.train_dir)
        # =================================================================== #
        # Create a dataset provider and batches.
        # =================================================================== #
        with tf.device(deploy_config.inputs_device()):
            with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
                provider = slim.dataset_data_provider.DatasetDataProvider(
                    dataset,
                    num_readers=FLAGS.num_readers,
                    common_queue_capacity=20 * FLAGS.batch_size,
                    common_queue_min=10 * FLAGS.batch_size,
                    shuffle=True)
            # Get for SSD network: image, labels, bboxes.
            [image, shape, glabels, gbboxes] = provider.get(['image', 'shape',
                                                             'object/label',
                                                             'object/bbox'])
            # Pre-processing image, labels and bboxes.
            image, glabels, gbboxes = \
                image_preprocessing_fn(image, glabels, gbboxes,
                                       out_shape=ssd_shape,
                                       data_format=DATA_FORMAT)
            # Encode groundtruth labels and bboxes.
            gclasses, glocalisations, gscores = \
                ssd_net.bboxes_encode(glabels, gbboxes, ssd_anchors)
            # Layout used by tf_utils.reshape_list to regroup the flat tensor
            # list: one image followed by three groups of len(ssd_anchors).
            batch_shape = [1] + [len(ssd_anchors)] * 3

            # Training batches and queue.
            r = tf.train.batch(
                tf_utils.reshape_list([image, gclasses, glocalisations, gscores]),
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size)
            b_image, b_gclasses, b_glocalisations, b_gscores = \
                tf_utils.reshape_list(r, batch_shape)

            # Intermediate queueing: unique batch computation pipeline for all
            # GPUs running the training.
            batch_queue = slim.prefetch_queue.prefetch_queue(
                tf_utils.reshape_list([b_image, b_gclasses, b_glocalisations, b_gscores]),
                capacity=2 * deploy_config.num_clones)

        # =================================================================== #
        # Define the model running on every GPU.
        # =================================================================== #
        def clone_fn(batch_queue):
            """Allows data parallelism by creating multiple clones of network_fn."""
            # Dequeue batch.
            b_image, b_gclasses, b_glocalisations, b_gscores = \
                tf_utils.reshape_list(batch_queue.dequeue(), batch_shape)

            # Construct SSD network.
            arg_scope = ssd_net.arg_scope(weight_decay=FLAGS.weight_decay,
                                          data_format=DATA_FORMAT)
            with slim.arg_scope(arg_scope):
                predictions, localisations, logits, end_points = \
                    ssd_net.net(b_image, is_training=True)
            # Add loss function. The return value is not used here.
            ssd_net.losses(logits, localisations,
                           b_gclasses, b_glocalisations, b_gscores,
                           match_threshold=FLAGS.match_threshold,
                           negative_ratio=FLAGS.negative_ratio,
                           alpha=FLAGS.loss_alpha,
                           label_smoothing=FLAGS.label_smoothing)
            return end_points

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # =================================================================== #
        # Add summaries from first clone.
        # =================================================================== #
        clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)
        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by network_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

        # Add summaries for end_points.
        end_points = clones[0].outputs
        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.summary.histogram('activations/' + end_point, x))
            summaries.add(tf.summary.scalar('sparsity/' + end_point,
                                            tf.nn.zero_fraction(x)))
        # Add summaries for losses and extra losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
            summaries.add(tf.summary.scalar(loss.op.name, loss))
        for loss in tf.get_collection('EXTRA_LOSSES', first_clone_scope):
            summaries.add(tf.summary.scalar(loss.op.name, loss))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        # =================================================================== #
        # Configure the moving averages.
        # =================================================================== #
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        # =================================================================== #
        # Configure the optimization procedure.
        # =================================================================== #
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = tf_utils.configure_learning_rate(FLAGS,
                                                             dataset.num_samples,
                                                             global_step)
            optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        if FLAGS.moving_average_decay:
            # Update ops executed locally by trainer.
            update_ops.append(variable_averages.apply(moving_average_variables))

        # Variables to train.
        variables_to_train = tf_utils.get_variables_to_train(FLAGS)

        # and returns a train_tensor and summary_op
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones,
            optimizer,
            var_list=variables_to_train)
        # Add total_loss to summary.
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # Create gradient updates.
        grad_updates = optimizer.apply_gradients(clones_gradients,
                                                 global_step=global_step)
        update_ops.append(grad_updates)
        update_op = tf.group(*update_ops)
        # Evaluating train_tensor runs every update op and yields total_loss.
        train_tensor = control_flow_ops.with_dependencies([update_op], total_loss,
                                                          name='train_op')

        # Add the summaries from the first clone. These contain the summaries
        summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                           first_clone_scope))
        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        # =================================================================== #
        # Kicks off the training.
        # =================================================================== #
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
        config = tf.ConfigProto(log_device_placement=False,
                                gpu_options=gpu_options)
        saver = tf.train.Saver(max_to_keep=5,
                               keep_checkpoint_every_n_hours=1.0,
                               write_version=2,
                               pad_step_number=False)
        slim.learning.train(
            train_tensor,
            logdir=FLAGS.train_dir,
            master='',
            is_chief=True,
            init_fn=tf_utils.get_init_fn(FLAGS),
            summary_op=summary_op,
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            saver=saver,
            save_interval_secs=FLAGS.save_interval_secs,
            session_config=config,
            sync_optimizer=None)
def main():
    """Dump per-class softmax scores of a restored model to a CSV file.

    Overrides several FLAGS with hard-coded paths/names, builds the evaluation
    input pipeline and network, restores the checkpoint, runs a fixed number
    of batches and writes one CSV row per (sample, class) to
    'out_b_19_face.csv': [label, 1-based class index, score with 8 decimals].

    Raises:
      ValueError: if FLAGS.dataset_dir is empty.
    """
    MODEL_NAME = 'inception_resnet_v2'
    # Where the training (fine-tuned) checkpoint and logs will be saved to.
    TRAIN_DIR = 'D:/pig_recognize/pig_slim1/flowers-models/inception_resnet_v2/all'
    # Where the dataset is saved to.
    DATASET_DIR = 'D:/pig_recognize/pig_slim1/cifar10'
    # Fixed amount of work: number of batches evaluated and number of leading
    # class scores exported per sample.
    NUM_EVAL_BATCHES = 60
    NUM_REPORTED_CLASSES = 30

    FLAGS.checkpoint_path = TRAIN_DIR
    FLAGS.eval_dir = TRAIN_DIR
    FLAGS.dataset_name = 'cifar10'
    FLAGS.dataset_split_name = 'test'
    FLAGS.dataset_dir = DATASET_DIR
    FLAGS.model_name = MODEL_NAME

    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        tf_global_step = slim.get_or_create_global_step()

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

        ####################
        # Select the model #
        ####################
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            is_training=False)

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            shuffle=False,
            common_queue_capacity=2 * FLAGS.batch_size,
            common_queue_min=FLAGS.batch_size)
        [image, label] = provider.get(['image', 'label'])
        label -= FLAGS.labels_offset

        #####################################
        # Select the preprocessing function #
        #####################################
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=False)

        eval_image_size = (
            FLAGS.eval_image_size or network_fn.default_image_size)
        image1 = image_preprocessing_fn(
            image, eval_image_size, eval_image_size)

        images, labels = tf.train.batch(
            [image1, label],
            batch_size=FLAGS.batch_size,
            num_threads=FLAGS.num_preprocessing_threads,
            capacity=5 * FLAGS.batch_size)

        ####################
        # Define the model #
        ####################
        logits, _ = network_fn(images)
        soft_result = tf.nn.softmax(logits)

        if FLAGS.moving_average_decay:
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, tf_global_step)
            variables_to_restore = variable_averages.variables_to_restore(
                slim.get_model_variables())
            variables_to_restore[tf_global_step.op.name] = tf_global_step
        else:
            variables_to_restore = slim.get_variables_to_restore()

        labels = tf.squeeze(labels)

        # Accept either a checkpoint directory or a concrete checkpoint file.
        if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
            checkpoint_path = tf.train.latest_checkpoint(
                FLAGS.checkpoint_path)
        else:
            checkpoint_path = FLAGS.checkpoint_path

        tf.logging.info('Evaluating %s' % checkpoint_path)

        restorer = tf.train.Saver(variables_to_restore)
        with tf.Session() as sess:
            restorer.restore(sess, checkpoint_path)
            print('Model restored.')

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            tag = []
            try:
                for i in range(NUM_EVAL_BATCHES):
                    soft_result_, labels_ = sess.run([soft_result, labels])
                    for m in range(FLAGS.batch_size):
                        for n in range(NUM_REPORTED_CLASSES):
                            tag.append([
                                labels_[m],
                                str(n + 1),
                                str('%.8f' % (soft_result_[m, n])),
                            ])
                    print(i)

                with open('out_b_19_face.csv', 'w', newline='') as csvfile:
                    csv.writer(csvfile).writerows(tag)
            finally:
                # Always stop and join the queue-runner threads so a failed
                # run does not leave them alive.
                coord.request_stop()
                coord.join(threads)
def main(_):
    """Build the training graph for FLAGS.model_name and run slim training.

    Steps, in graph-construction order: GPU visibility and deployment config,
    global step, dataset and network selection, input pipeline, model clones
    with softmax cross-entropy losses, summaries, optional moving averages,
    optimizer (optionally SyncReplicasOptimizer), optional gradient clipping,
    gradient application (direct, or accumulated over FLAGS.iter_size steps),
    and finally slim.learning.train.

    Args:
      _: unused positional argument.

    Raises:
      ValueError: if --dataset_dir was not supplied.
    """
    # NOTE(review): block nesting below was reconstructed from a
    # whitespace-collapsed source -- confirm the extent of each `with` scope.
    if not FLAGS.dataset_dir:
        raise ValueError('You must supply the dataset directory with --dataset_dir')

    # Restrict visible GPUs; -1 means "one clone per listed GPU".
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpus
    if FLAGS.num_clones == -1:
        FLAGS.num_clones = len(FLAGS.gpus.split(','))

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        # tf.set_random_seed(42)
        tf.set_random_seed(0)
        ######################
        # Config model_deploy#
        ######################
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=FLAGS.task,
            num_replicas=FLAGS.worker_replicas,
            num_ps_tasks=FLAGS.num_ps_tasks)

        # Create global_step
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        ######################
        # Select the dataset #
        ######################
        # --dataset_dir may hold several comma-separated directories.
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name,
            FLAGS.dataset_dir.split(','),
            dataset_list_dir=FLAGS.dataset_list_dir,
            num_samples=FLAGS.frames_per_video,
            modality=FLAGS.modality,
            split_id=FLAGS.split_id)

        ######################
        # Select the network #
        ######################
        # The dropout flags are drop rates; the network takes keep probabilities.
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            batch_size=FLAGS.batch_size,
            weight_decay=FLAGS.weight_decay,
            is_training=True,
            dropout_keep_prob=(1.0-FLAGS.dropout),
            pooled_dropout_keep_prob=(1.0-FLAGS.pooled_dropout),
            batch_norm=FLAGS.netvlad_batch_norm)

        #####################################
        # Select the preprocessing function #
        #####################################
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name,
            is_training=True)  # in case of pooling images,
                               # now preprocessing is done video-level

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        with tf.device(deploy_config.inputs_device()):
            provider = dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=FLAGS.num_readers,
                common_queue_capacity=20 * FLAGS.batch_size,
                common_queue_min=10 * FLAGS.batch_size,
                bgr_flips=FLAGS.bgr_flip)
            [image, label] = provider.get(['image', 'label'])
            # now note that the above image might be a 23 channel image if you have
            # both RGB and flow streams. It will need to split later, but all the
            # preprocessing will be done consistently for all frames over all streams
            # Labels arrive as strings; convert to a scalar int32 tensor.
            label = tf.string_to_number(label, tf.int32)
            label.set_shape(())
            label -= FLAGS.labels_offset

            train_image_size = FLAGS.train_image_size or network_fn.default_image_size

            # NOTE(review): the trailing comma makes `scale_ratios` a 1-tuple
            # wrapping the list, not the list itself -- confirm that the
            # preprocessing function expects this shape.
            scale_ratios=[float(el) for el in FLAGS.scale_ratios.split(',')],
            image = image_preprocessing_fn(image, train_image_size,
                                           train_image_size,
                                           scale_ratios=scale_ratios,
                                           out_dim_scale=FLAGS.out_dim_scale,
                                           model_name=FLAGS.model_name)

            images, labels = tf.train.batch(
                [image, label],
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size)
            if FLAGS.debug:
                images = tf.Print(images, [labels], 'Read batch')
            labels = slim.one_hot_encoding(
                labels, dataset.num_classes - FLAGS.labels_offset)
            batch_queue = slim.prefetch_queue.prefetch_queue(
                [images, labels], capacity=2 * deploy_config.num_clones)
            summarize_images(images, provider.num_channels_stream)

        ####################
        # Define the model #
        ####################
        # Optional network arguments, forwarded only when the flag is set.
        kwargs = {}
        if FLAGS.conv_endpoint is not None:
            kwargs['conv_endpoint'] = FLAGS.conv_endpoint

        def clone_fn(batch_queue):
            """Allows data parallelism by creating multiple clones of network_fn."""
            images, labels = batch_queue.dequeue()
            logits, end_points = network_fn(
                images, pool_type=FLAGS.pooling,
                classifier_type=FLAGS.classifier_type,
                num_channels_stream=provider.num_channels_stream,
                netvlad_centers=FLAGS.netvlad_initCenters.split(','),
                stream_pool_type=FLAGS.stream_pool_type,
                **kwargs)

            #############################
            # Specify the loss function #
            #############################
            # The auxiliary head, when present, adds a loss with weight 0.4.
            if 'AuxLogits' in end_points:
                slim.losses.softmax_cross_entropy(
                    end_points['AuxLogits'], labels,
                    label_smoothing=FLAGS.label_smoothing, weight=0.4, scope='aux_loss')
            slim.losses.softmax_cross_entropy(
                logits, labels, label_smoothing=FLAGS.label_smoothing, weight=1.0)
            return end_points

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)
        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by network_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

        # Add summaries for end_points.
        # end_points_debug is a module-level copy of the first clone's
        # end points plus the input batch.
        global end_points_debug
        end_points = clones[0].outputs
        end_points_debug = dict(end_points)
        end_points_debug['images'] = images
        end_points_debug['labels'] = labels
        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.histogram_summary('activations/' + end_point, x))
            summaries.add(tf.scalar_summary('sparsity/' + end_point,
                                            tf.nn.zero_fraction(x)))

        # Add summaries for losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
            summaries.add(tf.scalar_summary('losses/%s' % loss.op.name, loss))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.histogram_summary(variable.op.name, variable))

        #################################
        # Configure the moving averages #
        #################################
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        #########################################
        # Configure the optimization procedure. #
        #########################################
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = _configure_learning_rate(dataset.num_samples, global_step)
            optimizer = _configure_optimizer(learning_rate)
            summaries.add(tf.scalar_summary('learning_rate', learning_rate,
                                            name='learning_rate'))

        if FLAGS.sync_replicas:
            # If sync_replicas is enabled, the averaging will be done in the chief
            # queue runner.
            optimizer = tf.train.SyncReplicasOptimizer(
                opt=optimizer,
                replicas_to_aggregate=FLAGS.replicas_to_aggregate,
                variable_averages=variable_averages,
                variables_to_average=moving_average_variables,
                replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
                total_num_replicas=FLAGS.worker_replicas)
        elif FLAGS.moving_average_decay:
            # Update ops executed locally by trainer.
            update_ops.append(variable_averages.apply(moving_average_variables))

        # Variables to train.
        variables_to_train = _get_variables_to_train()
        logging.info('Training the following variables: %s' % (
            ' '.join([el.name for el in variables_to_train])))

        # and returns a train_tensor and summary_op
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones,
            optimizer,
            var_list=variables_to_train)

        # clip the gradients if needed
        if FLAGS.clip_gradients > 0:
            logging.info('Clipping gradients by %f' % FLAGS.clip_gradients)
            with tf.name_scope('clip_gradients'):
                clones_gradients = slim.learning.clip_gradient_norms(
                    clones_gradients,
                    FLAGS.clip_gradients)

        # Add total_loss to summary.
        summaries.add(tf.scalar_summary('total_loss', total_loss,
                                        name='total_loss'))

        # Create gradient updates.
        # train_ops becomes a single tensor when iter_size == 1, otherwise a
        # dict mapping the sub-iteration index to the op to run.
        train_ops = {}
        if FLAGS.iter_size == 1:
            grad_updates = optimizer.apply_gradients(clones_gradients,
                                                     global_step=global_step)
            update_ops.append(grad_updates)

            update_op = tf.group(*update_ops)
            train_tensor = control_flow_ops.with_dependencies([update_op], total_loss,
                                                              name='train_op')
            train_ops = train_tensor
        else:
            # Gradient accumulation: keep a local "ref_grad" variable per
            # trained variable to hold the running gradient sum.
            gvs = [(grad, var) for grad, var in clones_gradients]
            varnames = [var.name for grad, var in gvs]
            varname_to_var = {var.name: var for grad, var in gvs}
            varname_to_grad = {var.name: grad for grad, var in gvs}
            varname_to_ref_grad = {}
            for vn in varnames:
                grad = varname_to_grad[vn]
                print("accumulating ... ", (vn, grad.get_shape()))
                with tf.variable_scope("ref_grad"):
                    with tf.device(deploy_config.variables_device()):
                        # vn[:-2] drops the last two characters of the
                        # variable name (presumably the ':0' suffix -- TODO confirm).
                        ref_var = slim.local_variable(
                            np.zeros(grad.get_shape(),dtype=np.float32),
                            name=vn[:-2])
                        varname_to_ref_grad[vn] = ref_var

            # assign: overwrite the accumulator; assign_add: add to it.
            all_assign_ref_op = [ref.assign(varname_to_grad[vn]) for vn, ref in varname_to_ref_grad.items()]
            all_assign_add_ref_op = [ref.assign_add(varname_to_grad[vn]) for vn, ref in varname_to_ref_grad.items()]
            assign_gradients_ref_op = tf.group(*all_assign_ref_op)
            accmulate_gradients_op = tf.group(*all_assign_add_ref_op)
            with tf.control_dependencies([accmulate_gradients_op]):
                # Apply the accumulated sum divided by iter_size.
                final_gvs = [(varname_to_ref_grad[var.name] / float(FLAGS.iter_size), var) for grad, var in gvs]
                apply_gradients_op = optimizer.apply_gradients(final_gvs, global_step=global_step)
                update_ops.append(apply_gradients_op)
                update_op = tf.group(*update_ops)
                train_tensor = control_flow_ops.with_dependencies([update_op],
                                                                  total_loss,
                                                                  name='train_op')
            for i in range(FLAGS.iter_size):
                if i == 0:
                    train_ops[i] = assign_gradients_ref_op
                elif i < FLAGS.iter_size - 1:  # because apply_gradients also computes
                                               # (see control_dependency), so
                                               # no need of running an extra iteration
                    train_ops[i] = accmulate_gradients_op
                else:
                    train_ops[i] = train_tensor

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or _gather_clone_loss().
        summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                           first_clone_scope))

        # Merge all summaries together.
        summary_op = tf.merge_summary(list(summaries), name='summary_op')

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.intra_op_parallelism_threads = FLAGS.cpu_threads
        # config.allow_soft_placement = True
        # config.gpu_options.per_process_gpu_memory_fraction=0.7

        ###########################
        # Kicks off the training. #
        ###########################
        logging.info('RUNNING ON SPLIT %d' % FLAGS.split_id)
        slim.learning.train(
            train_ops,
            train_step_fn=train_step,
            logdir=FLAGS.train_dir,
            master=FLAGS.master,
            is_chief=(FLAGS.task == 0),
            init_fn=_get_init_fn(),
            summary_op=summary_op,
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            save_interval_secs=FLAGS.save_interval_secs,
            sync_optimizer=optimizer if FLAGS.sync_replicas else None,
            session_config=config)
def main(_):
    """Train a TF-Slim classifier with pruning-aware gradients and saver.

    Builds the dataset provider, the cloned network and its losses, passes the
    clone gradients through ``apply_pruning_to_grad`` before they are applied,
    and runs ``slim.learning.train`` with a saver whose ``restore`` calls
    ``apply_pruning_to_var`` right after the checkpoint is loaded.

    Args:
        _: Unused.

    Raises:
        ValueError: If ``--dataset_dir`` is empty.
    """
    # VGG is allowed 90% of the GPU memory, every other model 30%.  The limit
    # is handed to slim.learning.train via session_config instead of opening a
    # throw-away tf.Session that was never used or closed.
    gpu_memory_fraction = 0.9 if FLAGS.model_name == "vgg" else 0.3
    session_config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_memory_fraction))

    print("FLAGS.model_name:", FLAGS.model_name)
    print("FLAGS.max_number_of_steps:", FLAGS.max_number_of_steps)
    print("FLAGS.learning_rate:", FLAGS.learning_rate)
    print("FLAGS.weight_decay:", FLAGS.weight_decay)
    print("FLAGS.batch_size:", FLAGS.batch_size)
    print("FLAGS.trainable_scopes:", FLAGS.trainable_scopes)
    print("FLAGS.pruning_rates:", FLAGS.pruning_rates)
    print("FLAGS.train_dir:", FLAGS.train_dir)
    print("FLAGS.checkpoint_path:", FLAGS.checkpoint_path)
    print("FLAGS.pruning_gradient_update_ratio:",
          FLAGS.pruning_gradient_update_ratio)

    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        #######################
        # Config model_deploy #
        #######################
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=FLAGS.task,
            num_replicas=FLAGS.worker_replicas,
            num_ps_tasks=FLAGS.num_ps_tasks)
        print("deploy_config.variables_device():")
        print(deploy_config.variables_device())

        # Create global_step
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)
        num_classes = dataset.num_classes - FLAGS.labels_offset

        ######################
        # Select the network #
        ######################
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=num_classes,
            weight_decay=FLAGS.weight_decay,
            is_training=True)

        #####################################
        # Select the preprocessing function #
        #####################################
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        with tf.device(deploy_config.inputs_device()):
            provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=FLAGS.num_readers,
                common_queue_capacity=20 * FLAGS.batch_size,
                common_queue_min=10 * FLAGS.batch_size)
            [image, label] = provider.get(['image', 'label'])
            label -= FLAGS.labels_offset

            train_image_size = (FLAGS.train_image_size or
                                network_fn.default_image_size)
            image = image_preprocessing_fn(image, train_image_size,
                                           train_image_size)

            images, labels = tf.train.batch(
                [image, label],
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size)
            labels = slim.one_hot_encoding(labels, num_classes)
            batch_queue = slim.prefetch_queue.prefetch_queue(
                [images, labels], capacity=2 * deploy_config.num_clones)

        ####################
        # Define the model #
        ####################
        def clone_fn(batch_queue):
            """Build one clone: dequeue a batch, run network_fn, add losses."""
            with tf.device(deploy_config.inputs_device()):
                images, labels = batch_queue.dequeue()
            logits, end_points = network_fn(images)

            #############################
            # Specify the loss function #
            #############################
            if 'AuxLogits' in end_points:
                tf.losses.softmax_cross_entropy(
                    logits=end_points['AuxLogits'],
                    onehot_labels=labels,
                    label_smoothing=FLAGS.label_smoothing,
                    weights=0.4,
                    scope='aux_loss')
            tf.losses.softmax_cross_entropy(
                logits=logits,
                onehot_labels=labels,
                label_smoothing=FLAGS.label_smoothing,
                weights=1.0)
            return end_points

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        clones = model_deploy.create_clones(deploy_config, clone_fn,
                                            [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)
        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by network_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       first_clone_scope)

        # Add summaries for end_points.
        end_points = clones[0].outputs
        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.summary.histogram('activations/' + end_point, x))
            summaries.add(
                tf.summary.scalar('sparsity/' + end_point,
                                  tf.nn.zero_fraction(x)))

        # Add summaries for losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
            summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

        # Add summaries for variables, plus the fraction of non-zero entries
        # of each variable (logged under 'pruning_rate/').
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))
            summaries.add(
                tf.summary.scalar('pruning_rate/' + variable.op.name,
                                  1 - tf.nn.zero_fraction(variable)))

        #################################
        # Configure the moving averages #
        #################################
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        #########################################
        # Configure the optimization procedure. #
        #########################################
        print("deploy_config.optimizer_device():")
        print(deploy_config.optimizer_device())
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = _configure_learning_rate(dataset.num_samples,
                                                     global_step)
            optimizer = _configure_optimizer(learning_rate)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        if FLAGS.sync_replicas:
            # If sync_replicas is enabled, the averaging will be done in the
            # chief queue runner.
            optimizer = tf.train.SyncReplicasOptimizer(
                opt=optimizer,
                replicas_to_aggregate=FLAGS.replicas_to_aggregate,
                variable_averages=variable_averages,
                variables_to_average=moving_average_variables,
                replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
                total_num_replicas=FLAGS.worker_replicas)
        elif FLAGS.moving_average_decay:
            # Update ops executed locally by trainer.
            update_ops.append(
                variable_averages.apply(moving_average_variables))

        # Variables to train.
        variables_to_train = _get_variables_to_train()

        # Total loss and per-variable gradients summed over all clones.
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones, optimizer, var_list=variables_to_train)

        # Print a table of the model variables.
        slim.model_analyzer.analyze_vars(tf.model_variables(),
                                         print_info=True)

        # Pass the gradients through the pruning mask before applying them.
        variables_to_pruning = get_variables_to_pruning()
        pruning_mask = get_pruning_mask(variables_to_pruning)
        clones_gradients = apply_pruning_to_grad(clones_gradients,
                                                 pruning_mask)

        # Add total_loss to summary.
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # Create gradient updates.
        grad_updates = optimizer.apply_gradients(clones_gradients,
                                                 global_step=global_step)
        update_ops.append(grad_updates)

        update_op = tf.group(*update_ops)
        with tf.control_dependencies([update_op]):
            train_tensor = tf.identity(total_loss, name='train_op')

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or
        # _gather_clone_loss().
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        #######################
        # Config mySaver      #
        #######################
        class mySaver(tf.train.Saver):
            """Saver that calls apply_pruning_to_var after each restore."""

            def restore(self, sess, save_path):
                tf.train.Saver.restore(self, sess, save_path)
                apply_pruning_to_var(get_variables_to_pruning(), sess)

        saver = mySaver(max_to_keep=2)

        ###########################
        # Kicks off the training. #
        ###########################
        slim.learning.train(
            train_tensor,
            logdir=FLAGS.train_dir,
            master=FLAGS.master,
            is_chief=(FLAGS.task == 0),
            init_fn=_get_init_fn(),
            summary_op=summary_op,
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            saver=saver,
            save_interval_secs=FLAGS.save_interval_secs,
            sync_optimizer=optimizer if FLAGS.sync_replicas else None,
            session_config=session_config)
def eval_model(candidate, N, F):
    """Evaluate checkpoints from ``FLAGS.train_dir`` on the 'test' split.

    Builds the evaluation graph (dataset provider, preprocessing, network,
    streaming accuracy and recall@5) and hands it to
    ``slim.evaluation.evaluation_loop``.

    Side effect: overwrites ``FLAGS.batch_size`` with 100.

    Args:
        candidate: Forwarded positionally to ``nets_factory.get_network_fn``.
        N: Forwarded positionally to ``nets_factory.get_network_fn``.
        F: Forwarded positionally to ``nets_factory.get_network_fn``.
    """
    print("eval model")
    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        tf_global_step = slim.get_or_create_global_step()

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name, 'test',
                                              FLAGS.dataset_dir)

        ####################
        # Select the model #
        ####################
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            candidate,
            N,
            F,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            is_training=False)

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            shuffle=False,
            common_queue_capacity=2 * FLAGS.batch_size,
            common_queue_min=FLAGS.batch_size)
        [image, label] = provider.get(['image', 'label'])
        label -= FLAGS.labels_offset

        #####################################
        # Select the preprocessing function #
        #####################################
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=False)

        eval_image_size = network_fn.default_image_size
        image = image_preprocessing_fn(image, eval_image_size,
                                       eval_image_size)

        # NOTE(review): this overrides the global flag for the rest of the
        # process, and only after the provider queue above was sized with the
        # previous value. Kept as-is because other code may rely on it.
        FLAGS.batch_size = 100
        images, labels = tf.train.batch(
            [image, label],
            batch_size=FLAGS.batch_size,
            num_threads=FLAGS.num_preprocessing_threads,
            capacity=5 * FLAGS.batch_size)

        ####################
        # Define the model #
        ####################
        logits, _ = network_fn(images)

        if FLAGS.moving_average_decay:
            # Restore the moving-average shadow values into the model
            # variables, plus the global step itself.
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, tf_global_step)
            variables_to_restore = variable_averages.variables_to_restore(
                slim.get_model_variables())
            variables_to_restore[tf_global_step.op.name] = tf_global_step
        else:
            variables_to_restore = slim.get_variables_to_restore()

        predictions = tf.argmax(logits, 1)
        labels = tf.squeeze(labels)

        # Define the metrics:
        names_to_values, names_to_updates = (
            slim.metrics.aggregate_metric_map({
                'Accuracy':
                    slim.metrics.streaming_accuracy(predictions, labels),
                'Recall_5':
                    slim.metrics.streaming_recall_at_k(logits, labels, 5),
            }))

        # Print the summaries to screen.
        for name, value in names_to_values.items():
            summary_name = 'eval/%s' % name
            op = tf.summary.scalar(summary_name, value, collections=[])
            op = tf.Print(op, [value], summary_name)
            tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

        # TODO(sguada) use num_epochs=1
        if FLAGS.max_num_batches:
            num_batches = FLAGS.max_num_batches
        else:
            # This ensures that we make a single pass over all of the data.
            # int(): math.ceil returns a float on Python 2, and num_evals is
            # a count.
            num_batches = int(
                math.ceil(dataset.num_samples / float(FLAGS.batch_size)))

        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True

        slim.evaluation.evaluation_loop(
            master=FLAGS.master,
            checkpoint_dir=FLAGS.train_dir,
            logdir=FLAGS.eval_dir,
            num_evals=num_batches,
            eval_op=list(names_to_updates.values()),
            variables_to_restore=variables_to_restore,
            session_config=config)
def main(_):
    """Train either the CRNN model or a regular TF-Slim classifier.

    When ``FLAGS.model_name == "crnn"`` the graph is built once by calling
    ``clone_fn`` directly and training is delegated to
    ``network_fn.train_crnn``.  Otherwise the usual model_deploy pipeline
    (clones, summaries, optimizer, ``slim.learning.train``) is used.

    Args:
        _: Unused.

    Raises:
        ValueError: If ``--dataset_dir`` is empty.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    is_crnn = FLAGS.model_name == "crnn"

    tf.logging.set_verbosity(tf.logging.DEBUG)
    with tf.Graph().as_default():
        # =============================================================== #
        # Config model_deploy.
        # Keep TF Slim Models structure.
        # Useful if multiple GPUs and/or servers are needed in the future.
        # =============================================================== #
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=0,
            num_replicas=1,
            num_ps_tasks=0)

        # Create global_step.
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        # =============================================================== #
        # Select the dataset.
        # =============================================================== #
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)
        num_classes = dataset.num_classes - FLAGS.labels_offset

        # =============================================================== #
        # Select the network.
        # =============================================================== #
        if is_crnn:
            crnn_net = nets_factory.get_network(FLAGS.model_name)
            network_fn = crnn_net(phase='Train', num_classes=num_classes)
        else:
            network_fn = nets_factory.get_network_fn(
                FLAGS.model_name,
                num_classes=num_classes,
                weight_decay=FLAGS.weight_decay,
                is_training=True)

        # =============================================================== #
        # Select the preprocessing function.
        # =============================================================== #
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        # NOTE(review): the returned function is not applied below; the image
        # is only resized and rescaled. Confirm whether that is intended.
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        # =============================================================== #
        # Create a dataset provider and batches.
        # =============================================================== #
        with tf.device(deploy_config.inputs_device()):
            if FLAGS.dataset_name == "ocr":
                image, label = tf_utils.read_features(
                    ops.join(FLAGS.dataset_dir, "ocr_train_000.tfrecord"),
                    num_epochs=None)
            else:
                with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
                    provider = slim.dataset_data_provider.DatasetDataProvider(
                        dataset,
                        num_readers=FLAGS.num_readers,
                        common_queue_capacity=20 * FLAGS.batch_size,
                        common_queue_min=10 * FLAGS.batch_size,
                        shuffle=True)
                    [image, label] = provider.get(['image', 'label'])

            # Resize the image to the specified height and width, then apply
            # (x - 0.5) * 2.
            image = tf.expand_dims(image, 0)
            image = tf.image.resize_bilinear(image, [IMAGE_H, IMAGE_W],
                                             align_corners=False)
            image = tf.squeeze(image, [0])
            image = tf.subtract(image, 0.5)
            image = tf.multiply(image, 2.0)

            images, labels = tf.train.shuffle_batch(
                tensors=[image, label],
                batch_size=32,
                capacity=1000 + 2 * 32,
                min_after_dequeue=100,
                num_threads=1)
            images = tf.cast(x=images, dtype=tf.float32)

            if not is_crnn:
                labels = slim.one_hot_encoding(labels, num_classes)

        # =============================================================== #
        # Define the model running on every GPU.
        # =============================================================== #
        def clone_fn(images, labels):
            """Build the network and its loss for one clone.

            Returns:
                (end_points, ctc_loss, sequence_dist, labels, decoded).  The
                three CTC-related entries are None for non-CRNN models.
            """
            # Only the CRNN path produces these; giving them a value up front
            # keeps the shared return statement valid for classifiers too.
            ctc_loss = sequence_dist = decoded = None

            if is_crnn:
                with tf.variable_scope('crnn'):
                    logits, end_points = network_fn.build_CRNNnet(images)

                if FLAGS.dataset_name == "mnist":
                    # Build a SparseTensor from the non-zero label entries.
                    idx = tf.where(tf.not_equal(labels, 0))
                    labels = tf.SparseTensor(idx, tf.gather_nd(labels, idx),
                                             labels.get_shape())
                labels = tf.cast(labels, tf.int32)

                ctc_loss = tf.nn.ctc_loss(
                    labels=labels,
                    inputs=logits,
                    sequence_length=SEQ_LENGTH,
                    ctc_merge_repeated=True,
                    ignore_longer_outputs_than_inputs=True,
                    time_major=True)
                ctc_loss = tf.reduce_mean(ctc_loss)
                ctc_loss = tf.Print(ctc_loss, [ctc_loss],
                                    message='* Loss : ')
                tf.losses.add_loss(ctc_loss)

                decoded, _ = tf.nn.ctc_beam_search_decoder(
                    logits, sequence_length=SEQ_LENGTH, merge_repeated=False)
                sequence_dist = tf.reduce_mean(
                    tf.edit_distance(tf.cast(decoded[0], tf.int32), labels))
            else:
                # Regular slim classifiers are plain callables; they have no
                # build_CRNNnet method.
                logits, end_points = network_fn(images)
                if 'AuxLogits' in end_points:
                    slim.losses.softmax_cross_entropy(
                        end_points['AuxLogits'],
                        labels,
                        label_smoothing=FLAGS.label_smoothing,
                        weights=0.4,
                        scope='aux_loss')
                slim.losses.softmax_cross_entropy(
                    logits,
                    labels,
                    label_smoothing=FLAGS.label_smoothing,
                    weights=1.0)

            return end_points, ctc_loss, sequence_dist, labels, decoded

        if is_crnn:
            end_points, ctc_loss, sequence_dist, labels, decoded = clone_fn(
                images, labels)
            network_fn.train_crnn(FLAGS, global_step, ctc_loss, sequence_dist,
                                  labels, decoded)
        else:
            # =========================================================== #
            # Add summaries from first clone.
            # =========================================================== #
            # Gather initial summaries.
            summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

            clones = model_deploy.create_clones(deploy_config, clone_fn,
                                                [images, labels])
            first_clone_scope = deploy_config.clone_scope(0)
            # Gather update_ops from the first clone. These contain, for
            # example, the updates for the batch_norm variables created by
            # network_fn.
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                           first_clone_scope)

            # Add summaries for end_points.
            end_points, ctc_loss, sequence_dist, labels, decoded = (
                clones[0].outputs)
            for end_point in end_points:
                x = end_points[end_point]
                summaries.add(
                    tf.summary.histogram('activations/' + end_point, x))
                summaries.add(
                    tf.summary.scalar('sparsity/' + end_point,
                                      tf.nn.zero_fraction(x)))

            # Add summaries for losses.
            for loss in tf.get_collection(tf.GraphKeys.LOSSES,
                                          first_clone_scope):
                summaries.add(
                    tf.summary.scalar('losses/%s' % loss.op.name, loss))
            # The CTC tensors are None when clone_fn built a classifier.
            if ctc_loss is not None:
                summaries.add(
                    tf.summary.scalar('losses/ctc_loss', tensor=ctc_loss))
            if sequence_dist is not None:
                summaries.add(
                    tf.summary.scalar('Seq_Dist', tensor=sequence_dist))

            # Add summaries for variables.
            for variable in slim.get_model_variables():
                summaries.add(
                    tf.summary.histogram(variable.op.name, variable))

            # =========================================================== #
            # Configure the moving averages.
            # =========================================================== #
            if FLAGS.moving_average_decay:
                moving_average_variables = slim.get_model_variables()
                variable_averages = tf.train.ExponentialMovingAverage(
                    FLAGS.moving_average_decay, global_step)
            else:
                moving_average_variables, variable_averages = None, None

            # =========================================================== #
            # Configure the optimization procedure.
            # =========================================================== #
            with tf.device(deploy_config.optimizer_device()):
                learning_rate = tf_utils.configure_learning_rate(
                    FLAGS, dataset.num_samples, global_step)
                optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate)
                summaries.add(
                    tf.summary.scalar('learning_rate', tensor=learning_rate))

            if FLAGS.sync_replicas:
                # If sync_replicas is enabled, the averaging will be done in
                # the chief queue runner.
                optimizer = tf.train.SyncReplicasOptimizer(
                    opt=optimizer,
                    replicas_to_aggregate=FLAGS.replicas_to_aggregate,
                    total_num_replicas=FLAGS.worker_replicas,
                    variable_averages=variable_averages,
                    variables_to_average=moving_average_variables)
            elif FLAGS.moving_average_decay:
                # Update ops executed locally by trainer.
                update_ops.append(
                    variable_averages.apply(moving_average_variables))

            # Variables to train.
            variables_to_train = tf_utils.get_variables_to_train(FLAGS)

            # Total loss and per-variable gradients summed over all clones.
            total_loss, clones_gradients = model_deploy.optimize_clones(
                clones, optimizer, var_list=variables_to_train)
            # Add total_loss to summary.
            summaries.add(tf.summary.scalar('total_loss', total_loss))

            # Create gradient updates.
            grad_updates = optimizer.apply_gradients(clones_gradients,
                                                     global_step=global_step)
            update_ops.append(grad_updates)

            # This is the train op that is actually run.  It used to be
            # overwritten by slim.learning.create_train_op(total_loss,
            # optimizer), which recomputed gradients on its own and thereby
            # skipped update_ops (moving averages) and variables_to_train.
            update_op = tf.group(*update_ops)
            with tf.control_dependencies([update_op]):
                train_tensor = tf.identity(total_loss, name='train_op')

            # Merge all summaries together.
            summary_op = tf.summary.merge(list(summaries), name='summary_op')

            # =========================================================== #
            # Configure the saver procedure.
            # =========================================================== #
            saver = tf.train.Saver(max_to_keep=5,
                                   keep_checkpoint_every_n_hours=1.0,
                                   write_version=2,
                                   pad_step_number=False)
            model_save_dir = './checkpoints/' + FLAGS.model_name
            if not ops.exists(model_save_dir):
                os.makedirs(model_save_dir)

            # =========================================================== #
            # Kicks off the training.
            # =========================================================== #
            gpu_options = tf.GPUOptions(
                per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
            config = tf.ConfigProto(log_device_placement=False,
                                    gpu_options=gpu_options)
            slim.learning.train(
                train_tensor,
                logdir=FLAGS.train_dir,
                master='',
                is_chief=True,
                init_fn=tf_utils.get_init_fn(FLAGS),
                summary_op=summary_op,
                number_of_steps=FLAGS.max_number_of_steps,
                log_every_n_steps=FLAGS.log_every_n_steps,
                save_summaries_secs=FLAGS.save_summaries_secs,
                saver=saver,
                save_interval_secs=FLAGS.save_interval_secs,
                session_config=config,
                sync_optimizer=None)
def main(_):
    """Train the segmentation network with the TF-Slim model_deploy pipeline.

    Reads 'image', 'label' and 'gt_boxes' from the "train" split, runs them
    through ``_preprocessing.preprocess_image``, builds the cloned network
    with a sparse softmax cross-entropy loss and calls
    ``slim.learning.train``.

    Args:
        _: Unused.

    Raises:
        ValueError: If ``--dataset_dir`` is empty.
        Exception: Whatever the data provider / preprocessing raises; it is
            printed and re-raised.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        network_fn = get_network_fn(num_classes=FLAGS.num_classes,
                                    is_training=True)

        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=FLAGS.task,
            num_replicas=FLAGS.worker_replicas,
            num_ps_tasks=FLAGS.num_ps_tasks)

        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        train_set = dataset_factory.get_dataset(FLAGS.dataset_name, "train",
                                                FLAGS.dataset_dir)

        with tf.device(deploy_config.inputs_device()):
            train_provider = slim.dataset_data_provider.DatasetDataProvider(
                train_set,
                num_readers=FLAGS.num_readers,
                common_queue_capacity=20 * FLAGS.batch_size,
                common_queue_min=10 * FLAGS.batch_size)
            try:
                [train_image, train_label, train_boxes] = train_provider.get(
                    ['image', 'label', 'gt_boxes'])
                print(train_image, train_label, train_boxes)
                (train_image, train_label, train_boxes,
                 train_masks) = _preprocessing.preprocess_image(
                     train_image, train_label, train_boxes, is_training=True)
            except Exception as e:
                # Training cannot continue without inputs: report and
                # propagate instead of silently returning from main().
                print(e)
                raise

            train_images, train_labels = tf.train.batch(
                [train_image, train_label],
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size)
            train_batch_queue = slim.prefetch_queue.prefetch_queue(
                [train_images, train_labels], capacity=2 * FLAGS.num_clones)
            print(train_batch_queue)

        def clone_fn(batch_queue):
            """Build one clone: dequeue a batch, run the network, add loss."""
            images, labels = batch_queue.dequeue()
            images = tf.squeeze(images, [1])
            pred_annotation, fc8s, end_points = network_fn(images=images)

            ############################
            # Loss function            #
            ############################
            tf.losses.sparse_softmax_cross_entropy(
                logits=tf.to_float(pred_annotation),
                labels=tf.to_int32(labels),
                weights=1.0,
                scope="entropy")
            return images, labels, pred_annotation, end_points

        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        clones = model_deploy.create_clones(deploy_config, clone_fn,
                                            [train_batch_queue])
        clone_scope = deploy_config.clone_scope(0)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, clone_scope)

        images, labels, pred_annotation, end_points = clones[0].outputs
        summaries.add(tf.summary.image("Original_images", images))
        summaries.add(tf.summary.image("Ground_truth_masks", labels))
        summaries.add(
            tf.summary.image("Prediction_masks",
                             tf.to_float(pred_annotation)))

        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.summary.histogram('activations/' + end_point, x))
            summaries.add(
                tf.summary.scalar('sparsity/' + end_point,
                                  tf.nn.zero_fraction(x)))

        for loss in tf.get_collection(tf.GraphKeys.LOSSES, clone_scope):
            summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        # Configure the moving averages.  These two names are read by both
        # branches below and were previously never defined (NameError as soon
        # as --sync_replicas or --moving_average_decay was set).
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        with tf.device(deploy_config.optimizer_device()):
            learning_rate = utils._configure_learning_rate(
                train_set.num_samples, global_step)
            optimizer = utils._configure_optimizer(learning_rate)
            summaries.add(tf.summary.scalar('learning rate', learning_rate))

        if FLAGS.sync_replicas:
            # If sync_replicas is enabled, the averaging will be done in the
            # chief queue runner.
            optimizer = tf.train.SyncReplicasOptimizer(
                opt=optimizer,
                replicas_to_aggregate=FLAGS.replicas_to_aggregate,
                variable_averages=variable_averages,
                variables_to_average=moving_average_variables,
                replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
                total_num_replicas=FLAGS.worker_replicas)
        elif FLAGS.moving_average_decay:
            # Update ops executed locally by trainer.
            update_ops.append(
                variable_averages.apply(moving_average_variables))

        variables_to_train = utils._get_variables_to_train()
        for var in variables_to_train:
            print(var.op.name)

        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones, optimizer, var_list=variables_to_train)
        print('total_loss', total_loss, 'clone_gradients', clones_gradients)
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        grad_updates = optimizer.apply_gradients(clones_gradients,
                                                 global_step=global_step)
        update_ops.append(grad_updates)

        update_op = tf.group(*update_ops)
        train_tensor = control_flow_ops.with_dependencies([update_op],
                                                          total_loss,
                                                          name='train_op')

        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, clone_scope))
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        slim.learning.train(
            train_tensor,
            logdir=FLAGS.logs_dir,
            master='',
            is_chief=(FLAGS.task == 0),
            init_fn=utils._get_init_fn(),
            summary_op=summary_op,
            number_of_steps=FLAGS.max_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            save_interval_secs=FLAGS.save_interval_secs,
            sync_optimizer=optimizer if FLAGS.sync_replicas else None)
def main(_):
    """Assemble the slim classification training graph and run training.

    The graph is replicated with ``model_deploy``; when
    ``FLAGS.variable_update`` equals ``"horovod"`` Horovod is initialised,
    the shuffle seed is derived from the worker rank, and global variables
    are broadcast from rank 0 as the local init op.

    Raises:
        ValueError: if ``--dataset_dir`` was not supplied.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    # Import Horovod lazily so it is only required when actually selected.
    use_horovod = FLAGS.variable_update == "horovod"
    if use_horovod:
        import horovod.tensorflow as hvd
        hvd.init()

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        # --- model_deploy configuration --------------------------------
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=FLAGS.task,
            num_replicas=FLAGS.worker_replicas,
            num_ps_tasks=FLAGS.num_ps_tasks)

        # The global step lives on the variables device.
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        # --- dataset, network and preprocessing ------------------------
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
        num_classes = dataset.num_classes - FLAGS.labels_offset

        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=num_classes,
            weight_decay=FLAGS.weight_decay,
            is_training=True)

        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        # --- input pipeline ---------------------------------------------
        with tf.device(deploy_config.inputs_device()):
            # Different seeds per Horovod worker make the shuffle queues of
            # different workers start on different input files (idea taken
            # from tf_cnn_benchmarks/benchmark_cnn.py).
            shuffle_seed = (1234 + int(hvd.rank())) if use_horovod else None
            provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=FLAGS.num_readers,
                common_queue_capacity=20 * FLAGS.batch_size,
                common_queue_min=10 * FLAGS.batch_size,
                seed=shuffle_seed)
            [image, label] = provider.get(['image', 'label'])
            label -= FLAGS.labels_offset

            train_image_size = (FLAGS.train_image_size
                                or network_fn.default_image_size)
            image = image_preprocessing_fn(
                image, train_image_size, train_image_size)

            images, labels = tf.train.batch(
                [image, label],
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size)
            labels = slim.one_hot_encoding(labels, num_classes)
            batch_queue = slim.prefetch_queue.prefetch_queue(
                [images, labels], capacity=2 * deploy_config.num_clones)

        # --- model definition -------------------------------------------
        def clone_fn(queue):
            """Build one clone of the network and register its losses."""
            batch_images, batch_labels = queue.dequeue()
            logits, net_end_points = network_fn(batch_images)

            # Auxiliary head (if the network has one) is weighted by 0.4.
            if 'AuxLogits' in net_end_points:
                slim.losses.softmax_cross_entropy(
                    net_end_points['AuxLogits'],
                    batch_labels,
                    label_smoothing=FLAGS.label_smoothing,
                    weights=0.4,
                    scope='aux_loss')
            slim.losses.softmax_cross_entropy(
                logits,
                batch_labels,
                label_smoothing=FLAGS.label_smoothing,
                weights=1.0)
            return net_end_points

        # Summaries that exist before the clones are created.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        clones = model_deploy.create_clones(
            deploy_config, clone_fn, [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)

        # Update ops (e.g. batch-norm statistics) are taken from the first
        # clone only.
        update_ops = tf.get_collection(
            tf.GraphKeys.UPDATE_OPS, first_clone_scope)

        # --- summaries: activations, losses, variables ------------------
        for end_point_name, activation in clones[0].outputs.items():
            summaries.add(tf.summary.histogram(
                'activations/' + end_point_name, activation))
            summaries.add(tf.summary.scalar(
                'sparsity/' + end_point_name,
                tf.nn.zero_fraction(activation)))

        for loss in tf.get_collection(tf.GraphKeys.LOSSES,
                                      first_clone_scope):
            summaries.add(
                tf.summary.scalar('losses/%s' % loss.op.name, loss))

        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        # --- moving averages --------------------------------------------
        moving_average_variables = None
        variable_averages = None
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)

        # --- optimizer --------------------------------------------------
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = _configure_learning_rate(
                dataset.num_samples, global_step)
            optimizer = _configure_optimizer(learning_rate)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        if FLAGS.sync_replicas:
            # With sync replicas the averaging happens in the chief queue
            # runner.
            optimizer = tf.train.SyncReplicasOptimizer(
                opt=optimizer,
                replicas_to_aggregate=FLAGS.replicas_to_aggregate,
                total_num_replicas=FLAGS.worker_replicas,
                variable_averages=variable_averages,
                variables_to_average=moving_average_variables)
        elif FLAGS.moving_average_decay:
            # Otherwise the trainer applies the averages itself.
            update_ops.append(
                variable_averages.apply(moving_average_variables))

        variables_to_train = _get_variables_to_train()

        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones, optimizer, var_list=variables_to_train)
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # Gradient application joins the other update ops; the train tensor
        # is the total loss, evaluated only after all updates ran.
        update_ops.append(
            optimizer.apply_gradients(clones_gradients,
                                      global_step=global_step))
        update_op = tf.group(*update_ops)
        with tf.control_dependencies([update_op]):
            train_tensor = tf.identity(total_loss, name='train_op')

        # Pick up summaries created inside the first clone, then merge.
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        session_config = tf.ConfigProto()
        session_config.intra_op_parallelism_threads = FLAGS.num_intra_threads
        session_config.inter_op_parallelism_threads = FLAGS.num_inter_threads

        if use_horovod:
            local_init_op = hvd.broadcast_global_variables(0)
            is_chief = hvd.rank() == 0
        else:
            local_init_op = _USE_DEFAULT
            is_chief = FLAGS.task == 0

        # --- run training -----------------------------------------------
        # Only the chief gets a logdir and the explicit summary op; every
        # Horovod worker is nevertheless started as a slim "chief".
        slim.learning.train(
            train_tensor,
            logdir=FLAGS.train_dir if is_chief else None,
            master=FLAGS.master,
            is_chief=is_chief or use_horovod,
            init_fn=_get_init_fn(),
            local_init_op=local_init_op,
            summary_op=summary_op if is_chief else _USE_DEFAULT,
            session_config=session_config,
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            save_interval_secs=FLAGS.save_interval_secs,
            sync_optimizer=optimizer if FLAGS.sync_replicas else None)
hm_a = hm.max(axis=0, keepdims=True) dense_wh_mask = np.concatenate([hm_a, hm_a], axis=0) ret.update({'dense_wh': dense_wh, 'dense_wh_mask': dense_wh_mask}) del ret['wh'] elif self.opt.cat_spec_wh: ret.update({ 'cat_spec_wh': cat_spec_wh, 'cat_spec_mask': cat_spec_mask }) del ret['wh'] if self.opt.reg_offset: ret.update({'reg': reg}) if self.opt.debug > 0 or not self.split == 'train': gt_det = np.array(gt_det, dtype=np.float32) if len(gt_det) > 0 else \ np.zeros((1, 6), dtype=np.float32) meta = {'c': c, 's': s, 'gt_det': gt_det, 'img_id': img_id} ret['meta'] = meta return ret if __name__ == '__main__': from datasets.dataset_factory import get_dataset from opts import opts opts_obj = opts() opt = opts_obj.parse() Dataset = get_dataset('pascal', 'ctdet') dataset = Dataset(opt, 'train') opts_obj.update_dataset_info_and_set_heads(opt, dataset) for i in range(len(dataset)): dataset[i]
def main():
    """Train, test, attack or benchmark a network as configured by ``opts``.

    Every setting is read from the module-level ``opts`` object.  The
    function sets up logging, resolves the checkpoint paths, builds one
    training tower per worker plus an evaluation network sharing the same
    variables, and then runs what ``opts.mode`` selects:

    * ``'input_train'`` / ``'input_test'``: fetch batches from the input
      pipeline in an endless loop (speed test; stop it manually).
    * ``'speed_net'``: train on all-zero synthetic batches.
    * ``'test'`` / ``'export'`` / ``'attack'``: perform the matching action
      after the optional checkpoint restore, then call ``exit(0)``.
    * ``'restart'``: reset ``epoch_step`` to 0 before training.
    * ``'debug'``: additionally fetch ``H``, ``W``, ``grads_H``, ``grads_W``
      and ``Y`` of the first tower on every training step.
    * anything else: plain training.

    Returns:
        0, once the value fetched from ``lr_decay`` is non-positive and the
        training loop ends.

    Raises:
        FileNotFoundError: if the ``path_load`` keyword matches no checkpoint
            or more than one.
        AssertionError: if ``batch_size`` is not divisible by the number of
            workers, or if a test error not above 1e-4 is measured.
    """
    # ---- options ----------------------------------------------------------
    title = opts.title
    seed = opts.seed
    mode = opts.mode
    gpu_list = opts.gpu_list
    batch_size = opts.batch_size
    dataset = opts.dataset
    preprocess = opts.preprocess
    network = opts.network
    optimizer = opts.optimizer
    lr_decay = opts.lr_decay
    epoch_step = opts.epoch_step
    learning_step = opts.learning_step
    path_load = opts.path_load
    path_save = opts.path_save
    print_line()

    # ---- logging ----------------------------------------------------------
    time_tag = get_time('%y-%m-%d %X')
    time_tag_short = time_tag[:8]
    seed = set_seed(seed)

    # Pick a log file name that does not exist yet by appending _1, _2, ...
    # to the title.  A title of 'temp' is always overwritten.
    num_check_log = 0
    title_temp = title
    while True:
        path_log = '../log/' + time_tag_short + '(' + title_temp + ').txt'
        if os.path.isfile(path_log) and title != 'temp':
            num_check_log += 1
            title_temp = title + '_%d' % num_check_log
        else:
            title = title_temp
            del num_check_log, title_temp
            break
    print('title: ' + title)
    set_log(path_log)
    print_line()

    print(time_tag)
    print('SEED = %d' % seed)
    print_opts('options/' + OPTION + '.py')
    print_line()

    # ---- checkpoint paths -------------------------------------------------
    model_dir = '../model/'
    if isinstance(path_save, bool):
        # A boolean path_save means "derive the path from the title";
        # nothing is saved when it is False or when the title is 'temp'.
        path_save = model_dir + time_tag_short + '(' + title + ').tf' if path_save and title != 'temp' else None

    if path_load is not None:
        # path_load is a keyword; it must identify exactly one checkpoint.
        matches = glob.glob(model_dir + '*' + path_load + '*.tf.data*')
        if len(matches) == 0:
            raise FileNotFoundError(
                'Could not find any model file match the key words: ' +
                path_load)
        elif len(matches) > 1:
            for match in matches:
                print(match)
            raise FileNotFoundError(
                'Find more than one model file match the key words: ' +
                path_load)
        # Keep everything up to and including '.tf'.
        path_load = matches[0][:matches[0].find('.tf.') + 3]
        print('Find model in', path_load)

    # ---- devices and data -------------------------------------------------
    os.environ['CUDA_VISIBLE_DEVICES'] = ''.join(
        str(gpu) + ',' for gpu in gpu_list)
    # os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
    num_worker = max(len(gpu_list), 1)

    dataset_train = get_dataset(dataset, split='train')
    dataset_test = get_dataset(dataset, split='test')
    num_batch_train = dataset_train.num_sample // batch_size
    num_batch_test = dataset_test.num_sample // 100  # test batch size is 100

    assert batch_size % num_worker == 0, 'batch_size %d can not be divided by number of workers %d' % (
        batch_size, num_worker)

    iterator_train = get_batch(dataset_train, preprocess, True,
                               batch_size // num_worker, num_worker,
                               seed=seed)
    iterator_test = get_batch(dataset_test, preprocess, False, 100,
                              num_worker, seed=seed)

    # ---- input pipeline speed test ----------------------------------------
    if mode in ['input_train', 'input_test']:
        if mode == 'input_train':
            num_batch = num_batch_train
            batch_input = iterator_train.get_next()
        else:
            num_batch = num_batch_test
            batch_input = iterator_test.get_next()

        print('Testing the speed of data input pipeline.')
        sess = get_session(gpu_list)
        # Deliberately endless: the progress bar shows the throughput until
        # the process is interrupted.
        while True:
            for _ in tqdm(range(num_batch), desc='Input pipeline',
                          leave=False, smoothing=0.1):
                sess.run(batch_input)

    # ---- graph construction -----------------------------------------------
    nets = []
    net = get_net_fn(network)

    if num_worker == 1:
        if len(gpu_list) == 0:
            print('Multi-CPU training, it might be slow', )
            print(
                'All parameters are pinned to CPU, all Ops are pinned to CPU')
            is_cpu_ps = True
        else:
            print('Single-GPU training with gpu', gpu_list[0])
            print(
                'All parameters are pinned to GPU, all Ops are pinned to GPU')
            is_cpu_ps = False
    elif num_worker > 1:
        print('Multi-GPU training tower with gpu list', gpu_list)
        print('All parameters are pinned to CPU, all Ops are pinned to GPU')
        print(
            'Get batchnorm moving average updates from data in the first GPU for speed'
        )
        print('Get L2 decay grads in the second GPU for speed')
        is_cpu_ps = True
    else:
        # Not reachable while num_worker = max(len(gpu_list), 1); kept as a
        # guard.
        raise NotImplementedError('Unrecognized device settings')

    tower_grads = []
    tower_losses = []
    tower_errors = []

    # Create one copy ("tower") of the model per worker.
    for i in range(num_worker):
        worker = '/gpu:%d' % i if gpu_list else '/cpu:0'

        # Decide where Ops and variables are placed.
        if is_cpu_ps:
            # tf.train.replica_device_setter keeps the variables on the CPU
            # and everything else on the worker device.
            device_setter = tf.train.replica_device_setter(
                worker_device=worker, ps_device='/cpu:0', ps_tasks=1)
        else:
            device_setter = worker

        # 1. pin ops to GPU
        # 2. pin parameters to CPU (multi-GPU training) or GPU (single-GPU
        #    training)
        # 3. reuse parameters in multi-GPU training: variables are created
        #    on the first iteration; later iterations set reuse=True so all
        #    towers share them.
        # tf.device calls the device_setter for each Op that is created and
        # places the Op on the device it returns.
        with tf.variable_scope(tf.get_variable_scope(), reuse=bool(i != 0)), \
                tf.device(device_setter):
            print('Training model on GPU %d' % gpu_list[i]) if gpu_list else print('Training model on CPUs')

            batch_train = iterator_train.get_next()
            if mode == 'speed_net':
                with tf.device('/cpu:0'):
                    print(
                        'Testing the speed of model by synthesized data, '
                        'which is theoretically the maximum speed for training this model'
                    )
                    # Only the static shapes of a real batch are used; the
                    # network is fed zeros of the same shape.
                    batch_train = iterator_train.get_next()
                    shape_x = [batch_size // num_worker
                               ] + batch_train[0].get_shape().as_list()[1:]
                    shape_y = [batch_size // num_worker
                               ] + batch_train[1].get_shape().as_list()[1:]
                    batch_train_x = tf.zeros(shape_x, dtype=tf.float32)
                    batch_train_y = tf.zeros(shape_y, dtype=tf.float32)
                    batch_train = [batch_train_x, batch_train_y]

            nets.append(
                net(batch_train[0], batch_train[1], opts=opts,
                    is_training=True))
            tower_losses.append(nets[i].loss)
            tower_errors.append(nets[i].error)

            if i == 0:
                # Batchnorm moving-average updates are collected from the
                # first worker only, for speed.
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                nets[-1].count_parameters()
                nets[-1].count_MACs()
                nets[-1].count_MEMs()

            loss_worker = nets[i].loss
            if num_worker == 1:
                # Single-GPU or multi-CPU training.
                loss_worker += nets[i].get_l2_loss()
            elif i == 1:
                # L2 grads are computed in the second worker only, for
                # speed; scale by num_worker to stay equivalent after the
                # towers are aggregated.
                loss_worker += num_worker * nets[i].get_l2_loss()

            tower_grads.append(
                optimizer.compute_gradients(
                    loss_worker, colocate_gradients_with_ops=True))

            if i == num_worker - 1:
                # The evaluation network lives on the last worker and
                # reuses the training variables.
                print('Testing model on GPU %d' % gpu_list[i]) if gpu_list else print(
                    'Testing model on CPUs')
                tf.get_variable_scope().reuse_variables()
                batch_test = iterator_test.get_next()
                nets.append(
                    net(batch_test[0], batch_test[1], opts=opts,
                        is_training=False))
                error_batch_test = nets[-1].error

                if mode in ['attack']:
                    # Extra network fed through placeholders so perturbed
                    # inputs can be evaluated.
                    print('Attack model on GPU %d' % gpu_list[i - 1]) if gpu_list else print(
                        'Attack model on CPUs')
                    tf.get_variable_scope().reuse_variables()
                    batch_attack_x = tf.placeholder(
                        shape=batch_test[0].get_shape(),
                        dtype=batch_test[0].dtype)
                    batch_attack_y = tf.placeholder(
                        shape=batch_test[1].get_shape(),
                        dtype=batch_test[1].dtype)
                    nets.append(
                        net(batch_attack_x, batch_attack_y, opts=opts,
                            is_training=False))
                    error_batch_attack = nets[-1].error

    # Combine the per-tower results and build the training op.
    with tf.device('/cpu:0' if is_cpu_ps else worker):
        grad_batch_train = aggregate_gradients(tower_grads)
        loss_batch_train = aggregate_statistics(tower_losses)
        error_batch_train = aggregate_statistics(tower_errors)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.apply_gradients(grad_batch_train,
                                                 global_step=learning_step)

    # ---- session and helpers ----------------------------------------------
    if hasattr(opts, 'delay'):
        delay4gpus(opts.delay, gpu_list=gpu_list)

    sess = get_session(gpu_list)
    saver = tf.train.Saver(max_to_keep=None)

    def evaluate():
        """Return the test error averaged over ``num_batch_test`` batches."""
        error_test = 0.
        for _ in tqdm(range(num_batch_test), desc='Test', leave=False,
                      smoothing=0.1):
            error_test += sess.run(error_batch_test)
        return error_test / num_batch_test

    def attack(black=False):
        """Return the mean error on sign-of-gradient perturbed test inputs.

        With ``black=False`` the perturbation ``delta * sign(grads)`` is
        computed from ``nets[1]``; with ``black=True`` previously stored
        samples are loaded from ``adversial_sample.npz``.  In both cases
        the samples are written to ``adversial_sample.npz`` afterwards.
        """
        error_fgsm = 0.
        delta = 1. / 32
        if black is False:
            adversial_x = []
            adversial_y = []
            for _ in tqdm(range(num_batch_test), desc='Attack', leave=False,
                          smoothing=0.1):
                # NOTE(review): nets[1] is the evaluation network only when
                # there is a single training tower - confirm for multi-GPU.
                test_x, test_y, grads = sess.run(
                    [nets[1].H[0], nets[1].Y[0], nets[1].grads_H[0]])
                fsgm_x = test_x + delta * np.sign(grads)
                error_fgsm += sess.run(error_batch_attack,
                                       feed_dict={
                                           batch_attack_x: fsgm_x,
                                           batch_attack_y: test_y
                                       })
                adversial_x.append(fsgm_x)
                adversial_y.append(test_y)
        else:
            adversial_sample = np.load('adversial_sample.npz')
            adversial_x = adversial_sample['x']
            adversial_y = adversial_sample['y']
            for i in tqdm(range(adversial_x.shape[0]), desc='Attack',
                          leave=False, smoothing=0.1):
                error_fgsm += sess.run(error_batch_attack,
                                       feed_dict={
                                           batch_attack_x: adversial_x[i, ...],
                                           batch_attack_y: adversial_y[i, ...]
                                       })
        adversial_x = np.array(adversial_x)
        adversial_y = np.array(adversial_y)
        np.savez('adversial_sample.npz', x=adversial_x, y=adversial_y)
        return error_fgsm / num_batch_test

    def save_model(path):
        """Save the session variables to ``path`` and print an 'S' marker."""
        saver.save(sess, path)
        print('S', end='')

    def load_model(path):
        """Restore the session variables from the checkpoint at ``path``."""
        # Use the argument rather than the enclosing path_load variable.
        print('Loading model from %s ...' % path)
        saver.restore(sess, path)

    # Fallback so the first comparison in the training loop cannot hit an
    # unbound name when no checkpoint is loaded and the first epoch is not 1.
    # NOTE(review): epoch_step's initial value is not visible here - confirm.
    error_test_best = float('inf')

    if path_load is not None:
        load_model(path_load)
        error_test_best = evaluate()
        print('Test: %.4f' % error_test_best)

    if mode == 'attack':
        print(attack(black=False))

    if mode == 'export':
        vars_list = get_variable('batchnorm/gamma:')
        vars_numpy = sess.run(vars_list)
        export(vars_numpy, 'gamma')

    if mode in ['test', 'export', 'attack']:
        exit(0)

    if mode == 'restart':
        sess.run(epoch_step.assign(0))

    print_line()

    # ---- training loop ----------------------------------------------------
    while True:
        # Fetch the learning rate; a non-positive value ends training.
        lr_epoch = sess.run(lr_decay)
        if lr_epoch <= 0:
            break

        epoch = sess.run(epoch_step)
        print('Epoch: %03d' % epoch, end=' ')

        loss_epoch = 0.
        error_epoch = 0.
        t0 = get_time()
        for _ in tqdm(range(num_batch_train), desc='Epoch: %03d' % epoch,
                      leave=False, smoothing=0.1):
            if mode == 'debug':
                print('DEBUG: ')
                # The extra fetches are not used below; they are available
                # for inspection in a debugger.
                _, loss_delta, error_delta, H, W, gradsH, gradsW, label_ = sess.run(
                    [
                        train_op, loss_batch_train, error_batch_train,
                        nets[0].H, nets[0].W, nets[0].grads_H,
                        nets[0].grads_W, nets[0].Y
                    ])
            else:
                _, loss_delta, error_delta = sess.run(
                    [train_op, loss_batch_train, error_batch_train])

            loss_epoch += loss_delta
            error_epoch += error_delta

        print('Loss: %.6f Train: %.4f' %
              (loss_epoch / num_batch_train, error_epoch / num_batch_train),
              end=' ')
        FPS = num_batch_train * batch_size / (get_time() - t0)

        error_test = evaluate()
        assert error_test > 1e-4, (
            'Invalid test error %f, something goes wrong' % error_test)
        print('Test: %.4f lr: %.4f FPS: %d' % (error_test, lr_epoch, FPS),
              end=' ')

        sess.run(epoch_step.assign(epoch + 1))

        if epoch == 1:
            # After epoch 1 the reference is reset and capped at 0.9.
            error_test_best = min(error_test, 0.9)

        if error_test < error_test_best:
            # 'B' marks a new best test error.
            print('B', end=' ')
            if path_save is not None:
                save_model(path_save)
            error_test_best = error_test

        print('')

    print_line()

    # ---- shutdown ---------------------------------------------------------
    sess.close()
    print('Optimization ended at ' + get_time('%y-%m-%d %X'))
    return 0
def main(_):
    """Evaluate a checkpoint on a single batch of one image.

    All settings (``dataset_name``, ``dataset_split_name``, ``dataset_dir``,
    ``model_name``, ``preprocessing_name``, ``eval_image_size``,
    ``checkpoint_path``, ``eval_dir``) are read from the enclosing module
    scope rather than from flags.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default():
        # The global step must exist in the graph even though it is not
        # referenced again in this function.
        slim.get_or_create_global_step()

        tf.logging.info("Preparing dataset")
        dataset = dataset_factory.get_dataset(
            dataset_name, dataset_split_name, dataset_dir)
        network_fn = nets_factory.get_network_fn(
            model_name, num_classes=dataset.num_classes, is_training=False)

        tf.logging.info("Initializing dataset provider")
        data_provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            shuffle=False,
            common_queue_capacity=32,
            common_queue_min=1)

        tf.logging.info("Initialized provider, now getting image and label")
        raw_image, label = data_provider.get(['image', 'label'])
        tf.logging.info("Got image with label %s" % label)

        preprocess = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=False)
        image = preprocess(raw_image, eval_image_size, eval_image_size)

        # Batches of exactly one example.
        images, labels = tf.train.batch([image, label], batch_size=1)

        logits, _ = network_fn(images)
        variables_to_restore = slim.get_variables_to_restore()

        predictions = tf.argmax(logits, 1)
        labels = tf.squeeze(labels)

        # Only accuracy is tracked.
        metric_map = {
            'Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
        }
        names_to_values, names_to_updates = (
            slim.metrics.aggregate_metric_map(metric_map))

        # Register one scalar summary per metric; tf.Print echoes its value
        # whenever the summary is evaluated.
        for metric_name, metric_value in names_to_values.items():
            summary_name = 'eval/%s' % metric_name
            summary = tf.summary.scalar(
                summary_name, metric_value, collections=[])
            summary = tf.Print(summary, [metric_value], summary_name)
            tf.add_to_collection(tf.GraphKeys.SUMMARIES, summary)

        num_batches = 1

        tf.logging.info('Evaluating %s' % checkpoint_path)
        slim.evaluation.evaluate_once(
            master='',
            checkpoint_path=checkpoint_path,
            logdir=eval_dir,
            num_evals=num_batches,
            eval_op=list(names_to_updates.values()),
            variables_to_restore=variables_to_restore)
def main(_):
    """Evaluate a model on clean or adversarial examples.

    With ``--train_dir`` the checkpoints in that directory are evaluated
    repeatedly (or once, with ``--eval_once``).  With ``--checkpoint_path`` a
    single checkpoint is evaluated and, if ``--output_file`` is set, a line
    ``eval_name,accuracy,top5`` is appended to that file.

    If neither flag is given a message is printed and nothing is evaluated.
    If both are given a message is printed and ``--train_dir`` is used.
    """
    if not FLAGS.train_dir and not FLAGS.checkpoint_path:
        print('Either --train_dir or --checkpoint_path flags has to be provided.')
        # Without a checkpoint source there is nothing to evaluate; stop
        # here instead of failing later inside the evaluation call.
        return
    if FLAGS.train_dir and FLAGS.checkpoint_path:
        # Not fatal: the train_dir branch below takes precedence.
        print('Only one of --train_dir or --checkpoint_path should be provided.')

    params = model_lib.default_hparams()
    params.parse(FLAGS.hparams)
    tf.logging.info('User provided hparams: %s', FLAGS.hparams)
    tf.logging.info('All hyper parameters: %s', params)
    batch_size = params.eval_batch_size

    graph = tf.Graph()
    with graph.as_default():
        # dataset
        dataset, num_examples, num_classes, bounds = dataset_factory.get_dataset(
            FLAGS.dataset,
            FLAGS.split_name,
            batch_size,
            FLAGS.dataset_image_size,
            is_training=False)
        dataset_iterator = dataset.make_one_shot_iterator()
        images, labels = dataset_iterator.get_next()
        # A positive --num_examples caps the number of evaluated examples.
        if FLAGS.num_examples > 0:
            num_examples = min(num_examples, FLAGS.num_examples)

        # setup model
        global_step = tf.train.get_or_create_global_step()
        model_fn_two_args = model_lib.get_model(FLAGS.model_name, num_classes)

        def model_fn(x):
            """Apply the model in inference mode."""
            return model_fn_two_args(x, is_training=False)

        if not FLAGS.adv_method or FLAGS.adv_method == 'clean':
            logits = model_fn(images)
        else:
            adv_examples = adversarial_attack.generate_adversarial_examples(
                images, bounds, model_fn, FLAGS.adv_method)
            logits = model_fn(adv_examples)

        # update trainable variables if fine tuning is used
        model_lib.filter_trainable_variables(FLAGS.trainable_scopes)

        # Setup the moving averages
        if FLAGS.moving_average_decay and (FLAGS.moving_average_decay > 0):
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
            variables_to_restore = variable_averages.variables_to_restore(
                tf.contrib.framework.get_model_variables())
            variables_to_restore[global_step.op.name] = global_step
        else:
            variables_to_restore = tf.contrib.framework.get_variables_to_restore()

        # Setup evaluation metric
        with tf.name_scope('Eval'):
            names_to_values, names_to_updates = (
                tf.contrib.metrics.aggregate_metric_map({
                    'Accuracy': tf.metrics.accuracy(labels,
                                                    tf.argmax(logits, 1)),
                    'Top5': tf.metrics.recall_at_k(tf.to_int64(labels),
                                                   logits, 5)
                }))

            # .items() works on Python 2 and 3; .iteritems() exists only on
            # Python 2.
            for name, value in names_to_values.items():
                tf.summary.scalar(name, value)

        # Pass the update ops as a list rather than a dict view.
        eval_ops = list(names_to_updates.values())

        # Run evaluation
        num_batches = int(num_examples / batch_size)
        if FLAGS.train_dir:
            output_dir = os.path.join(FLAGS.train_dir, FLAGS.eval_name)
            if not tf.gfile.Exists(output_dir):
                tf.gfile.MakeDirs(output_dir)
            tf.contrib.training.evaluate_repeatedly(
                FLAGS.train_dir,
                master=FLAGS.master,
                scaffold=tf.train.Scaffold(
                    saver=tf.train.Saver(variables_to_restore)),
                eval_ops=eval_ops,
                eval_interval_secs=FLAGS.eval_interval_secs,
                hooks=[
                    tf.contrib.training.StopAfterNEvalsHook(num_batches),
                    tf.contrib.training.SummaryAtEndHook(output_dir),
                    tf.train.LoggingTensorHook(names_to_values, at_end=True),
                ],
                max_number_of_evaluations=1 if FLAGS.eval_once else None)
        else:
            result = tf.contrib.training.evaluate_once(
                FLAGS.checkpoint_path,
                master=FLAGS.master,
                scaffold=tf.train.Scaffold(
                    saver=tf.train.Saver(variables_to_restore)),
                eval_ops=eval_ops,
                final_ops=names_to_values,
                hooks=[
                    tf.contrib.training.StopAfterNEvalsHook(num_batches),
                    tf.train.LoggingTensorHook(names_to_values, at_end=True),
                ])
            if FLAGS.output_file:
                # Append so several evaluations can share one results file.
                with tf.gfile.Open(FLAGS.output_file, 'a') as f:
                    f.write('%s,%.3f,%.3f\n' % (FLAGS.eval_name,
                                                result['Accuracy'],
                                                result['Top5']))
def main(_):
    """Evaluate one checkpoint of a slim classification model.

    Builds the evaluation input pipeline and network, registers streaming
    accuracy and recall@5 metrics, and runs ``slim.evaluation.evaluate_once``
    on ``--checkpoint_path`` (or on the latest checkpoint if that path is a
    directory).

    Raises:
        ValueError: if ``--dataset_dir`` was not supplied.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        global_step = slim.get_or_create_global_step()

        # --- dataset and model -----------------------------------------
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            is_training=False)

        # --- input pipeline (unshuffled) ---------------------------------
        data_provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            shuffle=False,
            common_queue_capacity=2 * FLAGS.batch_size,
            common_queue_min=FLAGS.batch_size)
        [image, label] = data_provider.get(['image', 'label'])
        label -= FLAGS.labels_offset

        # Preprocessing defaults to the one named after the model.
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        preprocess = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=False)

        eval_image_size = (FLAGS.eval_image_size
                           or network_fn.default_image_size)
        image = preprocess(image, eval_image_size, eval_image_size)

        images, labels = tf.train.batch(
            [image, label],
            batch_size=FLAGS.batch_size,
            num_threads=FLAGS.num_preprocessing_threads,
            capacity=5 * FLAGS.batch_size)

        # --- network and variables to restore ----------------------------
        logits, _ = network_fn(images)

        if not FLAGS.moving_average_decay:
            variables_to_restore = slim.get_variables_to_restore()
        else:
            # Restore the moving-average version of each model variable,
            # plus the global step itself.
            averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
            variables_to_restore = averages.variables_to_restore(
                slim.get_model_variables())
            variables_to_restore[global_step.op.name] = global_step

        print(slim.get_model_variables())

        predictions = tf.argmax(logits, 1)
        labels = tf.squeeze(labels)

        # --- metrics ------------------------------------------------------
        metric_map = {
            'Accuracy':
            slim.metrics.streaming_accuracy(predictions, labels),
            'Recall_5':
            slim.metrics.streaming_sparse_recall_at_k(logits, labels, 5),
        }
        names_to_values, names_to_updates = (
            slim.metrics.aggregate_metric_map(metric_map))

        # One scalar summary per metric; tf.Print echoes the value whenever
        # the summary is evaluated.
        for metric_name, metric_value in names_to_values.items():
            summary_name = 'eval/%s' % metric_name
            summary = tf.summary.scalar(
                summary_name, metric_value, collections=[])
            summary = tf.Print(summary, [metric_value], summary_name)
            tf.add_to_collection(tf.GraphKeys.SUMMARIES, summary)

        # TODO(sguada) use num_epochs=1
        # Without an explicit limit, cover the whole dataset exactly once.
        num_batches = FLAGS.max_num_batches or math.ceil(
            dataset.num_samples / float(FLAGS.batch_size))

        # A directory means "use its most recent checkpoint".
        checkpoint_path = FLAGS.checkpoint_path
        if tf.gfile.IsDirectory(checkpoint_path):
            checkpoint_path = tf.train.latest_checkpoint(checkpoint_path)

        tf.logging.info('Evaluating %s' % checkpoint_path)

        slim.evaluation.evaluate_once(
            master=FLAGS.master,
            checkpoint_path=checkpoint_path,
            logdir=FLAGS.eval_dir,
            num_evals=num_batches,
            eval_op=list(names_to_updates.values()),
            variables_to_restore=variables_to_restore)
def main(_):
    """Build the detection training graph and hand it to `slim.learning.train`.

    Sets up the deployment config, input pipeline, per-clone model and losses,
    summaries, optional moving averages and the optimizer, then starts training.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If `--dataset_dir` is not set.
    """
    if not FLAGS.dataset_dir:
        raise ValueError('You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        # ---- Config model_deploy ----
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=FLAGS.task,
            num_replicas=FLAGS.worker_replicas,
            num_ps_tasks=FLAGS.num_ps_tasks)

        # Create global_step.
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        # ---- Select the dataset ----
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

        # ---- Select the network ----
        num_classes = dataset.num_classes - FLAGS.labels_offset
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=num_classes,
            weight_decay=FLAGS.weight_decay,
            is_training=True,
            width_multiplier=FLAGS.width_multiplier)

        # ---- Select the preprocessing function ----
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        # ---- Create a dataset provider that loads data from the dataset ----
        with tf.device(deploy_config.inputs_device()):
            provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=FLAGS.num_readers,
                common_queue_capacity=20 * FLAGS.batch_size,
                common_queue_min=10 * FLAGS.batch_size)

            # gt_bboxes format [ymin, xmin, ymax, xmax]
            [image, img_shape, gt_labels, gt_bboxes] = provider.get(
                ['image', 'shape', 'object/label', 'object/bbox'])

            # Preprocessing.
            image, gt_labels, gt_bboxes = image_preprocessing_fn(
                image,
                config.IMG_HEIGHT,
                config.IMG_WIDTH,
                labels=gt_labels,
                bboxes=gt_bboxes,
            )

            # ---- Encode annotations for losses computation ----
            # anchors format [cx, cy, w, h]
            anchors = tf.convert_to_tensor(config.ANCHOR_SHAPE, dtype=tf.float32)

            # encode annos, box_input format [cx, cy, w, h]
            input_mask, labels_input, box_delta_input, box_input = encode_annos(
                gt_labels, gt_bboxes, anchors, config.NUM_CLASSES)

            (images, b_input_mask, b_labels_input,
             b_box_delta_input, b_box_input) = tf.train.batch(
                 [image, input_mask, labels_input, box_delta_input, box_input],
                 batch_size=FLAGS.batch_size,
                 num_threads=FLAGS.num_preprocessing_threads,
                 capacity=5 * FLAGS.batch_size)

            batch_queue = slim.prefetch_queue.prefetch_queue(
                [images, b_input_mask, b_labels_input,
                 b_box_delta_input, b_box_input],
                capacity=2 * deploy_config.num_clones)

        # ---- Define the model ----
        def clone_fn(batch_queue):
            """Allows data parallelism by creating multiple clones of network_fn."""
            (clone_images, clone_mask, clone_labels,
             clone_box_delta, clone_boxes) = batch_queue.dequeue()
            clone_anchors = tf.convert_to_tensor(
                config.ANCHOR_SHAPE, dtype=tf.float32)

            end_points = network_fn(clone_images)
            end_points["viz_images"] = clone_images

            # Prediction head on top of the last depthwise feature map.
            features = slim.dropout(
                end_points['MobileNet/conv_ds_14/depthwise_conv'],
                keep_prob=0.5,
                is_training=True)
            num_output = config.NUM_ANCHORS * (config.NUM_CLASSES + 1 + 4)
            predict = slim.conv2d(
                features,
                num_output,
                kernel_size=(3, 3),
                stride=1,
                padding='SAME',
                activation_fn=None,
                weights_initializer=tf.truncated_normal_initializer(stddev=0.0001),
                scope="MobileNet/conv_predict")

            with tf.name_scope("Interpre_prediction"):
                (pred_box_delta, pred_class_probs, pred_conf, ious,
                 det_probs, det_boxes, det_class) = interpre_prediction(
                     predict, clone_mask, clone_anchors, clone_boxes)
                end_points["viz_det_probs"] = det_probs
                end_points["viz_det_boxes"] = det_boxes
                end_points["viz_det_class"] = det_class

            with tf.name_scope("Losses"):
                losses(clone_mask, clone_labels, ious, clone_box_delta,
                       pred_class_probs, pred_conf, pred_box_delta)

            return end_points

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)
        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by network_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

        # Add summaries for end_points, skipping the visualisation-only entries.
        viz_keys = ["viz_images", "viz_det_probs", "viz_det_boxes", "viz_det_class"]
        end_points = clones[0].outputs
        for end_point in end_points:
            if end_point in viz_keys:
                continue
            activation = end_points[end_point]
            summaries.add(
                tf.summary.histogram('activations/' + end_point, activation))
            summaries.add(
                tf.summary.scalar('sparsity/' + end_point,
                                  tf.nn.zero_fraction(activation)))

        # Add summaries for det result TODO(shizehao): visualize prediction

        # Add summaries for losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
            summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        # ---- Configure the moving averages ----
        moving_average_variables, variable_averages = None, None
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)

        # ---- Configure the optimization procedure ----
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = _configure_learning_rate(dataset.num_samples, global_step)
            optimizer = _configure_optimizer(learning_rate)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        if FLAGS.sync_replicas:
            # If sync_replicas is enabled, the averaging will be done in the
            # chief queue runner.
            optimizer = tf.train.SyncReplicasOptimizer(
                opt=optimizer,
                replicas_to_aggregate=FLAGS.replicas_to_aggregate,
                variable_averages=variable_averages,
                variables_to_average=moving_average_variables,
                replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
                total_num_replicas=FLAGS.worker_replicas)
        elif FLAGS.moving_average_decay:
            # Update ops executed locally by trainer.
            update_ops.append(variable_averages.apply(moving_average_variables))

        # Variables to train.
        variables_to_train = _get_variables_to_train()

        # Total loss and per-variable gradients across all clones.
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones, optimizer, var_list=variables_to_train)
        # Add total_loss to summary.
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # Create gradient updates.
        grad_updates = optimizer.apply_gradients(
            clones_gradients, global_step=global_step)
        update_ops.append(grad_updates)

        update_op = tf.group(*update_ops)
        train_tensor = control_flow_ops.with_dependencies(
            [update_op], total_loss, name='train_op')

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or
        # _gather_clone_loss().
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        # ---- Kicks off the training ----
        sync_optimizer = optimizer if FLAGS.sync_replicas else None
        slim.learning.train(
            train_tensor,
            logdir=FLAGS.train_dir,
            master=FLAGS.master,
            is_chief=(FLAGS.task == 0),
            init_fn=_get_init_fn(),
            summary_op=summary_op,
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            save_interval_secs=FLAGS.save_interval_secs,
            sync_optimizer=sync_optimizer)
def prefetch_test(opt):
    """Run HOI detection over the test split for one or more checkpoints.

    When `opt.load_model` is set, only that checkpoint is evaluated. Otherwise
    checkpoints `model_100.pth` .. `model_140.pth` under the save directory
    are evaluated in turn. With `opt.test_with_eval` the per-checkpoint score
    is computed and the best predictions are saved; with
    `opt.save_predictions` every checkpoint's predictions are saved.

    Args:
        opt: Parsed options object; `opt.load_model` is overwritten with the
            checkpoint path being evaluated when no explicit model was given.

    Raises:
        ValueError: If evaluation is requested for a dataset that is none of
            hico / vcoco / hoia.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str

    Dataset = get_dataset(opt.dataset)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)
    Logger(opt)
    Detector = detector_factory[opt.task]

    dataset = Dataset(opt, 'test')

    # Remember whether the caller pinned a checkpoint: opt.load_model is
    # rewritten inside the loop, so testing it there would make every epoch
    # after the first reuse the first checkpoint.
    explicit_model = opt.load_model != ''
    if explicit_model:
        model_begin, model_end = 0, 0
    else:
        model_begin, model_end = 100, 140

    # Output directory for predictions; defined up front so it is available
    # even when an explicit checkpoint is given.
    model_path = (opt.save_dir[:-4]
                  if opt.save_dir.endswith('TEST') else opt.save_dir)

    best = {'best_id': 0, 'best_map': 0}
    best_output = []

    for model_id in range(model_begin, model_end + 1):
        if not explicit_model:
            opt.load_model = os.path.join(
                model_path, 'model_' + str(model_id) + '.pth')

        detector = Detector(opt)
        data_loader = torch.utils.data.DataLoader(
            PrefetchDataset(opt, dataset, detector.pre_process),
            batch_size=1,
            shuffle=False,
            num_workers=1,
            pin_memory=True)

        num_iters = len(dataset)
        print("----epoch :{} -----".format(model_id))
        bar = Bar('{}'.format(opt.exp_id), max=num_iters)
        time_stats = ['tot', 'load', 'pre', 'net', 'dec', 'post', 'merge']
        avg_time_stats = {t: AverageMeter() for t in time_stats}

        output_hoi = []
        for ind, (img_id, pre_processed_images) in enumerate(data_loader):
            ret = detector.run(pre_processed_images)
            output_i = ret['results_rel'].copy()
            output_i['file_name'] = dataset.hoi_annotations[
                int(img_id)]['file_name']
            output_hoi.append(output_i)

            Bar.suffix = '[{0}/{1}]|Tot: {total:} |ETA: {eta:} '.format(
                ind, num_iters, total=bar.elapsed_td, eta=bar.eta_td)
            for t in avg_time_stats:
                avg_time_stats[t].update(ret[t])
                Bar.suffix = Bar.suffix + '|{} {tm.val:.3f}s ({tm.avg:.3f}s) '.format(
                    t, tm=avg_time_stats[t])
            bar.next()
        bar.finish()

        if opt.test_with_eval:
            if 'hico' in opt.dataset:
                hoi_eval = hico(
                    os.path.join(opt.root_path,
                                 'hico_det/annotations/test_hico.json'))
            elif 'vcoco' in opt.dataset:
                hoi_eval = vcoco(
                    os.path.join(opt.root_path,
                                 'verbcoco/annotations/test_vcoco.json'))
            elif 'hoia' in opt.dataset:
                hoi_eval = hoia(
                    os.path.join(opt.root_path,
                                 'hoia/annotations/test_hoia.json'))
            else:
                raise ValueError(
                    'No evaluator available for dataset: {}'.format(
                        opt.dataset))

            mean_ap = hoi_eval.evalution(output_hoi)
            if mean_ap > best['best_map']:
                best['best_map'] = mean_ap
                best['best_id'] = model_id
                best_output = output_hoi

        if opt.save_predictions:
            save_json(output_hoi, model_path,
                      'predictions_model_' + str(model_id) + '.json')

    if opt.test_with_eval:
        print('best model id: {}, best map: {}'.format(best['best_id'],
                                                       best['best_map']))
        save_json(best_output, model_path, 'best_predictions.json')
def train():
    """Train the `colors` model on the "color" dataset for a number of steps.

    Builds the input queue pipeline, the inference/loss/train ops from the
    `colors` module, and runs them in a `MonitoredTrainingSession` until
    `FLAGS.max_steps`. Every 1000 steps the current learning rate is appended
    to `result.txt`.
    """
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        # Force input pipeline to CPU:0 to avoid operations sometimes ending
        # up on GPU and resulting in a slow down.
        with tf.device('/cpu:0'):
            dataset = dataset_factory.get_dataset("color", "train", "D:/colors")
            examples_per_shard = 1024
            min_queue_examples = examples_per_shard * 50
            provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=8,
                common_queue_capacity=min_queue_examples + 3 * 12,
                common_queue_min=min_queue_examples)
            [image, label] = provider.get(['image', 'label'])

            preprocessing_name = "color"
            image_preprocessing_fn = preprocessing_factory.get_preprocessing(
                preprocessing_name, is_training=True)
            image = image_preprocessing_fn(image, 24, 24)

            images, labels = tf.train.shuffle_batch(
                [image, label],
                batch_size=12,
                num_threads=4,
                capacity=2 * 4 * 12,
                min_after_dequeue=48)
            batch_queue = slim.prefetch_queue.prefetch_queue(
                [images, labels], capacity=16 * 1, num_threads=4)
            images, labels = batch_queue.dequeue()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = colors.inference(images)

        # Calculate loss and acc.
        loss, accuracy = colors.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = colors.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss, runtime and accuracy."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                # Ask for the values to be fetched alongside each run call.
                return tf.train.SessionRunArgs(
                    [loss, accuracy, logits, labels])

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    # Only loss and accuracy are reported; the fetched logits
                    # and labels are not used.
                    loss_value, acc_value, _, _ = run_values.results
                    # NOTE(review): the input pipeline batches 12 examples
                    # while this uses FLAGS.batch_size — confirm they agree.
                    examples_per_sec = (
                        FLAGS.log_frequency * FLAGS.batch_size / duration)
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = (
                        '%s: step %d, loss = %.2f, batch_accuracy=%.4f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str %
                          (datetime.now(), self._step, loss_value, acc_value,
                           examples_per_sec, sec_per_batch))

        config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        config.gpu_options.allow_growth = True
        add_global = global_step.assign_add(1)

        with tf.train.MonitoredTrainingSession(
                save_checkpoint_secs=60,
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    tf.train.SummarySaverHook(
                        save_steps=1000,
                        output_dir=FLAGS.train_dir,
                        summary_op=tf.summary.merge_all()),
                    _LoggerHook()
                ],
                config=config) as mon_sess:
            # The context manager closes the log file even if training raises.
            with open("result.txt", 'a+') as f:
                while not mon_sess.should_stop():
                    mon_sess.run(train_op)
                    step = mon_sess.run(add_global)
                    if step % 1000 == 0:
                        lr = mon_sess.run(tf.get_collection('learning_rate'))
                        f.write("step %d-----------------------------" % step)
                        f.write("lr>>%.5f " % lr[0])
def train_input_fn():
    """Build the queued training input pipeline.

    Reads image / label / bbox / difficult records from the configured
    dataset, masks the ground truth by the `difficult` flag, preprocesses,
    encodes against `ron_anchors` and batches the result.

    Returns:
        A tuple `(b_image, targets)` where `targets` is a dict with keys
        'b_gclasses', 'b_glocalisations' and 'b_gscores'.
    """
    # Select the dataset.
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.data_dir)

    tf_utils.print_configuration(
        FLAGS.__flags, ron_params, dataset.data_sources, FLAGS.model_dir)

    # Create a dataset provider and batches.
    with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            num_readers=FLAGS.num_readers,
            common_queue_capacity=120 * FLAGS.batch_size,
            common_queue_min=80 * FLAGS.batch_size,
            shuffle=True)

    # Get for RON network: image, labels, bboxes.
    # gbboxes are (ymin, xmin, ymax, xmax).
    item_keys = [
        'image', 'shape', 'object/label', 'object/bbox', 'object/difficult'
    ]
    [image, shape, glabels, gbboxes, isdifficult] = provider.get(item_keys)

    # Count the entries whose `difficult` flag is not equal to one.
    is_flagged = tf.equal(tf.ones_like(isdifficult), isdifficult)
    num_unflagged = tf.reduce_sum(
        tf.cast(tf.logical_not(is_flagged), tf.float32))

    def _keep_first_only():
        # Every entry is flagged: select index 0 only.
        return tf.one_hot(0, tf.shape(isdifficult)[0],
                          on_value=True, off_value=False, dtype=tf.bool)

    def _keep_unflagged():
        return isdifficult < tf.ones_like(isdifficult)

    isdifficult_mask = tf.cond(
        num_unflagged < 1., _keep_first_only, _keep_unflagged)

    glabels = tf.boolean_mask(glabels, isdifficult_mask)
    gbboxes = tf.boolean_mask(gbboxes, isdifficult_mask)

    # Select the preprocessing function.
    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
        preprocessing_name, is_training=True)

    # Pre-processing image, labels and bboxes.
    image, glabels, gbboxes = image_preprocessing_fn(
        image, glabels, gbboxes,
        out_shape=ron_shape,
        data_format=DATA_FORMAT)

    # Encode groundtruth labels and bboxes.
    # glocalisations is the regression target, gclasses the ground-truth
    # label and gscores the jaccard score with the ground truth.
    gclasses, glocalisations, gscores = ron_net.bboxes_encode(
        glabels, gbboxes, ron_anchors,
        positive_threshold=FLAGS.match_threshold,
        ignore_threshold=FLAGS.neg_threshold)

    # Sizes of the batch elements: one image followed by three groups
    # (gclasses, glocalisations, gscores), each len(ron_anchors) long.
    batch_shape = [1] + [len(ron_anchors)] * 3

    # Training batches and queue.
    flat_tensors = tf_utils.reshape_list(
        [image, gclasses, glocalisations, gscores])
    batched = tf.train.batch(
        flat_tensors,
        batch_size=FLAGS.batch_size,
        num_threads=FLAGS.num_preprocessing_threads,
        capacity=120 * FLAGS.batch_size,
        shared_name=None)
    b_image, b_gclasses, b_glocalisations, b_gscores = tf_utils.reshape_list(
        batched, batch_shape)

    targets = {
        'b_gclasses': b_gclasses,
        'b_glocalisations': b_glocalisations,
        'b_gscores': b_gscores
    }
    return b_image, targets
def main():
    """Build the pose/action training graph from `cfg` and run slim training.

    Parses command-line arguments, optionally loads a config file, builds the
    multi-clone graph (inputs, network, losses, summaries, optimizer and
    optional gradient accumulation over `cfg.TRAIN.ITER_SIZE` iterations) and
    calls `slim.learning.train` with `_train_step` as the step function.
    """
    # Stores the end points for debugging in train_step.
    global end_points_debug

    args = parse_args()
    if args.cfg_file is not None:
        cfg_from_file(args.cfg_file)

    tf.logging.info('Using Config:')
    pprint.pprint(cfg)

    train_dir = get_output_dir(
        'default' if args.cfg_file is None else args.cfg_file)
    os.environ['CUDA_VISIBLE_DEVICES'] = cfg.GPUS
    num_clones = len(cfg.GPUS.split(','))

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        # ---- Config model_deploy ----
        tf.set_random_seed(cfg.RNG_SEED)
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=num_clones,
            clone_on_cpu=False,
            replica_id=0,
            num_replicas=1,
            num_ps_tasks=0)

        # Create global_step.
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        # ---- Select the dataset ----
        kwargs = {}
        if cfg.TRAIN.VIDEO_FRAMES_PER_VIDEO > 1:
            kwargs['num_samples'] = cfg.TRAIN.VIDEO_FRAMES_PER_VIDEO
            kwargs['randomFromSegmentStyle'] = cfg.TRAIN.READ_SEGMENT_STYLE
            kwargs['modality'] = cfg.INPUT.VIDEO.MODALITY
            kwargs['split_id'] = cfg.INPUT.SPLIT_ID
        if cfg.DATASET_LIST_DIR != '':
            kwargs['dataset_list_dir'] = cfg.DATASET_LIST_DIR
        if cfg.INPUT_FILE_STYLE_LABEL != '':
            kwargs['input_file_style_label'] = cfg.INPUT_FILE_STYLE_LABEL
        dataset, num_pose_keypoints = dataset_factory.get_dataset(
            cfg.DATASET_NAME, cfg.TRAIN.DATASET_SPLIT_NAME, cfg.DATASET_DIR,
            **kwargs)

        # ---- Select the network ----
        network_fn = nets_factory.get_network_fn(
            cfg.MODEL_NAME,
            num_classes=(dataset.num_classes),
            num_pose_keypoints=num_pose_keypoints,
            weight_decay=cfg.TRAIN.WEIGHT_DECAY,
            is_training=True,
            cfg=cfg)  # advanced network creation controlled with cfg.NET

        # ---- Select the preprocessing function ----
        preprocessing_name = cfg.MODEL_NAME
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        # ---- Create a dataset provider that loads data from the dataset ----
        with tf.device(deploy_config.inputs_device()):
            provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=cfg.NUM_READERS,
                common_queue_capacity=20 * cfg.TRAIN.BATCH_SIZE,
                common_queue_min=10 * cfg.TRAIN.BATCH_SIZE)

            [image, pose_label_hmap, pose_label_valid,
             action_label] = train_preprocess_pipeline(
                 provider, cfg, network_fn, num_pose_keypoints,
                 image_preprocessing_fn)

            (images, pose_labels_hmap, pose_labels_valid,
             action_labels) = tf.train.batch(
                 [image, pose_label_hmap, pose_label_valid, action_label],
                 batch_size=cfg.TRAIN.BATCH_SIZE,
                 num_threads=cfg.NUM_PREPROCESSING_THREADS,
                 capacity=5 * cfg.TRAIN.BATCH_SIZE)

            batch_queue = slim.prefetch_queue.prefetch_queue(
                [images, pose_labels_hmap, pose_labels_valid, action_labels],
                capacity=5 * deploy_config.num_clones * cfg.TRAIN.ITER_SIZE)

        # ---- Define the model ----
        def clone_fn(batch_queue):
            """Allows data parallelism by creating multiple clones of network_fn."""
            (clone_images, labels_pose, labels_pose_valid,
             labels_action) = batch_queue.dequeue()

            # Due to the multi-frame/video input, the first two dimensions
            # need to be merged first.
            labels_pose = tf.concat(tf.unstack(labels_pose), axis=0)
            labels_pose_valid = tf.concat(tf.unstack(labels_pose_valid), axis=0)

            logits, end_points = network_fn(clone_images)
            pose_logits = end_points['PoseLogits']

            # ---- Specify the loss function ----
            end_points['Images'] = clone_images
            end_points['PoseLabels'] = labels_pose
            end_points['ActionLabels'] = labels_action
            end_points['ActionLogits'] = logits
            tf.logging.info('PoseLogits shape is {}.'.format(
                pose_logits.get_shape().as_list()))

            gen_losses(labels_action, logits, cfg.TRAIN.LOSS_FN_ACTION,
                       dataset.num_classes, cfg.TRAIN.LOSS_FN_ACTION_WT,
                       labels_pose, pose_logits, cfg.TRAIN.LOSS_FN_POSE,
                       labels_pose_valid, cfg.TRAIN.LOSS_FN_POSE_WT,
                       end_points, cfg)

            return end_points

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)
        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by network_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

        # Add summaries for end_points.
        end_points = clones[0].outputs
        end_points_debug = end_points
        for end_point in end_points:
            activation = end_points[end_point]
            summaries.add(
                tf.summary.histogram('activations/' + end_point, activation))

        # Image summary of the inputs; inputs whose channel count is not
        # 1, 3 or 4 are summed over channels and rescaled first.
        sum_img = tf.concat(tf.unstack(end_points['Images']), axis=0)
        if sum_img.get_shape().as_list()[-1] not in [1, 3, 4]:
            sum_img = tf.reduce_sum(sum_img, axis=-1, keep_dims=True)
            sum_img = sum_img - tf.reduce_min(sum_img)
            sum_img = sum_img / (tf.reduce_max(sum_img) + cfg.EPS)
        summaries.add(tf.summary.image('images', sum_img))

        for epname in cfg.TRAIN.OTHER_IMG_SUMMARIES_TO_ADD:
            if epname in end_points:
                summaries.add(
                    tf.summary.image('image_vis/' + epname, end_points[epname]))

        summaries = summaries.union(
            _summarize_heatmaps('labels', end_points['PoseLabels'], sum_img))
        summaries = summaries.union(
            _summarize_heatmaps('pose', end_points['PoseLogits'], sum_img))
        if 'PoseLossMask' in end_points:
            summaries = summaries.union(
                _summarize_heatmaps('loss_mask/pose',
                                    end_points['PoseLossMask'], sum_img))

        # Add summaries for losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
            summaries.add(
                tf.summary.scalar(tensor=loss, name='losses/%s' % loss.op.name))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        # ---- Configure the moving averages ----
        moving_average_variables, variable_averages = None, None
        if cfg.TRAIN.MOVING_AVERAGE_VARIABLES:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                cfg.TRAIN.MOVING_AVERAGE_VARIABLES, global_step)

        # ---- Configure the optimization procedure ----
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = _configure_learning_rate(
                dataset.num_samples, num_clones, global_step)
            optimizer = _configure_optimizer(learning_rate)
            summaries.add(
                tf.summary.scalar(tensor=learning_rate, name='learning_rate'))

        # Variables to train.
        variables_to_train = _get_variables_to_train()
        tf.logging.info('Training the following variables: {}'.format(
            ', '.join([var.op.name for var in variables_to_train])))

        # Total loss and per-variable gradients across all clones.
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones, optimizer,
            var_list=variables_to_train,
            clip_gradients=cfg.TRAIN.CLIP_GRADIENTS)
        # Add total_loss to summary.
        summaries.add(tf.summary.scalar(tensor=total_loss, name='total_loss'))

        # Create gradient updates.
        train_ops = {}
        if cfg.TRAIN.ITER_SIZE == 1:
            grad_updates = optimizer.apply_gradients(
                clones_gradients, global_step=global_step)
            update_ops.append(grad_updates)

            update_op = tf.group(*update_ops)
            train_tensor = control_flow_ops.with_dependencies(
                [update_op], total_loss, name='train_op')
            train_ops = train_tensor
        else:
            with tf.name_scope('AccumulateGradients'):
                gvs = [(grad, var) for grad, var in clones_gradients]
                varnames = [var.name for _, var in gvs]
                varname_to_grad = {var.name: grad for grad, var in gvs}

                # One local accumulator variable per trained variable.
                varname_to_ref_grad = {}
                for vn in varnames:
                    grad = varname_to_grad[vn]
                    print("accumulating ... ", (vn, grad.get_shape()))
                    with tf.variable_scope("ref_grad"):
                        with tf.device(deploy_config.variables_device()):
                            ref_var = slim.local_variable(
                                np.zeros(grad.get_shape(), dtype=np.float32),
                                name=vn[:-2])
                            varname_to_ref_grad[vn] = ref_var

                all_assign_ref_op = [
                    ref.assign(varname_to_grad[vn])
                    for vn, ref in varname_to_ref_grad.items()
                ]
                all_assign_add_ref_op = [
                    ref.assign_add(varname_to_grad[vn])
                    for vn, ref in varname_to_ref_grad.items()
                ]
                assign_gradients_ref_op = tf.group(*all_assign_ref_op)
                accmulate_gradients_op = tf.group(*all_assign_add_ref_op)

                with tf.control_dependencies([accmulate_gradients_op]):
                    iter_size = float(cfg.TRAIN.ITER_SIZE)
                    final_gvs = [
                        (varname_to_ref_grad[var.name] / iter_size, var)
                        for _, var in gvs
                    ]
                    apply_gradients_op = optimizer.apply_gradients(
                        final_gvs, global_step=global_step)
                    update_ops.append(apply_gradients_op)
                    update_op = tf.group(*update_ops)
                    train_tensor = control_flow_ops.with_dependencies(
                        [update_op], total_loss, name='train_op')

                last_iter = cfg.TRAIN.ITER_SIZE - 1
                for i in range(cfg.TRAIN.ITER_SIZE):
                    if i == 0:
                        train_ops[i] = assign_gradients_ref_op
                    elif i < last_iter:
                        # apply_gradients also accumulates (see the control
                        # dependency above), so no extra iteration is needed.
                        train_ops[i] = accmulate_gradients_op
                    else:
                        train_ops[i] = train_tensor

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or
        # _gather_clone_loss().
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        config.intra_op_parallelism_threads = 4  # to avoid too many threads
        # The following seems optimal... though not sure
        config.inter_op_parallelism_threads = max(
            cfg.NUM_PREPROCESSING_THREADS, 12)

        # ---- Kicks off the training ----
        slim.learning.train(
            train_ops,
            train_step_fn=_train_step,
            logdir=train_dir,
            master='',
            is_chief=True,
            init_fn=_get_init_fn(train_dir),
            summary_op=summary_op,
            number_of_steps=cfg.TRAIN.MAX_NUMBER_OF_STEPS,
            log_every_n_steps=cfg.TRAIN.LOG_EVERY_N_STEPS,
            save_summaries_secs=cfg.TRAIN.SAVE_SUMMARIES_SECS,
            save_interval_secs=cfg.TRAIN.SAVE_INTERVAL_SECS,
            sync_optimizer=None,
            session_config=config)
def main(_):
    """Build the classification training graph and run the training loop.

    Every setting is read from ``FLAGS``. The function wires up the
    deployment config, the dataset provider, the batching and prefetch
    queues, the model clones with their softmax losses, summaries, optional
    moving averages, the optimizer and the train op, and finally calls
    ``learning.train`` with an additional tensor computed as one minus the
    streaming-accuracy update value.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``FLAGS.dataset_dir`` is not set.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        ######################
        # Config model_deploy#
        ######################
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=FLAGS.task,
            num_replicas=FLAGS.worker_replicas,
            num_ps_tasks=FLAGS.num_ps_tasks)

        # Create global_step
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)

        ######################
        # Select the network #
        ######################
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            weight_decay=FLAGS.weight_decay,
            is_training=True)

        #####################################
        # Select the preprocessing function #
        #####################################
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        with tf.device(deploy_config.inputs_device()):
            provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=FLAGS.num_readers,
                common_queue_capacity=20 * FLAGS.batch_size,
                common_queue_min=10 * FLAGS.batch_size)
            [image, label] = provider.get(['image', 'label'])
            label -= FLAGS.labels_offset

            train_image_size = (FLAGS.train_image_size
                                or network_fn.default_image_size)

            image = image_preprocessing_fn(image, train_image_size,
                                           train_image_size)

            images, labels = tf.train.batch(
                [image, label],
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size)
            labels = slim.one_hot_encoding(
                labels, dataset.num_classes - FLAGS.labels_offset)
            batch_queue = slim.prefetch_queue.prefetch_queue(
                [images, labels], capacity=2 * deploy_config.num_clones)

        ####################
        # Define the model #
        ####################
        def clone_fn(batch_queue):
            """Allows data parallelism by creating multiple clones of network_fn."""
            # The logits and labels are published as module-level globals so
            # the accuracy metric can be built after create_clones() returns.
            # NOTE(review): every call overwrites them, so presumably only the
            # last clone's tensors survive when num_clones > 1 -- confirm.
            global logits_global
            global labels_global

            images, labels = batch_queue.dequeue()
            logits, end_points = network_fn(images)
            logits_global = logits
            labels_global = labels

            #############################
            # Specify the loss function #
            #############################
            if 'AuxLogits' in end_points:
                print('auxlogits')
                slim.losses.softmax_cross_entropy(
                    end_points['AuxLogits'], labels,
                    label_smoothing=FLAGS.label_smoothing, weight=0.4,
                    scope='aux_loss')
            slim.losses.softmax_cross_entropy(
                logits, labels,
                label_smoothing=FLAGS.label_smoothing, weight=1.0)
            return end_points

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        clones = model_deploy.create_clones(deploy_config, clone_fn,
                                            [batch_queue])
        print(logits_global)
        print(labels_global)
        # Both tensors are reduced to class indices before being compared.
        predictions = tf.squeeze(tf.argmax(logits_global, 1))
        labels = tf.squeeze(tf.argmax(labels_global, 1))
        # Element [1] of the streaming metric is used; the value handed to the
        # trainer is therefore 1 - accuracy, despite the variable's name.
        accurancy = 1.0 - slim.metrics.streaming_accuracy(predictions,
                                                          labels)[1]
        first_clone_scope = deploy_config.clone_scope(0)
        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by network_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       first_clone_scope)

        # Add summaries for end_points.
        end_points = clones[0].outputs
        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.histogram_summary('activations/' + end_point, x))
            summaries.add(
                tf.scalar_summary('sparsity/' + end_point,
                                  tf.nn.zero_fraction(x)))

        # Add summaries for losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
            summaries.add(tf.scalar_summary('losses/%s' % loss.op.name, loss))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.histogram_summary(variable.op.name, variable))

        #################################
        # Configure the moving averages #
        #################################
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        #########################################
        # Configure the optimization procedure. #
        #########################################
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = _configure_learning_rate(dataset.num_samples,
                                                     global_step)
            optimizer = _configure_optimizer(learning_rate)
            summaries.add(
                tf.scalar_summary('learning_rate', learning_rate,
                                  name='learning_rate'))

        if FLAGS.sync_replicas:
            # If sync_replicas is enabled, the averaging will be done in the
            # chief queue runner.
            optimizer = tf.train.SyncReplicasOptimizer(
                opt=optimizer,
                replicas_to_aggregate=FLAGS.replicas_to_aggregate,
                variable_averages=variable_averages,
                variables_to_average=moving_average_variables,
                replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
                total_num_replicas=FLAGS.worker_replicas)
        elif FLAGS.moving_average_decay:
            # Update ops executed locally by trainer.
            update_ops.append(
                variable_averages.apply(moving_average_variables))

        # Variables to train.
        variables_to_train = _get_variables_to_train()

        # and returns a train_tensor and summary_op
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones, optimizer, var_list=variables_to_train)
        # Add total_loss to summary.
        summaries.add(
            tf.scalar_summary('total_loss', total_loss, name='total_loss'))

        # Create gradient updates.
        grad_updates = optimizer.apply_gradients(clones_gradients,
                                                 global_step=global_step)
        update_ops.append(grad_updates)

        update_op = tf.group(*update_ops)
        train_tensor = control_flow_ops.with_dependencies([update_op],
                                                          total_loss,
                                                          name='train_op')

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or
        # _gather_clone_loss().
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

        # Merge all summaries together.
        summary_op = tf.merge_summary(list(summaries), name='summary_op')

        ###########################
        # Kicks off the training. #
        ###########################
        loss = learning.train(
            train_tensor,
            logdir=FLAGS.train_dir,
            master=FLAGS.master,
            is_chief=(FLAGS.task == 0),
            init_fn=_get_init_fn(),
            summary_op=summary_op,
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            save_interval_secs=FLAGS.save_interval_secs,
            sync_optimizer=optimizer if FLAGS.sync_replicas else None,
            accurancies=accurancy)
        print('Training loss: ' + str(loss))
def main(_):
    """Assemble the slim classification training graph and start training.

    Configuration comes entirely from ``FLAGS``: deployment layout, dataset,
    network, preprocessing, input queues, losses, summaries, optional moving
    averages and quantization rewrite, optimizer, and the final call to
    ``slim.learning.train``.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``FLAGS.dataset_dir`` is not set.
    """
    if not FLAGS.dataset_dir:
        raise ValueError('You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)

    # Everything below is built inside a freshly created graph.
    with tf.Graph().as_default():
        #######################
        # Config model_deploy #  (configure model deployment via model_deploy)
        #######################
        # Example flag values noted by the original author:
        #   num_clones=1, clone_on_cpu=False, replica_id=0,
        #   worker_replicas=1, num_ps_tasks=0
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=FLAGS.task,
            num_replicas=FLAGS.worker_replicas,
            num_ps_tasks=FLAGS.num_ps_tasks)

        # Create the global step. tf.device() wraps Graph.device() for the
        # default graph and takes a device name or function, e.g.
        # device='/device:CPU:0'.
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        ######################
        # Select the dataset #
        ######################
        # Arguments as noted by the original author: name, split
        # ('train' or 'validation' or 'test'), directory (e.g. '/tmp/').
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
        num_classes = dataset.num_classes - FLAGS.labels_offset

        ######################
        # Select the network #
        ######################
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=num_classes,
            weight_decay=FLAGS.weight_decay,
            is_training=True)

        #####################################
        # Select the preprocessing function #
        #####################################
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            FLAGS.preprocessing_name or FLAGS.model_name,
            is_training=True)

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        with tf.device(deploy_config.inputs_device()):
            data_provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=FLAGS.num_readers,
                common_queue_capacity=20 * FLAGS.batch_size,
                common_queue_min=10 * FLAGS.batch_size)
            image, label = data_provider.get(['image', 'label'])
            label -= FLAGS.labels_offset

            side = FLAGS.train_image_size or network_fn.default_image_size
            image = image_preprocessing_fn(image, side, side)

            images, labels = tf.train.batch(
                [image, label],
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size)
            labels = slim.one_hot_encoding(labels, num_classes)
            batch_queue = slim.prefetch_queue.prefetch_queue(
                [images, labels], capacity=2 * deploy_config.num_clones)

        ####################
        # Define the model #
        ####################
        def clone_fn(batch_queue):
            """Allows data parallelism by creating multiple clones of network_fn."""
            batch_images, batch_labels = batch_queue.dequeue()
            logits, end_points = network_fn(batch_images)

            #############################
            # Specify the loss function #
            #############################
            if 'AuxLogits' in end_points:
                slim.losses.softmax_cross_entropy(
                    end_points['AuxLogits'], batch_labels,
                    label_smoothing=FLAGS.label_smoothing, weights=0.4,
                    scope='aux_loss')
            slim.losses.softmax_cross_entropy(
                logits, batch_labels,
                label_smoothing=FLAGS.label_smoothing, weights=1.0)
            return end_points

        # Summaries that exist before any clone is created.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)
        # Update ops of the first clone only; these contain, for example, the
        # batch_norm updates created by network_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

        # Activation histograms and sparsity for every end point.
        for name, activation in clones[0].outputs.items():
            summaries.add(tf.summary.histogram('activations/' + name, activation))
            summaries.add(tf.summary.scalar('sparsity/' + name,
                                            tf.nn.zero_fraction(activation)))

        # One scalar per loss of the first clone.
        for clone_loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
            summaries.add(
                tf.summary.scalar('losses/%s' % clone_loss.op.name, clone_loss))

        # One histogram per model variable.
        for model_var in slim.get_model_variables():
            summaries.add(tf.summary.histogram(model_var.op.name, model_var))

        #################################
        # Configure the moving averages #
        #################################
        moving_average_variables = None
        variable_averages = None
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)

        if FLAGS.quantize_delay >= 0:
            tf.contrib.quantize.create_training_graph(
                quant_delay=FLAGS.quantize_delay)

        #########################################
        # Configure the optimization procedure. #
        #########################################
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = _configure_learning_rate(dataset.num_samples, global_step)
            optimizer = _configure_optimizer(learning_rate)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        if FLAGS.sync_replicas:
            # With sync_replicas the averaging is done in the chief queue runner.
            optimizer = tf.train.SyncReplicasOptimizer(
                opt=optimizer,
                replicas_to_aggregate=FLAGS.replicas_to_aggregate,
                total_num_replicas=FLAGS.worker_replicas,
                variable_averages=variable_averages,
                variables_to_average=moving_average_variables)
        elif FLAGS.moving_average_decay:
            # Otherwise the trainer applies the moving averages locally.
            update_ops.append(variable_averages.apply(moving_average_variables))

        # Loss and gradients over all clones, restricted to the trainable set.
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones, optimizer, var_list=_get_variables_to_train())
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # Gradient application joins the other update ops.
        update_ops.append(
            optimizer.apply_gradients(clones_gradients, global_step=global_step))
        update_op = tf.group(*update_ops)
        with tf.control_dependencies([update_op]):
            train_tensor = tf.identity(total_loss, name='train_op')

        # Pick up summaries created inside the first clone scope by model_fn
        # and by optimize_clones() / _gather_clone_loss().
        summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                           first_clone_scope))
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        ###########################
        # Kicks off the training. #
        ###########################
        sync_optimizer = optimizer if FLAGS.sync_replicas else None
        slim.learning.train(
            train_tensor,
            logdir=FLAGS.train_dir,
            master=FLAGS.master,
            is_chief=(FLAGS.task == 0),
            init_fn=_get_init_fn(),
            summary_op=summary_op,
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            save_interval_secs=FLAGS.save_interval_secs,
            sync_optimizer=sync_optimizer)
def main(_):
    """Build the SSD training graph and run ``slim.learning.train``.

    Reads its configuration from ``FLAGS``: selects the visible CUDA devices,
    builds the dataset provider and input queues, the SSD network and its
    losses, summaries, optional moving averages and the train op, restores
    ``mobilenet_var_list`` from ``FLAGS.checkpoint_path`` and starts training.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``FLAGS.dataset_dir`` is not set.
    """
    # "-1" is used when training on CPU; otherwise the requested device string.
    os.environ["CUDA_VISIBLE_DEVICES"] = (
        "-1" if FLAGS.train_on_cpu else FLAGS.gpu_device)

    if not FLAGS.dataset_dir:
        raise ValueError(
            "You must supply the dataset directory with --dataset_dir.")

    tf.logging.set_verbosity(tf.logging.DEBUG)

    with tf.Graph().as_default():
        # select the dataset
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)

        # create global step, used for optimizer moving average decay
        with tf.device("/cpu:0"):
            global_step = tf.train.create_global_step()

        # get the ssd network and its anchors
        ssd_cls = ssd.SSDnet
        ssd_params = ssd_cls.default_params._replace(
            num_classes=FLAGS.num_classes)
        ssd_net = ssd_cls(ssd_params)
        image_size = ssd_net.params.img_shape
        ssd_anchors = ssd_net.anchors(img_shape=image_size)

        # select the preprocessing function
        preprocessing_name = FLAGS.preprocessing_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        tf_utils.print_configuration(FLAGS.__flags, ssd_params,
                                     dataset.data_sources, FLAGS.train_dir)

        # create a dataset provider and batches.
        with tf.device("/cpu:0"):
            with tf.name_scope(FLAGS.dataset_name + "_data_provider"):
                provider = slim.dataset_data_provider.DatasetDataProvider(
                    dataset,
                    num_readers=FLAGS.num_readers,
                    common_queue_capacity=20 * FLAGS.batch_size,
                    common_queue_min=10 * FLAGS.batch_size,
                    shuffle=True)

            # get for ssd network: image,labels,bboxes
            [image, shape, glabels, gbboxes] = provider.get(
                ["image", "shape", "object/label", "object/bbox"])

            # preprocessing
            image, glabels, gbboxes = image_preprocessing_fn(
                image, glabels, gbboxes,
                out_shape=image_size,
                data_format="NHWC")

            # encode groundtruth labels and bboxes
            gclasses, glocalisations, gscores = ssd_net.bboxes_encode(
                glabels, gbboxes, ssd_anchors)
            # One entry for the image plus len(ssd_anchors) entries for each
            # of the three encoded groundtruth lists.
            batch_shape = [1] + [len(ssd_anchors)] * 3

            # training batches and queue
            r = tf.train.batch(
                tf_utils.reshape_list(
                    [image, gclasses, glocalisations, gscores]),
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size)
            b_image, b_gclasses, b_glocalisations, b_gscores = \
                tf_utils.reshape_list(r, batch_shape)

            # prefetch queue
            batch_queue = slim.prefetch_queue.prefetch_queue(
                tf_utils.reshape_list(
                    [b_image, b_gclasses, b_glocalisations, b_gscores]),
                capacity=8)

        # dequeue batch
        b_image, b_gclasses, b_glocalisations, b_gscores = \
            tf_utils.reshape_list(batch_queue.dequeue(), batch_shape)

        # gather initial summaries
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        arg_scope = ssd_net.arg_scope(weight_decay=FLAGS.weight_decay)
        with slim.arg_scope(arg_scope):
            predictions, localisations, logits, end_points, mobilenet_var_list = \
                ssd_net.net(b_image, is_training=True)

        # add loss function
        ssd_net.losses(logits, localisations,
                       b_gclasses, b_glocalisations, b_gscores,
                       match_threshold=FLAGS.match_threshold,
                       negative_ratio=FLAGS.negative_ratio,
                       alpha=FLAGS.loss_alpha,
                       label_smoothing=FLAGS.label_smoothing)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        # add summaries for end_points
        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.summary.histogram("activations/" + end_point, x))
            summaries.add(
                tf.summary.scalar("sparsity/" + end_point,
                                  tf.nn.zero_fraction(x)))

        # add summaries for losses and extra losses
        for loss in tf.get_collection(tf.GraphKeys.LOSSES):
            summaries.add(tf.summary.scalar(loss.op.name, loss))
        for loss in tf.get_collection("EXTRA_LOSSES"):
            summaries.add(tf.summary.scalar(loss.op.name, loss))

        # add summaries for variables
        for var in slim.get_model_variables():
            summaries.add(tf.summary.histogram(var.op.name, var))

        # configure the moving averages
        if FLAGS.moving_average_decay:
            # use moving average decay on weights variables
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        # configure the optimization procedure
        with tf.device("/cpu:0"):
            learning_rate = tf_utils.configure_learning_rate(
                FLAGS, dataset.num_samples, global_step)
            optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate)
            summaries.add(tf.summary.scalar("learning_rate", learning_rate))

        if FLAGS.moving_average_decay:
            # update ops executed by trainer
            update_ops.append(
                variable_averages.apply(moving_average_variables))

        # get variables to train
        variables_to_train = tf_utils.get_variables_to_train(FLAGS)

        # return a train tensor and summary op
        total_losses = tf.get_collection(tf.GraphKeys.LOSSES)
        total_loss = tf.add_n(total_losses, name="total_loss")
        summaries.add(tf.summary.scalar("total_loss", total_loss))

        # create gradient updates
        grads = optimizer.compute_gradients(total_loss,
                                            var_list=variables_to_train)
        grad_updates = optimizer.apply_gradients(grads,
                                                 global_step=global_step)
        update_ops.append(grad_updates)

        # create train op
        update_op = tf.group(*update_ops)
        train_tensor = control_flow_ops.with_dependencies([update_op],
                                                          total_loss,
                                                          name="train_op")

        # merge all summaries together
        summary_op = tf.summary.merge(list(summaries), name="summary_op")

        # start training
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction,
            allow_growth=FLAGS.allow_growth)
        config = tf.ConfigProto(log_device_placement=False,
                                gpu_options=gpu_options)
        saver = tf.train.Saver(max_to_keep=2,
                               keep_checkpoint_every_n_hours=1.0,
                               write_version=2,
                               pad_step_number=False)

        # create initial assignment op
        init_assign_op, init_feed_dict = slim.assign_from_checkpoint(
            FLAGS.checkpoint_path,
            mobilenet_var_list,
            ignore_missing_vars=FLAGS.ignore_missing_vars)

        # Change the global_step to zero. The reset happens only for keys that
        # actually mention "global_step", so a checkpoint variable list
        # without one no longer fails on an undefined name.
        for placeholder in list(init_feed_dict):
            if "global_step" in placeholder.name:
                init_feed_dict[placeholder] = 0

        def init_fn(sess):
            """Run the checkpoint assignment op with its feed dict."""
            return sess.run(init_assign_op, init_feed_dict)

        # run training
        slim.learning.train(
            train_tensor,
            logdir=FLAGS.train_dir,
            init_fn=init_fn,
            summary_op=summary_op,
            number_of_steps=FLAGS.max_number_of_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            save_interval_secs=FLAGS.save_interval_secs,
            session_config=config,
            saver=saver,
        )
def get_from_tfrecord(dataset_name='pascalvoc_2007',
                      split_name='train',
                      dataset_dir=r'D:\Data\VOC\train'):
    """Return the dataset object produced by ``dataset_factory.get_dataset``.

    Called without arguments it requests the 'pascalvoc_2007' / 'train'
    dataset from the same Windows directory as before; the directory is now
    written as a raw string so it no longer relies on invalid escape
    sequences.

    Args:
        dataset_name: Name passed to ``dataset_factory.get_dataset``.
        split_name: Split name passed to ``dataset_factory.get_dataset``.
        dataset_dir: Directory passed to ``dataset_factory.get_dataset``.

    Returns:
        Whatever ``dataset_factory.get_dataset`` returns for these arguments.
    """
    return dataset_factory.get_dataset(dataset_name, split_name, dataset_dir)
def main(_):
    """Build the SSD evaluation graph and evaluate one or many checkpoints.

    Reads its configuration from ``FLAGS``: builds the dataset provider and
    evaluation batches, the SSD network with its losses, decodes and matches
    detections, registers streaming loss / TP-FP / average-precision metrics
    and summaries, then either evaluates a single checkpoint once or keeps
    polling ``FLAGS.checkpoint_path`` when ``FLAGS.wait_for_checkpoints`` is
    set.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``FLAGS.dataset_dir`` is not set, or if a checkpoint
            directory is given for a one-off evaluation but contains no
            checkpoint.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        tf_global_step = slim.get_or_create_global_step()

        def _publish_scalar(name, value, echo=False):
            """Add a scalar summary for ``value``; optionally tf.Print it."""
            op = tf.summary.scalar(name, value, collections=[])
            if echo:
                op = tf.Print(op, [value], name)
            tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

        # =================================================================== #
        # Dataset + SSD model + Pre-processing
        # =================================================================== #
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)

        # Get the SSD network and its anchors.
        ssd_class = nets_factory.get_network(FLAGS.model_name)
        ssd_params = ssd_class.default_params._replace(
            num_classes=FLAGS.num_classes)
        ssd_net = ssd_class(ssd_params)

        # Evaluation shape and associated anchors: eval_image_size
        ssd_shape = ssd_net.params.img_shape
        ssd_anchors = ssd_net.anchors(ssd_shape)

        # Select the preprocessing function.
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=False)

        tf_utils.print_configuration(FLAGS.__flags, ssd_params,
                                     dataset.data_sources, FLAGS.eval_dir)

        # =================================================================== #
        # Create a dataset provider and batches.
        # =================================================================== #
        with tf.device('/cpu:0'):
            with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
                provider = slim.dataset_data_provider.DatasetDataProvider(
                    dataset,
                    common_queue_capacity=2 * FLAGS.batch_size,
                    common_queue_min=FLAGS.batch_size,
                    shuffle=False)

            # Get for SSD network: image, labels, bboxes.
            [image, shape, glabels, gbboxes] = provider.get(
                ['image', 'shape', 'object/label', 'object/bbox'])
            if FLAGS.remove_difficult:
                [gdifficults] = provider.get(['object/difficult'])
            else:
                # No object is flagged as difficult.
                gdifficults = tf.zeros(tf.shape(glabels), dtype=tf.int64)

            # Pre-processing image, labels and bboxes.
            image, glabels, gbboxes, gbbox_img = image_preprocessing_fn(
                image, glabels, gbboxes,
                out_shape=ssd_shape,
                data_format=DATA_FORMAT,
                resize=FLAGS.eval_resize,
                difficults=None)

            # Encode groundtruth labels and bboxes.
            gclasses, glocalisations, gscores = ssd_net.bboxes_encode(
                glabels, gbboxes, ssd_anchors)
            # Five single tensors followed by three lists of
            # len(ssd_anchors) tensors each.
            batch_shape = [1] * 5 + [len(ssd_anchors)] * 3

            # Evaluation batch.
            r = tf.train.batch(
                tf_utils.reshape_list([
                    image, glabels, gbboxes, gdifficults, gbbox_img,
                    gclasses, glocalisations, gscores
                ]),
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size,
                dynamic_pad=True)
            (b_image, b_glabels, b_gbboxes, b_gdifficults, b_gbbox_img,
             b_gclasses, b_glocalisations, b_gscores) = tf_utils.reshape_list(
                 r, batch_shape)

        # =================================================================== #
        # SSD Network + Ouputs decoding.
        # =================================================================== #
        arg_scope = ssd_net.arg_scope(data_format=DATA_FORMAT)
        with slim.arg_scope(arg_scope):
            predictions, localisations, logits, end_points = \
                ssd_net.net(b_image, is_training=False)
        # Add losses functions.
        ssd_net.losses(logits, localisations,
                       b_gclasses, b_glocalisations, b_gscores)

        # Performing post-processing on CPU: loop-intensive, usually more
        # efficient.
        with tf.device('/device:CPU:0'):
            # Detected objects from SSD output.
            localisations = ssd_net.bboxes_decode(localisations, ssd_anchors)
            rscores, rbboxes = ssd_net.detected_bboxes(
                predictions, localisations,
                select_threshold=FLAGS.select_threshold,
                nms_threshold=FLAGS.nms_threshold,
                clipping_bbox=None,
                top_k=FLAGS.select_top_k,
                keep_top_k=FLAGS.keep_top_k)
            # Compute TP and FP statistics.
            num_gbboxes, tp, fp, rscores = tfe.bboxes_matching_batch(
                rscores.keys(), rscores, rbboxes,
                b_glabels, b_gbboxes, b_gdifficults,
                matching_threshold=FLAGS.matching_threshold)

        # Variables to restore: moving avg. or normal weights.
        if FLAGS.moving_average_decay:
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, tf_global_step)
            variables_to_restore = variable_averages.variables_to_restore(
                slim.get_model_variables())
            variables_to_restore[tf_global_step.op.name] = tf_global_step
        else:
            variables_to_restore = slim.get_variables_to_restore()

        # =================================================================== #
        # Evaluation metrics.
        # =================================================================== #
        with tf.device('/device:CPU:0'):
            dict_metrics = {}
            # First add all losses.
            for loss in tf.get_collection(tf.GraphKeys.LOSSES):
                dict_metrics[loss.op.name] = slim.metrics.streaming_mean(loss)
            # Extra losses as well.
            for loss in tf.get_collection('EXTRA_LOSSES'):
                dict_metrics[loss.op.name] = slim.metrics.streaming_mean(loss)

            # Add the loss metrics gathered so far to the summaries.
            for name, metric in dict_metrics.items():
                _publish_scalar(name, metric[0])

            # FP and TP metrics.
            tp_fp_metric = tfe.streaming_tp_fp_arrays(num_gbboxes, tp, fp,
                                                      rscores)
            for c in tp_fp_metric[0]:
                dict_metrics['tp_fp_%s' % c] = (tp_fp_metric[0][c],
                                                tp_fp_metric[1][c])

            # Add to summaries precision/recall values.
            aps_voc07 = {}
            aps_voc12 = {}
            for c in tp_fp_metric[0]:
                # Precison and recall values.
                prec, rec = tfe.precision_recall(*tp_fp_metric[0][c])

                # Average precision VOC07.
                v = tfe.average_precision_voc07(prec, rec)
                _publish_scalar('AP_VOC07/%s' % c, v)
                aps_voc07[c] = v

                # Average precision VOC12.
                v = tfe.average_precision_voc12(prec, rec)
                _publish_scalar('AP_VOC12/%s' % c, v)
                aps_voc12[c] = v

            # Mean average precision VOC07.
            mAP = tf.add_n(list(aps_voc07.values())) / len(aps_voc07)
            _publish_scalar('AP_VOC07/mAP', mAP, echo=True)

            # Mean average precision VOC12.
            mAP = tf.add_n(list(aps_voc12.values())) / len(aps_voc12)
            _publish_scalar('AP_VOC12/mAP', mAP, echo=True)

        # Split into values and updates ops.
        names_to_values, names_to_updates = slim.metrics.aggregate_metric_map(
            dict_metrics)

        # =================================================================== #
        # Evaluation loop.
        # =================================================================== #
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
        config = tf.ConfigProto(log_device_placement=False,
                                gpu_options=gpu_options)

        # Number of batches. math.ceil yields a float on Python 2, so the
        # result is converted to keep the evaluation count an integer.
        if FLAGS.max_num_batches:
            num_batches = FLAGS.max_num_batches
        else:
            num_batches = int(
                math.ceil(dataset.num_samples / float(FLAGS.batch_size)))

        if not FLAGS.wait_for_checkpoints:
            if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
                checkpoint_path = tf.train.latest_checkpoint(
                    FLAGS.checkpoint_path)
                if checkpoint_path is None:
                    raise ValueError('No checkpoint found in %s' %
                                     FLAGS.checkpoint_path)
            else:
                checkpoint_path = FLAGS.checkpoint_path
            tf.logging.info('Evaluating %s' % checkpoint_path)

            # Standard evaluation loop.
            start = time.time()
            slim.evaluation.evaluate_once(
                master=FLAGS.master,
                checkpoint_path=checkpoint_path,
                logdir=FLAGS.eval_dir,
                num_evals=num_batches,
                eval_op=list(names_to_updates.values()),
                variables_to_restore=variables_to_restore,
                session_config=config)
            # Log time spent.
            elapsed = time.time() - start
            print('Time spent : %.3f seconds.' % elapsed)
            print('Time spent per BATCH: %.3f seconds.' %
                  (elapsed / num_batches))
        else:
            checkpoint_path = FLAGS.checkpoint_path
            tf.logging.info('Evaluating %s' % checkpoint_path)

            # Waiting loop.
            slim.evaluation.evaluation_loop(
                master=FLAGS.master,
                checkpoint_dir=checkpoint_path,
                logdir=FLAGS.eval_dir,
                num_evals=num_batches,
                eval_op=list(names_to_updates.values()),
                variables_to_restore=variables_to_restore,
                eval_interval_secs=60,
                max_number_of_evaluations=np.inf,
                session_config=config,
                timeout=None)
def main(_):
    """Train 'mobilenet_v1' on the 'flowers' train split with TF-Slim.

    Uses a fixed single-clone deployment, an exponentially decayed learning
    rate with RMSProp, and hands the resulting train op to
    ``slim.learning.train``. Remaining settings come from ``FLAGS``.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``FLAGS.dataset_dir`` is not set.
    """
    if not FLAGS.dataset_dir:
        raise ValueError('You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        #######################
        # Config model_deploy #
        #######################
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=1,
            clone_on_cpu=False,
            replica_id=0,
            num_replicas=1,
            num_ps_tasks=0)

        # The global step lives on the variables device.
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(
            'flowers', 'train', FLAGS.dataset_dir)
        num_classes = dataset.num_classes - FLAGS.labels_offset

        ######################
        # Select the network #
        ######################
        network_fn = nets_factory.get_network_fn(
            'mobilenet_v1',
            num_classes=num_classes,
            weight_decay=FLAGS.weight_decay,
            is_training=True)

        #####################################
        # Select the preprocessing function #
        #####################################
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            'mobilenet_v1', is_training=True)

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        with tf.device(deploy_config.inputs_device()):
            data_provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=4,
                common_queue_capacity=20 * FLAGS.batch_size,
                common_queue_min=10 * FLAGS.batch_size)
            image, label = data_provider.get(['image', 'label'])
            label -= FLAGS.labels_offset

            side = network_fn.default_image_size
            image = image_preprocessing_fn(image, side, side)

            images, labels = tf.train.batch(
                [image, label],
                batch_size=FLAGS.batch_size,
                num_threads=4,
                capacity=5 * FLAGS.batch_size)
            labels = slim.one_hot_encoding(labels, num_classes)
            batch_queue = slim.prefetch_queue.prefetch_queue(
                [images, labels], capacity=2 * deploy_config.num_clones)

        ####################
        # Define the model #
        ####################
        def clone_fn(batch_queue):
            """Allows data parallelism by creating multiple clones of network_fn."""
            batch_images, batch_labels = batch_queue.dequeue()
            logits, end_points = network_fn(batch_images)

            #############################
            # Specify the loss function #
            #############################
            slim.losses.softmax_cross_entropy(
                logits, batch_labels,
                label_smoothing=FLAGS.label_smoothing, weights=1.0)
            return end_points

        # Summaries that exist before any clone is created.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)
        # Update ops of the first clone; these contain, for example, the
        # batch_norm updates created by network_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

        # Activation histograms and sparsity for every end point.
        for name, activation in clones[0].outputs.items():
            summaries.add(tf.summary.histogram('activations/' + name, activation))
            summaries.add(tf.summary.scalar('sparsity/' + name,
                                            tf.nn.zero_fraction(activation)))

        # One scalar per loss of the first clone.
        for clone_loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
            summaries.add(
                tf.summary.scalar('losses/%s' % clone_loss.op.name, clone_loss))

        # One histogram per model variable.
        for model_var in slim.get_model_variables():
            summaries.add(tf.summary.histogram(model_var.op.name, model_var))

        #########################################
        # Configure the optimization procedure. #
        #########################################
        with tf.device(deploy_config.optimizer_device()):
            num_epochs_per_decay = 2.5
            steps_per_epoch = dataset.num_samples / FLAGS.batch_size
            decay_steps = int(steps_per_epoch * num_epochs_per_decay)
            learning_rate = tf.train.exponential_decay(
                FLAGS.learning_rate,
                global_step,
                decay_steps,
                _LEARNING_RATE_DECAY_FACTOR,
                staircase=True,
                name='exponential_decay_learning_rate')
            optimizer = tf.train.RMSPropOptimizer(
                learning_rate,
                decay=FLAGS.rmsprop_decay,
                momentum=FLAGS.rmsprop_momentum,
                epsilon=FLAGS.opt_epsilon)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        # Loss and gradients over all clones, restricted to the trainable set.
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones, optimizer, var_list=_get_variables_to_train())
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # Gradient application joins the other update ops.
        update_ops.append(
            optimizer.apply_gradients(clones_gradients, global_step=global_step))
        update_op = tf.group(*update_ops)
        with tf.control_dependencies([update_op]):
            train_tensor = tf.identity(total_loss, name='train_op')

        # Pick up summaries created inside the first clone scope by model_fn
        # and by optimize_clones() / _gather_clone_loss().
        summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                           first_clone_scope))
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)

        ###########################
        # Kicks off the training. #
        ###########################
        slim.learning.train(
            train_tensor,
            logdir=FLAGS.train_dir,
            master=FLAGS.master,
            is_chief=True,
            session_config=session_config,
            init_fn=_get_init_fn(),
            summary_op=summary_op,
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=10,
            save_summaries_secs=300,
            save_interval_secs=300,
            # No synchronous optimizer is ever used by this script.
            sync_optimizer=None)
def main(opt):
    """Train the model selected by ``opt`` with PaddlePaddle.

    Reads the dataset description (JSON) from ``opt.data_cfg``, builds the
    dataset, model, data loader and trainer, optionally restores weights from
    ``opt.load_model``, then runs epochs ``start_epoch + 1 .. opt.num_epochs``
    while writing checkpoints into ``opt.save_dir``.

    Args:
        opt: Parsed option object. Attributes read here include ``seed``,
            ``dataset``, ``task``, ``data_cfg``, ``gpus_str``, ``gpus``,
            ``chunk_sizes``, ``arch``, ``heads``, ``head_conv``,
            ``batch_size``, ``num_workers``, ``load_model``, ``resume``,
            ``lr``, ``lr_step``, ``num_epochs``, ``save_all``,
            ``val_intervals`` and ``save_dir``.
    """
    paddle.seed(opt.seed)

    print('Setting up data...')
    Dataset = get_dataset(opt.dataset, opt.task)
    # A context manager guarantees the config file is closed even when the
    # JSON is malformed (the previous open()/close() pair leaked the handle).
    with open(opt.data_cfg) as f:
        data_config = json.load(f)
    trainset_paths = data_config['train']
    dataset_root = data_config['root']
    transforms = T.Compose([T.ToTensor()])
    dataset = Dataset(opt, dataset_root, trainset_paths, (1088, 608),
                      augment=True, transforms=transforms)
    opt = opts().update_dataset_info_and_set_heads(opt, dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = paddle.get_device()

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    start_epoch = 0

    # Get dataloader
    train_loader = DataLoader(dataset,
                              batch_size=opt.batch_size,
                              shuffle=True,
                              num_workers=opt.num_workers,
                              use_shared_memory=False,
                              drop_last=True)

    print('Starting training...')
    Trainer = train_factory[opt.task]
    # The trainer creates its own optimizer and classifier (see
    # base_trainer.py); they are fetched here so they can be passed to
    # load_model() / save_model() below.
    trainer = Trainer(opt, model)
    optimizer = trainer.optimizer
    id_classifier = trainer.loss.classifier
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    if 'fairmot_hrnet_w18' in opt.load_model:
        # This checkpoint is loaded into the model only; optimizer, start
        # epoch and classifier keep their freshly-created values.
        model = load_model(model, opt.load_model)
    elif opt.load_model != '':
        model, optimizer, start_epoch, id_classifier = load_model(
            model, opt.load_model, trainer.optimizer, trainer.loss.classifier,
            opt.resume, opt.lr, opt.lr_step)

    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))

        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(
                os.path.join(opt.save_dir, 'model_{}.pdparams'.format(mark)),
                epoch, model, optimizer, id_classifier)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pdparams'),
                       epoch, model, optimizer, id_classifier)
        logger.write('\n')

        if epoch in opt.lr_step:
            # Snapshot the weights, then drop the learning rate by another
            # factor of 10 for each milestone reached so far.
            save_model(
                os.path.join(opt.save_dir, 'model_{}.pdparams'.format(epoch)),
                epoch, model, optimizer, id_classifier)
            lr = opt.lr * (0.1**(opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            optimizer.set_lr(lr)

        if epoch % 5 == 0 or epoch >= 25:
            save_model(
                os.path.join(opt.save_dir, 'model_{}.pdparams'.format(epoch)),
                epoch, model, optimizer, id_classifier)
    logger.close()
def main(_):
    """Visual sanity check of the ground truth produced by preprocessing.

    Builds the input pipeline for ``FLAGS.dataset_name``, runs the SSD/VGG
    preprocessing, fetches two samples and draws the ground truth of each
    sample three ways on separate copies of the image: the axis-aligned
    bboxes, the four-point polygons, and the min-area rectangles computed by
    ``tfe_seglink.tf_min_area_rect``. The three drawings are handed to
    ``util.sit`` and the returned values are printed.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``--dataset_dir`` is not set.
    """
    # Local import, as in the original code, so the module stays importable
    # without OpenCV.
    import cv2

    if not FLAGS.dataset_dir:
        raise ValueError('You must supply the dataset directory with --dataset_dir')

    # OpenCV 3+ exposes boxPoints at module level; OpenCV 2 only has
    # cv2.cv.BoxPoints. Support both.
    if hasattr(cv2, 'boxPoints'):
        box_points = cv2.boxPoints
    else:
        box_points = cv2.cv.BoxPoints

    tf.logging.set_verbosity(tf.logging.DEBUG)
    batch_size = FLAGS.batch_size
    with tf.Graph().as_default():
        # Select the dataset.
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

        util.proc.set_proc_name(FLAGS.model_name + '_' + FLAGS.dataset_name)

        # =================================================================== #
        # Create a dataset provider and batches.
        # =================================================================== #
        with tf.device('/cpu:0'):
            with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
                provider = slim.dataset_data_provider.DatasetDataProvider(
                    dataset,
                    num_readers=FLAGS.num_readers,
                    common_queue_capacity=20 * batch_size,
                    common_queue_min=10 * batch_size,
                    shuffle=True)
            # Get for SSD network: image, labels, bboxes.
            [image, shape, gignored, gbboxes, x1, x2, x3, x4, y1, y2, y3, y4] = provider.get([
                'image', 'shape',
                'object/ignored',
                'object/bbox',
                'object/oriented_bbox/x1',
                'object/oriented_bbox/x2',
                'object/oriented_bbox/x3',
                'object/oriented_bbox/x4',
                'object/oriented_bbox/y1',
                'object/oriented_bbox/y2',
                'object/oriented_bbox/y3',
                'object/oriented_bbox/y4'
            ])
            gxs = tf.transpose(tf.stack([x1, x2, x3, x4]))  # shape = (N, 4)
            gys = tf.transpose(tf.stack([y1, y2, y3, y4]))
            image = tf.identity(image, 'input_image')

            # Pre-processing image, labels and bboxes.
            image_shape = (FLAGS.train_image_size, FLAGS.train_image_size)
            image, gignored, gbboxes, gxs, gys = \
                ssd_vgg_preprocessing.preprocess_image(image, gignored, gbboxes, gxs, gys,
                                                       out_shape=image_shape,
                                                       is_training=True)
            # Scale the polygon coordinates by the output width / height.
            gxs = gxs * tf.cast(image_shape[1], gxs.dtype)
            gys = gys * tf.cast(image_shape[0], gys.dtype)
            gorbboxes = tfe_seglink.tf_min_area_rect(gxs, gys)
            image = tf.identity(image, 'processed_image')

        with tf.Session() as sess:
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            try:
                for _sample in range(2):
                    image_data, label_data, bbox_data, xs_data, ys_data, orbboxes = \
                        sess.run([image, gignored, gbboxes, gxs, gys, gorbboxes])
                    # NOTE(review): presumably restores the per-channel means
                    # subtracted during preprocessing -- confirm.
                    image_data = image_data + [123., 117., 104.]
                    image_data = np.asarray(image_data, np.uint8)
                    h, w = image_data.shape[0:-1]
                    bbox_data = bbox_data * [h, w, h, w]
                    I_bbox = image_data.copy()
                    I_xys = image_data.copy()
                    I_orbbox = image_data.copy()

                    for idx in range(bbox_data.shape[0]):
                        # Axis-aligned bbox, stored as (ymin, xmin, ymax, xmax).
                        ymin, xmin, ymax, xmax = bbox_data[idx, :]
                        util.img.rectangle(I_bbox, (xmin, ymin), (xmax, ymax),
                                           color=util.img.COLOR_WHITE)

                        # Four-point polygon. Materialise the pairs: zip() is
                        # a one-shot iterator on Python 3.
                        points = list(zip(xs_data[idx, :], ys_data[idx, :]))
                        cnts = util.img.points_to_contours(points)
                        util.img.draw_contours(I_xys, cnts, -1,
                                               color=util.img.COLOR_GREEN)

                        # Min-area rectangle: elements 0-1, 2-3 and 4 are
                        # passed as the three parts of an OpenCV rotated rect.
                        orbox = orbboxes[idx, :]
                        rect = ((orbox[0], orbox[1]), (orbox[2], orbox[3]), orbox[4])
                        # np.intp is what the removed alias np.int0 referred to.
                        box = np.intp(box_points(rect))
                        cv2.drawContours(I_orbbox, [box], 0,
                                         util.img.COLOR_RGB_RED, 1)

                    # Single-argument print(...) is valid on Python 2 and 3.
                    print(util.sit(I_bbox))
                    print(util.sit(I_xys))
                    print(util.sit(I_orbbox))
                    print('check the images and make sure that bboxes in difference colors are the same.')
            finally:
                # Always stop the queue-runner threads, even if a step failed.
                coord.request_stop()
                coord.join(threads)
def main(_):
    """Build the SSD training graph and run training with TF-Slim.

    Steps, in order: create the deployment config and global step, load the
    dataset, instantiate the SSD network and its anchors, build the queued
    input pipeline (preprocessing, ground-truth encoding, batching), create
    the model clones with their losses, collect summaries, configure the
    optimizer and optional moving averages, choose the checkpoint init
    function, and finally call ``slim.learning.train``.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``--dataset_dir`` is not set, or if ``--DSSD_FLAG`` is
            set and no checkpoint state is found under ``--checkpoint_path``.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.DEBUG)
    with tf.Graph().as_default():
        # Config model_deploy. Keep TF Slim Models structure.
        # Useful if want to need multiple GPUs and/or servers in the future.
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=0,
            num_replicas=1,
            num_ps_tasks=0)
        # Create global_step on the device chosen for variables.
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        # Select the dataset.
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)

        # Get the SSD network and its anchors.
        ssd_class = nets_factory.get_network(FLAGS.model_name)
        ssd_params = ssd_class.default_params._replace(
            num_classes=FLAGS.num_classes)
        ssd_net = ssd_class(ssd_params)
        ssd_shape = ssd_net.params.img_shape
        # Anchors for every feature map of the network.
        ssd_anchors = ssd_net.anchors(ssd_shape)

        # Select the preprocessing function.
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        tf_utils.print_configuration(FLAGS.__flags, ssd_params,
                                     dataset.data_sources, FLAGS.train_dir)
        # =================================================================== #
        # Create a dataset provider and batches.
        # =================================================================== #
        with tf.device(deploy_config.inputs_device()):
            with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
                provider = slim.dataset_data_provider.DatasetDataProvider(
                    dataset,
                    num_readers=FLAGS.num_readers,
                    common_queue_capacity=20 * FLAGS.batch_size,
                    common_queue_min=10 * FLAGS.batch_size,
                    shuffle=True)
            # Get for SSD network: image, labels, bboxes.
            [image, glabels, gbboxes] = provider.get(['image',
                                                      'object/label',
                                                      'object/bbox'])
            # Pre-processing image, labels and bboxes.
            image, glabels, gbboxes = image_preprocessing_fn(
                image, glabels, gbboxes,
                out_shape=ssd_shape,
                data_format=DATA_FORMAT)
            # Encode groundtruth labels and bboxes against the anchors.
            gclasses, glocalisations, gscores = ssd_net.bboxes_encode(
                glabels, gbboxes, ssd_anchors)
            # One image tensor followed by three groups (classes,
            # localisations, scores) of len(ssd_anchors) tensors each; used
            # to regroup the flat lists that travel through the queues.
            batch_shape = [1] + [len(ssd_anchors)] * 3

            # Training batches and queue.
            r = tf.train.batch(tf_utils.reshape_list(
                [image, gclasses, glocalisations, gscores]),
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size)
            b_image, b_gclasses, b_glocalisations, b_gscores = \
                tf_utils.reshape_list(r, batch_shape)

            # Intermediate queueing: unique batch computation pipeline for all
            # GPUs running the training.
            batch_queue = slim.prefetch_queue.prefetch_queue(
                tf_utils.reshape_list(
                    [b_image, b_gclasses, b_glocalisations, b_gscores]),
                capacity=2 * deploy_config.num_clones)

        # =================================================================== #
        # Define the model running on every GPU.
        # =================================================================== #
        def clone_fn(batch_queue):
            """Allows data parallelism by creating multiple clones of network_fn."""
            # Dequeue batch.
            b_image, b_gclasses, b_glocalisations, b_gscores = \
                tf_utils.reshape_list(batch_queue.dequeue(), batch_shape)

            # Construct SSD network.
            arg_scope = ssd_net.arg_scope(weight_decay=FLAGS.weight_decay,
                                          data_format=DATA_FORMAT)
            with slim.arg_scope(arg_scope):
                predictions, localisations, logits, end_points = \
                    ssd_net.net(b_image, is_training=True,
                                DSSD_FLAG=FLAGS.DSSD_FLAG)
            # Add loss function.
            ssd_net.losses(logits, localisations,
                           b_gclasses, b_glocalisations, b_gscores,
                           match_threshold=FLAGS.match_threshold,
                           negative_ratio=FLAGS.negative_ratio,
                           alpha=FLAGS.loss_alpha,
                           label_smoothing=FLAGS.label_smoothing)
            return end_points

        # Gather initial summaries (must happen before the clones are built
        # so that only pre-existing summaries are captured here).
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # =================================================================== #
        # Add summaries from first clone.
        # =================================================================== #
        clones = model_deploy.create_clones(deploy_config, clone_fn,
                                            [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)
        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by network_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       first_clone_scope)

        # Add summaries for end_points.
        end_points = clones[0].outputs
        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.summary.histogram('activations/' + end_point, x))
            summaries.add(
                tf.summary.scalar('sparsity/' + end_point,
                                  tf.nn.zero_fraction(x)))
        # Add summaries for losses and extra losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
            summaries.add(tf.summary.scalar(loss.op.name, loss))
        for loss in tf.get_collection('EXTRA_LOSSES', first_clone_scope):
            summaries.add(tf.summary.scalar(loss.op.name, loss))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        # =================================================================== #
        # Configure the moving averages.
        # =================================================================== #
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        # =================================================================== #
        # Configure the optimization procedure.
        # =================================================================== #
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = tf_utils.configure_learning_rate(
                FLAGS, dataset.num_samples, global_step)
            optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        if FLAGS.moving_average_decay:
            # Update ops executed locally by trainer.
            update_ops.append(
                variable_averages.apply(moving_average_variables))

        # Variables to train.
        variables_to_train = tf_utils.get_variables_to_train(FLAGS)

        # and returns a train_tensor and summary_op
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones, optimizer, var_list=variables_to_train)
        # Add total_loss to summary.
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # Create gradient updates.
        grad_updates = optimizer.apply_gradients(clones_gradients,
                                                 global_step=global_step)
        update_ops.append(grad_updates)
        update_op = tf.group(*update_ops)
        train_tensor = control_flow_ops.with_dependencies([update_op],
                                                          total_loss,
                                                          name='train_op')

        # Add the summaries from the first clone. These contain the summaries
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))
        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        # =================================================================== #
        # Kicks off the training.
        # =================================================================== #
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
        config = tf.ConfigProto(log_device_placement=False,
                                gpu_options=gpu_options)
        saver = tf.train.Saver(max_to_keep=5,
                               keep_checkpoint_every_n_hours=1.0,
                               write_version=2,
                               pad_step_number=False)

        if FLAGS.DSSD_FLAG:
            ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_path)
            # get_checkpoint_state() returns None when it finds no checkpoint
            # state; fail with a clear message instead of an AttributeError
            # on `ckpt.model_checkpoint_path` below.
            if ckpt is None:
                raise ValueError(
                    'No checkpoint found in %s' % FLAGS.checkpoint_path)
            # Names having "_box" at character offset 18 or 19 are passed as
            # the exclusion list, so those variables are not restored.
            # tf.global_variables() replaces the deprecated tf.all_variables().
            variables_to_restore = [
                var.name for var in tf.global_variables()
                if var.name.startswith("_box", 18)
                or var.name.startswith("_box", 19)
            ]
            variables_to_restore = slim.get_variables_to_restore(
                exclude=variables_to_restore)
            init_fn = slim.assign_from_checkpoint_fn(
                ckpt.model_checkpoint_path,
                variables_to_restore,
                ignore_missing_vars=True,
                reshape_variables=False)
        else:
            init_fn = tf_utils.get_init_fn(FLAGS)

        slim.learning.train(train_tensor,
                            logdir=FLAGS.train_dir,
                            master='',
                            is_chief=True,
                            init_fn=init_fn,
                            summary_op=summary_op,
                            number_of_steps=FLAGS.max_number_of_steps,
                            log_every_n_steps=FLAGS.log_every_n_steps,
                            save_summaries_secs=FLAGS.save_summaries_secs,
                            saver=saver,
                            save_interval_secs=FLAGS.save_interval_secs,
                            session_config=config,
                            sync_optimizer=None)
def main_fun(argv, ctx):
    """Per-node entry point that trains a TF-Slim image classifier.

    Defines the command-line flags, overrides some of them from ``ctx``,
    starts a cluster server through ``TFNode.start_cluster_server`` and then
    either joins the server (``ps`` job) or builds the training graph and
    runs ``slim.learning.train`` (any other job).

    Args:
        argv: Argument list; it is assigned to ``sys.argv`` before the flags
            are read.
        ctx: Node context. The code reads ``job_name``, ``task_index``,
            ``cluster_spec`` and ``worker_num`` from it.

    Raises:
        ValueError: On a worker, if ``--dataset_dir`` is not set, or if the
            learning-rate decay type or optimizer name is not recognized.
    """
    import tensorflow as tf
    from tensorflow.python.ops import control_flow_ops
    from datasets import dataset_factory
    from deployment import model_deploy
    from nets import nets_factory
    from preprocessing import preprocessing_factory

    sys.argv = argv

    slim = tf.contrib.slim

    # The default is the integer 1 (it used to be the string '1').
    tf.app.flags.DEFINE_integer(
        'num_gpus', 1, 'The number of GPUs to use per node')
    tf.app.flags.DEFINE_boolean('rdma', False, 'Whether to use rdma.')
    tf.app.flags.DEFINE_string(
        'master', '', 'The address of the TensorFlow master to use.')
    tf.app.flags.DEFINE_string(
        'train_dir', '/tmp/tfmodel/',
        'Directory where checkpoints and event logs are written to.')
    tf.app.flags.DEFINE_integer('num_clones', 1,
                                'Number of model clones to deploy.')
    tf.app.flags.DEFINE_boolean('clone_on_cpu', False,
                                'Use CPUs to deploy clones.')
    tf.app.flags.DEFINE_integer('worker_replicas', 1,
                                'Number of worker replicas.')
    tf.app.flags.DEFINE_integer(
        'num_ps_tasks', 0,
        'The number of parameter servers. If the value is 0, then the parameters '
        'are handled locally by the worker.')
    tf.app.flags.DEFINE_integer(
        'num_readers', 4,
        'The number of parallel readers that read data from the dataset.')
    tf.app.flags.DEFINE_integer(
        'num_preprocessing_threads', 4,
        'The number of threads used to create the batches.')
    tf.app.flags.DEFINE_integer(
        'log_every_n_steps', 10,
        'The frequency with which logs are print.')
    tf.app.flags.DEFINE_integer(
        'save_summaries_secs', 600,
        'The frequency with which summaries are saved, in seconds.')
    tf.app.flags.DEFINE_integer(
        'save_interval_secs', 600,
        'The frequency with which the model is saved, in seconds.')
    tf.app.flags.DEFINE_integer(
        'task', 0, 'Task id of the replica running the training.')

    ######################
    # Optimization Flags #
    ######################

    tf.app.flags.DEFINE_float(
        'weight_decay', 0.00004, 'The weight decay on the model weights.')
    tf.app.flags.DEFINE_string(
        'optimizer', 'rmsprop',
        'The name of the optimizer, one of "adadelta", "adagrad", "adam",'
        '"ftrl", "momentum", "sgd" or "rmsprop".')
    tf.app.flags.DEFINE_float(
        'adadelta_rho', 0.95,
        'The decay rate for adadelta.')
    tf.app.flags.DEFINE_float(
        'adagrad_initial_accumulator_value', 0.1,
        'Starting value for the AdaGrad accumulators.')
    tf.app.flags.DEFINE_float(
        'adam_beta1', 0.9,
        'The exponential decay rate for the 1st moment estimates.')
    tf.app.flags.DEFINE_float(
        'adam_beta2', 0.999,
        'The exponential decay rate for the 2nd moment estimates.')
    tf.app.flags.DEFINE_float('opt_epsilon', 1.0,
                              'Epsilon term for the optimizer.')
    tf.app.flags.DEFINE_float('ftrl_learning_rate_power', -0.5,
                              'The learning rate power.')
    tf.app.flags.DEFINE_float(
        'ftrl_initial_accumulator_value', 0.1,
        'Starting value for the FTRL accumulators.')
    tf.app.flags.DEFINE_float(
        'ftrl_l1', 0.0, 'The FTRL l1 regularization strength.')
    tf.app.flags.DEFINE_float(
        'ftrl_l2', 0.0, 'The FTRL l2 regularization strength.')
    tf.app.flags.DEFINE_float(
        'momentum', 0.9,
        'The momentum for the MomentumOptimizer and RMSPropOptimizer.')
    tf.app.flags.DEFINE_float('rmsprop_decay', 0.9, 'Decay term for RMSProp.')

    #######################
    # Learning Rate Flags #
    #######################

    tf.app.flags.DEFINE_string(
        'learning_rate_decay_type', 'exponential',
        'Specifies how the learning rate is decayed. One of "fixed", "exponential",'
        ' or "polynomial"')
    tf.app.flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
    tf.app.flags.DEFINE_float(
        'end_learning_rate', 0.0001,
        'The minimal end learning rate used by a polynomial decay learning rate.')
    tf.app.flags.DEFINE_float(
        'label_smoothing', 0.0, 'The amount of label smoothing.')
    tf.app.flags.DEFINE_float(
        'learning_rate_decay_factor', 0.94, 'Learning rate decay factor.')
    tf.app.flags.DEFINE_float(
        'num_epochs_per_decay', 2.0,
        'Number of epochs after which learning rate decays.')
    tf.app.flags.DEFINE_bool(
        'sync_replicas', False,
        'Whether or not to synchronize the replicas during training.')
    tf.app.flags.DEFINE_integer(
        'replicas_to_aggregate', 1,
        'The Number of gradients to collect before updating params.')
    tf.app.flags.DEFINE_float(
        'moving_average_decay', None,
        'The decay to use for the moving average.'
        'If left as None, then moving averages are not used.')

    #######################
    # Dataset Flags #
    #######################

    tf.app.flags.DEFINE_string(
        'dataset_name', 'imagenet', 'The name of the dataset to load.')
    tf.app.flags.DEFINE_string(
        'dataset_split_name', 'train', 'The name of the train/test split.')
    tf.app.flags.DEFINE_string(
        'dataset_dir', None,
        'The directory where the dataset files are stored.')
    tf.app.flags.DEFINE_integer(
        'labels_offset', 0,
        'An offset for the labels in the dataset. This flag is primarily used to '
        'evaluate the VGG and ResNet architectures which do not use a background '
        'class for the ImageNet dataset.')
    tf.app.flags.DEFINE_string(
        'model_name', 'inception_v3', 'The name of the architecture to train.')
    tf.app.flags.DEFINE_string(
        'preprocessing_name', None,
        'The name of the preprocessing to use. If left '
        'as `None`, then the model_name flag is used.')
    tf.app.flags.DEFINE_integer(
        'batch_size', 32, 'The number of samples in each batch.')
    tf.app.flags.DEFINE_integer(
        'train_image_size', None, 'Train image size')
    tf.app.flags.DEFINE_integer('max_number_of_steps', None,
                                'The maximum number of training steps.')

    #####################
    # Fine-Tuning Flags #
    #####################

    tf.app.flags.DEFINE_string(
        'checkpoint_path', None,
        'The path to a checkpoint from which to fine-tune.')
    tf.app.flags.DEFINE_string(
        'checkpoint_exclude_scopes', None,
        'Comma-separated list of scopes of variables to exclude when restoring '
        'from a checkpoint.')
    tf.app.flags.DEFINE_string(
        'trainable_scopes', None,
        'Comma-separated list of scopes to filter the set of variables to train.'
        'By default, None would train all the variables.')
    tf.app.flags.DEFINE_boolean(
        'ignore_missing_vars', False,
        'When restoring a checkpoint would ignore missing variables.')

    FLAGS = tf.app.flags.FLAGS
    # Values supplied by the node context take precedence over the flags.
    FLAGS.job_name = ctx.job_name
    FLAGS.task = ctx.task_index
    FLAGS.num_clones = FLAGS.num_gpus
    FLAGS.worker_replicas = len(ctx.cluster_spec['worker'])

    assert(FLAGS.num_ps_tasks == (len(ctx.cluster_spec['ps']) if 'ps' in ctx.cluster_spec else 0))

    def _configure_learning_rate(num_samples_per_epoch, global_step):
        """Configures the learning rate.

        Args:
          num_samples_per_epoch: The number of samples in each epoch of training.
          global_step: The global_step tensor.

        Returns:
          A `Tensor` representing the learning rate.

        Raises:
          ValueError: if FLAGS.learning_rate_decay_type is not recognized.
        """
        decay_steps = int(num_samples_per_epoch / FLAGS.batch_size *
                          FLAGS.num_epochs_per_decay)
        if FLAGS.sync_replicas:
            decay_steps /= FLAGS.replicas_to_aggregate

        if FLAGS.learning_rate_decay_type == 'exponential':
            return tf.train.exponential_decay(FLAGS.learning_rate,
                                              global_step,
                                              decay_steps,
                                              FLAGS.learning_rate_decay_factor,
                                              staircase=True,
                                              name='exponential_decay_learning_rate')
        elif FLAGS.learning_rate_decay_type == 'fixed':
            return tf.constant(FLAGS.learning_rate, name='fixed_learning_rate')
        elif FLAGS.learning_rate_decay_type == 'polynomial':
            return tf.train.polynomial_decay(FLAGS.learning_rate,
                                             global_step,
                                             decay_steps,
                                             FLAGS.end_learning_rate,
                                             power=1.0,
                                             cycle=False,
                                             name='polynomial_decay_learning_rate')
        else:
            # Interpolate the value into the message; passing it as a second
            # argument left a raw '%s' in the error text.
            raise ValueError(
                'learning_rate_decay_type [%s] was not recognized'
                % FLAGS.learning_rate_decay_type)

    def _configure_optimizer(learning_rate):
        """Configures the optimizer used for training.

        Args:
          learning_rate: A scalar or `Tensor` learning rate.

        Returns:
          An instance of an optimizer.

        Raises:
          ValueError: if FLAGS.optimizer is not recognized.
        """
        if FLAGS.optimizer == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(
                learning_rate,
                rho=FLAGS.adadelta_rho,
                epsilon=FLAGS.opt_epsilon)
        elif FLAGS.optimizer == 'adagrad':
            optimizer = tf.train.AdagradOptimizer(
                learning_rate,
                initial_accumulator_value=FLAGS.adagrad_initial_accumulator_value)
        elif FLAGS.optimizer == 'adam':
            optimizer = tf.train.AdamOptimizer(
                learning_rate,
                beta1=FLAGS.adam_beta1,
                beta2=FLAGS.adam_beta2,
                epsilon=FLAGS.opt_epsilon)
        elif FLAGS.optimizer == 'ftrl':
            optimizer = tf.train.FtrlOptimizer(
                learning_rate,
                learning_rate_power=FLAGS.ftrl_learning_rate_power,
                initial_accumulator_value=FLAGS.ftrl_initial_accumulator_value,
                l1_regularization_strength=FLAGS.ftrl_l1,
                l2_regularization_strength=FLAGS.ftrl_l2)
        elif FLAGS.optimizer == 'momentum':
            optimizer = tf.train.MomentumOptimizer(
                learning_rate,
                momentum=FLAGS.momentum,
                name='Momentum')
        elif FLAGS.optimizer == 'rmsprop':
            optimizer = tf.train.RMSPropOptimizer(
                learning_rate,
                decay=FLAGS.rmsprop_decay,
                momentum=FLAGS.momentum,
                epsilon=FLAGS.opt_epsilon)
        elif FLAGS.optimizer == 'sgd':
            optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        else:
            # Same message fix as in _configure_learning_rate.
            raise ValueError(
                'Optimizer [%s] was not recognized' % FLAGS.optimizer)
        return optimizer

    def _add_variables_summaries(learning_rate):
        """Returns histogram summaries of the model variables plus the learning rate."""
        summaries = []
        for variable in slim.get_model_variables():
            summaries.append(tf.summary.histogram(variable.op.name, variable))
        summaries.append(tf.summary.scalar('training/Learning Rate', learning_rate))
        return summaries

    def _get_init_fn():
        """Returns a function run by the chief worker to warm-start the training.

        Note that the init_fn is only run when initializing the model during the very
        first global step.

        Returns:
          An init function run by the supervisor, or None when no checkpoint
          path is given or a checkpoint already exists in the train_dir.
        """
        if FLAGS.checkpoint_path is None:
            return None

        # Warn the user if a checkpoint exists in the train_dir. Then we'll be
        # ignoring the checkpoint anyway.
        if tf.train.latest_checkpoint(FLAGS.train_dir):
            tf.logging.info(
                'Ignoring --checkpoint_path because a checkpoint already exists in %s'
                % FLAGS.train_dir)
            return None

        exclusions = []
        if FLAGS.checkpoint_exclude_scopes:
            exclusions = [scope.strip()
                          for scope in FLAGS.checkpoint_exclude_scopes.split(',')]

        # TODO(sguada) variables.filter_variables()
        variables_to_restore = []
        for var in slim.get_model_variables():
            excluded = False
            for exclusion in exclusions:
                if var.op.name.startswith(exclusion):
                    excluded = True
                    break
            if not excluded:
                variables_to_restore.append(var)

        if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
            checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
        else:
            checkpoint_path = FLAGS.checkpoint_path

        tf.logging.info('Fine-tuning from %s' % checkpoint_path)

        return slim.assign_from_checkpoint_fn(
            checkpoint_path,
            variables_to_restore,
            ignore_missing_vars=FLAGS.ignore_missing_vars)

    def _get_variables_to_train():
        """Returns a list of variables to train.

        Returns:
          A list of variables to train by the optimizer.
        """
        if FLAGS.trainable_scopes is None:
            return tf.trainable_variables()
        else:
            scopes = [scope.strip() for scope in FLAGS.trainable_scopes.split(',')]

        variables_to_train = []
        for scope in scopes:
            variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
            variables_to_train.extend(variables)
        return variables_to_train

    # main
    cluster_spec, server = TFNode.start_cluster_server(
        ctx=ctx, num_gpus=FLAGS.num_gpus, rdma=FLAGS.rdma)
    if ctx.job_name == 'ps':
        # `ps` jobs wait for incoming connections from the workers.
        server.join()
    else:
        # `worker` jobs will actually do the work.
        if not FLAGS.dataset_dir:
            raise ValueError('You must supply the dataset directory with --dataset_dir')

        tf.logging.set_verbosity(tf.logging.INFO)
        with tf.Graph().as_default():
            #######################
            # Config model_deploy #
            #######################
            deploy_config = model_deploy.DeploymentConfig(
                num_clones=FLAGS.num_clones,
                clone_on_cpu=FLAGS.clone_on_cpu,
                replica_id=FLAGS.task,
                num_replicas=FLAGS.worker_replicas,
                num_ps_tasks=FLAGS.num_ps_tasks)

            # Create global_step, pinned to the first parameter-server task
            # instead of deploy_config.variables_device().
            with tf.device("/job:ps/task:0"):
                global_step = tf.Variable(0, name="global_step")

            ######################
            # Select the dataset #
            ######################
            dataset = dataset_factory.get_dataset(
                FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

            ######################
            # Select the network #
            ######################
            network_fn = nets_factory.get_network_fn(
                FLAGS.model_name,
                num_classes=(dataset.num_classes - FLAGS.labels_offset),
                weight_decay=FLAGS.weight_decay,
                is_training=True)

            #####################################
            # Select the preprocessing function #
            #####################################
            preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
            image_preprocessing_fn = preprocessing_factory.get_preprocessing(
                preprocessing_name,
                is_training=True)

            ##############################################################
            # Create a dataset provider that loads data from the dataset #
            ##############################################################
            with tf.device(deploy_config.inputs_device()):
                provider = slim.dataset_data_provider.DatasetDataProvider(
                    dataset,
                    num_readers=FLAGS.num_readers,
                    common_queue_capacity=20 * FLAGS.batch_size,
                    common_queue_min=10 * FLAGS.batch_size)
                [image, label] = provider.get(['image', 'label'])
                label -= FLAGS.labels_offset

                train_image_size = FLAGS.train_image_size or network_fn.default_image_size

                image = image_preprocessing_fn(image, train_image_size, train_image_size)

                images, labels = tf.train.batch(
                    [image, label],
                    batch_size=FLAGS.batch_size,
                    num_threads=FLAGS.num_preprocessing_threads,
                    capacity=5 * FLAGS.batch_size)
                labels = slim.one_hot_encoding(
                    labels, dataset.num_classes - FLAGS.labels_offset)
                batch_queue = slim.prefetch_queue.prefetch_queue(
                    [images, labels], capacity=2 * deploy_config.num_clones)

            ####################
            # Define the model #
            ####################
            def clone_fn(batch_queue):
                """Allows data parallelism by creating multiple clones of network_fn."""
                images, labels = batch_queue.dequeue()
                logits, end_points = network_fn(images)

                #############################
                # Specify the loss function #
                #############################
                if 'AuxLogits' in end_points:
                    tf.losses.softmax_cross_entropy(
                        logits=end_points['AuxLogits'], onehot_labels=labels,
                        label_smoothing=FLAGS.label_smoothing, weights=0.4,
                        scope='aux_loss')
                tf.losses.softmax_cross_entropy(
                    logits=logits, onehot_labels=labels,
                    label_smoothing=FLAGS.label_smoothing, weights=1.0)
                return end_points

            # Gather initial summaries.
            summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

            clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
            first_clone_scope = deploy_config.clone_scope(0)
            # Gather update_ops from the first clone. These contain, for example,
            # the updates for the batch_norm variables created by network_fn.
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

            # Add summaries for end_points.
            end_points = clones[0].outputs
            for end_point in end_points:
                x = end_points[end_point]
                summaries.add(tf.summary.histogram('activations/' + end_point, x))
                summaries.add(tf.summary.scalar('sparsity/' + end_point,
                                                tf.nn.zero_fraction(x)))

            # Add summaries for losses.
            for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
                summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

            # Add summaries for variables.
            for variable in slim.get_model_variables():
                summaries.add(tf.summary.histogram(variable.op.name, variable))

            #################################
            # Configure the moving averages #
            #################################
            if FLAGS.moving_average_decay:
                moving_average_variables = slim.get_model_variables()
                variable_averages = tf.train.ExponentialMovingAverage(
                    FLAGS.moving_average_decay, global_step)
            else:
                moving_average_variables, variable_averages = None, None

            #########################################
            # Configure the optimization procedure. #
            #########################################
            with tf.device(deploy_config.optimizer_device()):
                learning_rate = _configure_learning_rate(dataset.num_samples, global_step)
                optimizer = _configure_optimizer(learning_rate)
                summaries.add(tf.summary.scalar('learning_rate', learning_rate))

            if FLAGS.sync_replicas:
                # If sync_replicas is enabled, the averaging will be done in the chief
                # queue runner.
                optimizer = tf.train.SyncReplicasOptimizer(
                    opt=optimizer,
                    replicas_to_aggregate=FLAGS.replicas_to_aggregate,
                    variable_averages=variable_averages,
                    variables_to_average=moving_average_variables,
                    replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
                    total_num_replicas=FLAGS.worker_replicas)
            elif FLAGS.moving_average_decay:
                # Update ops executed locally by trainer.
                update_ops.append(variable_averages.apply(moving_average_variables))

            # Variables to train.
            variables_to_train = _get_variables_to_train()

            # and returns a train_tensor and summary_op
            total_loss, clones_gradients = model_deploy.optimize_clones(
                clones,
                optimizer,
                var_list=variables_to_train)
            # Add total_loss to summary.
            summaries.add(tf.summary.scalar('total_loss', total_loss))

            # Create gradient updates.
            grad_updates = optimizer.apply_gradients(clones_gradients,
                                                     global_step=global_step)
            update_ops.append(grad_updates)

            update_op = tf.group(*update_ops)
            train_tensor = control_flow_ops.with_dependencies([update_op], total_loss,
                                                              name='train_op')

            # Add the summaries from the first clone. These contain the summaries
            # created by model_fn and either optimize_clones() or _gather_clone_loss().
            summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                               first_clone_scope))

            # Merge all summaries together.
            summary_op = tf.summary.merge(list(summaries), name='summary_op')

            ###########################
            # Kicks off the training. #
            ###########################
            summary_writer = tf.summary.FileWriter("tensorboard_%d" % (ctx.worker_num),
                                                   graph=tf.get_default_graph())
            slim.learning.train(
                train_tensor,
                logdir=FLAGS.train_dir,
                master=server.target,
                is_chief=(FLAGS.task == 0),
                init_fn=_get_init_fn(),
                summary_op=summary_op,
                number_of_steps=FLAGS.max_number_of_steps,
                log_every_n_steps=FLAGS.log_every_n_steps,
                save_summaries_secs=FLAGS.save_summaries_secs,
                save_interval_secs=FLAGS.save_interval_secs,
                summary_writer=summary_writer,
                sync_optimizer=optimizer if FLAGS.sync_replicas else None)
from __future__ import absolute_import from __future__ import division from __future__ import print_function import tensorflow as tf from preprocessing import preprocessing_factory from configs.kitti_config import config from nets.mobilenetdet import scale_bboxes from datasets import dataset_factory from tensorflow.contrib import slim dataset = dataset_factory.get_dataset( 'kitti', 'train', '/home/zehao/Dataset/KITII/tfrecord') # def conver_box(bboxes, img_h, img_w): # [ymin, xmin, ymax, xmax] = tf.unstack(bboxes, axis=1) # img_h = tf.cast(img_h, tf.float32) # img_w = tf.cast(img_w, tf.float32) # ymin = tf.truediv(ymin, img_h) # xmin = tf.truediv(xmin, img_w) # ymax = tf.truediv(ymax, img_h) # xmax = tf.truediv(xmax, img_w) # return tf.expand_dims(tf.stack([ymin,xmin,ymax,xmax], axis=1), axis=0) with tf.Graph().as_default() as graph: with tf.device('/cpu:0'): provider = slim.dataset_data_provider.DatasetDataProvider( dataset, num_readers=1, common_queue_capacity=20 * 1,
def main(opt):
  """Train a model, or run a single evaluation pass when `opt.test` is set.

  The dataset class, model, optimizer and trainer are all built from `opt`.
  In test mode one validation pass is run, the predictions are handed to the
  validation dataset's `run_eval`, and the function returns. Otherwise the
  model is trained for epochs `start_epoch + 1 .. opt.num_epochs`; per-epoch
  metrics go to the logger and to `hist_train.csv` / `hist_val.csv` under
  `opt.save_dir`, and checkpoints are written to the same directory.

  Args:
    opt: parsed options object. It is replaced by the result of
      `update_dataset_info_and_set_heads` and gains a `device` attribute.
  """
  torch.manual_seed(opt.seed)
  torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
  Dataset = get_dataset(opt.dataset, opt.task)
  opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
  print(opt)

  logger = Logger(opt)

  os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
  opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

  print('Creating model...')
  model = create_model(opt.arch, opt.heads, opt.head_conv)
  optimizer = torch.optim.Adam(model.parameters(), opt.lr)
  start_epoch = 0
  if opt.load_model != '':
    model, optimizer, start_epoch = load_model(model, opt.load_model,
                                               optimizer, opt.resume, opt.lr,
                                               opt.lr_step)

  Trainer = train_factory[opt.task]
  trainer = Trainer(opt, model, optimizer)
  trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

  print('Setting up data...')
  val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'),
                                           batch_size=1,
                                           shuffle=False,
                                           num_workers=1,
                                           pin_memory=True)

  if opt.test:
    _, preds = trainer.val(0, val_loader)
    val_loader.dataset.run_eval(preds, opt.save_dir)
    # Release the logger on the early-exit path too; previously only the
    # training path reached logger.close().
    logger.close()
    return

  train_loader = torch.utils.data.DataLoader(Dataset(opt, 'train'),
                                             batch_size=opt.batch_size,
                                             shuffle=True,
                                             num_workers=opt.num_workers,
                                             pin_memory=True,
                                             drop_last=True)

  csvl_train = CSVLoggerDL(os.path.join(opt.save_dir, "hist_train.csv"))
  csvl_val = CSVLoggerDL(os.path.join(opt.save_dir, "hist_val.csv"))

  print('Starting training...')
  # Lower is better for opt.metric: a new best is any value below this one.
  best = 1e10
  for epoch in range(start_epoch + 1, opt.num_epochs + 1):
    # With save_all every validated epoch keeps its own checkpoint file;
    # otherwise the same 'model_last.pth' is overwritten.
    mark = epoch if opt.save_all else 'last'
    log_dict_train, _ = trainer.train(epoch, train_loader)
    csvl_train.add(log_dict_train, epoch=epoch)
    logger.write('epoch: {} |'.format(epoch))
    for k, v in log_dict_train.items():
      logger.scalar_summary('train_{}'.format(k), v, epoch)
      logger.write('{} {:8f} | '.format(k, v))
    if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
      save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                 epoch, model, optimizer)
      with torch.no_grad():
        log_dict_val, preds = trainer.val(epoch, val_loader)
      csvl_val.add(log_dict_val, epoch=epoch)
      for k, v in log_dict_val.items():
        logger.scalar_summary('val_{}'.format(k), v, epoch)
        logger.write('{} {:8f} | '.format(k, v))
      if log_dict_val[opt.metric] < best:
        best = log_dict_val[opt.metric]
        # The best checkpoint is saved without optimizer state.
        save_model(os.path.join(opt.save_dir, 'model_best.pth'), epoch, model)
    else:
      save_model(os.path.join(opt.save_dir, 'model_last.pth'), epoch, model,
                 optimizer)
    logger.write('\n')
    if epoch in opt.lr_step:
      save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                 epoch, model, optimizer)
      # Step schedule: the i-th milestone (0-based) sets lr to
      # opt.lr * 0.1 ** (i + 1).
      lr = opt.lr * (0.1**(opt.lr_step.index(epoch) + 1))
      print('Drop LR to', lr)
      for param_group in optimizer.param_groups:
        param_group['lr'] = lr
  logger.close()
def main(_):
  """Run a checkpoint over a dataset split and write predictions to a CSV.

  Builds the evaluation graph (dataset provider, preprocessing, network and
  softmax), restores variables from `FLAGS.checkpoint_path`, runs the
  configured number of batches, and writes one CSV row per example with its
  source name, label and softmax output.

  Args:
    _: unused positional argument.

  Raises:
    ValueError: if `--dataset_dir` was not supplied.
  """
  if not FLAGS.dataset_dir:
    raise ValueError(
        'You must supply the dataset directory with --dataset_dir')

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    tf_global_step = slim.get_or_create_global_step()

    ######################
    # Select the dataset #
    ######################
    dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                          FLAGS.dataset_split_name,
                                          FLAGS.dataset_dir)

    ####################
    # Select the model #
    ####################
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=(dataset.num_classes - FLAGS.labels_offset),
        is_training=False)

    ##############################################################
    # Create a dataset provider that loads data from the dataset #
    ##############################################################
    provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        shuffle=False,
        common_queue_capacity=2 * FLAGS.batch_size,
        common_queue_min=FLAGS.batch_size)
    [image, label, name] = provider.get(['image', 'label', 'name'])
    label -= FLAGS.labels_offset

    #####################################
    # Select the preprocessing function #
    #####################################
    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
        preprocessing_name, is_training=False)

    eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size

    image = image_preprocessing_fn(image, eval_image_size, eval_image_size)

    images, labels, names = tf.train.batch(
        [image, label, name],
        batch_size=FLAGS.batch_size,
        num_threads=FLAGS.num_preprocessing_threads,
        capacity=5 * FLAGS.batch_size)

    ####################
    # Define the model #
    ####################
    logits, _ = network_fn(images)

    if FLAGS.moving_average_decay:
      # Restore the moving-average values in place of the raw variables.
      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, tf_global_step)
      variables_to_restore = variable_averages.variables_to_restore(
          slim.get_model_variables())
      variables_to_restore[tf_global_step.op.name] = tf_global_step
    else:
      variables_to_restore = slim.get_variables_to_restore()

    predictions = tf.argmax(logits, 1)
    prob = tf.nn.softmax(logits, 1)
    labels = tf.squeeze(labels)

    # Define the metrics:
    # NOTE(review): the session loop below only fetches prob/labels/names;
    # it does not run names_to_updates -- confirm that is intended.
    names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
        'Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
        #'Recall_5': slim.metrics.streaming_sparse_recall_at_k(
        #    logits, labels, 3),
        #"summary_result" : _get_streaming_metrics(logits, label,
        #    2),
    })

    # Print the summaries to screen.
    for name, value in names_to_values.items():
      summary_name = 'eval/%s' % name
      op = tf.summary.scalar(summary_name, value, collections=[])
      op = tf.Print(op, [value], summary_name)
      tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

    # TODO(sguada) use num_epochs=1
    if FLAGS.max_num_batches:
      num_batches = FLAGS.max_num_batches
    else:
      # This ensures that we make a single pass over all of the data.
      num_batches = math.ceil(dataset.num_samples / float(FLAGS.batch_size))

    if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
      checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    else:
      checkpoint_path = FLAGS.checkpoint_path

    tf.logging.info('Evaluating %s' % checkpoint_path)

    restore_fn = slim.assign_from_checkpoint_fn(checkpoint_path,
                                                variables_to_restore)
    sv = tf.train.Supervisor(logdir=FLAGS.eval_dir,
                             saver=None,
                             init_fn=restore_fn)

    # Per-example results accumulated across batches, kept in lockstep.
    pi_prediction = []
    pi_name = []
    pi_label = []
    with sv.managed_session() as sess:
      for _ in range(int(num_batches)):
        sess.run(sv.global_step)
        (s_pred, s_label, s_name) = sess.run([prob, labels, names])
        pi_prediction.extend(s_pred)
        pi_label.extend(s_label)
        pi_name.extend(s_name)

    if FLAGS.csv_name is None:
      # Default to a UTC-timestamped file inside the eval directory.
      csv_path = os.path.join(
          FLAGS.eval_dir,
          strftime("%Y_%m_%d_%H_%M_%S", gmtime()) + "_" +
          FLAGS.dataset_split_name + "_prediciton.csv")
    else:
      csv_path = FLAGS.csv_name

    with open(csv_path, "w+") as f:
      fieldnames = ['src', 'label', 'predict']
      writer = csv.DictWriter(f, fieldnames=fieldnames)
      writer.writeheader()
      for src, true_label, predicted in zip(pi_name, pi_label,
                                            pi_prediction):
        writer.writerow({
            'src': src,
            'label': true_label,
            'predict': predicted
        })

    # The path was previously passed as a second print() argument, so the
    # "%s" placeholder was emitted literally.
    print("write file to %s" % csv_path)
def main_fun(argv, ctx):
  """Evaluate a slim classification checkpoint once on a cluster node.

  Defines the evaluation flags, starts the cluster server for `ctx`, builds
  the input pipeline and network, and delegates to
  `slim.evaluation.evaluate_once` with streaming accuracy and recall@5.

  Args:
    argv: command-line argument list; installed as `sys.argv` so the flag
      parser reads it.
    ctx: node context passed to `TFNode.start_cluster_server`.

  Raises:
    ValueError: if `--dataset_dir` was not supplied.
  """
  import math
  import six
  import tensorflow as tf

  from datasets import dataset_factory
  from nets import nets_factory
  from preprocessing import preprocessing_factory

  sys.argv = argv

  slim = tf.contrib.slim
  flags = tf.app.flags

  flags.DEFINE_integer(
      'batch_size', 100, 'The number of samples in each batch.')
  flags.DEFINE_integer(
      'max_num_batches', None,
      'Max number of batches to evaluate by default use all.')
  flags.DEFINE_string(
      'master', '', 'The address of the TensorFlow master to use.')
  flags.DEFINE_string(
      'checkpoint_path', '/tmp/tfmodel/',
      'The directory where the model was written to or an absolute path to a '
      'checkpoint file.')
  flags.DEFINE_string(
      'eval_dir', '/tmp/tfmodel/', 'Directory where the results are saved to.')
  flags.DEFINE_integer(
      'num_preprocessing_threads', 4,
      'The number of threads used to create the batches.')
  flags.DEFINE_string(
      'dataset_name', 'imagenet', 'The name of the dataset to load.')
  flags.DEFINE_string(
      'dataset_split_name', 'test', 'The name of the train/test split.')
  flags.DEFINE_string(
      'dataset_dir', None, 'The directory where the dataset files are stored.')
  flags.DEFINE_integer(
      'labels_offset', 0,
      'An offset for the labels in the dataset. This flag is primarily used to '
      'evaluate the VGG and ResNet architectures which do not use a background '
      'class for the ImageNet dataset.')
  flags.DEFINE_string(
      'model_name', 'inception_v3', 'The name of the architecture to evaluate.')
  flags.DEFINE_string(
      'preprocessing_name', None, 'The name of the preprocessing to use. If left '
      'as `None`, then the model_name flag is used.')
  flags.DEFINE_float(
      'moving_average_decay', None,
      'The decay to use for the moving average.'
      'If left as None, then moving averages are not used.')
  flags.DEFINE_integer(
      'eval_image_size', None, 'Eval image size')

  FLAGS = flags.FLAGS

  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  cluster_spec, server = TFNode.start_cluster_server(ctx)

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    # A plain variable named "global_step" stands in for
    # slim.get_or_create_global_step().
    step_var = tf.Variable(0, name="global_step")

    # ---- dataset ----
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

    # ---- model ----
    class_count = dataset.num_classes - FLAGS.labels_offset
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name, num_classes=class_count, is_training=False)

    # ---- provider reading single examples from the dataset ----
    data_provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        shuffle=False,
        common_queue_capacity=2 * FLAGS.batch_size,
        common_queue_min=FLAGS.batch_size)
    image, label = data_provider.get(['image', 'label'])
    label -= FLAGS.labels_offset

    # ---- preprocessing ----
    preprocess = preprocessing_factory.get_preprocessing(
        FLAGS.preprocessing_name or FLAGS.model_name, is_training=False)
    side = FLAGS.eval_image_size or network_fn.default_image_size
    image = preprocess(image, side, side)

    images, labels = tf.train.batch(
        [image, label],
        batch_size=FLAGS.batch_size,
        num_threads=FLAGS.num_preprocessing_threads,
        capacity=5 * FLAGS.batch_size)

    # ---- forward pass ----
    logits, _ = network_fn(images)

    if not FLAGS.moving_average_decay:
      variables_to_restore = slim.get_variables_to_restore()
    else:
      # Map each model variable to its moving-average name in the checkpoint.
      ema = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, step_var)
      variables_to_restore = ema.variables_to_restore(
          slim.get_model_variables())
      variables_to_restore[step_var.op.name] = step_var

    predictions = tf.argmax(logits, 1)
    labels = tf.squeeze(labels)

    # ---- metrics ----
    metric_map = {
        'Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
        'Recall_5': slim.metrics.streaming_recall_at_k(logits, labels, 5),
    }
    names_to_values, names_to_updates = slim.metrics.aggregate_metric_map(
        metric_map)

    # Each metric gets a scalar summary that is also printed when evaluated.
    for metric_name, metric_value in six.iteritems(names_to_values):
      tag = 'eval/%s' % metric_name
      printed = tf.Print(
          tf.summary.scalar(tag, metric_value, collections=[]),
          [metric_value], tag)
      tf.add_to_collection(tf.GraphKeys.SUMMARIES, printed)

    # TODO(sguada) use num_epochs=1
    # Without an explicit cap, cover the whole dataset in a single pass.
    num_batches = (
        FLAGS.max_num_batches if FLAGS.max_num_batches
        else math.ceil(dataset.num_samples / float(FLAGS.batch_size)))

    # A directory resolves to its latest checkpoint; a file is used as-is.
    checkpoint_path = (
        tf.train.latest_checkpoint(FLAGS.checkpoint_path)
        if tf.gfile.IsDirectory(FLAGS.checkpoint_path)
        else FLAGS.checkpoint_path)

    tf.logging.info('Evaluating %s' % checkpoint_path)

    slim.evaluation.evaluate_once(
        master=FLAGS.master,
        checkpoint_path=checkpoint_path,
        logdir=FLAGS.eval_dir,
        num_evals=num_batches,
        eval_op=list(names_to_updates.values()),
        variables_to_restore=variables_to_restore)
def main_fun(argv, ctx):
  """Train a slim image classifier on one node of a cluster.

  Defines the training flags, overrides the replica-related flags from `ctx`,
  and starts the cluster server. A node whose job name is 'ps' then blocks in
  `server.join()`; any other node builds the training graph with
  `model_deploy` and runs `slim.learning.train`.

  Args:
    argv: command-line argument list; installed as `sys.argv` so the flag
      parser reads it.
    ctx: node context. This function reads `job_name`, `task_index`,
      `cluster_spec` and `worker_num` from it and passes it to
      `TFNode.start_cluster_server`.

  Raises:
    ValueError: on a worker, if `--dataset_dir` was not supplied, or if the
      optimizer or learning-rate decay type flag is not recognized.
  """
  import tensorflow as tf
  from tensorflow.python.ops import control_flow_ops
  from datasets import dataset_factory
  from deployment import model_deploy
  from nets import nets_factory
  from preprocessing import preprocessing_factory

  sys.argv = argv

  slim = tf.contrib.slim

  # The default is an int (it used to be the string '1') to match the flag type.
  tf.app.flags.DEFINE_integer('num_gpus', 1,
                              'The number of GPUs to use per node')
  tf.app.flags.DEFINE_boolean('rdma', False, 'Whether to use rdma.')

  tf.app.flags.DEFINE_string('master', '',
                             'The address of the TensorFlow master to use.')

  tf.app.flags.DEFINE_string(
      'train_dir', '/tmp/tfmodel/',
      'Directory where checkpoints and event logs are written to.')

  tf.app.flags.DEFINE_integer('num_clones', 1,
                              'Number of model clones to deploy.')

  tf.app.flags.DEFINE_boolean('clone_on_cpu', False,
                              'Use CPUs to deploy clones.')

  tf.app.flags.DEFINE_integer('worker_replicas', 1,
                              'Number of worker replicas.')

  tf.app.flags.DEFINE_integer(
      'num_ps_tasks', 0,
      'The number of parameter servers. If the value is 0, then the parameters '
      'are handled locally by the worker.')

  tf.app.flags.DEFINE_integer(
      'num_readers', 4,
      'The number of parallel readers that read data from the dataset.')

  tf.app.flags.DEFINE_integer(
      'num_preprocessing_threads', 4,
      'The number of threads used to create the batches.')

  tf.app.flags.DEFINE_integer('log_every_n_steps', 10,
                              'The frequency with which logs are print.')

  tf.app.flags.DEFINE_integer(
      'save_summaries_secs', 600,
      'The frequency with which summaries are saved, in seconds.')

  tf.app.flags.DEFINE_integer(
      'save_interval_secs', 600,
      'The frequency with which the model is saved, in seconds.')

  tf.app.flags.DEFINE_integer(
      'task', 0, 'Task id of the replica running the training.')

  ######################
  # Optimization Flags #
  ######################

  tf.app.flags.DEFINE_float('weight_decay', 0.00004,
                            'The weight decay on the model weights.')

  tf.app.flags.DEFINE_string(
      'optimizer', 'rmsprop',
      'The name of the optimizer, one of "adadelta", "adagrad", "adam",'
      '"ftrl", "momentum", "sgd" or "rmsprop".')

  tf.app.flags.DEFINE_float('adadelta_rho', 0.95,
                            'The decay rate for adadelta.')

  tf.app.flags.DEFINE_float('adagrad_initial_accumulator_value', 0.1,
                            'Starting value for the AdaGrad accumulators.')

  tf.app.flags.DEFINE_float(
      'adam_beta1', 0.9,
      'The exponential decay rate for the 1st moment estimates.')

  tf.app.flags.DEFINE_float(
      'adam_beta2', 0.999,
      'The exponential decay rate for the 2nd moment estimates.')

  tf.app.flags.DEFINE_float('opt_epsilon', 1.0,
                            'Epsilon term for the optimizer.')

  tf.app.flags.DEFINE_float('ftrl_learning_rate_power', -0.5,
                            'The learning rate power.')

  tf.app.flags.DEFINE_float('ftrl_initial_accumulator_value', 0.1,
                            'Starting value for the FTRL accumulators.')

  tf.app.flags.DEFINE_float('ftrl_l1', 0.0,
                            'The FTRL l1 regularization strength.')

  tf.app.flags.DEFINE_float('ftrl_l2', 0.0,
                            'The FTRL l2 regularization strength.')

  tf.app.flags.DEFINE_float(
      'momentum', 0.9,
      'The momentum for the MomentumOptimizer and RMSPropOptimizer.')

  tf.app.flags.DEFINE_float('rmsprop_decay', 0.9, 'Decay term for RMSProp.')

  #######################
  # Learning Rate Flags #
  #######################

  tf.app.flags.DEFINE_string(
      'learning_rate_decay_type', 'exponential',
      'Specifies how the learning rate is decayed. One of "fixed", "exponential",'
      ' or "polynomial"')

  tf.app.flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')

  tf.app.flags.DEFINE_float(
      'end_learning_rate', 0.0001,
      'The minimal end learning rate used by a polynomial decay learning rate.'
  )

  tf.app.flags.DEFINE_float('label_smoothing', 0.0,
                            'The amount of label smoothing.')

  tf.app.flags.DEFINE_float('learning_rate_decay_factor', 0.94,
                            'Learning rate decay factor.')

  tf.app.flags.DEFINE_float(
      'num_epochs_per_decay', 2.0,
      'Number of epochs after which learning rate decays.')

  tf.app.flags.DEFINE_bool(
      'sync_replicas', False,
      'Whether or not to synchronize the replicas during training.')

  tf.app.flags.DEFINE_integer(
      'replicas_to_aggregate', 1,
      'The Number of gradients to collect before updating params.')

  tf.app.flags.DEFINE_float(
      'moving_average_decay', None,
      'The decay to use for the moving average.'
      'If left as None, then moving averages are not used.')

  #######################
  # Dataset Flags #
  #######################

  tf.app.flags.DEFINE_string('dataset_name', 'imagenet',
                             'The name of the dataset to load.')

  tf.app.flags.DEFINE_string('dataset_split_name', 'train',
                             'The name of the train/test split.')

  tf.app.flags.DEFINE_string(
      'dataset_dir', None, 'The directory where the dataset files are stored.')

  tf.app.flags.DEFINE_integer(
      'labels_offset', 0,
      'An offset for the labels in the dataset. This flag is primarily used to '
      'evaluate the VGG and ResNet architectures which do not use a background '
      'class for the ImageNet dataset.')

  tf.app.flags.DEFINE_string('model_name', 'inception_v3',
                             'The name of the architecture to train.')

  tf.app.flags.DEFINE_string(
      'preprocessing_name', None, 'The name of the preprocessing to use. If left '
      'as `None`, then the model_name flag is used.')

  tf.app.flags.DEFINE_integer('batch_size', 32,
                              'The number of samples in each batch.')

  tf.app.flags.DEFINE_integer('train_image_size', None, 'Train image size')

  tf.app.flags.DEFINE_integer('max_number_of_steps', None,
                              'The maximum number of training steps.')

  #####################
  # Fine-Tuning Flags #
  #####################

  tf.app.flags.DEFINE_string(
      'checkpoint_path', None,
      'The path to a checkpoint from which to fine-tune.')

  tf.app.flags.DEFINE_string(
      'checkpoint_exclude_scopes', None,
      'Comma-separated list of scopes of variables to exclude when restoring '
      'from a checkpoint.')

  tf.app.flags.DEFINE_string(
      'trainable_scopes', None,
      'Comma-separated list of scopes to filter the set of variables to train.'
      'By default, None would train all the variables.')

  tf.app.flags.DEFINE_boolean(
      'ignore_missing_vars', False,
      'When restoring a checkpoint would ignore missing variables.')

  FLAGS = tf.app.flags.FLAGS
  # The cluster context, not the command line, decides this node's role and
  # the replica counts.
  FLAGS.job_name = ctx.job_name
  FLAGS.task = ctx.task_index
  FLAGS.num_clones = FLAGS.num_gpus
  FLAGS.worker_replicas = len(ctx.cluster_spec['worker'])
  assert (FLAGS.num_ps_tasks == (len(ctx.cluster_spec['ps'])
                                 if 'ps' in ctx.cluster_spec else 0))

  def _configure_learning_rate(num_samples_per_epoch, global_step):
    """Configures the learning rate.

    Args:
      num_samples_per_epoch: The number of samples in each epoch of training.
      global_step: The global_step tensor.

    Returns:
      A `Tensor` representing the learning rate.

    Raises:
      ValueError: if FLAGS.learning_rate_decay_type is not recognized.
    """
    decay_steps = int(num_samples_per_epoch / FLAGS.batch_size *
                      FLAGS.num_epochs_per_decay)
    if FLAGS.sync_replicas:
      decay_steps /= FLAGS.replicas_to_aggregate

    if FLAGS.learning_rate_decay_type == 'exponential':
      return tf.train.exponential_decay(
          FLAGS.learning_rate,
          global_step,
          decay_steps,
          FLAGS.learning_rate_decay_factor,
          staircase=True,
          name='exponential_decay_learning_rate')
    elif FLAGS.learning_rate_decay_type == 'fixed':
      return tf.constant(FLAGS.learning_rate, name='fixed_learning_rate')
    elif FLAGS.learning_rate_decay_type == 'polynomial':
      return tf.train.polynomial_decay(
          FLAGS.learning_rate,
          global_step,
          decay_steps,
          FLAGS.end_learning_rate,
          power=1.0,
          cycle=False,
          name='polynomial_decay_learning_rate')
    else:
      # Format the message; the value used to be passed as a second
      # exception argument, leaving "%s" unexpanded.
      raise ValueError('learning_rate_decay_type [%s] was not recognized' %
                       FLAGS.learning_rate_decay_type)

  def _configure_optimizer(learning_rate):
    """Configures the optimizer used for training.

    Args:
      learning_rate: A scalar or `Tensor` learning rate.

    Returns:
      An instance of an optimizer.

    Raises:
      ValueError: if FLAGS.optimizer is not recognized.
    """
    if FLAGS.optimizer == 'adadelta':
      optimizer = tf.train.AdadeltaOptimizer(learning_rate,
                                             rho=FLAGS.adadelta_rho,
                                             epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'adagrad':
      optimizer = tf.train.AdagradOptimizer(
          learning_rate,
          initial_accumulator_value=FLAGS.adagrad_initial_accumulator_value)
    elif FLAGS.optimizer == 'adam':
      optimizer = tf.train.AdamOptimizer(learning_rate,
                                         beta1=FLAGS.adam_beta1,
                                         beta2=FLAGS.adam_beta2,
                                         epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'ftrl':
      optimizer = tf.train.FtrlOptimizer(
          learning_rate,
          learning_rate_power=FLAGS.ftrl_learning_rate_power,
          initial_accumulator_value=FLAGS.ftrl_initial_accumulator_value,
          l1_regularization_strength=FLAGS.ftrl_l1,
          l2_regularization_strength=FLAGS.ftrl_l2)
    elif FLAGS.optimizer == 'momentum':
      optimizer = tf.train.MomentumOptimizer(learning_rate,
                                             momentum=FLAGS.momentum,
                                             name='Momentum')
    elif FLAGS.optimizer == 'rmsprop':
      optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                            decay=FLAGS.rmsprop_decay,
                                            momentum=FLAGS.momentum,
                                            epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'sgd':
      optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    else:
      # Same formatting fix as in _configure_learning_rate.
      raise ValueError('Optimizer [%s] was not recognized' % FLAGS.optimizer)
    return optimizer

  def _add_variables_summaries(learning_rate):
    """Returns histogram summaries for model variables plus the learning rate."""
    summaries = []
    for variable in slim.get_model_variables():
      summaries.append(tf.summary.histogram(variable.op.name, variable))
    summaries.append(
        tf.summary.scalar('training/Learning Rate', learning_rate))
    return summaries

  def _get_init_fn():
    """Returns a function run by the chief worker to warm-start the training.

    Note that the init_fn is only run when initializing the model during the
    very first global step.

    Returns:
      An init function run by the supervisor, or None when no checkpoint path
      was given or a checkpoint already exists in the train directory.
    """
    if FLAGS.checkpoint_path is None:
      return None

    # Warn the user if a checkpoint exists in the train_dir. Then we'll be
    # ignoring the checkpoint anyway.
    if tf.train.latest_checkpoint(FLAGS.train_dir):
      tf.logging.info(
          'Ignoring --checkpoint_path because a checkpoint already exists in %s'
          % FLAGS.train_dir)
      return None

    exclusions = []
    if FLAGS.checkpoint_exclude_scopes:
      exclusions = [
          scope.strip()
          for scope in FLAGS.checkpoint_exclude_scopes.split(',')
      ]

    # TODO(sguada) variables.filter_variables()
    variables_to_restore = []
    for var in slim.get_model_variables():
      excluded = False
      for exclusion in exclusions:
        if var.op.name.startswith(exclusion):
          excluded = True
          break
      if not excluded:
        variables_to_restore.append(var)

    if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
      checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    else:
      checkpoint_path = FLAGS.checkpoint_path

    tf.logging.info('Fine-tuning from %s' % checkpoint_path)

    return slim.assign_from_checkpoint_fn(
        checkpoint_path,
        variables_to_restore,
        ignore_missing_vars=FLAGS.ignore_missing_vars)

  def _get_variables_to_train():
    """Returns a list of variables to train.

    Returns:
      A list of variables to train by the optimizer.
    """
    if FLAGS.trainable_scopes is None:
      return tf.trainable_variables()
    else:
      scopes = [
          scope.strip() for scope in FLAGS.trainable_scopes.split(',')
      ]

    variables_to_train = []
    for scope in scopes:
      variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
      variables_to_train.extend(variables)
    return variables_to_train

  # main
  cluster_spec, server = TFNode.start_cluster_server(ctx=ctx,
                                                     num_gpus=FLAGS.num_gpus,
                                                     rdma=FLAGS.rdma)
  if ctx.job_name == 'ps':
    # `ps` jobs wait for incoming connections from the workers.
    server.join()
  else:
    # `worker` jobs will actually do the work.
    if not FLAGS.dataset_dir:
      raise ValueError(
          'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
      #######################
      # Config model_deploy #
      #######################
      deploy_config = model_deploy.DeploymentConfig(
          num_clones=FLAGS.num_clones,
          clone_on_cpu=FLAGS.clone_on_cpu,
          replica_id=FLAGS.task,
          num_replicas=FLAGS.worker_replicas,
          num_ps_tasks=FLAGS.num_ps_tasks)

      # Create global_step
      # The step variable is pinned explicitly to /job:ps/task:0 instead of
      # using deploy_config.variables_device() / slim.create_global_step().
      with tf.device("/job:ps/task:0"):
        global_step = tf.Variable(0, name="global_step")

      ######################
      # Select the dataset #
      ######################
      dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                            FLAGS.dataset_split_name,
                                            FLAGS.dataset_dir)

      ######################
      # Select the network #
      ######################
      network_fn = nets_factory.get_network_fn(
          FLAGS.model_name,
          num_classes=(dataset.num_classes - FLAGS.labels_offset),
          weight_decay=FLAGS.weight_decay,
          is_training=True)

      #####################################
      # Select the preprocessing function #
      #####################################
      preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
      image_preprocessing_fn = preprocessing_factory.get_preprocessing(
          preprocessing_name, is_training=True)

      ##############################################################
      # Create a dataset provider that loads data from the dataset #
      ##############################################################
      with tf.device(deploy_config.inputs_device()):
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            num_readers=FLAGS.num_readers,
            common_queue_capacity=20 * FLAGS.batch_size,
            common_queue_min=10 * FLAGS.batch_size)
        [image, label] = provider.get(['image', 'label'])
        label -= FLAGS.labels_offset

        train_image_size = FLAGS.train_image_size or network_fn.default_image_size

        image = image_preprocessing_fn(image, train_image_size,
                                       train_image_size)

        images, labels = tf.train.batch(
            [image, label],
            batch_size=FLAGS.batch_size,
            num_threads=FLAGS.num_preprocessing_threads,
            capacity=5 * FLAGS.batch_size)
        labels = slim.one_hot_encoding(
            labels, dataset.num_classes - FLAGS.labels_offset)
        batch_queue = slim.prefetch_queue.prefetch_queue(
            [images, labels], capacity=2 * deploy_config.num_clones)

      ####################
      # Define the model #
      ####################
      def clone_fn(batch_queue):
        """Allows data parallelism by creating multiple clones of network_fn."""
        images, labels = batch_queue.dequeue()
        logits, end_points = network_fn(images)

        #############################
        # Specify the loss function #
        #############################
        if 'AuxLogits' in end_points:
          tf.losses.softmax_cross_entropy(
              logits=end_points['AuxLogits'],
              onehot_labels=labels,
              label_smoothing=FLAGS.label_smoothing,
              weights=0.4,
              scope='aux_loss')
        tf.losses.softmax_cross_entropy(
            logits=logits,
            onehot_labels=labels,
            label_smoothing=FLAGS.label_smoothing,
            weights=1.0)
        return end_points

      # Gather initial summaries.
      summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

      clones = model_deploy.create_clones(deploy_config, clone_fn,
                                          [batch_queue])
      first_clone_scope = deploy_config.clone_scope(0)
      # Gather update_ops from the first clone. These contain, for example,
      # the updates for the batch_norm variables created by network_fn.
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                     first_clone_scope)

      # Add summaries for end_points.
      end_points = clones[0].outputs
      for end_point in end_points:
        x = end_points[end_point]
        summaries.add(
            tf.summary.histogram('activations/' + end_point, x))
        summaries.add(
            tf.summary.scalar('sparsity/' + end_point,
                              tf.nn.zero_fraction(x)))

      # Add summaries for losses.
      for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
        summaries.add(
            tf.summary.scalar('losses/%s' % loss.op.name, loss))

      # Add summaries for variables.
      for variable in slim.get_model_variables():
        summaries.add(tf.summary.histogram(variable.op.name, variable))

      #################################
      # Configure the moving averages #
      #################################
      if FLAGS.moving_average_decay:
        moving_average_variables = slim.get_model_variables()
        variable_averages = tf.train.ExponentialMovingAverage(
            FLAGS.moving_average_decay, global_step)
      else:
        moving_average_variables, variable_averages = None, None

      #########################################
      # Configure the optimization procedure. #
      #########################################
      with tf.device(deploy_config.optimizer_device()):
        learning_rate = _configure_learning_rate(
            dataset.num_samples, global_step)
        optimizer = _configure_optimizer(learning_rate)
        summaries.add(tf.summary.scalar('learning_rate', learning_rate))

      if FLAGS.sync_replicas:
        # If sync_replicas is enabled, the averaging will be done in the chief
        # queue runner.
        optimizer = tf.train.SyncReplicasOptimizer(
            opt=optimizer,
            replicas_to_aggregate=FLAGS.replicas_to_aggregate,
            variable_averages=variable_averages,
            variables_to_average=moving_average_variables,
            replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
            total_num_replicas=FLAGS.worker_replicas)
      elif FLAGS.moving_average_decay:
        # Update ops executed locally by trainer.
        update_ops.append(
            variable_averages.apply(moving_average_variables))

      # Variables to train.
      variables_to_train = _get_variables_to_train()

      # and returns a train_tensor and summary_op
      total_loss, clones_gradients = model_deploy.optimize_clones(
          clones, optimizer, var_list=variables_to_train)
      # Add total_loss to summary.
      summaries.add(tf.summary.scalar('total_loss', total_loss))

      # Create gradient updates.
      grad_updates = optimizer.apply_gradients(clones_gradients,
                                               global_step=global_step)
      update_ops.append(grad_updates)

      update_op = tf.group(*update_ops)
      # Evaluating train_tensor runs every update op and yields the loss.
      train_tensor = control_flow_ops.with_dependencies([update_op],
                                                        total_loss,
                                                        name='train_op')

      # Add the summaries from the first clone. These contain the summaries
      # created by model_fn and either optimize_clones() or _gather_clone_loss().
      summaries |= set(
          tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

      # Merge all summaries together.
      summary_op = tf.summary.merge(list(summaries), name='summary_op')

      ###########################
      # Kicks off the training. #
      ###########################
      summary_writer = tf.summary.FileWriter(
          "tensorboard_%d" % (ctx.worker_num), graph=tf.get_default_graph())
      slim.learning.train(
          train_tensor,
          logdir=FLAGS.train_dir,
          master=server.target,
          is_chief=(FLAGS.task == 0),
          init_fn=_get_init_fn(),
          summary_op=summary_op,
          number_of_steps=FLAGS.max_number_of_steps,
          log_every_n_steps=FLAGS.log_every_n_steps,
          save_summaries_secs=FLAGS.save_summaries_secs,
          save_interval_secs=FLAGS.save_interval_secs,
          summary_writer=summary_writer,
          sync_optimizer=optimizer if FLAGS.sync_replicas else None)
def main(_):
  """Builds the training graph and runs synchronous-replica training.

  The model is trained either on clean images only (when
  `params.train_adv_method == 'clean'`) or on the concatenation of clean
  images and adversarial examples generated from them. The loss is softmax
  cross-entropy, optionally extended with a mean-squared-error term between
  the two halves of the logits and with L2 weight decay. Training runs inside
  a `MonitoredTrainingSession` until one of the hooks requests a stop.

  Args:
    _: Unused positional argument.
  """
  assert FLAGS.output_dir, '--output_dir has to be provided'
  output_dir_exists = tf.gfile.Exists(FLAGS.output_dir)
  if not output_dir_exists:
    tf.gfile.MakeDirs(FLAGS.output_dir)

  params = model_lib.default_hparams()
  params.parse(FLAGS.hparams)
  tf.logging.info('User provided hparams: %s', FLAGS.hparams)
  tf.logging.info('All hyper parameters: %s', params)
  batch_size = params.batch_size

  with tf.Graph().as_default():
    device_setter = tf.train.replica_device_setter(ps_tasks=FLAGS.ps_tasks)
    with tf.device(device_setter):
      # Input pipeline.
      dataset_info = dataset_factory.get_dataset(
          FLAGS.dataset,
          'train',
          batch_size,
          FLAGS.dataset_image_size,
          is_training=True)
      dataset, examples_per_epoch, num_classes, bounds = dataset_info
      images, labels = dataset.make_one_shot_iterator().get_next()
      one_hot_labels = tf.one_hot(labels, num_classes)

      # Model.
      global_step = tf.train.get_or_create_global_step()
      model_fn = model_lib.get_model(FLAGS.model_name, num_classes)

      def eval_mode_model_fn(x):
        # The attack queries the model in inference mode.
        return model_fn(x, is_training=False)

      if params.train_adv_method == 'clean':
        adv_examples = None
        logits = model_fn(images, is_training=True)
      else:
        adv_examples = adversarial_attack.generate_adversarial_examples(
            images, bounds, eval_mode_model_fn, params.train_adv_method)
        clean_and_adv = tf.concat([images, adv_examples], axis=0)
        logits = model_fn(clean_and_adv, is_training=True)
        # Labels are duplicated so they line up with [clean, adversarial].
        one_hot_labels = tf.concat([one_hot_labels, one_hot_labels], axis=0)

      # Restrict the trainable variables when fine tuning is used.
      model_lib.filter_trainable_variables(FLAGS.finetune_trainable_scopes)

      # Losses.
      total_loss = tf.losses.softmax_cross_entropy(
          onehot_labels=one_hot_labels,
          logits=logits,
          label_smoothing=params.label_smoothing)
      tf.summary.scalar('loss_xent', total_loss)

      if params.train_lp_weight > 0:
        logits_first_half, logits_second_half = tf.split(logits, 2)
        pairing_loss = tf.losses.mean_squared_error(
            logits_first_half,
            logits_second_half,
            weights=params.train_lp_weight)
        tf.summary.scalar('loss_lp', pairing_loss)
        total_loss = total_loss + pairing_loss

      if params.weight_decay > 0:
        l2_terms = [tf.nn.l2_loss(var) for var in tf.trainable_variables()]
        decay_loss = params.weight_decay * tf.add_n(l2_terms)
        tf.summary.scalar('loss_wd', decay_loss)
        total_loss = total_loss + decay_loss

      # Moving averages are enabled only for a positive decay.
      moving_average_variables = None
      variable_averages = None
      if FLAGS.moving_average_decay and (FLAGS.moving_average_decay > 0):
        with tf.name_scope('moving_average'):
          moving_average_variables = tf.contrib.framework.get_model_variables()
          variable_averages = tf.train.ExponentialMovingAverage(
              FLAGS.moving_average_decay, global_step)

      # Optimizer and training op.
      learning_rate, steps_per_epoch = model_lib.get_lr_schedule(
          params, examples_per_epoch, FLAGS.replicas_to_aggregate)
      base_optimizer = model_lib.get_optimizer(params, learning_rate)
      optimizer = tf.train.SyncReplicasOptimizer(
          opt=base_optimizer,
          replicas_to_aggregate=FLAGS.replicas_to_aggregate,
          total_num_replicas=FLAGS.worker_replicas,
          variable_averages=variable_averages,
          variables_to_average=moving_average_variables)

      pending_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      train_op = tf.contrib.training.create_train_op(
          total_loss, optimizer, update_ops=pending_update_ops)

      # Summaries.
      tf.summary.image('images', images[0:FLAGS.num_summary_images])
      if adv_examples is not None:
        tf.summary.image('adv_images',
                         adv_examples[0:FLAGS.num_summary_images])
      tf.summary.scalar('total_loss', total_loss)
      tf.summary.scalar('learning_rate', learning_rate)
      tf.summary.scalar('current_epoch',
                        tf.to_double(global_step) / steps_per_epoch)

    # Training.
    is_chief = (FLAGS.task == 0)
    scaffold = tf.train.Scaffold(
        init_fn=_get_finetuning_init_fn(variable_averages))

    logged_tensors = {'total_loss': total_loss, 'global_step': global_step}
    hooks = [
        tf.train.LoggingTensorHook(logged_tensors, every_n_iter=1),
        tf.train.NanTensorHook(total_loss),
    ]
    summary_hook = tf.train.SummarySaverHook(
        save_steps=FLAGS.save_summaries_steps,
        save_secs=FLAGS.save_summaries_secs,
        output_dir=FLAGS.output_dir,
        scaffold=scaffold)
    checkpoint_hook = tf.train.CheckpointSaverHook(
        FLAGS.output_dir,
        save_steps=FLAGS.save_model_steps,
        scaffold=scaffold)
    chief_only_hooks = [summary_hook, checkpoint_hook]

    if FLAGS.max_steps > 0:
      hooks.append(tf.train.StopAtStepHook(last_step=FLAGS.max_steps))

    # Hook required by the synchronous-replica optimizer.
    hooks.append(optimizer.make_session_run_hook(is_chief))

    # The session's built-in savers are disabled; the explicit hooks above
    # handle checkpoints and summaries.
    with tf.train.MonitoredTrainingSession(
        master=FLAGS.master,
        is_chief=is_chief,
        checkpoint_dir=FLAGS.output_dir,
        scaffold=scaffold,
        hooks=hooks,
        chief_only_hooks=chief_only_hooks,
        save_checkpoint_secs=None,
        save_summaries_steps=None,
        save_summaries_secs=None) as session:
      while not session.should_stop():
        session.run([train_op])
with.tf.Graph().as_default(): deploy_config = model_deploy.DeploymentConfig( num_clones=1, clone_on_cpu=False, replica_id=0, num_replicas=2, num_ps_tasks=0) with tf.device(deploy.config.variables_device()): global_step = slim.create_global_step dataset = dataset_factory.get_dataset(cve_diseases, train, "/home/johnnyof/workspace/slim/tmp") cnn1 = nets_factory.get_network_fn( inception_resnet_v2, num_classes=(1001), weight_decay=0.00004, is_training=True) cnn2 = nets_factory.get_network_fn( alexnet_v2, num_classes=(1001), weight_decay=0.00004, is_training=True) image_preprocessing_fn = preprocessing_factory.get_preprocessing( inception,
def main(_):
  """Evaluates a classifier and reports overall and per-coarse-label accuracy.

  Builds an evaluation graph over the dataset split selected by the flags,
  accumulates streaming loss/accuracy (and optionally recall@5), tracks the
  number of correct predictions per coarse label in local variables, and
  hands everything to `slim.evaluation.evaluation_loop`.

  Args:
    _: Unused positional argument.

  Raises:
    ValueError: If `--dataset_dir` is not supplied.
  """
  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    tf_global_step = slim.get_or_create_global_step()

    ######################
    # Select the dataset #
    ######################
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

    ####################
    # Select the model #
    ####################
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=(dataset.num_classes - FLAGS.labels_offset),
        is_training=False)

    ##############################################################
    # Create a dataset provider that loads data from the dataset #
    ##############################################################
    provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        shuffle=False,
        common_queue_capacity=2 * FLAGS.batch_size,
        common_queue_min=FLAGS.batch_size)
    [image, label, coarse_label] = provider.get(
        ['image', 'label', 'coarse_label'])
    label -= FLAGS.labels_offset

    #####################################
    # Select the preprocessing function #
    #####################################
    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
        preprocessing_name, is_training=False)

    eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size
    image = image_preprocessing_fn(image, eval_image_size, eval_image_size)

    images, labels, coarse_labels = tf.train.batch(
        [image, label, coarse_label],
        batch_size=FLAGS.batch_size,
        num_threads=FLAGS.num_preprocessing_threads,
        capacity=5 * FLAGS.batch_size)
    coarse_labels = tf.cast(coarse_labels, tf.int32)
    tf.image_summary('image', images, max_images=5)

    ####################
    # Define the model #
    ####################
    logits, _ = network_fn(images)

    if FLAGS.moving_average_decay:
      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, tf_global_step)
      variables_to_restore = variable_averages.variables_to_restore(
          slim.get_model_variables())
      variables_to_restore[tf_global_step.op.name] = tf_global_step
    else:
      variables_to_restore = slim.get_variables_to_restore()

    # NOTE(review): the one-hot depth is hard-coded to 2 while the network is
    # built with `dataset.num_classes - FLAGS.labels_offset` outputs; confirm
    # this evaluator is only meant for two-class datasets.
    one_hot_labels = slim.one_hot_encoding(labels, 2)
    loss = slim.losses.softmax_cross_entropy(logits, one_hot_labels)

    predictions = tf.argmax(logits, 1)
    labels = tf.squeeze(labels)

    # Define the metrics:
    names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
        'Total_Loss': slim.metrics.streaming_mean(loss),
        'Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
    })

    with tf.variable_scope('coarse_label_accuracy',
                           values=[predictions, labels, coarse_labels]):
      num_coarse_labels = len(dataset.coarse_labels_to_names)
      # Per-coarse-label running sums, kept as local variables.
      totals = tf.Variable(
          initial_value=tf.zeros([num_coarse_labels]),
          trainable=False,
          collections=[tf.GraphKeys.LOCAL_VARIABLES],
          dtype=tf.float32,
          name='totals')
      counts = tf.Variable(
          initial_value=tf.zeros([num_coarse_labels]),
          trainable=False,
          collections=[tf.GraphKeys.LOCAL_VARIABLES],
          dtype=tf.float32,
          name='counts')

      correct = tf.cast(tf.equal(predictions, labels), tf.int32)

      accuracy_ops = []
      for index, coarse_key in enumerate(dataset.coarse_labels_to_names):
        # Correctness flags of the batch examples carrying this coarse label.
        label_correct = tf.boolean_mask(
            correct, tf.equal(coarse_key, coarse_labels))
        sum_correct = tf.reduce_sum(label_correct)
        sum_correct = tf.cast(tf.expand_dims(sum_correct, 0), tf.float32)
        delta_totals = tf.SparseTensor(
            [[index]], sum_correct, totals.get_shape())
        label_count = tf.cast(tf.shape(label_correct), tf.float32)
        delta_counts = tf.SparseTensor(
            [[index]], label_count, counts.get_shape())
        # Add the batch contribution at slot `index` only.
        accuracy_ops.append(tf.assign_add(
            totals, tf.sparse_tensor_to_dense(delta_totals),
            use_locking=True))
        accuracy_ops.append(tf.assign_add(
            counts, tf.sparse_tensor_to_dense(delta_counts),
            use_locking=True))

      def _per_label_accuracy():
        # totals / counts, reporting 0 for labels not seen so far.
        return tf.select(tf.equal(counts, 0),
                         tf.zeros_like(counts, tf.float32),
                         tf.div(totals, counts))

      with tf.control_dependencies(accuracy_ops):
        update_op = _per_label_accuracy()
      # Side-effect-free view of the same ratio, used for the summaries below
      # so that writing a summary does not run the accumulation ops again.
      coarse_accuracy = _per_label_accuracy()

    names_to_updates['Coarse_Label_Accuracy'] = update_op

    if FLAGS.recall:
      recall_value, recall_update = slim.metrics.streaming_recall_at_k(
          logits, labels, 5)
      names_to_values['Recall@5'] = recall_value
      names_to_updates['Recall@5'] = recall_update

    # Print the summaries to screen.
    for name, value in names_to_values.items():
      summary_name = 'eval/%s' % name
      op = tf.scalar_summary(summary_name, value, collections=[])
      op = tf.Print(op, [value], summary_name)
      tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

    coarse_label_names = dataset.coarse_labels_to_names.values()
    for index, label_name in enumerate(coarse_label_names):
      summary_name = 'eval/%s' % label_name
      op = tf.scalar_summary(
          summary_name, coarse_accuracy[index], collections=[])
      op = tf.Print(op, [coarse_accuracy[index]], summary_name)
      tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

    # TODO(sguada) use num_epochs=1
    if FLAGS.max_num_batches:
      num_batches = FLAGS.max_num_batches
    else:
      # This ensures that we make a single pass over all of the data.
      # int() because math.ceil returns a float on Python 2.
      num_batches = int(
          math.ceil(dataset.num_samples / float(FLAGS.batch_size)))

    tf.logging.info('Evaluating %s' % FLAGS.checkpoint_path)

    # Pass the mapping computed above: when --moving_average_decay is set it
    # restores the averaged (shadow) values instead of the raw variables.
    slim.evaluation.evaluation_loop(
        master=FLAGS.master,
        checkpoint_dir=FLAGS.checkpoint_path,
        logdir=FLAGS.eval_dir,
        num_evals=num_batches,
        eval_op=list(names_to_updates.values()),
        eval_interval_secs=FLAGS.eval_interval_secs,
        variables_to_restore=variables_to_restore)
def main(_):
    """Evaluates a content/style classifier once and reports its mistakes.

    Builds the evaluation graph, runs `slim.evaluation.evaluate_once`, then
    prints the predictions, the labels and a per-uid count of misclassified
    examples. When `--metadir` holds both `sprite.png` and `labels.tsv`, a
    grayscale mask whose tiles are shaded by the number of mistakes per label
    is written to `<eval_dir>/<split>/mask.png`.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If `--dataset_dir` is not supplied.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        tf_global_step = slim.get_or_create_global_step()

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
        num_classes = (dataset.num_styles if FLAGS.target_style
                       else dataset.num_classes)
        # Number of network outputs; also the range of the evaluated labels.
        num_outputs = num_classes - FLAGS.labels_offset

        ####################
        # Select the model #
        ####################
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name, num_classes=num_outputs, is_training=False)

        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=False)
        eval_image_size = (FLAGS.eval_image_size
                           or network_fn.default_image_size)

        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            common_queue_capacity=20 * FLAGS.batch_size,
            common_queue_min=10 * FLAGS.batch_size,
            shuffle=False)
        [data, content_label, style_label, file_path,
         u_data, u_labels, u_file_paths] = provider.get([
             'image', 'label', 'uid', 'file_path',
             'user_set/images', 'user_set/labels', 'user_set/file_paths'])
        content_label -= FLAGS.labels_offset

        data = image_preprocessing_fn(data, eval_image_size, eval_image_size)
        # Apply the same preprocessing to every image of the user set.
        u_data = tf.map_fn(
            lambda u_instance: image_preprocessing_fn(
                u_instance, eval_image_size, eval_image_size),
            u_data,
            dtype=tf.float32)

        (image_batch, content_label_batch, style_label_batch, f_path_batch,
         u_data_batch, u_labels_batch, u_file_paths_batch) = tf.train.batch(
             [data, content_label, style_label, file_path,
              u_data, u_labels, u_file_paths],
             batch_size=FLAGS.batch_size,
             num_threads=FLAGS.num_preprocessing_threads,
             capacity=5 * FLAGS.batch_size,
             dynamic_pad=True)

        label_batch = (style_label_batch // 2 if FLAGS.target_style
                       else content_label_batch)
        label_batch = slim.one_hot_encoding(label_batch, num_outputs)
        image_batch = tf.Print(
            image_batch, [tf.reduce_mean(image_batch)], message="mean")

        ####################
        # Define the model #
        ####################
        ds = DataStream(Task.CLASSIFICATION, DataType.IMAGE)
        data_instance_list = [
            ds.encode(*t) for t in zip(
                tf.unstack(image_batch),
                tf.unstack(content_label_batch),
                tf.unstack(style_label_batch),
                tf.unstack(u_data_batch),
                tf.unstack(u_labels_batch),
                tf.unstack(f_path_batch),
                tf.unstack(u_file_paths_batch))
        ]
        with tf.variable_scope('network_fn'):
            logits, _ = network_fn(data_instance_list)

        if FLAGS.moving_average_decay:
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, tf_global_step)
            variables_to_restore = variable_averages.variables_to_restore(
                slim.get_model_variables())
            variables_to_restore[tf_global_step.op.name] = tf_global_step
        else:
            variables_to_restore = slim.get_variables_to_restore()

        predictions = tf.argmax(logits, 1)
        loss = tf.losses.softmax_cross_entropy(
            label_batch, logits, weights=1.0)
        label_batch = tf.argmax(label_batch, 1)

        # Define the metrics:
        # Batch positions where the prediction differs from the label.
        idxs = tf.squeeze(tf.where(tf.not_equal(predictions, label_batch)))
        names_to_values, names_to_updates = (
            slim.metrics.aggregate_metric_map({
                'Accuracy': slim.metrics.streaming_accuracy(
                    predictions, label_batch),
                'Recall@5': slim.metrics.streaming_recall_at_k(
                    logits, label_batch, 5),
                'uid_missed': slim.metrics.streaming_concat(
                    tf.reshape(tf.gather(style_label_batch, idxs), [-1])),
            }))

        # Output directory for per-mistake artifacts; creating it also
        # guarantees that <eval_dir>/<split> exists for mask.png below.
        _md = os.path.join(
            FLAGS.eval_dir, FLAGS.dataset_split_name, "mistakes/")
        if not os.path.exists(_md):
            os.makedirs(_md)

        # Print the summaries to screen.
        for name, value in names_to_values.items():
            if name == 'uid_missed':
                # A concatenated vector, not a scalar.
                continue
            summary_name = 'eval/%s' % name
            op = tf.summary.scalar(summary_name, value, collections=[])
            op = tf.Print(op, [value], summary_name)
            tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

        # TODO(sguada) use num_epochs=1
        if FLAGS.max_num_batches:
            num_batches = FLAGS.max_num_batches
        else:
            # This ensures that we make a single pass over all of the data.
            num_batches = math.ceil(
                dataset.num_samples / float(FLAGS.batch_size))

        if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
            checkpoint_path = tf.train.latest_checkpoint(
                FLAGS.checkpoint_path)
        else:
            checkpoint_path = FLAGS.checkpoint_path

        tf.logging.info('Evaluating %s' % checkpoint_path)
        print("debug %s %s " % (label_batch, predictions))

        _um, conf_matrix, otp, pred = slim.evaluation.evaluate_once(
            master=FLAGS.master,
            checkpoint_path=checkpoint_path,
            logdir=FLAGS.eval_dir,
            num_evals=num_batches,
            eval_op=list(names_to_updates.values()),
            variables_to_restore=variables_to_restore,
            final_op=[
                names_to_values['uid_missed'],
                # Sized like the labels/logits being compared.
                tf.confusion_matrix(
                    label_batch, predictions, num_classes=num_outputs),
                label_batch,
                predictions,
            ])

    import collections
    import sys

    import numpy as np

    # Print arrays in full; sys.maxsize is used because NumPy rejects NaN as
    # a threshold.
    np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf)

    print(len(pred), pred)
    print(len(otp), otp)

    missed_uids = list(_um)
    font_freq = collections.Counter(missed_uids)
    print("Mistakes per font label: %s" % font_freq)
    print("Total number of mistaken uids %d" % len(missed_uids))

    if FLAGS.metadir:
        import scipy.misc as misc

        im_file = os.path.join(FLAGS.metadir, "sprite.png")
        labels_file = os.path.join(FLAGS.metadir, "labels.tsv")
        if os.path.exists(im_file) and os.path.exists(labels_file):
            with open(labels_file, "r") as lf:
                labels = [int(line) for line in lf if line.strip()]

            # Labels are laid out row-major on an ncol x ncol grid.
            ncol = int(math.sqrt(len(labels))) + 1
            ms_arr = np.zeros([ncol, ncol], np.int32)
            for li, sprite_label in enumerate(labels):
                ms_arr[li // ncol, li % ncol] = font_freq[sprite_label]

            # Scale counts to 0..255; with no mistakes the mask stays black.
            mx = max(font_freq.values()) if font_freq else 0
            if mx:
                shades = ms_arr.astype(np.float64) / mx * 255.
            else:
                shades = np.zeros([ncol, ncol])
            # Expand every grid cell into an eval_image_size-sized tile.
            _ims = eval_image_size
            mask = np.repeat(np.repeat(shades, _ims, axis=0), _ims, axis=1)

            misc.imsave(
                os.path.join(
                    FLAGS.eval_dir, FLAGS.dataset_split_name, "mask.png"),
                mask)
            print("Number of mistakes per font %s" % (ms_arr,))
        else:
            print(
                "Metadata dir supplied is missing either the sprite image or labels file"
            )
def main(_):
  """Runs a single evaluation pass of a Slim classifier over a dataset split.

  The graph reads and preprocesses images, runs the selected network in
  inference mode and accumulates streaming accuracy and recall@5. Variables
  are restored from `--checkpoint_path` (the latest checkpoint is used when
  it is a directory) and the metrics are evaluated once.

  Args:
    _: Unused positional argument.

  Raises:
    ValueError: If `--dataset_dir` is not supplied.
  """
  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    tf_global_step = slim.get_or_create_global_step()

    # Dataset.
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

    # Model, built for inference.
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=(dataset.num_classes - FLAGS.labels_offset),
        is_training=False)

    # Provider reading single examples from the dataset, unshuffled.
    data_provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        shuffle=False,
        common_queue_capacity=2 * FLAGS.batch_size,
        common_queue_min=FLAGS.batch_size)
    raw_image, raw_label = data_provider.get(['image', 'label'])
    shifted_label = raw_label - FLAGS.labels_offset

    # Preprocessing; falls back to the one named after the model.
    preprocessing_fn = preprocessing_factory.get_preprocessing(
        FLAGS.preprocessing_name or FLAGS.model_name, is_training=False)
    side = FLAGS.eval_image_size or network_fn.default_image_size
    processed_image = preprocessing_fn(raw_image, side, side)

    images, labels = tf.train.batch(
        [processed_image, shifted_label],
        batch_size=FLAGS.batch_size,
        num_threads=FLAGS.num_preprocessing_threads,
        capacity=5 * FLAGS.batch_size)

    # Forward pass.
    logits, _ = network_fn(images)

    if not FLAGS.moving_average_decay:
      variables_to_restore = slim.get_variables_to_restore()
    else:
      # Restore the moving-average values in place of the raw variables.
      averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, tf_global_step)
      variables_to_restore = averages.variables_to_restore(
          slim.get_model_variables())
      variables_to_restore[tf_global_step.op.name] = tf_global_step

    predictions = tf.argmax(logits, 1)
    labels = tf.squeeze(labels)

    # Streaming metrics.
    accuracy_metric = slim.metrics.streaming_accuracy(predictions, labels)
    recall_metric = slim.metrics.streaming_recall_at_k(logits, labels, 5)
    names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
        'Accuracy': accuracy_metric,
        'Recall_5': recall_metric,
    })

    # Print the summaries to screen.
    for metric_name, metric_value in names_to_values.items():
      summary_name = 'eval/%s' % metric_name
      summary = tf.summary.scalar(summary_name, metric_value, collections=[])
      summary = tf.Print(summary, [metric_value], summary_name)
      tf.add_to_collection(tf.GraphKeys.SUMMARIES, summary)

    # TODO(sguada) use num_epochs=1
    # Without an explicit limit, make a single pass over all of the data.
    num_batches = (
        FLAGS.max_num_batches
        if FLAGS.max_num_batches
        else math.ceil(dataset.num_samples / float(FLAGS.batch_size)))

    checkpoint_path = (
        tf.train.latest_checkpoint(FLAGS.checkpoint_path)
        if tf.gfile.IsDirectory(FLAGS.checkpoint_path)
        else FLAGS.checkpoint_path)

    tf.logging.info('Evaluating %s' % checkpoint_path)

    slim.evaluation.evaluate_once(
        master=FLAGS.master,
        checkpoint_path=checkpoint_path,
        logdir=FLAGS.eval_dir,
        num_evals=num_batches,
        eval_op=list(names_to_updates.values()),
        variables_to_restore=variables_to_restore)
import cv2 import pycocotools.coco as coco from lib.detectors.detector_factory import detector_factory from datasets.dataset_factory import get_dataset from utils.debugger import Debugger from opts import opts #img_dir = os.path.join(os.getcwd(), 'data\\egg\\val') img_dir = r'P:\\Robert\\tf-test\\workspace\\egg-counting\\images\\test-2' if __name__ == '__main__': performance_results = [[ 'filename', 'num labelled', 'num predicted', 'abs. error', 'pct. error' ]] opt = opts().init() Dataset = get_dataset(opt.dataset, opt.task) opt = opts().update_dataset_info_and_set_heads(opt, Dataset) Detector = detector_factory[opt.task] start_t = timeit.default_timer() detector = Detector(opt) print('model load time:', timeit.default_timer() - start_t) parsed = coco.COCO(opt.demo) for i, imgId in enumerate(parsed.imgs): file_name = parsed.imgs[imgId]['file_name'] img = cv2.imread(os.path.join(img_dir, file_name)) print('processing image at', os.path.join(img_dir, file_name)) run_dict = detector.run(img) num_predicted = len( [result for result in run_dict['results'][1] if result[-1] > 0.3]) num_labelled = len([ parsed.loadAnns(ids=[annID])
def main(_):
  """Builds a multi-clone Slim training graph and starts training.

  Sets up the deployment configuration, the input pipeline, one model clone
  per configured device, the losses, optional moving averages, the optimizer
  (optionally wrapped for synchronous replicas) and the summaries, then hands
  the resulting train tensor to `slim.learning.train`.

  Args:
    _: Unused positional argument.

  Raises:
    ValueError: If `--dataset_dir` is not supplied.
  """
  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    # Deployment configuration.
    deploy_config = model_deploy.DeploymentConfig(
        num_clones=FLAGS.num_clones,
        clone_on_cpu=FLAGS.clone_on_cpu,
        replica_id=FLAGS.task,
        num_replicas=FLAGS.worker_replicas,
        num_ps_tasks=FLAGS.num_ps_tasks)

    # The global step lives on the variables device.
    with tf.device(deploy_config.variables_device()):
      global_step = slim.create_global_step()

    # Dataset.
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

    # Network, built for training.
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=(dataset.num_classes - FLAGS.labels_offset),
        weight_decay=FLAGS.weight_decay,
        is_training=True)

    # Preprocessing; falls back to the one named after the model.
    preprocessing_fn = preprocessing_factory.get_preprocessing(
        FLAGS.preprocessing_name or FLAGS.model_name, is_training=True)

    # Input pipeline: provider -> preprocessing -> batching -> prefetching.
    with tf.device(deploy_config.inputs_device()):
      data_provider = slim.dataset_data_provider.DatasetDataProvider(
          dataset,
          num_readers=FLAGS.num_readers,
          common_queue_capacity=20 * FLAGS.batch_size,
          common_queue_min=10 * FLAGS.batch_size)
      raw_image, raw_label = data_provider.get(['image', 'label'])
      shifted_label = raw_label - FLAGS.labels_offset

      side = FLAGS.train_image_size or network_fn.default_image_size
      processed_image = preprocessing_fn(raw_image, side, side)

      images, labels = tf.train.batch(
          [processed_image, shifted_label],
          batch_size=FLAGS.batch_size,
          num_threads=FLAGS.num_preprocessing_threads,
          capacity=5 * FLAGS.batch_size)
      labels = slim.one_hot_encoding(
          labels, dataset.num_classes - FLAGS.labels_offset)
      batch_queue = slim.prefetch_queue.prefetch_queue(
          [images, labels], capacity=2 * deploy_config.num_clones)

    def clone_fn(batch_queue):
      """Builds one model clone fed from `batch_queue` and registers its losses."""
      with tf.device(deploy_config.inputs_device()):
        clone_images, clone_labels = batch_queue.dequeue()
      logits, end_points = network_fn(clone_images)

      # The losses are registered in the graph's loss collection; the
      # auxiliary head, when the network has one, is weighted by 0.4.
      if 'AuxLogits' in end_points:
        tf.losses.softmax_cross_entropy(
            onehot_labels=clone_labels,
            logits=end_points['AuxLogits'],
            weights=0.4,
            label_smoothing=FLAGS.label_smoothing,
            scope='aux_loss')
      tf.losses.softmax_cross_entropy(
          onehot_labels=clone_labels,
          logits=logits,
          weights=1.0,
          label_smoothing=FLAGS.label_smoothing)
      return end_points

    # Summaries that exist before any clone is created.
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

    clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
    first_clone_scope = deploy_config.clone_scope(0)
    # Update ops of the first clone only, e.g. the batch_norm updates created
    # by network_fn.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

    # Activation summaries for the first clone's end points.
    for name, activation in clones[0].outputs.items():
      activation_hist = tf.summary.histogram('activations/' + name, activation)
      summaries.add(activation_hist)
      sparsity = tf.summary.scalar('sparsity/' + name,
                                   tf.nn.zero_fraction(activation))
      summaries.add(sparsity)

    # Loss summaries.
    for clone_loss in tf.get_collection(tf.GraphKeys.LOSSES,
                                        first_clone_scope):
      summaries.add(
          tf.summary.scalar('losses/%s' % clone_loss.op.name, clone_loss))

    # Variable summaries.
    for model_variable in slim.get_model_variables():
      summaries.add(
          tf.summary.histogram(model_variable.op.name, model_variable))

    # Moving averages.
    moving_average_variables = None
    variable_averages = None
    if FLAGS.moving_average_decay:
      moving_average_variables = slim.get_model_variables()
      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, global_step)

    # Optimization procedure.
    with tf.device(deploy_config.optimizer_device()):
      learning_rate = _configure_learning_rate(dataset.num_samples,
                                               global_step)
      optimizer = _configure_optimizer(learning_rate)
      summaries.add(tf.summary.scalar('learning_rate', learning_rate))

    if FLAGS.sync_replicas:
      # If sync_replicas is enabled, the averaging will be done in the chief
      # queue runner.
      optimizer = tf.train.SyncReplicasOptimizer(
          opt=optimizer,
          replicas_to_aggregate=FLAGS.replicas_to_aggregate,
          variable_averages=variable_averages,
          variables_to_average=moving_average_variables,
          replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
          total_num_replicas=FLAGS.worker_replicas)
    elif FLAGS.moving_average_decay:
      # Update ops executed locally by trainer.
      update_ops.append(variable_averages.apply(moving_average_variables))

    # Variables to train.
    variables_to_train = _get_variables_to_train()

    # Total loss and per-variable gradients across all clones.
    total_loss, clones_gradients = model_deploy.optimize_clones(
        clones, optimizer, var_list=variables_to_train)
    summaries.add(tf.summary.scalar('total_loss', total_loss))

    # Gradient application joins the other update ops.
    update_ops.append(
        optimizer.apply_gradients(clones_gradients, global_step=global_step))
    update_op = tf.group(*update_ops)
    # Evaluating the train tensor runs every update op first.
    with tf.control_dependencies([update_op]):
      train_tensor = tf.identity(total_loss, name='train_op')

    # Add the summaries from the first clone. These contain the summaries
    # created by model_fn and either optimize_clones() or
    # _gather_clone_loss().
    summaries.update(
        tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

    # Merge all summaries together.
    summary_op = tf.summary.merge(list(summaries), name='summary_op')

    # Kick off the training.
    sync_optimizer = optimizer if FLAGS.sync_replicas else None
    slim.learning.train(
        train_tensor,
        logdir=FLAGS.train_dir,
        master=FLAGS.master,
        is_chief=(FLAGS.task == 0),
        init_fn=_get_init_fn(),
        summary_op=summary_op,
        number_of_steps=FLAGS.max_number_of_steps,
        log_every_n_steps=FLAGS.log_every_n_steps,
        save_summaries_secs=FLAGS.save_summaries_secs,
        save_interval_secs=FLAGS.save_interval_secs,
        sync_optimizer=sync_optimizer)
def main(_):
  """Evaluates a checkpoint once, or dumps selected end points to HDF5.

  Builds the input pipeline and the network named by --model_name, restores
  the weights found at --checkpoint_path and then either

    * runs `slim.evaluation.evaluate_once` to update the streaming
      'Accuracy' and 'Recall@5' metrics (default), or
    * when --store_feat is set, evaluates the requested end points for every
      batch and writes the concatenated arrays to --store_feat_path.

  Args:
    _: Unused positional argument.

  Raises:
    ValueError: If --dataset_dir is empty, if --checkpoint_path is a directory
      that contains no checkpoint, or if --store_feat is set without
      --store_feat_path.
    KeyError: If --store_feat names an end point the network does not expose.
  """
  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  # Decide ONCE, before any directory is created, whether --checkpoint_path is
  # a directory of checkpoints or a single checkpoint. A checkpoint prefix
  # (e.g. 'model.ckpt-1000') is not a regular file, so testing with
  # os.path.isfile() used to create '<prefix>/eval', which in turn made the
  # later IsDirectory() check true and the checkpoint lookup return None.
  checkpoint_is_dir = tf.gfile.IsDirectory(FLAGS.checkpoint_path)
  if checkpoint_is_dir:
    FLAGS.eval_dir = os.path.join(FLAGS.checkpoint_path, 'eval')
  else:
    FLAGS.eval_dir = os.path.join(
        os.path.dirname(FLAGS.checkpoint_path), 'eval')

  # Best effort: the directory usually exists already, and the feature-dump
  # path below never writes to it.
  try:
    os.makedirs(FLAGS.eval_dir)
  except OSError:
    pass

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    tf_global_step = slim.get_or_create_global_step()

    ######################
    # Select the dataset #
    ######################
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name,
        FLAGS.dataset_dir.split(','),
        FLAGS.dataset_list_dir,
        num_samples=FLAGS.frames_per_video,
        modality=FLAGS.modality,
        split_id=FLAGS.split_id)

    ####################
    # Select the model #
    ####################
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=(dataset.num_classes - FLAGS.labels_offset),
        batch_size=FLAGS.batch_size,
        is_training=False)

    ##############################################################
    # Create a dataset provider that loads data from the dataset #
    ##############################################################
    provider = dataset_data_provider.DatasetDataProvider(
        dataset,
        shuffle=FLAGS.force_random_shuffle,
        common_queue_capacity=2 * FLAGS.batch_size,
        common_queue_min=FLAGS.batch_size,
        bgr_flips=FLAGS.bgr_flip)
    [image, label] = provider.get(['image', 'label'])
    # The provider yields the label as a string; convert it to a scalar int64.
    label = tf.cast(tf.string_to_number(label, tf.int32), tf.int64)
    label.set_shape(())
    label -= FLAGS.labels_offset

    #####################################
    # Select the preprocessing function #
    #####################################
    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
        preprocessing_name,
        is_training=False)

    eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size

    image = image_preprocessing_fn(image, eval_image_size, eval_image_size,
                                   model_name=FLAGS.model_name,
                                   ncrops=FLAGS.ncrops,
                                   out_dim_scale=FLAGS.out_dim_scale)

    # A single batching thread is used when features are stored -- presumably
    # so the stored rows keep a deterministic order (TODO confirm).
    images, labels = tf.train.batch(
        [image, label],
        batch_size=FLAGS.batch_size,
        num_threads=1 if FLAGS.store_feat is not None else
        FLAGS.num_preprocessing_threads,
        capacity=5 * FLAGS.batch_size)

    ####################
    # Define the model #
    ####################
    kwargs = {}
    if FLAGS.conv_endpoint is not None:
      kwargs['conv_endpoint'] = FLAGS.conv_endpoint
    logits, end_points = network_fn(
        images, pool_type=FLAGS.pooling,
        classifier_type=FLAGS.classifier_type,
        num_channels_stream=provider.num_channels_stream,
        netvlad_centers=FLAGS.netvlad_initCenters.split(','),
        stream_pool_type=FLAGS.stream_pool_type,
        **kwargs)
    # Expose the inputs as end points so they can be requested by --store_feat.
    end_points['images'] = images
    end_points['labels'] = labels

    if FLAGS.moving_average_decay:
      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, tf_global_step)
      variables_to_restore = variable_averages.variables_to_restore(
          slim.get_model_variables())
      variables_to_restore[tf_global_step.op.name] = tf_global_step
    else:
      variables_to_restore = slim.get_variables_to_restore()

    predictions = tf.argmax(logits, 1)
    # rgirdhar: Because of the following, can't use with batch_size=1
    if FLAGS.batch_size > 1:
      labels = tf.squeeze(labels)

    # Define the metrics:
    names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
        'Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
        'Recall@5': slim.metrics.streaming_recall_at_k(
            logits, labels, 5),
    })

    # Print the summaries to screen.
    # `.items()` (rather than `.iteritems()`) works on Python 2 and Python 3.
    for name, value in names_to_values.items():
      summary_name = 'eval/%s' % name
      op = tf.scalar_summary(summary_name, value, collections=[])
      op = tf.Print(op, [value], summary_name)
      tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

    # TODO(sguada) use num_epochs=1
    if FLAGS.max_num_batches:
      num_batches = FLAGS.max_num_batches
    else:
      # This ensures that we make a single pass over all of the data.
      num_batches = int(math.ceil(dataset.num_samples / float(FLAGS.batch_size)))

    if checkpoint_is_dir:
      checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
      if checkpoint_path is None:
        # latest_checkpoint() returns None when the directory holds no
        # checkpoint; fail here instead of deep inside the restore call.
        raise ValueError(
            'No checkpoint found in directory %s' % FLAGS.checkpoint_path)
    else:
      checkpoint_path = FLAGS.checkpoint_path

    tf.logging.info('Evaluating %s' % checkpoint_path)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    if FLAGS.store_feat is not None:
      # Explicit check instead of `assert`, which is removed under `python -O`.
      if FLAGS.store_feat_path is None:
        raise ValueError(
            'You must supply --store_feat_path when --store_feat is set')
      # Imported here so these modules (h5py in particular) are only required
      # when features are actually stored.
      from tensorflow.python.training import supervisor
      from tensorflow.python.framework import ops
      import h5py

      saver = tf.train.Saver(variables_to_restore)
      sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                                 logdir=None,
                                 summary_op=None,
                                 summary_writer=None,
                                 global_step=None,
                                 saver=None)

      ept_names_to_store = FLAGS.store_feat.split(',')
      # Validate the requested names explicitly rather than catching every
      # exception, and report which names are unknown.
      missing = [el for el in ept_names_to_store if el not in end_points]
      if missing:
        logging.error('Endpoint not found')
        logging.error('Choose from %s' % ','.join(end_points.keys()))
        raise KeyError('Unknown end point(s): %s' % ','.join(missing))
      ept_to_store = [end_points[el] for el in ept_names_to_store]

      # One list of per-batch arrays for every requested end point.
      res = {epname: [] for epname in ept_names_to_store}
      with sv.managed_session(
          FLAGS.master, start_standard_services=False,
          config=config) as sess:
        saver.restore(sess, checkpoint_path)
        sv.start_queue_runners(sess)
        for j in range(num_batches):
          if j % 10 == 0:
            logging.info('Doing batch %d/%d', j, num_batches)
          feats = sess.run(ept_to_store)
          for eid, epname in enumerate(ept_names_to_store):
            res[epname].append(feats[eid])

      logging.info('Writing out features to %s', FLAGS.store_feat_path)
      with h5py.File(FLAGS.store_feat_path, 'w') as fout:
        for epname, batches in res.items():
          # Stack the per-batch arrays along the first axis.
          fout.create_dataset(
              epname,
              data=np.concatenate(batches, axis=0),
              compression='gzip',
              compression_opts=FLAGS.feat_store_compression_opt)
    else:
      slim.evaluation.evaluate_once(
          master=FLAGS.master,
          checkpoint_path=checkpoint_path,
          logdir=FLAGS.eval_dir,
          num_evals=num_batches,
          # A concrete list, since `.values()` is a view object on Python 3.
          eval_op=list(names_to_updates.values()),
          variables_to_restore=variables_to_restore,
          session_config=config)