def main(_):
  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    #######################
    # Config model_deploy #
    #######################
    deploy_config = model_deploy.DeploymentConfig(
        num_clones=FLAGS.num_clones,
        clone_on_cpu=FLAGS.clone_on_cpu,
        replica_id=FLAGS.task,
        num_replicas=FLAGS.worker_replicas,
        num_ps_tasks=FLAGS.num_ps_tasks)

    # Create global_step
    with tf.device(deploy_config.variables_device()):
      global_step = slim.create_global_step()

    ######################
    # Select the dataset #
    ######################
    dataset = dataset_biasCNN.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

    ######################
    # Select the network #
    ######################
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=(dataset.num_classes - FLAGS.labels_offset),
        weight_decay=FLAGS.weight_decay,
        is_training=True)

    #####################################
    # Select the preprocessing function #
    #####################################
    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
    image_preprocessing_fn = preprocessing_biasCNN.get_preprocessing(
        preprocessing_name,
        is_training=True,
        flipLR=FLAGS.flipLR,
        random_scale=FLAGS.random_scale)

    ##############################################################
    # Create a dataset provider that loads data from the dataset #
    ##############################################################
    with tf.device(deploy_config.inputs_device()):
      provider = slim.dataset_data_provider.DatasetDataProvider(
          dataset,
          num_readers=FLAGS.num_readers,
          common_queue_capacity=20 * FLAGS.batch_size,
          common_queue_min=10 * FLAGS.batch_size)
      [image, label] = provider.get(['image', 'label'])
      label -= FLAGS.labels_offset

      train_image_size = FLAGS.train_image_size or network_fn.default_image_size

      image = image_preprocessing_fn(image, train_image_size, train_image_size)

      images, labels = tf.train.batch(
          [image, label],
          batch_size=FLAGS.batch_size,
          num_threads=FLAGS.num_preprocessing_threads,
          capacity=5 * FLAGS.batch_size)
      labels = slim.one_hot_encoding(
          labels, dataset.num_classes - FLAGS.labels_offset)
      batch_queue = slim.prefetch_queue.prefetch_queue(
          [images, labels], capacity=2 * deploy_config.num_clones)

    ####################
    # Define the model #
    ####################
    def clone_fn(batch_queue):
      """Allows data parallelism by creating multiple clones of network_fn."""
      images, labels = batch_queue.dequeue()
      logits, end_points = network_fn(images)

      #############################
      # Specify the loss function #
      #############################
      if 'AuxLogits' in end_points:
        slim.losses.softmax_cross_entropy(
            end_points['AuxLogits'], labels,
            label_smoothing=FLAGS.label_smoothing, weights=0.4,
            scope='aux_loss')
      slim.losses.softmax_cross_entropy(
          logits, labels, label_smoothing=FLAGS.label_smoothing, weights=1.0)
      return end_points

    # Gather initial summaries.
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

    clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
    first_clone_scope = deploy_config.clone_scope(0)
    # Gather update_ops from the first clone. These contain, for example,
    # the updates for the batch_norm variables created by network_fn.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

    # Add summaries for end_points.
    end_points = clones[0].outputs
    for end_point in end_points:
      x = end_points[end_point]
      summaries.add(tf.summary.histogram('activations/' + end_point, x))
      summaries.add(tf.summary.scalar('sparsity/' + end_point,
                                      tf.nn.zero_fraction(x)))

    # Add summaries for losses.
    for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
      summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

    # Add summaries for variables.
    for variable in slim.get_model_variables():
      summaries.add(tf.summary.histogram(variable.op.name, variable))

    #################################
    # Configure the moving averages #
    #################################
    if FLAGS.moving_average_decay:
      moving_average_variables = slim.get_model_variables()
      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, global_step)
    else:
      moving_average_variables, variable_averages = None, None

    if FLAGS.quantize_delay >= 0:
      tf.contrib.quantize.create_training_graph(
          quant_delay=FLAGS.quantize_delay)

    #########################################
    # Configure the optimization procedure. #
    #########################################
    with tf.device(deploy_config.optimizer_device()):
      learning_rate = _configure_learning_rate(dataset.num_samples, global_step)
      optimizer = _configure_optimizer(learning_rate)
      summaries.add(tf.summary.scalar('learning_rate', learning_rate))

    if FLAGS.sync_replicas:
      # If sync_replicas is enabled, the averaging will be done in the chief
      # queue runner.
      optimizer = tf.train.SyncReplicasOptimizer(
          opt=optimizer,
          replicas_to_aggregate=FLAGS.replicas_to_aggregate,
          total_num_replicas=FLAGS.worker_replicas,
          variable_averages=variable_averages,
          variables_to_average=moving_average_variables)
    elif FLAGS.moving_average_decay:
      # Update ops executed locally by trainer.
      update_ops.append(variable_averages.apply(moving_average_variables))

    # Variables to train.
    variables_to_train = _get_variables_to_train()

    # and returns a train_tensor and summary_op
    total_loss, clones_gradients = model_deploy.optimize_clones(
        clones,
        optimizer,
        var_list=variables_to_train)
    # Add total_loss to summary.
    summaries.add(tf.summary.scalar('total_loss', total_loss))

    # Create gradient updates.
    grad_updates = optimizer.apply_gradients(clones_gradients,
                                             global_step=global_step)
    update_ops.append(grad_updates)

    update_op = tf.group(*update_ops)
    with tf.control_dependencies([update_op]):
      train_tensor = tf.identity(total_loss, name='train_op')

    # Add the summaries from the first clone. These contain the summaries
    # created by model_fn and either optimize_clones() or _gather_clone_loss().
    summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                       first_clone_scope))

    # Merge all summaries together.
    summary_op = tf.summary.merge(list(summaries), name='summary_op')

    ###########################
    # Kicks off the training. #
    ###########################
    slim.learning.train(
        train_tensor,
        logdir=FLAGS.train_dir,
        master=FLAGS.master,
        is_chief=(FLAGS.task == 0),
        init_fn=_get_init_fn(),
        summary_op=summary_op,
        number_of_steps=FLAGS.max_number_of_steps,
        log_every_n_steps=FLAGS.log_every_n_steps,
        save_summaries_secs=FLAGS.save_summaries_secs,
        save_interval_secs=FLAGS.save_interval_secs,
        sync_optimizer=optimizer if FLAGS.sync_replicas else None)
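
# Note: the helpers _configure_learning_rate, _configure_optimizer,
# _get_variables_to_train and _get_init_fn are called above but are not shown
# in this section. For reference, below is a sketch of _get_variables_to_train
# as it appears in the standard TF-Slim train_image_classifier.py, which these
# scripts appear to follow; this is an assumption about the omitted helper,
# not necessarily the exact code used here.
def _get_variables_to_train():
  """Returns the list of variables that the optimizer should update."""
  if FLAGS.trainable_scopes is None:
    # Train everything if no scopes were specified.
    return tf.trainable_variables()
  scopes = [scope.strip() for scope in FLAGS.trainable_scopes.split(',')]

  variables_to_train = []
  for scope in scopes:
    # Collect the trainable variables that live under each requested scope.
    variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
    variables_to_train.extend(variables)
  return variables_to_train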
def main(_):
  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    #######################
    # Config model_deploy #
    #######################
    deploy_config = model_deploy.DeploymentConfig(
        num_clones=FLAGS.num_clones,
        clone_on_cpu=FLAGS.clone_on_cpu,
        replica_id=FLAGS.task,
        num_replicas=FLAGS.worker_replicas,
        num_ps_tasks=FLAGS.num_ps_tasks)

    # Create global_step
    with tf.device(deploy_config.variables_device()):
      global_step = slim.create_global_step()

    ######################
    # Select the dataset #
    ######################
    dataset = dataset_biasCNN.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

    dataset_val = dataset_biasCNN.get_dataset(
        FLAGS.dataset_name, 'validation', FLAGS.dataset_dir)

    ######################
    # Select the network #
    ######################
    if FLAGS.weights_initializer is None:
      # Default is defined in the arg scope (xavier_initializer).
      weights_initializer = None
    elif FLAGS.weights_initializer == 'zeros':
      weights_initializer = tf.zeros_initializer()
    elif FLAGS.weights_initializer == 'ones':
      weights_initializer = tf.ones_initializer()
    elif FLAGS.weights_initializer == 'trunc_normal':
      weights_initializer = tf.truncated_normal_initializer()
    elif FLAGS.weights_initializer == 'xavier':
      weights_initializer = initializers.xavier_initializer()
    elif FLAGS.weights_initializer == 'var_scaling':
      weights_initializer = initializers.variance_scaling_initializer()
    else:
      raise ValueError('weights initializer not found')

    if FLAGS.biases_initializer is None:
      # Default is defined in the arg scope (zeros_initializer).
      biases_initializer = None
    elif FLAGS.biases_initializer == 'zeros':
      biases_initializer = tf.zeros_initializer()
    elif FLAGS.biases_initializer == 'ones':
      biases_initializer = tf.ones_initializer()
    elif FLAGS.biases_initializer == 'trunc_normal':
      biases_initializer = tf.truncated_normal_initializer()
    elif FLAGS.biases_initializer == 'xavier':
      biases_initializer = initializers.xavier_initializer()
    elif FLAGS.biases_initializer == 'var_scaling':
      biases_initializer = initializers.variance_scaling_initializer()
    else:
      raise ValueError('biases initializer not found')

    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=(dataset.num_classes - FLAGS.labels_offset),
        weight_decay=FLAGS.weight_decay,
        weights_initializer=weights_initializer,
        biases_initializer=biases_initializer,
        is_training=True)

    network_fn_val = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=(dataset.num_classes - FLAGS.labels_offset),
        weights_initializer=weights_initializer,
        biases_initializer=biases_initializer,
        is_training=False)

    #####################################
    # Select the preprocessing function #
    #####################################
    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
    image_preprocessing_fn = preprocessing_biasCNN.get_preprocessing(
        preprocessing_name,
        is_training=True,
        flipLR=FLAGS.flipLR,
        random_scale=FLAGS.random_scale,
        is_windowed=FLAGS.is_windowed)

    image_preprocessing_fn_val = preprocessing_biasCNN.get_preprocessing(
        preprocessing_name,
        is_training=False,
        flipLR=FLAGS.flipLR,
        random_scale=FLAGS.random_scale,
        is_windowed=FLAGS.is_windowed)

    ##############################################################
    # Create a dataset provider that loads data from the dataset #
    ##############################################################
    with tf.device(deploy_config.inputs_device()):
      provider = slim.dataset_data_provider.DatasetDataProvider(
          dataset,
          num_readers=FLAGS.num_readers,
          common_queue_capacity=20 * FLAGS.batch_size,
          common_queue_min=10 * FLAGS.batch_size)
      [image, label] = provider.get(['image', 'label'])
      label -= FLAGS.labels_offset

      train_image_size = FLAGS.train_image_size or network_fn.default_image_size

      image = image_preprocessing_fn(image, train_image_size, train_image_size)

      images, labels = tf.train.batch(
          [image, label],
          batch_size=FLAGS.batch_size,
          num_threads=FLAGS.num_preprocessing_threads,
          capacity=5 * FLAGS.batch_size)
      labels = slim.one_hot_encoding(
          labels, dataset.num_classes - FLAGS.labels_offset)
      batch_queue = slim.prefetch_queue.prefetch_queue(
          [images, labels], capacity=2 * deploy_config.num_clones)

      ############################################
      # Create a provider for the validation set #
      ############################################
      provider_val = slim.dataset_data_provider.DatasetDataProvider(
          dataset_val,
          shuffle=True,
          common_queue_capacity=2 * FLAGS.batch_size_val,
          common_queue_min=FLAGS.batch_size_val)
      [image_val, label_val] = provider_val.get(['image', 'label'])
      label_val -= FLAGS.labels_offset

      eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size

      image_val = image_preprocessing_fn_val(image_val, eval_image_size,
                                             eval_image_size)

      images_val, labels_val = tf.train.batch(
          [image_val, label_val],
          batch_size=FLAGS.batch_size_val,
          num_threads=FLAGS.num_preprocessing_threads,
          capacity=5 * FLAGS.batch_size_val)
      labels_val_onehot = slim.one_hot_encoding(
          labels_val, dataset.num_classes - FLAGS.labels_offset)

    ###############################
    # Define the model (training) #
    ###############################
    def clone_fn(batch_queue):
      """Allows data parallelism by creating multiple clones of network_fn."""
      images, labels = batch_queue.dequeue()
      with tf.variable_scope('my_scope'):
        logits, end_points = network_fn(images)

      #############################
      # Specify the loss function #
      #############################
      if 'AuxLogits' in end_points:
        slim.losses.softmax_cross_entropy(
            end_points['AuxLogits'], labels,
            label_smoothing=FLAGS.label_smoothing, weights=0.4,
            scope='aux_loss')
      tf.losses.softmax_cross_entropy(
          labels, logits, label_smoothing=FLAGS.label_smoothing, weights=1.0)
      return end_points

    # Gather initial summaries.
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

    clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
    first_clone_scope = deploy_config.clone_scope(0)
    # Gather update_ops from the first clone. These contain, for example,
    # the updates for the batch_norm variables created by network_fn.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

    # Add summaries for end_points.
    end_points = clones[0].outputs
    for end_point in end_points:
      x = end_points[end_point]
      # Add an image of the activations at each conv layer; this is a good way
      # to double-check that the rotated images look rotated to our eyes.
      if 'conv' in end_point:
        dims = x.get_shape()
        for ii in range(5):
          summaries.add(tf.summary.image(
              'image_out/' + end_point + '/image_' + str(ii),
              tf.slice(x, [ii, 0, 0, 0], [1, dims[1], dims[2], 1])))
      summaries.add(tf.summary.histogram('activations/' + end_point, x))
      summaries.add(tf.summary.scalar('sparsity/' + end_point,
                                      tf.nn.zero_fraction(x)))

    # Add summaries for losses.
    for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
      summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

    # Add summaries for variables.
    for variable in slim.get_model_variables():
      summaries.add(tf.summary.histogram(variable.op.name, variable))

    #################################
    # Configure the moving averages #
    #################################
    if FLAGS.moving_average_decay:
      moving_average_variables = slim.get_model_variables()
      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, global_step)
    else:
      moving_average_variables, variable_averages = None, None

    if FLAGS.quantize_delay >= 0:
      tf.contrib.quantize.create_training_graph(
          quant_delay=FLAGS.quantize_delay)

    #########################################
    # Configure the optimization procedure. #
    #########################################
    with tf.device(deploy_config.optimizer_device()):
      learning_rate = _configure_learning_rate(dataset.num_samples, global_step)
      optimizer = _configure_optimizer(learning_rate)
      summaries.add(tf.summary.scalar('learning_rate', learning_rate))

    if FLAGS.sync_replicas:
      # If sync_replicas is enabled, the averaging will be done in the chief
      # queue runner.
      optimizer = tf.train.SyncReplicasOptimizer(
          opt=optimizer,
          replicas_to_aggregate=FLAGS.replicas_to_aggregate,
          total_num_replicas=FLAGS.worker_replicas,
          variable_averages=variable_averages,
          variables_to_average=moving_average_variables)
    elif FLAGS.moving_average_decay:
      # Update ops executed locally by trainer.
      update_ops.append(variable_averages.apply(moving_average_variables))

    # Variables to train.
    variables_to_train = _get_variables_to_train()

    # and returns a train_tensor and summary_op
    total_loss, clones_gradients = model_deploy.optimize_clones(
        clones,
        optimizer,
        var_list=variables_to_train)
    # Add total_loss to summary.
    summaries.add(tf.summary.scalar('total_loss', total_loss))

    # Create gradient updates.
    grad_updates = optimizer.apply_gradients(clones_gradients,
                                             global_step=global_step)
    update_ops.append(grad_updates)

    update_op = tf.group(*update_ops)
    with tf.control_dependencies([update_op]):
      train_tensor = tf.identity(total_loss, name='train_op')

    # Add the summaries from the first clone. These contain the summaries
    # created by model_fn and either optimize_clones() or _gather_clone_loss().
    summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                       first_clone_scope))

    #################################
    # Define the model (validation) #
    #################################

    # Get the validation set logits (predictions).
    with tf.variable_scope('my_scope', reuse=True):
      logits_val, _ = network_fn_val(images_val)
    predictions_val = tf.argmax(logits_val, 1)

    # Define the loss on the validation set, and add a summary for it.
    tf.losses.softmax_cross_entropy(
        labels_val_onehot, logits_val,
        label_smoothing=FLAGS.label_smoothing, weights=1.0,
        loss_collection='eval_losses')

    for loss in tf.get_collection('eval_losses'):
      summaries.add(tf.summary.scalar('eval_losses/%s' % loss.op.name, loss))

    # Define the validation set metrics.
    # Each metric is defined twice, as separate operations: one set is made
    # resettable, the other is streaming.
    with tf.name_scope('eval_metrics'):
      eval_acc_value, eval_acc_op = tf.metrics.accuracy(
          predictions=predictions_val, labels=labels_val)
      eval_recall_5_value, eval_recall_5_op = slim.metrics.streaming_recall_at_k(
          predictions=logits_val, labels=labels_val, k=5)
      # Add these values as summaries for tensorboard.
      summaries.add(tf.summary.scalar('eval_recall_5', eval_recall_5_value))
      summaries.add(tf.summary.scalar('eval_acc', eval_acc_value))

    with tf.name_scope('eval_metrics_streaming'):
      eval_acc_streaming_value, eval_acc_streaming_op = tf.metrics.accuracy(
          predictions=predictions_val, labels=labels_val)
      eval_recall_5_streaming_value, eval_recall_5_streaming_op = \
          slim.metrics.streaming_recall_at_k(
              predictions=logits_val, labels=labels_val, k=5)
      # Add these values as summaries for tensorboard.
      summaries.add(tf.summary.scalar('eval_recall_5_streaming',
                                      eval_recall_5_streaming_value))
      summaries.add(tf.summary.scalar('eval_acc_streaming',
                                      eval_acc_streaming_value))

    # Also add summaries of all the local variables used to compute the eval
    # metrics, one lookup per name scope defined above.
    for metric in tf.get_collection(tf.GraphKeys.METRIC_VARIABLES,
                                    'eval_metrics/'):
      summaries.add(tf.summary.scalar('%s' % metric.op.name, metric))
    for metric in tf.get_collection(tf.GraphKeys.METRIC_VARIABLES,
                                    'eval_metrics_streaming/'):
      summaries.add(tf.summary.scalar('%s' % metric.op.name, metric))

    # Gather up all the local variables that are used to compute the
    # resettable eval metrics.
    stream_vars = [i for i in tf.local_variables()
                   if i.name.split('/')[0] == 'eval_metrics']

    # Make an operation that lets us re-initialize just these variables.
    reset_op = tf.variables_initializer(stream_vars)

    # Make an operation that runs evaluation (all metrics).
    eval_op = [eval_acc_op, eval_recall_5_op,
               eval_acc_streaming_op, eval_recall_5_streaming_op]

    # Gather validation summaries.
    summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES))

    # Merge all summaries together (this includes training summaries too).
    summary_op = tf.summary.merge(list(summaries), name='summary_op')

    # Create a non-default saver so we don't delete all the old checkpoints.
    my_saver = tf_saver.Saver(
        max_to_keep=FLAGS.max_checkpoints_to_keep,
        keep_checkpoint_every_n_hours=FLAGS.keep_checkpoint_every_n_hours)

    # Create a non-default dictionary of options for train_step_fn.
    # This is a hack that lets us pass everything we need to run evaluation
    # into the training loop function.
    with ops.name_scope('train_step'):
      train_step_kwargs = {}

      if FLAGS.max_number_of_steps:
        should_stop_op = math_ops.greater_equal(global_step,
                                                FLAGS.max_number_of_steps)
      else:
        should_stop_op = constant_op.constant(False)
      train_step_kwargs['should_stop'] = should_stop_op
      if FLAGS.log_every_n_steps > 0:
        train_step_kwargs['should_log'] = math_ops.equal(
            math_ops.mod(global_step, FLAGS.log_every_n_steps), 0)
      train_step_kwargs['should_val'] = math_ops.equal(
          math_ops.mod(global_step, FLAGS.val_every_n_steps), 0)
      train_step_kwargs['should_reset_eval_metrics'] = math_ops.equal(
          math_ops.mod(global_step,
                       tf.to_int64(math_ops.multiply(
                           FLAGS.reset_eval_metrics_every_n_vals,
                           FLAGS.val_every_n_steps))), 0)
      train_step_kwargs['eval_op'] = eval_op
      train_step_kwargs['reset_op'] = reset_op

    ###########################
    # Kicks off the training. #
    ###########################
    slim.learning.train(
        train_tensor,
        logdir=FLAGS.train_dir,
        master=FLAGS.master,
        is_chief=(FLAGS.task == 0),
        init_fn=_get_init_fn(),
        summary_op=summary_op,
        number_of_steps=FLAGS.max_number_of_steps,
        log_every_n_steps=FLAGS.log_every_n_steps,
        save_summaries_secs=FLAGS.save_summaries_secs,
        save_interval_secs=FLAGS.save_interval_secs,
        sync_optimizer=optimizer if FLAGS.sync_replicas else None,
        saver=my_saver,
        train_step_fn=learning_biasCNN.train_step_fn,
        train_step_kwargs=train_step_kwargs)
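
# The custom training loop function learning_biasCNN.train_step_fn passed
# above is not shown in this section. The sketch below is an assumption about
# how it consumes the extra train_step_kwargs entries ('should_val',
# 'should_reset_eval_metrics', 'eval_op', 'reset_op'); it mirrors
# slim.learning.train_step and is illustrative, not the repo's actual code.
import time

def train_step_fn(sess, train_op, global_step, train_step_kwargs):
  """Runs one gradient step and, at the requested intervals, the eval ops."""
  start_time = time.time()
  total_loss, np_global_step = sess.run([train_op, global_step])
  time_elapsed = time.time() - start_time

  if 'should_log' in train_step_kwargs:
    if sess.run(train_step_kwargs['should_log']):
      tf.logging.info('global step %d: loss = %.4f (%.3f sec/step)',
                      np_global_step, total_loss, time_elapsed)

  # Periodically clear the resettable validation metrics, then update both the
  # resettable and the streaming metrics on one validation batch.
  if 'should_reset_eval_metrics' in train_step_kwargs:
    if sess.run(train_step_kwargs['should_reset_eval_metrics']):
      sess.run(train_step_kwargs['reset_op'])
  if 'should_val' in train_step_kwargs:
    if sess.run(train_step_kwargs['should_val']):
      sess.run(train_step_kwargs['eval_op'])

  if 'should_stop' in train_step_kwargs:
    should_stop = sess.run(train_step_kwargs['should_stop'])
  else:
    should_stop = False

  return total_loss, should_stop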
def main(_):
  if not FLAGS.dataset_dir:
    raise ValueError(
        'You must supply the dataset directory with --dataset_dir')

  num_batches = FLAGS.num_batches

  # for bb in np.arange(0, num_batches):
  for bb in [0]:

    batch_name = 'batch' + str(bb)
    # tf.app.flags.DEFINE_string(
    #     'dataset_split_name', batch_name, 'The name of the train/test split.')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
      tf_global_step = slim.get_or_create_global_step()

      ######################
      # Select the dataset #
      ######################
      dataset = dataset_biasCNN.get_dataset(
          FLAGS.dataset_name, batch_name, FLAGS.dataset_dir,
          num_classes=FLAGS.num_classes)

      ####################
      # Select the model #
      ####################
      network_fn = nets_factory.get_network_fn(
          FLAGS.model_name,
          num_classes=(dataset.num_classes - FLAGS.labels_offset),
          is_training=False)

      ##############################################################
      # Create a dataset provider that loads data from the dataset #
      ##############################################################
      provider = slim.dataset_data_provider.DatasetDataProvider(
          dataset,
          num_readers=1,
          shuffle=False,
          common_queue_capacity=2 * FLAGS.batch_size,
          common_queue_min=FLAGS.batch_size)
      [image, label] = provider.get(['image', 'label'])
      label -= FLAGS.labels_offset

      #####################################
      # Select the preprocessing function #
      #####################################
      preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
      image_preprocessing_fn = preprocessing_biasCNN.get_preprocessing(
          preprocessing_name,
          is_training=False,
          flipLR=False,
          random_scale=False,
          is_windowed=FLAGS.is_windowed)

      eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size

      image = image_preprocessing_fn(image, eval_image_size, eval_image_size)

      images, labels = tf.train.batch(
          [image, label],
          batch_size=FLAGS.batch_size,
          num_threads=FLAGS.num_preprocessing_threads,
          capacity=5 * FLAGS.batch_size)

      # ims_orig = tf.identity(images)
      # labels_orig = tf.identity(labels)

      ####################
      # Define the model #
      ####################
      logits, end_pts = network_fn(images)

      if FLAGS.moving_average_decay:
        variable_averages = tf.train.ExponentialMovingAverage(
            FLAGS.moving_average_decay, tf_global_step)
        variables_to_restore = variable_averages.variables_to_restore(
            slim.get_model_variables())
        variables_to_restore[tf_global_step.op.name] = tf_global_step
      else:
        if FLAGS.append_scope_string:
          # If I've specified a string for the name of the scope in the
          # checkpoint file, append it here so we can match up the layer names.
          variables_to_restore_orig = slim.get_variables_to_restore()
          variables_to_restore = {}
          for var in variables_to_restore_orig:
            curr_name = var.op.name
            if 'global_step' not in curr_name:
              new_name = FLAGS.append_scope_string + '/' + curr_name
            else:
              new_name = curr_name
            variables_to_restore[new_name] = var
        else:
          variables_to_restore = slim.get_variables_to_restore()

      predictions = tf.argmax(logits, 1)
      labels = tf.squeeze(labels)

      # Define the metrics:
      names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
          'Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
          'Recall_5': slim.metrics.streaming_recall_at_k(logits, labels, 5),
      })

      # Gather initial summaries.
      summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

      # Add summaries for the first layer to visualize the image, and
      # double-check whether rotations are CW or CCW.
      keylist = list(end_pts.keys())
      x = end_pts[keylist[0]]
      dims = x.get_shape()
      for ii in range(dims[0]):
        summaries.add(tf.summary.image(
            'image_out/' + keylist[0] + '/label_' + str(ii),
            tf.slice(x, [ii, 0, 0, 0], [1, dims[1], dims[2], 1])))
        summaries.add(
            tf.summary.scalar('image_label/label_' + str(ii), labels[ii]))

      # Print the summaries to screen.
      for name, value in names_to_values.items():
        summary_name = 'eval/%s' % name
        op = tf.summary.scalar(summary_name, value, collections=[])
        op = tf.Print(op, [value], summary_name)
        tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

      # Merge all summaries together.
      summary_op = tf.summary.merge(list(summaries), name='summary_op')

      # TODO(sguada) use num_epochs=1
      if FLAGS.max_num_batches:
        num_batches = FLAGS.max_num_batches
      else:
        # This ensures that we make a single pass over all of the data.
        num_batches = math.ceil(dataset.num_samples / float(FLAGS.batch_size))

      if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
        checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
      else:
        checkpoint_path = FLAGS.checkpoint_path

      tf.logging.info('Evaluating %s' % checkpoint_path)

      out = slim.evaluation.evaluate_once(
          master=FLAGS.master,
          checkpoint_path=checkpoint_path,
          logdir=FLAGS.eval_dir,
          num_evals=num_batches,
          eval_op=list(names_to_updates.values()),
          summary_op=summary_op,
          final_op={'logits': logits,
                    'end_pts': end_pts,
                    'images': images,
                    'labels': labels,
                    'predictions': predictions},
          variables_to_restore=variables_to_restore)
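
# For clarity: the remapping above exists because the training scripts in this
# section build the network inside tf.variable_scope('my_scope'), so the
# checkpoint stores names like 'my_scope/<model>/conv1/weights' while the
# evaluation graph (built without that scope) names the same variable
# '<model>/conv1/weights'. A standalone sketch of the same idea; the helper
# name and example names are hypothetical, not part of the repo.
def remap_for_checkpoint(variables, scope_prefix):
  """Maps {name_in_checkpoint: in_graph_variable} for a scope-prefixed checkpoint."""
  remapped = {}
  for var in variables:
    name = var.op.name
    if 'global_step' not in name:
      name = scope_prefix + '/' + name
    remapped[name] = var
  return remapped

# Usage (hypothetical):
#   variables_to_restore = remap_for_checkpoint(
#       slim.get_variables_to_restore(), FLAGS.append_scope_string)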
def main(_):
  if not FLAGS.dataset_dir:
    raise ValueError(
        'You must supply the dataset directory with --dataset_dir')

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    #######################
    # Config model_deploy #
    #######################
    deploy_config = model_deploy.DeploymentConfig(
        num_clones=FLAGS.num_clones,
        clone_on_cpu=FLAGS.clone_on_cpu,
        replica_id=FLAGS.task,
        num_replicas=FLAGS.worker_replicas,
        num_ps_tasks=FLAGS.num_ps_tasks)

    # Create global_step
    with tf.device(deploy_config.variables_device()):
      global_step = slim.create_global_step()

    ######################
    # Select the dataset #
    ######################
    dataset = dataset_biasCNN.get_dataset(FLAGS.dataset_name,
                                          FLAGS.dataset_split_name,
                                          FLAGS.dataset_dir)
    dataset_val = dataset_biasCNN.get_dataset(FLAGS.dataset_name, 'validation',
                                              FLAGS.dataset_dir)

    ######################
    # Select the network #
    ######################
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=(dataset.num_classes - FLAGS.labels_offset),
        weight_decay=FLAGS.weight_decay,
        is_training=True)
    network_fn_val = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=(dataset.num_classes - FLAGS.labels_offset),
        is_training=False)

    #####################################
    # Select the preprocessing function #
    #####################################
    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
    image_preprocessing_fn = preprocessing_biasCNN.get_preprocessing(
        preprocessing_name,
        is_training=True,
        flipLR=FLAGS.flipLR,
        random_scale=FLAGS.random_scale,
        is_windowed=FLAGS.is_windowed)
    image_preprocessing_fn_val = preprocessing_biasCNN.get_preprocessing(
        preprocessing_name,
        is_training=False,
        flipLR=FLAGS.flipLR,
        random_scale=FLAGS.random_scale,
        is_windowed=FLAGS.is_windowed)

    ##############################################################
    # Create a dataset provider that loads data from the dataset #
    ##############################################################
    with tf.device(deploy_config.inputs_device()):
      provider = slim.dataset_data_provider.DatasetDataProvider(
          dataset,
          num_readers=FLAGS.num_readers,
          common_queue_capacity=20 * FLAGS.batch_size,
          common_queue_min=10 * FLAGS.batch_size)
      [image, label] = provider.get(['image', 'label'])
      label -= FLAGS.labels_offset

      train_image_size = FLAGS.train_image_size or network_fn.default_image_size

      image = image_preprocessing_fn(image, train_image_size, train_image_size)

      images, labels = tf.train.batch(
          [image, label],
          batch_size=FLAGS.batch_size,
          num_threads=FLAGS.num_preprocessing_threads,
          capacity=5 * FLAGS.batch_size)
      labels = slim.one_hot_encoding(
          labels, dataset.num_classes - FLAGS.labels_offset)
      batch_queue = slim.prefetch_queue.prefetch_queue(
          [images, labels], capacity=2 * deploy_config.num_clones)

      ############################################
      # Create a provider for the validation set #
      ############################################
      provider_val = slim.dataset_data_provider.DatasetDataProvider(
          dataset_val,
          shuffle=True,
          common_queue_capacity=2 * FLAGS.batch_size_val,
          common_queue_min=FLAGS.batch_size_val)
      [image_val, label_val] = provider_val.get(['image', 'label'])
      label_val -= FLAGS.labels_offset

      eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size

      image_val = image_preprocessing_fn_val(image_val, eval_image_size,
                                             eval_image_size)

      images_val, labels_val = tf.train.batch(
          [image_val, label_val],
          batch_size=FLAGS.batch_size_val,
          num_threads=FLAGS.num_preprocessing_threads,
          capacity=5 * FLAGS.batch_size_val)

    ###############################
    # Define the model (training) #
    ###############################
    def clone_fn(batch_queue):
      """Allows data parallelism by creating multiple clones of network_fn."""
      images, labels = batch_queue.dequeue()
      with tf.variable_scope('my_scope'):
        logits, end_points = network_fn(images)

      #############################
      # Specify the loss function #
      #############################
      if 'AuxLogits' in end_points:
        slim.losses.softmax_cross_entropy(
            end_points['AuxLogits'], labels,
            label_smoothing=FLAGS.label_smoothing, weights=0.4,
            scope='aux_loss')
      slim.losses.softmax_cross_entropy(
          logits, labels, label_smoothing=FLAGS.label_smoothing, weights=1.0)
      return end_points

    # Gather initial summaries.
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

    clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
    first_clone_scope = deploy_config.clone_scope(0)
    # Gather update_ops from the first clone. These contain, for example,
    # the updates for the batch_norm variables created by network_fn.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

    # Add summaries for end_points.
    end_points = clones[0].outputs
    for end_point in end_points:
      x = end_points[end_point]
      summaries.add(tf.summary.histogram('activations/' + end_point, x))
      summaries.add(
          tf.summary.scalar('sparsity/' + end_point, tf.nn.zero_fraction(x)))

    # Add summaries for losses.
    for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
      summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

    # Add summaries for variables.
    for variable in slim.get_model_variables():
      summaries.add(tf.summary.histogram(variable.op.name, variable))

    #################################
    # Configure the moving averages #
    #################################
    if FLAGS.moving_average_decay:
      moving_average_variables = slim.get_model_variables()
      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, global_step)
    else:
      moving_average_variables, variable_averages = None, None

    if FLAGS.quantize_delay >= 0:
      tf.contrib.quantize.create_training_graph(
          quant_delay=FLAGS.quantize_delay)

    #########################################
    # Configure the optimization procedure. #
    #########################################
    with tf.device(deploy_config.optimizer_device()):
      learning_rate = _configure_learning_rate(dataset.num_samples, global_step)
      optimizer = _configure_optimizer(learning_rate)
      summaries.add(tf.summary.scalar('learning_rate', learning_rate))

    if FLAGS.sync_replicas:
      # If sync_replicas is enabled, the averaging will be done in the chief
      # queue runner.
      optimizer = tf.train.SyncReplicasOptimizer(
          opt=optimizer,
          replicas_to_aggregate=FLAGS.replicas_to_aggregate,
          total_num_replicas=FLAGS.worker_replicas,
          variable_averages=variable_averages,
          variables_to_average=moving_average_variables)
    elif FLAGS.moving_average_decay:
      # Update ops executed locally by trainer.
      update_ops.append(variable_averages.apply(moving_average_variables))

    # Variables to train.
    variables_to_train = _get_variables_to_train()

    # and returns a train_tensor and summary_op
    total_loss, clones_gradients = model_deploy.optimize_clones(
        clones,
        optimizer,
        var_list=variables_to_train)
    # Add total_loss to summary.
    summaries.add(tf.summary.scalar('total_loss', total_loss))

    # Create gradient updates.
    grad_updates = optimizer.apply_gradients(clones_gradients,
                                             global_step=global_step)
    update_ops.append(grad_updates)

    update_op = tf.group(*update_ops)
    with tf.control_dependencies([update_op]):
      train_tensor = tf.identity(total_loss, name='train_op')

    # Add the summaries from the first clone. These contain the summaries
    # created by model_fn and either optimize_clones() or _gather_clone_loss().
    summaries |= set(
        tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

    #################################
    # Define the model (validation) #
    #################################
    with tf.variable_scope('my_scope', reuse=True):
      logits_val, _ = network_fn_val(images_val)
    predictions_val = tf.argmax(logits_val, 1)
    labels_val = tf.squeeze(labels_val)

    # Define the metrics:
    names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
        'Accuracy': slim.metrics.streaming_accuracy(predictions_val,
                                                    labels_val),
        'Recall_5': slim.metrics.streaming_recall_at_k(logits_val, labels_val,
                                                       5)
    })

    for name, value in names_to_values.items():
      summary_name = 'eval/%s' % name
      op = tf.summary.scalar(summary_name, value, collections=[])
      op = tf.Print(op, [value], summary_name)
      tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

    # Gather validation summaries.
    summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES))

    # Merge all summaries together.
    summary_op = tf.summary.merge(list(summaries), name='summary_op')

    # Create a non-default saver so we don't delete all the old checkpoints.
    my_saver = tf_saver.Saver(
        max_to_keep=FLAGS.max_checkpoints_to_keep,
        keep_checkpoint_every_n_hours=FLAGS.keep_checkpoint_every_n_hours)

    # Create a non-default dictionary of options for train_step_fn.
    # This is a hack that lets us pass everything we need to run evaluation
    # into the training loop function.
    from tensorflow.python.framework import constant_op
    from tensorflow.python.framework import ops
    from tensorflow.python.ops import math_ops

    with ops.name_scope('train_step'):
      train_step_kwargs = {}

      if FLAGS.max_number_of_steps:
        should_stop_op = math_ops.greater_equal(global_step,
                                                FLAGS.max_number_of_steps)
      else:
        should_stop_op = constant_op.constant(False)
      train_step_kwargs['should_stop'] = should_stop_op
      if FLAGS.log_every_n_steps > 0:
        train_step_kwargs['should_log'] = math_ops.equal(
            math_ops.mod(global_step, FLAGS.log_every_n_steps), 0)
      train_step_kwargs['should_val'] = math_ops.equal(
          math_ops.mod(global_step, FLAGS.val_every_n_steps), 0)
      train_step_kwargs['eval_op'] = list(names_to_updates.values())

    # assert(FLAGS.max_number_of_steps==100000)
    print(should_stop_op)

    ###########################
    # Kicks off the training. #
    ###########################
    slim.learning.train(
        train_tensor,
        logdir=FLAGS.train_dir,
        master=FLAGS.master,
        is_chief=(FLAGS.task == 0),
        init_fn=_get_init_fn(),
        summary_op=summary_op,
        number_of_steps=FLAGS.max_number_of_steps,
        log_every_n_steps=FLAGS.log_every_n_steps,
        save_summaries_secs=FLAGS.save_summaries_secs,
        save_interval_secs=FLAGS.save_interval_secs,
        sync_optimizer=optimizer if FLAGS.sync_replicas else None,
        saver=my_saver,
        train_step_fn=learning_biasCNN.train_step_fn,
        train_step_kwargs=train_step_kwargs)
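
# The warm-start helper _get_init_fn() called above is likewise not shown in
# this section. For reference, a sketch following the standard TF-Slim
# train_image_classifier.py version; this is an assumption about the omitted
# helper, not necessarily the exact code used here.
def _get_init_fn():
  """Returns a function that the chief worker runs to warm-start training."""
  if FLAGS.checkpoint_path is None:
    return None

  # If a checkpoint already exists in train_dir, resume from it instead.
  if tf.train.latest_checkpoint(FLAGS.train_dir):
    tf.logging.info(
        'Ignoring --checkpoint_path because a checkpoint already exists in %s'
        % FLAGS.train_dir)
    return None

  exclusions = []
  if FLAGS.checkpoint_exclude_scopes:
    exclusions = [scope.strip()
                  for scope in FLAGS.checkpoint_exclude_scopes.split(',')]

  # Keep every model variable whose name does not start with an excluded scope.
  variables_to_restore = []
  for var in slim.get_model_variables():
    for exclusion in exclusions:
      if var.op.name.startswith(exclusion):
        break
    else:
      variables_to_restore.append(var)

  if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
    checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
  else:
    checkpoint_path = FLAGS.checkpoint_path

  tf.logging.info('Fine-tuning from %s' % checkpoint_path)

  return slim.assign_from_checkpoint_fn(
      checkpoint_path,
      variables_to_restore,
      ignore_missing_vars=FLAGS.ignore_missing_vars)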
def main(_):
  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  num_batches = FLAGS.num_batches

  for bb in np.arange(0, num_batches):

    batch_name = 'batch' + str(bb)

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
      tf_global_step = slim.get_or_create_global_step()

      ######################
      # Select the dataset #
      ######################
      dataset = dataset_biasCNN.get_dataset(
          FLAGS.dataset_name, batch_name, FLAGS.dataset_dir,
          num_classes=FLAGS.num_classes)

      ####################
      # Select the model #
      ####################
      network_fn = nets_factory.get_network_fn(
          FLAGS.model_name,
          num_classes=(dataset.num_classes - FLAGS.labels_offset),
          is_training=False)

      ##############################################################
      # Create a dataset provider that loads data from the dataset #
      ##############################################################
      provider = slim.dataset_data_provider.DatasetDataProvider(
          dataset,
          num_readers=1,
          shuffle=False,
          common_queue_capacity=2 * FLAGS.batch_size,
          common_queue_min=FLAGS.batch_size)
      [image, label] = provider.get(['image', 'label'])
      label -= FLAGS.labels_offset

      #####################################
      # Select the preprocessing function #
      #####################################
      preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
      image_preprocessing_fn = preprocessing_biasCNN.get_preprocessing(
          preprocessing_name,
          is_training=False,
          flipLR=False,
          random_scale=False,
          is_windowed=FLAGS.is_windowed)

      eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size

      image = image_preprocessing_fn(image, eval_image_size, eval_image_size)

      images, labels = tf.train.batch(
          [image, label],
          batch_size=FLAGS.batch_size,
          num_threads=FLAGS.num_preprocessing_threads,
          capacity=5 * FLAGS.batch_size)

      ####################
      # Define the model #
      ####################
      logits, end_pts = network_fn(images)

      if FLAGS.moving_average_decay:
        variable_averages = tf.train.ExponentialMovingAverage(
            FLAGS.moving_average_decay, tf_global_step)
        variables_to_restore = variable_averages.variables_to_restore(
            slim.get_model_variables())
        variables_to_restore[tf_global_step.op.name] = tf_global_step
      else:
        if FLAGS.append_scope_string:
          # If I've specified a string for the name of the scope in the
          # checkpoint file, append it here so we can match up the layer names.
          variables_to_restore_orig = slim.get_variables_to_restore()
          variables_to_restore = {}
          for var in variables_to_restore_orig:
            curr_name = var.op.name
            if 'global_step' not in curr_name:
              new_name = FLAGS.append_scope_string + '/' + curr_name
            else:
              new_name = curr_name
            variables_to_restore[new_name] = var
        else:
          variables_to_restore = slim.get_variables_to_restore()

      predictions = tf.argmax(logits, 1)
      labels = tf.squeeze(labels)

      # Define the metrics:
      names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
          'Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
          'Recall_5': slim.metrics.streaming_recall_at_k(logits, labels, 5),
      })

      # Print the summaries to screen.
      for name, value in names_to_values.items():
        summary_name = 'eval/%s' % name
        op = tf.summary.scalar(summary_name, value, collections=[])
        op = tf.Print(op, [value], summary_name)
        tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

      # TODO(sguada) use num_epochs=1
      if FLAGS.max_num_batches:
        num_batches = FLAGS.max_num_batches
      else:
        # This ensures that we make a single pass over all of the data.
        num_batches = math.ceil(dataset.num_samples / float(FLAGS.batch_size))

      if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
        checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
      else:
        checkpoint_path = FLAGS.checkpoint_path

      tf.logging.info('Evaluating %s' % checkpoint_path)

      out = slim.evaluation.evaluate_once(
          master=FLAGS.master,
          checkpoint_path=checkpoint_path,
          logdir=FLAGS.eval_dir,
          num_evals=num_batches,
          eval_op=list(names_to_updates.values()),
          final_op={'logits': logits,
                    'end_pts': end_pts,
                    'images': images,
                    'labels': labels,
                    'predictions': predictions},
          variables_to_restore=variables_to_restore)

      end_pts = out['end_pts']
      keylist = list(end_pts.keys())

      for kk in range(np.size(keylist)):
        keystr = keylist[kk]
        keystr = keystr.replace('/', '_')
        fn2save = FLAGS.eval_dir + '/' + batch_name + '_' + keystr + '.npy'
        np.save(fn2save, end_pts[keylist[kk]])

      logits = out['logits']
      labels = out['labels']
      predictions = out['predictions']

      fn2save = FLAGS.eval_dir + '/' + batch_name + '_logits.npy'
      np.save(fn2save, logits)

      fn2save = FLAGS.eval_dir + '/' + batch_name + '_labels_orig.npy'
      np.save(fn2save, labels)

      fn2save = FLAGS.eval_dir + '/' + batch_name + '_labels_predicted.npy'
      np.save(fn2save, predictions)
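
# A hypothetical follow-up snippet (not part of the scripts above) showing how
# the .npy files written by the evaluation loop might be loaded for analysis.
# The eval_dir path and the layer name below are placeholders.
import numpy as np

eval_dir = '/path/to/eval_dir'          # same directory as FLAGS.eval_dir (assumed)
batch_name = 'batch0'
layer_name = 'vgg_16_conv1_conv1_1'     # hypothetical end-point key, with '/' -> '_'

acts = np.load(eval_dir + '/' + batch_name + '_' + layer_name + '.npy')
labels = np.load(eval_dir + '/' + batch_name + '_labels_orig.npy')
preds = np.load(eval_dir + '/' + batch_name + '_labels_predicted.npy')

print('activation tensor shape:', acts.shape)   # [batch, height, width, channels]
print('batch accuracy: %.3f' % np.mean(preds == labels))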