def graph_fn(global_step):
  boundaries = [2, 3, 7]
  rates = [1.0, 2.0, 3.0, 4.0]
  learning_rate = learning_schedules.manual_stepping(
      global_step, boundaries, rates)
  assert learning_rate.op.name.endswith('learning_rate')
  return (learning_rate,)
def graph_fn(global_step):
  boundaries = [4, 6, 8]
  rates = [0.02, 0.10, 0.01, 0.001]
  learning_rate = learning_schedules.manual_stepping(
      global_step, boundaries, rates, warmup=True)
  assert learning_rate.op.name.endswith('learning_rate')
  return (learning_rate,)
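# The two graph_fn snippets above build the same kind of piecewise-constant
# schedule; the second one additionally enables warmup. Below is a small,
# framework-free sketch of the assumed semantics (a plain step function over
# the boundaries, plus -- when warmup is on -- a linear ramp from rates[0] to
# rates[1] over the first boundaries[0] steps). It illustrates the behavior
# being exercised, not the actual learning_schedules implementation.
def manual_stepping_reference(step, boundaries, rates, warmup=False):
  """Pure-Python approximation of manual_stepping for a single step value."""
  if warmup and boundaries and step < boundaries[0]:
    # Assumed warmup behavior: interpolate linearly up to the first boundary.
    slope = (rates[1] - rates[0]) / float(boundaries[0])
    return rates[0] + slope * step
  # Piecewise-constant lookup: rates[i + 1] applies once boundaries[i] is reached.
  index = 0
  for boundary in boundaries:
    if step >= boundary:
      index += 1
  return rates[index]

# Example: manual_stepping_reference(5, [2, 3, 7], [1.0, 2.0, 3.0, 4.0]) == 3.0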
def _create_learning_rate(learning_rate_config):
  """Create optimizer learning rate based on config.

  Args:
    learning_rate_config: A LearningRate proto message.

  Returns:
    A learning rate.

  Raises:
    ValueError: when using an unsupported input data type.
  """
  learning_rate = None
  learning_rate_type = learning_rate_config.WhichOneof('learning_rate')
  if learning_rate_type == 'constant_learning_rate':
    config = learning_rate_config.constant_learning_rate
    learning_rate = tf.constant(config.learning_rate, dtype=tf.float32,
                                name='learning_rate')

  if learning_rate_type == 'exponential_decay_learning_rate':
    config = learning_rate_config.exponential_decay_learning_rate
    learning_rate = learning_schedules.exponential_decay_with_burnin(
        tf.train.get_or_create_global_step(),
        config.initial_learning_rate,
        config.decay_steps,
        config.decay_factor,
        burnin_learning_rate=config.burnin_learning_rate,
        burnin_steps=config.burnin_steps,
        min_learning_rate=config.min_learning_rate,
        staircase=config.staircase)

  if learning_rate_type == 'manual_step_learning_rate':
    config = learning_rate_config.manual_step_learning_rate
    if not config.schedule:
      raise ValueError('Empty learning rate schedule.')
    learning_rate_step_boundaries = [x.step for x in config.schedule]
    learning_rate_sequence = [config.initial_learning_rate]
    learning_rate_sequence += [x.learning_rate for x in config.schedule]
    learning_rate = learning_schedules.manual_stepping(
        tf.train.get_or_create_global_step(), learning_rate_step_boundaries,
        learning_rate_sequence, config.warmup)

  if learning_rate_type == 'cosine_decay_learning_rate':
    config = learning_rate_config.cosine_decay_learning_rate
    learning_rate = learning_schedules.cosine_decay_with_warmup(
        tf.train.get_or_create_global_step(),
        config.learning_rate_base,
        config.total_steps,
        config.warmup_learning_rate,
        config.warmup_steps,
        config.hold_base_rate_steps)

  if learning_rate is None:
    raise ValueError('Learning_rate %s not supported.' % learning_rate_type)

  return learning_rate
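# A minimal usage sketch for _create_learning_rate above, assuming the
# LearningRate proto from the TensorFlow Object Detection API
# (object_detection/protos/optimizer.proto). The field names mirror the ones
# the function reads; the exact proto module path is an assumption.
from google.protobuf import text_format
from object_detection.protos import optimizer_pb2  # assumed module path

config_text = """
manual_step_learning_rate {
  initial_learning_rate: 0.01
  schedule { step: 1000 learning_rate: 0.001 }
  schedule { step: 2000 learning_rate: 0.0001 }
  warmup: false
}
"""
learning_rate_config = optimizer_pb2.LearningRate()
text_format.Merge(config_text, learning_rate_config)
learning_rate = _create_learning_rate(learning_rate_config)  # scalar tensor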
def testManualStepping(self):
  global_step = tf.placeholder(tf.int64, [])
  boundaries = [2, 3, 7]
  rates = [1.0, 2.0, 3.0, 4.0]
  exp_rates = [1.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0]
  learning_rate = learning_schedules.manual_stepping(
      global_step, boundaries, rates)
  with self.test_session() as sess:
    output_rates = []
    for input_global_step in range(10):
      output_rate = sess.run(learning_rate,
                             feed_dict={global_step: input_global_step})
      output_rates.append(output_rate)
    self.assertAllClose(output_rates, exp_rates)
def _create_learning_rate(learning_rate_config, global_summaries):
  """Create optimizer learning rate based on config.

  Args:
    learning_rate_config: A LearningRate proto message.
    global_summaries: A set to attach learning rate summary to.

  Returns:
    A learning rate.

  Raises:
    ValueError: when using an unsupported input data type.
  """
  learning_rate = None
  learning_rate_type = learning_rate_config.WhichOneof('learning_rate')
  if learning_rate_type == 'constant_learning_rate':
    config = learning_rate_config.constant_learning_rate
    learning_rate = config.learning_rate

  if learning_rate_type == 'exponential_decay_learning_rate':
    config = learning_rate_config.exponential_decay_learning_rate
    learning_rate = tf.train.exponential_decay(
        config.initial_learning_rate,
        slim.get_or_create_global_step(),
        config.decay_steps,
        config.decay_factor,
        staircase=config.staircase)

  if learning_rate_type == 'manual_step_learning_rate':
    config = learning_rate_config.manual_step_learning_rate
    if not config.schedule:
      raise ValueError('Empty learning rate schedule.')
    learning_rate_step_boundaries = [x.step for x in config.schedule]
    learning_rate_sequence = [config.initial_learning_rate]
    learning_rate_sequence += [x.learning_rate for x in config.schedule]
    learning_rate = learning_schedules.manual_stepping(
        slim.get_or_create_global_step(), learning_rate_step_boundaries,
        learning_rate_sequence)

  if learning_rate is None:
    raise ValueError('Learning_rate %s not supported.' % learning_rate_type)

  global_summaries.add(tf.summary.scalar('Learning_Rate', learning_rate))
  return learning_rate
def main(_):
  with tf.Graph().as_default() as graph:
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
    global_summaries = set([])

    num_batches_epoch = num_samples // (FLAGS.batch_size * FLAGS.num_clones)
    print(num_batches_epoch)

    #######################
    # Config model_deploy #
    #######################
    config = model_deploy.DeploymentConfig(
        num_clones=FLAGS.num_clones,
        clone_on_cpu=FLAGS.clone_on_cpu,
        replica_id=FLAGS.task,
        num_replicas=FLAGS.worker_replicas,
        num_ps_tasks=FLAGS.ps_tasks)

    # Create global_step
    with tf.device(config.variables_device()):
      global_step = slim.create_global_step()

    ######################
    # Select the dataset #
    ######################
    with tf.device(config.inputs_device()):
      # Train Process
      dataset = get_split('train', FLAGS.dataset_dir)
      provider = slim.dataset_data_provider.DatasetDataProvider(
          dataset,
          num_readers=FLAGS.num_readers,
          common_queue_capacity=FLAGS.batch_size * 20,
          common_queue_min=FLAGS.batch_size * 10)
      [image_a, image_b, label] = provider.get(['image_a', 'image_b', 'label'])

      probe = image_a
      galleries = tf.unstack(image_b)
      galleries_process = []
      probe = process_image(probe)
      probe.set_shape([FLAGS.target_height, FLAGS.target_width, 3])

      gallery_target = tf.slice(image_b, [label, 0, 0, 0], [1, -1, -1, -1])
      gallery_target = tf.squeeze(gallery_target, axis=[0])
      gallery = process_image(gallery_target)
      gallery.set_shape([FLAGS.target_height, FLAGS.target_width, 3])
      galleries_process.append(gallery)

      for Idx in range(FLAGS.top_k - 1):
        imgIdx = tf.cond(Idx >= label, lambda: Idx + 1, lambda: Idx)
        gallery_other = tf.slice(image_b, [imgIdx, 0, 0, 0], [1, -1, -1, -1])
        gallery_other = tf.squeeze(gallery_other, axis=[0])
        gallery = process_image(gallery_other)
        gallery.set_shape([FLAGS.target_height, FLAGS.target_width, 3])
        galleries_process.append(gallery)

      label_new = 0
      galleries_process = tf.stack(galleries_process)

      probe_batch, galleries_batch, labels = tf.train.batch(
          [probe, galleries_process, label_new],
          batch_size=FLAGS.batch_size,
          num_threads=8,
          capacity=FLAGS.batch_size * 10)

      inputs_queue = prefetch_queue([probe_batch, galleries_batch, labels])

    ######################
    # Select the network #
    ######################
    def model_fn(inputs_queue):
      probe_batch, galleries_batch, labels = inputs_queue.dequeue()
      probe_batch_tile = tf.tile(tf.expand_dims(probe_batch, axis=1),
                                 [1, FLAGS.top_k, 1, 1, 1])
      shape = probe_batch_tile.get_shape().as_list()
      probe_batch_reshape = tf.reshape(probe_batch_tile,
                                       [-1, shape[2], shape[3], shape[4]])
      galleries_batch_reshape = tf.reshape(galleries_batch,
                                           [-1, shape[2], shape[3], shape[4]])
      images_a = probe_batch_reshape
      images_b = galleries_batch_reshape

      model = find_class_by_name(FLAGS.model, [models])()
      logits = model.create_model(images_a, images_b, reuse=False,
                                  is_training=True)
      logits = tf.reshape(logits, [FLAGS.batch_size, -1])
      label_onehot = tf.one_hot(labels, FLAGS.top_k)
      crossentropy_loss = tf.losses.softmax_cross_entropy(
          onehot_labels=label_onehot, logits=logits)
      tf.summary.histogram('images_a', images_a)

    clones = model_deploy.create_clones(config, model_fn, [inputs_queue])
    first_clone_scope = clones[0].scope

    #################################
    # Configure the moving averages #
    #################################
    if FLAGS.moving_average_decay:
      moving_average_variables = slim.get_model_variables()
      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, global_step)
    else:
      moving_average_variables, variable_averages = None, None

    #########################################
    # Configure the optimization procedure. #
    #########################################
    with tf.device(config.optimizer_device()):
      learning_rate_step_boundaries = [
          int(num_batches_epoch * num_epoches * 0.60),
          int(num_batches_epoch * num_epoches * 0.75),
          int(num_batches_epoch * num_epoches * 0.90)
      ]
      learning_rate_sequence = [FLAGS.learning_rate]
      learning_rate_sequence += [
          FLAGS.learning_rate * 0.1, FLAGS.learning_rate * 0.01,
          FLAGS.learning_rate * 0.001
      ]
      learning_rate = learning_schedules.manual_stepping(
          global_step, learning_rate_step_boundaries, learning_rate_sequence)
      # learning_rate = learning_schedules.exponential_decay_with_burnin(
      #     global_step, FLAGS.learning_rate,
      #     num_batches_epoch * num_epoches, 0.001 / FLAGS.learning_rate,
      #     burnin_learning_rate=0.01,
      #     burnin_steps=5000)
      if FLAGS.optimizer == 'adam':
        opt = tf.train.AdamOptimizer(learning_rate)
      if FLAGS.optimizer == 'momentum':
        opt = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)
      summaries.add(tf.summary.scalar('learning_rate', learning_rate))

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

    with tf.device(config.optimizer_device()):
      training_optimizer = opt

    # Create ops required to initialize the model from a given checkpoint. TODO!!
    init_fn = None
    if FLAGS.model == 'DCSL':
      if FLAGS.weights is None:
        # if not FLAGS.moving_average_decay:
        variables = slim.get_model_variables('InceptionResnetV2')
        init_fn = slim.assign_from_checkpoint_fn(
            os.path.join(FLAGS.checkpoints_dir, 'inception_resnet_v2.ckpt'),
            slim.get_model_variables('InceptionResnetV2'))
    if FLAGS.model == 'DCSL_inception_v1':
      if FLAGS.weights is None:
        # if not FLAGS.moving_average_decay:
        variables = slim.get_model_variables('InceptionV1')
        init_fn = slim.assign_from_checkpoint_fn(
            os.path.join(FLAGS.checkpoints_dir, 'inception_v1.ckpt'),
            slim.get_model_variables('InceptionV1'))
    if FLAGS.model == 'DCSL_NAS':
      # if FLAGS.weights is None:
      #   # if not FLAGS.moving_average_decay:
      #   variables = slim.get_model_variables('NAS')
      #   init_fn = slim.assign_from_checkpoint_fn(
      #       os.path.join(FLAGS.checkpoints_dir,
      #                    'nasnet-a_large_04_10_2017/model.ckpt'),
      #       slim.get_model_variables('NAS'))
      def restore_map():
        variables_to_restore = {}
        for variable in tf.global_variables():
          for scope_name in ['NAS']:
            if variable.op.name.startswith(scope_name):
              var_name = variable.op.name.replace(scope_name + '/', '')
              # var_name = variable.op.name
              variables_to_restore[
                  var_name + '/ExponentialMovingAverage'] = variable
              # variables_to_restore[var_name] = variable
        return variables_to_restore

      var_map = restore_map()
      # restore_var = [v for v in tf.global_variables()
      #                if 'global_step' not in v.name]
      available_var_map = (
          variables_helper.get_variables_available_in_checkpoint(
              var_map, FLAGS.weights))
      init_saver = tf.train.Saver(available_var_map)

      def initializer_fn(sess):
        init_saver.restore(sess, FLAGS.weights)

      init_fn = initializer_fn
    if FLAGS.model == 'MultiHeadAttentionBaseModel_set':
      if FLAGS.weights is None:
        # if not FLAGS.moving_average_decay:
        variables = slim.get_model_variables('InceptionV1')
        init_fn = slim.assign_from_checkpoint_fn(
            os.path.join(FLAGS.checkpoints_dir, 'inception_v1.ckpt'),
            slim.get_model_variables('InceptionV1'))
      else:
        restore_var = [
            v for v in slim.get_model_variables() if 'Score' not in v.name
        ]
        init_fn = slim.assign_from_checkpoint_fn(FLAGS.weights, restore_var)
    if FLAGS.model == 'MultiHeadAttentionBaseModel_set_share':
      if FLAGS.weights is None:
        # if not FLAGS.moving_average_decay:
        variables = slim.get_model_variables('InceptionV1')
        init_fn = slim.assign_from_checkpoint_fn(
            os.path.join(FLAGS.checkpoints_dir, 'inception_v1.ckpt'),
            slim.get_model_variables('InceptionV1'))
      else:
        restore_var = [
            v for v in slim.get_model_variables() if 'Score' not in v.name
        ]
        init_fn = slim.assign_from_checkpoint_fn(FLAGS.weights, restore_var)
    if FLAGS.model == 'MultiHeadAttentionBaseModel_set_share_softmatch':
      if FLAGS.weights is None:
        # if not FLAGS.moving_average_decay:
        variables = slim.get_model_variables('InceptionV1')
        init_fn = slim.assign_from_checkpoint_fn(
            os.path.join(FLAGS.checkpoints_dir, 'inception_v1.ckpt'),
            slim.get_model_variables('InceptionV1'))
      else:
        restore_var = [
            v for v in slim.get_model_variables() if 'Score' not in v.name
        ]
        init_fn = slim.assign_from_checkpoint_fn(FLAGS.weights, restore_var)
    if FLAGS.model == 'MultiHeadAttentionBaseModel_set_share_softmatch_v2':
      if FLAGS.weights is None:
        # if not FLAGS.moving_average_decay:
        variables = slim.get_model_variables('InceptionV1')
        init_fn = slim.assign_from_checkpoint_fn(
            os.path.join(FLAGS.checkpoints_dir, 'inception_v1.ckpt'),
            slim.get_model_variables('InceptionV1'))
      else:
        restore_var = [
            v for v in slim.get_model_variables() if 'Score' not in v.name
        ]
        init_fn = slim.assign_from_checkpoint_fn(FLAGS.weights, restore_var)
    if FLAGS.model == 'MultiHeadAttentionBaseModel_set_share_res50':
      if FLAGS.weights is None:
        # if not FLAGS.moving_average_decay:
        variables = slim.get_model_variables('resnet_v2_50')
        init_fn = slim.assign_from_checkpoint_fn(
            os.path.join(FLAGS.checkpoints_dir, 'resnet_v2_50.ckpt'),
            slim.get_model_variables('resnet_v2_50'))
    if FLAGS.model == 'MultiHeadAttentionBaseModel_set_inv3':
      # if not FLAGS.moving_average_decay:
      variables = slim.get_model_variables('InceptionV3')
      init_fn = slim.assign_from_checkpoint_fn(
          os.path.join(FLAGS.checkpoints_dir, 'inception_v3.ckpt'),
          slim.get_model_variables('InceptionV3'))

    # compute and update gradients
    with tf.device(config.optimizer_device()):
      if FLAGS.moving_average_decay:
        update_ops.append(variable_averages.apply(moving_average_variables))

      # Variables to train.
      all_trainable = tf.trainable_variables()

      # and returns a train_tensor and summary_op
      total_loss, grads_and_vars = model_deploy.optimize_clones(
          clones,
          training_optimizer,
          regularization_losses=None,
          var_list=all_trainable)

      grad_mult = utils.get_model_gradient_multipliers(
          FLAGS.last_layer_gradient_multiplier)
      grads_and_vars = slim.learning.multiply_gradients(
          grads_and_vars, grad_mult)

      # Optionally clip gradients
      # with tf.name_scope('clip_grads'):
      #   grads_and_vars = slim.learning.clip_gradient_norms(grads_and_vars, 10)

      total_loss = tf.check_numerics(total_loss, 'LossTensor is inf or nan.')

      # Create gradient updates.
      grad_updates = training_optimizer.apply_gradients(
          grads_and_vars, global_step=global_step)
      update_ops.append(grad_updates)
      update_op = tf.group(*update_ops)
      with tf.control_dependencies([update_op]):
        train_tensor = tf.identity(total_loss, name='train_op')

    # Add summaries.
    for loss_tensor in tf.losses.get_losses():
      global_summaries.add(
          tf.summary.scalar(loss_tensor.op.name, loss_tensor))
    global_summaries.add(
        tf.summary.scalar('TotalLoss', tf.losses.get_total_loss()))

    # Add the summaries from the first clone. These contain the summaries
    # created by model_fn and either optimize_clones() or _gather_clone_loss().
    summaries |= set(
        tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))
    summaries |= global_summaries

    # Merge all summaries together.
    summary_op = tf.summary.merge(list(summaries), name='summary_op')

    # GPU settings
    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)
    session_config.gpu_options.allow_growth = False

    # Save checkpoints regularly.
    keep_checkpoint_every_n_hours = 2.0
    saver = tf.train.Saver(
        keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours)

    ###########################
    # Kicks off the training. #
    ###########################
    slim.learning.train(
        train_tensor,
        logdir=logdir,
        master=FLAGS.master,
        is_chief=(FLAGS.task == 0),
        session_config=session_config,
        startup_delay_steps=10,
        summary_op=summary_op,
        init_fn=init_fn,
        number_of_steps=num_batches_epoch * FLAGS.num_epoches,
        save_summaries_secs=240,
        sync_optimizer=None,
        saver=saver)
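# Sanity check of the schedule configured in main() above, using illustrative
# values (not the original flag defaults): with num_batches_epoch = 1000,
# num_epoches = 10 and a base learning rate of 0.01, the boundaries fall at
# 60%, 75% and 90% of training and the rate drops by 10x at each one.
num_batches_epoch = 1000  # hypothetical
num_epoches = 10          # hypothetical
base_lr = 0.01            # hypothetical FLAGS.learning_rate

boundaries = [int(num_batches_epoch * num_epoches * f) for f in (0.60, 0.75, 0.90)]
rates = [base_lr, base_lr * 0.1, base_lr * 0.01, base_lr * 0.001]
print(boundaries)  # [6000, 7500, 9000]
print(rates)       # [0.01, 0.001, 0.0001, 1e-05] (up to float rounding)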
def graph_fn(global_step):
  boundaries = []
  rates = [0.01]
  learning_rate = learning_schedules.manual_stepping(
      global_step, boundaries, rates)
  return (learning_rate,)
def graph_fn(global_step):
  boundaries = [2, 3, 7]
  rates = [1.0, 2.0, 3.0, 4.0]
  learning_rate = learning_schedules.manual_stepping(
      global_step, boundaries, rates)
  return (learning_rate,)
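# The graph_fn helpers above only build the schedule tensor. A driver along
# the lines of the testManualStepping snippet earlier can evaluate one of them
# over a range of steps; a minimal TF1-style sketch (session usage assumed):
import tensorflow as tf

global_step = tf.placeholder(tf.int64, [])
(learning_rate,) = graph_fn(global_step)
with tf.Session() as sess:
  for step in range(10):
    print(step, sess.run(learning_rate, feed_dict={global_step: step}))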