def restore_ckpt(sess, ckpt_path, enable_ema=True, export_ckpt=None): """Restore variables from a given checkpoint. Args: sess: a tf session for restoring or exporting models. ckpt_path: the path of the checkpoint. Can be a file path or a folder path. enable_ema: whether reload ema values or not. export_ckpt: whether to export the restored model. """ sess.run(tf.global_variables_initializer()) if tf.io.gfile.isdir(ckpt_path): ckpt_path = tf.train.latest_checkpoint(ckpt_path) if enable_ema: ema = tf.train.ExponentialMovingAverage(decay=0.0) ema_vars = utils.get_ema_vars() var_dict = ema.variables_to_restore(ema_vars) ema_assign_op = ema.apply(ema_vars) else: var_dict = utils.get_ema_vars() ema_assign_op = None tf.train.get_or_create_global_step() sess.run(tf.global_variables_initializer()) saver = tf.train.Saver(var_dict, max_to_keep=1) saver.restore(sess, ckpt_path) if export_ckpt: print('export model to {}'.format(export_ckpt)) if ema_assign_op is not None: sess.run(ema_assign_op) saver = tf.train.Saver(max_to_keep=1, save_relative_paths=True) saver.save(sess, export_ckpt)
def restore_model(self, sess, ckpt_path, enable_ema=True, export_ckpt=None): """Restore variables from a given checkpoint.""" sess.run(tf.global_variables_initializer()) checkpoint = tf.train.latest_checkpoint(ckpt_path) if enable_ema: ema = tf.train.ExponentialMovingAverage(decay=0.0) ema_vars = utils.get_ema_vars() var_dict = ema.variables_to_restore(ema_vars) ema_assign_op = ema.apply(ema_vars) else: var_dict = utils.get_ema_vars() ema_assign_op = None tf.train.get_or_create_global_step() sess.run(tf.global_variables_initializer()) saver = tf.train.Saver(var_dict, max_to_keep=1) saver.restore(sess, checkpoint) if export_ckpt: print('export model to {}'.format(export_ckpt)) if ema_assign_op is not None: sess.run(ema_assign_op) saver = tf.train.Saver(max_to_keep=1, save_relative_paths=True) saver.save(sess, export_ckpt)
def tf1_export_ema_ckpt(): """Restore variables from a given checkpoint.""" with tf1.Session() as sess: model = effnetv2_model.EffNetV2Model(FLAGS.model_name, FLAGS.hparam_str) batch_size = FLAGS.batch_size isize = FLAGS.image_size or model.cfg.eval.isize inputs = tf.ones((batch_size, isize, isize, 3), tf.float32) _ = model(inputs, training=False) sess.run(tf1.global_variables_initializer()) if tf.io.gfile.isdir(FLAGS.model_dir): ckpt_path = tf1.train.latest_checkpoint(FLAGS.model_dir) else: ckpt_path = FLAGS.model_dir ema = tf1.train.ExponentialMovingAverage(decay=0.0) ema_vars = utils.get_ema_vars() var_dict = ema.variables_to_restore(ema_vars) ema_assign_op = ema.apply(ema_vars) tf1.train.get_or_create_global_step() sess.run(tf1.global_variables_initializer()) saver = tf1.train.Saver(var_dict, max_to_keep=1) # Restore all variables from ckpt. saver.restore(sess, ckpt_path) print('export model to {}'.format(FLAGS.export_dir)) sess.run(ema_assign_op) saver = tf1.train.Saver(max_to_keep=1, save_relative_paths=True) saver.save(sess, FLAGS.export_dir)
def norm(x, scope, axis=[-1]): with tf.variable_scope(scope): n_state = shape_list(x)[-1] g = tf.get_variable("g", [n_state], initializer=tf.constant_initializer(1)) b = tf.get_variable("b", [n_state], initializer=tf.constant_initializer(0)) g, b = get_ema_vars(g, b) return _norm(x, g, b, axis=axis)
def restore_model(self, sess, ckpt_path, enable_ema=True, export_ckpt=None): """Restore variables from a given checkpoint.""" sess.run(tf.global_variables_initializer()) checkpoint = tf.train.latest_checkpoint(ckpt_path) if enable_ema: ema = tf.train.ExponentialMovingAverage(decay=0.0) ema_vars = utils.get_ema_vars() var_dict = ema.variables_to_restore(ema_vars) ema_assign_op = ema.apply(ema_vars) else: var_dict = utils.get_ema_vars() ema_assign_op = None tf.train.get_or_create_global_step() sess.run(tf.global_variables_initializer()) saver = tf.train.Saver(var_dict, max_to_keep=1) saver.restore(sess, checkpoint) global_variables = tf.global_variables() weights = dict() for variable in global_variables: try: if 'Exponential' in variable.name: continue weights[variable.name] = variable.eval() except: print( f"Skipping variable {variable.name}, an exception occurred" ) import pickle pickle.dump(weights, open(f'checkpoints/{self.model_name}_weights.pkl', 'wb')) if export_ckpt: print('export model to {}'.format(export_ckpt)) if ema_assign_op is not None: sess.run(ema_assign_op) saver = tf.train.Saver(max_to_keep=1, save_relative_paths=True) saver.save(sess, export_ckpt)
def restore_ckpt(sess, ckpt_path, ema_decay=0.9998, export_ckpt=None): """Restore variables from a given checkpoint. Args: sess: a tf session for restoring or exporting models. ckpt_path: the path of the checkpoint. Can be a file path or a folder path. ema_decay: ema decay rate. If None or zero or negative value, disable ema. export_ckpt: whether to export the restored model. """ sess.run(tf.global_variables_initializer()) if tf.io.gfile.isdir(ckpt_path): ckpt_path = tf.train.latest_checkpoint(ckpt_path) if ema_decay > 0: ema = tf.train.ExponentialMovingAverage(decay=0.0) ema_vars = utils.get_ema_vars() var_dict = ema.variables_to_restore(ema_vars) ema_assign_op = ema.apply(ema_vars) else: var_dict = utils.get_ema_vars() ema_assign_op = None tf.train.get_or_create_global_step() sess.run(tf.global_variables_initializer()) saver = tf.train.Saver(var_dict, max_to_keep=1) if ckpt_path == '_': logging.info('Running test: do not load any ckpt.') return # Restore all variables from ckpt. saver.restore(sess, ckpt_path) if export_ckpt: print('export model to {}'.format(export_ckpt)) if ema_assign_op is not None: sess.run(ema_assign_op) saver = tf.train.Saver(max_to_keep=1, save_relative_paths=True) saver.save(sess, export_ckpt)
def get_pretrained_variables_to_restore(checkpoint_path, load_moving_average=False): """Gets veriables_to_restore mapping from pretrained checkpoint. Args: checkpoint_path: String. Path of checkpoint. load_moving_average: Boolean, whether load moving average variables to replace variables. Returns: Mapping of variables to restore. """ checkpoint_reader = tf.train.load_checkpoint(checkpoint_path) variable_shape_map = checkpoint_reader.get_variable_to_shape_map() variables_to_restore = {} ema_vars = utils.get_ema_vars() for v in tf.global_variables(): # Skip variables if they are in excluded scopes. is_excluded = False for scope in ['global_step', 'ExponentialMovingAverage']: if scope in v.op.name: is_excluded = True break if is_excluded: tf.logging.info('Exclude [%s] from loading from checkpoint.', v.op.name) continue variable_name_ckpt = v.op.name if load_moving_average and v in ema_vars: # To load moving average variables into non-moving version for # fine-tuning, maps variables here manually. variable_name_ckpt = v.op.name + '/ExponentialMovingAverage' if variable_name_ckpt not in variable_shape_map: tf.logging.info( 'Skip init [%s] from [%s] as it is not in the checkpoint', v.op.name, variable_name_ckpt) continue variables_to_restore[variable_name_ckpt] = v tf.logging.info('Init variable [%s] from [%s] in ckpt', v.op.name, variable_name_ckpt) return variables_to_restore
def restore_model(self, sess, ckpt_dir, enable_ema=True, export_ckpt=None): """Restore variables from checkpoint dir.""" sess.run(tf.global_variables_initializer()) checkpoint = tf.train.latest_checkpoint(ckpt_dir) if enable_ema: ema = tf.train.ExponentialMovingAverage(decay=0.0) ema_vars = utils.get_ema_vars() var_dict = ema.variables_to_restore(ema_vars) ema_assign_op = ema.apply(ema_vars) else: var_dict = None ema_assign_op = None sess.run(tf.global_variables_initializer()) saver = tf.train.Saver(var_dict, max_to_keep=1) saver.restore(sess, checkpoint) if export_ckpt: if ema_assign_op is not None: sess.run(ema_assign_op) saver = tf.train.Saver(max_to_keep=1) saver.save(sess, export_ckpt)
def model_fn(features, labels, mode, params): """The model_fn to be used with TPUEstimator. Args: features: `Tensor` of batched images. labels: `Tensor` of one hot labels for the data samples mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}` params: `dict` of parameters passed to the model from the TPUEstimator, `params['batch_size']` is always provided and should be used as the effective batch size. Returns: A `TPUEstimatorSpec` for the model """ if isinstance(features, dict): features = features['feature'] # In most cases, the default data format NCHW instead of NHWC should be # used for a significant performance boost on GPU. NHWC should be used # only if the network needs to be run on CPU since the pooling operations # are only supported on NHWC. TPU uses XLA compiler to figure out best layout. if FLAGS.data_format == 'channels_first': assert not FLAGS.transpose_input # channels_first only for GPU features = tf.transpose(features, [0, 3, 1, 2]) stats_shape = [3, 1, 1] else: stats_shape = [1, 1, 3] if FLAGS.transpose_input and mode != tf.estimator.ModeKeys.PREDICT: features = tf.transpose(features, [3, 0, 1, 2]) # HWCN to NHWC is_training = (mode == tf.estimator.ModeKeys.TRAIN) has_moving_average_decay = (FLAGS.moving_average_decay > 0) # This is essential, if using a keras-derived model. tf.keras.backend.set_learning_phase(is_training) logging.info('Using open-source implementation.') override_params = {} if FLAGS.batch_norm_momentum is not None: override_params['batch_norm_momentum'] = FLAGS.batch_norm_momentum if FLAGS.batch_norm_epsilon is not None: override_params['batch_norm_epsilon'] = FLAGS.batch_norm_epsilon if FLAGS.dropout_rate is not None: override_params['dropout_rate'] = FLAGS.dropout_rate if FLAGS.survival_prob is not None: override_params['survival_prob'] = FLAGS.survival_prob if FLAGS.data_format: override_params['data_format'] = FLAGS.data_format if FLAGS.num_label_classes: override_params['num_classes'] = FLAGS.num_label_classes if FLAGS.depth_coefficient: override_params['depth_coefficient'] = FLAGS.depth_coefficient if FLAGS.width_coefficient: override_params['width_coefficient'] = FLAGS.width_coefficient def normalize_features(features, mean_rgb, stddev_rgb): """Normalize the image given the means and stddevs.""" features -= tf.constant(mean_rgb, shape=stats_shape, dtype=features.dtype) features /= tf.constant(stddev_rgb, shape=stats_shape, dtype=features.dtype) return features def build_model(): """Build model using the model_name given through the command line.""" model_builder = model_builder_factory.get_model_builder( FLAGS.model_name) normalized_features = normalize_features(features, model_builder.MEAN_RGB, model_builder.STDDEV_RGB) logits, _ = model_builder.build_model(normalized_features, model_name=FLAGS.model_name, training=is_training, override_params=override_params, model_dir=FLAGS.model_dir) return logits if params['use_bfloat16']: with tf.tpu.bfloat16_scope(): logits = tf.cast(build_model(), tf.float32) else: logits = build_model() if mode == tf.estimator.ModeKeys.PREDICT: predictions = { 'classes': tf.argmax(logits, axis=1), 'probabilities': tf.nn.softmax(logits, name='softmax_tensor') } return tf.estimator.EstimatorSpec( mode=mode, predictions=predictions, export_outputs={ 'classify': tf.estimator.export.PredictOutput(predictions) }) # If necessary, in the model_fn, use params['batch_size'] instead the batch # size flags (--train_batch_size or --eval_batch_size). batch_size = params['batch_size'] # pylint: disable=unused-variable # Calculate loss, which includes softmax cross entropy and L2 regularization. cross_entropy = tf.losses.softmax_cross_entropy( logits=logits, onehot_labels=labels, label_smoothing=FLAGS.label_smoothing) # Add weight decay to the loss for non-batch-normalization variables. loss = cross_entropy + FLAGS.weight_decay * tf.add_n([ tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'batch_normalization' not in v.name ]) global_step = tf.train.get_global_step() if has_moving_average_decay: ema = tf.train.ExponentialMovingAverage( decay=FLAGS.moving_average_decay, num_updates=global_step) ema_vars = utils.get_ema_vars() host_call = None restore_vars_dict = None if is_training: # Compute the current epoch and associated learning rate from global_step. current_epoch = (tf.cast(global_step, tf.float32) / params['steps_per_epoch']) scaled_lr = FLAGS.base_learning_rate * (FLAGS.train_batch_size / 256.0) logging.info('base_learning_rate = %f', FLAGS.base_learning_rate) learning_rate = utils.build_learning_rate( scaled_lr, global_step, params['steps_per_epoch'], decay_epochs=FLAGS.lr_decay_epoch) optimizer = utils.build_optimizer(learning_rate) if FLAGS.use_tpu: # When using TPU, wrap the optimizer with CrossShardOptimizer which # handles synchronization details between different TPU cores. To the # user, this should look like regular synchronous training. optimizer = tf.tpu.CrossShardOptimizer(optimizer) # Batch normalization requires UPDATE_OPS to be added as a dependency to # the train operation. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): train_op = optimizer.minimize(loss, global_step) if has_moving_average_decay: with tf.control_dependencies([train_op]): train_op = ema.apply(ema_vars) if not FLAGS.skip_host_call: def host_call_fn(gs, lr, ce): """Training host call. Creates scalar summaries for training metrics. This function is executed on the CPU and should not directly reference any Tensors in the rest of the `model_fn`. To pass Tensors from the model to the `metric_fn`, provide as part of the `host_call`. See https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec for more information. Arguments should match the list of `Tensor` objects passed as the second element in the tuple passed to `host_call`. Args: gs: `Tensor with shape `[batch]` for the global_step lr: `Tensor` with shape `[batch]` for the learning_rate. ce: `Tensor` with shape `[batch]` for the current_epoch. Returns: List of summary ops to run on the CPU host. """ gs = gs[0] # Host call fns are executed FLAGS.iterations_per_loop times after one # TPU loop is finished, setting max_queue value to the same as number of # iterations will make the summary writer only flush the data to storage # once per loop. with tf2.summary.create_file_writer( FLAGS.model_dir, max_queue=FLAGS.iterations_per_loop).as_default(): with tf2.summary.record_if(True): tf2.summary.scalar('learning_rate', lr[0], step=gs) tf2.summary.scalar('current_epoch', ce[0], step=gs) return tf.summary.all_v2_summary_ops() # To log the loss, current learning rate, and epoch for Tensorboard, the # summary op needs to be run on the host CPU via host_call. host_call # expects [batch_size, ...] Tensors, thus reshape to introduce a batch # dimension. These Tensors are implicitly concatenated to # [params['batch_size']]. gs_t = tf.reshape(global_step, [1]) lr_t = tf.reshape(learning_rate, [1]) ce_t = tf.reshape(current_epoch, [1]) host_call = (host_call_fn, [gs_t, lr_t, ce_t]) else: train_op = None if has_moving_average_decay: # Load moving average variables for eval. restore_vars_dict = ema.variables_to_restore(ema_vars) eval_metrics = None if mode == tf.estimator.ModeKeys.EVAL: def metric_fn(labels, logits): """Evaluation metric function. Evaluates accuracy. This function is executed on the CPU and should not directly reference any Tensors in the rest of the `model_fn`. To pass Tensors from the model to the `metric_fn`, provide as part of the `eval_metrics`. See https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec for more information. Arguments should match the list of `Tensor` objects passed as the second element in the tuple passed to `eval_metrics`. Args: labels: `Tensor` with shape `[batch, num_classes]`. logits: `Tensor` with shape `[batch, num_classes]`. Returns: A dict of the metrics to return from evaluation. """ labels = tf.argmax(labels, axis=1) predictions = tf.argmax(logits, axis=1) top_1_accuracy = tf.metrics.accuracy(labels, predictions) in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32) top_5_accuracy = tf.metrics.mean(in_top_5) return { 'top_1_accuracy': top_1_accuracy, 'top_5_accuracy': top_5_accuracy, } eval_metrics = (metric_fn, [labels, logits]) num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()]) logging.info('number of trainable parameters: %d', num_params) def _scaffold_fn(): saver = tf.train.Saver(restore_vars_dict) return tf.train.Scaffold(saver=saver) if has_moving_average_decay and not is_training: # Only apply scaffold for eval jobs. scaffold_fn = _scaffold_fn else: scaffold_fn = None return tf.estimator.tpu.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op, host_call=host_call, eval_metrics=eval_metrics, scaffold_fn=scaffold_fn)
def model_fn(features, labels, mode, params=None): """The model_fn to be used with TPUEstimator. Args: features: `Tensor` of batched images. labels: `Tensor` of one hot labels for the data samples mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}` Returns: A `TPUEstimatorSpec` for the model """ if isinstance(features, dict): features = features["feature"] # In most cases, the default data format NCHW instead of NHWC should be # used for a significant performance boost on GPU. NHWC should be used # only if the network needs to be run on CPU since the pooling operations # are only supported on NHWC. TPU uses XLA compiler to figure out best layout. if context.get_hparam("data_format") == "channels_first": assert not context.get_hparam("transpose_input") # channels_first only for GPU features = tf.transpose(features, [0, 3, 1, 2]) stats_shape = [3, 1, 1] else: stats_shape = [1, 1, 3] #if context.get_hparam("transpose_input") and mode != tf.estimator.ModeKeys.PREDICT: # features = tf.transpose(features, [3, 0, 1, 2]) # HWCN to NHWC is_training = mode == tf.estimator.ModeKeys.TRAIN has_moving_average_decay = context.get_hparam("moving_average_decay") > 0 # This is essential, if using a keras-derived model. tf.keras.backend.set_learning_phase(is_training) logging.info("Using open-source implementation.") override_params = {} #if context.get_hparam("batch_norm_momentum") is not None: # override_params["batch_norm_momentum"] = context.get_hparam("batch_norm_momentum") #if context.get_hparam("batch_norm_epsilon") is not None: # override_params["batch_norm_epsilon"] = context.get_hparam("batch_norm_epsilon") # if context.get_hparam("dropout_rate") is not None: # override_params["dropout_rate"] = context.get_hparam("dropout_rate") # if context.get_hparam("survival_prob") is not None: # override_params["survival_prob"] = context.get_hparam("survival_prob") # if context.get_hparam("data_format"): # override_params["data_format"] = context.get_hparam("data_format") # if context.get_hparam("num_label_classes"): # override_params["num_classes"] = context.get_hparam("num_label_classes") # if context.get_hparam("depth_coefficient"): # override_params["depth_coefficient"] = context.get_hparam("depth_coefficient") # if context.get_hparam("width_coefficient"): # override_params["width_coefficient"] = context.get_hparam("width_coefficient") def normalize_features(features, mean_rgb, stddev_rgb): """Normalize the image given the means and stddevs.""" features -= tf.constant(mean_rgb, shape=stats_shape, dtype=features.dtype) features /= tf.constant(stddev_rgb, shape=stats_shape, dtype=features.dtype) return features def build_model(): """Build model using the model_name given through the command line.""" model_builder = model_builder_factory.get_model_builder( context.get_hparam("model_name"), ) normalized_features = normalize_features( features, model_builder.MEAN_RGB, model_builder.STDDEV_RGB ) logits, _ = model_builder.build_model( normalized_features, model_name=context.get_hparam("model_name"), training=is_training, override_params=override_params, #model_dir=context.get_hparam("model_dir"), ) return logits logits = build_model() # Calculate loss, which includes softmax cross entropy and L2 regularization. cross_entropy = tf.losses.softmax_cross_entropy( logits=logits, onehot_labels=labels, label_smoothing=context.get_hparam("label_smoothing") ) # Add weight decay to the loss for non-batch-normalization variables. loss = cross_entropy + context.get_hparam("weight_decay") * tf.add_n( [ tf.nn.l2_loss(v) for v in tf.trainable_variables() if "batch_normalization" not in v.name ] ) global_step = tf.train.get_global_step() if has_moving_average_decay: ema = tf.train.ExponentialMovingAverage( decay=context.get_hparam("moving_average_decay"), num_updates=global_step ) ema_vars = utils.get_ema_vars() restore_vars_dict = None train_op = None if is_training: # Compute the current epoch and associated learning rate from global_step. current_epoch = tf.cast(global_step, tf.float32) / context.get_hparam("steps_per_epoch") scaled_lr = context.get_hparam("base_learning_rate") * (context.get_hparam("train_batch_size") / 256.0) logging.info("base_learning_rate = %f", context.get_hparam("base_learning_rate")) learning_rate = utils.build_learning_rate( scaled_lr, global_step, context.get_hparam("steps_per_epoch"), ) optimizer = utils.build_optimizer(context, learning_rate) # Batch normalization requires UPDATE_OPS to be added as a dependency to # the train operation. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): train_op = optimizer.minimize(loss, global_step) if has_moving_average_decay: with tf.control_dependencies([train_op]): train_op = ema.apply(ema_vars) if has_moving_average_decay: # Load moving average variables for eval. restore_vars_dict = ema.variables_to_restore(ema_vars) eval_metrics = None if mode == tf.estimator.ModeKeys.EVAL: def metric_fn(labels, logits): """Evaluation metric function. Evaluates accuracy. This function is executed on the CPU and should not directly reference any Tensors in the rest of the `model_fn`. To pass Tensors from the model to the `metric_fn`, provide as part of the `eval_metrics`. See https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec for more information. Arguments should match the list of `Tensor` objects passed as the second element in the tuple passed to `eval_metrics`. Args: labels: `Tensor` with shape `[batch, num_classes]`. logits: `Tensor` with shape `[batch, num_classes]`. Returns: A dict of the metrics to return from evaluation. """ labels = tf.argmax(labels, axis=1) predictions = tf.argmax(logits, axis=1) top_1_accuracy = tf.metrics.accuracy(labels, predictions) in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32) top_5_accuracy = tf.metrics.mean(in_top_5) return { "top_1_accuracy": top_1_accuracy, "top_5_accuracy": top_5_accuracy, } eval_metrics = metric_fn(labels, logits) num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()]) logging.info("number of trainable parameters: %d", num_params) return tf.estimator.EstimatorSpec( mode=mode, loss=loss, train_op=train_op, eval_metric_ops=eval_metrics, )
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None): """Model definition entry. Args: features: the input image tensor with shape [batch_size, height, width, 3]. The height and width are fixed and equal. labels: the input labels in a dictionary. The labels include class targets and box targets which are dense label maps. The labels are generated from get_input_fn function in data/dataloader.py mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT. params: the dictionary defines hyperparameters of model. The default settings are in default_hparams function in this file. model: the model outputs class logits and box regression outputs. variable_filter_fn: the filter function that takes trainable_variables and returns the variable list after applying the filter rule. Returns: tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction. Raises: RuntimeError: if both ckpt and backbone_ckpt are set. """ utils.image('input_image', features) training_hooks = [] if params['data_format'] == 'channels_first': features = tf.transpose(features, [0, 3, 1, 2]) def _model_outputs(inputs): # Convert params (dict) to Config for easier access. return model(inputs, config=hparams_config.Config(params)) cls_outputs, box_outputs = utils.build_model_with_precision( params['precision'], _model_outputs, features, params['is_training_bn']) levels = cls_outputs.keys() for level in levels: cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32) box_outputs[level] = tf.cast(box_outputs[level], tf.float32) # First check if it is in PREDICT mode. if mode == tf.estimator.ModeKeys.PREDICT: predictions = { 'image': features, } for level in levels: predictions['cls_outputs_%d' % level] = cls_outputs[level] predictions['box_outputs_%d' % level] = box_outputs[level] return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) # Set up training loss and learning rate. update_learning_rate_schedule_parameters(params) global_step = tf.train.get_or_create_global_step() learning_rate = learning_rate_schedule(params, global_step) # cls_loss and box_loss are for logging. only total_loss is optimized. det_loss, cls_loss, box_loss, box_iou_loss = detection_loss( cls_outputs, box_outputs, labels, params) reg_l2loss = reg_l2_loss(params['weight_decay']) total_loss = det_loss + reg_l2loss if mode == tf.estimator.ModeKeys.TRAIN: utils.scalar('lrn_rate', learning_rate) utils.scalar('trainloss/cls_loss', cls_loss) utils.scalar('trainloss/box_loss', box_loss) utils.scalar('trainloss/det_loss', det_loss) utils.scalar('trainloss/reg_l2_loss', reg_l2loss) utils.scalar('trainloss/loss', total_loss) if box_iou_loss: utils.scalar('trainloss/box_iou_loss', box_iou_loss) moving_average_decay = params['moving_average_decay'] if moving_average_decay: ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay, num_updates=global_step) ema_vars = utils.get_ema_vars() if params['strategy'] == 'horovod': import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top learning_rate = learning_rate * hvd.size() if mode == tf.estimator.ModeKeys.TRAIN: if params['optimizer'].lower() == 'sgd': optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=params['momentum']) elif params['optimizer'].lower() == 'adam': optimizer = tf.train.AdamOptimizer(learning_rate) else: raise ValueError('optimizers should be adam or sgd') if params['strategy'] == 'tpu': optimizer = tf.tpu.CrossShardOptimizer(optimizer) elif params['strategy'] == 'horovod': optimizer = hvd.DistributedOptimizer(optimizer) training_hooks = [hvd.BroadcastGlobalVariablesHook(0)] # Batch norm requires update_ops to be added as a train_op dependency. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) var_list = tf.trainable_variables() if variable_filter_fn: var_list = variable_filter_fn(var_list) if params.get('clip_gradients_norm', 0) > 0: logging.info('clip gradients norm by %f', params['clip_gradients_norm']) grads_and_vars = optimizer.compute_gradients(total_loss, var_list) with tf.name_scope('clip'): grads = [gv[0] for gv in grads_and_vars] tvars = [gv[1] for gv in grads_and_vars] clipped_grads, gnorm = tf.clip_by_global_norm( grads, params['clip_gradients_norm']) utils.scalar('gnorm', gnorm) grads_and_vars = list(zip(clipped_grads, tvars)) with tf.control_dependencies(update_ops): train_op = optimizer.apply_gradients(grads_and_vars, global_step) else: with tf.control_dependencies(update_ops): train_op = optimizer.minimize(total_loss, global_step, var_list=var_list) if moving_average_decay: with tf.control_dependencies([train_op]): train_op = ema.apply(ema_vars) else: train_op = None eval_metrics = None if mode == tf.estimator.ModeKeys.EVAL: def metric_fn(**kwargs): """Returns a dictionary that has the evaluation metrics.""" batch_size = params['batch_size'] if params['strategy'] == 'tpu': batch_size = params['batch_size'] * params['num_shards'] eval_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_labeler = anchors.AnchorLabeler(eval_anchors, params['num_classes']) cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat']) box_loss = tf.metrics.mean(kwargs['box_loss_repeat']) if params.get('testdev_dir', None): logging.info('Eval testdev_dir %s', params['testdev_dir']) coco_metrics = coco_metric_fn( batch_size, anchor_labeler, params['val_json_file'], testdev_dir=params['testdev_dir'], disable_pyfun=params.get('disable_pyfun', None), **kwargs) else: logging.info('Eval val with groudtruths %s.', params['val_json_file']) coco_metrics = coco_metric_fn(batch_size, anchor_labeler, params['val_json_file'], **kwargs) # Add metrics to output. output_metrics = { 'cls_loss': cls_loss, 'box_loss': box_loss, } output_metrics.update(coco_metrics) return output_metrics cls_loss_repeat = tf.reshape( tf.tile(tf.expand_dims(cls_loss, 0), [ params['batch_size'], ]), [params['batch_size'], 1]) box_loss_repeat = tf.reshape( tf.tile(tf.expand_dims(box_loss, 0), [ params['batch_size'], ]), [params['batch_size'], 1]) metric_fn_inputs = { 'cls_loss_repeat': cls_loss_repeat, 'box_loss_repeat': box_loss_repeat, 'source_ids': labels['source_ids'], 'groundtruth_data': labels['groundtruth_data'], 'image_scales': labels['image_scales'], } add_metric_fn_inputs(params, cls_outputs, box_outputs, metric_fn_inputs) eval_metrics = (metric_fn, metric_fn_inputs) checkpoint = params.get('ckpt') or params.get('backbone_ckpt') if checkpoint and mode == tf.estimator.ModeKeys.TRAIN: # Initialize the model from an EfficientDet or backbone checkpoint. if params.get('ckpt') and params.get('backbone_ckpt'): raise RuntimeError( '--backbone_ckpt and --checkpoint are mutually exclusive') if params.get('backbone_ckpt'): var_scope = params['backbone_name'] + '/' if params['ckpt_var_scope'] is None: # Use backbone name as default checkpoint scope. ckpt_scope = params['backbone_name'] + '/' else: ckpt_scope = params['ckpt_var_scope'] + '/' else: # Load every var in the given checkpoint var_scope = ckpt_scope = '/' def scaffold_fn(): """Loads pretrained model through scaffold function.""" logging.info('restore variables from %s', checkpoint) var_map = utils.get_ckpt_var_map(ckpt_path=checkpoint, ckpt_scope=ckpt_scope, var_scope=var_scope, var_exclude_expr=params.get( 'var_exclude_expr', None)) tf.train.init_from_checkpoint(checkpoint, var_map) return tf.train.Scaffold() elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay: def scaffold_fn(): """Load moving average variables for eval.""" logging.info('Load EMA vars with ema_decay=%f', moving_average_decay) restore_vars_dict = ema.variables_to_restore(ema_vars) saver = tf.train.Saver(restore_vars_dict) return tf.train.Scaffold(saver=saver) else: scaffold_fn = None if params['strategy'] != 'tpu': # Profile every 1K steps. profile_hook = tf.train.ProfilerHook(save_steps=1000, output_dir=params['model_dir']) training_hooks.append(profile_hook) # Report memory allocation if OOM class OomReportingHook(tf.estimator.SessionRunHook): def before_run(self, run_context): return tf.estimator.SessionRunArgs( fetches=[], options=tf.RunOptions( report_tensor_allocations_upon_oom=True)) training_hooks.append(OomReportingHook()) return tf.estimator.tpu.TPUEstimatorSpec(mode=mode, loss=total_loss, train_op=train_op, eval_metrics=eval_metrics, host_call=utils.get_tpu_host_call( global_step, params), scaffold_fn=scaffold_fn, training_hooks=training_hooks)
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None): """Model definition entry. Args: features: the input image tensor with shape [batch_size, height, width, 3]. The height and width are fixed and equal. labels: the input labels in a dictionary. The labels include class targets and box targets which are dense label maps. The labels are generated from get_input_fn function in data/dataloader.py mode: the mode of TPUEstimator including TRAIN and EVAL. params: the dictionary defines hyperparameters of model. The default settings are in default_hparams function in this file. model: the model outputs class logits and box regression outputs. variable_filter_fn: the filter function that takes trainable_variables and returns the variable list after applying the filter rule. Returns: tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction. Raises: RuntimeError: if both ckpt and backbone_ckpt are set. """ is_tpu = params['strategy'] == 'tpu' if params['img_summary_steps']: utils.image('input_image', features, is_tpu) training_hooks = [] params['is_training_bn'] = (mode == tf.estimator.ModeKeys.TRAIN) if params['use_keras_model']: def model_fn(inputs): model = efficientdet_keras.EfficientDetNet( config=hparams_config.Config(params)) cls_out_list, box_out_list = model(inputs, params['is_training_bn']) cls_outputs, box_outputs = {}, {} for i in range(params['min_level'], params['max_level'] + 1): cls_outputs[i] = cls_out_list[i - params['min_level']] box_outputs[i] = box_out_list[i - params['min_level']] return cls_outputs, box_outputs else: model_fn = functools.partial(model, config=hparams_config.Config(params)) precision = utils.get_precision(params['strategy'], params['mixed_precision']) cls_outputs, box_outputs = utils.build_model_with_precision( precision, model_fn, features, params['is_training_bn']) levels = cls_outputs.keys() for level in levels: cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32) box_outputs[level] = tf.cast(box_outputs[level], tf.float32) # Set up training loss and learning rate. update_learning_rate_schedule_parameters(params) global_step = tf.train.get_or_create_global_step() learning_rate = learning_rate_schedule(params, global_step) # cls_loss and box_loss are for logging. only total_loss is optimized. det_loss, cls_loss, box_loss = detection_loss(cls_outputs, box_outputs, labels, params) reg_l2loss = reg_l2_loss(params['weight_decay']) total_loss = det_loss + reg_l2loss if mode == tf.estimator.ModeKeys.TRAIN: utils.scalar('lrn_rate', learning_rate, is_tpu) utils.scalar('trainloss/cls_loss', cls_loss, is_tpu) utils.scalar('trainloss/box_loss', box_loss, is_tpu) utils.scalar('trainloss/det_loss', det_loss, is_tpu) utils.scalar('trainloss/reg_l2_loss', reg_l2loss, is_tpu) utils.scalar('trainloss/loss', total_loss, is_tpu) train_epochs = tf.cast(global_step, tf.float32) / params['steps_per_epoch'] utils.scalar('train_epochs', train_epochs, is_tpu) moving_average_decay = params['moving_average_decay'] if moving_average_decay: ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay, num_updates=global_step) ema_vars = utils.get_ema_vars() if mode == tf.estimator.ModeKeys.TRAIN: if params['optimizer'].lower() == 'sgd': optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=params['momentum']) elif params['optimizer'].lower() == 'adam': optimizer = tf.train.AdamOptimizer(learning_rate) else: raise ValueError('optimizers should be adam or sgd') if is_tpu: optimizer = tf.tpu.CrossShardOptimizer(optimizer) # Batch norm requires update_ops to be added as a train_op dependency. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) var_list = tf.trainable_variables() if variable_filter_fn: var_list = variable_filter_fn(var_list) if params.get('clip_gradients_norm', None): logging.info('clip gradients norm by %f', params['clip_gradients_norm']) grads_and_vars = optimizer.compute_gradients(total_loss, var_list) with tf.name_scope('clip'): grads = [gv[0] for gv in grads_and_vars] tvars = [gv[1] for gv in grads_and_vars] # First clip each variable's norm, then clip global norm. clip_norm = abs(params['clip_gradients_norm']) clipped_grads = [ tf.clip_by_norm(g, clip_norm) if g is not None else None for g in grads ] clipped_grads, _ = tf.clip_by_global_norm( clipped_grads, clip_norm) utils.scalar('gradient_norm', tf.linalg.global_norm(clipped_grads), is_tpu) grads_and_vars = list(zip(clipped_grads, tvars)) with tf.control_dependencies(update_ops): train_op = optimizer.apply_gradients(grads_and_vars, global_step) else: with tf.control_dependencies(update_ops): train_op = optimizer.minimize(total_loss, global_step, var_list=var_list) if moving_average_decay: with tf.control_dependencies([train_op]): train_op = ema.apply(ema_vars) else: train_op = None eval_metrics = None if mode == tf.estimator.ModeKeys.EVAL: def metric_fn(**kwargs): """Returns a dictionary that has the evaluation metrics.""" if params['nms_configs'].get('pyfunc', True): detections_bs = [] nms_configs = params['nms_configs'] for index in range(kwargs['boxes'].shape[0]): detections = tf.numpy_function( functools.partial(nms_np.per_class_nms, nms_configs=nms_configs), [ kwargs['boxes'][index], kwargs['scores'][index], kwargs['classes'][index], tf.slice(kwargs['image_ids'], [index], [1]), tf.slice(kwargs['image_scales'], [index], [1]), params['num_classes'], nms_configs['max_output_size'], ], tf.float32) detections_bs.append(detections) detections_bs = postprocess.transform_detections( tf.stack(detections_bs)) else: # These two branches should be equivalent, but currently they are not. # TODO(tanmingxing): enable the non_pyfun path after bug fix. nms_boxes, nms_scores, nms_classes, _ = postprocess.per_class_nms( params, kwargs['boxes'], kwargs['scores'], kwargs['classes'], kwargs['image_scales']) img_ids = tf.cast(tf.expand_dims(kwargs['image_ids'], -1), nms_scores.dtype) detections_bs = [ img_ids * tf.ones_like(nms_scores), nms_boxes[:, :, 1], nms_boxes[:, :, 0], nms_boxes[:, :, 3] - nms_boxes[:, :, 1], nms_boxes[:, :, 2] - nms_boxes[:, :, 0], nms_scores, nms_classes, ] detections_bs = tf.stack(detections_bs, axis=-1, name='detnections') if params.get('testdev_dir', None): logging.info('Eval testdev_dir %s', params['testdev_dir']) eval_metric = coco_metric.EvaluationMetric( testdev_dir=params['testdev_dir']) coco_metrics = eval_metric.estimator_metric_fn( detections_bs, tf.zeros([1])) else: logging.info('Eval val with groudtruths %s.', params['val_json_file']) eval_metric = coco_metric.EvaluationMetric( filename=params['val_json_file'], label_map=params['label_map']) coco_metrics = eval_metric.estimator_metric_fn( detections_bs, kwargs['groundtruth_data']) # Add metrics to output. cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat']) box_loss = tf.metrics.mean(kwargs['box_loss_repeat']) output_metrics = { 'cls_loss': cls_loss, 'box_loss': box_loss, } output_metrics.update(coco_metrics) return output_metrics cls_loss_repeat = tf.reshape( tf.tile(tf.expand_dims(cls_loss, 0), [ params['batch_size'], ]), [params['batch_size'], 1]) box_loss_repeat = tf.reshape( tf.tile(tf.expand_dims(box_loss, 0), [ params['batch_size'], ]), [params['batch_size'], 1]) cls_outputs = postprocess.to_list(cls_outputs) box_outputs = postprocess.to_list(box_outputs) params['nms_configs']['max_nms_inputs'] = anchors.MAX_DETECTION_POINTS boxes, scores, classes = postprocess.pre_nms(params, cls_outputs, box_outputs) metric_fn_inputs = { 'cls_loss_repeat': cls_loss_repeat, 'box_loss_repeat': box_loss_repeat, 'image_ids': labels['source_ids'], 'groundtruth_data': labels['groundtruth_data'], 'image_scales': labels['image_scales'], 'boxes': boxes, 'scores': scores, 'classes': classes, } eval_metrics = (metric_fn, metric_fn_inputs) checkpoint = params.get('ckpt') or params.get('backbone_ckpt') if checkpoint and mode == tf.estimator.ModeKeys.TRAIN: # Initialize the model from an EfficientDet or backbone checkpoint. if params.get('ckpt') and params.get('backbone_ckpt'): raise RuntimeError( '--backbone_ckpt and --checkpoint are mutually exclusive') if params.get('backbone_ckpt'): var_scope = params['backbone_name'] + '/' if params['ckpt_var_scope'] is None: # Use backbone name as default checkpoint scope. ckpt_scope = params['backbone_name'] + '/' else: ckpt_scope = params['ckpt_var_scope'] + '/' else: # Load every var in the given checkpoint var_scope = ckpt_scope = '/' def scaffold_fn(): """Loads pretrained model through scaffold function.""" logging.info('restore variables from %s', checkpoint) var_map = utils.get_ckpt_var_map( ckpt_path=checkpoint, ckpt_scope=ckpt_scope, var_scope=var_scope, skip_mismatch=params['skip_mismatch']) tf.train.init_from_checkpoint(checkpoint, var_map) return tf.train.Scaffold() elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay: def scaffold_fn(): """Load moving average variables for eval.""" logging.info('Load EMA vars with ema_decay=%f', moving_average_decay) restore_vars_dict = ema.variables_to_restore(ema_vars) saver = tf.train.Saver(restore_vars_dict) return tf.train.Scaffold(saver=saver) else: scaffold_fn = None if is_tpu: return tf.estimator.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, eval_metrics=eval_metrics, host_call=utils.get_tpu_host_call(global_step, params), scaffold_fn=scaffold_fn, training_hooks=training_hooks) else: # Profile every 1K steps. if params.get('profile', False): profile_hook = tf.estimator.ProfilerHook( save_steps=1000, output_dir=params['model_dir'], show_memory=True) training_hooks.append(profile_hook) # Report memory allocation if OOM; it will slow down the running. class OomReportingHook(tf.estimator.SessionRunHook): def before_run(self, run_context): return tf.estimator.SessionRunArgs( fetches=[], options=tf.RunOptions( report_tensor_allocations_upon_oom=True)) training_hooks.append(OomReportingHook()) logging_hook = tf.estimator.LoggingTensorHook( { 'step': global_step, 'det_loss': det_loss, 'cls_loss': cls_loss, 'box_loss': box_loss, }, every_n_iter=params.get('iterations_per_loop', 100), ) training_hooks.append(logging_hook) eval_metric_ops = (eval_metrics[0]( **eval_metrics[1]) if eval_metrics else None) return tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, eval_metric_ops=eval_metric_ops, scaffold=scaffold_fn() if scaffold_fn else None, training_hooks=training_hooks)
def mnasnet_model_fn(features, labels, mode, params): """The model_fn for MnasNet to be used with TPUEstimator. Args: features: `Tensor` of batched images. labels: `Tensor` of labels for the data samples mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}` params: `dict` of parameters passed to the model from the TPUEstimator, `params['batch_size']` is always provided and should be used as the effective batch size. Returns: A `TPUEstimatorSpec` for the model """ is_training = (mode == tf.estimator.ModeKeys.TRAIN) # This is essential, if using a keras-derived model. K.set_learning_phase(is_training) if isinstance(features, dict): features = features['feature'] if mode == tf.estimator.ModeKeys.PREDICT: # Adds an identify node to help TFLite export. features = tf.identity(features, 'float_image_input') # In most cases, the default data format NCHW instead of NHWC should be # used for a significant performance boost on GPU. NHWC should be used # only if the network needs to be run on CPU since the pooling operations # are only supported on NHWC. TPU uses XLA compiler to figure out best layout. if params['data_format'] == 'channels_first': assert not params['transpose_input'] # channels_first only for GPU features = tf.transpose(features, [0, 3, 1, 2]) stats_shape = [3, 1, 1] else: stats_shape = [1, 1, 3] if params['transpose_input'] and mode != tf.estimator.ModeKeys.PREDICT: features = tf.transpose(features, [3, 0, 1, 2]) # HWCN to NHWC # Normalize the image to zero mean and unit variance. features -= tf.constant(imagenet_input.MEAN_RGB, shape=stats_shape, dtype=features.dtype) features /= tf.constant(imagenet_input.STDDEV_RGB, shape=stats_shape, dtype=features.dtype) has_moving_average_decay = (params['moving_average_decay'] > 0) tf.logging.info('Using open-source implementation for MnasNet definition.') override_params = {} if params['batch_norm_momentum']: override_params['batch_norm_momentum'] = params['batch_norm_momentum'] if params['batch_norm_epsilon']: override_params['batch_norm_epsilon'] = params['batch_norm_epsilon'] if params['dropout_rate']: override_params['dropout_rate'] = params['dropout_rate'] if params['data_format']: override_params['data_format'] = params['data_format'] if params['num_label_classes']: override_params['num_classes'] = params['num_label_classes'] if params['depth_multiplier']: override_params['depth_multiplier'] = params['depth_multiplier'] if params['depth_divisor']: override_params['depth_divisor'] = params['depth_divisor'] if params['min_depth']: override_params['min_depth'] = params['min_depth'] override_params['use_keras'] = params['use_keras'] if params['precision'] == 'bfloat16': with tf.contrib.tpu.bfloat16_scope(): logits, _ = mnasnet_models.build_mnasnet_model( features, model_name=params['model_name'], training=is_training, override_params=override_params) logits = tf.cast(logits, tf.float32) else: # params['precision'] == 'float32' logits, _ = mnasnet_models.build_mnasnet_model( features, model_name=params['model_name'], training=is_training, override_params=override_params) if params['quantized_training']: if is_training: tf.logging.info('Adding fake quantization ops for training.') tf.contrib.quantize.create_training_graph( quant_delay=int(params['steps_per_epoch'] * FLAGS.quantization_delay_epochs)) else: tf.logging.info('Adding fake quantization ops for evaluation.') tf.contrib.quantize.create_eval_graph() if mode == tf.estimator.ModeKeys.PREDICT: scaffold_fn = None if FLAGS.export_moving_average: # If the model is trained with moving average decay, to match evaluation # metrics, we need to export the model using moving average variables. restore_checkpoint = tf.train.latest_checkpoint(FLAGS.model_dir) variables_to_restore = get_pretrained_variables_to_restore( restore_checkpoint, load_moving_average=True) tf.logging.info('Restoring from the latest checkpoint: %s', restore_checkpoint) tf.logging.info(str(variables_to_restore)) def restore_scaffold(): saver = tf.train.Saver(variables_to_restore) return tf.train.Scaffold(saver=saver) scaffold_fn = restore_scaffold predictions = { 'classes': tf.argmax(logits, axis=1), 'probabilities': tf.nn.softmax(logits, name='softmax_tensor') } return tf.contrib.tpu.TPUEstimatorSpec( mode=mode, predictions=predictions, export_outputs={ 'classify': tf.estimator.export.PredictOutput(predictions) }, scaffold_fn=scaffold_fn) # If necessary, in the model_fn, use params['batch_size'] instead the batch # size flags (--train_batch_size or --eval_batch_size). batch_size = params['batch_size'] # pylint: disable=unused-variable # Calculate loss, which includes softmax cross entropy and L2 regularization. one_hot_labels = tf.one_hot(labels, params['num_label_classes']) cross_entropy = tf.losses.softmax_cross_entropy( logits=logits, onehot_labels=one_hot_labels, label_smoothing=params['label_smoothing']) # Add weight decay to the loss for non-batch-normalization variables. loss = cross_entropy + params['weight_decay'] * tf.add_n([ tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'batch_normalization' not in v.name ]) global_step = tf.train.get_global_step() if has_moving_average_decay: ema = tf.train.ExponentialMovingAverage( decay=params['moving_average_decay'], num_updates=global_step) ema_vars = utils.get_ema_vars() host_call = None if is_training: # Compute the current epoch and associated learning rate from global_step. current_epoch = (tf.cast(global_step, tf.float32) / params['steps_per_epoch']) scaled_lr = params['base_learning_rate'] * (params['train_batch_size'] / 256.0) # pylint: disable=line-too-long learning_rate = utils.build_learning_rate(scaled_lr, global_step, params['steps_per_epoch']) optimizer = utils.build_optimizer(learning_rate) if params['use_tpu']: # When using TPU, wrap the optimizer with CrossShardOptimizer which # handles synchronization details between different TPU cores. To the # user, this should look like regular synchronous training. optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) # Batch normalization requires UPDATE_OPS to be added as a dependency to # the train operation. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): train_op = optimizer.minimize(loss, global_step) if has_moving_average_decay: with tf.control_dependencies([train_op]): train_op = ema.apply(ema_vars) if not params['skip_host_call']: def host_call_fn(gs, loss, lr, ce): """Training host call. Creates scalar summaries for training metrics. This function is executed on the CPU and should not directly reference any Tensors in the rest of the `model_fn`. To pass Tensors from the model to the `metric_fn`, provide as part of the `host_call`. See https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec for more information. Arguments should match the list of `Tensor` objects passed as the second element in the tuple passed to `host_call`. Args: gs: `Tensor with shape `[batch]` for the global_step loss: `Tensor` with shape `[batch]` for the training loss. lr: `Tensor` with shape `[batch]` for the learning_rate. ce: `Tensor` with shape `[batch]` for the current_epoch. Returns: List of summary ops to run on the CPU host. """ gs = gs[0] # Host call fns are executed params['iterations_per_loop'] times after # one TPU loop is finished, setting max_queue value to the same as # number of iterations will make the summary writer only flush the # data to storage once per loop. with tf.contrib.summary.create_file_writer( FLAGS.model_dir, max_queue=params['iterations_per_loop']).as_default(): with tf.contrib.summary.always_record_summaries(): tf.contrib.summary.scalar('loss', loss[0], step=gs) tf.contrib.summary.scalar('learning_rate', lr[0], step=gs) tf.contrib.summary.scalar('current_epoch', ce[0], step=gs) return tf.contrib.summary.all_summary_ops() # To log the loss, current learning rate, and epoch for Tensorboard, the # summary op needs to be run on the host CPU via host_call. host_call # expects [batch_size, ...] Tensors, thus reshape to introduce a batch # dimension. These Tensors are implicitly concatenated to # [params['batch_size']]. gs_t = tf.reshape(global_step, [1]) loss_t = tf.reshape(loss, [1]) lr_t = tf.reshape(learning_rate, [1]) ce_t = tf.reshape(current_epoch, [1]) host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t]) else: train_op = None eval_metrics = None if mode == tf.estimator.ModeKeys.EVAL: def metric_fn(labels, logits): """Evaluation metric function. Evaluates accuracy. This function is executed on the CPU and should not directly reference any Tensors in the rest of the `model_fn`. To pass Tensors from the model to the `metric_fn`, provide as part of the `eval_metrics`. See https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec for more information. Arguments should match the list of `Tensor` objects passed as the second element in the tuple passed to `eval_metrics`. Args: labels: `Tensor` with shape `[batch]`. logits: `Tensor` with shape `[batch, num_classes]`. Returns: A dict of the metrics to return from evaluation. """ predictions = tf.argmax(logits, axis=1) top_1_accuracy = tf.metrics.accuracy(labels, predictions) in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32) top_5_accuracy = tf.metrics.mean(in_top_5) return { 'top_1_accuracy': top_1_accuracy, 'top_5_accuracy': top_5_accuracy, } eval_metrics = (metric_fn, [labels, logits]) num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()]) tf.logging.info('number of trainable parameters: {}'.format(num_params)) # Prepares scaffold_fn if needed. scaffold_fn = None if is_training and FLAGS.init_checkpoint: variables_to_restore = get_pretrained_variables_to_restore( FLAGS.init_checkpoint, has_moving_average_decay) tf.logging.info('Initializing from pretrained checkpoint: %s', FLAGS.init_checkpoint) if FLAGS.use_tpu: def init_scaffold(): tf.train.init_from_checkpoint(FLAGS.init_checkpoint, variables_to_restore) return tf.train.Scaffold() scaffold_fn = init_scaffold else: tf.train.init_from_checkpoint(FLAGS.init_checkpoint, variables_to_restore) restore_vars_dict = None if not is_training and has_moving_average_decay: # Load moving average variables for eval. restore_vars_dict = ema.variables_to_restore(ema_vars) def eval_scaffold(): saver = tf.train.Saver(restore_vars_dict) return tf.train.Scaffold(saver=saver) scaffold_fn = eval_scaffold return tf.contrib.tpu.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op, host_call=host_call, eval_metrics=eval_metrics, scaffold_fn=scaffold_fn)
def model_fn(features, labels, mode, params): """The model_fn to be used with TPUEstimator. Args: features: A dict of `Tensor` of batched images and other features. labels: a Tensor or a dict of Tensor representing the batched labels. mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}` params: `dict` of parameters passed to the model from the TPUEstimator, `params['batch_size']` is always provided and should be used as the effective batch size. Returns: A `TPUEstimatorSpec` for the model """ logging.info('params=%s', params) images = features['image'] if isinstance(features, dict) else features labels = labels['label'] if isinstance(labels, dict) else labels config = params['config'] image_size = params['image_size'] utils.scalar('model/resolution', image_size) if config.model.data_format == 'channels_first': images = tf.transpose(images, [0, 3, 1, 2]) is_training = (mode == tf.estimator.ModeKeys.TRAIN) has_moving_average_decay = (config.train.ema_decay > 0) if FLAGS.use_tpu and not config.model.bn_type: config.model.bn_type = 'tpu_bn' # This is essential, if using a keras-derived model. tf.keras.backend.set_learning_phase(is_training) def build_model(in_images): """Build model using the model_name given through the command line.""" config.model.num_classes = config.data.num_classes model = effnetv2_model.EffNetV2Model(config.model.model_name, config.model) logits = model(in_images, training=is_training)[0] return logits pre_num_params, pre_num_flops = utils.num_params_flops( readable_format=True) if config.runtime.mixed_precision: precision = 'mixed_bfloat16' if FLAGS.use_tpu else 'mixed_float16' logits = utils.build_model_with_precision(precision, build_model, images, is_training) logits = tf.cast(logits, tf.float32) else: logits = build_model(images) num_params, num_flops = utils.num_params_flops(readable_format=True) num_params = num_params - pre_num_params num_flops = (num_flops - pre_num_flops) / params['batch_size'] logging.info('backbone params/flops = %.4f M / %.4f B', num_params, num_flops) utils.scalar('model/params', num_params) utils.scalar('model/flops', num_flops) # Calculate loss, which includes softmax cross entropy and L2 regularization. if config.train.loss_type == 'sigmoid': cross_entropy = tf.losses.sigmoid_cross_entropy( multi_class_labels=tf.cast(labels, dtype=logits.dtype), logits=logits, label_smoothing=config.train.label_smoothing) elif config.train.loss_type == 'custom': xent = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast( labels, dtype=logits.dtype), logits=logits) cross_entropy = tf.reduce_mean(tf.reduce_sum(xent, axis=-1)) else: if config.data.multiclass: logging.info('use multi-class loss: %s', config.data.multiclass) labels /= tf.reshape(tf.reduce_sum(labels, axis=1), (-1, 1)) cross_entropy = tf.losses.softmax_cross_entropy( onehot_labels=labels, logits=logits, label_smoothing=config.train.label_smoothing) train_steps = max(config.train.min_steps, config.train.epochs * params['steps_per_epoch']) global_step = tf.train.get_global_step() weight_decay_inc = config.train.weight_decay_inc * ( tf.cast(global_step, tf.float32) / tf.cast(train_steps, tf.float32)) weight_decay = (1 + weight_decay_inc) * config.train.weight_decay utils.scalar('train/weight_decay', weight_decay) # Add weight decay to the loss for non-batch-normalization variables. matcher = re.compile(config.train.weight_decay_exclude) l2loss = weight_decay * tf.add_n([ tf.nn.l2_loss(v) for v in tf.trainable_variables() if not matcher.match(v.name) ]) loss = cross_entropy + l2loss utils.scalar('loss/l2reg', l2loss) utils.scalar('loss/xent', cross_entropy) if has_moving_average_decay: ema = tf.train.ExponentialMovingAverage(decay=config.train.ema_decay, num_updates=global_step) ema_vars = utils.get_ema_vars() host_call = None restore_vars_dict = None if is_training: # Compute the current epoch and associated learning rate from global_step. current_epoch = (tf.cast(global_step, tf.float32) / params['steps_per_epoch']) utils.scalar('train/epoch', current_epoch) scaled_lr = config.train.lr_base * (config.train.batch_size / 256.0) scaled_lr_min = config.train.lr_min * (config.train.batch_size / 256.0) learning_rate = utils.WarmupLearningRateSchedule( scaled_lr, steps_per_epoch=params['steps_per_epoch'], decay_epochs=config.train.lr_decay_epoch, warmup_epochs=config.train.lr_warmup_epoch, decay_factor=config.train.lr_decay_factor, lr_decay_type=config.train.lr_sched, total_steps=train_steps, minimal_lr=scaled_lr_min)(global_step) utils.scalar('train/lr', learning_rate) optimizer = utils.build_optimizer( learning_rate, optimizer_name=config.train.optimizer) if FLAGS.use_tpu: # When using TPU, wrap the optimizer with CrossShardOptimizer which # handles synchronization details between different TPU cores. To the # user, this should look like regular synchronous training. optimizer = tf.tpu.CrossShardOptimizer(optimizer) # filter trainable variables if needed. var_list = tf.trainable_variables() if config.train.varsexp: vars2 = [ v for v in var_list if re.match(config.train.varsexp, v.name) ] if len(vars2) == len(var_list): logging.warning('%s has no match.', config.train.freeze) logging.info('Filter variables: orig=%d, final=%d, delta=%d', len(var_list), len(vars2), len(var_list) - len(vars2)) var_list = vars2 # Batch norm requires update_ops to be added as a train_op dependency. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) if config.train.gclip and is_training: logging.info('clip gradients norm by %f', config.train.gclip) grads_and_vars = optimizer.compute_gradients(loss, var_list) with tf.name_scope('gclip'): grads = [gv[0] for gv in grads_and_vars] tvars = [gv[1] for gv in grads_and_vars] utils.scalar('train/gnorm', tf.linalg.global_norm(grads)) utils.scalar('train/gnormmax', tf.math.reduce_max([tf.norm(g) for g in grads])) # First clip each variable's norm, then clip global norm. clip_norm = abs(config.train.gclip) clipped_grads = [ tf.clip_by_norm(g, clip_norm) if g is not None else None for g in grads ] clipped_grads, _ = tf.clip_by_global_norm( clipped_grads, clip_norm) grads_and_vars = list(zip(clipped_grads, tvars)) with tf.control_dependencies(update_ops): train_op = optimizer.apply_gradients(grads_and_vars, global_step) else: with tf.control_dependencies(update_ops): train_op = optimizer.minimize(loss, global_step, var_list=var_list) if has_moving_average_decay: with tf.control_dependencies([train_op]): train_op = ema.apply(ema_vars) if not config.runtime.skip_host_call: host_call = utils.get_tpu_host_call( global_step, FLAGS.model_dir, config.runtime.iterations_per_loop) else: train_op = None if has_moving_average_decay: # Load moving average variables for eval. restore_vars_dict = ema.variables_to_restore(ema_vars) eval_metrics = None if mode == tf.estimator.ModeKeys.EVAL: def metric_fn(labels, logits): """Evaluation metric function. Evaluates accuracy. This function is executed on the CPU and should not directly reference any Tensors in the rest of the `model_fn`. To pass Tensors from the model to the `metric_fn`, provide as part of the `eval_metrics`. See https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec for more information. Arguments should match the list of `Tensor` objects passed as the second element in the tuple passed to `eval_metrics`. Args: labels: `Tensor` with shape `[batch, num_classes]`. logits: `Tensor` with shape `[batch, num_classes]`. Returns: A dict of the metrics to return from evaluation. """ metrics = {} if config.data.multiclass: metrics['eval/global_ap'] = tf.metrics.auc( labels, tf.nn.sigmoid(logits), curve='PR', num_thresholds=200, summation_method='careful_interpolation', name='global_ap') # Convert labels to set: be careful, tf.metrics.xx_at_k are horrible. labels = tf.cast(labels, dtype=tf.int64) label_to_repeat = tf.expand_dims(tf.argmax(labels, axis=-1), axis=-1) all_labels_set = tf.range(0, labels.shape[-1], dtype=tf.int64) all_labels_set = tf.expand_dims(all_labels_set, axis=0) labels_set = labels * all_labels_set + ( 1 - labels) * label_to_repeat metrics['eval/precision@1'] = tf.metrics.precision_at_k( labels_set, logits, k=1) metrics['eval/recall@1'] = tf.metrics.recall_at_k(labels_set, logits, k=1) metrics['eval/precision@5'] = tf.metrics.precision_at_k( labels_set, logits, k=5) metrics['eval/recall@5'] = tf.metrics.recall_at_k(labels_set, logits, k=5) # always add accuracy. labels = tf.argmax(labels, axis=1) predictions = tf.argmax(logits, axis=1) metrics['eval/acc_top1'] = tf.metrics.accuracy(labels, predictions) in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32) metrics['eval/acc_top5'] = tf.metrics.mean(in_top_5) metrics['model/resolution'] = tf.metrics.mean(image_size) metrics['model/flops'] = tf.metrics.mean(num_flops) metrics['model/params'] = tf.metrics.mean(num_params) return metrics eval_metrics = (metric_fn, [labels, logits]) if has_moving_average_decay and not is_training: def scaffold_fn(): # read ema for eval jobs. saver = tf.train.Saver(restore_vars_dict) return tf.train.Scaffold(saver=saver) elif config.train.ft_init_ckpt and is_training: def scaffold_fn(): logging.info('restore variables from %s', config.train.ft_init_ckpt) var_map = utils.get_ckpt_var_map( ckpt_path=config.train.ft_init_ckpt, skip_mismatch=True, init_ema=config.train.ft_init_ema) tf.train.init_from_checkpoint(config.train.ft_init_ckpt, var_map) return tf.train.Scaffold() else: scaffold_fn = None return tf.estimator.tpu.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op, host_call=host_call, eval_metrics=eval_metrics, scaffold_fn=scaffold_fn)
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None): """Model definition entry. Args: features: the input image tensor with shape [batch_size, height, width, 3]. The height and width are fixed and equal. labels: the input labels in a dictionary. The labels include class targets and box targets which are dense label maps. The labels are generated from get_input_fn function in data/dataloader.py mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT. params: the dictionary defines hyperparameters of model. The default settings are in default_hparams function in this file. model: the model outputs class logits and box regression outputs. variable_filter_fn: the filter function that takes trainable_variables and returns the variable list after applying the filter rule. Returns: tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction. Raises: RuntimeError: if both ckpt and backbone_ckpt are set. """ # Convert params (dict) to Config for easier access. def _model_outputs(): return model(features, config=hparams_config.Config(params)) if params['use_bfloat16']: with tf.tpu.bfloat16_scope(): cls_outputs, box_outputs = _model_outputs() levels = cls_outputs.keys() for level in levels: cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32) box_outputs[level] = tf.cast(box_outputs[level], tf.float32) else: cls_outputs, box_outputs = _model_outputs() levels = cls_outputs.keys() if is_rank0(): show_model() # First check if it is in PREDICT mode. if mode == tf.estimator.ModeKeys.PREDICT: predictions = { 'image': features, } for level in levels: predictions['cls_outputs_%d' % level] = cls_outputs[level] predictions['box_outputs_%d' % level] = box_outputs[level] return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) # Set up training loss and learning rate. update_learning_rate_schedule_parameters(params) global_step = tf.train.get_or_create_global_step() learning_rate = learning_rate_schedule(params, global_step) # cls_loss and box_loss are for logging. only total_loss is optimized. det_loss, cls_loss, box_loss = detection_loss(cls_outputs, box_outputs, labels, params) l2loss = reg_l2_loss(params['weight_decay']) total_loss = det_loss + l2loss if mode == tf.estimator.ModeKeys.TRAIN: utils.scalar('lrn_rate', learning_rate) utils.scalar('trainloss/cls_loss', cls_loss) utils.scalar('trainloss/box_loss', box_loss) utils.scalar('trainloss/det_loss', det_loss) utils.scalar('trainloss/l2_loss', l2loss) utils.scalar('trainloss/loss', total_loss) utils.scalar('loss', total_loss) # for consistency moving_average_decay = params['moving_average_decay'] if moving_average_decay: ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay, num_updates=global_step) ema_vars = utils.get_ema_vars() if mode == tf.estimator.ModeKeys.TRAIN: optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=params['momentum']) if horovod_enabled(): optimizer = hvd.DistributedOptimizer(optimizer) # Batch norm requires update_ops to be added as a train_op dependency. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) var_list = tf.trainable_variables() if variable_filter_fn: var_list = variable_filter_fn(var_list, params['resnet_depth']) if params.get('clip_gradients_norm', 0) > 0: logging.info('clip gradients norm by %f', params['clip_gradients_norm']) grads_and_vars = optimizer.compute_gradients(total_loss, var_list) with tf.name_scope('clip'): grads = [gv[0] for gv in grads_and_vars] tvars = [gv[1] for gv in grads_and_vars] clipped_grads, gnorm = tf.clip_by_global_norm( grads, params['clip_gradients_norm']) utils.scalar('gnorm', gnorm) grads_and_vars = list(zip(clipped_grads, tvars)) with tf.control_dependencies(update_ops): train_op = optimizer.apply_gradients(grads_and_vars, global_step) else: with tf.control_dependencies(update_ops): train_op = optimizer.minimize(total_loss, global_step, var_list=var_list) if moving_average_decay: with tf.control_dependencies([train_op]): train_op = ema.apply(ema_vars) else: train_op = None eval_metrics = None if mode == tf.estimator.ModeKeys.EVAL: def metric_fn(**kwargs): """Returns a dictionary that has the evaluation metrics.""" batch_size = params['batch_size'] eval_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_labeler = anchors.AnchorLabeler(eval_anchors, params['num_classes']) cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat']) box_loss = tf.metrics.mean(kwargs['box_loss_repeat']) if params.get('testdev_dir', None): logging.info('Eval testdev_dir %s', params['testdev_dir']) coco_metrics = coco_metric_fn( batch_size, anchor_labeler, params['val_json_file'], testdev_dir=params['testdev_dir'], disable_pyfun=params.get('disable_pyfun', None), **kwargs) else: logging.info('Eval val with groudtruths %s.', params['val_json_file']) coco_metrics = coco_metric_fn(batch_size, anchor_labeler, params['val_json_file'], **kwargs) # Add metrics to output. output_metrics = { 'cls_loss': cls_loss, 'box_loss': box_loss, } output_metrics.update(coco_metrics) return output_metrics cls_loss_repeat = tf.reshape( tf.tile(tf.expand_dims(cls_loss, 0), [ params['batch_size'], ]), [params['batch_size'], 1]) box_loss_repeat = tf.reshape( tf.tile(tf.expand_dims(box_loss, 0), [ params['batch_size'], ]), [params['batch_size'], 1]) metric_fn_inputs = { 'cls_loss_repeat': cls_loss_repeat, 'box_loss_repeat': box_loss_repeat, 'source_ids': labels['source_ids'], 'groundtruth_data': labels['groundtruth_data'], 'image_scales': labels['image_scales'], } add_metric_fn_inputs(params, cls_outputs, box_outputs, metric_fn_inputs) eval_metrics = (metric_fn, metric_fn_inputs) # only rank0 to restore, then broadcast variables if is_rank0(): checkpoint = params.get('ckpt') or params.get('backbone_ckpt') if checkpoint and mode == tf.estimator.ModeKeys.TRAIN: # Initialize the model from an EfficientDet or backbone checkpoint. if params.get('ckpt') and params.get('backbone_ckpt'): raise RuntimeError( '--backbone_ckpt and --checkpoint are mutually exclusive') elif params.get('backbone_ckpt'): var_scope = params['backbone_name'] + '/' if params['ckpt_var_scope'] is None: # Use backbone name as default checkpoint scope. ckpt_scope = params['backbone_name'] + '/' else: ckpt_scope = params['ckpt_var_scope'] + '/' else: # Load every var in the given checkpoint var_scope = ckpt_scope = '/' def scaffold_fn(): """Loads pretrained model through scaffold function.""" logging.info('restore variables from %s', checkpoint) var_map = utils.get_ckpt_var_map(ckpt_path=checkpoint, ckpt_scope=ckpt_scope, var_scope=var_scope, var_exclude_expr=params.get( 'var_exclude_expr', None)) tf.train.init_from_checkpoint(checkpoint, var_map) return tf.train.Scaffold() elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay: def scaffold_fn(): """Load moving average variables for eval.""" logging.info('Load EMA vars with ema_decay=%f', moving_average_decay) restore_vars_dict = ema.variables_to_restore(ema_vars) saver = tf.train.Saver(restore_vars_dict) return tf.train.Scaffold(saver=saver) else: scaffold_fn = None else: scaffold_fn = None training_hooks = [] if horovod_enabled(): init_weights_hook = BroadcastGlobalVariablesHook( root_rank=0, model_dir=params['model_dir']) training_hooks.append(init_weights_hook) if is_rank0() or params['dump_all_ranks']: training_hooks.extend([ LoggingTensorHook(dict(utils.summaries), summary_dir=params['model_dir'], every_n_iter=params['every_n_iter']), ExamplesPerSecondEstimatorHook( params['batch_size'], every_n_steps=params['every_n_iter'], output_dir=params['model_dir'], log_global_step=True) ]) if mode == tf.estimator.ModeKeys.TRAIN: return tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, scaffold=scaffold_fn() if scaffold_fn is not None else None, training_hooks=training_hooks) else: # host_call in the original code was to write summary, but caused the error # 'ValueError: Tensor("strided_slice_6:0", shape=(), dtype=int64) must be from the same graph as Tensor("strided_slice:0", shape=(), dtype=float32)' # thus, it's handled in write_summary() in main.py return tf.estimator.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, eval_metrics=eval_metrics, #host_call=utils.get_tpu_host_call(global_step, params), scaffold_fn=scaffold_fn)
def model_fn(features, labels, mode, params): """The model_fn to be used with TPUEstimator. Args: features: `Tensor` of batched images. labels: `Tensor` of labels for the data samples mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}` params: `dict` of parameters passed to the model from the TPUEstimator, `params['batch_size']` is always provided and should be used as the effective batch size. Returns: A `TPUEstimatorSpec` for the model """ if isinstance(features, dict): features = features['feature'] stats_shape = [1, 1, 3] is_training = (mode == tf.estimator.ModeKeys.TRAIN) has_moving_average_decay = (FLAGS.moving_average_decay > 0) # This is essential, if using a keras-derived model. tf.keras.backend.set_learning_phase(is_training) tf.logging.info('Using open-source implementation.') override_params = {} if FLAGS.batch_norm_momentum is not None: override_params['batch_norm_momentum'] = FLAGS.batch_norm_momentum if FLAGS.batch_norm_epsilon is not None: override_params['batch_norm_epsilon'] = FLAGS.batch_norm_epsilon if FLAGS.dropout_rate is not None: override_params['dropout_rate'] = FLAGS.dropout_rate if FLAGS.drop_connect_rate is not None: override_params['drop_connect_rate'] = FLAGS.drop_connect_rate if FLAGS.num_label_classes: override_params['num_classes'] = FLAGS.num_label_classes if FLAGS.depth_coefficient: override_params['depth_coefficient'] = FLAGS.depth_coefficient if FLAGS.width_coefficient: override_params['width_coefficient'] = FLAGS.width_coefficient def normalize_features(features, mean_rgb, stddev_rgb): """Normalize the image given the means and stddevs.""" features -= tf.constant(mean_rgb, shape=stats_shape, dtype=features.dtype) features /= tf.constant(stddev_rgb, shape=stats_shape, dtype=features.dtype) return features def build_model(): """Build model using the model_name given through the command line.""" model_builder = None if FLAGS.model_name.startswith('efficientnet'): model_builder = efficientnet_builder else: raise ValueError('Model must be either efficientnet-b*') normalized_features = normalize_features(features, model_builder.MEAN_RGB, model_builder.STDDEV_RGB) logits, _ = model_builder.build_model(normalized_features, model_name=FLAGS.model_name, training=is_training, override_params=override_params, model_dir=FLAGS.model_dir) return logits logits = build_model() if mode == tf.estimator.ModeKeys.PREDICT: predictions = { 'classes': tf.argmax(logits, axis=1), 'probabilities': tf.nn.softmax(logits, name='softmax_tensor') } return tf.estimator.EstimatorSpec( mode=mode, predictions=predictions, export_outputs={ 'classify': tf.estimator.export.PredictOutput(predictions) }) # Calculate loss, which includes softmax cross entropy and L2 regularization. one_hot_labels = tf.one_hot(labels, FLAGS.num_label_classes) cross_entropy = tf.losses.softmax_cross_entropy( logits=logits, onehot_labels=one_hot_labels, label_smoothing=FLAGS.label_smoothing) # Add weight decay to the loss for non-batch-normalization variables. loss = cross_entropy + FLAGS.weight_decay * tf.add_n([ tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'batch_normalization' not in v.name ]) global_step = tf.train.get_global_step() if has_moving_average_decay: ema = tf.train.ExponentialMovingAverage( decay=FLAGS.moving_average_decay, num_updates=global_step) ema_vars = utils.get_ema_vars() train_op = None restore_vars_dict = None training_hooks = [] if is_training: # Compute the current epoch and associated learning rate from global_step. current_epoch = (tf.cast(global_step, tf.float32) / params['steps_per_epoch']) scaled_lr = FLAGS.base_learning_rate * (FLAGS.train_batch_size / 256.0) learning_rate = utils.build_learning_rate(scaled_lr, global_step, params['steps_per_epoch']) optimizer = utils.build_optimizer(learning_rate, optimizer_name='adam') # Batch normalization requires UPDATE_OPS to be added as a dependency to # the train operation. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): train_op = optimizer.minimize(loss, global_step) if has_moving_average_decay: with tf.control_dependencies([train_op]): train_op = ema.apply(ema_vars) predictions = tf.argmax(logits, axis=1) top1_accuray = tf.metrics.accuracy(labels, predictions) logging_hook = tf.train.LoggingTensorHook( { "loss": loss, "accuracy": top1_accuray[1], "step": global_step }, every_n_iter=1) training_hooks.append(logging_hook) eval_metrics = None if mode == tf.estimator.ModeKeys.EVAL: predictions = tf.argmax(logits, axis=1) top1_accuray = tf.metrics.accuracy(labels, predictions) eval_metrics = {'val_accuracy': top1_accuray} num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()]) tf.logging.info('number of trainable parameters: {}'.format(num_params)) scaffold = None if has_moving_average_decay and not is_training: # Only apply scaffold for eval jobs. saver = tf.train.Saver(restore_vars_dict) scaffold = tf.train.Scaffold(saver=saver) return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op, training_hooks=training_hooks, eval_metric_ops=eval_metrics, scaffold=scaffold)
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None): """Model definition entry. Args: features: the input image tensor with shape [batch_size, height, width, 3]. The height and width are fixed and equal. labels: the input labels in a dictionary. The labels include class targets and box targets which are dense label maps. The labels are generated from get_input_fn function in data/dataloader.py mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT. params: the dictionary defines hyperparameters of model. The default settings are in default_hparams function in this file. model: the model outputs class logits and box regression outputs. variable_filter_fn: the filter function that takes trainable_variables and returns the variable list after applying the filter rule. Returns: tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction. Raises: RuntimeError: if both ckpt and backbone_ckpt are set. """ utils.image('input_image', features) training_hooks = [] params['is_training_bn'] = (mode == tf.estimator.ModeKeys.TRAIN) if params['use_keras_model']: def model_fn(inputs): model = efficientdet_keras.EfficientDetNet( config=hparams_config.Config(params)) cls_out_list, box_out_list = model(inputs, params['is_training_bn']) cls_outputs, box_outputs = {}, {} for i in range(params['min_level'], params['max_level'] + 1): cls_outputs[i] = cls_out_list[i - params['min_level']] box_outputs[i] = box_out_list[i - params['min_level']] return cls_outputs, box_outputs else: model_fn = functools.partial(model, config=hparams_config.Config(params)) precision = utils.get_precision(params['strategy'], params['mixed_precision']) cls_outputs, box_outputs = utils.build_model_with_precision( precision, model_fn, features, params['is_training_bn']) levels = cls_outputs.keys() for level in levels: cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32) box_outputs[level] = tf.cast(box_outputs[level], tf.float32) # First check if it is in PREDICT mode. if mode == tf.estimator.ModeKeys.PREDICT: predictions = { 'image': features, } for level in levels: predictions['cls_outputs_%d' % level] = cls_outputs[level] predictions['box_outputs_%d' % level] = box_outputs[level] return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) # Set up training loss and learning rate. update_learning_rate_schedule_parameters(params) global_step = tf.train.get_or_create_global_step() learning_rate = learning_rate_schedule(params, global_step) # cls_loss and box_loss are for logging. only total_loss is optimized. det_loss, cls_loss, box_loss, box_iou_loss = detection_loss( cls_outputs, box_outputs, labels, params) reg_l2loss = reg_l2_loss(params['weight_decay']) total_loss = det_loss + reg_l2loss if mode == tf.estimator.ModeKeys.TRAIN: utils.scalar('lrn_rate', learning_rate) utils.scalar('trainloss/cls_loss', cls_loss) utils.scalar('trainloss/box_loss', box_loss) utils.scalar('trainloss/det_loss', det_loss) utils.scalar('trainloss/reg_l2_loss', reg_l2loss) utils.scalar('trainloss/loss', total_loss) if params['iou_loss_type']: utils.scalar('trainloss/box_iou_loss', box_iou_loss) train_epochs = tf.cast(global_step, tf.float32) / params['steps_per_epoch'] utils.scalar('train_epochs', train_epochs) moving_average_decay = params['moving_average_decay'] if moving_average_decay: ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay, num_updates=global_step) ema_vars = utils.get_ema_vars() if mode == tf.estimator.ModeKeys.TRAIN: if params['optimizer'].lower() == 'sgd': optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=params['momentum']) elif params['optimizer'].lower() == 'adam': optimizer = tf.train.AdamOptimizer(learning_rate) else: raise ValueError('optimizers should be adam or sgd') if params['strategy'] == 'tpu': optimizer = tf.tpu.CrossShardOptimizer(optimizer) if params['gradient_checkpointing']: from third_party.grad_checkpoint \ import memory_saving_gradients # pylint: disable=g-import-not-at-top from tensorflow.python.ops \ import gradients # pylint: disable=g-import-not-at-top # monkey patch tf.gradients to point to our custom version, # with automatic checkpoint selection def gradients_(ys, xs, grad_ys=None, **kwargs): return memory_saving_gradients.gradients( ys, xs, grad_ys, checkpoints=params['gradient_checkpointing_list'], **kwargs) gradients.__dict__["gradients"] = gradients_ # Batch norm requires update_ops to be added as a train_op dependency. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) var_list = tf.trainable_variables() if variable_filter_fn: var_list = variable_filter_fn(var_list) if params.get('clip_gradients_norm', None): logging.info('clip gradients norm by %f', params['clip_gradients_norm']) grads_and_vars = optimizer.compute_gradients(total_loss, var_list) with tf.name_scope('clip'): grads = [gv[0] for gv in grads_and_vars] tvars = [gv[1] for gv in grads_and_vars] # First clip each variable's norm, then clip global norm. clip_norm = abs(params['clip_gradients_norm']) clipped_grads = [tf.clip_by_norm(g, clip_norm) for g in grads] clipped_grads, _ = tf.clip_by_global_norm( clipped_grads, clip_norm) utils.scalar('gradient_norm', tf.linalg.global_norm(clipped_grads)) grads_and_vars = list(zip(clipped_grads, tvars)) with tf.control_dependencies(update_ops): train_op = optimizer.apply_gradients(grads_and_vars, global_step) else: with tf.control_dependencies(update_ops): train_op = optimizer.minimize(total_loss, global_step, var_list=var_list) if moving_average_decay: with tf.control_dependencies([train_op]): train_op = ema.apply(ema_vars) else: train_op = None eval_metrics = None if mode == tf.estimator.ModeKeys.EVAL: def metric_fn(**kwargs): """Returns a dictionary that has the evaluation metrics.""" if params['nms_configs'].get('pyfunc', True): detections_bs = [] for index in range(kwargs['boxes'].shape[0]): nms_configs = params['nms_configs'] detections = tf.numpy_function( functools.partial(nms_np.per_class_nms, nms_configs=nms_configs), [ kwargs['boxes'][index], kwargs['scores'][index], kwargs['classes'][index], tf.slice(kwargs['image_ids'], [index], [1]), tf.slice(kwargs['image_scales'], [index], [1]), params['num_classes'], nms_configs['max_output_size'], ], tf.float32) detections_bs.append(detections) detections_bs = postprocess.transform_detections( tf.stack(detections_bs)) else: # These two branches should be equivalent, but currently they are not. # TODO(tanmingxing): enable the non_pyfun path after bug fix. nms_boxes, nms_scores, nms_classes, _ = postprocess.per_class_nms( params, kwargs['boxes'], kwargs['scores'], kwargs['classes'], kwargs['image_scales']) img_ids = tf.cast(tf.expand_dims(kwargs['image_ids'], -1), nms_scores.dtype) detections_bs = [ img_ids * tf.ones_like(nms_scores), nms_boxes[:, :, 1], nms_boxes[:, :, 0], nms_boxes[:, :, 3] - nms_boxes[:, :, 1], nms_boxes[:, :, 2] - nms_boxes[:, :, 0], nms_scores, nms_classes, ] detections_bs = tf.stack(detections_bs, axis=-1, name='detnections') if params.get('testdev_dir', None): logging.info('Eval testdev_dir %s', params['testdev_dir']) eval_metric = coco_metric.EvaluationMetric( testdev_dir=params['testdev_dir']) coco_metrics = eval_metric.estimator_metric_fn( detections_bs, tf.zeros([1])) else: logging.info('Eval val with groudtruths %s.', params['val_json_file']) eval_metric = coco_metric.EvaluationMetric( filename=params['val_json_file']) coco_metrics = eval_metric.estimator_metric_fn( detections_bs, kwargs['groundtruth_data'], params['label_map']) # Add metrics to output. cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat']) box_loss = tf.metrics.mean(kwargs['box_loss_repeat']) output_metrics = { 'cls_loss': cls_loss, 'box_loss': box_loss, } output_metrics.update(coco_metrics) return output_metrics cls_loss_repeat = tf.reshape( tf.tile(tf.expand_dims(cls_loss, 0), [ params['batch_size'], ]), [params['batch_size'], 1]) box_loss_repeat = tf.reshape( tf.tile(tf.expand_dims(box_loss, 0), [ params['batch_size'], ]), [params['batch_size'], 1]) cls_outputs = postprocess.to_list(cls_outputs) box_outputs = postprocess.to_list(box_outputs) params['nms_configs']['max_nms_inputs'] = anchors.MAX_DETECTION_POINTS boxes, scores, classes = postprocess.pre_nms(params, cls_outputs, box_outputs) metric_fn_inputs = { 'cls_loss_repeat': cls_loss_repeat, 'box_loss_repeat': box_loss_repeat, 'image_ids': labels['source_ids'], 'groundtruth_data': labels['groundtruth_data'], 'image_scales': labels['image_scales'], 'boxes': boxes, 'scores': scores, 'classes': classes, } eval_metrics = (metric_fn, metric_fn_inputs) checkpoint = params.get('ckpt') or params.get('backbone_ckpt') if checkpoint and mode == tf.estimator.ModeKeys.TRAIN: # Initialize the model from an EfficientDet or backbone checkpoint. if params.get('ckpt') and params.get('backbone_ckpt'): raise RuntimeError( '--backbone_ckpt and --checkpoint are mutually exclusive') if params.get('backbone_ckpt'): var_scope = params['backbone_name'] + '/' if params['ckpt_var_scope'] is None: # Use backbone name as default checkpoint scope. ckpt_scope = params['backbone_name'] + '/' else: ckpt_scope = params['ckpt_var_scope'] + '/' else: # Load every var in the given checkpoint var_scope = ckpt_scope = '/' def scaffold_fn(): """Loads pretrained model through scaffold function.""" logging.info('restore variables from %s', checkpoint) var_map = utils.get_ckpt_var_map( ckpt_path=checkpoint, ckpt_scope=ckpt_scope, var_scope=var_scope, skip_mismatch=params['skip_mismatch']) tf.train.init_from_checkpoint(checkpoint, var_map) return tf.train.Scaffold() elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay: def scaffold_fn(): """Load moving average variables for eval.""" logging.info('Load EMA vars with ema_decay=%f', moving_average_decay) restore_vars_dict = ema.variables_to_restore(ema_vars) saver = tf.train.Saver(restore_vars_dict) return tf.train.Scaffold(saver=saver) else: scaffold_fn = None if params['strategy'] != 'tpu': # Profile every 1K steps. if params.get('profile', False): profile_hook = tf.estimator.ProfilerHook( save_steps=1000, output_dir=params['model_dir'], show_memory=True) training_hooks.append(profile_hook) # Report memory allocation if OOM class OomReportingHook(tf.estimator.SessionRunHook): def before_run(self, run_context): return tf.estimator.SessionRunArgs( fetches=[], options=tf.RunOptions( report_tensor_allocations_upon_oom=True)) training_hooks.append(OomReportingHook()) logging_hook = tf.estimator.LoggingTensorHook( { 'step': global_step, 'det_loss': det_loss, 'cls_loss': cls_loss, 'box_loss': box_loss, }, every_n_iter=params.get('iterations_per_loop', 100), ) training_hooks.append(logging_hook) if params["nvgpu_logging"]: try: from third_party import nvgpu # pylint: disable=g-import-not-at-top from functools import reduce # pylint: disable=g-import-not-at-top def get_nested_value(d, path): return reduce(dict.get, path, d) def nvgpu_gpu_info(inp): inp = inp.decode("utf-8") inp = inp.split(",") inp = [x.strip() for x in inp] value = get_nested_value(nvgpu.gpu_info(), inp) return np.str(value) def commonsize(inp): const_sizes = { 'B': 1, 'KB': 1e3, 'MB': 1e6, 'GB': 1e9, 'TB': 1e12, 'PB': 1e15, 'KiB': 1024, 'MiB': 1048576, 'GiB': 1073741824 } inp = inp.split(" ") # convert all to MiB if inp[1] != 'MiB': inp_ = float( inp[0]) * (const_sizes[inp[1]] / 1048576.0) else: inp_ = float(inp[0]) return inp_ def formatter_log(tensors): """Format the output.""" mem_used = tensors["memory used"].decode("utf-8") mem_total = tensors["memory total"].decode("utf-8") mem_util = commonsize(mem_used) / commonsize(mem_total) logstring = "GPU memory used: {} = {:.1%} of total GPU memory: {}".format( mem_used, mem_util, mem_total) return logstring mem_used = tf.py_func(nvgpu_gpu_info, ['gpu, fb_memory_usage, used'], [tf.string])[0] mem_total = tf.py_func(nvgpu_gpu_info, ['gpu, fb_memory_usage, total'], [tf.string])[0] logging_hook3 = tf.estimator.LoggingTensorHook( tensors={ "memory used": mem_used, "memory total": mem_total, }, every_n_iter=params.get('iterations_per_loop', 100), formatter=formatter_log, ) training_hooks.append(logging_hook3) except: logging.error("nvgpu error: nvidia-smi format not recognized") if params['strategy'] == 'tpu': return tf.estimator.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, eval_metrics=eval_metrics, host_call=utils.get_tpu_host_call(global_step, params), scaffold_fn=scaffold_fn, training_hooks=training_hooks) else: eval_metric_ops = eval_metrics[0]( **eval_metrics[1]) if eval_metrics else None utils.get_tpu_host_call(global_step, params) return tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_op, eval_metric_ops=eval_metric_ops, scaffold=scaffold_fn(), training_hooks=training_hooks)
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None): """Model definition entry. Args: features: the input image tensor with shape [batch_size, height, width, 3]. The height and width are fixed and equal. labels: the input labels in a dictionary. The labels include class targets and box targets which are dense label maps. The labels are generated from get_input_fn function in data/dataloader.py mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT. params: the dictionary defines hyperparameters of model. The default settings are in default_hparams function in this file. model: the model outputs class logits and box regression outputs. variable_filter_fn: the filter function that takes trainable_variables and returns the variable list after applying the filter rule. Returns: tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction. """ # Convert params (dict) to Config for easier access. def _model_outputs(): return model(features, config=hparams_config.Config(params)) if params['use_bfloat16']: with tf.tpu.bfloat16_scope(): cls_outputs, box_outputs = _model_outputs() levels = cls_outputs.keys() for level in levels: cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32) box_outputs[level] = tf.cast(box_outputs[level], tf.float32) else: cls_outputs, box_outputs = _model_outputs() levels = cls_outputs.keys() # First check if it is in PREDICT mode. if mode == tf.estimator.ModeKeys.PREDICT: predictions = { 'image': features, } for level in levels: predictions['cls_outputs_%d' % level] = cls_outputs[level] predictions['box_outputs_%d' % level] = box_outputs[level] return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) # Set up training loss and learning rate. update_learning_rate_schedule_parameters(params) global_step = tf.train.get_or_create_global_step() learning_rate = learning_rate_schedule(params, global_step) # cls_loss and box_loss are for logging. only total_loss is optimized. det_loss, cls_loss, box_loss = detection_loss(cls_outputs, box_outputs, labels, params) l2loss = reg_l2_loss(params['weight_decay']) total_loss = det_loss + l2loss if mode == tf.estimator.ModeKeys.TRAIN: utils.scalar('lrn_rate', learning_rate) utils.scalar('trainloss/cls_loss', cls_loss) utils.scalar('trainloss/box_loss', box_loss) utils.scalar('trainloss/det_loss', det_loss) utils.scalar('trainloss/l2_loss', l2loss) utils.scalar('trainloss/loss', total_loss) moving_average_decay = params['moving_average_decay'] if moving_average_decay: ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay, num_updates=global_step) ema_vars = utils.get_ema_vars() if mode == tf.estimator.ModeKeys.TRAIN: optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=params['momentum']) if params['use_tpu']: optimizer = tf.tpu.CrossShardOptimizer(optimizer) # Batch norm requires update_ops to be added as a train_op dependency. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) var_list = tf.trainable_variables() if variable_filter_fn: var_list = variable_filter_fn(var_list, params['resnet_depth']) if params.get('clip_gradients_norm', 0) > 0: tf.logging.info('clip gradients norm by {}'.format( params['clip_gradients_norm'])) grads_and_vars = optimizer.compute_gradients(total_loss, var_list) with tf.name_scope('clip'): grads = [gv[0] for gv in grads_and_vars] tvars = [gv[1] for gv in grads_and_vars] clipped_grads, gnorm = tf.clip_by_global_norm( grads, params['clip_gradients_norm']) utils.scalar('gnorm', gnorm) grads_and_vars = list(zip(clipped_grads, tvars)) with tf.control_dependencies(update_ops): train_op = optimizer.apply_gradients(grads_and_vars, global_step) else: with tf.control_dependencies(update_ops): train_op = optimizer.minimize(total_loss, global_step, var_list=var_list) if moving_average_decay: with tf.control_dependencies([train_op]): train_op = ema.apply(ema_vars) else: train_op = None eval_metrics = None if mode == tf.estimator.ModeKeys.EVAL: def metric_fn(**kwargs): """Returns a dictionary that has the evaluation metrics.""" batch_size = params['batch_size'] eval_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_labeler = anchors.AnchorLabeler(eval_anchors, params['num_classes']) cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat']) box_loss = tf.metrics.mean(kwargs['box_loss_repeat']) coco_metrics = coco_metric_fn(batch_size, anchor_labeler, params['val_json_file'], **kwargs) # Add metrics to output. output_metrics = { 'cls_loss': cls_loss, 'box_loss': box_loss, } output_metrics.update(coco_metrics) return output_metrics cls_loss_repeat = tf.reshape( tf.tile(tf.expand_dims(cls_loss, 0), [ params['batch_size'], ]), [params['batch_size'], 1]) box_loss_repeat = tf.reshape( tf.tile(tf.expand_dims(box_loss, 0), [ params['batch_size'], ]), [params['batch_size'], 1]) metric_fn_inputs = { 'cls_loss_repeat': cls_loss_repeat, 'box_loss_repeat': box_loss_repeat, 'source_ids': labels['source_ids'], 'groundtruth_data': labels['groundtruth_data'], 'image_scales': labels['image_scales'], } add_metric_fn_inputs(params, cls_outputs, box_outputs, metric_fn_inputs) eval_metrics = (metric_fn, metric_fn_inputs) if params['backbone_ckpt'] and mode == tf.estimator.ModeKeys.TRAIN: def scaffold_fn(): """Loads pretrained model through scaffold function.""" tf.logging.info('restore variables from %s' % params['backbone_ckpt']) if params['ckpt_var_scope'] is None: ckpt_scope = params[ 'backbone_name'] # Use backbone name in default. else: ckpt_scope = params['ckpt_var_scope'] tf.train.init_from_checkpoint( params['backbone_ckpt'], utils.get_ckt_var_map(params['backbone_ckpt'], ckpt_scope + '/', params['backbone_name'] + '/')) return tf.train.Scaffold() elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay: def scaffold_fn(): """Load moving average variables for eval.""" tf.logging.info('Load EMA vars with ema_decay=%f' % moving_average_decay) restore_vars_dict = ema.variables_to_restore(ema_vars) saver = tf.train.Saver(restore_vars_dict) return tf.train.Scaffold(saver=saver) else: scaffold_fn = None return tf.estimator.tpu.TPUEstimatorSpec(mode=mode, loss=total_loss, train_op=train_op, eval_metrics=eval_metrics, host_call=utils.get_tpu_host_call( global_step, params), scaffold_fn=scaffold_fn)
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None): def _model_outputs(): return model(features, config=hparams_config.Config(params)) if params['use_bfloat16']: with tf.tpu.bfloat16_scope(): cls_outputs, box_outputs = _model_outputs() levels = cls_outputs.keys() for level in levels: cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32) box_outputs[level] = tf.cast(box_outputs[level], tf.float32) else: cls_outputs, box_outputs = _model_outputs() levels = cls_outputs.keys() if mode == tf.estimator.ModeKeys.PREDICT: predictions = { 'image': features, } for level in levels: predictions['cls_outputs_%d' % level] = cls_outputs[level] predictions['box_outputs_%d' % level] = box_outputs[level] return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) update_learning_rate_schedule_parameters(params) global_step = tf.train.get_or_create_global_step() learning_rate = learning_rate_schedule(params, global_step) det_loss, cls_loss, box_loss = detection_loss(cls_outputs, box_outputs,labels, params) l2loss = reg_l2_loss(params['weight_decay']) total_loss = det_loss + l2loss if mode == tf.estimator.ModeKeys.TRAIN: utils.scalar('lrn_rate', learning_rate) utils.scalar('trainloss/cls_loss', cls_loss) utils.scalar('trainloss/box_loss', box_loss) utils.scalar('trainloss/det_loss', det_loss) utils.scalar('trainloss/l2_loss', l2loss) utils.scalar('trainloss/loss', total_loss) moving_average_decay = params['moving_average_decay'] if moving_average_decay: ema = tf.train.ExponentialMovingAverage( decay=moving_average_decay, num_updates=global_step) ema_vars = utils.get_ema_vars() if mode == tf.estimator.ModeKeys.TRAIN: optimizer = tf.train.MomentumOptimizer( learning_rate, momentum=params['momentum']) if params['use_tpu']: optimizer = tf.tpu.CrossShardOptimizer(optimizer) # Batch norm requires update_ops to be added as a train_op dependency. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) var_list = tf.trainable_variables() if variable_filter_fn: var_list = variable_filter_fn(var_list, params['resnet_depth']) if params.get('clip_gradients_norm', 0) > 0: logging.info('clip gradients norm by %f', params['clip_gradients_norm']) grads_and_vars = optimizer.compute_gradients(total_loss, var_list) with tf.name_scope('clip'): grads = [gv[0] for gv in grads_and_vars] tvars = [gv[1] for gv in grads_and_vars] clipped_grads, gnorm = tf.clip_by_global_norm( grads, params['clip_gradients_norm']) utils.scalar('gnorm', gnorm) grads_and_vars = list(zip(clipped_grads, tvars)) with tf.control_dependencies(update_ops): train_op = optimizer.apply_gradients(grads_and_vars, global_step) else: with tf.control_dependencies(update_ops): train_op = optimizer.minimize(total_loss, global_step, var_list=var_list) if moving_average_decay: with tf.control_dependencies([train_op]): train_op = ema.apply(ema_vars) else: train_op = None eval_metrics = None checkpoint = params.get('ckpt') or params.get('backbone_ckpt') if checkpoint and mode == tf.estimator.ModeKeys.TRAIN: # Initialize the model from an EfficientDet or backbone checkpoint. if params.get('ckpt') and params.get('backbone_ckpt'): raise RuntimeError('--backbone_ckpt and --checkpoint are mutually exclusive') elif params.get('backbone_ckpt'): var_scope = params['backbone_name'] + '/' if params['ckpt_var_scope'] is None: # Use backbone name as default checkpoint scope. ckpt_scope = params['backbone_name'] + '/' else: ckpt_scope = params['ckpt_var_scope'] + '/' else: # Load every var in the given checkpoint var_scope = ckpt_scope = '/' def scaffold_fn(): """Loads pretrained model through scaffold function.""" logging.info('restore variables from %s', checkpoint) var_map = utils.get_ckpt_var_map( ckpt_path=checkpoint, ckpt_scope=ckpt_scope, var_scope=var_scope, var_exclude_expr=params.get('var_exclude_expr', None)) tf.train.init_from_checkpoint(checkpoint, var_map) return tf.train.Scaffold() elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay: def scaffold_fn(): """Load moving average variables for eval.""" logging.info('Load EMA vars with ema_decay=%f', moving_average_decay) restore_vars_dict = ema.variables_to_restore(ema_vars) saver = tf.train.Saver(restore_vars_dict) return tf.train.Scaffold(saver=saver) else: scaffold_fn = None return tf.estimator.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, eval_metrics=eval_metrics, host_call=utils.get_tpu_host_call(global_step, params), scaffold_fn=scaffold_fn)