Example #1
def restore_ckpt(sess, ckpt_path, enable_ema=True, export_ckpt=None):
    """Restore variables from a given checkpoint.

  Args:
    sess: a tf session for restoring or exporting models.
    ckpt_path: the path of the checkpoint. Can be a file path or a folder path.
    enable_ema: whether reload ema values or not.
    export_ckpt: whether to export the restored model.
  """
    sess.run(tf.global_variables_initializer())
    if tf.io.gfile.isdir(ckpt_path):
        ckpt_path = tf.train.latest_checkpoint(ckpt_path)
    if enable_ema:
        ema = tf.train.ExponentialMovingAverage(decay=0.0)
        ema_vars = utils.get_ema_vars()
        var_dict = ema.variables_to_restore(ema_vars)
        ema_assign_op = ema.apply(ema_vars)
    else:
        var_dict = utils.get_ema_vars()
        ema_assign_op = None
    tf.train.get_or_create_global_step()
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(var_dict, max_to_keep=1)
    saver.restore(sess, ckpt_path)

    if export_ckpt:
        print('export model to {}'.format(export_ckpt))
        if ema_assign_op is not None:
            sess.run(ema_assign_op)
        saver = tf.train.Saver(max_to_keep=1, save_relative_paths=True)
        saver.save(sess, export_ckpt)
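Nearly every example on this page calls `utils.get_ema_vars()`. In the EfficientNet/EfficientDet family of codebases it is typically defined along the following lines; treat this as a sketch of the usual shape, not verbatim code from any one repo:

def get_ema_vars():
    """Get all variables that should have EMA shadow copies.

    Trainable variables plus the batch-norm moving statistics, which are
    not trainable but still need to be averaged and restored.
    """
    ema_vars = tf.trainable_variables() + tf.get_collection('moving_vars')
    for v in tf.global_variables():
        # moving_mean and moving_variance are global but not trainable vars.
        if 'moving_mean' in v.name or 'moving_variance' in v.name:
            ema_vars.append(v)
    return list(set(ema_vars))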
Example #2
    def restore_model(self,
                      sess,
                      ckpt_path,
                      enable_ema=True,
                      export_ckpt=None):
        """Restore variables from a given checkpoint."""
        sess.run(tf.global_variables_initializer())
        checkpoint = tf.train.latest_checkpoint(ckpt_path)
        if enable_ema:
            ema = tf.train.ExponentialMovingAverage(decay=0.0)
            ema_vars = utils.get_ema_vars()
            var_dict = ema.variables_to_restore(ema_vars)
            ema_assign_op = ema.apply(ema_vars)
        else:
            var_dict = utils.get_ema_vars()
            ema_assign_op = None

        tf.train.get_or_create_global_step()
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(var_dict, max_to_keep=1)
        saver.restore(sess, checkpoint)

        if export_ckpt:
            print('export model to {}'.format(export_ckpt))
            if ema_assign_op is not None:
                sess.run(ema_assign_op)
            saver = tf.train.Saver(max_to_keep=1, save_relative_paths=True)
            saver.save(sess, export_ckpt)
Example #3
def tf1_export_ema_ckpt():
    """Restore variables from a given checkpoint."""
    with tf1.Session() as sess:
        model = effnetv2_model.EffNetV2Model(FLAGS.model_name,
                                             FLAGS.hparam_str)
        batch_size = FLAGS.batch_size
        isize = FLAGS.image_size or model.cfg.eval.isize
        inputs = tf.ones((batch_size, isize, isize, 3), tf.float32)
        _ = model(inputs, training=False)
        sess.run(tf1.global_variables_initializer())
        if tf.io.gfile.isdir(FLAGS.model_dir):
            ckpt_path = tf1.train.latest_checkpoint(FLAGS.model_dir)
        else:
            ckpt_path = FLAGS.model_dir

        ema = tf1.train.ExponentialMovingAverage(decay=0.0)
        ema_vars = utils.get_ema_vars()
        var_dict = ema.variables_to_restore(ema_vars)
        ema_assign_op = ema.apply(ema_vars)

        tf1.train.get_or_create_global_step()
        sess.run(tf1.global_variables_initializer())
        saver = tf1.train.Saver(var_dict, max_to_keep=1)
        # Restore all variables from ckpt.
        saver.restore(sess, ckpt_path)

        print('export model to {}'.format(FLAGS.export_dir))
        sess.run(ema_assign_op)
        saver = tf1.train.Saver(max_to_keep=1, save_relative_paths=True)
        saver.save(sess, FLAGS.export_dir)
Example #4
def norm(x, scope, axis=[-1]):
    with tf.variable_scope(scope):
        n_state = shape_list(x)[-1]
        g = tf.get_variable("g", [n_state], initializer=tf.constant_initializer(1))
        b = tf.get_variable("b", [n_state], initializer=tf.constant_initializer(0))
        g, b = get_ema_vars(g, b)
        return _norm(x, g, b, axis=axis)
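Example #4 calls a private helper `_norm` that is not shown. A minimal layer-normalization sketch consistent with the call site (hypothetical; assumes the usual gain/bias formulation):

def _norm(x, g=None, b=None, e=1e-5, axis=[-1]):
    """Layer-normalize x over `axis`, then apply optional gain g and bias b."""
    u = tf.reduce_mean(x, axis=axis, keepdims=True)
    s = tf.reduce_mean(tf.square(x - u), axis=axis, keepdims=True)
    x = (x - u) * tf.rsqrt(s + e)
    if g is not None and b is not None:
        x = x * g + b
    return x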
Example #5
    def restore_model(self,
                      sess,
                      ckpt_path,
                      enable_ema=True,
                      export_ckpt=None):
        """Restore variables from a given checkpoint."""
        sess.run(tf.global_variables_initializer())
        checkpoint = tf.train.latest_checkpoint(ckpt_path)
        if enable_ema:
            ema = tf.train.ExponentialMovingAverage(decay=0.0)
            ema_vars = utils.get_ema_vars()
            var_dict = ema.variables_to_restore(ema_vars)
            ema_assign_op = ema.apply(ema_vars)
        else:
            var_dict = utils.get_ema_vars()
            ema_assign_op = None

        tf.train.get_or_create_global_step()
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(var_dict, max_to_keep=1)
        saver.restore(sess, checkpoint)
        global_variables = tf.global_variables()
        weights = dict()
        for variable in global_variables:
            try:
                # Skip EMA shadow copies; keep only the base variables.
                if 'Exponential' in variable.name:
                    continue
                weights[variable.name] = variable.eval()
            except Exception:  # avoid a bare except; keep KeyboardInterrupt alive
                print(
                    f"Skipping variable {variable.name}, an exception occurred"
                )
        import pickle
        with open(f'checkpoints/{self.model_name}_weights.pkl', 'wb') as f:
            pickle.dump(weights, f)
        if export_ckpt:
            print('export model to {}'.format(export_ckpt))
            if ema_assign_op is not None:
                sess.run(ema_assign_op)
            saver = tf.train.Saver(max_to_keep=1, save_relative_paths=True)
            saver.save(sess, export_ckpt)
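The pickle file written above can later be reloaded without TensorFlow, e.g. to port the weights to another framework. A minimal sketch (the model name 'efficientnet-b0' is an assumed example):

import pickle

with open('checkpoints/efficientnet-b0_weights.pkl', 'rb') as f:
    weights = pickle.load(f)  # dict: variable name -> numpy array
for name, value in weights.items():
    print(name, value.shape)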
Example #6
def restore_ckpt(sess, ckpt_path, ema_decay=0.9998, export_ckpt=None):
    """Restore variables from a given checkpoint.

  Args:
    sess: a tf session for restoring or exporting models.
    ckpt_path: the path of the checkpoint. Can be a file path or a folder path.
    ema_decay: ema decay rate. If None or zero or negative value, disable ema.
    export_ckpt: whether to export the restored model.
  """
    sess.run(tf.global_variables_initializer())
    if tf.io.gfile.isdir(ckpt_path):
        ckpt_path = tf.train.latest_checkpoint(ckpt_path)
    # Guard against ema_decay=None as well as non-positive values.
    if ema_decay and ema_decay > 0:
        ema = tf.train.ExponentialMovingAverage(decay=0.0)
        ema_vars = utils.get_ema_vars()
        var_dict = ema.variables_to_restore(ema_vars)
        ema_assign_op = ema.apply(ema_vars)
    else:
        var_dict = utils.get_ema_vars()
        ema_assign_op = None

    tf.train.get_or_create_global_step()
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(var_dict, max_to_keep=1)
    if ckpt_path == '_':
        logging.info('Running test: do not load any ckpt.')
        return

    # Restore all variables from ckpt.
    saver.restore(sess, ckpt_path)

    if export_ckpt:
        print('export model to {}'.format(export_ckpt))
        if ema_assign_op is not None:
            sess.run(ema_assign_op)
        saver = tf.train.Saver(max_to_keep=1, save_relative_paths=True)
        saver.save(sess, export_ckpt)
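Typical usage of restore_ckpt: build the inference graph first so the variables exist, then restore into a live session, optionally exporting a checkpoint with the EMA values folded in. A sketch with hypothetical model-building code and paths:

with tf.Graph().as_default(), tf.Session() as sess:
    images = tf.placeholder(tf.float32, [None, 224, 224, 3])
    logits = build_my_model(images)  # hypothetical model builder
    restore_ckpt(sess,
                 '/tmp/model_dir',  # a checkpoint file or a directory
                 ema_decay=0.9998,  # any positive value enables EMA restore
                 export_ckpt='/tmp/export/model.ckpt')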
Example #7
def get_pretrained_variables_to_restore(checkpoint_path,
                                        load_moving_average=False):
    """Gets veriables_to_restore mapping from pretrained checkpoint.

  Args:
    checkpoint_path: String. Path of checkpoint.
    load_moving_average: Boolean, whether load moving average variables to
      replace variables.

  Returns:
    Mapping of variables to restore.
  """
    checkpoint_reader = tf.train.load_checkpoint(checkpoint_path)
    variable_shape_map = checkpoint_reader.get_variable_to_shape_map()

    variables_to_restore = {}
    ema_vars = utils.get_ema_vars()
    for v in tf.global_variables():
        # Skip variables if they are in excluded scopes.
        is_excluded = False
        for scope in ['global_step', 'ExponentialMovingAverage']:
            if scope in v.op.name:
                is_excluded = True
                break
        if is_excluded:
            tf.logging.info('Exclude [%s] from loading from checkpoint.',
                            v.op.name)
            continue
        variable_name_ckpt = v.op.name
        if load_moving_average and v in ema_vars:
            # To load moving average variables into non-moving version for
            # fine-tuning, maps variables here manually.
            variable_name_ckpt = v.op.name + '/ExponentialMovingAverage'

        if variable_name_ckpt not in variable_shape_map:
            tf.logging.info(
                'Skip init [%s] from [%s] as it is not in the checkpoint',
                v.op.name, variable_name_ckpt)
            continue

        variables_to_restore[variable_name_ckpt] = v
        tf.logging.info('Init variable [%s] from [%s] in ckpt', v.op.name,
                        variable_name_ckpt)
    return variables_to_restore
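The mapping returned above is keyed by checkpoint name, which is exactly the assignment map that `tf.train.init_from_checkpoint` expects. A hedged sketch of warm-starting from a pretrained checkpoint (the path is hypothetical; the model graph must already be built):

pretrained_ckpt = '/path/to/pretrained/model.ckpt'
variables_to_restore = get_pretrained_variables_to_restore(
    pretrained_ckpt, load_moving_average=True)
tf.train.init_from_checkpoint(pretrained_ckpt, variables_to_restore)
# Variables pick up their checkpoint values when the session initializes them.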
Example #8
    def restore_model(self, sess, ckpt_dir, enable_ema=True, export_ckpt=None):
        """Restore variables from checkpoint dir."""
        sess.run(tf.global_variables_initializer())
        checkpoint = tf.train.latest_checkpoint(ckpt_dir)
        if enable_ema:
            ema = tf.train.ExponentialMovingAverage(decay=0.0)
            ema_vars = utils.get_ema_vars()
            var_dict = ema.variables_to_restore(ema_vars)
            ema_assign_op = ema.apply(ema_vars)
        else:
            var_dict = None
            ema_assign_op = None

        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(var_dict, max_to_keep=1)
        saver.restore(sess, checkpoint)

        if export_ckpt:
            if ema_assign_op is not None:
                sess.run(ema_assign_op)
            saver = tf.train.Saver(max_to_keep=1)
            saver.save(sess, export_ckpt)
Example #9
def model_fn(features, labels, mode, params):
    """The model_fn to be used with TPUEstimator.

  Args:
    features: `Tensor` of batched images.
    labels: `Tensor` of one hot labels for the data samples
    mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
    params: `dict` of parameters passed to the model from the TPUEstimator,
        `params['batch_size']` is always provided and should be used as the
        effective batch size.

  Returns:
    A `TPUEstimatorSpec` for the model
  """
    if isinstance(features, dict):
        features = features['feature']

    # In most cases, the default data format NCHW instead of NHWC should be
    # used for a significant performance boost on GPU. NHWC should be used
    # only if the network needs to be run on CPU since the pooling operations
    # are only supported on NHWC. TPU uses XLA compiler to figure out best layout.
    if FLAGS.data_format == 'channels_first':
        assert not FLAGS.transpose_input  # channels_first only for GPU
        features = tf.transpose(features, [0, 3, 1, 2])
        stats_shape = [3, 1, 1]
    else:
        stats_shape = [1, 1, 3]

    if FLAGS.transpose_input and mode != tf.estimator.ModeKeys.PREDICT:
        features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    has_moving_average_decay = (FLAGS.moving_average_decay > 0)
    # This is essential if using a Keras-derived model.
    tf.keras.backend.set_learning_phase(is_training)
    logging.info('Using open-source implementation.')
    override_params = {}
    if FLAGS.batch_norm_momentum is not None:
        override_params['batch_norm_momentum'] = FLAGS.batch_norm_momentum
    if FLAGS.batch_norm_epsilon is not None:
        override_params['batch_norm_epsilon'] = FLAGS.batch_norm_epsilon
    if FLAGS.dropout_rate is not None:
        override_params['dropout_rate'] = FLAGS.dropout_rate
    if FLAGS.survival_prob is not None:
        override_params['survival_prob'] = FLAGS.survival_prob
    if FLAGS.data_format:
        override_params['data_format'] = FLAGS.data_format
    if FLAGS.num_label_classes:
        override_params['num_classes'] = FLAGS.num_label_classes
    if FLAGS.depth_coefficient:
        override_params['depth_coefficient'] = FLAGS.depth_coefficient
    if FLAGS.width_coefficient:
        override_params['width_coefficient'] = FLAGS.width_coefficient

    def normalize_features(features, mean_rgb, stddev_rgb):
        """Normalize the image given the means and stddevs."""
        features -= tf.constant(mean_rgb,
                                shape=stats_shape,
                                dtype=features.dtype)
        features /= tf.constant(stddev_rgb,
                                shape=stats_shape,
                                dtype=features.dtype)
        return features

    def build_model():
        """Build model using the model_name given through the command line."""
        model_builder = model_builder_factory.get_model_builder(
            FLAGS.model_name)
        normalized_features = normalize_features(features,
                                                 model_builder.MEAN_RGB,
                                                 model_builder.STDDEV_RGB)
        logits, _ = model_builder.build_model(normalized_features,
                                              model_name=FLAGS.model_name,
                                              training=is_training,
                                              override_params=override_params,
                                              model_dir=FLAGS.model_dir)
        return logits

    if params['use_bfloat16']:
        with tf.tpu.bfloat16_scope():
            logits = tf.cast(build_model(), tf.float32)
    else:
        logits = build_model()

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            })

    # If necessary, in the model_fn, use params['batch_size'] instead of the
    # batch-size flags (--train_batch_size or --eval_batch_size).
    batch_size = params['batch_size']  # pylint: disable=unused-variable

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=labels,
        label_smoothing=FLAGS.label_smoothing)

    # Add weight decay to the loss for non-batch-normalization variables.
    loss = cross_entropy + FLAGS.weight_decay * tf.add_n([
        tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])

    global_step = tf.train.get_global_step()
    if has_moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(
            decay=FLAGS.moving_average_decay, num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    host_call = None
    restore_vars_dict = None
    if is_training:
        # Compute the current epoch and associated learning rate from global_step.
        current_epoch = (tf.cast(global_step, tf.float32) /
                         params['steps_per_epoch'])

        scaled_lr = FLAGS.base_learning_rate * (FLAGS.train_batch_size / 256.0)
        logging.info('base_learning_rate = %f', FLAGS.base_learning_rate)
        learning_rate = utils.build_learning_rate(
            scaled_lr,
            global_step,
            params['steps_per_epoch'],
            decay_epochs=FLAGS.lr_decay_epoch)
        optimizer = utils.build_optimizer(learning_rate)
        if FLAGS.use_tpu:
            # When using TPU, wrap the optimizer with CrossShardOptimizer which
            # handles synchronization details between different TPU cores. To the
            # user, this should look like regular synchronous training.
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)

        # Batch normalization requires UPDATE_OPS to be added as a dependency to
        # the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)

        if has_moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

        if not FLAGS.skip_host_call:

            def host_call_fn(gs, lr, ce):
                """Training host call. Creates scalar summaries for training metrics.

        This function is executed on the CPU and should not directly reference
        any Tensors in the rest of the `model_fn`. To pass Tensors from the
        model to the `metric_fn`, provide as part of the `host_call`. See
        https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
        for more information.

        Arguments should match the list of `Tensor` objects passed as the second
        element in the tuple passed to `host_call`.

        Args:
          gs: `Tensor with shape `[batch]` for the global_step
          lr: `Tensor` with shape `[batch]` for the learning_rate.
          ce: `Tensor` with shape `[batch]` for the current_epoch.

        Returns:
          List of summary ops to run on the CPU host.
        """
                gs = gs[0]
                # Host call fns are executed FLAGS.iterations_per_loop times after
                # one TPU loop is finished. Setting max_queue to the number of
                # iterations makes the summary writer flush the data to storage
                # only once per loop.
                with tf2.summary.create_file_writer(
                        FLAGS.model_dir,
                        max_queue=FLAGS.iterations_per_loop).as_default():
                    with tf2.summary.record_if(True):
                        tf2.summary.scalar('learning_rate', lr[0], step=gs)
                        tf2.summary.scalar('current_epoch', ce[0], step=gs)

                        return tf.summary.all_v2_summary_ops()

            # To log the loss, current learning rate, and epoch for Tensorboard, the
            # summary op needs to be run on the host CPU via host_call. host_call
            # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
            # dimension. These Tensors are implicitly concatenated to
            # [params['batch_size']].
            gs_t = tf.reshape(global_step, [1])
            lr_t = tf.reshape(learning_rate, [1])
            ce_t = tf.reshape(current_epoch, [1])

            host_call = (host_call_fn, [gs_t, lr_t, ce_t])

    else:
        train_op = None
        if has_moving_average_decay:
            # Load moving average variables for eval.
            restore_vars_dict = ema.variables_to_restore(ema_vars)

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits):
            """Evaluation metric function. Evaluates accuracy.

      This function is executed on the CPU and should not directly reference
      any Tensors in the rest of the `model_fn`. To pass Tensors from the model
      to the `metric_fn`, provide as part of the `eval_metrics`. See
      https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
      for more information.

      Arguments should match the list of `Tensor` objects passed as the second
      element in the tuple passed to `eval_metrics`.

      Args:
        labels: `Tensor` with shape `[batch, num_classes]`.
        logits: `Tensor` with shape `[batch, num_classes]`.

      Returns:
        A dict of the metrics to return from evaluation.
      """
            labels = tf.argmax(labels, axis=1)
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            return {
                'top_1_accuracy': top_1_accuracy,
                'top_5_accuracy': top_5_accuracy,
            }

        eval_metrics = (metric_fn, [labels, logits])

    num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
    logging.info('number of trainable parameters: %d', num_params)

    def _scaffold_fn():
        saver = tf.train.Saver(restore_vars_dict)
        return tf.train.Scaffold(saver=saver)

    if has_moving_average_decay and not is_training:
        # Only apply scaffold for eval jobs.
        scaffold_fn = _scaffold_fn
    else:
        scaffold_fn = None

    return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
                                             loss=loss,
                                             train_op=train_op,
                                             host_call=host_call,
                                             eval_metrics=eval_metrics,
                                             scaffold_fn=scaffold_fn)
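For context, a model_fn like the one above is normally handed to a TPUEstimator; extra keys in `params` (e.g. steps_per_epoch) are forwarded to the model_fn. A hedged sketch of the wiring, with placeholder flag values and a hypothetical train_input_fn:

run_config = tf.estimator.tpu.RunConfig(
    model_dir=FLAGS.model_dir,
    tpu_config=tf.estimator.tpu.TPUConfig(
        iterations_per_loop=FLAGS.iterations_per_loop))
estimator = tf.estimator.tpu.TPUEstimator(
    model_fn=model_fn,
    config=run_config,
    use_tpu=FLAGS.use_tpu,
    train_batch_size=FLAGS.train_batch_size,
    eval_batch_size=FLAGS.eval_batch_size,
    # Extra keys reach model_fn's params; values here are placeholders.
    params={'steps_per_epoch': 1251, 'use_bfloat16': False})
estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)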
Example #10
    def model_fn(features, labels, mode, params=None):
        """The model_fn to be used with TPUEstimator.

        Args:
            features: `Tensor` of batched images.
            labels: `Tensor` of one hot labels for the data samples
            mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`

        Returns:
        A `TPUEstimatorSpec` for the model
        """
        if isinstance(features, dict):
            features = features["feature"]

        # In most cases, the default data format NCHW instead of NHWC should be
        # used for a significant performance boost on GPU. NHWC should be used
        # only if the network needs to be run on CPU since the pooling operations
        # are only supported on NHWC. TPU uses XLA compiler to figure out best layout.
        if context.get_hparam("data_format") == "channels_first":
            assert not context.get_hparam("transpose_input")  # channels_first only for GPU
            features = tf.transpose(features, [0, 3, 1, 2])
            stats_shape = [3, 1, 1]
        else:
            stats_shape = [1, 1, 3]

        #if context.get_hparam("transpose_input") and mode != tf.estimator.ModeKeys.PREDICT:
        #    features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

        is_training = mode == tf.estimator.ModeKeys.TRAIN
        has_moving_average_decay = context.get_hparam("moving_average_decay") > 0
        # This is essential if using a Keras-derived model.
        tf.keras.backend.set_learning_phase(is_training)
        logging.info("Using open-source implementation.")
        override_params = {}
        # if context.get_hparam("batch_norm_momentum") is not None:
        #     override_params["batch_norm_momentum"] = context.get_hparam("batch_norm_momentum")
        # if context.get_hparam("batch_norm_epsilon") is not None:
        #     override_params["batch_norm_epsilon"] = context.get_hparam("batch_norm_epsilon")
        # if context.get_hparam("dropout_rate") is not None:
        #     override_params["dropout_rate"] = context.get_hparam("dropout_rate")
        # if context.get_hparam("survival_prob") is not None:
        #     override_params["survival_prob"] = context.get_hparam("survival_prob")
        # if context.get_hparam("data_format"):
        #     override_params["data_format"] = context.get_hparam("data_format")
        # if context.get_hparam("num_label_classes"):
        #     override_params["num_classes"] = context.get_hparam("num_label_classes")
        # if context.get_hparam("depth_coefficient"):
        #     override_params["depth_coefficient"] = context.get_hparam("depth_coefficient")
        # if context.get_hparam("width_coefficient"):
        #     override_params["width_coefficient"] = context.get_hparam("width_coefficient")

        def normalize_features(features, mean_rgb, stddev_rgb):
            """Normalize the image given the means and stddevs."""
            features -= tf.constant(mean_rgb, shape=stats_shape, dtype=features.dtype)
            features /= tf.constant(stddev_rgb, shape=stats_shape, dtype=features.dtype)
            return features

        def build_model():
            """Build model using the model_name given through the command line."""
            model_builder = model_builder_factory.get_model_builder(
                context.get_hparam("model_name"),
            )
            normalized_features = normalize_features(
                features, model_builder.MEAN_RGB, model_builder.STDDEV_RGB
            )
            logits, _ = model_builder.build_model(
                normalized_features,
                model_name=context.get_hparam("model_name"),
                training=is_training,
                override_params=override_params,
                #model_dir=context.get_hparam("model_dir"),
            )
            return logits

        logits = build_model()

        # Calculate loss, which includes softmax cross entropy and L2 regularization.
        cross_entropy = tf.losses.softmax_cross_entropy(
            logits=logits, onehot_labels=labels, label_smoothing=context.get_hparam("label_smoothing")
        )

        # Add weight decay to the loss for non-batch-normalization variables.
        loss = cross_entropy + context.get_hparam("weight_decay") * tf.add_n(
            [
                tf.nn.l2_loss(v)
                for v in tf.trainable_variables()
                if "batch_normalization" not in v.name
            ]
        )

        global_step = tf.train.get_global_step()
        if has_moving_average_decay:
            ema = tf.train.ExponentialMovingAverage(
                decay=context.get_hparam("moving_average_decay"), num_updates=global_step
            )
            ema_vars = utils.get_ema_vars()

        restore_vars_dict = None
        train_op = None
        if is_training:
            # Compute the current epoch and associated learning rate from global_step.
            current_epoch = tf.cast(global_step, tf.float32) / context.get_hparam("steps_per_epoch")

            scaled_lr = context.get_hparam("base_learning_rate") * (context.get_hparam("train_batch_size") / 256.0)
            logging.info("base_learning_rate = %f", context.get_hparam("base_learning_rate"))
            learning_rate = utils.build_learning_rate(
                scaled_lr, global_step, context.get_hparam("steps_per_epoch"),
            )
            optimizer = utils.build_optimizer(context, learning_rate)

            # Batch normalization requires UPDATE_OPS to be added as a dependency to
            # the train operation.
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(loss, global_step)

            if has_moving_average_decay:
                with tf.control_dependencies([train_op]):
                    train_op = ema.apply(ema_vars)

        if has_moving_average_decay:
            # Load moving average variables for eval.
            restore_vars_dict = ema.variables_to_restore(ema_vars)

        eval_metrics = None
        if mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(labels, logits):
                """Evaluation metric function. Evaluates accuracy.

                This function is executed on the CPU and should not directly reference
                any Tensors in the rest of the `model_fn`. To pass Tensors from the model
                to the `metric_fn`, provide as part of the `eval_metrics`. See
                https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
                for more information.

                Arguments should match the list of `Tensor` objects passed as the second
                element in the tuple passed to `eval_metrics`.

                Args:
                    labels: `Tensor` with shape `[batch, num_classes]`.
                    logits: `Tensor` with shape `[batch, num_classes]`.

                Returns:
                    A dict of the metrics to return from evaluation.
                """
                labels = tf.argmax(labels, axis=1)
                predictions = tf.argmax(logits, axis=1)
                top_1_accuracy = tf.metrics.accuracy(labels, predictions)
                in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
                top_5_accuracy = tf.metrics.mean(in_top_5)

                return {
                    "top_1_accuracy": top_1_accuracy,
                    "top_5_accuracy": top_5_accuracy,
                }

            eval_metrics = metric_fn(labels, logits)

        num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
        logging.info("number of trainable parameters: %d", num_params)


        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, train_op=train_op, eval_metric_ops=eval_metrics,
        )
Example #11
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
    """Model definition entry.

  Args:
    features: the input image tensor with shape [batch_size, height, width, 3].
      The height and width are fixed and equal.
    labels: the input labels in a dictionary. The labels include class targets
      and box targets which are dense label maps. The labels are generated from
      get_input_fn function in data/dataloader.py
    mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
    params: the dictionary defines hyperparameters of model. The default
      settings are in default_hparams function in this file.
    model: the model outputs class logits and box regression outputs.
    variable_filter_fn: the filter function that takes trainable_variables and
      returns the variable list after applying the filter rule.

  Returns:
    tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.

  Raises:
    RuntimeError: if both ckpt and backbone_ckpt are set.
  """
    utils.image('input_image', features)
    training_hooks = []
    if params['data_format'] == 'channels_first':
        features = tf.transpose(features, [0, 3, 1, 2])

    def _model_outputs(inputs):
        # Convert params (dict) to Config for easier access.
        return model(inputs, config=hparams_config.Config(params))

    cls_outputs, box_outputs = utils.build_model_with_precision(
        params['precision'], _model_outputs, features,
        params['is_training_bn'])

    levels = cls_outputs.keys()
    for level in levels:
        cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
        box_outputs[level] = tf.cast(box_outputs[level], tf.float32)

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'image': features,
        }
        for level in levels:
            predictions['cls_outputs_%d' % level] = cls_outputs[level]
            predictions['box_outputs_%d' % level] = box_outputs[level]
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)

    # cls_loss and box_loss are for logging. only total_loss is optimized.
    det_loss, cls_loss, box_loss, box_iou_loss = detection_loss(
        cls_outputs, box_outputs, labels, params)
    reg_l2loss = reg_l2_loss(params['weight_decay'])
    total_loss = det_loss + reg_l2loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        utils.scalar('lrn_rate', learning_rate)
        utils.scalar('trainloss/cls_loss', cls_loss)
        utils.scalar('trainloss/box_loss', box_loss)
        utils.scalar('trainloss/det_loss', det_loss)
        utils.scalar('trainloss/reg_l2_loss', reg_l2loss)
        utils.scalar('trainloss/loss', total_loss)
        if box_iou_loss:
            utils.scalar('trainloss/box_iou_loss', box_iou_loss)

    moving_average_decay = params['moving_average_decay']
    if moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()
    if params['strategy'] == 'horovod':
        import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
        learning_rate = learning_rate * hvd.size()
    if mode == tf.estimator.ModeKeys.TRAIN:
        if params['optimizer'].lower() == 'sgd':
            optimizer = tf.train.MomentumOptimizer(learning_rate,
                                                   momentum=params['momentum'])
        elif params['optimizer'].lower() == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate)
        else:
            raise ValueError('optimizers should be adam or sgd')

        if params['strategy'] == 'tpu':
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)
        elif params['strategy'] == 'horovod':
            optimizer = hvd.DistributedOptimizer(optimizer)
            training_hooks = [hvd.BroadcastGlobalVariablesHook(0)]

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        var_list = tf.trainable_variables()
        if variable_filter_fn:
            var_list = variable_filter_fn(var_list)

        if params.get('clip_gradients_norm', 0) > 0:
            logging.info('clip gradients norm by %f',
                         params['clip_gradients_norm'])
            grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
            with tf.name_scope('clip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                clipped_grads, gnorm = tf.clip_by_global_norm(
                    grads, params['clip_gradients_norm'])
                utils.scalar('gnorm', gnorm)
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(total_loss,
                                              global_step,
                                              var_list=var_list)

        if moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(**kwargs):
            """Returns a dictionary that has the evaluation metrics."""
            batch_size = params['batch_size']
            if params['strategy'] == 'tpu':
                batch_size = params['batch_size'] * params['num_shards']
            eval_anchors = anchors.Anchors(params['min_level'],
                                           params['max_level'],
                                           params['num_scales'],
                                           params['aspect_ratios'],
                                           params['anchor_scale'],
                                           params['image_size'])
            anchor_labeler = anchors.AnchorLabeler(eval_anchors,
                                                   params['num_classes'])
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])

            if params.get('testdev_dir', None):
                logging.info('Eval testdev_dir %s', params['testdev_dir'])
                coco_metrics = coco_metric_fn(
                    batch_size,
                    anchor_labeler,
                    params['val_json_file'],
                    testdev_dir=params['testdev_dir'],
                    disable_pyfun=params.get('disable_pyfun', None),
                    **kwargs)
            else:
                logging.info('Eval val with groundtruths %s.',
                             params['val_json_file'])
                coco_metrics = coco_metric_fn(batch_size, anchor_labeler,
                                              params['val_json_file'],
                                              **kwargs)

            # Add metrics to output.
            output_metrics = {
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            }
            output_metrics.update(coco_metrics)
            return output_metrics

        cls_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(cls_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        box_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(box_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        metric_fn_inputs = {
            'cls_loss_repeat': cls_loss_repeat,
            'box_loss_repeat': box_loss_repeat,
            'source_ids': labels['source_ids'],
            'groundtruth_data': labels['groundtruth_data'],
            'image_scales': labels['image_scales'],
        }
        add_metric_fn_inputs(params, cls_outputs, box_outputs,
                             metric_fn_inputs)
        eval_metrics = (metric_fn, metric_fn_inputs)

    checkpoint = params.get('ckpt') or params.get('backbone_ckpt')

    if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
        # Initialize the model from an EfficientDet or backbone checkpoint.
        if params.get('ckpt') and params.get('backbone_ckpt'):
            raise RuntimeError(
                '--backbone_ckpt and --checkpoint are mutually exclusive')

        if params.get('backbone_ckpt'):
            var_scope = params['backbone_name'] + '/'
            if params['ckpt_var_scope'] is None:
                # Use backbone name as default checkpoint scope.
                ckpt_scope = params['backbone_name'] + '/'
            else:
                ckpt_scope = params['ckpt_var_scope'] + '/'
        else:
            # Load every var in the given checkpoint
            var_scope = ckpt_scope = '/'

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            logging.info('restore variables from %s', checkpoint)

            var_map = utils.get_ckpt_var_map(ckpt_path=checkpoint,
                                             ckpt_scope=ckpt_scope,
                                             var_scope=var_scope,
                                             var_exclude_expr=params.get(
                                                 'var_exclude_expr', None))

            tf.train.init_from_checkpoint(checkpoint, var_map)

            return tf.train.Scaffold()
    elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

        def scaffold_fn():
            """Load moving average variables for eval."""
            logging.info('Load EMA vars with ema_decay=%f',
                         moving_average_decay)
            restore_vars_dict = ema.variables_to_restore(ema_vars)
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)
    else:
        scaffold_fn = None

    if params['strategy'] != 'tpu':
        # Profile every 1K steps.
        profile_hook = tf.train.ProfilerHook(save_steps=1000,
                                             output_dir=params['model_dir'])
        training_hooks.append(profile_hook)

        # Report memory allocation if OOM
        class OomReportingHook(tf.estimator.SessionRunHook):
            def before_run(self, run_context):
                return tf.estimator.SessionRunArgs(
                    fetches=[],
                    options=tf.RunOptions(
                        report_tensor_allocations_upon_oom=True))

        training_hooks.append(OomReportingHook())

    return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
                                             loss=total_loss,
                                             train_op=train_op,
                                             eval_metrics=eval_metrics,
                                             host_call=utils.get_tpu_host_call(
                                                 global_step, params),
                                             scaffold_fn=scaffold_fn,
                                             training_hooks=training_hooks)
Example #12
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
    """Model definition entry.

  Args:
    features: the input image tensor with shape [batch_size, height, width, 3].
      The height and width are fixed and equal.
    labels: the input labels in a dictionary. The labels include class targets
      and box targets which are dense label maps. The labels are generated from
      get_input_fn function in data/dataloader.py
    mode: the mode of TPUEstimator including TRAIN and EVAL.
    params: the dictionary defines hyperparameters of model. The default
      settings are in default_hparams function in this file.
    model: the model outputs class logits and box regression outputs.
    variable_filter_fn: the filter function that takes trainable_variables and
      returns the variable list after applying the filter rule.

  Returns:
    tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.

  Raises:
    RuntimeError: if both ckpt and backbone_ckpt are set.
  """
    is_tpu = params['strategy'] == 'tpu'
    if params['img_summary_steps']:
        utils.image('input_image', features, is_tpu)
    training_hooks = []
    params['is_training_bn'] = (mode == tf.estimator.ModeKeys.TRAIN)

    if params['use_keras_model']:

        def model_fn(inputs):
            model = efficientdet_keras.EfficientDetNet(
                config=hparams_config.Config(params))
            cls_out_list, box_out_list = model(inputs,
                                               params['is_training_bn'])
            cls_outputs, box_outputs = {}, {}
            for i in range(params['min_level'], params['max_level'] + 1):
                cls_outputs[i] = cls_out_list[i - params['min_level']]
                box_outputs[i] = box_out_list[i - params['min_level']]
            return cls_outputs, box_outputs
    else:
        model_fn = functools.partial(model,
                                     config=hparams_config.Config(params))

    precision = utils.get_precision(params['strategy'],
                                    params['mixed_precision'])
    cls_outputs, box_outputs = utils.build_model_with_precision(
        precision, model_fn, features, params['is_training_bn'])

    levels = cls_outputs.keys()
    for level in levels:
        cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
        box_outputs[level] = tf.cast(box_outputs[level], tf.float32)

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)

    # cls_loss and box_loss are for logging. only total_loss is optimized.
    det_loss, cls_loss, box_loss = detection_loss(cls_outputs, box_outputs,
                                                  labels, params)
    reg_l2loss = reg_l2_loss(params['weight_decay'])
    total_loss = det_loss + reg_l2loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        utils.scalar('lrn_rate', learning_rate, is_tpu)
        utils.scalar('trainloss/cls_loss', cls_loss, is_tpu)
        utils.scalar('trainloss/box_loss', box_loss, is_tpu)
        utils.scalar('trainloss/det_loss', det_loss, is_tpu)
        utils.scalar('trainloss/reg_l2_loss', reg_l2loss, is_tpu)
        utils.scalar('trainloss/loss', total_loss, is_tpu)
        train_epochs = tf.cast(global_step,
                               tf.float32) / params['steps_per_epoch']
        utils.scalar('train_epochs', train_epochs, is_tpu)

    moving_average_decay = params['moving_average_decay']
    if moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    if mode == tf.estimator.ModeKeys.TRAIN:
        if params['optimizer'].lower() == 'sgd':
            optimizer = tf.train.MomentumOptimizer(learning_rate,
                                                   momentum=params['momentum'])
        elif params['optimizer'].lower() == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate)
        else:
            raise ValueError('optimizers should be adam or sgd')

        if is_tpu:
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        var_list = tf.trainable_variables()
        if variable_filter_fn:
            var_list = variable_filter_fn(var_list)

        if params.get('clip_gradients_norm', None):
            logging.info('clip gradients norm by %f',
                         params['clip_gradients_norm'])
            grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
            with tf.name_scope('clip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                # First clip each variable's norm, then clip global norm.
                clip_norm = abs(params['clip_gradients_norm'])
                clipped_grads = [
                    tf.clip_by_norm(g, clip_norm) if g is not None else None
                    for g in grads
                ]
                clipped_grads, _ = tf.clip_by_global_norm(
                    clipped_grads, clip_norm)
                utils.scalar('gradient_norm',
                             tf.linalg.global_norm(clipped_grads), is_tpu)
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(total_loss,
                                              global_step,
                                              var_list=var_list)

        if moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(**kwargs):
            """Returns a dictionary that has the evaluation metrics."""
            if params['nms_configs'].get('pyfunc', True):
                detections_bs = []
                nms_configs = params['nms_configs']
                for index in range(kwargs['boxes'].shape[0]):
                    detections = tf.numpy_function(
                        functools.partial(nms_np.per_class_nms,
                                          nms_configs=nms_configs),
                        [
                            kwargs['boxes'][index],
                            kwargs['scores'][index],
                            kwargs['classes'][index],
                            tf.slice(kwargs['image_ids'], [index], [1]),
                            tf.slice(kwargs['image_scales'], [index], [1]),
                            params['num_classes'],
                            nms_configs['max_output_size'],
                        ], tf.float32)
                    detections_bs.append(detections)
                detections_bs = postprocess.transform_detections(
                    tf.stack(detections_bs))
            else:
                # These two branches should be equivalent, but currently they are not.
                # TODO(tanmingxing): enable the non_pyfun path after bug fix.
                nms_boxes, nms_scores, nms_classes, _ = postprocess.per_class_nms(
                    params, kwargs['boxes'], kwargs['scores'],
                    kwargs['classes'], kwargs['image_scales'])
                img_ids = tf.cast(tf.expand_dims(kwargs['image_ids'], -1),
                                  nms_scores.dtype)
                detections_bs = [
                    img_ids * tf.ones_like(nms_scores),
                    nms_boxes[:, :, 1],
                    nms_boxes[:, :, 0],
                    nms_boxes[:, :, 3] - nms_boxes[:, :, 1],
                    nms_boxes[:, :, 2] - nms_boxes[:, :, 0],
                    nms_scores,
                    nms_classes,
                ]
                detections_bs = tf.stack(detections_bs,
                                         axis=-1,
                                         name='detections')

            if params.get('testdev_dir', None):
                logging.info('Eval testdev_dir %s', params['testdev_dir'])
                eval_metric = coco_metric.EvaluationMetric(
                    testdev_dir=params['testdev_dir'])
                coco_metrics = eval_metric.estimator_metric_fn(
                    detections_bs, tf.zeros([1]))
            else:
                logging.info('Eval val with groundtruths %s.',
                             params['val_json_file'])
                eval_metric = coco_metric.EvaluationMetric(
                    filename=params['val_json_file'],
                    label_map=params['label_map'])
                coco_metrics = eval_metric.estimator_metric_fn(
                    detections_bs, kwargs['groundtruth_data'])

            # Add metrics to output.
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])
            output_metrics = {
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            }
            output_metrics.update(coco_metrics)
            return output_metrics

        cls_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(cls_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        box_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(box_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])

        cls_outputs = postprocess.to_list(cls_outputs)
        box_outputs = postprocess.to_list(box_outputs)
        params['nms_configs']['max_nms_inputs'] = anchors.MAX_DETECTION_POINTS
        boxes, scores, classes = postprocess.pre_nms(params, cls_outputs,
                                                     box_outputs)
        metric_fn_inputs = {
            'cls_loss_repeat': cls_loss_repeat,
            'box_loss_repeat': box_loss_repeat,
            'image_ids': labels['source_ids'],
            'groundtruth_data': labels['groundtruth_data'],
            'image_scales': labels['image_scales'],
            'boxes': boxes,
            'scores': scores,
            'classes': classes,
        }
        eval_metrics = (metric_fn, metric_fn_inputs)

    checkpoint = params.get('ckpt') or params.get('backbone_ckpt')

    if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
        # Initialize the model from an EfficientDet or backbone checkpoint.
        if params.get('ckpt') and params.get('backbone_ckpt'):
            raise RuntimeError(
                '--backbone_ckpt and --checkpoint are mutually exclusive')

        if params.get('backbone_ckpt'):
            var_scope = params['backbone_name'] + '/'
            if params['ckpt_var_scope'] is None:
                # Use backbone name as default checkpoint scope.
                ckpt_scope = params['backbone_name'] + '/'
            else:
                ckpt_scope = params['ckpt_var_scope'] + '/'
        else:
            # Load every var in the given checkpoint
            var_scope = ckpt_scope = '/'

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            logging.info('restore variables from %s', checkpoint)

            var_map = utils.get_ckpt_var_map(
                ckpt_path=checkpoint,
                ckpt_scope=ckpt_scope,
                var_scope=var_scope,
                skip_mismatch=params['skip_mismatch'])

            tf.train.init_from_checkpoint(checkpoint, var_map)
            return tf.train.Scaffold()
    elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

        def scaffold_fn():
            """Load moving average variables for eval."""
            logging.info('Load EMA vars with ema_decay=%f',
                         moving_average_decay)
            restore_vars_dict = ema.variables_to_restore(ema_vars)
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)
    else:
        scaffold_fn = None

    if is_tpu:
        return tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            eval_metrics=eval_metrics,
            host_call=utils.get_tpu_host_call(global_step, params),
            scaffold_fn=scaffold_fn,
            training_hooks=training_hooks)
    else:
        # Profile every 1K steps.
        if params.get('profile', False):
            profile_hook = tf.estimator.ProfilerHook(
                save_steps=1000,
                output_dir=params['model_dir'],
                show_memory=True)
            training_hooks.append(profile_hook)

            # Report memory allocation if OOM; note that it slows down the run.
            class OomReportingHook(tf.estimator.SessionRunHook):
                def before_run(self, run_context):
                    return tf.estimator.SessionRunArgs(
                        fetches=[],
                        options=tf.RunOptions(
                            report_tensor_allocations_upon_oom=True))

            training_hooks.append(OomReportingHook())

        logging_hook = tf.estimator.LoggingTensorHook(
            {
                'step': global_step,
                'det_loss': det_loss,
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            },
            every_n_iter=params.get('iterations_per_loop', 100),
        )
        training_hooks.append(logging_hook)

        eval_metric_ops = (eval_metrics[0](
            **eval_metrics[1]) if eval_metrics else None)
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            eval_metric_ops=eval_metric_ops,
            scaffold=scaffold_fn() if scaffold_fn else None,
            training_hooks=training_hooks)
Example #13
def mnasnet_model_fn(features, labels, mode, params):
    """The model_fn for MnasNet to be used with TPUEstimator.

  Args:
    features: `Tensor` of batched images.
    labels: `Tensor` of labels for the data samples
    mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
    params: `dict` of parameters passed to the model from the TPUEstimator,
      `params['batch_size']` is always provided and should be used as the
      effective batch size.

  Returns:
    A `TPUEstimatorSpec` for the model
  """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    # This is essential if using a Keras-derived model.
    K.set_learning_phase(is_training)

    if isinstance(features, dict):
        features = features['feature']

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Adds an identity node to help TFLite export.
        features = tf.identity(features, 'float_image_input')

    # In most cases, the default data format NCHW instead of NHWC should be
    # used for a significant performance boost on GPU. NHWC should be used
    # only if the network needs to be run on CPU since the pooling operations
    # are only supported on NHWC. TPU uses XLA compiler to figure out best layout.
    if params['data_format'] == 'channels_first':
        assert not params['transpose_input']  # channels_first only for GPU
        features = tf.transpose(features, [0, 3, 1, 2])
        stats_shape = [3, 1, 1]
    else:
        stats_shape = [1, 1, 3]

    if params['transpose_input'] and mode != tf.estimator.ModeKeys.PREDICT:
        features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

    # Normalize the image to zero mean and unit variance.
    features -= tf.constant(imagenet_input.MEAN_RGB,
                            shape=stats_shape,
                            dtype=features.dtype)
    features /= tf.constant(imagenet_input.STDDEV_RGB,
                            shape=stats_shape,
                            dtype=features.dtype)

    has_moving_average_decay = (params['moving_average_decay'] > 0)

    tf.logging.info('Using open-source implementation for MnasNet definition.')
    override_params = {}
    if params['batch_norm_momentum']:
        override_params['batch_norm_momentum'] = params['batch_norm_momentum']
    if params['batch_norm_epsilon']:
        override_params['batch_norm_epsilon'] = params['batch_norm_epsilon']
    if params['dropout_rate']:
        override_params['dropout_rate'] = params['dropout_rate']
    if params['data_format']:
        override_params['data_format'] = params['data_format']
    if params['num_label_classes']:
        override_params['num_classes'] = params['num_label_classes']
    if params['depth_multiplier']:
        override_params['depth_multiplier'] = params['depth_multiplier']
    if params['depth_divisor']:
        override_params['depth_divisor'] = params['depth_divisor']
    if params['min_depth']:
        override_params['min_depth'] = params['min_depth']
    override_params['use_keras'] = params['use_keras']

    if params['precision'] == 'bfloat16':
        with tf.contrib.tpu.bfloat16_scope():
            logits, _ = mnasnet_models.build_mnasnet_model(
                features,
                model_name=params['model_name'],
                training=is_training,
                override_params=override_params)
        logits = tf.cast(logits, tf.float32)
    else:  # params['precision'] == 'float32'
        logits, _ = mnasnet_models.build_mnasnet_model(
            features,
            model_name=params['model_name'],
            training=is_training,
            override_params=override_params)

    if params['quantized_training']:
        if is_training:
            tf.logging.info('Adding fake quantization ops for training.')
            tf.contrib.quantize.create_training_graph(
                quant_delay=int(params['steps_per_epoch'] *
                                FLAGS.quantization_delay_epochs))
        else:
            tf.logging.info('Adding fake quantization ops for evaluation.')
            tf.contrib.quantize.create_eval_graph()

    if mode == tf.estimator.ModeKeys.PREDICT:
        scaffold_fn = None
        if FLAGS.export_moving_average:
            # If the model was trained with moving average decay, export it
            # using the moving-average variables so that predictions match the
            # evaluation metrics.
            restore_checkpoint = tf.train.latest_checkpoint(FLAGS.model_dir)
            variables_to_restore = get_pretrained_variables_to_restore(
                restore_checkpoint, load_moving_average=True)
            tf.logging.info('Restoring from the latest checkpoint: %s',
                            restore_checkpoint)
            tf.logging.info(str(variables_to_restore))

            def restore_scaffold():
                saver = tf.train.Saver(variables_to_restore)
                return tf.train.Scaffold(saver=saver)

            scaffold_fn = restore_scaffold

        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            },
            scaffold_fn=scaffold_fn)

    # If necessary, in the model_fn, use params['batch_size'] instead of the
    # batch-size flags (--train_batch_size or --eval_batch_size).
    batch_size = params['batch_size']  # pylint: disable=unused-variable

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    one_hot_labels = tf.one_hot(labels, params['num_label_classes'])
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=one_hot_labels,
        label_smoothing=params['label_smoothing'])

    # Add weight decay to the loss for non-batch-normalization variables.
    loss = cross_entropy + params['weight_decay'] * tf.add_n([
        tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])

    global_step = tf.train.get_global_step()
    if has_moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(
            decay=params['moving_average_decay'], num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    host_call = None
    if is_training:
        # Compute the current epoch and associated learning rate from global_step.
        current_epoch = (tf.cast(global_step, tf.float32) /
                         params['steps_per_epoch'])

        scaled_lr = params['base_learning_rate'] * (params['train_batch_size'] / 256.0)  # pylint: disable=line-too-long
        learning_rate = utils.build_learning_rate(scaled_lr, global_step,
                                                  params['steps_per_epoch'])
        optimizer = utils.build_optimizer(learning_rate)
        if params['use_tpu']:
            # When using TPU, wrap the optimizer with CrossShardOptimizer which
            # handles synchronization details between different TPU cores. To the
            # user, this should look like regular synchronous training.
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

        # Batch normalization requires UPDATE_OPS to be added as a dependency to
        # the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)

        if has_moving_average_decay:
            with tf.control_dependencies([train_op]):
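                # Chaining the EMA update after train_op refreshes the shadow
                # variables exactly once per optimizer step.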
                train_op = ema.apply(ema_vars)

        if not params['skip_host_call']:

            def host_call_fn(gs, loss, lr, ce):
                """Training host call.

        Creates scalar summaries for training metrics.

        This function is executed on the CPU and should not directly reference
        any Tensors in the rest of the `model_fn`. To pass Tensors from the
        model to the `metric_fn`, provide as part of the `host_call`. See
        https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
        for more information.

        Arguments should match the list of `Tensor` objects passed as the second
        element in the tuple passed to `host_call`.

        Args:
          gs: `Tensor` with shape `[batch]` for the global_step.
          loss: `Tensor` with shape `[batch]` for the training loss.
          lr: `Tensor` with shape `[batch]` for the learning_rate.
          ce: `Tensor` with shape `[batch]` for the current_epoch.

        Returns:
          List of summary ops to run on the CPU host.
        """
                gs = gs[0]
                # Host call fns are executed params['iterations_per_loop'] times
                # after one TPU loop finishes. Setting max_queue to the number
                # of iterations makes the summary writer flush the data to
                # storage only once per loop.
                with tf.contrib.summary.create_file_writer(
                        FLAGS.model_dir,
                        max_queue=params['iterations_per_loop']).as_default():
                    with tf.contrib.summary.always_record_summaries():
                        tf.contrib.summary.scalar('loss', loss[0], step=gs)
                        tf.contrib.summary.scalar('learning_rate',
                                                  lr[0],
                                                  step=gs)
                        tf.contrib.summary.scalar('current_epoch',
                                                  ce[0],
                                                  step=gs)

                        return tf.contrib.summary.all_summary_ops()

            # To log the loss, current learning rate, and epoch for TensorBoard,
            # the summary op needs to be run on the host CPU via host_call.
            # host_call expects [batch_size, ...] Tensors, so we reshape to
            # introduce a batch dimension. These Tensors are implicitly
            # concatenated to [params['batch_size']].
            gs_t = tf.reshape(global_step, [1])
            loss_t = tf.reshape(loss, [1])
            lr_t = tf.reshape(learning_rate, [1])
            ce_t = tf.reshape(current_epoch, [1])

            host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits):
            """Evaluation metric function.

      Evaluates accuracy.

      This function is executed on the CPU and should not directly reference
      any Tensors in the rest of the `model_fn`. To pass Tensors from the model
      to the `metric_fn`, provide as part of the `eval_metrics`. See
      https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
      for more information.

      Arguments should match the list of `Tensor` objects passed as the second
      element in the tuple passed to `eval_metrics`.

      Args:
        labels: `Tensor` with shape `[batch]`.
        logits: `Tensor` with shape `[batch, num_classes]`.

      Returns:
        A dict of the metrics to return from evaluation.
      """
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            return {
                'top_1_accuracy': top_1_accuracy,
                'top_5_accuracy': top_5_accuracy,
            }

        eval_metrics = (metric_fn, [labels, logits])

    num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('number of trainable parameters: {}'.format(num_params))

    # Prepares scaffold_fn if needed.
    scaffold_fn = None
    if is_training and FLAGS.init_checkpoint:
        variables_to_restore = get_pretrained_variables_to_restore(
            FLAGS.init_checkpoint, has_moving_average_decay)
        tf.logging.info('Initializing from pretrained checkpoint: %s',
                        FLAGS.init_checkpoint)
        if FLAGS.use_tpu:

            def init_scaffold():
                tf.train.init_from_checkpoint(FLAGS.init_checkpoint,
                                              variables_to_restore)
                return tf.train.Scaffold()

            scaffold_fn = init_scaffold
        else:
            tf.train.init_from_checkpoint(FLAGS.init_checkpoint,
                                          variables_to_restore)

    restore_vars_dict = None
    if not is_training and has_moving_average_decay:
        # Load moving average variables for eval.
        restore_vars_dict = ema.variables_to_restore(ema_vars)

        def eval_scaffold():
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)

        scaffold_fn = eval_scaffold

    return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                           loss=loss,
                                           train_op=train_op,
                                           host_call=host_call,
                                           eval_metrics=eval_metrics,
                                           scaffold_fn=scaffold_fn)
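
Both this example and the previous one follow the same exponential-moving-average recipe: chain ema.apply onto the train op during training, then swap in the shadow variables at eval time. A stripped-down sketch of just that recipe, using a toy variable, loss, and decay value that are not from the original code:

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

global_step = tf.train.get_or_create_global_step()
w = tf.get_variable('w', initializer=1.0)  # toy variable for this sketch
loss = tf.square(w - 3.0)
train_op = tf.train.MomentumOptimizer(0.1, momentum=0.9).minimize(
    loss, global_step)

# Track shadow (moving-average) copies of the trainable variables; chaining
# the update after train_op refreshes the shadows once per step.
ema = tf.train.ExponentialMovingAverage(decay=0.999, num_updates=global_step)
with tf.control_dependencies([train_op]):
    train_op = ema.apply(tf.trainable_variables())

# At eval/export time, a Saver built from variables_to_restore() loads the
# averaged weights into the live variables.
eval_saver = tf.train.Saver(ema.variables_to_restore())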
Example #14
def model_fn(features, labels, mode, params):
    """The model_fn to be used with TPUEstimator.

  Args:
    features: A dict of `Tensor` of batched images and other features.
    labels: a Tensor or a dict of Tensor representing the batched labels.
    mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
    params: `dict` of parameters passed to the model from the TPUEstimator,
      `params['batch_size']` is always provided and should be used as the
      effective batch size.

  Returns:
    A `TPUEstimatorSpec` for the model
  """
    logging.info('params=%s', params)
    images = features['image'] if isinstance(features, dict) else features
    labels = labels['label'] if isinstance(labels, dict) else labels
    config = params['config']
    image_size = params['image_size']
    utils.scalar('model/resolution', image_size)

    if config.model.data_format == 'channels_first':
        images = tf.transpose(images, [0, 3, 1, 2])

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    has_moving_average_decay = (config.train.ema_decay > 0)
    if FLAGS.use_tpu and not config.model.bn_type:
        config.model.bn_type = 'tpu_bn'
    # This is essential if using a Keras-derived model.
    tf.keras.backend.set_learning_phase(is_training)

    def build_model(in_images):
        """Build model using the model_name given through the command line."""
        config.model.num_classes = config.data.num_classes
        model = effnetv2_model.EffNetV2Model(config.model.model_name,
                                             config.model)
        logits = model(in_images, training=is_training)[0]
        return logits

    pre_num_params, pre_num_flops = utils.num_params_flops(
        readable_format=True)

    if config.runtime.mixed_precision:
        precision = 'mixed_bfloat16' if FLAGS.use_tpu else 'mixed_float16'
        logits = utils.build_model_with_precision(precision, build_model,
                                                  images, is_training)
        logits = tf.cast(logits, tf.float32)
    else:
        logits = build_model(images)

    num_params, num_flops = utils.num_params_flops(readable_format=True)
    num_params = num_params - pre_num_params
    num_flops = (num_flops - pre_num_flops) / params['batch_size']
    logging.info('backbone params/flops = %.4f M / %.4f B', num_params,
                 num_flops)
    utils.scalar('model/params', num_params)
    utils.scalar('model/flops', num_flops)

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    if config.train.loss_type == 'sigmoid':
        cross_entropy = tf.losses.sigmoid_cross_entropy(
            multi_class_labels=tf.cast(labels, dtype=logits.dtype),
            logits=logits,
            label_smoothing=config.train.label_smoothing)
    elif config.train.loss_type == 'custom':
        xent = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(
            labels, dtype=logits.dtype),
                                                       logits=logits)
        cross_entropy = tf.reduce_mean(tf.reduce_sum(xent, axis=-1))
    else:
        if config.data.multiclass:
            logging.info('use multi-class loss: %s', config.data.multiclass)
            labels /= tf.reshape(tf.reduce_sum(labels, axis=1), (-1, 1))
        cross_entropy = tf.losses.softmax_cross_entropy(
            onehot_labels=labels,
            logits=logits,
            label_smoothing=config.train.label_smoothing)

    train_steps = max(config.train.min_steps,
                      config.train.epochs * params['steps_per_epoch'])
    global_step = tf.train.get_global_step()
    weight_decay_inc = config.train.weight_decay_inc * (
        tf.cast(global_step, tf.float32) / tf.cast(train_steps, tf.float32))
    weight_decay = (1 + weight_decay_inc) * config.train.weight_decay
    utils.scalar('train/weight_decay', weight_decay)
    # Add weight decay to the loss for non-batch-normalization variables.
    matcher = re.compile(config.train.weight_decay_exclude)
    l2loss = weight_decay * tf.add_n([
        tf.nn.l2_loss(v)
        for v in tf.trainable_variables() if not matcher.match(v.name)
    ])
    loss = cross_entropy + l2loss
    utils.scalar('loss/l2reg', l2loss)
    utils.scalar('loss/xent', cross_entropy)

    if has_moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(decay=config.train.ema_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    host_call = None
    restore_vars_dict = None
    if is_training:
        # Compute the current epoch and associated learning rate from global_step.
        current_epoch = (tf.cast(global_step, tf.float32) /
                         params['steps_per_epoch'])
        utils.scalar('train/epoch', current_epoch)

        scaled_lr = config.train.lr_base * (config.train.batch_size / 256.0)
        scaled_lr_min = config.train.lr_min * (config.train.batch_size / 256.0)
        learning_rate = utils.WarmupLearningRateSchedule(
            scaled_lr,
            steps_per_epoch=params['steps_per_epoch'],
            decay_epochs=config.train.lr_decay_epoch,
            warmup_epochs=config.train.lr_warmup_epoch,
            decay_factor=config.train.lr_decay_factor,
            lr_decay_type=config.train.lr_sched,
            total_steps=train_steps,
            minimal_lr=scaled_lr_min)(global_step)
        utils.scalar('train/lr', learning_rate)
        optimizer = utils.build_optimizer(
            learning_rate, optimizer_name=config.train.optimizer)
        if FLAGS.use_tpu:
            # When using TPU, wrap the optimizer with CrossShardOptimizer which
            # handles synchronization details between different TPU cores. To the
            # user, this should look like regular synchronous training.
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)

        # filter trainable variables if needed.
        var_list = tf.trainable_variables()
        if config.train.varsexp:
            vars2 = [
                v for v in var_list if re.match(config.train.varsexp, v.name)
            ]
            if len(vars2) == len(var_list):
                logging.warning('%s matched all variables; filtering has no '
                                'effect.', config.train.varsexp)
            logging.info('Filter variables: orig=%d, final=%d, delta=%d',
                         len(var_list), len(vars2),
                         len(var_list) - len(vars2))
            var_list = vars2

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if config.train.gclip and is_training:
            logging.info('clip gradients norm by %f', config.train.gclip)
            grads_and_vars = optimizer.compute_gradients(loss, var_list)
            with tf.name_scope('gclip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                utils.scalar('train/gnorm', tf.linalg.global_norm(grads))
                utils.scalar('train/gnormmax',
                             tf.math.reduce_max([tf.norm(g) for g in grads]))
                # First clip each variable's norm, then clip global norm.
                clip_norm = abs(config.train.gclip)
                clipped_grads = [
                    tf.clip_by_norm(g, clip_norm) if g is not None else None
                    for g in grads
                ]
                clipped_grads, _ = tf.clip_by_global_norm(
                    clipped_grads, clip_norm)
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(loss,
                                              global_step,
                                              var_list=var_list)

        if has_moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

        if not config.runtime.skip_host_call:
            host_call = utils.get_tpu_host_call(
                global_step, FLAGS.model_dir,
                config.runtime.iterations_per_loop)
    else:
        train_op = None
        if has_moving_average_decay:
            # Load moving average variables for eval.
            restore_vars_dict = ema.variables_to_restore(ema_vars)

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits):
            """Evaluation metric function.

      Evaluates accuracy.

      This function is executed on the CPU and should not directly reference
      any Tensors in the rest of the `model_fn`. To pass Tensors from the model
      to the `metric_fn`, provide as part of the `eval_metrics`. See
      https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
      for more information.

      Arguments should match the list of `Tensor` objects passed as the second
      element in the tuple passed to `eval_metrics`.

      Args:
        labels: `Tensor` with shape `[batch, num_classes]`.
        logits: `Tensor` with shape `[batch, num_classes]`.

      Returns:
        A dict of the metrics to return from evaluation.
      """
            metrics = {}
            if config.data.multiclass:
                metrics['eval/global_ap'] = tf.metrics.auc(
                    labels,
                    tf.nn.sigmoid(logits),
                    curve='PR',
                    num_thresholds=200,
                    summation_method='careful_interpolation',
                    name='global_ap')

                # Convert labels to a set of label ids: the tf.metrics.*_at_k
                # APIs expect label sets and are easy to misuse.
                labels = tf.cast(labels, dtype=tf.int64)
                label_to_repeat = tf.expand_dims(tf.argmax(labels, axis=-1),
                                                 axis=-1)
                all_labels_set = tf.range(0, labels.shape[-1], dtype=tf.int64)
                all_labels_set = tf.expand_dims(all_labels_set, axis=0)
                labels_set = labels * all_labels_set + (
                    1 - labels) * label_to_repeat
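                # Each row now holds the ids of its positive labels, with the
                # argmax label repeated in the negative slots, so every row is
                # a valid padded label set for the *_at_k metrics below.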

                metrics['eval/precision@1'] = tf.metrics.precision_at_k(
                    labels_set, logits, k=1)
                metrics['eval/recall@1'] = tf.metrics.recall_at_k(labels_set,
                                                                  logits,
                                                                  k=1)
                metrics['eval/precision@5'] = tf.metrics.precision_at_k(
                    labels_set, logits, k=5)
                metrics['eval/recall@5'] = tf.metrics.recall_at_k(labels_set,
                                                                  logits,
                                                                  k=5)

            # always add accuracy.
            labels = tf.argmax(labels, axis=1)
            predictions = tf.argmax(logits, axis=1)
            metrics['eval/acc_top1'] = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            metrics['eval/acc_top5'] = tf.metrics.mean(in_top_5)
            metrics['model/resolution'] = tf.metrics.mean(image_size)
            metrics['model/flops'] = tf.metrics.mean(num_flops)
            metrics['model/params'] = tf.metrics.mean(num_params)
            return metrics

        eval_metrics = (metric_fn, [labels, logits])

    if has_moving_average_decay and not is_training:

        def scaffold_fn():  # read ema for eval jobs.
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)
    elif config.train.ft_init_ckpt and is_training:

        def scaffold_fn():
            logging.info('restore variables from %s',
                         config.train.ft_init_ckpt)
            var_map = utils.get_ckpt_var_map(
                ckpt_path=config.train.ft_init_ckpt,
                skip_mismatch=True,
                init_ema=config.train.ft_init_ema)
            tf.train.init_from_checkpoint(config.train.ft_init_ckpt, var_map)
            return tf.train.Scaffold()
    else:
        scaffold_fn = None

    return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
                                             loss=loss,
                                             train_op=train_op,
                                             host_call=host_call,
                                             eval_metrics=eval_metrics,
                                             scaffold_fn=scaffold_fn)
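
The gradient-clipping branch above first caps each gradient's own norm and then the global norm across all gradients. A minimal sketch of that two-stage scheme in isolation; the toy variable, loss, and clip_norm value are assumptions for illustration:

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

x = tf.get_variable('x', shape=[10], initializer=tf.ones_initializer())
loss = tf.reduce_sum(tf.square(x))
optimizer = tf.train.AdamOptimizer(1e-3)

grads_and_vars = optimizer.compute_gradients(loss)
grads = [gv[0] for gv in grads_and_vars]
tvars = [gv[1] for gv in grads_and_vars]

clip_norm = 10.0  # assumed value for this sketch
# Stage 1: cap each gradient's own norm.
clipped = [tf.clip_by_norm(g, clip_norm) if g is not None else None
           for g in grads]
# Stage 2: cap the global norm across all gradients.
clipped, _ = tf.clip_by_global_norm(clipped, clip_norm)
train_op = optimizer.apply_gradients(list(zip(clipped, tvars)))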
Example #15
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
    """Model definition entry.

  Args:
    features: the input image tensor with shape [batch_size, height, width, 3].
      The height and width are fixed and equal.
    labels: the input labels in a dictionary. The labels include class targets
      and box targets which are dense label maps. The labels are generated by
      the get_input_fn function in data/dataloader.py.
    mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
    params: the dictionary that defines the model hyperparameters. The default
      settings are in the default_hparams function in this file.
    model: the model outputs class logits and box regression outputs.
    variable_filter_fn: the filter function that takes trainable_variables and
      returns the variable list after applying the filter rule.

  Returns:
    tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.

  Raises:
    RuntimeError: if both ckpt and backbone_ckpt are set.
  """

    # Convert params (dict) to Config for easier access.
    def _model_outputs():
        return model(features, config=hparams_config.Config(params))

    if params['use_bfloat16']:
        with tf.tpu.bfloat16_scope():
            cls_outputs, box_outputs = _model_outputs()
            levels = cls_outputs.keys()
            for level in levels:
                cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
                box_outputs[level] = tf.cast(box_outputs[level], tf.float32)
    else:
        cls_outputs, box_outputs = _model_outputs()
        levels = cls_outputs.keys()

    if is_rank0():
        show_model()

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'image': features,
        }
        for level in levels:
            predictions['cls_outputs_%d' % level] = cls_outputs[level]
            predictions['box_outputs_%d' % level] = box_outputs[level]
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)

    # cls_loss and box_loss are for logging; only total_loss is optimized.
    det_loss, cls_loss, box_loss = detection_loss(cls_outputs, box_outputs,
                                                  labels, params)
    l2loss = reg_l2_loss(params['weight_decay'])
    total_loss = det_loss + l2loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        utils.scalar('lrn_rate', learning_rate)
        utils.scalar('trainloss/cls_loss', cls_loss)
        utils.scalar('trainloss/box_loss', box_loss)
        utils.scalar('trainloss/det_loss', det_loss)
        utils.scalar('trainloss/l2_loss', l2loss)
        utils.scalar('trainloss/loss', total_loss)
        utils.scalar('loss', total_loss)  # for consistency

    moving_average_decay = params['moving_average_decay']
    if moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.MomentumOptimizer(learning_rate,
                                               momentum=params['momentum'])

        if horovod_enabled():
            optimizer = hvd.DistributedOptimizer(optimizer)

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        var_list = tf.trainable_variables()
        if variable_filter_fn:
            var_list = variable_filter_fn(var_list, params['resnet_depth'])

        if params.get('clip_gradients_norm', 0) > 0:
            logging.info('clip gradients norm by %f',
                         params['clip_gradients_norm'])
            grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
            with tf.name_scope('clip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                clipped_grads, gnorm = tf.clip_by_global_norm(
                    grads, params['clip_gradients_norm'])
                utils.scalar('gnorm', gnorm)
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(total_loss,
                                              global_step,
                                              var_list=var_list)

        if moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(**kwargs):
            """Returns a dictionary that has the evaluation metrics."""
            batch_size = params['batch_size']
            eval_anchors = anchors.Anchors(params['min_level'],
                                           params['max_level'],
                                           params['num_scales'],
                                           params['aspect_ratios'],
                                           params['anchor_scale'],
                                           params['image_size'])
            anchor_labeler = anchors.AnchorLabeler(eval_anchors,
                                                   params['num_classes'])
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])

            if params.get('testdev_dir', None):
                logging.info('Eval testdev_dir %s', params['testdev_dir'])
                coco_metrics = coco_metric_fn(
                    batch_size,
                    anchor_labeler,
                    params['val_json_file'],
                    testdev_dir=params['testdev_dir'],
                    disable_pyfun=params.get('disable_pyfun', None),
                    **kwargs)
            else:
                logging.info('Eval val with groundtruths %s.',
                             params['val_json_file'])
                coco_metrics = coco_metric_fn(batch_size, anchor_labeler,
                                              params['val_json_file'],
                                              **kwargs)

            # Add metrics to output.
            output_metrics = {
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            }
            output_metrics.update(coco_metrics)
            return output_metrics

        cls_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(cls_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        box_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(box_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        metric_fn_inputs = {
            'cls_loss_repeat': cls_loss_repeat,
            'box_loss_repeat': box_loss_repeat,
            'source_ids': labels['source_ids'],
            'groundtruth_data': labels['groundtruth_data'],
            'image_scales': labels['image_scales'],
        }
        add_metric_fn_inputs(params, cls_outputs, box_outputs,
                             metric_fn_inputs)
        eval_metrics = (metric_fn, metric_fn_inputs)

    # only rank0 to restore, then broadcast variables
    if is_rank0():
        checkpoint = params.get('ckpt') or params.get('backbone_ckpt')
        if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
            # Initialize the model from an EfficientDet or backbone checkpoint.
            if params.get('ckpt') and params.get('backbone_ckpt'):
                raise RuntimeError(
                    '--backbone_ckpt and --checkpoint are mutually exclusive')
            elif params.get('backbone_ckpt'):
                var_scope = params['backbone_name'] + '/'
                if params['ckpt_var_scope'] is None:
                    # Use backbone name as default checkpoint scope.
                    ckpt_scope = params['backbone_name'] + '/'
                else:
                    ckpt_scope = params['ckpt_var_scope'] + '/'
            else:
                # Load every var in the given checkpoint
                var_scope = ckpt_scope = '/'

            def scaffold_fn():
                """Loads pretrained model through scaffold function."""
                logging.info('restore variables from %s', checkpoint)

                var_map = utils.get_ckpt_var_map(ckpt_path=checkpoint,
                                                 ckpt_scope=ckpt_scope,
                                                 var_scope=var_scope,
                                                 var_exclude_expr=params.get(
                                                     'var_exclude_expr', None))

                tf.train.init_from_checkpoint(checkpoint, var_map)

                return tf.train.Scaffold()
        elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

            def scaffold_fn():
                """Load moving average variables for eval."""
                logging.info('Load EMA vars with ema_decay=%f',
                             moving_average_decay)
                restore_vars_dict = ema.variables_to_restore(ema_vars)
                saver = tf.train.Saver(restore_vars_dict)
                return tf.train.Scaffold(saver=saver)
        else:
            scaffold_fn = None
    else:
        scaffold_fn = None

    training_hooks = []
    if horovod_enabled():
        init_weights_hook = BroadcastGlobalVariablesHook(
            root_rank=0, model_dir=params['model_dir'])
        training_hooks.append(init_weights_hook)

    if is_rank0() or params['dump_all_ranks']:
        training_hooks.extend([
            LoggingTensorHook(dict(utils.summaries),
                              summary_dir=params['model_dir'],
                              every_n_iter=params['every_n_iter']),
            ExamplesPerSecondEstimatorHook(
                params['batch_size'],
                every_n_steps=params['every_n_iter'],
                output_dir=params['model_dir'],
                log_global_step=True)
        ])

    if mode == tf.estimator.ModeKeys.TRAIN:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            scaffold=scaffold_fn() if scaffold_fn is not None else None,
            training_hooks=training_hooks)
    else:
        # host_call in the original code was used to write summaries, but it
        # caused the error 'ValueError: Tensor("strided_slice_6:0", shape=(),
        # dtype=int64) must be from the same graph as Tensor("strided_slice:0",
        # shape=(), dtype=float32)'; summaries are therefore handled in
        # write_summary() in main.py.
        return tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            eval_metrics=eval_metrics,
            #host_call=utils.get_tpu_host_call(global_step, params),
            scaffold_fn=scaffold_fn)
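
The two scaffold_fn variants above (warm start from a checkpoint during training, EMA restore during eval) recur throughout these examples. A hedged sketch of the two patterns as standalone factories; the helper names make_warm_start_scaffold_fn and make_ema_eval_scaffold_fn are made up for illustration:

import tensorflow.compat.v1 as tf


def make_warm_start_scaffold_fn(ckpt_path, var_map):
    """Returns a scaffold_fn that initializes variables from a checkpoint."""

    def scaffold_fn():
        tf.train.init_from_checkpoint(ckpt_path, var_map)
        return tf.train.Scaffold()

    return scaffold_fn


def make_ema_eval_scaffold_fn(ema, ema_vars):
    """Returns a scaffold_fn whose Saver restores EMA (shadow) values."""

    def scaffold_fn():
        saver = tf.train.Saver(ema.variables_to_restore(ema_vars))
        return tf.train.Scaffold(saver=saver)

    return scaffold_fn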
Example #16
File: train.py Project: yichenj/tpu
def model_fn(features, labels, mode, params):
    """The model_fn to be used with TPUEstimator.

  Args:
    features: `Tensor` of batched images.
    labels: `Tensor` of labels for the data samples.
    mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
    params: `dict` of parameters passed to the model from the TPUEstimator,
        `params['batch_size']` is always provided and should be used as the
        effective batch size.

  Returns:
    A `TPUEstimatorSpec` for the model
  """
    if isinstance(features, dict):
        features = features['feature']

    stats_shape = [1, 1, 3]

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    has_moving_average_decay = (FLAGS.moving_average_decay > 0)
    # This is essential if using a Keras-derived model.
    tf.keras.backend.set_learning_phase(is_training)
    tf.logging.info('Using open-source implementation.')
    override_params = {}
    if FLAGS.batch_norm_momentum is not None:
        override_params['batch_norm_momentum'] = FLAGS.batch_norm_momentum
    if FLAGS.batch_norm_epsilon is not None:
        override_params['batch_norm_epsilon'] = FLAGS.batch_norm_epsilon
    if FLAGS.dropout_rate is not None:
        override_params['dropout_rate'] = FLAGS.dropout_rate
    if FLAGS.drop_connect_rate is not None:
        override_params['drop_connect_rate'] = FLAGS.drop_connect_rate
    if FLAGS.num_label_classes:
        override_params['num_classes'] = FLAGS.num_label_classes
    if FLAGS.depth_coefficient:
        override_params['depth_coefficient'] = FLAGS.depth_coefficient
    if FLAGS.width_coefficient:
        override_params['width_coefficient'] = FLAGS.width_coefficient

    def normalize_features(features, mean_rgb, stddev_rgb):
        """Normalize the image given the means and stddevs."""
        features -= tf.constant(mean_rgb,
                                shape=stats_shape,
                                dtype=features.dtype)
        features /= tf.constant(stddev_rgb,
                                shape=stats_shape,
                                dtype=features.dtype)
        return features

    def build_model():
        """Build model using the model_name given through the command line."""
        model_builder = None
        if FLAGS.model_name.startswith('efficientnet'):
            model_builder = efficientnet_builder
        else:
            raise ValueError('Model name must start with "efficientnet".')

        normalized_features = normalize_features(features,
                                                 model_builder.MEAN_RGB,
                                                 model_builder.STDDEV_RGB)
        logits, _ = model_builder.build_model(normalized_features,
                                              model_name=FLAGS.model_name,
                                              training=is_training,
                                              override_params=override_params,
                                              model_dir=FLAGS.model_dir)
        return logits

    logits = build_model()

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            })

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    one_hot_labels = tf.one_hot(labels, FLAGS.num_label_classes)
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=one_hot_labels,
        label_smoothing=FLAGS.label_smoothing)

    # Add weight decay to the loss for non-batch-normalization variables.
    loss = cross_entropy + FLAGS.weight_decay * tf.add_n([
        tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])

    global_step = tf.train.get_global_step()
    if has_moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(
            decay=FLAGS.moving_average_decay, num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    train_op = None
    restore_vars_dict = None
    training_hooks = []
    if is_training:
        # Compute the current epoch and associated learning rate from global_step.
        current_epoch = (tf.cast(global_step, tf.float32) /
                         params['steps_per_epoch'])

        scaled_lr = FLAGS.base_learning_rate * (FLAGS.train_batch_size / 256.0)
        learning_rate = utils.build_learning_rate(scaled_lr, global_step,
                                                  params['steps_per_epoch'])
        optimizer = utils.build_optimizer(learning_rate, optimizer_name='adam')

        # Batch normalization requires UPDATE_OPS to be added as a dependency to
        # the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)

        if has_moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

        predictions = tf.argmax(logits, axis=1)
        top1_accuracy = tf.metrics.accuracy(labels, predictions)
        logging_hook = tf.train.LoggingTensorHook(
            {
                "loss": loss,
                "accuracy": top1_accuracy[1],
                "step": global_step
            },
            every_n_iter=1)
        training_hooks.append(logging_hook)

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:
        predictions = tf.argmax(logits, axis=1)
        top1_accuracy = tf.metrics.accuracy(labels, predictions)
        eval_metrics = {'val_accuracy': top1_accuracy}

    num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('number of trainable parameters: {}'.format(num_params))

    scaffold = None
    if has_moving_average_decay and not is_training:
        # Only apply scaffold for eval jobs: restore the EMA (shadow) values.
        # Note: restore_vars_dict must be populated here; leaving it None would
        # make the Saver restore the raw variables instead of the EMA values.
        restore_vars_dict = ema.variables_to_restore(ema_vars)
        saver = tf.train.Saver(restore_vars_dict)
        scaffold = tf.train.Scaffold(saver=saver)

    return tf.estimator.EstimatorSpec(mode=mode,
                                      loss=loss,
                                      train_op=train_op,
                                      training_hooks=training_hooks,
                                      eval_metric_ops=eval_metrics,
                                      scaffold=scaffold)
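
The loss above is the standard recipe in these examples: label-smoothed softmax cross entropy plus L2 weight decay on everything except batch-norm variables. A self-contained sketch of just the loss; the logits variable and the NUM_CLASSES, WEIGHT_DECAY, and LABEL_SMOOTHING constants are made-up values for illustration:

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

NUM_CLASSES = 10        # made-up values for this sketch
WEIGHT_DECAY = 1e-5
LABEL_SMOOTHING = 0.1

logits = tf.get_variable('logits', shape=[8, NUM_CLASSES])
labels = tf.constant([0, 1, 2, 3, 4, 5, 6, 7], dtype=tf.int32)

one_hot_labels = tf.one_hot(labels, NUM_CLASSES)
cross_entropy = tf.losses.softmax_cross_entropy(
    onehot_labels=one_hot_labels,
    logits=logits,
    label_smoothing=LABEL_SMOOTHING)

# Decay everything except batch-norm beta/gamma, matching the filter above.
l2 = WEIGHT_DECAY * tf.add_n([
    tf.nn.l2_loss(v) for v in tf.trainable_variables()
    if 'batch_normalization' not in v.name
])
loss = cross_entropy + l2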
Example #17
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
    """Model definition entry.

  Args:
    features: the input image tensor with shape [batch_size, height, width, 3].
      The height and width are fixed and equal.
    labels: the input labels in a dictionary. The labels include class targets
      and box targets which are dense label maps. The labels are generated by
      the get_input_fn function in data/dataloader.py.
    mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
    params: the dictionary that defines the model hyperparameters. The default
      settings are in the default_hparams function in this file.
    model: the model outputs class logits and box regression outputs.
    variable_filter_fn: the filter function that takes trainable_variables and
      returns the variable list after applying the filter rule.

  Returns:
    tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.

  Raises:
    RuntimeError: if both ckpt and backbone_ckpt are set.
  """
    utils.image('input_image', features)
    training_hooks = []
    params['is_training_bn'] = (mode == tf.estimator.ModeKeys.TRAIN)

    if params['use_keras_model']:

        def model_fn(inputs):
            model = efficientdet_keras.EfficientDetNet(
                config=hparams_config.Config(params))
            cls_out_list, box_out_list = model(inputs,
                                               params['is_training_bn'])
            cls_outputs, box_outputs = {}, {}
            for i in range(params['min_level'], params['max_level'] + 1):
                cls_outputs[i] = cls_out_list[i - params['min_level']]
                box_outputs[i] = box_out_list[i - params['min_level']]
            return cls_outputs, box_outputs
    else:
        model_fn = functools.partial(model,
                                     config=hparams_config.Config(params))

    precision = utils.get_precision(params['strategy'],
                                    params['mixed_precision'])
    cls_outputs, box_outputs = utils.build_model_with_precision(
        precision, model_fn, features, params['is_training_bn'])

    levels = cls_outputs.keys()
    for level in levels:
        cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
        box_outputs[level] = tf.cast(box_outputs[level], tf.float32)

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'image': features,
        }
        for level in levels:
            predictions['cls_outputs_%d' % level] = cls_outputs[level]
            predictions['box_outputs_%d' % level] = box_outputs[level]
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)

    # cls_loss and box_loss are for logging; only total_loss is optimized.
    det_loss, cls_loss, box_loss, box_iou_loss = detection_loss(
        cls_outputs, box_outputs, labels, params)
    reg_l2loss = reg_l2_loss(params['weight_decay'])
    total_loss = det_loss + reg_l2loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        utils.scalar('lrn_rate', learning_rate)
        utils.scalar('trainloss/cls_loss', cls_loss)
        utils.scalar('trainloss/box_loss', box_loss)
        utils.scalar('trainloss/det_loss', det_loss)
        utils.scalar('trainloss/reg_l2_loss', reg_l2loss)
        utils.scalar('trainloss/loss', total_loss)
        if params['iou_loss_type']:
            utils.scalar('trainloss/box_iou_loss', box_iou_loss)
        train_epochs = tf.cast(global_step,
                               tf.float32) / params['steps_per_epoch']
        utils.scalar('train_epochs', train_epochs)

    moving_average_decay = params['moving_average_decay']
    if moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    if mode == tf.estimator.ModeKeys.TRAIN:
        if params['optimizer'].lower() == 'sgd':
            optimizer = tf.train.MomentumOptimizer(learning_rate,
                                                   momentum=params['momentum'])
        elif params['optimizer'].lower() == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate)
        else:
            raise ValueError('optimizer should be adam or sgd')

        if params['strategy'] == 'tpu':
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)
        if params['gradient_checkpointing']:
            from third_party.grad_checkpoint \
                import memory_saving_gradients  # pylint: disable=g-import-not-at-top
            from tensorflow.python.ops \
                import gradients  # pylint: disable=g-import-not-at-top

            # monkey patch tf.gradients to point to our custom version,
            # with automatic checkpoint selection
            def gradients_(ys, xs, grad_ys=None, **kwargs):
                return memory_saving_gradients.gradients(
                    ys,
                    xs,
                    grad_ys,
                    checkpoints=params['gradient_checkpointing_list'],
                    **kwargs)

            gradients.__dict__["gradients"] = gradients_

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        var_list = tf.trainable_variables()
        if variable_filter_fn:
            var_list = variable_filter_fn(var_list)

        if params.get('clip_gradients_norm', None):
            logging.info('clip gradients norm by %f',
                         params['clip_gradients_norm'])
            grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
            with tf.name_scope('clip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                # First clip each variable's norm, then clip global norm.
                clip_norm = abs(params['clip_gradients_norm'])
                clipped_grads = [tf.clip_by_norm(g, clip_norm) for g in grads]
                clipped_grads, _ = tf.clip_by_global_norm(
                    clipped_grads, clip_norm)
                utils.scalar('gradient_norm',
                             tf.linalg.global_norm(clipped_grads))
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(total_loss,
                                              global_step,
                                              var_list=var_list)

        if moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(**kwargs):
            """Returns a dictionary that has the evaluation metrics."""
            if params['nms_configs'].get('pyfunc', True):
                detections_bs = []
                for index in range(kwargs['boxes'].shape[0]):
                    nms_configs = params['nms_configs']
                    detections = tf.numpy_function(
                        functools.partial(nms_np.per_class_nms,
                                          nms_configs=nms_configs),
                        [
                            kwargs['boxes'][index],
                            kwargs['scores'][index],
                            kwargs['classes'][index],
                            tf.slice(kwargs['image_ids'], [index], [1]),
                            tf.slice(kwargs['image_scales'], [index], [1]),
                            params['num_classes'],
                            nms_configs['max_output_size'],
                        ], tf.float32)
                    detections_bs.append(detections)
                detections_bs = postprocess.transform_detections(
                    tf.stack(detections_bs))
            else:
                # These two branches should be equivalent, but currently they are not.
                # TODO(tanmingxing): enable the non_pyfun path after bug fix.
                nms_boxes, nms_scores, nms_classes, _ = postprocess.per_class_nms(
                    params, kwargs['boxes'], kwargs['scores'],
                    kwargs['classes'], kwargs['image_scales'])
                img_ids = tf.cast(tf.expand_dims(kwargs['image_ids'], -1),
                                  nms_scores.dtype)
                detections_bs = [
                    img_ids * tf.ones_like(nms_scores),
                    nms_boxes[:, :, 1],
                    nms_boxes[:, :, 0],
                    nms_boxes[:, :, 3] - nms_boxes[:, :, 1],
                    nms_boxes[:, :, 2] - nms_boxes[:, :, 0],
                    nms_scores,
                    nms_classes,
                ]
                detections_bs = tf.stack(detections_bs,
                                         axis=-1,
                                         name='detections')

            if params.get('testdev_dir', None):
                logging.info('Eval testdev_dir %s', params['testdev_dir'])
                eval_metric = coco_metric.EvaluationMetric(
                    testdev_dir=params['testdev_dir'])
                coco_metrics = eval_metric.estimator_metric_fn(
                    detections_bs, tf.zeros([1]))
            else:
                logging.info('Eval val with groundtruths %s.',
                             params['val_json_file'])
                eval_metric = coco_metric.EvaluationMetric(
                    filename=params['val_json_file'])
                coco_metrics = eval_metric.estimator_metric_fn(
                    detections_bs, kwargs['groundtruth_data'],
                    params['label_map'])

            # Add metrics to output.
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])
            output_metrics = {
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            }
            output_metrics.update(coco_metrics)
            return output_metrics

        cls_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(cls_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        box_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(box_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])

        cls_outputs = postprocess.to_list(cls_outputs)
        box_outputs = postprocess.to_list(box_outputs)
        params['nms_configs']['max_nms_inputs'] = anchors.MAX_DETECTION_POINTS
        boxes, scores, classes = postprocess.pre_nms(params, cls_outputs,
                                                     box_outputs)
        metric_fn_inputs = {
            'cls_loss_repeat': cls_loss_repeat,
            'box_loss_repeat': box_loss_repeat,
            'image_ids': labels['source_ids'],
            'groundtruth_data': labels['groundtruth_data'],
            'image_scales': labels['image_scales'],
            'boxes': boxes,
            'scores': scores,
            'classes': classes,
        }
        eval_metrics = (metric_fn, metric_fn_inputs)

    checkpoint = params.get('ckpt') or params.get('backbone_ckpt')

    if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
        # Initialize the model from an EfficientDet or backbone checkpoint.
        if params.get('ckpt') and params.get('backbone_ckpt'):
            raise RuntimeError(
                '--backbone_ckpt and --checkpoint are mutually exclusive')

        if params.get('backbone_ckpt'):
            var_scope = params['backbone_name'] + '/'
            if params['ckpt_var_scope'] is None:
                # Use backbone name as default checkpoint scope.
                ckpt_scope = params['backbone_name'] + '/'
            else:
                ckpt_scope = params['ckpt_var_scope'] + '/'
        else:
            # Load every var in the given checkpoint
            var_scope = ckpt_scope = '/'

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            logging.info('restore variables from %s', checkpoint)

            var_map = utils.get_ckpt_var_map(
                ckpt_path=checkpoint,
                ckpt_scope=ckpt_scope,
                var_scope=var_scope,
                skip_mismatch=params['skip_mismatch'])

            tf.train.init_from_checkpoint(checkpoint, var_map)
            return tf.train.Scaffold()
    elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

        def scaffold_fn():
            """Load moving average variables for eval."""
            logging.info('Load EMA vars with ema_decay=%f',
                         moving_average_decay)
            restore_vars_dict = ema.variables_to_restore(ema_vars)
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)
    else:
        scaffold_fn = None

    if params['strategy'] != 'tpu':
        # Profile every 1K steps.
        if params.get('profile', False):
            profile_hook = tf.estimator.ProfilerHook(
                save_steps=1000,
                output_dir=params['model_dir'],
                show_memory=True)
            training_hooks.append(profile_hook)

            # Report memory allocation if an OOM happens.
            class OomReportingHook(tf.estimator.SessionRunHook):
                def before_run(self, run_context):
                    return tf.estimator.SessionRunArgs(
                        fetches=[],
                        options=tf.RunOptions(
                            report_tensor_allocations_upon_oom=True))

            training_hooks.append(OomReportingHook())

        logging_hook = tf.estimator.LoggingTensorHook(
            {
                'step': global_step,
                'det_loss': det_loss,
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            },
            every_n_iter=params.get('iterations_per_loop', 100),
        )
        training_hooks.append(logging_hook)

        if params["nvgpu_logging"]:
            try:
                from third_party import nvgpu  # pylint: disable=g-import-not-at-top
                from functools import reduce  # pylint: disable=g-import-not-at-top

                def get_nested_value(d, path):
                    return reduce(dict.get, path, d)

                def nvgpu_gpu_info(inp):
                    inp = inp.decode('utf-8')
                    path = [x.strip() for x in inp.split(',')]
                    value = get_nested_value(nvgpu.gpu_info(), path)
                    # np.str was removed from NumPy; plain str is identical.
                    return str(value)

                def commonsize(inp):
                    """Convert an nvidia-smi '<value> <unit>' string to MiB."""
                    const_sizes = {
                        'B': 1,
                        'KB': 1e3,
                        'MB': 1e6,
                        'GB': 1e9,
                        'TB': 1e12,
                        'PB': 1e15,
                        'KiB': 1024,
                        'MiB': 1048576,
                        'GiB': 1073741824,
                    }
                    inp = inp.split(' ')
                    # Convert everything to MiB.
                    if inp[1] != 'MiB':
                        inp_ = float(inp[0]) * (
                            const_sizes[inp[1]] / 1048576.0)
                    else:
                        inp_ = float(inp[0])
                    return inp_

                def formatter_log(tensors):
                    """Format the output."""
                    mem_used = tensors['memory used'].decode('utf-8')
                    mem_total = tensors['memory total'].decode('utf-8')
                    mem_util = commonsize(mem_used) / commonsize(mem_total)
                    return ('GPU memory used: {} = {:.1%} of total GPU '
                            'memory: {}'.format(mem_used, mem_util, mem_total))

                mem_used = tf.py_func(nvgpu_gpu_info,
                                      ['gpu, fb_memory_usage, used'],
                                      [tf.string])[0]
                mem_total = tf.py_func(nvgpu_gpu_info,
                                       ['gpu, fb_memory_usage, total'],
                                       [tf.string])[0]

                mem_logging_hook = tf.estimator.LoggingTensorHook(
                    tensors={
                        'memory used': mem_used,
                        'memory total': mem_total,
                    },
                    every_n_iter=params.get('iterations_per_loop', 100),
                    formatter=formatter_log,
                )
                training_hooks.append(mem_logging_hook)
            except Exception:  # pylint: disable=broad-except
                logging.error('nvgpu error: nvidia-smi format not recognized')

    if params['strategy'] == 'tpu':
        return tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            eval_metrics=eval_metrics,
            host_call=utils.get_tpu_host_call(global_step, params),
            scaffold_fn=scaffold_fn,
            training_hooks=training_hooks)
    else:
        eval_metric_ops = eval_metrics[0](
            **eval_metrics[1]) if eval_metrics else None
        # Build host-call summaries; the return value is unused off-TPU.
        utils.get_tpu_host_call(global_step, params)
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            eval_metric_ops=eval_metric_ops,
            scaffold=scaffold_fn() if scaffold_fn else None,
            training_hooks=training_hooks)
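Note: the commonsize helper above normalizes the '<value> <unit>' strings
reported by nvidia-smi to MiB before computing the utilization ratio. A
minimal standalone sketch of the same arithmetic (the mib name and the test
values are illustrative, not part of the example above):

def mib(size_str):
    """Convert an nvidia-smi-style '<value> <unit>' string to MiB."""
    units = {'B': 1, 'KB': 1e3, 'MB': 1e6, 'GB': 1e9,
             'KiB': 1024, 'MiB': 1048576, 'GiB': 1073741824}
    value, unit = size_str.split(' ')
    return float(value) * units[unit] / 1048576.0

assert mib('512 MiB') == 512.0
assert mib('1 GiB') == 1024.0
assert round(mib('1 GB'), 2) == 953.67  # decimal GB is smaller than GiB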
Example No. 18
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
    """Model definition entry.

  Args:
    features: the input image tensor with shape [batch_size, height, width, 3].
      The height and width are fixed and equal.
    labels: the input labels in a dictionary. The labels include class targets
      and box targets, which are dense label maps. The labels are generated by
      the get_input_fn function in data/dataloader.py.
    mode: the mode of TPUEstimator, one of TRAIN, EVAL, or PREDICT.
    params: the dictionary that defines the hyperparameters of the model. The
      default settings are in the default_hparams function in this file.
    model: the model function, which outputs class logits and box regression
      outputs.
    variable_filter_fn: the filter function that takes trainable_variables and
      returns the variable list after applying the filter rule.

  Returns:
    tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.
  """

    # Convert params (dict) to Config for easier access.
    def _model_outputs():
        return model(features, config=hparams_config.Config(params))

    if params['use_bfloat16']:
        with tf.tpu.bfloat16_scope():
            cls_outputs, box_outputs = _model_outputs()
            levels = cls_outputs.keys()
            for level in levels:
                cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
                box_outputs[level] = tf.cast(box_outputs[level], tf.float32)
    else:
        cls_outputs, box_outputs = _model_outputs()
        levels = cls_outputs.keys()

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'image': features,
        }
        for level in levels:
            predictions['cls_outputs_%d' % level] = cls_outputs[level]
            predictions['box_outputs_%d' % level] = box_outputs[level]
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)

    # cls_loss and box_loss are for logging; only total_loss is optimized.
    det_loss, cls_loss, box_loss = detection_loss(cls_outputs, box_outputs,
                                                  labels, params)
    l2loss = reg_l2_loss(params['weight_decay'])
    total_loss = det_loss + l2loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        utils.scalar('lrn_rate', learning_rate)
        utils.scalar('trainloss/cls_loss', cls_loss)
        utils.scalar('trainloss/box_loss', box_loss)
        utils.scalar('trainloss/det_loss', det_loss)
        utils.scalar('trainloss/l2_loss', l2loss)
        utils.scalar('trainloss/loss', total_loss)

    moving_average_decay = params['moving_average_decay']
    if moving_average_decay:
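        # With num_updates set, TF uses min(decay, (1 + step) / (10 + step)),
        # so the effective EMA decay ramps up early in training.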
        ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.MomentumOptimizer(learning_rate,
                                               momentum=params['momentum'])
        if params['use_tpu']:
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        var_list = tf.trainable_variables()
        if variable_filter_fn:
            var_list = variable_filter_fn(var_list, params['resnet_depth'])

        if params.get('clip_gradients_norm', 0) > 0:
            tf.logging.info('clip gradients norm by {}'.format(
                params['clip_gradients_norm']))
            grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
            with tf.name_scope('clip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                clipped_grads, gnorm = tf.clip_by_global_norm(
                    grads, params['clip_gradients_norm'])
                utils.scalar('gnorm', gnorm)
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(total_loss,
                                              global_step,
                                              var_list=var_list)

        if moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(**kwargs):
            """Returns a dictionary that has the evaluation metrics."""
            batch_size = params['batch_size']
            eval_anchors = anchors.Anchors(params['min_level'],
                                           params['max_level'],
                                           params['num_scales'],
                                           params['aspect_ratios'],
                                           params['anchor_scale'],
                                           params['image_size'])
            anchor_labeler = anchors.AnchorLabeler(eval_anchors,
                                                   params['num_classes'])
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])
            coco_metrics = coco_metric_fn(batch_size, anchor_labeler,
                                          params['val_json_file'], **kwargs)

            # Add metrics to output.
            output_metrics = {
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            }
            output_metrics.update(coco_metrics)
            return output_metrics

        cls_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(cls_loss, 0), [params['batch_size']]),
            [params['batch_size'], 1])
        box_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(box_loss, 0), [params['batch_size']]),
            [params['batch_size'], 1])
        metric_fn_inputs = {
            'cls_loss_repeat': cls_loss_repeat,
            'box_loss_repeat': box_loss_repeat,
            'source_ids': labels['source_ids'],
            'groundtruth_data': labels['groundtruth_data'],
            'image_scales': labels['image_scales'],
        }
        add_metric_fn_inputs(params, cls_outputs, box_outputs,
                             metric_fn_inputs)
        eval_metrics = (metric_fn, metric_fn_inputs)

    if params['backbone_ckpt'] and mode == tf.estimator.ModeKeys.TRAIN:

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            tf.logging.info('restore variables from %s' %
                            params['backbone_ckpt'])
            if params['ckpt_var_scope'] is None:
                # Use the backbone name as the default checkpoint scope.
                ckpt_scope = params['backbone_name']
            else:
                ckpt_scope = params['ckpt_var_scope']
            tf.train.init_from_checkpoint(
                params['backbone_ckpt'],
                utils.get_ckt_var_map(params['backbone_ckpt'],
                                      ckpt_scope + '/',
                                      params['backbone_name'] + '/'))
            return tf.train.Scaffold()
    elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

        def scaffold_fn():
            """Load moving average variables for eval."""
            tf.logging.info('Load EMA vars with ema_decay=%f' %
                            moving_average_decay)
            restore_vars_dict = ema.variables_to_restore(ema_vars)
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)
    else:
        scaffold_fn = None

    return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
                                             loss=total_loss,
                                             train_op=train_op,
                                             eval_metrics=eval_metrics,
                                             host_call=utils.get_tpu_host_call(
                                                 global_step, params),
                                             scaffold_fn=scaffold_fn)
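Note: the EVAL-mode scaffold in these examples relies on
ExponentialMovingAverage.variables_to_restore, which maps each variable to
the name of its EMA shadow so the Saver loads the averaged weights rather
than the raw training weights. A minimal sketch in TF1 graph mode (a toy
variable, not the detection graph above):

import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

w = tf.get_variable('w', initializer=1.0)
ema = tf.train.ExponentialMovingAverage(decay=0.9998)
update_op = ema.apply([w])  # creates shadow var 'w/ExponentialMovingAverage'
restore_map = ema.variables_to_restore()
# restore_map is {'w/ExponentialMovingAverage': w, ...}; restoring from a
# checkpoint through this Saver fills w with its averaged value.
saver = tf.train.Saver(restore_map)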
Example No. 19
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
  def _model_outputs():
    return model(features, config=hparams_config.Config(params))

  if params['use_bfloat16']:
    with tf.tpu.bfloat16_scope():
      cls_outputs, box_outputs = _model_outputs()
      levels = cls_outputs.keys()
      for level in levels:
        cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
        box_outputs[level] = tf.cast(box_outputs[level], tf.float32)
  else:
    cls_outputs, box_outputs = _model_outputs()
    levels = cls_outputs.keys()

  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        'image': features,
    }
    for level in levels:
      predictions['cls_outputs_%d' % level] = cls_outputs[level]
      predictions['box_outputs_%d' % level] = box_outputs[level]
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

  update_learning_rate_schedule_parameters(params)
  global_step = tf.train.get_or_create_global_step()
  learning_rate = learning_rate_schedule(params, global_step)

  det_loss, cls_loss, box_loss = detection_loss(
      cls_outputs, box_outputs, labels, params)
  l2loss = reg_l2_loss(params['weight_decay'])
  total_loss = det_loss + l2loss

  if mode == tf.estimator.ModeKeys.TRAIN:
    utils.scalar('lrn_rate', learning_rate)
    utils.scalar('trainloss/cls_loss', cls_loss)
    utils.scalar('trainloss/box_loss', box_loss)
    utils.scalar('trainloss/det_loss', det_loss)
    utils.scalar('trainloss/l2_loss', l2loss)
    utils.scalar('trainloss/loss', total_loss)

  moving_average_decay = params['moving_average_decay']
  if moving_average_decay:
    ema = tf.train.ExponentialMovingAverage(
        decay=moving_average_decay, num_updates=global_step)
    ema_vars = utils.get_ema_vars()

  if mode == tf.estimator.ModeKeys.TRAIN:
    optimizer = tf.train.MomentumOptimizer(
        learning_rate, momentum=params['momentum'])
    if params['use_tpu']:
      optimizer = tf.tpu.CrossShardOptimizer(optimizer)

    # Batch norm requires update_ops to be added as a train_op dependency.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    var_list = tf.trainable_variables()
    if variable_filter_fn:
      var_list = variable_filter_fn(var_list, params['resnet_depth'])

    if params.get('clip_gradients_norm', 0) > 0:
      logging.info('clip gradients norm by %f', params['clip_gradients_norm'])
      grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
      with tf.name_scope('clip'):
        grads = [gv[0] for gv in grads_and_vars]
        tvars = [gv[1] for gv in grads_and_vars]
        clipped_grads, gnorm = tf.clip_by_global_norm(
            grads, params['clip_gradients_norm'])
        utils.scalar('gnorm', gnorm)
        grads_and_vars = list(zip(clipped_grads, tvars))

      with tf.control_dependencies(update_ops):
        train_op = optimizer.apply_gradients(grads_and_vars, global_step)
    else:
      with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(total_loss, global_step, var_list=var_list)

    if moving_average_decay:
      with tf.control_dependencies([train_op]):
        train_op = ema.apply(ema_vars)

  else:
    train_op = None
  eval_metrics = None

  checkpoint = params.get('ckpt') or params.get('backbone_ckpt')
  if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
    # Initialize the model from an EfficientDet or backbone checkpoint.
    if params.get('ckpt') and params.get('backbone_ckpt'):
      raise RuntimeError(
          '--backbone_ckpt and --checkpoint are mutually exclusive')
    elif params.get('backbone_ckpt'):
      var_scope = params['backbone_name'] + '/'
      if params['ckpt_var_scope'] is None:
        # Use backbone name as default checkpoint scope.
        ckpt_scope = params['backbone_name'] + '/'
      else:
        ckpt_scope = params['ckpt_var_scope'] + '/'
    else:
      # Load every var in the given checkpoint
      var_scope = ckpt_scope = '/'

    def scaffold_fn():
      """Loads pretrained model through scaffold function."""
      logging.info('restore variables from %s', checkpoint)
      var_map = utils.get_ckpt_var_map(
          ckpt_path=checkpoint,
          ckpt_scope=ckpt_scope,
          var_scope=var_scope,
          var_exclude_expr=params.get('var_exclude_expr', None))
      tf.train.init_from_checkpoint(checkpoint, var_map)
      return tf.train.Scaffold()

  elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:
    def scaffold_fn():
      """Load moving average variables for eval."""
      logging.info('Load EMA vars with ema_decay=%f', moving_average_decay)
      restore_vars_dict = ema.variables_to_restore(ema_vars)
      saver = tf.train.Saver(restore_vars_dict)
      return tf.train.Scaffold(saver=saver)
  else:
    scaffold_fn = None

  return tf.estimator.tpu.TPUEstimatorSpec(
      mode=mode,
      loss=total_loss,
      train_op=train_op,
      eval_metrics=eval_metrics,
      host_call=utils.get_tpu_host_call(global_step, params),
      scaffold_fn=scaffold_fn)
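Note: the gradient-clipping branch shared by these _model_fn variants
reduces to a compute/clip/apply sequence. A self-contained sketch in TF1
graph mode (toy loss; the variable and clip_norm=5.0 are illustrative):

import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

x = tf.Variable([3.0, 4.0])
loss = tf.reduce_sum(tf.square(x))
opt = tf.train.MomentumOptimizer(learning_rate=0.1, momentum=0.9)

grads_and_vars = opt.compute_gradients(loss, var_list=[x])
grads, tvars = zip(*grads_and_vars)
# Rescale all gradients jointly so their global L2 norm is at most 5.0.
clipped, gnorm = tf.clip_by_global_norm(list(grads), clip_norm=5.0)
train_op = opt.apply_gradients(
    zip(clipped, tvars), global_step=tf.train.get_or_create_global_step())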