Ejemplo n.º 1
0
def run_classifier(c_value, kernel='linear'):
    """Fit an SVC and score it on both the test and the training split.

    The classifier is trained on the module-level ``X_train``/``y_train`` and
    used to predict ``X_test`` and ``X_train``. A confusion matrix is built
    for each split, handed to ``plot_heatmap`` together with an output path
    under ``../confusions/``, and reduced to a score by ``get_precision``.

    Args:
        c_value: value forwarded as ``C`` to ``svm.SVC``.
        kernel: kernel name forwarded to ``svm.SVC``.

    Returns:
        Tuple ``(test_precision, train_precision)``.
    """
    # multiclass support is handled by one-vs-one scheme
    model = svm.SVC(C=c_value, kernel=kernel)
    model.fit(X_train, y_train)

    predicted_on_test = model.predict(X_test)
    predicted_on_train = model.predict(X_train)

    # Sanity check: exactly one prediction per test label.
    assert len(predicted_on_test) == len(y_test)

    confusion_on_test = get_confusion_matrix(predicted_on_test, y_test)
    confusion_on_train = get_confusion_matrix(predicted_on_train, y_train)

    plot_heatmap(confusion_on_test,
                 f'../confusions/test_c_{c_value}_kernel_{kernel}.png')
    plot_heatmap(confusion_on_train,
                 f'../confusions/train_c_{c_value}_kernel_{kernel}.png')

    scores = (get_precision(confusion_on_test),
              get_precision(confusion_on_train))

    print(f'Finished iteration with c {c_value} and kernel {kernel}')

    return scores
Ejemplo n.º 2
0
def get_optimizer(params):
  """Build the optimizer described by `params`.

  Args:
    params: dict read for the keys 'momentum', 'optimizer',
      'moving_average_decay', 'strategy', 'mixed_precision' and (only when
      the precision is 'mixed_float16') 'loss_scale'. It is also passed whole
      to `learning_rate_schedule`.

  Returns:
    A Keras SGD or Adam optimizer, wrapped in a tensorflow_addons
    MovingAverage when 'moving_average_decay' is truthy, and additionally in
    a LossScaleOptimizer when the precision is 'mixed_float16' and
    'loss_scale' is truthy.

  Raises:
    ValueError: if params['optimizer'] is neither 'sgd' nor 'adam'
      (compared case-insensitively).
  """
  lr = learning_rate_schedule(params)
  momentum = params['momentum']
  optimizer_name = params['optimizer'].lower()

  if optimizer_name == 'sgd':
    logging.info('Use SGD optimizer')
    optimizer = tf.keras.optimizers.SGD(lr, momentum=momentum)
  elif optimizer_name == 'adam':
    logging.info('Use Adam optimizer')
    # The shared 'momentum' hyperparameter is used as Adam's beta_1.
    optimizer = tf.keras.optimizers.Adam(lr, beta_1=momentum)
  else:
    raise ValueError('optimizers should be adam or sgd')

  ema_decay = params['moving_average_decay']
  if ema_decay:
    # TODO(tanmingxing): potentially add dynamic_decay for new tfa release.
    # Imported lazily so tensorflow_addons is only needed when EMA is enabled.
    from tensorflow_addons import optimizers as tfa_optimizers  # pylint: disable=g-import-not-at-top
    optimizer = tfa_optimizers.MovingAverage(
        optimizer, average_decay=ema_decay, dynamic_decay=True)

  policy_name = utils.get_precision(params['strategy'],
                                    params['mixed_precision'])
  # Loss scaling is applied only for float16 mixed precision with a
  # configured loss scale; everything else returns the optimizer as built.
  if policy_name != 'mixed_float16' or not params['loss_scale']:
    return optimizer

  dynamic_scale = tf.mixed_precision.experimental.DynamicLossScale(
      params['loss_scale'])
  return tf.keras.mixed_precision.experimental.LossScaleOptimizer(
      optimizer, loss_scale=dynamic_scale)
Ejemplo n.º 3
0
    def __init__(self,
                 model_name: Text,
                 ckpt_path: Text = None,
                 batch_size: int = 1,
                 only_network: bool = False,
                 model_params: Dict[Text, Any] = None):
        """Set up the inference driver and the Keras mixed-precision policy.

        Args:
          model_name: target model name, such as efficientdet-d0.
          ckpt_path: checkpoint path, such as /tmp/efficientdet-d0/.
          batch_size: batch size for inference.
          only_network: only use the network without pre/post processing.
          model_params: optional overrides applied on top of the detection
            config for `model_name`.
        """
        super().__init__()

        # Plain constructor arguments.
        self.model_name = model_name
        self.ckpt_path = ckpt_path
        self.batch_size = batch_size
        self.only_network = only_network
        self._model = None

        # Default config for the model, then user overrides; batch-norm
        # training mode is always forced off afterwards.
        params = hparams_config.get_detection_config(model_name).as_dict()
        if model_params:
            params.update(model_params)
        params.update(is_training_bn=False)
        self.params = params
        self.label_map = params.get('label_map')

        # Derive the precision name from the config and install it as the
        # Keras mixed-precision policy.
        precision = utils.get_precision(params.get('strategy'),
                                        params.get('mixed_precision'))
        tf.keras.mixed_precision.experimental.set_policy(
            tf.keras.mixed_precision.experimental.Policy(precision))
Ejemplo n.º 4
0
def build_model(model_name: Text, inputs: tf.Tensor, **kwargs):
    """Build the detection model for `model_name` and run it on `inputs`.

    Args:
      model_name: the name of the model.
      inputs: an image tensor or a numpy array.
      **kwargs: extra parameters for the model builder; 'mixed_precision' and
        'strategy' are also read here to pick the precision.

    Returns:
      (cls_outputs, box_outputs): the outputs for class and box predictions.
      Each is a dictionary with key as feature level and value as predictions.
      When 'mixed_precision' is truthy, every value is cast to float32.
    """
    arch_fn = det_model_fn.get_model_arch(model_name)
    use_mixed_precision = kwargs.get('mixed_precision', None)
    policy_name = utils.get_precision(kwargs.get('strategy', None),
                                      use_mixed_precision)
    cls_outputs, box_outputs = utils.build_model_with_precision(
        policy_name, arch_fn, inputs, False, model_name, **kwargs)

    if not use_mixed_precision:
        return cls_outputs, box_outputs

    def _as_float32(outputs_by_level):
        """Return a copy of the per-level dict with float32 tensors."""
        return {
            level: tf.cast(tensor, tf.float32)
            for level, tensor in outputs_by_level.items()
        }

    # Post-processing has multiple places with hard-coded float32.
    # TODO(tanmingxing): Remove them once post-process can adapt to dtypes.
    return _as_float32(cls_outputs), _as_float32(box_outputs)
Ejemplo n.º 5
0
    def __init__(self, ckpt_path, debug, *args, **kwargs):
        """Build the Keras EfficientDet model and restore its checkpoint.

        Args:
          ckpt_path: checkpoint path, such as /tmp/efficientdet-d0/.
          debug: bool, if true, run in debug mode (tf.functions run eagerly).
          *args: positional arguments forwarded to the parent constructor.
          **kwargs: keyword arguments forwarded to the parent constructor.
        """
        super().__init__(*args, **kwargs)

        # Work on a copy so overriding the config cannot touch self.params.
        overrides = copy.deepcopy(self.params)
        config = hparams_config.get_efficientdet_config(self.model_name)
        config.override(overrides)

        # The global precision policy is set before the model is created.
        precision = utils.get_precision(config.strategy,
                                        config.mixed_precision)
        tf.keras.mixed_precision.set_global_policy(
            tf.keras.mixed_precision.Policy(precision))

        self.model = efficientdet_keras.EfficientDetModel(config=config)
        image_dims = utils.parse_image_size(config.image_size)
        input_shape = (self.batch_size, *image_dims, 3)
        self.model.build(input_shape)

        util_keras.restore_ckpt(self.model,
                                ckpt_path,
                                config.moving_average_decay,
                                skip_mismatch=False)

        self.debug = debug
        if debug:
            tf.config.run_functions_eagerly(debug)
Ejemplo n.º 6
0
def launch(symbol):
    """Monitor `symbol` on the 5M timeframe and place pending orders.

    While ``is_allowed(symbol)`` holds, the loop polls once per second:

    * BUY setup: MA21 < MA8 < min(open, close) of the current candle and
      ``check_anchor_chart(symbol, 'buy')`` is truthy. The function then
      waits until the current candle closes below MA8 and places a pending
      buy order 30 pips above ``last_bars_extremum(symbol, 5, 'buy')``.
    * SELL setup: MA21 > MA8 > max(open, close) and the anchor chart confirms
      'sell'. The function then waits until the current candle opens above
      MA8 and places a pending sell order 30 pips below
      ``last_bars_extremum(symbol, 5, 'sell')``.

    Any ``Exception`` is written to the error log with its traceback and ends
    the monitoring. ``KeyboardInterrupt`` and ``SystemExit`` are deliberately
    not caught so the process can still be stopped.

    Args:
        symbol: trading symbol passed to all helper calls.
    """
    try:
        precision = get_precision(symbol)
        pip = get_pip(symbol)
        # Zeros before the first '1' in the pip's fractional part, plus one
        # (e.g. 0.0001 -> 4 decimals).
        price_precision = len(
            '{0:.10f}'.format(pip).split('.')[1].split('1')[0]) + 1

        to_general_log(symbol, 'Start monitoring')

        while is_allowed(symbol):
            ma8 = get_ma_value(symbol, '5M', 8)
            ma21 = get_ma_value(symbol, '5M', 21)

            # Fetch the candle once so open and close come from the same
            # snapshot (index 1 = open, index 4 = close).
            current_candle = get_current_candle(symbol, '5M')
            current_candle_open = float(current_candle[1])
            current_candle_close = float(current_candle[4])

            # looking for BUY signal
            if ma21 < ma8 < min(current_candle_open,
                                current_candle_close) and check_anchor_chart(
                                    symbol, 'buy'):

                # waiting trigger bar for BUY
                # NOTE(review): this inner loop polls without sleeping and
                # without re-checking is_allowed(symbol) -- confirm intended.
                while True:
                    current_candle = get_current_candle(symbol, '5M')
                    current_candle_close = float(current_candle[4])
                    ma8 = get_ma_value(symbol, '5M', 8)

                    if current_candle_close < ma8:
                        entrance_point = round(
                            last_bars_extremum(symbol, 5, 'buy') + (30 * pip),
                            price_precision)

                        place_pending_order(symbol, 'buy', entrance_point,
                                            precision, price_precision)
                        break

            # looking for SELL signal
            if ma21 > ma8 > max(current_candle_open,
                                current_candle_close) and check_anchor_chart(
                                    symbol, 'sell'):

                # waiting trigger bar for SELL
                # NOTE(review): same un-throttled polling as the BUY branch.
                while True:
                    current_candle = get_current_candle(symbol, '5M')
                    current_candle_open = float(current_candle[1])
                    ma8 = get_ma_value(symbol, '5M', 8)

                    if current_candle_open > ma8:
                        entrance_point = round(
                            last_bars_extremum(symbol, 5, 'sell') - (30 * pip),
                            price_precision)

                        place_pending_order(symbol, 'sell', entrance_point,
                                            precision, price_precision)
                        break

            time.sleep(1)
    except Exception:
        # Top-level boundary: log the full traceback instead of crashing.
        to_err_log(symbol, traceback.format_exc())
Ejemplo n.º 7
0
def build_model(model_name: Text, inputs: tf.Tensor, **kwargs):
  """Build the detection model for `model_name` and run it on `inputs`.

  Args:
    model_name: the name of the model.
    inputs: an image tensor or a numpy array.
    **kwargs: extra parameters for the model builder. 'mixed_precision',
      'strategy' and 'use_keras_model' are also read here.

  Returns:
    (cls_outputs, box_outputs): the outputs for class and box predictions.
    Each is a dictionary with key as feature level and value as predictions.
    When 'mixed_precision' is truthy, every value is cast to float32.
  """
  use_mixed_precision = kwargs.get('mixed_precision', None)
  policy_name = utils.get_precision(
      kwargs.get('strategy', None), use_mixed_precision)

  if not kwargs.get('use_keras_model', None):
    model_arch = det_model_fn.get_model_arch(model_name)
  else:

    def model_arch(feats, model_name=None, **kwargs):
      """Construct a model arch for keras models."""
      config = hparams_config.get_efficientdet_config(model_name)
      config.override(kwargs)
      net = efficientdet_keras.EfficientDetNet(config=config)
      cls_out_list, box_out_list = net(feats, training=False)

      # One output per feature level is expected from the network.
      expected_levels = config.max_level - config.min_level + 1
      assert len(cls_out_list) == expected_levels
      assert len(box_out_list) == expected_levels

      # convert the list of model outputs to a dictionary with key=level.
      levels = range(config.min_level, config.max_level + 1)
      cls_by_level = {
          lvl: cls_out_list[lvl - config.min_level] for lvl in levels
      }
      box_by_level = {
          lvl: box_out_list[lvl - config.min_level] for lvl in levels
      }
      return cls_by_level, box_by_level

  cls_outputs, box_outputs = utils.build_model_with_precision(
      policy_name, model_arch, inputs, False, model_name, **kwargs)

  if not use_mixed_precision:
    return cls_outputs, box_outputs

  # Post-processing has multiple places with hard-coded float32.
  # TODO(tanmingxing): Remove them once post-process can adapt to dtypes.
  cls_float32 = {k: tf.cast(v, tf.float32) for k, v in cls_outputs.items()}
  box_float32 = {k: tf.cast(v, tf.float32) for k, v in box_outputs.items()}
  return cls_float32, box_float32
Ejemplo n.º 8
0
    def __init__(self,
                 model_name: Text,
                 ckpt_path: Text = None,
                 batch_size: int = 1,
                 min_score_thresh: float = None,
                 max_boxes_to_draw: float = None,
                 model_params: Dict[Text, Any] = None):
        """Set up the inference driver and the Keras mixed-precision policy.

        Args:
          model_name: target model name, such as efficientdet-d0.
          ckpt_path: checkpoint path, such as /tmp/efficientdet-d0/.
          batch_size: batch size for inference.
          min_score_thresh: minimal score threshold for filtering predictions.
          max_boxes_to_draw: the maximum number of boxes per image.
          model_params: optional overrides applied on top of the detection
            config for `model_name`.
        """
        super().__init__()

        # Plain constructor arguments.
        self.model_name = model_name
        self.ckpt_path = ckpt_path
        self.batch_size = batch_size
        self.min_score_thresh = min_score_thresh
        self.max_boxes_to_draw = max_boxes_to_draw
        self.model = None

        # Default config for the model, then user overrides; batch-norm
        # training mode is always forced off afterwards.
        params = hparams_config.get_detection_config(model_name).as_dict()
        if model_params:
            params.update(model_params)
        params.update(is_training_bn=False)
        self.params = params
        self.label_map = params.get('label_map')

        # Derive the precision name from the config and install it as the
        # Keras mixed-precision policy.
        precision = utils.get_precision(params.get('strategy'),
                                        params.get('mixed_precision'))
        tf.keras.mixed_precision.experimental.set_policy(
            tf.keras.mixed_precision.experimental.Policy(precision))
Ejemplo n.º 9
0
def launch(symbol):
    """Monitor `symbol` and place one pending order on a price-change signal.

    While ``is_allowed(symbol)`` holds, the loop polls once per second. As
    soon as ``get_price_change_percent_difference(symbol)`` exceeds 2, a
    pending order is placed and monitoring stops.

    Any ``Exception`` is written to the error log with its traceback.
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not caught so
    the process can still be stopped.

    Args:
        symbol: trading symbol passed to all helper calls.
    """
    try:
        precision = get_precision(symbol)
        pip = get_pip(symbol)
        # Zeros before the first '1' in the pip's fractional part, plus one
        # (e.g. 0.0001 -> 4 decimals).
        price_precision = len(
            '{0:.10f}'.format(pip).split('.')[1].split('1')[0]) + 1

        to_general_log(symbol, 'Start monitoring')

        while is_allowed(symbol):

            # looking for BUY signal
            if get_price_change_percent_difference(symbol) > 2:
                place_pending_order(symbol, 'buy', precision, price_precision)
                break

            # looking for SELL signal
            # NOTE(review): this condition is identical to the BUY condition,
            # so a sell is only placed when the first call returned <= 2 and
            # this second call returns > 2. Presumably a different threshold
            # (e.g. a negative change) was intended -- confirm with the owner.
            if get_price_change_percent_difference(symbol) > 2:
                place_pending_order(symbol, 'sell', precision, price_precision)
                break

            time.sleep(1)
    except Exception:
        # Top-level boundary: log the full traceback instead of crashing.
        to_err_log(symbol, traceback.format_exc())
Ejemplo n.º 10
0
def main(_):
    """Train and/or continuously evaluate the detection model from FLAGS.

    Steps, in order: build the config from FLAGS.model_name and
    FLAGS.hparams, configure XLA/debug options, pick a tf.distribute
    strategy from FLAGS.strategy, set the global mixed-precision policy,
    build the model under the strategy scope, then either call `model.fit`
    (when 'train' is in FLAGS.mode) or evaluate every new checkpoint in
    FLAGS.model_dir.

    Args:
      _: unused positional argument.

    Raises:
      ValueError: if FLAGS.batch_size is not divisible by the number of GPUs
        under the 'gpus' strategy, or if the required file pattern flag for a
        dataset is empty.
    """
    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs

    # Parse image size in case it is in string format.
    config.image_size = utils.parse_image_size(config.image_size)

    # XLA JIT and GPU memory growth are only configured off-TPU.
    if FLAGS.use_xla and FLAGS.strategy != 'tpu':
        tf.config.optimizer.set_jit(True)
        for gpu in tf.config.list_physical_devices('GPU'):
            tf.config.experimental.set_memory_growth(gpu, True)

    # Debug mode: log device placement, request deterministic ops, fix the
    # random seed and raise log verbosity.
    if FLAGS.debug:
        tf.debugging.set_log_device_placement(True)
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        tf.random.set_seed(FLAGS.tf_random_seed)
        logging.set_verbosity(logging.DEBUG)

    # Select the distribution strategy: TPU, multi-GPU mirrored, or a single
    # device (first GPU if one is visible, otherwise CPU).
    if FLAGS.strategy == 'tpu':
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tf.config.experimental_connect_to_cluster(tpu_cluster_resolver)
        tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)
        ds_strategy = tf.distribute.TPUStrategy(tpu_cluster_resolver)
        logging.info('All devices: %s', tf.config.list_logical_devices('TPU'))
    elif FLAGS.strategy == 'gpus':
        gpus = tf.config.list_physical_devices('GPU')
        # NOTE(review): if no GPU is visible, len(gpus) is 0 and the modulo
        # below raises ZeroDivisionError rather than the ValueError.
        if FLAGS.batch_size % len(gpus):
            raise ValueError(
                'Batch size divide gpus number must be interger, but got %f' %
                (FLAGS.batch_size / len(gpus)))
        if platform.system() == 'Windows':
            # Windows doesn't support nccl use HierarchicalCopyAllReduce instead
            # TODO(fsx950223): investigate HierarchicalCopyAllReduce performance issue
            cross_device_ops = tf.distribute.HierarchicalCopyAllReduce()
        else:
            # None lets MirroredStrategy pick its default cross-device ops.
            cross_device_ops = None
        ds_strategy = tf.distribute.MirroredStrategy(
            cross_device_ops=cross_device_ops)
        logging.info('All devices: %s', gpus)
    else:
        if tf.config.list_physical_devices('GPU'):
            ds_strategy = tf.distribute.OneDeviceStrategy('device:GPU:0')
        else:
            ds_strategy = tf.distribute.OneDeviceStrategy('device:CPU:0')

    # Fold the command-line flags into the config.
    steps_per_epoch = FLAGS.num_examples_per_epoch // FLAGS.batch_size
    params = dict(profile=FLAGS.profile,
                  model_name=FLAGS.model_name,
                  steps_per_execution=FLAGS.steps_per_execution,
                  model_dir=FLAGS.model_dir,
                  steps_per_epoch=steps_per_epoch,
                  strategy=FLAGS.strategy,
                  batch_size=FLAGS.batch_size,
                  tf_random_seed=FLAGS.tf_random_seed,
                  debug=FLAGS.debug,
                  val_json_file=FLAGS.val_json_file,
                  eval_samples=FLAGS.eval_samples,
                  num_shards=ds_strategy.num_replicas_in_sync)
    config.override(params, True)
    # set mixed precision policy by keras api.
    precision = utils.get_precision(config.strategy, config.mixed_precision)
    policy = tf.keras.mixed_precision.Policy(precision)
    tf.keras.mixed_precision.set_global_policy(policy)

    def get_dataset(is_training, config):
        """Build the train or validation dataset from the matching flag."""
        file_pattern = (FLAGS.train_file_pattern
                        if is_training else FLAGS.val_file_pattern)
        if not file_pattern:
            raise ValueError('No matching files.')

        return dataloader.InputReader(
            file_pattern,
            is_training=is_training,
            use_fake_data=FLAGS.use_fake_data,
            max_instances_per_image=config.max_instances_per_image,
            debug=FLAGS.debug)(config.as_dict())

    # Model creation, checkpoint restore and training/eval all happen inside
    # the strategy scope.
    with ds_strategy.scope():
        if config.model_optimizations:
            tfmot.set_config(config.model_optimizations.as_dict())
        if FLAGS.hub_module_url:
            model = train_lib.EfficientDetNetTrainHub(
                config=config, hub_module_url=FLAGS.hub_module_url)
        else:
            model = train_lib.EfficientDetNetTrain(config=config)
        model = setup_model(model, config)
        if FLAGS.debug:
            tf.config.run_functions_eagerly(True)
        # Warm-start from a pretrained checkpoint, skipping the 'class_net'
        # layers; not done when the model comes from a hub module.
        if FLAGS.pretrained_ckpt and not FLAGS.hub_module_url:
            ckpt_path = tf.train.latest_checkpoint(FLAGS.pretrained_ckpt)
            util_keras.restore_ckpt(model,
                                    ckpt_path,
                                    config.moving_average_decay,
                                    exclude_layers=['class_net'])
        init_experimental(config)
        if 'train' in FLAGS.mode:
            # Validation data is only attached when the mode also contains
            # 'eval'.
            val_dataset = get_dataset(False,
                                      config) if 'eval' in FLAGS.mode else None
            model.fit(
                get_dataset(True, config),
                epochs=config.num_epochs,
                steps_per_epoch=steps_per_epoch,
                callbacks=train_lib.get_callbacks(config.as_dict(),
                                                  val_dataset),
                validation_data=val_dataset,
                validation_steps=(FLAGS.eval_samples // FLAGS.batch_size))
        else:
            # Continuous eval.
            for ckpt in tf.train.checkpoints_iterator(FLAGS.model_dir,
                                                      min_interval_secs=180):
                logging.info('Starting to evaluate.')
                # Terminate eval job when final checkpoint is reached.
                # The epoch is parsed from the text after the first '-' in
                # the checkpoint basename; 0 is used when there is no '-'.
                try:
                    current_epoch = int(os.path.basename(ckpt).split('-')[1])
                except IndexError:
                    current_epoch = 0

                val_dataset = get_dataset(False, config)
                logging.info('start loading model.')
                # NOTE(review): this loads the latest checkpoint in model_dir,
                # which may be newer than `ckpt` yielded by the iterator.
                model.load_weights(tf.train.latest_checkpoint(FLAGS.model_dir))
                logging.info('finish loading model.')
                coco_eval = train_lib.COCOCallback(val_dataset, 1)
                coco_eval.set_model(model)
                eval_results = coco_eval.on_epoch_end(current_epoch)
                logging.info('eval results for %s: %s', ckpt, eval_results)

                try:
                    utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
                except tf.errors.NotFoundError:
                    # Checkpoint might already be deleted by the time eval finished.
                    logging.info('Checkpoint %s no longer exists, skipping.',
                                 ckpt)

                # Stop once the final epoch is reached, or when the epoch is
                # 0 (including the unparsable-name fallback above).
                if current_epoch >= config.num_epochs or not current_epoch:
                    logging.info('Eval epoch %d / %d', current_epoch,
                                 config.num_epochs)
                    break
Ejemplo n.º 11
0
    def _detection_loss(self, cls_outputs, box_outputs, labels, loss_vals):
        """Computes the total detection loss over all feature levels.

        Args:
          cls_outputs: per-level class outputs, indexed here by position
            (0 .. len(cls_outputs) - 1) rather than by absolute level.
          box_outputs: per-level box regression outputs, indexed the same
            way; each is reshaped to [-1, 4] when the IoU loss is enabled.
          labels: the dictionary returned from the dataloader. This method
            reads 'mean_num_positives', 'cls_targets_<level>' and
            'box_targets_<level>', where <level> is the position plus
            config.min_level.
          loss_vals: a dict updated in place with 'det_loss', 'cls_loss',
            'box_loss' and, when config.iou_loss_type is set,
            'box_iou_loss'.

        Returns:
          total_loss: cls_loss + box_loss_weight * box_loss +
            iou_loss_weight * box_iou_loss. The individual components are
            only reported through `loss_vals`.
        """
        # Sum all positives in a batch for normalization and avoid zero
        # num_positives_sum, which would lead to inf loss during training
        precision = utils.get_precision(self.config.strategy,
                                        self.config.mixed_precision)
        # Text after the last '_' of the precision name, e.g.
        # 'mixed_float16' -> 'float16'; a name without '_' is used unchanged.
        dtype = precision.split('_')[-1]
        num_positives_sum = tf.reduce_sum(labels['mean_num_positives']) + 1.0
        # positives_momentum > 0: moving-average normalizer;
        # positives_momentum < 0: cross-replica mean; 0/None: raw batch sum.
        positives_momentum = self.config.positives_momentum or 0
        if positives_momentum > 0:
            # normalize the num_positive_examples for training stability.
            # NOTE(review): a new tf.Variable is constructed each time this
            # branch executes -- confirm this is only traced once.
            moving_normalizer_var = tf.Variable(
                0.0,
                name='moving_normalizer',
                dtype=dtype,
                synchronization=tf.VariableSynchronization.ON_READ,
                trainable=False,
                aggregation=tf.VariableAggregation.MEAN)
            num_positives_sum = tf.keras.backend.moving_average_update(
                moving_normalizer_var,
                num_positives_sum,
                momentum=self.config.positives_momentum)
        elif positives_momentum < 0:
            num_positives_sum = utils.cross_replica_mean(num_positives_sum)
        num_positives_sum = tf.cast(num_positives_sum, dtype)
        # `level` below is a 0-based position; label keys use the absolute
        # level, i.e. level + config.min_level.
        levels = range(len(cls_outputs))
        cls_losses = []
        box_losses = []
        for level in levels:
            # Onehot encoding for classification labels.
            cls_targets_at_level = tf.one_hot(
                labels['cls_targets_%d' % (level + self.config.min_level)],
                self.config.num_classes,
                dtype=dtype)

            # Merge the last two axes of the one-hot targets (and the second
            # axis for channels_first) into a single -1 axis.
            if self.config.data_format == 'channels_first':
                bs, _, width, height, _ = cls_targets_at_level.get_shape(
                ).as_list()
                cls_targets_at_level = tf.reshape(cls_targets_at_level,
                                                  [bs, -1, width, height])
            else:
                bs, width, height, _, _ = cls_targets_at_level.get_shape(
                ).as_list()
                cls_targets_at_level = tf.reshape(cls_targets_at_level,
                                                  [bs, width, height, -1])

            # The class loss is skipped when no FocalLoss layer is registered.
            class_loss_layer = self.loss.get(FocalLoss.__name__, None)
            if class_loss_layer:
                cls_loss = class_loss_layer(
                    [num_positives_sum, cls_targets_at_level],
                    cls_outputs[level])
                # Restore a separate num_classes axis so the mask below can
                # broadcast over it.
                if self.config.data_format == 'channels_first':
                    cls_loss = tf.reshape(
                        cls_loss,
                        [bs, -1, width, height, self.config.num_classes])
                else:
                    cls_loss = tf.reshape(
                        cls_loss,
                        [bs, width, height, -1, self.config.num_classes])
                # Zero out the loss wherever the class target equals -2.
                cls_loss *= tf.cast(
                    tf.expand_dims(
                        tf.not_equal(
                            labels['cls_targets_%d' %
                                   (level + self.config.min_level)], -2), -1),
                    dtype)
                # The summed per-level class loss is clipped to [0.0, 2.0].
                cls_loss_sum = tf.clip_by_value(tf.reduce_sum(cls_loss), 0.0,
                                                2.0)
                cls_losses.append(tf.cast(cls_loss_sum, dtype))

            # The box loss needs a truthy weight and a registered BoxLoss.
            if self.config.box_loss_weight and self.loss.get(
                    BoxLoss.__name__, None):
                box_targets_at_level = (
                    labels['box_targets_%d' % (level + self.config.min_level)])
                box_loss_layer = self.loss[BoxLoss.__name__]
                box_losses.append(
                    box_loss_layer([num_positives_sum, box_targets_at_level],
                                   box_outputs[level]))

        if self.config.iou_loss_type:
            # Flatten outputs and targets of all levels into [-1, 4] rows and
            # compute one IoU loss over the concatenation.
            box_outputs = tf.concat(
                [tf.reshape(v, [-1, 4]) for v in box_outputs], axis=0)
            box_targets = tf.concat([
                tf.reshape(
                    labels['box_targets_%d' %
                           (level + self.config.min_level)], [-1, 4])
                for level in levels
            ],
                                    axis=0)
            box_iou_loss_layer = self.loss[BoxIouLoss.__name__]
            box_iou_loss = box_iou_loss_layer([num_positives_sum, box_targets],
                                              box_outputs)
            loss_vals['box_iou_loss'] = box_iou_loss
        else:
            box_iou_loss = 0

        # Empty loss lists contribute a plain 0 to the total.
        cls_loss = tf.add_n(cls_losses) if cls_losses else 0
        box_loss = tf.add_n(box_losses) if box_losses else 0
        total_loss = (cls_loss + self.config.box_loss_weight * box_loss +
                      self.config.iou_loss_weight * box_iou_loss)
        loss_vals['det_loss'] = total_loss
        loss_vals['cls_loss'] = cls_loss
        loss_vals['box_loss'] = box_loss
        return total_loss
Ejemplo n.º 12
0
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
    """Model definition entry.

    Builds the detection network, the detection and L2 losses, the train op
    (TRAIN mode only), the evaluation metrics (EVAL mode only) and an optional
    scaffold function for restoring variables, then packs everything into an
    estimator spec.

    Side effects on `params`: `params['is_training_bn']` is always overwritten,
    and in EVAL mode `params['nms_configs']['max_nms_inputs']` is set.

    Args:
      features: the input image tensor with shape [batch_size, height, width, 3].
        The height and width are fixed and equal.
      labels: the input labels in a dictionary. The labels include class targets
        and box targets which are dense label maps. The labels are generated from
        get_input_fn function in data/dataloader.py
      mode: the mode of TPUEstimator including TRAIN and EVAL.
      params: the dictionary defines hyperparameters of model. The default
        settings are in default_hparams function in this file.
      model: the model outputs class logits and box regression outputs. Only
        used when params['use_keras_model'] is falsy; otherwise an
        efficientdet_keras.EfficientDetNet is built instead.
      variable_filter_fn: the filter function that takes trainable_variables and
        returns the variable list after applying the filter rule.

    Returns:
      A tf.estimator.tpu.TPUEstimatorSpec when params['strategy'] == 'tpu',
      otherwise a tf.estimator.EstimatorSpec.

    Raises:
      RuntimeError: if both ckpt and backbone_ckpt are set (only checked in
        TRAIN mode).
      ValueError: if params['optimizer'] is neither 'sgd' nor 'adam' (TRAIN
        mode).
    """
    is_tpu = params['strategy'] == 'tpu'
    if params['img_summary_steps']:
        utils.image('input_image', features, is_tpu)
    training_hooks = []
    # NOTE: this writes into the caller's params dict.
    params['is_training_bn'] = (mode == tf.estimator.ModeKeys.TRAIN)

    if params['use_keras_model']:

        # The local `model` below shadows the `model` argument, which is
        # therefore ignored on this path.
        def model_fn(inputs):
            model = efficientdet_keras.EfficientDetNet(
                config=hparams_config.Config(params))
            cls_out_list, box_out_list = model(inputs,
                                               params['is_training_bn'])
            # Re-key the per-level output lists as dicts indexed by feature
            # level (min_level..max_level), matching what the code below
            # expects.
            cls_outputs, box_outputs = {}, {}
            for i in range(params['min_level'], params['max_level'] + 1):
                cls_outputs[i] = cls_out_list[i - params['min_level']]
                box_outputs[i] = box_out_list[i - params['min_level']]
            return cls_outputs, box_outputs
    else:
        model_fn = functools.partial(model,
                                     config=hparams_config.Config(params))

    precision = utils.get_precision(params['strategy'],
                                    params['mixed_precision'])
    cls_outputs, box_outputs = utils.build_model_with_precision(
        precision, model_fn, features, params['is_training_bn'])

    # Cast every level's outputs to float32 before computing losses
    # (presumably they can be lower precision under mixed precision -- TODO
    # confirm against utils.build_model_with_precision).
    levels = cls_outputs.keys()
    for level in levels:
        cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
        box_outputs[level] = tf.cast(box_outputs[level], tf.float32)

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)

    # cls_loss and box_loss are for logging. only total_loss is optimized.
    det_loss, cls_loss, box_loss = detection_loss(cls_outputs, box_outputs,
                                                  labels, params)
    reg_l2loss = reg_l2_loss(params['weight_decay'])
    total_loss = det_loss + reg_l2loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        utils.scalar('lrn_rate', learning_rate, is_tpu)
        utils.scalar('trainloss/cls_loss', cls_loss, is_tpu)
        utils.scalar('trainloss/box_loss', box_loss, is_tpu)
        utils.scalar('trainloss/det_loss', det_loss, is_tpu)
        utils.scalar('trainloss/reg_l2_loss', reg_l2loss, is_tpu)
        utils.scalar('trainloss/loss', total_loss, is_tpu)
        train_epochs = tf.cast(global_step,
                               tf.float32) / params['steps_per_epoch']
        utils.scalar('train_epochs', train_epochs, is_tpu)

    # `ema` and `ema_vars` only exist when moving_average_decay is truthy;
    # every later use is guarded by the same condition.
    moving_average_decay = params['moving_average_decay']
    if moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    if mode == tf.estimator.ModeKeys.TRAIN:
        if params['optimizer'].lower() == 'sgd':
            optimizer = tf.train.MomentumOptimizer(learning_rate,
                                                   momentum=params['momentum'])
        elif params['optimizer'].lower() == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate)
        else:
            raise ValueError('optimizers should be adam or sgd')

        if is_tpu:
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        var_list = tf.trainable_variables()
        if variable_filter_fn:
            var_list = variable_filter_fn(var_list)

        # Any truthy clip_gradients_norm enables clipping; its absolute value
        # is used as the clip threshold.
        if params.get('clip_gradients_norm', None):
            logging.info('clip gradients norm by %f',
                         params['clip_gradients_norm'])
            grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
            with tf.name_scope('clip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                # First clip each variable's norm, then clip global norm.
                clip_norm = abs(params['clip_gradients_norm'])
                clipped_grads = [
                    tf.clip_by_norm(g, clip_norm) if g is not None else None
                    for g in grads
                ]
                clipped_grads, _ = tf.clip_by_global_norm(
                    clipped_grads, clip_norm)
                # The logged norm is computed from the gradients after
                # clipping.
                utils.scalar('gradient_norm',
                             tf.linalg.global_norm(clipped_grads), is_tpu)
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(total_loss,
                                              global_step,
                                              var_list=var_list)

        # With EMA enabled the returned train_op is the EMA update, which is
        # made to depend on the optimizer step created above.
        if moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(**kwargs):
            """Returns a dictionary that has the evaluation metrics."""
            # The numpy (py_function) NMS path is the default when 'pyfunc'
            # is absent from nms_configs.
            if params['nms_configs'].get('pyfunc', True):
                detections_bs = []
                nms_configs = params['nms_configs']
                # Run NMS one image at a time; this needs a static batch
                # dimension on kwargs['boxes'].
                for index in range(kwargs['boxes'].shape[0]):
                    detections = tf.numpy_function(
                        functools.partial(nms_np.per_class_nms,
                                          nms_configs=nms_configs),
                        [
                            kwargs['boxes'][index],
                            kwargs['scores'][index],
                            kwargs['classes'][index],
                            tf.slice(kwargs['image_ids'], [index], [1]),
                            tf.slice(kwargs['image_scales'], [index], [1]),
                            params['num_classes'],
                            nms_configs['max_output_size'],
                        ], tf.float32)
                    detections_bs.append(detections)
                detections_bs = postprocess.transform_detections(
                    tf.stack(detections_bs))
            else:
                # These two branches should be equivalent, but currently they are not.
                # TODO(tanmingxing): enable the non_pyfun path after bug fix.
                nms_boxes, nms_scores, nms_classes, _ = postprocess.per_class_nms(
                    params, kwargs['boxes'], kwargs['scores'],
                    kwargs['classes'], kwargs['image_scales'])
                img_ids = tf.cast(tf.expand_dims(kwargs['image_ids'], -1),
                                  nms_scores.dtype)
                # Seven columns per detection: image id, box[1], box[0],
                # box[3] - box[1], box[2] - box[0], score, class.
                # Presumably [id, x, y, width, height, score, class] with
                # boxes stored as [ymin, xmin, ymax, xmax] -- TODO confirm
                # against postprocess.per_class_nms.
                detections_bs = [
                    img_ids * tf.ones_like(nms_scores),
                    nms_boxes[:, :, 1],
                    nms_boxes[:, :, 0],
                    nms_boxes[:, :, 3] - nms_boxes[:, :, 1],
                    nms_boxes[:, :, 2] - nms_boxes[:, :, 0],
                    nms_scores,
                    nms_classes,
                ]
                detections_bs = tf.stack(detections_bs,
                                         axis=-1,
                                         name='detnections')

            if params.get('testdev_dir', None):
                logging.info('Eval testdev_dir %s', params['testdev_dir'])
                eval_metric = coco_metric.EvaluationMetric(
                    testdev_dir=params['testdev_dir'])
                # A dummy zeros tensor is passed in place of groundtruth on
                # the test-dev path.
                coco_metrics = eval_metric.estimator_metric_fn(
                    detections_bs, tf.zeros([1]))
            else:
                logging.info('Eval val with groudtruths %s.',
                             params['val_json_file'])
                eval_metric = coco_metric.EvaluationMetric(
                    filename=params['val_json_file'],
                    label_map=params['label_map'])
                coco_metrics = eval_metric.estimator_metric_fn(
                    detections_bs, kwargs['groundtruth_data'])

            # Add metrics to output.
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])
            output_metrics = {
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            }
            output_metrics.update(coco_metrics)
            return output_metrics

        # Tile the scalar losses to shape [batch_size, 1] so they can be fed
        # to metric_fn alongside the per-example tensors (presumably because
        # TPU eval metric inputs must be batch-major -- TODO confirm).
        cls_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(cls_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        box_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(box_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])

        # From here on cls_outputs/box_outputs are rebound to the list form
        # returned by postprocess.to_list.
        cls_outputs = postprocess.to_list(cls_outputs)
        box_outputs = postprocess.to_list(box_outputs)
        # NOTE: mutates the nested nms_configs in the caller's params.
        params['nms_configs']['max_nms_inputs'] = anchors.MAX_DETECTION_POINTS
        boxes, scores, classes = postprocess.pre_nms(params, cls_outputs,
                                                     box_outputs)
        metric_fn_inputs = {
            'cls_loss_repeat': cls_loss_repeat,
            'box_loss_repeat': box_loss_repeat,
            'image_ids': labels['source_ids'],
            'groundtruth_data': labels['groundtruth_data'],
            'image_scales': labels['image_scales'],
            'boxes': boxes,
            'scores': scores,
            'classes': classes,
        }
        eval_metrics = (metric_fn, metric_fn_inputs)

    # 'ckpt' wins over 'backbone_ckpt' here, but having both set is rejected
    # below when training.
    checkpoint = params.get('ckpt') or params.get('backbone_ckpt')

    if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
        # Initialize the model from an EfficientDet or backbone checkpoint.
        if params.get('ckpt') and params.get('backbone_ckpt'):
            raise RuntimeError(
                '--backbone_ckpt and --checkpoint are mutually exclusive')

        if params.get('backbone_ckpt'):
            var_scope = params['backbone_name'] + '/'
            if params['ckpt_var_scope'] is None:
                # Use backbone name as default checkpoint scope.
                ckpt_scope = params['backbone_name'] + '/'
            else:
                ckpt_scope = params['ckpt_var_scope'] + '/'
        else:
            # Load every var in the given checkpoint
            var_scope = ckpt_scope = '/'

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            logging.info('restore variables from %s', checkpoint)

            var_map = utils.get_ckpt_var_map(
                ckpt_path=checkpoint,
                ckpt_scope=ckpt_scope,
                var_scope=var_scope,
                skip_mismatch=params['skip_mismatch'])

            tf.train.init_from_checkpoint(checkpoint, var_map)
            return tf.train.Scaffold()
    elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

        def scaffold_fn():
            """Load moving average variables for eval."""
            logging.info('Load EMA vars with ema_decay=%f',
                         moving_average_decay)
            restore_vars_dict = ema.variables_to_restore(ema_vars)
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)
    else:
        scaffold_fn = None

    if is_tpu:
        # The TPU spec takes the (metric_fn, inputs) tuple and the scaffold
        # function uncalled.
        return tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            eval_metrics=eval_metrics,
            host_call=utils.get_tpu_host_call(global_step, params),
            scaffold_fn=scaffold_fn,
            training_hooks=training_hooks)
    else:
        # Profile every 1K steps.
        if params.get('profile', False):
            profile_hook = tf.estimator.ProfilerHook(
                save_steps=1000,
                output_dir=params['model_dir'],
                show_memory=True)
            training_hooks.append(profile_hook)

            # Report memory allocation if OOM; it will slow down the running.
            class OomReportingHook(tf.estimator.SessionRunHook):
                def before_run(self, run_context):
                    return tf.estimator.SessionRunArgs(
                        fetches=[],
                        options=tf.RunOptions(
                            report_tensor_allocations_upon_oom=True))

            training_hooks.append(OomReportingHook())

        # The loss logging hook is added whether or not profiling is on.
        logging_hook = tf.estimator.LoggingTensorHook(
            {
                'step': global_step,
                'det_loss': det_loss,
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            },
            every_n_iter=params.get('iterations_per_loop', 100),
        )
        training_hooks.append(logging_hook)

        # The plain EstimatorSpec needs the metric ops and the Scaffold
        # object themselves, so metric_fn and scaffold_fn are called here.
        eval_metric_ops = (eval_metrics[0](
            **eval_metrics[1]) if eval_metrics else None)
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            eval_metric_ops=eval_metric_ops,
            scaffold=scaffold_fn() if scaffold_fn else None,
            training_hooks=training_hooks)
Ejemplo n.º 13
0
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
    """Model definition entry.

    Builds the detection network and, depending on `mode`, either returns the
    raw per-level predictions (PREDICT) or sets up the losses, the train op
    (TRAIN), the evaluation metrics (EVAL) and an optional scaffold function
    for restoring variables.

    Args:
      features: the input image tensor with shape [batch_size, height, width, 3].
        The height and width are fixed and equal. Transposed with permutation
        [0, 3, 1, 2] when params['data_format'] is 'channels_first'.
      labels: the input labels in a dictionary. The labels include class targets
        and box targets which are dense label maps. The labels are generated from
        get_input_fn function in data/dataloader.py
      mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
      params: the dictionary defines hyperparameters of model. The default
        settings are in default_hparams function in this file.
      model: the model outputs class logits and box regression outputs.
      variable_filter_fn: the filter function that takes trainable_variables and
        returns the variable list after applying the filter rule.

    Returns:
      In PREDICT mode, a tf.estimator.EstimatorSpec whose predictions hold the
      input image and the per-level class/box outputs. Otherwise a
      tf.estimator.tpu.TPUEstimatorSpec, regardless of params['strategy'].

    Raises:
      RuntimeError: if both ckpt and backbone_ckpt are set (only checked in
        TRAIN mode).
      ValueError: if params['optimizer'] is neither 'sgd' nor 'adam' (TRAIN
        mode).
    """
    utils.image('input_image', features)
    training_hooks = []
    if params['data_format'] == 'channels_first':
        features = tf.transpose(features, [0, 3, 1, 2])

    def _model_outputs(inputs):
        # Convert params (dict) to Config for easier access.
        return model(inputs, config=hparams_config.Config(params))

    precision = utils.get_precision(params['strategy'],
                                    params['mixed_precision'])
    # params['is_training_bn'] is read here, not set; the caller must
    # provide it.
    cls_outputs, box_outputs = utils.build_model_with_precision(
        precision, _model_outputs, features, params['is_training_bn'])

    # Cast every level's outputs to float32 before they are used further.
    levels = cls_outputs.keys()
    for level in levels:
        cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
        box_outputs[level] = tf.cast(box_outputs[level], tf.float32)

    # First check if it is in PREDICT mode.
    # This returns before `labels` is touched.
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'image': features,
        }
        for level in levels:
            predictions['cls_outputs_%d' % level] = cls_outputs[level]
            predictions['box_outputs_%d' % level] = box_outputs[level]
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)

    # cls_loss and box_loss are for logging. only total_loss is optimized.
    det_loss, cls_loss, box_loss, box_iou_loss = detection_loss(
        cls_outputs, box_outputs, labels, params)
    reg_l2loss = reg_l2_loss(params['weight_decay'])
    total_loss = det_loss + reg_l2loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        utils.scalar('lrn_rate', learning_rate)
        utils.scalar('trainloss/cls_loss', cls_loss)
        utils.scalar('trainloss/box_loss', box_loss)
        utils.scalar('trainloss/det_loss', det_loss)
        utils.scalar('trainloss/reg_l2_loss', reg_l2loss)
        utils.scalar('trainloss/loss', total_loss)
        if params['iou_loss_type']:
            utils.scalar('trainloss/box_iou_loss', box_iou_loss)

    # `ema` and `ema_vars` only exist when moving_average_decay is truthy;
    # every later use is guarded by the same condition.
    moving_average_decay = params['moving_average_decay']
    if moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()
    # `hvd` is only bound when the strategy is 'horovod'; its later use is
    # guarded by the same check.
    # NOTE(review): the 'lrn_rate' scalar above is recorded before this
    # scaling, so it shows the unscaled rate -- confirm this is intended.
    if params['strategy'] == 'horovod':
        import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
        learning_rate = learning_rate * hvd.size()
    if mode == tf.estimator.ModeKeys.TRAIN:
        if params['optimizer'].lower() == 'sgd':
            optimizer = tf.train.MomentumOptimizer(learning_rate,
                                                   momentum=params['momentum'])
        elif params['optimizer'].lower() == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate)
        else:
            raise ValueError('optimizers should be adam or sgd')

        if params['strategy'] == 'tpu':
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)
        elif params['strategy'] == 'horovod':
            optimizer = hvd.DistributedOptimizer(optimizer)
            # Rebinds training_hooks; the list is still empty at this point.
            training_hooks = [hvd.BroadcastGlobalVariablesHook(0)]

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        var_list = tf.trainable_variables()
        if variable_filter_fn:
            var_list = variable_filter_fn(var_list)

        # Gradient clipping is enabled only for a strictly positive norm.
        if params.get('clip_gradients_norm', 0) > 0:
            logging.info('clip gradients norm by %f',
                         params['clip_gradients_norm'])
            grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
            with tf.name_scope('clip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                clipped_grads, gnorm = tf.clip_by_global_norm(
                    grads, params['clip_gradients_norm'])
                utils.scalar('gnorm', gnorm)
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(total_loss,
                                              global_step,
                                              var_list=var_list)

        # With EMA enabled the returned train_op is the EMA update, which is
        # made to depend on the optimizer step created above.
        if moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(**kwargs):
            """Returns a dictionary that has the evaluation metrics."""
            # On TPU the batch size handed to the COCO metric is multiplied
            # by the number of shards.
            batch_size = params['batch_size']
            if params['strategy'] == 'tpu':
                batch_size = params['batch_size'] * params['num_shards']
            eval_anchors = anchors.Anchors(params['min_level'],
                                           params['max_level'],
                                           params['num_scales'],
                                           params['aspect_ratios'],
                                           params['anchor_scale'],
                                           params['image_size'])
            anchor_labeler = anchors.AnchorLabeler(eval_anchors,
                                                   params['num_classes'])
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])

            # The two branches differ only in whether testdev_dir is passed
            # on to coco_metric_fn.
            if params.get('testdev_dir', None):
                logging.info('Eval testdev_dir %s', params['testdev_dir'])
                coco_metrics = coco_metric_fn(
                    batch_size,
                    anchor_labeler,
                    params['val_json_file'],
                    testdev_dir=params['testdev_dir'],
                    nms_configs=params['nms_configs'],
                    **kwargs)
            else:
                logging.info('Eval val with groudtruths %s.',
                             params['val_json_file'])
                coco_metrics = coco_metric_fn(
                    batch_size,
                    anchor_labeler,
                    params['val_json_file'],
                    nms_configs=params['nms_configs'],
                    **kwargs)

            # Add metrics to output.
            output_metrics = {
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            }
            output_metrics.update(coco_metrics)
            return output_metrics

        # Tile the scalar losses to shape [batch_size, 1] so they can be fed
        # to metric_fn alongside the per-example tensors (presumably because
        # TPU eval metric inputs must be batch-major -- TODO confirm).
        cls_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(cls_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        box_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(box_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        metric_fn_inputs = {
            'cls_loss_repeat': cls_loss_repeat,
            'box_loss_repeat': box_loss_repeat,
            'source_ids': labels['source_ids'],
            'groundtruth_data': labels['groundtruth_data'],
            'image_scales': labels['image_scales'],
        }
        # Presumably adds the model outputs to metric_fn_inputs in place (its
        # return value is not used) -- TODO confirm against
        # add_metric_fn_inputs.
        add_metric_fn_inputs(params, cls_outputs, box_outputs,
                             metric_fn_inputs)
        eval_metrics = (metric_fn, metric_fn_inputs)

    # 'ckpt' wins over 'backbone_ckpt' here, but having both set is rejected
    # below when training.
    checkpoint = params.get('ckpt') or params.get('backbone_ckpt')

    if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
        # Initialize the model from an EfficientDet or backbone checkpoint.
        if params.get('ckpt') and params.get('backbone_ckpt'):
            raise RuntimeError(
                '--backbone_ckpt and --checkpoint are mutually exclusive')

        if params.get('backbone_ckpt'):
            var_scope = params['backbone_name'] + '/'
            if params['ckpt_var_scope'] is None:
                # Use backbone name as default checkpoint scope.
                ckpt_scope = params['backbone_name'] + '/'
            else:
                ckpt_scope = params['ckpt_var_scope'] + '/'
        else:
            # Load every var in the given checkpoint
            var_scope = ckpt_scope = '/'

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            logging.info('restore variables from %s', checkpoint)

            var_map = utils.get_ckpt_var_map(ckpt_path=checkpoint,
                                             ckpt_scope=ckpt_scope,
                                             var_scope=var_scope,
                                             var_exclude_expr=params.get(
                                                 'var_exclude_expr', None))

            tf.train.init_from_checkpoint(checkpoint, var_map)

            return tf.train.Scaffold()
    elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

        def scaffold_fn():
            """Load moving average variables for eval."""
            logging.info('Load EMA vars with ema_decay=%f',
                         moving_average_decay)
            restore_vars_dict = ema.variables_to_restore(ema_vars)
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)
    else:
        scaffold_fn = None

    # For every non-TPU strategy the profiler and OOM hooks are added
    # unconditionally, in both TRAIN and EVAL mode.
    if params['strategy'] != 'tpu':
        # Profile every 1K steps.
        profile_hook = tf.train.ProfilerHook(save_steps=1000,
                                             output_dir=params['model_dir'])
        training_hooks.append(profile_hook)

        # Report memory allocation if OOM
        class OomReportingHook(tf.estimator.SessionRunHook):
            def before_run(self, run_context):
                return tf.estimator.SessionRunArgs(
                    fetches=[],
                    options=tf.RunOptions(
                        report_tensor_allocations_upon_oom=True))

        training_hooks.append(OomReportingHook())

    # A TPUEstimatorSpec is returned for every strategy, not just 'tpu'.
    return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
                                             loss=total_loss,
                                             train_op=train_op,
                                             eval_metrics=eval_metrics,
                                             host_call=utils.get_tpu_host_call(
                                                 global_step, params),
                                             scaffold_fn=scaffold_fn,
                                             training_hooks=training_hooks)
Ejemplo n.º 14
0
def main(_):
    """Train a detection model with Keras `fit`, driven by command-line flags.

    Steps, in order: resolve the model config and apply flag overrides, apply
    the XLA and debug runtime settings, pick a distribution strategy, set the
    mixed-precision policy, then build the model inside the strategy scope,
    optionally restore pretrained weights, train with validation, and finally
    save the weights as 'ckpt-final' under FLAGS.model_dir.

    Args:
      _: unused positional argument.

    Raises:
      ValueError: if the file pattern flag for the requested dataset split is
        empty.
    """
    # Model defaults first, then --hparams, then the legacy --num_epochs flag.
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs

    # The image size may be given as a string; parse it.
    config.image_size = utils.parse_image_size(config.image_size)

    if FLAGS.use_xla and FLAGS.strategy != 'tpu':
        tf.config.optimizer.set_jit(True)
        for gpu_device in tf.config.list_physical_devices('GPU'):
            tf.config.experimental.set_memory_growth(gpu_device, True)

    if FLAGS.debug:
        tf.config.experimental_run_functions_eagerly(True)
        tf.debugging.set_log_device_placement(True)
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        tf.random.set_seed(FLAGS.tf_random_seed)
        logging.set_verbosity(logging.DEBUG)

    # Choose the distribution strategy requested by --strategy.
    if FLAGS.strategy == 'tpu':
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.TPUStrategy(resolver)
        logging.info('All devices: %s', tf.config.list_logical_devices('TPU'))
    elif FLAGS.strategy == 'gpus':
        strategy = tf.distribute.MirroredStrategy()
        logging.info('All devices: %s', tf.config.list_physical_devices('GPU'))
    else:
        # Single device: the first GPU if one is present, otherwise the CPU.
        device_name = ('device:GPU:0'
                       if tf.config.list_physical_devices('GPU') else
                       'device:CPU:0')
        strategy = tf.distribute.OneDeviceStrategy(device_name)

    steps_per_epoch = FLAGS.num_examples_per_epoch // FLAGS.batch_size
    flag_params = {
        'profile': FLAGS.profile,
        'model_name': FLAGS.model_name,
        'iterations_per_loop': FLAGS.iterations_per_loop,
        'model_dir': FLAGS.model_dir,
        'steps_per_epoch': steps_per_epoch,
        'strategy': FLAGS.strategy,
        'batch_size': FLAGS.batch_size,
        'tf_random_seed': FLAGS.tf_random_seed,
        'debug': FLAGS.debug,
        'val_json_file': FLAGS.val_json_file,
        'eval_samples': FLAGS.eval_samples,
        'num_shards': strategy.num_replicas_in_sync,
    }
    config.override(flag_params, True)

    # Set the mixed precision policy through the Keras API.
    precision = utils.get_precision(config.strategy, config.mixed_precision)
    tf.keras.mixed_precision.experimental.set_policy(
        tf.keras.mixed_precision.experimental.Policy(precision))

    def build_dataset(is_training, cfg):
        """Return the training or validation dataset built from `cfg`."""
        if is_training:
            pattern = FLAGS.training_file_pattern
        else:
            pattern = FLAGS.val_file_pattern
        if not pattern:
            raise ValueError('No matching files.')

        reader = dataloader.InputReader(
            pattern,
            is_training=is_training,
            use_fake_data=FLAGS.use_fake_data,
            max_instances_per_image=cfg.max_instances_per_image,
            debug=FLAGS.debug)
        return reader(cfg.as_dict())

    with strategy.scope():
        if config.model_optimizations:
            tfmot.set_config(config.model_optimizations.as_dict())
        model = setup_model(config)
        if FLAGS.pretrained_ckpt:
            latest_ckpt = tf.train.latest_checkpoint(FLAGS.pretrained_ckpt)
            util_keras.restore_ckpt(model, latest_ckpt)
        init_experimental(config)
        val_dataset = build_dataset(False, config).repeat()
        train_dataset = build_dataset(True, config)
        # Entries are evaluated top to bottom, matching keyword-argument order.
        fit_kwargs = {
            'epochs': config.num_epochs,
            'steps_per_epoch': steps_per_epoch,
            'callbacks': train_lib.get_callbacks(config.as_dict(),
                                                 val_dataset),
            'validation_data': val_dataset,
            'validation_steps': FLAGS.eval_samples // FLAGS.batch_size,
        }
        model.fit(train_dataset, **fit_kwargs)

    final_weights_path = os.path.join(FLAGS.model_dir, 'ckpt-final')
    model.save_weights(final_weights_path)
Ejemplo n.º 15
0
def main(_):
    """Train an EfficientDet Keras model as configured by command-line flags.

    Steps, in order: resolve hparams, apply XLA/debug settings, pick a
    distribution strategy, validate the data flags, build and compile the
    model inside the strategy scope, load the latest checkpoint found in
    ``FLAGS.model_dir`` (if any), run ``fit`` and save the resulting weights.

    Args:
      _: unused positional argument.

    Raises:
      RuntimeError: if ``FLAGS.mode`` requires a training or validation file
        pattern and the corresponding flag is ``None``.
    """
    # Model defaults first, then user overrides from --hparams.
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs

    # Parse image size in case it is in string format.
    config.image_size = utils.parse_image_size(config.image_size)

    xla_requested = FLAGS.use_xla and FLAGS.strategy != 'tpu'
    if xla_requested:
        tf.config.optimizer.set_jit(True)
        for gpu_device in tf.config.list_physical_devices('GPU'):
            tf.config.experimental.set_memory_growth(gpu_device, True)

    if FLAGS.debug:
        tf.config.experimental_run_functions_eagerly(True)
        tf.debugging.set_log_device_placement(True)
        tf.random.set_seed(111111)
        logging.set_verbosity(logging.DEBUG)

    # Choose the distribution strategy: TPU, all GPUs, or a single device.
    if FLAGS.strategy == 'tpu':
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        ds_strategy = tf.distribute.TPUStrategy(resolver)
        logging.info('All devices: %s', tf.config.list_logical_devices('TPU'))
    elif FLAGS.strategy == 'gpus':
        ds_strategy = tf.distribute.MirroredStrategy()
        logging.info('All devices: %s', tf.config.list_physical_devices('GPU'))
    else:
        # Single device: the first GPU when one is present, otherwise the CPU.
        single_device = ('device:GPU:0'
                         if tf.config.list_physical_devices('GPU') else
                         'device:CPU:0')
        ds_strategy = tf.distribute.OneDeviceStrategy(single_device)

    # Check data path
    wants_training = FLAGS.mode in ('train', 'train_and_eval')
    if wants_training and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    wants_eval = FLAGS.mode in ('eval', 'train_and_eval')
    if wants_eval and FLAGS.validation_file_pattern is None:
        raise RuntimeError('You must specify --validation_file_pattern '
                           'for evaluation.')

    # Flat parameter dict: hparams plus run-specific values taken from flags.
    # The batch size stored here is the global flag value divided by the
    # number of replicas in sync.
    params = dict(
        config.as_dict(),
        model_name=FLAGS.model_name,
        iterations_per_loop=FLAGS.iterations_per_loop,
        model_dir=FLAGS.model_dir,
        num_examples_per_epoch=FLAGS.num_examples_per_epoch,
        strategy=FLAGS.strategy,
        batch_size=FLAGS.batch_size // ds_strategy.num_replicas_in_sync,
        num_shards=ds_strategy.num_replicas_in_sync,
        val_json_file=FLAGS.val_json_file,
        testdev_dir=FLAGS.testdev_dir,
        mode=FLAGS.mode)

    # set mixed precision policy by keras api.
    mp_policy = tf.keras.mixed_precision.experimental.Policy(
        utils.get_precision(params['strategy'], params['mixed_precision']))
    tf.keras.mixed_precision.experimental.set_policy(mp_policy)

    def get_dataset(is_training, params):
        """Build the training or validation dataset from the file flags."""
        if is_training:
            file_pattern = FLAGS.training_file_pattern
        else:
            file_pattern = FLAGS.validation_file_pattern
        reader = dataloader.InputReader(
            file_pattern,
            is_training=is_training,
            use_fake_data=FLAGS.use_fake_data,
            max_instances_per_image=config.max_instances_per_image)
        return reader(params)

    with ds_strategy.scope():
        model = train_lib.EfficientDetNetTrain(params['model_name'], config)
        height, width = utils.parse_image_size(params['image_size'])
        model.build((params['batch_size'], height, width, 3))

        optimizer = train_lib.get_optimizer(params)
        # Every loss is created with Reduction.NONE.
        no_reduction = tf.keras.losses.Reduction.NONE
        losses = {
            'box_loss':
            train_lib.BoxLoss(params['delta'], reduction=no_reduction),
            'box_iou_loss':
            train_lib.BoxIouLoss(params['iou_loss_type'],
                                 params['min_level'],
                                 params['max_level'],
                                 params['num_scales'],
                                 params['aspect_ratios'],
                                 params['anchor_scale'],
                                 params['image_size'],
                                 reduction=no_reduction),
            'class_loss':
            train_lib.FocalLoss(params['alpha'],
                                params['gamma'],
                                label_smoothing=params['label_smoothing'],
                                reduction=no_reduction),
        }
        model.compile(optimizer=optimizer, loss=losses)

    # Resume from the most recent checkpoint in the model directory, if any.
    latest_ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
    if latest_ckpt:
        model.load_weights(latest_ckpt)
    model.freeze_vars(params['var_freeze_expr'])

    train_data = get_dataset(True, params=params)
    fit_callbacks = train_lib.get_callbacks(params, FLAGS.profile)
    val_data = get_dataset(False, params=params)
    model.fit(train_data,
              steps_per_epoch=FLAGS.num_examples_per_epoch,
              callbacks=fit_callbacks,
              validation_data=val_data,
              validation_steps=FLAGS.eval_samples)

    final_weights_path = os.path.join(FLAGS.model_dir, 'model')
    model.save_weights(final_weights_path)
Ejemplo n.º 16
0
def main(_):
    """Train an X3D model using a YAML configuration file.

    Loads and freezes the config from ``FLAGS.config``, optionally sets up
    Weights & Biases and debug tooling, selects a distribution strategy and
    mixed-precision policy, then builds, compiles and fits the model. Training
    resumes from the latest checkpoint in ``FLAGS.model_dir`` when one exists;
    otherwise weights from ``FLAGS.pretrained_ckpt`` are loaded if given.

    Args:
      _: unused positional argument.

    Raises:
      ValueError: if ``FLAGS.config`` is unset or does not contain '.yaml'.
      NotImplementedError: if ``cfg.TRAIN.OPTIMIZER`` is neither 'sgd' nor
        'adam'.
    """
    # Validate explicitly: an `assert` disappears under `python -O` and
    # raises a confusing TypeError when the flag is left unset (None).
    if not FLAGS.config or '.yaml' not in FLAGS.config:
        raise ValueError('Please provide path to yaml file.')
    cfg = get_default_config()
    cfg.merge_from_file(FLAGS.config)
    cfg.freeze()

    model_dir = FLAGS.model_dir
    if not tf.io.gfile.exists(model_dir):
        tf.io.gfile.makedirs(model_dir)

    # init wandb
    if cfg.WANDB.ENABLE:
        wandb.tensorboard.patch(root_logdir=model_dir)
        wandb.init(job_type='train',
                   group=cfg.WANDB.GROUP_NAME,
                   project=cfg.WANDB.PROJECT_NAME,
                   sync_tensorboard=cfg.WANDB.TENSORBOARD,
                   mode=cfg.WANDB.MODE,
                   config=dict(cfg),
                   resume=True)

    if FLAGS.debug:
        tf.config.run_functions_eagerly(True)
        tf.debugging.set_log_device_placement(True)
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        tf.random.set_seed(1111)
        logging.set_verbosity(logging.DEBUG)
        tf.debugging.experimental.enable_dump_debug_info(
            model_dir,
            tensor_debug_mode="FULL_HEALTH",
            circular_buffer_size=-1)

    strategy = utils.get_strategy(FLAGS.num_gpus)

    # mixed precision
    precision = utils.get_precision(FLAGS.mixed_precision)
    policy = tf.keras.mixed_precision.Policy(precision)
    tf.keras.mixed_precision.set_global_policy(policy)

    def get_dataset(cfg, file_pattern, is_training):
        """Returns a tf.data.Dataset"""
        return dataloader.InputReader(
            cfg, is_training, FLAGS.use_tfrecord, FLAGS.mixed_precision)(
                file_pattern,
                cfg.TRAIN.BATCH_SIZE if is_training else cfg.TEST.BATCH_SIZE)

    def load_model(model, cfg):
        """Compile model with loss function, model optimizers and metrics."""
        opt_str = cfg.TRAIN.OPTIMIZER.lower()
        if opt_str == 'sgd':
            opt = tf.optimizers.SGD(learning_rate=cfg.TRAIN.WARMUP_LR,
                                    momentum=cfg.TRAIN.MOMENTUM,
                                    nesterov=True)
        elif opt_str == 'adam':
            opt = tf.optimizers.Adam(learning_rate=cfg.TRAIN.WARMUP_LR)
        else:
            raise NotImplementedError(f'{opt_str} not supported')

        if FLAGS.mixed_precision:
            opt = tf.keras.mixed_precision.LossScaleOptimizer(opt)

        model.compile(
            optimizer=opt,
            loss=tf.keras.losses.SparseCategoricalCrossentropy(),
            metrics=[
                tf.keras.metrics.SparseCategoricalAccuracy(name='acc'),
                tf.keras.metrics.SparseTopKCategoricalAccuracy(
                    k=5, name='top_5_acc')
            ])

        return model

    # learning rate schedule
    def lr_schedule(epoch, lr):
        """Return the learning rate for `epoch`.

        Implements the learning rate schedule used in
        https://arxiv.org/abs/2004.04730: a linear ramp from WARMUP_LR to
        BASE_LR over the first WARMUP_EPOCHS epochs, followed by a
        half-period cosine decay. `lr` (the current rate) is not used.
        """
        warmup_epochs = cfg.TRAIN.WARMUP_EPOCHS
        # With warmup disabled (WARMUP_EPOCHS <= 0) the linear branch would
        # divide by zero at epoch 0, so go straight to the cosine branch,
        # which evaluates to BASE_LR there.
        if epoch > warmup_epochs or warmup_epochs <= 0:
            new_lr = cfg.TRAIN.BASE_LR * (0.5 * (
                tf.math.cos(tf.constant(math.pi) *
                            (epoch / cfg.TRAIN.EPOCHS)) + 1))
        else:
            new_lr = cfg.TRAIN.WARMUP_LR + (
                epoch * (cfg.TRAIN.BASE_LR - cfg.TRAIN.WARMUP_LR) /
                warmup_epochs)
        return new_lr

    with strategy.scope():
        model = X3D(cfg)
        model = load_model(model, cfg)

        # resume training from latest checkpoint, if available
        current_epoch = 0
        ckpt_path = tf.train.latest_checkpoint(model_dir)
        if ckpt_path:
            # The epoch is read from the second '-'-separated field of the
            # checkpoint file name.
            current_epoch = int(os.path.basename(ckpt_path).split('-')[1])
            logging.info(
                f'Found checkpoint {ckpt_path} at epoch {current_epoch}')
            model.load_weights(ckpt_path)
        elif FLAGS.pretrained_ckpt:
            logging.info(
                f'Loading model from pretrained weights at {FLAGS.pretrained_ckpt}'
            )
            # Accept either a checkpoint directory or a checkpoint path.
            if tf.io.gfile.isdir(FLAGS.pretrained_ckpt):
                model.load_weights(
                    tf.train.latest_checkpoint(FLAGS.pretrained_ckpt))
            else:
                model.load_weights(FLAGS.pretrained_ckpt)

        model.fit(
            get_dataset(cfg, FLAGS.train_file_pattern, True),
            verbose=1,
            epochs=cfg.TRAIN.EPOCHS,
            initial_epoch=current_epoch,
            steps_per_epoch=cfg.TRAIN.DATASET_SIZE // cfg.TRAIN.BATCH_SIZE,
            validation_data=get_dataset(cfg, FLAGS.val_file_pattern, False)
            if FLAGS.val_file_pattern else None,
            callbacks=utils.get_callbacks(cfg, lr_schedule, FLAGS))
Ejemplo n.º 17
0
def main(_):
    """Train an EfficientDet Keras model (with segmentation loss) from flags.

    Steps, in order: resolve hparams, apply XLA/debug settings, pick a
    distribution strategy, assemble the parameter dict, set the
    mixed-precision policy, then compile, optionally restore, build and fit
    the model inside the strategy scope. The final weights are written to
    ``<FLAGS.model_dir>/ckpt-final``.

    Args:
      _: unused positional argument.

    Raises:
      ValueError: from the dataset helper when the relevant file-pattern flag
        is empty.
    """
    # Model defaults first, then user overrides from --hparams.
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs

    # Parse image size in case it is in string format.
    config.image_size = utils.parse_image_size(config.image_size)

    xla_requested = FLAGS.use_xla and FLAGS.strategy != 'tpu'
    if xla_requested:
        tf.config.optimizer.set_jit(True)
        for gpu_device in tf.config.list_physical_devices('GPU'):
            tf.config.experimental.set_memory_growth(gpu_device, True)

    if FLAGS.debug:
        tf.config.experimental_run_functions_eagerly(True)
        tf.debugging.set_log_device_placement(True)
        tf.random.set_seed(111111)
        logging.set_verbosity(logging.DEBUG)

    # Choose the distribution strategy: TPU, all GPUs, or a single device.
    if FLAGS.strategy == 'tpu':
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        ds_strategy = tf.distribute.TPUStrategy(resolver)
        logging.info('All devices: %s', tf.config.list_logical_devices('TPU'))
    elif FLAGS.strategy == 'gpus':
        ds_strategy = tf.distribute.MirroredStrategy()
        logging.info('All devices: %s', tf.config.list_physical_devices('GPU'))
    else:
        # Single device: the first GPU when one is present, otherwise the CPU.
        single_device = ('device:GPU:0'
                         if tf.config.list_physical_devices('GPU') else
                         'device:CPU:0')
        ds_strategy = tf.distribute.OneDeviceStrategy(single_device)

    steps_per_epoch = FLAGS.num_examples_per_epoch // FLAGS.batch_size

    # Flat parameter dict: hparams plus run-specific values taken from flags.
    params = dict(
        config.as_dict(),
        profile=FLAGS.profile,
        model_name=FLAGS.model_name,
        iterations_per_loop=FLAGS.iterations_per_loop,
        model_dir=FLAGS.model_dir,
        steps_per_epoch=steps_per_epoch,
        strategy=FLAGS.strategy,
        batch_size=FLAGS.batch_size,
        num_shards=ds_strategy.num_replicas_in_sync)

    # set mixed precision policy by keras api.
    mp_policy = tf.keras.mixed_precision.experimental.Policy(
        utils.get_precision(params['strategy'], params['mixed_precision']))
    tf.keras.mixed_precision.experimental.set_policy(mp_policy)

    def get_dataset(is_training, params):
        """Build the training or validation dataset from the file flags."""
        if is_training:
            file_pattern = FLAGS.training_file_pattern
        else:
            file_pattern = FLAGS.validation_file_pattern
        if not file_pattern:
            raise ValueError('No matching files.')

        reader = dataloader.InputReader(
            file_pattern,
            is_training=is_training,
            use_fake_data=FLAGS.use_fake_data,
            max_instances_per_image=config.max_instances_per_image)
        return reader(params)

    with ds_strategy.scope():
        model = train_lib.EfficientDetNetTrain(params['model_name'], config)

        optimizer = train_lib.get_optimizer(params)
        # Every loss is created with Reduction.NONE.
        no_reduction = tf.keras.losses.Reduction.NONE
        losses = {
            'box_loss':
            train_lib.BoxLoss(params['delta'], reduction=no_reduction),
            'box_iou_loss':
            train_lib.BoxIouLoss(params['iou_loss_type'],
                                 params['min_level'],
                                 params['max_level'],
                                 params['num_scales'],
                                 params['aspect_ratios'],
                                 params['anchor_scale'],
                                 params['image_size'],
                                 reduction=no_reduction),
            'class_loss':
            train_lib.FocalLoss(params['alpha'],
                                params['gamma'],
                                label_smoothing=params['label_smoothing'],
                                reduction=no_reduction),
            'seg_loss':
            tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True, reduction=no_reduction),
        }
        model.compile(optimizer=optimizer, loss=losses)

        if FLAGS.pretrained_ckpt:
            pretrained_path = tf.train.latest_checkpoint(
                FLAGS.pretrained_ckpt)
            util_keras.restore_ckpt(model, pretrained_path,
                                    params['moving_average_decay'])
        tf.io.gfile.makedirs(FLAGS.model_dir)
        if params['model_optimizations']:
            model_optimization.set_config(params['model_optimizations'])
        model.build((FLAGS.batch_size, *config.image_size, 3))

        train_data = get_dataset(True, params=params)
        fit_callbacks = train_lib.get_callbacks(params)
        # The validation set is repeated; validation_steps bounds each pass.
        val_data = get_dataset(False, params=params).repeat()
        model.fit(train_data,
                  epochs=params['num_epochs'],
                  steps_per_epoch=steps_per_epoch,
                  callbacks=fit_callbacks,
                  validation_data=val_data,
                  validation_steps=(FLAGS.eval_samples // FLAGS.batch_size))

    final_weights_path = os.path.join(FLAGS.model_dir, 'ckpt-final')
    model.save_weights(final_weights_path)
Ejemplo n.º 18
0
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
    """Model definition entry.

    Builds the detection network, its losses, the training op, the evaluation
    metrics and the checkpoint-restoring scaffold, and packs them into an
    estimator spec.

    Note that `params` is modified in place: `is_training_bn` is always set,
    and `nms_configs['max_nms_inputs']` is set in EVAL mode.

    Args:
      features: the input image tensor with shape [batch_size, height, width,
        3]. The height and width are fixed and equal.
      labels: the input labels in a dictionary. The labels include class
        targets and box targets which are dense label maps. The labels are
        generated from get_input_fn function in data/dataloader.py
      mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
      params: the dictionary defines hyperparameters of model. The default
        settings are in default_hparams function in this file.
      model: the model outputs class logits and box regression outputs. Only
        used when params['use_keras_model'] is falsy.
      variable_filter_fn: the filter function that takes trainable_variables
        and returns the variable list after applying the filter rule.

    Returns:
      A tf.estimator.EstimatorSpec in PREDICT mode or when
      params['strategy'] is not 'tpu'; otherwise a TPUEstimatorSpec.

    Raises:
      RuntimeError: if both ckpt and backbone_ckpt are set (checked in TRAIN
        mode).
      ValueError: if params['optimizer'] is neither 'sgd' nor 'adam' (checked
        in TRAIN mode).
    """
    utils.image('input_image', features)
    training_hooks = []
    params['is_training_bn'] = (mode == tf.estimator.ModeKeys.TRAIN)

    if params['use_keras_model']:

        def model_fn(inputs):
            model = efficientdet_keras.EfficientDetNet(
                config=hparams_config.Config(params))
            cls_out_list, box_out_list = model(inputs,
                                               params['is_training_bn'])
            # Re-key the per-level output lists by feature level.
            cls_outputs, box_outputs = {}, {}
            for i in range(params['min_level'], params['max_level'] + 1):
                cls_outputs[i] = cls_out_list[i - params['min_level']]
                box_outputs[i] = box_out_list[i - params['min_level']]
            return cls_outputs, box_outputs
    else:
        model_fn = functools.partial(model,
                                     config=hparams_config.Config(params))

    precision = utils.get_precision(params['strategy'],
                                    params['mixed_precision'])
    cls_outputs, box_outputs = utils.build_model_with_precision(
        precision, model_fn, features, params['is_training_bn'])

    # Everything downstream (losses, post-processing) works on float32.
    levels = cls_outputs.keys()
    for level in levels:
        cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
        box_outputs[level] = tf.cast(box_outputs[level], tf.float32)

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'image': features,
        }
        for level in levels:
            predictions['cls_outputs_%d' % level] = cls_outputs[level]
            predictions['box_outputs_%d' % level] = box_outputs[level]
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)

    # cls_loss and box_loss are for logging. only total_loss is optimized.
    det_loss, cls_loss, box_loss, box_iou_loss = detection_loss(
        cls_outputs, box_outputs, labels, params)
    reg_l2loss = reg_l2_loss(params['weight_decay'])
    total_loss = det_loss + reg_l2loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        utils.scalar('lrn_rate', learning_rate)
        utils.scalar('trainloss/cls_loss', cls_loss)
        utils.scalar('trainloss/box_loss', box_loss)
        utils.scalar('trainloss/det_loss', det_loss)
        utils.scalar('trainloss/reg_l2_loss', reg_l2loss)
        utils.scalar('trainloss/loss', total_loss)
        if params['iou_loss_type']:
            utils.scalar('trainloss/box_iou_loss', box_iou_loss)
        train_epochs = tf.cast(global_step,
                               tf.float32) / params['steps_per_epoch']
        utils.scalar('train_epochs', train_epochs)

    # `ema` and `ema_vars` only exist when a decay is configured; every later
    # use is guarded by the same `moving_average_decay` check.
    moving_average_decay = params['moving_average_decay']
    if moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    if mode == tf.estimator.ModeKeys.TRAIN:
        if params['optimizer'].lower() == 'sgd':
            optimizer = tf.train.MomentumOptimizer(learning_rate,
                                                   momentum=params['momentum'])
        elif params['optimizer'].lower() == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate)
        else:
            raise ValueError('optimizers should be adam or sgd')

        if params['strategy'] == 'tpu':
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)
        if params['gradient_checkpointing']:
            from third_party.grad_checkpoint \
                import memory_saving_gradients  # pylint: disable=g-import-not-at-top
            from tensorflow.python.ops \
                import gradients  # pylint: disable=g-import-not-at-top

            # monkey patch tf.gradients to point to our custom version,
            # with automatic checkpoint selection
            def gradients_(ys, xs, grad_ys=None, **kwargs):
                return memory_saving_gradients.gradients(
                    ys,
                    xs,
                    grad_ys,
                    checkpoints=params['gradient_checkpointing_list'],
                    **kwargs)

            gradients.__dict__["gradients"] = gradients_

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        var_list = tf.trainable_variables()
        if variable_filter_fn:
            var_list = variable_filter_fn(var_list)

        if params.get('clip_gradients_norm', None):
            logging.info('clip gradients norm by %f',
                         params['clip_gradients_norm'])
            grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
            with tf.name_scope('clip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                # First clip each variable's norm, then clip global norm.
                clip_norm = abs(params['clip_gradients_norm'])
                clipped_grads = [tf.clip_by_norm(g, clip_norm) for g in grads]
                clipped_grads, _ = tf.clip_by_global_norm(
                    clipped_grads, clip_norm)
                utils.scalar('gradient_norm',
                             tf.linalg.global_norm(clipped_grads))
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(total_loss,
                                              global_step,
                                              var_list=var_list)

        if moving_average_decay:
            # The EMA update becomes the train op and runs after the
            # optimizer step.
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(**kwargs):
            """Returns a dictionary that has the evaluation metrics."""
            if params['nms_configs'].get('pyfunc', True):
                detections_bs = []
                for index in range(kwargs['boxes'].shape[0]):
                    nms_configs = params['nms_configs']
                    detections = tf.numpy_function(
                        functools.partial(nms_np.per_class_nms,
                                          nms_configs=nms_configs),
                        [
                            kwargs['boxes'][index],
                            kwargs['scores'][index],
                            kwargs['classes'][index],
                            tf.slice(kwargs['image_ids'], [index], [1]),
                            tf.slice(kwargs['image_scales'], [index], [1]),
                            params['num_classes'],
                            nms_configs['max_output_size'],
                        ], tf.float32)
                    detections_bs.append(detections)
                detections_bs = postprocess.transform_detections(
                    tf.stack(detections_bs))
            else:
                # These two branches should be equivalent, but currently they are not.
                # TODO(tanmingxing): enable the non_pyfun path after bug fix.
                nms_boxes, nms_scores, nms_classes, _ = postprocess.per_class_nms(
                    params, kwargs['boxes'], kwargs['scores'],
                    kwargs['classes'], kwargs['image_scales'])
                img_ids = tf.cast(tf.expand_dims(kwargs['image_ids'], -1),
                                  nms_scores.dtype)
                detections_bs = [
                    img_ids * tf.ones_like(nms_scores),
                    nms_boxes[:, :, 1],
                    nms_boxes[:, :, 0],
                    nms_boxes[:, :, 3] - nms_boxes[:, :, 1],
                    nms_boxes[:, :, 2] - nms_boxes[:, :, 0],
                    nms_scores,
                    nms_classes,
                ]
                detections_bs = tf.stack(detections_bs,
                                         axis=-1,
                                         name='detnections')

            if params.get('testdev_dir', None):
                logging.info('Eval testdev_dir %s', params['testdev_dir'])
                eval_metric = coco_metric.EvaluationMetric(
                    testdev_dir=params['testdev_dir'])
                coco_metrics = eval_metric.estimator_metric_fn(
                    detections_bs, tf.zeros([1]))
            else:
                logging.info('Eval val with groudtruths %s.',
                             params['val_json_file'])
                eval_metric = coco_metric.EvaluationMetric(
                    filename=params['val_json_file'])
                coco_metrics = eval_metric.estimator_metric_fn(
                    detections_bs, kwargs['groundtruth_data'],
                    params['label_map'])

            # Add metrics to output.
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])
            output_metrics = {
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            }
            output_metrics.update(coco_metrics)
            return output_metrics

        # Tile the scalar losses into [batch_size, 1] tensors so they can be
        # passed to metric_fn alongside the other per-example inputs.
        cls_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(cls_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        box_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(box_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])

        cls_outputs = postprocess.to_list(cls_outputs)
        box_outputs = postprocess.to_list(box_outputs)
        params['nms_configs']['max_nms_inputs'] = anchors.MAX_DETECTION_POINTS
        boxes, scores, classes = postprocess.pre_nms(params, cls_outputs,
                                                     box_outputs)
        metric_fn_inputs = {
            'cls_loss_repeat': cls_loss_repeat,
            'box_loss_repeat': box_loss_repeat,
            'image_ids': labels['source_ids'],
            'groundtruth_data': labels['groundtruth_data'],
            'image_scales': labels['image_scales'],
            'boxes': boxes,
            'scores': scores,
            'classes': classes,
        }
        eval_metrics = (metric_fn, metric_fn_inputs)

    checkpoint = params.get('ckpt') or params.get('backbone_ckpt')

    if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
        # Initialize the model from an EfficientDet or backbone checkpoint.
        if params.get('ckpt') and params.get('backbone_ckpt'):
            raise RuntimeError(
                '--backbone_ckpt and --checkpoint are mutually exclusive')

        if params.get('backbone_ckpt'):
            var_scope = params['backbone_name'] + '/'
            if params['ckpt_var_scope'] is None:
                # Use backbone name as default checkpoint scope.
                ckpt_scope = params['backbone_name'] + '/'
            else:
                ckpt_scope = params['ckpt_var_scope'] + '/'
        else:
            # Load every var in the given checkpoint
            var_scope = ckpt_scope = '/'

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            logging.info('restore variables from %s', checkpoint)

            var_map = utils.get_ckpt_var_map(
                ckpt_path=checkpoint,
                ckpt_scope=ckpt_scope,
                var_scope=var_scope,
                skip_mismatch=params['skip_mismatch'])

            tf.train.init_from_checkpoint(checkpoint, var_map)
            return tf.train.Scaffold()
    elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

        def scaffold_fn():
            """Load moving average variables for eval."""
            logging.info('Load EMA vars with ema_decay=%f',
                         moving_average_decay)
            restore_vars_dict = ema.variables_to_restore(ema_vars)
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)
    else:
        # Nothing to restore: no scaffold is needed.
        scaffold_fn = None

    if params['strategy'] != 'tpu':
        # Profile every 1K steps.
        if params.get('profile', False):
            profile_hook = tf.estimator.ProfilerHook(
                save_steps=1000,
                output_dir=params['model_dir'],
                show_memory=True)
            training_hooks.append(profile_hook)

            # Report memory allocation if OOM
            class OomReportingHook(tf.estimator.SessionRunHook):
                def before_run(self, run_context):
                    return tf.estimator.SessionRunArgs(
                        fetches=[],
                        options=tf.RunOptions(
                            report_tensor_allocations_upon_oom=True))

            training_hooks.append(OomReportingHook())

        logging_hook = tf.estimator.LoggingTensorHook(
            {
                'step': global_step,
                'det_loss': det_loss,
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            },
            every_n_iter=params.get('iterations_per_loop', 100),
        )
        training_hooks.append(logging_hook)

        if params["nvgpu_logging"]:
            try:
                from third_party import nvgpu  # pylint: disable=g-import-not-at-top
                from functools import reduce  # pylint: disable=g-import-not-at-top

                def get_nested_value(d, path):
                    """Walk nested dict `d` along the keys in `path`."""
                    return reduce(dict.get, path, d)

                def nvgpu_gpu_info(inp):
                    """Look up one nvgpu.gpu_info() field by a comma-separated key path."""
                    inp = inp.decode("utf-8")
                    inp = inp.split(",")
                    inp = [x.strip() for x in inp]
                    value = get_nested_value(nvgpu.gpu_info(), inp)
                    # Builtin str: the `np.str` alias was removed in
                    # NumPy 1.24 and was never anything but `str`.
                    return str(value)

                def commonsize(inp):
                    """Convert a '<number> <unit>' string to a float in MiB."""
                    const_sizes = {
                        'B': 1,
                        'KB': 1e3,
                        'MB': 1e6,
                        'GB': 1e9,
                        'TB': 1e12,
                        'PB': 1e15,
                        'KiB': 1024,
                        'MiB': 1048576,
                        'GiB': 1073741824
                    }
                    inp = inp.split(" ")
                    # convert all to MiB
                    if inp[1] != 'MiB':
                        inp_ = float(
                            inp[0]) * (const_sizes[inp[1]] / 1048576.0)
                    else:
                        inp_ = float(inp[0])

                    return inp_

                def formatter_log(tensors):
                    """Format the output."""
                    mem_used = tensors["memory used"].decode("utf-8")
                    mem_total = tensors["memory total"].decode("utf-8")
                    mem_util = commonsize(mem_used) / commonsize(mem_total)
                    logstring = "GPU memory used: {} = {:.1%} of total GPU memory: {}".format(
                        mem_used, mem_util, mem_total)
                    return logstring

                mem_used = tf.py_func(nvgpu_gpu_info,
                                      ['gpu, fb_memory_usage, used'],
                                      [tf.string])[0]
                mem_total = tf.py_func(nvgpu_gpu_info,
                                       ['gpu, fb_memory_usage, total'],
                                       [tf.string])[0]

                logging_hook3 = tf.estimator.LoggingTensorHook(
                    tensors={
                        "memory used": mem_used,
                        "memory total": mem_total,
                    },
                    every_n_iter=params.get('iterations_per_loop', 100),
                    formatter=formatter_log,
                )
                training_hooks.append(logging_hook3)
            except Exception:  # pylint: disable=broad-except
                # GPU memory logging is best effort, but do not swallow
                # KeyboardInterrupt/SystemExit the way a bare `except:` does.
                logging.error("nvgpu error: nvidia-smi format not recognized")

    if params['strategy'] == 'tpu':
        return tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            eval_metrics=eval_metrics,
            host_call=utils.get_tpu_host_call(global_step, params),
            scaffold_fn=scaffold_fn,
            training_hooks=training_hooks)
    else:
        eval_metric_ops = eval_metrics[0](
            **eval_metrics[1]) if eval_metrics else None
        utils.get_tpu_host_call(global_step, params)
        # scaffold_fn is None when there is nothing to restore (e.g. training
        # from scratch); calling it unconditionally raised TypeError.
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            eval_metric_ops=eval_metric_ops,
            scaffold=scaffold_fn() if scaffold_fn else None,
            training_hooks=training_hooks)
Ejemplo n.º 19
0
    def dataset_parser(self, value, example_decoder, anchor_labeler, params):
        """Parse data to a fixed dimension input image and learning targets.

        Args:
          value: a single serialized tf.Example string.
          example_decoder: TF example decoder.
          anchor_labeler: anchor box labeler.
          params: a dict of extra parameters.

        Returns:
          image: Image tensor that is preprocessed to have normalized value and
            fixed dimension [image_height, image_width, 3]
          cls_targets_dict: ordered dictionary with keys
            [min_level, min_level+1, ..., max_level]. The values are tensor with
            shape [height_l, width_l, num_anchors]. The height_l and width_l
            represent the dimension of class logits at l-th level.
          box_targets_dict: ordered dictionary with keys
            [min_level, min_level+1, ..., max_level]. The values are tensor with
            shape [height_l, width_l, num_anchors * 4]. The height_l and
            width_l represent the dimension of bounding box regression output at
            l-th level.
          num_positives: Number of positive anchors in the image.
          source_id: Source image id. Default value -1 if the source id is empty
            in the groundtruth annotation.
          image_scale: Scale of the processed image to the original image.
          boxes: Groundtruth bounding box annotations. The box is represented in
            [y1, x1, y2, x2] format. The tensor is padded with -1 to the fixed
            dimension [self._max_instances_per_image, 4].
          is_crowds: Groundtruth annotations to indicate if an annotation
            represents a group of instances by value {0, 1}. The tensor is
            padded with 0 to the fixed dimension
            [self._max_instances_per_image, 1].
          areas: Groundtruth areas annotations. The tensor is padded with -1
            to the fixed dimension [self._max_instances_per_image, 1].
          classes: Groundtruth classes annotations. The tensor is padded with -1
            to the fixed dimension [self._max_instances_per_image, 1].
          image_masks: The decoded example's 'groundtruth_instance_masks' entry,
            or an empty list when the decoded example has no such key. It is
            returned as decoded, without resizing or padding.
        """
        with tf.name_scope('parser'):
            data = example_decoder.decode(value)
            source_id = data['source_id']
            image = data['image']
            boxes = data['groundtruth_boxes']
            classes = data['groundtruth_classes']
            # Classes are handled as a float32 column vector from here on.
            classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
            areas = data['groundtruth_area']
            is_crowds = data['groundtruth_is_crowd']
            image_masks = data.get('groundtruth_instance_masks', [])

            if self._is_training:
                # Training time preprocessing.
                if params['skip_crowd_during_training']:
                    # Keep only the boxes/classes whose is_crowd flag is false.
                    indices = tf.where(
                        tf.logical_not(data['groundtruth_is_crowd']))
                    classes = tf.gather_nd(classes, indices)
                    boxes = tf.gather_nd(boxes, indices)

                if params.get('grid_mask', None):
                    from aug import gridmask  # pylint: disable=g-import-not-at-top
                    image, boxes = gridmask.gridmask(image, boxes)

                if params.get('autoaugment_policy', None):
                    from aug import autoaugment  # pylint: disable=g-import-not-at-top
                    if params['autoaugment_policy'] == 'randaug':
                        image, boxes = autoaugment.distort_image_with_randaugment(
                            image, boxes, num_layers=1, magnitude=15)
                    else:
                        image, boxes = autoaugment.distort_image_with_autoaugment(
                            image, boxes, params['autoaugment_policy'])

            input_processor = DetectionInputProcessor(image,
                                                      params['image_size'],
                                                      boxes, classes)
            input_processor.normalize_image()
            if self._is_training:
                if params['input_rand_hflip']:
                    input_processor.random_horizontal_flip()

                input_processor.set_training_random_scale_factors(
                    params['jitter_min'], params['jitter_max'],
                    params.get('target_size', None))
            else:
                input_processor.set_scale_factors_to_output_size()
            image = input_processor.resize_and_crop_image()
            boxes, classes = input_processor.resize_and_crop_boxes()

            # Assign anchors.
            (cls_targets, box_targets,
             num_positives) = anchor_labeler.label_anchors(boxes, classes)

            # An empty source id is replaced by '-1' so that the string can be
            # converted to a number.
            source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1',
                                 source_id)
            source_id = tf.strings.to_number(source_id)

            # Pad groundtruth data for evaluation.
            image_scale = input_processor.image_scale_to_original
            boxes *= image_scale
            is_crowds = tf.cast(is_crowds, dtype=tf.float32)
            boxes = pad_to_fixed_size(boxes, -1,
                                      [self._max_instances_per_image, 4])
            is_crowds = pad_to_fixed_size(is_crowds, 0,
                                          [self._max_instances_per_image, 1])
            areas = pad_to_fixed_size(areas, -1,
                                      [self._max_instances_per_image, 1])
            classes = pad_to_fixed_size(classes, -1,
                                        [self._max_instances_per_image, 1])
            if params['mixed_precision']:
                precision = utils.get_precision(params['strategy'],
                                                params['mixed_precision'])
                # The dtype name is the last '_'-separated token of the
                # precision string; only image and box_targets are cast.
                dtype = precision.split('_')[-1]
                image = tf.cast(image, dtype=dtype)
                box_targets = tf.nest.map_structure(
                    lambda box_target: tf.cast(box_target, dtype=dtype),
                    box_targets)
            return (image, cls_targets, box_targets, num_positives, source_id,
                    image_scale, boxes, is_crowds, areas, classes, image_masks)