def test_resample_feature_adder_compile(self):
  """Checks ResampleFeatureAdder compiles into a Keras model with 5 outputs.

  Builds a functional model from six symbolic inputs (512x512 down to 16x16),
  runs it on all-ones tensors of batch size 1, and verifies that the output
  feature sizes match `utils.get_feat_sizes` for the efficientdet-d0 config.
  """
  config = hparams_config.get_efficientdet_config("efficientdet-d0")
  feat_sizes = utils.get_feat_sizes(config.image_size, config.max_level)
  tf2.random.set_seed(SEED)

  # (height, width, channels) of every input; shared by the symbolic inputs
  # and the concrete examples so the two cannot drift apart.
  input_shapes = [
      [512, 512, 3],
      [256, 256, 16],
      [128, 128, 24],
      [64, 64, 40],
      [32, 32, 112],
      [16, 16, 320],
  ]
  inputs = [tf2.keras.Input(shape=shape) for shape in input_shapes]
  outputs = efficientdet_arch_keras.ResampleFeatureAdder(config)(inputs)
  model = tf2.keras.Model(inputs=inputs, outputs=outputs)

  # The model is fed a nested list holding one batch-of-1 tensor per input.
  examples = [[tf2.ones([1] + shape) for shape in input_shapes]]
  preds = model(examples)

  try:
    utils.verify_feats_size(preds,
                            feat_sizes=feat_sizes,
                            min_level=config.min_level,
                            max_level=config.max_level,
                            data_format=config.data_format)
  except ValueError as err:
    # Report the size mismatch as a test failure rather than a test error.
    self.fail(repr(err))
  self.assertEqual(len(preds), 5, "P3-P7")
def __init__(self, min_level, max_level, num_scales, aspect_ratios,
             anchor_scale, image_size):
  """Constructs multiscale RetinaNet anchors.

  Args:
    min_level: integer number of minimum level of the output feature pyramid.
    max_level: integer number of maximum level of the output feature pyramid.
    num_scales: integer number representing intermediate scales added on each
      level. For instance, num_scales=2 adds two additional anchor scales
      [2^0, 2^0.5] on each level.
    aspect_ratios: list of tuples representing the aspect ratio anchors added
      on each level. For instance, aspect_ratios =
      [(1, 1), (1.4, 0.7), (0.7, 1.4)] adds three anchors on each level.
    anchor_scale: float number representing the scale of size of the base
      anchor to the feature stride 2^level.
    image_size: integer number or tuple of integer number of input image size.
  """
  self.min_level = min_level
  self.max_level = max_level
  self.num_scales = num_scales
  self.aspect_ratios = aspect_ratios
  self.anchor_scale = anchor_scale
  # A bare integer means a square image; anything else is stored untouched.
  self.image_size = ((image_size, image_size)
                     if isinstance(image_size, int) else image_size)
  # Note: the feature sizes are derived from the raw `image_size` argument.
  self.feat_sizes = utils.get_feat_sizes(image_size, max_level)
  # Both generators run last, after every attribute above has been set.
  self.config = self._generate_configs()
  self.boxes = self._generate_boxes()
def test_variables(self):
  """Checks keras and legacy `build_bifpn_layer` create the same variables.

  Each implementation is built in its own fresh graph on identically shaped
  random features; the resulting global variable names must match exactly.
  """
  config = hparams_config.get_efficientdet_config()
  feat_sizes = utils.get_feat_sizes(config.image_size, config.max_level)
  feat_shapes = [
      [1, 64, 64, 40],
      [1, 32, 32, 112],
      [1, 16, 16, 320],
      [1, 8, 8, 64],
      [1, 4, 4, 64],
  ]

  def _variable_names(build_fn):
    """Builds one BiFPN layer in a new graph and lists its variable names."""
    with tf.Graph().as_default():
      feats = [tf.random.uniform(shape) for shape in feat_shapes]
      build_fn(feats, feat_sizes, config)
      # Must be read while the graph is still the default one.
      return [var.name for var in tf.global_variables()]

  vars1 = _variable_names(efficientdet_arch_keras.build_bifpn_layer)
  vars2 = _variable_names(legacy_arch.build_bifpn_layer)
  self.assertEqual(vars1, vars2)
def __init__(self, min_level: int, max_level: int, image_size: int,
             fpn_weight_method: str, apply_bn_for_resampling: bool,
             is_training_bn: bool, conv_after_downsample: bool,
             use_native_resize_op: bool, data_format: str, pooling_type: str,
             fpn_num_filters: int, conv_bn_act_pattern: bool, act_type: str,
             separable_conv: bool, use_tpu: bool, fpn_name: str, **kwargs):
  """Stores the BiFPN hyper-parameters on the layer.

  Every named argument is kept as an attribute of the same name. In addition
  `feat_sizes` is computed from `image_size` and `max_level`, and `fpn_config`
  starts out as None. Remaining keyword arguments are forwarded to the base
  class initializer, which runs last.
  """
  # Pyramid geometry.
  self.min_level = min_level
  self.max_level = max_level
  self.image_size = image_size
  self.feat_sizes = utils.get_feat_sizes(image_size, max_level)

  # FPN definition; `fpn_config` is left unset here.
  self.fpn_name = fpn_name
  self.fpn_config = None
  self.fpn_weight_method = fpn_weight_method
  self.fpn_num_filters = fpn_num_filters

  # Resampling options.
  self.apply_bn_for_resampling = apply_bn_for_resampling
  self.conv_after_downsample = conv_after_downsample
  self.use_native_resize_op = use_native_resize_op
  self.pooling_type = pooling_type

  # Convolution / normalization / runtime options.
  self.is_training_bn = is_training_bn
  self.conv_bn_act_pattern = conv_bn_act_pattern
  self.separable_conv = separable_conv
  self.act_type = act_type
  self.data_format = data_format
  self.use_tpu = use_tpu

  super(BiFPNLayer, self).__init__(**kwargs)
def __init__(self, min_level, max_level, num_scales, aspect_ratios,
             anchor_scale, image_size):
  """Constructs multiscale anchors.

  Args:
    min_level: integer number of minimum level of the output feature pyramid.
    max_level: integer number of maximum level of the output feature pyramid.
    num_scales: integer number representing intermediate scales added on each
      level. For instance, num_scales=2 adds two additional anchor scales
      [2^0, 2^0.5] on each level.
    aspect_ratios: list representing the aspect ratio anchors added on each
      level. For instance, aspect_ratios = [1.0, 2.0, 0.5] adds three anchors
      on each level.
    anchor_scale: float number representing the scale of size of the base
      anchor to the feature stride 2^level. Or a list/tuple with one value
      per level.
    image_size: integer number or tuple of integer number of input image size.

  Raises:
    ValueError: if `anchor_scale` is a list/tuple whose length differs from
      the number of levels (max_level - min_level + 1).
  """
  self.min_level = min_level
  self.max_level = max_level
  self.num_scales = num_scales
  self.aspect_ratios = aspect_ratios

  num_levels = max_level - min_level + 1
  if isinstance(anchor_scale, (list, tuple)):
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if len(anchor_scale) != num_levels:
      raise ValueError(
          'anchor_scale has {} values but {} levels are configured '
          '(min_level={}, max_level={}).'.format(
              len(anchor_scale), num_levels, min_level, max_level))
    self.anchor_scales = anchor_scale
  else:
    # A scalar scale is shared by every level.
    self.anchor_scales = [anchor_scale] * num_levels

  self.image_size = utils.parse_image_size(image_size)
  self.feat_sizes = utils.get_feat_sizes(image_size, max_level)
  # Both generators run last, after every attribute above has been set.
  self.config = self._generate_configs()
  self.boxes = self._generate_boxes()
def build_feature_network(features, config):
  """Build FPN input features.

  Args:
    features: dict-like object mapping a level to its feature map; it must
      contain `config.min_level`.
    config: a dict-like config, including all parameters.

  Returns:
    The result of the last BiFPN cell, indexable by level.

  Raises:
    ValueError: if `config.min_level` is not a key of `features`.
  """
  feat_sizes = utils.get_feat_sizes(config.image_size, config.max_level)
  if config.min_level not in features.keys():
    raise ValueError(
        'features.keys ({}) should include min_level ({})'.format(
            features.keys(), config.min_level))

  levels = range(config.min_level, config.max_level + 1)

  # Build additional input features that are not from backbone.
  feats = []
  for level in levels:
    if level in features.keys():
      feats.append(features[level])
      continue
    # Adds a coarser level by downsampling the last feature map.
    previous = feats[-1]
    coarser = resample_feature_map(
        previous,
        name='p%d' % level,
        target_height=(previous.shape[1] - 1) // 2 + 1,
        target_width=(previous.shape[2] - 1) // 2 + 1,
        target_num_channels=config.fpn_num_filters,
        apply_bn=config.apply_bn_for_resampling,
        is_training=config.is_training_bn,
        conv_after_downsample=config.conv_after_downsample,
        use_native_resize_op=config.use_native_resize_op,
        pooling_type=config.pooling_type,
        use_tpu=config.use_tpu)
    feats.append(coarser)

  _verify_feats_size(feats,
                     feat_sizes=feat_sizes,
                     min_level=config.min_level,
                     max_level=config.max_level)

  with tf.variable_scope('fpn_cells'):
    for rep in range(config.fpn_cell_repeats):
      with tf.variable_scope('cell_{}'.format(rep)):
        logging.info('building cell %d', rep)
        new_feats = build_bifpn_layer(feats, feat_sizes, config)

        # The next cell consumes this cell's outputs, ordered by level.
        feats = [new_feats[level] for level in levels]

        _verify_feats_size(feats,
                           feat_sizes=feat_sizes,
                           min_level=config.min_level,
                           max_level=config.max_level)

  return new_feats
def test_fnode_compile(self):
  """Checks that the first FNode appends one correctly sized feature map.

  The inputs are resampled with ResampleFeatureAdder, passed through the
  FNode built from the first node of the FPN config, and the result must
  hold six feature maps whose last entry has the configured height/width.
  """
  config = hparams_config.get_efficientdet_config("efficientdet-d0")
  fpn_config = legacy_arch.get_fpn_config(config.fpn_name, config.min_level,
                                          config.max_level,
                                          config.fpn_weight_method)
  feat_sizes = utils.get_feat_sizes(config.image_size, config.max_level)
  i = 0
  fnode_cfg = fpn_config.nodes[i]
  target_size = feat_sizes[fnode_cfg['feat_level']]

  # (height, width, channels) of every model input.
  input_shapes = [
      [512, 512, 3],
      [256, 256, 16],
      [128, 128, 24],
      [64, 64, 40],
      [32, 32, 112],
      [16, 16, 320],
  ]
  examples = [[tf2.ones([1] + shape) for shape in input_shapes]]
  inputs = [tf2.keras.Input(shape=shape) for shape in input_shapes]

  x = efficientdet_arch_keras.ResampleFeatureAdder(config)(inputs)
  fnode = efficientdet_arch_keras.FNode(
      target_size['height'],
      target_size['width'],
      fnode_cfg['inputs_offsets'],
      config.fpn_num_filters,
      config.apply_bn_for_resampling,
      config.is_training_bn,
      config.conv_after_downsample,
      config.conv_bn_act_pattern,
      config.separable_conv,
      config.act_type,
      strategy=config.strategy,
      weight_method=fpn_config.weight_method,
      data_format=config.data_format,
      name='fnode{}'.format(i))
  outputs = fnode(x)
  model = tf2.keras.Model(inputs=inputs, outputs=outputs)

  preds = model(examples)

  self.assertEqual(
      len(preds),
      6,
      msg=
      "Expected that FNode will add one more node (P6') to initial 5 (P3 - P7)"
  )
  self.assertEqual(target_size['height'], preds[5].shape[1])
  self.assertEqual(target_size['width'], preds[5].shape[2])
def build_feature_network(features, config):
  """Build FPN input features.

  Args:
    features: input features handed to `ResampleFeatureAdder`.
    config: a dict-like config, including all parameters.

  Returns:
    The output of `FPNCells` applied to the resampled features.
  """
  feat_sizes = utils.get_feat_sizes(config.image_size, config.max_level)
  # Stage 1: add/resample input features; stage 2: run the FPN cells on them.
  resample_adder = ResampleFeatureAdder(config)
  fpn_cells = FPNCells(feat_sizes, config)
  return fpn_cells(resample_adder(features))
def build_feature_network(features, config):
  """Build FPN input features.

  Args:
    features: dict-like object mapping a level to its feature map; it must
      contain `config.min_level`.
    config: a dict-like config, including all parameters.

  Returns:
    The output of `FPNCells` applied to the per-level feature list.

  Raises:
    ValueError: if `config.min_level` is not a key of `features`.
  """
  feat_sizes = utils.get_feat_sizes(config.image_size, config.max_level)
  if config.min_level not in features.keys():
    raise ValueError(
        'features.keys ({}) should include min_level ({})'.format(
            features.keys(), config.min_level))

  # Position of the height/width axes depends on the data format.
  if config.data_format == 'channels_first':
    h_id, w_id = 2, 3
  else:
    h_id, w_id = 1, 2

  # Build additional input features that are not from backbone.
  feats = []
  for level in range(config.min_level, config.max_level + 1):
    if level in features.keys():
      feats.append(features[level])
      continue
    # Adds a coarser level by downsampling the last feature map.
    previous = feats[-1]
    resample = ResampleFeatureMap(
        target_height=(previous.shape[h_id] - 1) // 2 + 1,
        target_width=(previous.shape[w_id] - 1) // 2 + 1,
        target_num_channels=config.fpn_num_filters,
        apply_bn=config.apply_bn_for_resampling,
        is_training=config.is_training_bn,
        conv_after_downsample=config.conv_after_downsample,
        use_native_resize_op=config.use_native_resize_op,
        pooling_type=config.pooling_type,
        strategy=config.strategy,
        data_format=config.data_format,
        name='resample_p{}'.format(level),
    )
    feats.append(resample(previous))

  utils.verify_feats_size(feats,
                          feat_sizes=feat_sizes,
                          min_level=config.min_level,
                          max_level=config.max_level,
                          data_format=config.data_format)

  return FPNCells(feat_sizes, config)(feats)
def test_get_feat_sizes(self):
  """Checks `utils.get_feat_sizes` for a square and a rectangular image."""
  # Square input given as a single integer.
  expected_square = [
      {'height': side, 'width': side} for side in (640, 320, 160)
  ]
  self.assertEqual(utils.get_feat_sizes(640, 2), expected_square)

  # Rectangular input given as a (height, width) tuple.
  expected_rect = [
      {'height': height, 'width': width}
      for height, width in ((640, 300), (320, 150), (160, 75))
  ]
  self.assertEqual(utils.get_feat_sizes((640, 300), 2), expected_rect)
def __init__(self, config, name='fpn_cells'):
  """Creates `config.fpn_cell_repeats` FPN cells.

  Args:
    config: a dict-like config, including all parameters.
    name: name passed to the base class initializer.
  """
  super().__init__(name=name)
  self.config = config
  self.feat_sizes = utils.get_feat_sizes(config.image_size, config.max_level)

  # An explicit (truthy) fpn_config wins; otherwise derive one from its name.
  self.fpn_config = config.fpn_config or fpn_configs.get_fpn_config(
      config.fpn_name, config.min_level, config.max_level,
      config.fpn_weight_method)

  cells = []
  for rep in range(self.config.fpn_cell_repeats):
    cells.append(FPNCell(self.feat_sizes, self.config, name='cell_%d' % rep))
  self.cells = cells
def build_feature_network(features, config):
  """Build FPN input features.

  Args:
    features: dict-like object keyed by level, e.g.
      {0: image, 1: endpoints["reduction_1"], ...}; it must contain
      `config.min_level`.
    config: a dict-like config, including all parameters.

  Returns:
    The result of the last BiFPN layer, e.g.
    {3: P3", 4: P4", 5: P5", 6: P6", 7: P7"}.

  Raises:
    ValueError: if `config.min_level` is not a key of `features`.
  """
  feat_sizes = utils.get_feat_sizes(config.image_size, config.max_level)
  if config.min_level not in features.keys():
    raise ValueError(
        'features.keys ({}) should include min_level ({})'.format(
            features.keys(), config.min_level))

  levels = range(config.min_level, config.max_level + 1)

  # Build additional input features that are not from backbone.
  feats = []
  for level in levels:
    if level in features.keys():
      feats.append(features[level])
      continue
    # Adds a coarser level by downsampling the last feature map.
    previous = feats[-1]
    feats.append(
        resample_feature_map(
            previous,
            name='p%d' % level,
            target_height=(previous.shape[1] - 1) // 2 + 1,
            target_width=(previous.shape[2] - 1) // 2 + 1,
            target_num_channels=config.fpn_num_filters,
            is_training=config.is_training_bn))

  for rep in range(config.fpn_cell_repeats):
    logging.info('building cell %d', rep)
    new_feats = build_bifpn_layer(feats, feat_sizes, config)
    # The next repeat consumes this repeat's outputs, ordered by level.
    feats = [new_feats[level] for level in levels]

  return new_feats
def build_feature_network(feats, config):
  """Build FPN input features.

  Args:
    feats: A list of input tensors starting from min_level. Missing coarser
      levels are appended to this list in place.
    config: a dict-like config, including all parameters.

  Returns:
    The output of `FPNCells` applied to the completed feature list.

  Raises:
    ValueError: if `feats` is empty.
  """
  feat_sizes = utils.get_feat_sizes(config.image_size, config.max_level)
  if not feats:
    raise ValueError('FPN input features cannot be empty.')

  # Position of the height/width axes depends on the data format.
  if config.data_format == 'channels_first':
    h_id, w_id = 2, 3
  else:
    h_id, w_id = 1, 2

  # Build additional input features that are not from backbone: the first
  # missing level is the one right after the last tensor provided.
  first_missing_level = config.min_level + len(feats)
  for level in range(first_missing_level, config.max_level + 1):
    # Adds a coarser level by downsampling the last feature map.
    previous = feats[-1]
    resample = ResampleFeatureMap(
        target_height=(previous.shape[h_id] - 1) // 2 + 1,
        target_width=(previous.shape[w_id] - 1) // 2 + 1,
        target_num_channels=config.fpn_num_filters,
        apply_bn=config.apply_bn_for_resampling,
        is_training=config.is_training_bn,
        conv_after_downsample=config.conv_after_downsample,
        strategy=config.strategy,
        data_format=config.data_format,
        name='resample_p{}'.format(level),
    )
    feats.append(resample(previous))

  utils.verify_feats_size(feats,
                          feat_sizes=feat_sizes,
                          min_level=config.min_level,
                          max_level=config.max_level,
                          data_format=config.data_format)

  return FPNCells(feat_sizes, config)(feats)
def test_fpncells_compile(self):
  """Checks FPNCells compiles into a Keras model returning 5 feature maps."""
  config = hparams_config.get_efficientdet_config("efficientdet-d0")
  feat_sizes = utils.get_feat_sizes(config.image_size, config.max_level)

  # (height, width, channels) of every model input.
  input_shapes = [
      [64, 64, 40],
      [32, 32, 112],
      [16, 16, 320],
      [8, 8, 64],
      [4, 4, 64],
  ]
  inputs = [tf2.keras.Input(shape=shape) for shape in input_shapes]
  fpn_cells = efficientdet_arch_keras.FPNCells(
      feat_sizes, config, name='cell_{}'.format(0))
  outputs = fpn_cells(inputs)
  model = tf2.keras.Model(inputs=inputs, outputs=outputs)

  # One batch-of-1 all-ones tensor per input.
  examples = [tf2.ones([1] + shape) for shape in input_shapes]
  preds = model(examples)

  self.assertEqual(len(preds), 5)
def main(_):
  """Runs training and/or evaluation as selected by the command-line flags.

  The flow is: resolve the TPU cluster (TPU strategy only), validate the data
  file patterns for the requested mode, build the hparams config, optionally
  compute spatial-partition dimensions, create the estimator, and finally run
  the `train`, `eval` or `train_and_eval` flow. Any other mode is only logged.

  Args:
    _: unused.

  Raises:
    RuntimeError: if a file pattern required by `FLAGS.mode` is missing, or if
      `--num_cores_per_replica` does not equal the product of
      `--input_partition_dims` when spatial partitioning is enabled.
  """
  if FLAGS.strategy == 'tpu':
    tf.disable_eager_execution()
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
    tf.Session.reset(tpu_grpc_url)
  else:
    tpu_cluster_resolver = None

  # Check data path
  if FLAGS.mode in ('train', 'train_and_eval'):
    if FLAGS.training_file_pattern is None:
      raise RuntimeError('Must specify --training_file_pattern for train.')

  if FLAGS.mode in ('eval', 'train_and_eval'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('Must specify --validation_file_pattern for eval.')

  # Parse and override hparams
  config = hparams_config.get_detection_config(FLAGS.model_name)
  config.override(FLAGS.hparams)
  if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
    config.num_epochs = FLAGS.num_epochs

  # Parse image size in case it is in string format.
  config.image_size = utils.parse_image_size(config.image_size)

  # The following is for spatial partitioning. `features` has one tensor while
  # `labels` had 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
  # partition is performed on `features` and all partitionable tensors of
  # `labels`, see the partition logic below.
  # In the TPUEstimator context, the meaning of `shard` and `replica` is the
  # same; following the API, here has mixed use of both.
  if FLAGS.use_spatial_partition:
    # Checks input_partition_dims agrees with num_cores_per_replica.
    if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
      # The trailing space keeps the two literals from fusing into
      # "arrayelements".
      raise RuntimeError('--num_cores_per_replica must be a product of array '
                         'elements in --input_partition_dims.')

    labels_partition_dims = {
        'mean_num_positives': None,
        'source_ids': None,
        'groundtruth_data': None,
        'image_scales': None,
        'image_masks': None,
    }

    def _can_partition(spatial_dim):
      """Returns True if every partition dim divides `spatial_dim`."""
      partitionable_index = np.where(
          spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
      return len(partitionable_index[0]) == len(FLAGS.input_partition_dims)

    # The Input Partition Logic: We partition only the partition-able tensors.
    feat_sizes = utils.get_feat_sizes(
        config.get('image_size'), config.get('max_level'))
    for level in range(config.get('min_level'), config.get('max_level') + 1):
      spatial_dim = feat_sizes[level]
      if (_can_partition(spatial_dim['height']) and
          _can_partition(spatial_dim['width'])):
        labels_partition_dims['box_targets_%d' %
                              level] = FLAGS.input_partition_dims
        labels_partition_dims['cls_targets_%d' %
                              level] = FLAGS.input_partition_dims
      else:
        labels_partition_dims['box_targets_%d' % level] = None
        labels_partition_dims['cls_targets_%d' % level] = None

    num_cores_per_replica = FLAGS.num_cores_per_replica
    input_partition_dims = [FLAGS.input_partition_dims, labels_partition_dims]
    num_shards = FLAGS.num_cores // num_cores_per_replica
  else:
    num_cores_per_replica = None
    input_partition_dims = None
    num_shards = FLAGS.num_cores

  params = dict(config.as_dict(),
                model_name=FLAGS.model_name,
                iterations_per_loop=FLAGS.iterations_per_loop,
                model_dir=FLAGS.model_dir,
                num_shards=num_shards,
                num_examples_per_epoch=FLAGS.num_examples_per_epoch,
                strategy=FLAGS.strategy,
                backbone_ckpt=FLAGS.backbone_ckpt,
                ckpt=FLAGS.ckpt,
                val_json_file=FLAGS.val_json_file,
                testdev_dir=FLAGS.testdev_dir,
                mode=FLAGS.mode)
  config_proto = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
  if FLAGS.strategy != 'tpu':
    if FLAGS.use_xla:
      config_proto.graph_options.optimizer_options.global_jit_level = (
          tf.OptimizerOptions.ON_1)
      config_proto.gpu_options.allow_growth = True

  model_dir = FLAGS.model_dir
  strategy = None
  if FLAGS.strategy == 'tpu':
    tpu_config = tf.estimator.tpu.TPUConfig(
        # This branch only runs for the TPU strategy, so the loop count is
        # always the flag value.
        FLAGS.iterations_per_loop,
        num_cores_per_replica=num_cores_per_replica,
        input_partition_dims=input_partition_dims,
        per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig.
        PER_HOST_V2)
    run_config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=model_dir,
        log_step_count_steps=FLAGS.iterations_per_loop,
        session_config=config_proto,
        tpu_config=tpu_config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tf_random_seed=FLAGS.tf_random_seed,
    )
  else:
    if FLAGS.strategy == 'gpus':
      strategy = tf.distribute.MirroredStrategy()
    run_config = tf.estimator.RunConfig(
        model_dir=model_dir,
        train_distribute=strategy,
        log_step_count_steps=FLAGS.iterations_per_loop,
        session_config=config_proto,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tf_random_seed=FLAGS.tf_random_seed,
    )

  model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)
  max_instances_per_image = config.max_instances_per_image
  eval_steps = int(FLAGS.eval_samples // FLAGS.eval_batch_size)
  total_examples = int(config.num_epochs * FLAGS.num_examples_per_epoch)
  train_steps = total_examples // FLAGS.train_batch_size
  logging.info(params)

  # Make sure the directory exists before dumping the config into it; a fresh
  # model_dir would otherwise make the write below fail.
  tf.io.gfile.makedirs(model_dir)
  with tf.io.gfile.GFile(os.path.join(model_dir, 'config.yaml'), 'w') as f:
    f.write(str(config))

  train_input_fn = dataloader.InputReader(
      FLAGS.training_file_pattern,
      is_training=True,
      use_fake_data=FLAGS.use_fake_data,
      max_instances_per_image=max_instances_per_image)
  eval_input_fn = dataloader.InputReader(
      FLAGS.validation_file_pattern,
      is_training=False,
      use_fake_data=FLAGS.use_fake_data,
      max_instances_per_image=max_instances_per_image)

  if FLAGS.strategy == 'tpu':
    estimator = tf.estimator.tpu.TPUEstimator(
        model_fn=model_fn_instance,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        config=run_config,
        params=params)
  else:
    # Without a distribution strategy there is a single replica.
    num_replicas = getattr(strategy, 'num_replicas_in_sync', 1)
    params['batch_size'] = FLAGS.train_batch_size // num_replicas
    params['num_shards'] = num_replicas
    estimator = tf.estimator.Estimator(model_fn=model_fn_instance,
                                       config=run_config,
                                       params=params)

  # start train/eval flow.
  if FLAGS.mode == 'train':
    estimator.train(input_fn=train_input_fn, max_steps=train_steps)
    if FLAGS.eval_after_training:
      estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

  elif FLAGS.mode == 'eval':
    # Run evaluation when there's a new checkpoint
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout):

      logging.info('Starting to evaluate.')
      try:
        eval_results = estimator.evaluate(eval_input_fn, steps=eval_steps)
        # Terminate eval job when final checkpoint is reached.
        try:
          current_step = int(os.path.basename(ckpt).split('-')[1])
        except IndexError:
          logging.info('%s has no global step info: stop!', ckpt)
          break

        utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
        if current_step >= train_steps:
          logging.info('Eval finished step %d/%d', current_step, train_steps)
          break

      except tf.errors.NotFoundError:
        # The checkpoint might already be deleted by the time eval finished.
        # We simply skip such a case.
        logging.info('Checkpoint %s no longer exists, skipping.', ckpt)

  elif FLAGS.mode == 'train_and_eval':
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=train_steps)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                      steps=eval_steps,
                                      throttle_secs=600)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
  else:
    logging.info('Invalid mode: %s', FLAGS.mode)
def main(_):
  """Runs training and/or evaluation as selected by the command-line flags.

  The flow is: initialize horovod (horovod strategy only), import TensorFlow
  in v1 graph mode, resolve the TPU cluster (TPU strategy only), validate the
  data file patterns for the requested mode, build the hparams config,
  optionally compute spatial-partition dimensions, and finally run the
  `train`, `eval` or `train_and_eval` flow. Any other mode is only logged.

  Args:
    _: unused.

  Raises:
    RuntimeError: if a file pattern required by `FLAGS.mode` is missing, or if
      `--num_cores_per_replica` does not equal the product of
      `--input_partition_dims` when spatial partitioning is enabled.
  """
  if FLAGS.strategy == 'horovod':
    import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
    logging.info('Use horovod with multi gpus')
    hvd.init()
    # CUDA_VISIBLE_DEVICES is set before TensorFlow is imported below.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(hvd.local_rank())
  import tensorflow.compat.v1 as tf  # pylint: disable=g-import-not-at-top
  tf.enable_v2_tensorshape()
  tf.disable_eager_execution()

  if FLAGS.strategy == 'tpu':
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
    tf.Session.reset(tpu_grpc_url)
  else:
    tpu_cluster_resolver = None

  # Check data path
  if FLAGS.mode in (
      'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
    raise RuntimeError(
        'You must specify --training_file_pattern for training.')
  if FLAGS.mode in ('eval', 'train_and_eval'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('You must specify --validation_file_pattern '
                         'for evaluation.')

  # Parse and override hparams
  config = hparams_config.get_detection_config(FLAGS.model_name)
  config.override(FLAGS.hparams)
  if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
    config.num_epochs = FLAGS.num_epochs

  # Parse image size in case it is in string format.
  config.image_size = utils.parse_image_size(config.image_size)

  # The following is for spatial partitioning. `features` has one tensor while
  # `labels` had 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
  # partition is performed on `features` and all partitionable tensors of
  # `labels`, see the partition logic below.
  # In the TPUEstimator context, the meaning of `shard` and `replica` is the
  # same; following the API, here has mixed use of both.
  if FLAGS.use_spatial_partition:
    # Checks input_partition_dims agrees with num_cores_per_replica.
    if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
      # The trailing space keeps the two literals from fusing into
      # "arrayelements".
      raise RuntimeError('--num_cores_per_replica must be a product of array '
                         'elements in --input_partition_dims.')

    labels_partition_dims = {
        'mean_num_positives': None,
        'source_ids': None,
        'groundtruth_data': None,
        'image_scales': None,
    }

    def _can_partition(spatial_dim):
      """Returns True if every partition dim divides `spatial_dim`."""
      partitionable_index = np.where(
          spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
      return len(partitionable_index[0]) == len(FLAGS.input_partition_dims)

    # The Input Partition Logic: We partition only the partition-able tensors.
    # Spatial partition requires that the to-be-partitioned tensors must have a
    # dimension that is a multiple of `partition_dims`. Depending on the
    # `partition_dims` and the `image_size` and the `max_level` in config, some
    # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
    # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
    # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of
    # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
    # case, the level-8 and level-9 target tensors are not partition-able, and
    # the highest partition-able level is 7.
    feat_sizes = utils.get_feat_sizes(
        config.get('image_size'), config.get('max_level'))
    for level in range(config.get('min_level'), config.get('max_level') + 1):
      spatial_dim = feat_sizes[level]
      if (_can_partition(spatial_dim['height']) and
          _can_partition(spatial_dim['width'])):
        labels_partition_dims['box_targets_%d' %
                              level] = FLAGS.input_partition_dims
        labels_partition_dims['cls_targets_%d' %
                              level] = FLAGS.input_partition_dims
      else:
        labels_partition_dims['box_targets_%d' % level] = None
        labels_partition_dims['cls_targets_%d' % level] = None

    num_cores_per_replica = FLAGS.num_cores_per_replica
    input_partition_dims = [FLAGS.input_partition_dims, labels_partition_dims]
    num_shards = FLAGS.num_cores // num_cores_per_replica
  else:
    num_cores_per_replica = None
    input_partition_dims = None
    num_shards = FLAGS.num_cores

  params = dict(config.as_dict(),
                model_name=FLAGS.model_name,
                iterations_per_loop=FLAGS.iterations_per_loop,
                model_dir=FLAGS.model_dir,
                num_shards=num_shards,
                num_examples_per_epoch=FLAGS.num_examples_per_epoch,
                strategy=FLAGS.strategy,
                backbone_ckpt=FLAGS.backbone_ckpt,
                ckpt=FLAGS.ckpt,
                val_json_file=FLAGS.val_json_file,
                testdev_dir=FLAGS.testdev_dir,
                mode=FLAGS.mode)
  config_proto = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
  if FLAGS.strategy != 'tpu':
    if FLAGS.use_xla:
      config_proto.graph_options.optimizer_options.global_jit_level = (
          tf.OptimizerOptions.ON_1)
      config_proto.gpu_options.allow_growth = True

  tpu_config = tf.estimator.tpu.TPUConfig(
      FLAGS.iterations_per_loop if FLAGS.strategy == 'tpu' else 1,
      num_cores_per_replica=num_cores_per_replica,
      input_partition_dims=input_partition_dims,
      per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig.
      PER_HOST_V2)

  if FLAGS.strategy == 'horovod':
    # Only the rank-0 worker is given a model directory.
    model_dir = FLAGS.model_dir if hvd.rank() == 0 else None
  else:
    model_dir = FLAGS.model_dir

  run_config = tf.estimator.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=model_dir,
      log_step_count_steps=FLAGS.iterations_per_loop,
      session_config=config_proto,
      tpu_config=tpu_config,
      tf_random_seed=FLAGS.tf_random_seed,
  )

  model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)
  max_instances_per_image = config.max_instances_per_image
  eval_steps = int(FLAGS.eval_samples // FLAGS.eval_batch_size)
  use_tpu = (FLAGS.strategy == 'tpu')
  logging.info(params)

  def _train(steps):
    """Build train estimator and run training if steps > 0."""
    train_estimator = tf.estimator.tpu.TPUEstimator(
        model_fn=model_fn_instance,
        use_tpu=use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        config=run_config,
        params=params)
    train_estimator.train(input_fn=dataloader.InputReader(
        FLAGS.training_file_pattern,
        is_training=True,
        use_fake_data=FLAGS.use_fake_data,
        max_instances_per_image=max_instances_per_image),
                          max_steps=steps)

  def _eval(steps):
    """Build estimator and eval the latest checkpoint if steps > 0."""
    # Evaluation overrides: no random flip, batch norm not in training mode.
    eval_params = dict(
        params,
        strategy=FLAGS.strategy,
        input_rand_hflip=False,
        is_training_bn=False,
    )
    eval_estimator = tf.estimator.tpu.TPUEstimator(
        model_fn=model_fn_instance,
        use_tpu=use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        config=run_config,
        params=eval_params)
    eval_results = eval_estimator.evaluate(input_fn=dataloader.InputReader(
        FLAGS.validation_file_pattern,
        is_training=False,
        max_instances_per_image=max_instances_per_image),
                                           steps=steps,
                                           name=FLAGS.eval_name)
    logging.info('Evaluation results: %s', eval_results)
    return eval_results

  # start train/eval flow.
  if FLAGS.mode == 'train':
    total_examples = int(config.num_epochs * FLAGS.num_examples_per_epoch)
    _train(total_examples // FLAGS.train_batch_size)
    if FLAGS.eval_after_training:
      _eval(eval_steps)

  elif FLAGS.mode == 'eval':
    # Last training step; reaching it ends the evaluation loop.
    total_step = int((config.num_epochs * FLAGS.num_examples_per_epoch) /
                     FLAGS.train_batch_size)

    # Run evaluation when there's a new checkpoint
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout):

      logging.info('Starting to evaluate.')
      try:
        eval_results = _eval(eval_steps)
        # Terminate eval job when final checkpoint is reached.
        try:
          current_step = int(os.path.basename(ckpt).split('-')[1])
        except IndexError:
          logging.info('%s has no global step info: stop!', ckpt)
          break

        utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
        if current_step >= total_step:
          logging.info('Evaluation finished after training step %d',
                       current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        logging.info('Checkpoint %s no longer exists, skipping.', ckpt)

  elif FLAGS.mode == 'train_and_eval':
    ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
    try:
      step = int(os.path.basename(ckpt).split("-")[1])
      current_epoch = (step * FLAGS.train_batch_size //
                       FLAGS.num_examples_per_epoch)
      logging.info('found ckpt at step %d (epoch %d)', step, current_epoch)
    except (IndexError, TypeError):
      # TypeError covers `ckpt` being None (no checkpoint found); IndexError
      # covers a checkpoint name without a "-<step>" suffix.
      logging.info('Folder %s has no ckpt with valid step.', FLAGS.model_dir)
      current_epoch = 0

    epochs_per_cycle = 1  # higher number has less graph construction overhead.
    for e in range(current_epoch + 1, config.num_epochs + 1, epochs_per_cycle):
      print('-----------------------------------------------------\n'
            '=====> Starting training, epoch: %d.' % e)
      _train(e * FLAGS.num_examples_per_epoch // FLAGS.train_batch_size)
      print('-----------------------------------------------------\n'
            '=====> Starting evaluation, epoch: %d.' % e)
      eval_results = _eval(eval_steps)
      ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
      utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)

  else:
    logging.info('Invalid mode: %s', FLAGS.mode)
def __init__(self, model_name=None, config=None, name=''):
  """Creates the backbone, feature network and prediction heads.

  Args:
    model_name: model name handed to
      `hparams_config.get_efficientdet_config` when `config` is falsy.
    config: optional config object; when truthy it is used instead of the
      config looked up from `model_name`.
    name: name forwarded to the parent constructor.
  """
  super().__init__(name=name)

  cfg = config or hparams_config.get_efficientdet_config(model_name)
  self.config = cfg

  # Backbone.
  backbone_name = cfg.backbone_name
  bn_training = cfg.is_training_bn
  if 'efficientnet' in backbone_name:
    override_params = dict(
        batch_norm=utils.batch_norm_class(bn_training, cfg.strategy),
        relu_fn=functools.partial(
            utils.activation_fn, act_type=cfg.act_type),
    )
    if 'b0' in backbone_name:
      override_params.update(survival_prob=0.0)
    backbone_cfg = cfg.backbone_config
    if backbone_cfg is not None:
      decoder = efficientnet_builder.BlockDecoder()
      override_params['blocks_args'] = decoder.encode(backbone_cfg.blocks)
    override_params['data_format'] = cfg.data_format
    self.backbone = backbone_factory.get_model(
        backbone_name, override_params=override_params)

  # Feature network: one extra resampling layer per level from 6 up to
  # max_level; each adds a coarser level by downsampling the last feature map.
  feat_sizes = utils.get_feat_sizes(cfg.image_size, cfg.max_level)
  self.resample_layers = [
      ResampleFeatureMap(
          target_height=feat_sizes[level]['height'],
          target_width=feat_sizes[level]['width'],
          target_num_channels=cfg.fpn_num_filters,
          apply_bn=cfg.apply_bn_for_resampling,
          is_training_bn=cfg.is_training_bn,
          conv_after_downsample=cfg.conv_after_downsample,
          strategy=cfg.strategy,
          data_format=cfg.data_format,
          name='resample_p%d' % level,
      )
      for level in range(6, cfg.max_level + 1)
  ]
  self.fpn_cells = FPNCells(cfg)

  # class/box output prediction network.
  num_anchors = len(cfg.aspect_ratios) * cfg.num_scales
  num_filters = cfg.fpn_num_filters
  for head in cfg.heads:
    if head == 'object_detection':
      # Keyword arguments common to the class and box nets.
      detection_kwargs = dict(
          num_anchors=num_anchors,
          num_filters=num_filters,
          min_level=cfg.min_level,
          max_level=cfg.max_level,
          is_training_bn=cfg.is_training_bn,
          act_type=cfg.act_type,
          repeats=cfg.box_class_repeats,
          separable_conv=cfg.separable_conv,
          survival_prob=cfg.survival_prob,
          strategy=cfg.strategy,
          data_format=cfg.data_format,
      )
      self.class_net = ClassNet(
          num_classes=cfg.num_classes, **detection_kwargs)
      self.box_net = BoxNet(**detection_kwargs)

    if head == 'segmentation':
      self.seg_head = SegmentationHead(
          num_classes=cfg.seg_num_classes,
          num_filters=num_filters,
          min_level=cfg.min_level,
          max_level=cfg.max_level,
          is_training_bn=cfg.is_training_bn,
          act_type=cfg.act_type,
          strategy=cfg.strategy,
          data_format=cfg.data_format,
      )
def main(argv):
  """Runs training and/or evaluation as selected by `FLAGS.mode`.

  Builds the detection config and the TPU run config from flags, then runs
  one of three modes: 'train' (optionally followed by one evaluation),
  'eval' (evaluates each new checkpoint until the final training step is
  reached or the checkpoint iterator times out), or 'train_and_eval'
  (alternates one epoch of training with one evaluation).

  Args:
    argv: unused command-line arguments.

  Raises:
    RuntimeError: if a required file pattern flag is missing for the chosen
      mode, or if `--num_cores_per_replica` does not equal the product of
      `--input_partition_dims` when spatial partitioning is enabled.
  """
  del argv  # Unused.

  if FLAGS.use_tpu:
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
    tf.Session.reset(tpu_grpc_url)
  else:
    tpu_cluster_resolver = None

  # Check data path
  if FLAGS.mode in (
      'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
    raise RuntimeError(
        'You must specify --training_file_pattern for training.')
  if FLAGS.mode in ('eval', 'train_and_eval'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('You must specify --validation_file_pattern '
                         'for evaluation.')

  # Parse and override hparams
  config = hparams_config.get_detection_config(FLAGS.model_name)
  config.override(FLAGS.hparams)
  if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
    config.num_epochs = FLAGS.num_epochs

  # Parse image size in case it is in string format.
  config.image_size = utils.parse_image_size(config.image_size)

  # The following is for spatial partitioning. `features` has one tensor while
  # `labels` had 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
  # partition is performed on `features` and all partitionable tensors of
  # `labels`, see the partition logic below.
  # In the TPUEstimator context, the meaning of `shard` and `replica` is the
  # same; following the API, here has mixed use of both.
  if FLAGS.use_spatial_partition:
    # Checks input_partition_dims agrees with num_cores_per_replica.
    if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
      # The trailing space keeps the two adjacent literals from fusing into
      # "arrayelements".
      raise RuntimeError(
          '--num_cores_per_replica must be a product of array '
          'elements in --input_partition_dims.')

    labels_partition_dims = {
        'mean_num_positives': None,
        'source_ids': None,
        'groundtruth_data': None,
        'image_scales': None,
    }
    # The Input Partition Logic: We partition only the partition-able tensors.
    # Spatial partition requires that the to-be-partitioned tensors must have a
    # dimension that is a multiple of `partition_dims`. Depending on the
    # `partition_dims` and the `image_size` and the `max_level` in config, some
    # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
    # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
    # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of
    # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
    # case, the level-8 and level-9 target tensors are not partition-able, and
    # the highest partition-able level is 7.
    feat_sizes = utils.get_feat_sizes(config.get('image_size'),
                                      config.get('max_level'))

    # Defined once, outside the level loop: it depends only on the flags.
    def _can_partition(spatial_dim):
      """Returns True if `spatial_dim` is divisible by every partition dim."""
      partitionable_index = np.where(
          spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
      return len(partitionable_index[0]) == len(
          FLAGS.input_partition_dims)

    for level in range(config.get('min_level'), config.get('max_level') + 1):
      spatial_dim = feat_sizes[level]
      if _can_partition(spatial_dim['height']) and _can_partition(
          spatial_dim['width']):
        labels_partition_dims['box_targets_%d' %
                              level] = FLAGS.input_partition_dims
        labels_partition_dims['cls_targets_%d' %
                              level] = FLAGS.input_partition_dims
      else:
        labels_partition_dims['box_targets_%d' % level] = None
        labels_partition_dims['cls_targets_%d' % level] = None
    num_cores_per_replica = FLAGS.num_cores_per_replica
    input_partition_dims = [
        FLAGS.input_partition_dims, labels_partition_dims
    ]
    num_shards = FLAGS.num_cores // num_cores_per_replica
  else:
    num_cores_per_replica = None
    input_partition_dims = None
    num_shards = FLAGS.num_cores

  # Params handed to the model function: the full config plus run-time flags.
  params = dict(
      config.as_dict(),
      model_name=FLAGS.model_name,
      iterations_per_loop=FLAGS.iterations_per_loop,
      model_dir=FLAGS.model_dir,
      num_shards=num_shards,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      use_tpu=FLAGS.use_tpu,
      backbone_ckpt=FLAGS.backbone_ckpt,
      ckpt=FLAGS.ckpt,
      val_json_file=FLAGS.val_json_file,
      testdev_dir=FLAGS.testdev_dir,
      mode=FLAGS.mode,
  )
  config_proto = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
  if FLAGS.use_xla and not FLAGS.use_tpu:
    config_proto.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)

  tpu_config = tf.estimator.tpu.TPUConfig(
      FLAGS.iterations_per_loop,
      num_shards=num_shards,
      num_cores_per_replica=num_cores_per_replica,
      input_partition_dims=input_partition_dims,
      per_host_input_for_training=(
          tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2))

  run_config = tf.estimator.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      evaluation_master=FLAGS.eval_master,
      model_dir=FLAGS.model_dir,
      log_step_count_steps=FLAGS.iterations_per_loop,
      session_config=config_proto,
      tpu_config=tpu_config,
      tf_random_seed=FLAGS.tf_random_seed,
  )

  model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)

  # TPU Estimator
  logging.info(params)
  if FLAGS.mode == 'train':
    train_estimator = tf.estimator.tpu.TPUEstimator(
        model_fn=model_fn_instance,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        config=run_config,
        params=params)
    train_estimator.train(
        input_fn=dataloader.InputReader(FLAGS.training_file_pattern,
                                        is_training=True,
                                        use_fake_data=FLAGS.use_fake_data),
        max_steps=int((config.num_epochs * FLAGS.num_examples_per_epoch) /
                      FLAGS.train_batch_size))

    if FLAGS.eval_after_training:
      # Run evaluation after training finishes.
      eval_params = dict(
          params,
          use_tpu=FLAGS.use_tpu,
          input_rand_hflip=False,
          is_training_bn=False,
          precision=None,
      )
      eval_estimator = tf.estimator.tpu.TPUEstimator(
          model_fn=model_fn_instance,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=eval_params)
      eval_results = eval_estimator.evaluate(
          input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                          is_training=False),
          steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
      logging.info('Eval results: %s', eval_results)
      ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
      utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)

  elif FLAGS.mode == 'eval':
    # Eval only runs on CPU or GPU host with batch_size = 1.
    # Override the default options: disable randomization in the input pipeline
    # and don't run on the TPU.
    # NOTE(review): despite the comment above, `use_tpu` and the eval batch
    # size both come straight from flags here -- confirm which is intended.
    eval_params = dict(
        params,
        use_tpu=FLAGS.use_tpu,
        input_rand_hflip=False,
        is_training_bn=False,
        precision=None,
    )

    eval_estimator = tf.estimator.tpu.TPUEstimator(
        model_fn=model_fn_instance,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        config=run_config,
        params=eval_params)

    def terminate_eval():
      """Logs the timeout and tells the checkpoint iterator to stop."""
      logging.info('Terminating eval after %d seconds of no checkpoints',
                   FLAGS.eval_timeout)
      return True

    # Run evaluation when there's a new checkpoint
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval):

      logging.info('Starting to evaluate.')
      try:
        eval_results = eval_estimator.evaluate(
            input_fn=dataloader.InputReader(
                FLAGS.validation_file_pattern, is_training=False),
            steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
        logging.info('Eval results: %s', eval_results)

        # Terminate eval job when final checkpoint is reached.
        try:
          # The step is read from the text after the first '-' in the
          # checkpoint's base name.
          current_step = int(os.path.basename(ckpt).split('-')[1])
        except IndexError:
          logging.info('%s has no global step info: stop!', ckpt)
          break

        utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
        total_step = int(
            (config.num_epochs * FLAGS.num_examples_per_epoch) /
            FLAGS.train_batch_size)
        if current_step >= total_step:
          logging.info('Evaluation finished after training step %d',
                       current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint', ckpt)

  elif FLAGS.mode == 'train_and_eval':
    for cycle in range(config.num_epochs):
      logging.info('Starting training cycle, epoch: %d.', cycle)
      train_estimator = tf.estimator.tpu.TPUEstimator(
          model_fn=model_fn_instance,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          config=run_config,
          params=params)
      train_estimator.train(
          input_fn=dataloader.InputReader(
              FLAGS.training_file_pattern,
              is_training=True,
              use_fake_data=FLAGS.use_fake_data),
          steps=int(FLAGS.num_examples_per_epoch / FLAGS.train_batch_size))

      logging.info('Starting evaluation cycle, epoch: %d.', cycle)
      # Run evaluation after every epoch.
      # NOTE(review): unlike the other two modes, `precision` is not
      # overridden to None here -- confirm this is intentional.
      eval_params = dict(
          params,
          use_tpu=FLAGS.use_tpu,
          input_rand_hflip=False,
          is_training_bn=False,
      )

      eval_estimator = tf.estimator.tpu.TPUEstimator(
          model_fn=model_fn_instance,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=eval_params)
      eval_results = eval_estimator.evaluate(
          input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                          is_training=False),
          steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
      logging.info('Evaluation results: %s', eval_results)
      ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
      utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)

  else:
    logging.info('Mode not found.')
def build_feature_network(features, config):
  """Build FPN input features.

  Args:
    features: mapping from level to feature map; it must contain
      `config.min_level`. Levels missing from the mapping are created by
      downsampling the previous level.
    config: a dict-like config, including all parameters.

  Returns:
    A dict from levels to the feature maps processed after feature network.

  Raises:
    ValueError: if `config.min_level` is not a key of `features`.
  """
  feat_sizes = utils.get_feat_sizes(config.image_size, config.max_level)
  feats = []
  if config.min_level not in features:
    raise ValueError('features.keys ({}) should include min_level ({})'.format(
        features.keys(), config.min_level))

  # Positions of the height/width axes; depends only on the data format.
  h_id, w_id = (2, 3) if config.data_format == 'channels_first' else (1, 2)

  # Build additional input features that are not from backbone.
  for level in range(config.min_level, config.max_level + 1):
    if level in features:
      feats.append(features[level])
    else:
      # Adds a coarser level by downsampling the last feature map.
      feats.append(
          resample_feature_map(
              feats[-1],
              name='p%d' % level,
              target_height=(feats[-1].shape[h_id] - 1) // 2 + 1,
              target_width=(feats[-1].shape[w_id] - 1) // 2 + 1,
              target_num_channels=config.fpn_num_filters,
              apply_bn=config.apply_bn_for_resampling,
              is_training=config.is_training_bn,
              conv_after_downsample=config.conv_after_downsample,
              use_native_resize_op=config.use_native_resize_op,
              pooling_type=config.pooling_type,
              use_tpu=config.use_tpu,
              data_format=config.data_format
          ))

  _verify_feats_size(
      feats,
      feat_sizes=feat_sizes,
      min_level=config.min_level,
      max_level=config.max_level,
      data_format=config.data_format)

  with tf.variable_scope('fpn_cells'):
    nodes = []
    # Counter handed to build_fpn; named `node_ids` rather than `id` so the
    # builtin is not shadowed. It starts at 5 -- presumably because ids 0-4
    # are taken by the five input levels; TODO confirm.
    node_ids = count(5)
    ends = {
        '3': [],
        '4': [],
        '5': [],
        '6': [],
        '7': []
    }
    # NOTE(review): the repeat loop is taken to cover only `build_fpn`, with
    # `connect_fpn` and the single BiFPN build running once afterwards --
    # confirm against the intended nesting.
    for _ in range(config.fpn_cell_repeats):
      build_fpn(nodes, ends, node_ids)
    connect_fpn(nodes, ends)

    p = hparams_config.Config()
    p.nodes = nodes
    p.weight_method = 'fastattn'
    new_feats = build_bifpn_layer(feats, feat_sizes, config, p)

    feats = [new_feats[level]
             for level in range(config.min_level, config.max_level + 1)]

    _verify_feats_size(
        feats,
        feat_sizes=feat_sizes,
        min_level=config.min_level,
        max_level=config.max_level,
        data_format=config.data_format)

  return new_feats