def slim_get_split(file_pattern='{}_????'):
    """Build an op that encodes ground truth against anchors and hands it to
    ``save_image_with_bbox``.

    The function wires up a TF-Slim ``DatasetDataProvider`` over TFRecord
    files, preprocesses one sample to 300x300 with
    ``ssd_preprocessing.preprocess_image``, creates anchors through
    ``anchor_manipulator.AnchorCreator``, encodes labels/boxes with
    ``AnchorEncoder.encode_all_anchors`` and wraps ``save_image_with_bbox``
    in a ``tf.py_func``.

    NOTE(review): a later ``def slim_get_split`` in this file rebinds the
    same name, so this version is shadowed once the module has been imported.

    Args:
        file_pattern: Passed unchanged as ``data_sources`` of the Slim
            dataset. The default still contains a ``{}`` placeholder that
            is not formatted here -- presumably callers supply a concrete
            pattern; confirm.

    Returns:
        The ``tf.py_func`` op (declared output dtype ``tf.int64``) that
        invokes ``save_image_with_bbox``.
    """
    decoding = slim.tfexample_decoder
    bbox_key_prefix = 'image/object/bbox/'

    # Features in Pascal VOC TFRecords.
    keys_to_features = {
        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
        'image/height': tf.FixedLenFeature([1], tf.int64),
        'image/width': tf.FixedLenFeature([1], tf.int64),
        'image/channels': tf.FixedLenFeature([1], tf.int64),
        'image/shape': tf.FixedLenFeature([3], tf.int64),
        'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64),
    }
    items_to_handlers = {
        'image': decoding.Image('image/encoded', 'image/format'),
        'shape': decoding.Tensor('image/shape'),
        'object/bbox': decoding.BoundingBox(
            ['ymin', 'xmin', 'ymax', 'xmax'], bbox_key_prefix),
        'object/label': decoding.Tensor('image/object/bbox/label'),
        'object/difficult': decoding.Tensor('image/object/bbox/difficult'),
        'object/truncated': decoding.Tensor('image/object/bbox/truncated'),
    }
    decoder = decoding.TFExampleDecoder(keys_to_features, items_to_handlers)

    dataset = slim.dataset.Dataset(
        data_sources=file_pattern,
        reader=tf.TFRecordReader,
        decoder=decoder,
        num_samples=100,
        items_to_descriptions=None,
        num_classes=21,
        labels_to_names=None)

    # NOTE(review): block nesting was reconstructed from whitespace-collapsed
    # source; only the provider construction is assumed to live inside the
    # name scope -- confirm.
    with tf.name_scope('dataset_data_provider'):
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            num_readers=2,
            common_queue_capacity=32,
            common_queue_min=8,
            shuffle=True,
            num_epochs=1)

    requested_items = ['image', 'shape', 'object/label', 'object/bbox',
                       'object/difficult']
    # `shape` and `isdifficult` are fetched but not used further below.
    org_image, shape, glabels_raw, gbboxes_raw, isdifficult = provider.get(
        requested_items)

    image, glabels, gbboxes = ssd_preprocessing.preprocess_image(
        org_image, glabels_raw, gbboxes_raw, [300, 300],
        is_training=True, data_format='channels_last', output_rgb=True)

    anchor_creator = anchor_manipulator.AnchorCreator(
        [300] * 2,
        layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
        anchor_scales=[(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
        extra_anchor_scales=[(0.1414,), (0.2739,), (0.4541,), (0.6315,),
                             (0.8078,), (0.9836,)],
        anchor_ratios=[(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
                       (2., 3., .5, 0.3333), (2., .5), (2., .5)],
        layer_steps=[8, 16, 32, 64, 100, 300])
    (all_anchors,
     all_num_anchors_depth,
     all_num_anchors_spatial) = anchor_creator.get_all_anchors()

    # Anchors per layer = anchors per location * number of locations.
    num_anchors_per_layer = [
        all_num_anchors_depth[ind] * all_num_anchors_spatial[ind]
        for ind in range(len(all_anchors))
    ]

    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
        allowed_borders=[1.0] * 6,
        positive_threshold=0.5,
        ignore_threshold=0.5,
        prior_scaling=[0.1, 0.1, 0.2, 0.2])
    gt_targets, gt_labels, gt_scores = anchor_encoder_decoder.encode_all_anchors(
        glabels, gbboxes, all_anchors, all_num_anchors_depth,
        all_num_anchors_spatial, True)
    anchors = anchor_encoder_decoder._all_anchors

    def _per_layer(tensor):
        # Cut a flat per-anchor tensor into one piece per feature layer.
        return tf.split(tensor, num_anchors_per_layer, axis=0)

    # split by layers
    gt_targets = _per_layer(gt_targets)
    gt_labels = _per_layer(gt_labels)
    gt_scores = _per_layer(gt_scores)
    # Not consumed below; kept so the same ops are created as before.
    anchors = [_per_layer(anchor) for anchor in anchors]

    unwhitened_image = ssd_preprocessing.unwhiten_image(image)
    # Clamp labels at 0 from below before passing them to the Python side.
    flat_labels = tf.clip_by_value(
        tf.concat(gt_labels, axis=0), 0, tf.int64.max)
    flat_scores = tf.concat(gt_scores, axis=0)
    flat_targets = tf.concat(gt_targets, axis=0)

    save_image_op = tf.py_func(
        save_image_with_bbox,
        [unwhitened_image, flat_labels, flat_scores, flat_targets],
        tf.int64,
        stateful=True)
    return save_image_op
def ssd_model_fn(features, labels, mode, params):
    """model_fn for SSD to be used with our Estimator.

    Only ``tf.estimator.ModeKeys.PREDICT`` is supported. The mode is checked
    up front so an unsupported mode fails immediately with the intended
    ``ValueError`` instead of after the whole graph has been built.

    Args:
        features: Mapping with the keys ``'filename'``, ``'shape'``,
            ``'output_shape'`` and ``'image'``. ``'image'`` and
            ``'output_shape'`` are squeezed along axis 0, so that axis must
            have size 1.
        labels: Unused; present to satisfy the Estimator ``model_fn``
            signature.
        mode: One of ``tf.estimator.ModeKeys``.
        params: Mapping read for ``'model_scope'``, ``'data_format'``,
            ``'num_classes'``, ``'select_threshold'``, ``'min_size'``,
            ``'keep_topk'``, ``'nms_topk'`` and ``'nms_threshold'``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` whose ``predictions`` hold
        ``'filename'``, ``'shape'``, ``'output_shape'`` plus
        ``'scores_<k>'`` / ``'bboxes_<k>'`` for every class index ``k`` in
        ``1 .. params['num_classes'] - 1``.

    Raises:
        ValueError: If ``mode`` is not ``PREDICT``.
    """
    # Fail fast: nothing below is useful outside PREDICT mode.
    if mode != tf.estimator.ModeKeys.PREDICT:
        raise ValueError('This script only support "PREDICT" mode!')

    filename = features['filename']
    filename = tf.identity(filename, name='filename')
    shape = features['shape']
    output_shape = features['output_shape']
    features = features['image']

    num_classes = params['num_classes']
    data_format = params['data_format']

    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
        positive_threshold=None,
        ignore_threshold=None,
        prior_scaling=[0.1, 0.1, 0.2, 0.2])
    all_anchor_scales = [(30.,), (60.,), (112.5,), (165.,), (217.5,), (270.,)]
    all_extra_scales = [(42.43,), (82.17,), (136.23,), (189.45,), (242.34,),
                        (295.08,)]
    all_anchor_ratios = [(1., 2., .5), (1., 2., 3., .5, 0.3333),
                         (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                         (1., 2., .5), (1., 2., .5)]

    # NOTE(review): the nesting of the scope/device blocks below was
    # reconstructed from whitespace-collapsed source -- confirm boundaries.
    with tf.variable_scope(params['model_scope'], default_name=None,
                           values=[features], reuse=tf.AUTO_REUSE):
        backbone = ssd_net.VGG16Backbone(data_format)
        # forward features
        feature_layers = backbone.forward(
            features, training=(mode == tf.estimator.ModeKeys.TRAIN))

        # generate anchors according to the feature map size
        with tf.device('/cpu:0'):
            # Pick the two spatial axes of every feature map.
            if data_format == 'channels_first':
                all_layer_shapes = [tf.shape(feat)[2:] for feat in feature_layers]
            else:
                all_layer_shapes = [tf.shape(feat)[1:3] for feat in feature_layers]
            all_layer_strides = [8, 16, 32, 64, 100, 300]
            total_layers = len(all_layer_shapes)

            anchors_height = []
            anchors_width = []
            anchors_depth = []
            for ind in range(total_layers):
                layer_height, layer_width, layer_depth = (
                    anchor_encoder_decoder.get_anchors_width_height(
                        all_anchor_scales[ind],
                        all_extra_scales[ind],
                        all_anchor_ratios[ind],
                        name='get_anchors_width_height{}'.format(ind)))
                anchors_height.append(layer_height)
                anchors_width.append(layer_width)
                anchors_depth.append(layer_depth)

            anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax, _ = (
                anchor_encoder_decoder.get_all_anchors(
                    tf.squeeze(output_shape, axis=0),
                    anchors_height, anchors_width, anchors_depth,
                    [0.5] * total_layers,
                    all_layer_shapes, all_layer_strides,
                    [0.] * total_layers,
                    [False] * total_layers))

        # generate predictions based on anchors
        location_pred, cls_pred = ssd_net.multibox_head(
            feature_layers, num_classes, anchors_depth,
            data_format=data_format)
        if data_format == 'channels_first':
            # Move channels last so the reshapes below group per anchor.
            cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred]
            location_pred = [tf.transpose(pred, [0, 2, 3, 1])
                             for pred in location_pred]

        cls_pred = [tf.reshape(pred, [tf.shape(features)[0], -1, num_classes])
                    for pred in cls_pred]
        location_pred = [tf.reshape(pred, [tf.shape(features)[0], -1, 4])
                         for pred in location_pred]

        cls_pred = tf.concat(cls_pred, axis=1)
        location_pred = tf.concat(location_pred, axis=1)

        cls_pred = tf.reshape(cls_pred, [-1, num_classes])
        location_pred = tf.reshape(location_pred, [-1, 4])

    # decode predictions
    with tf.device('/cpu:0'):
        bboxes_pred = anchor_encoder_decoder.decode_anchors(
            location_pred, anchors_ymin, anchors_xmin, anchors_ymax,
            anchors_xmax)
        selected_bboxes, selected_scores = bbox_util.parse_by_class(
            tf.squeeze(output_shape, axis=0), cls_pred, bboxes_pred,
            num_classes, params['select_threshold'], params['min_size'],
            params['keep_topk'], params['nms_topk'], params['nms_threshold'])

    # Flatten the per-class results into parallel label/score/box tensors.
    labels_list = []
    scores_list = []
    bboxes_list = []
    for class_id, class_scores in selected_scores.items():
        labels_list.append(tf.ones_like(class_scores, tf.int32) * class_id)
        scores_list.append(class_scores)
        bboxes_list.append(selected_bboxes[class_id])
    all_labels = tf.concat(labels_list, axis=0)
    all_scores = tf.concat(scores_list, axis=0)
    all_bboxes = tf.concat(bboxes_list, axis=0)

    save_image_op = tf.py_func(
        save_image_with_bbox,
        [ssd_preprocessing.unwhiten_image(tf.squeeze(features, axis=0),
                                          output_rgb=False),
         # Labels whose score is not above 0.3 are zeroed out.
         all_labels * tf.to_int32(all_scores > 0.3),
         all_scores,
         all_bboxes],
        tf.int64,
        stateful=True)
    # Result intentionally discarded: this only registers the op in the
    # graph under the name 'save_image_op'.
    tf.identity(save_image_op, name='save_image_op')

    predictions = {
        'filename': filename,
        'shape': shape,
        'output_shape': output_shape,
    }
    # Class index 0 is skipped when exporting per-class results.
    for class_ind in range(1, num_classes):
        predictions['scores_{}'.format(class_ind)] = tf.expand_dims(
            selected_scores[class_ind], axis=0)
        predictions['bboxes_{}'.format(class_ind)] = tf.expand_dims(
            selected_bboxes[class_ind], axis=0)

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        prediction_hooks=None,
        loss=None,
        train_op=None)
def slim_get_split(file_pattern='{}_????'):
    """Build an op that encodes ground truth against anchors and hands it to
    ``save_image_with_bbox``.

    The function reads TFRecords through a TF-Slim ``DatasetDataProvider``,
    preprocesses one sample to 300x300 via
    ``ssd_preprocessing.preprocess_image``, generates anchors with
    ``AnchorEncoder.get_anchors_width_height`` / ``get_all_anchors``,
    encodes labels and boxes with ``encode_anchors`` and wraps
    ``save_image_with_bbox`` in a ``tf.py_func``.

    Args:
        file_pattern: Passed unchanged as ``data_sources`` of the Slim
            dataset. The default still contains a ``{}`` placeholder that
            is not formatted here -- presumably callers supply a concrete
            pattern; confirm.

    Returns:
        The ``tf.py_func`` op (declared output dtype ``tf.int64``) that
        invokes ``save_image_with_bbox``.
    """
    decoding = slim.tfexample_decoder

    # Features in Pascal VOC TFRecords.
    keys_to_features = {
        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
        'image/height': tf.FixedLenFeature([1], tf.int64),
        'image/width': tf.FixedLenFeature([1], tf.int64),
        'image/channels': tf.FixedLenFeature([1], tf.int64),
        'image/shape': tf.FixedLenFeature([3], tf.int64),
        'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64),
    }
    items_to_handlers = {
        'image': decoding.Image('image/encoded', 'image/format'),
        'shape': decoding.Tensor('image/shape'),
        'object/bbox': decoding.BoundingBox(
            ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),
        'object/label': decoding.Tensor('image/object/bbox/label'),
        'object/difficult': decoding.Tensor('image/object/bbox/difficult'),
        'object/truncated': decoding.Tensor('image/object/bbox/truncated'),
    }
    decoder = decoding.TFExampleDecoder(keys_to_features, items_to_handlers)

    dataset = slim.dataset.Dataset(
        data_sources=file_pattern,
        reader=tf.TFRecordReader,
        decoder=decoder,
        num_samples=100,
        items_to_descriptions=None,
        num_classes=21,
        labels_to_names=None)

    # NOTE(review): block nesting was reconstructed from whitespace-collapsed
    # source; only the provider construction is assumed to live inside the
    # name scope -- confirm.
    with tf.name_scope('dataset_data_provider'):
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            num_readers=2,
            common_queue_capacity=32,
            common_queue_min=8,
            shuffle=True,
            num_epochs=1)

    requested_items = ['image', 'shape', 'object/label', 'object/bbox',
                       'object/difficult']
    # `shape` and `isdifficult` are fetched but not used further below.
    org_image, shape, glabels_raw, gbboxes_raw, isdifficult = provider.get(
        requested_items)

    image, glabels, gbboxes = ssd_preprocessing.preprocess_image(
        org_image, glabels_raw, gbboxes_raw, [300, 300],
        is_training=True, data_format='channels_last', output_rgb=True)

    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
        positive_threshold=0.5,
        ignore_threshold=0.5,
        prior_scaling=[0.1, 0.1, 0.2, 0.2])

    all_anchor_scales = [(30.,), (60.,), (112.5,), (165.,), (217.5,), (270.,)]
    all_extra_scales = [(42.43,), (82.17,), (136.23,), (189.45,), (242.34,),
                        (295.08,)]
    all_anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
                         (2., 3., .5, 0.3333), (2., .5), (2., .5)]
    all_layer_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
    all_layer_strides = [8, 16, 32, 64, 100, 300]
    total_layers = len(all_layer_shapes)

    anchors_height = []
    anchors_width = []
    anchors_depth = []
    # The three configuration lists above each carry one entry per layer.
    for scales, extra_scales, ratios in zip(all_anchor_scales,
                                            all_extra_scales,
                                            all_anchor_ratios):
        layer_height, layer_width, layer_depth = (
            anchor_encoder_decoder.get_anchors_width_height(
                scales, extra_scales, ratios))
        anchors_height.append(layer_height)
        anchors_width.append(layer_width)
        anchors_depth.append(layer_depth)

    (anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax,
     inside_mask) = anchor_encoder_decoder.get_all_anchors(
         [300] * 2,
         anchors_height, anchors_width, anchors_depth,
         [0.5] * total_layers,
         all_layer_shapes, all_layer_strides,
         [300.] * total_layers,
         [False] * total_layers)

    gt_targets, gt_labels, gt_scores = anchor_encoder_decoder.encode_anchors(
        glabels, gbboxes, anchors_ymin, anchors_xmin, anchors_ymax,
        anchors_xmax, inside_mask, True)

    num_anchors_per_layer = []
    for layer_depth, layer_shape in zip(anchors_depth, all_layer_shapes):
        # Only the second returned value (per-layer anchor count) is needed.
        _, layer_anchor_count = anchor_encoder_decoder.get_anchors_count(
            layer_depth, layer_shape)
        num_anchors_per_layer.append(layer_anchor_count)

    def _per_layer(tensor):
        # Cut a flat per-anchor tensor into one piece per feature layer.
        return tf.split(tensor, num_anchors_per_layer, axis=0)

    # split by layers
    all_anchors = tf.stack(
        [anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax], axis=-1)
    gt_targets = _per_layer(gt_targets)
    gt_labels = _per_layer(gt_labels)
    gt_scores = _per_layer(gt_scores)
    # Not consumed below; kept so the same ops are created as before.
    anchors = _per_layer(all_anchors)

    unwhitened_image = ssd_preprocessing.unwhiten_image(image)
    # Clamp labels at 0 from below before passing them to the Python side.
    flat_labels = tf.clip_by_value(
        tf.concat(gt_labels, axis=0), 0, tf.int64.max)
    flat_scores = tf.concat(gt_scores, axis=0)
    flat_targets = tf.concat(gt_targets, axis=0)

    save_image_op = tf.py_func(
        save_image_with_bbox,
        [unwhitened_image, flat_labels, flat_scores, flat_targets],
        tf.int64,
        stateful=True)
    return save_image_op