# Shared imports assumed by the snippets in this file; the project-local
# module paths follow an SSD.TensorFlow-style layout and may need adjusting.
import os
import sys
import time

import cv2
import numpy as np
import scipy.io as sio
import tensorflow as tf
from PIL import Image
from scipy.misc import imread, imsave

from dataset import dataset_common
from preprocessing import ssd_preprocessing
from utility import anchor_manipulator, draw_toolbox
import ssd_net

slim = tf.contrib.slim


def input_fn():
    # `is_training`, `batch_size`, `training_files`, and `data_mapping_fn`
    # are expected from the enclosing scope.
    with tf.name_scope('post_forward'):
        out_shape = [FLAGS.train_image_size] * 2
        anchor_creator = anchor_manipulator.AnchorCreator(out_shape,
                            layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
                            anchor_scales=[(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
                            extra_anchor_scales=[(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)],
                            anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                                           (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
                            layer_steps=[8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

        num_anchors_per_layer = []
        for ind in range(len(all_anchors)):
            num_anchors_per_layer.append(all_num_anchors_depth[ind] * all_num_anchors_spatial[ind])

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders=[1.0] * 6,
                                                                  positive_threshold=FLAGS.match_threshold,
                                                                  ignore_threshold=FLAGS.neg_threshold,
                                                                  prior_scaling=[0.1, 0.1, 0.2, 0.2])
        # global global_anchor_info
        # global_anchor_info = {'decode_fn': lambda pred: anchor_encoder_decoder.decode_all_anchors(pred, num_anchors_per_layer),
        #                       'num_anchors_per_layer': num_anchors_per_layer,
        #                       'all_num_anchors_depth': all_num_anchors_depth,
        #                       'encode_fn': lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(
        #                           glabels_, gbboxes_, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)}

        image_preprocessing_fn = lambda image_, labels_, bboxes_: ssd_preprocessing.preprocess_image(
            image_, labels_, bboxes_, out_shape, is_training=is_training,
            data_format=FLAGS.data_format, output_rgb=False)
        # anchor_encoder_fn = lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(
        #     glabels_, gbboxes_, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)

        dataset = tf.data.TFRecordDataset(training_files)
        dataset = dataset.map(lambda x: data_mapping_fn(x, is_training, image_preprocessing_fn))
        dataset = dataset.repeat()           # repeat the input indefinitely
        dataset = dataset.batch(batch_size)  # set the batch size
        # The Estimator consumes the dataset directly, so no iterator or feed
        # placeholders are needed here.
        # return image, {'shape': shape, 'loc_targets': loc_targets,
        #                'cls_targets': cls_targets, 'match_scores': match_scores}
        return dataset

def input_fn():
    target_shape = [FLAGS.train_image_size] * 2

    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(positive_threshold=FLAGS.match_threshold,
                                                              ignore_threshold=FLAGS.neg_threshold,
                                                              prior_scaling=[0.1, 0.1, 0.2, 0.2])

    all_anchor_scales = [(16.,), (32.,), (64.,), (128.,), (256.,), (512.,)]
    all_extra_scales = [(), (), (), (), (), ()]
    all_anchor_ratios = [(1.,), (1.,), (1.,), (1.,), (1.,), (1.,)]
    all_layer_shapes = [(160, 160), (80, 80), (40, 40), (20, 20), (10, 10), (5, 5)]
    all_layer_strides = [4, 8, 16, 32, 64, 128]
    offset_list = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
    total_layers = len(all_layer_shapes)

    anchors_height = list()
    anchors_width = list()
    anchors_depth = list()
    for ind in range(total_layers):
        _anchors_height, _anchors_width, _anchor_depth = anchor_encoder_decoder.get_anchors_width_height(
            all_anchor_scales[ind], all_extra_scales[ind], all_anchor_ratios[ind],
            name='get_anchors_width_height{}'.format(ind))
        anchors_height.append(_anchors_height)
        anchors_width.append(_anchors_width)
        anchors_depth.append(_anchor_depth)

    anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax, inside_mask = anchor_encoder_decoder.get_all_anchors(
        target_shape, anchors_height, anchors_width, anchors_depth,
        offset_list, all_layer_shapes, all_layer_strides,
        [FLAGS.train_image_size * 1.] * total_layers, [False] * total_layers)

    num_anchors_per_layer = list()
    for ind, layer_shape in enumerate(all_layer_shapes):
        _, _num_anchors_per_layer = anchor_encoder_decoder.get_anchors_count(
            anchors_depth[ind], layer_shape, name='get_anchor_count{}'.format(ind))
        num_anchors_per_layer.append(_num_anchors_per_layer)

    image_preprocessing_fn = lambda image_, bboxes_: sfd_preprocessing.preprocess_image(
        image_, bboxes_, target_shape, is_training=is_training,
        data_format=FLAGS.data_format, output_rgb=False)
    anchor_encoder_fn = lambda gbboxes_: anchor_encoder_decoder.encode_anchors(
        gbboxes_, anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax,
        inside_mask, match_mining=True)

    image, filename, shape, loc_targets, cls_targets, match_scores, _ = dataset_common.slim_get_batch(
        FLAGS.num_classes, batch_size, ('train' if is_training else 'valid'),
        os.path.join(FLAGS.data_dir, dataset_pattern),
        FLAGS.num_readers, FLAGS.num_preprocessing_threads,
        image_preprocessing_fn, anchor_encoder_fn,
        num_epochs=FLAGS.train_epochs, is_training=is_training)

    global global_anchor_info
    global_anchor_info = {'decode_fn': lambda pred: anchor_encoder_decoder.batch_decode_anchors(
                              pred, anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax),
                          'num_anchors_per_layer': num_anchors_per_layer,
                          'all_num_anchors_depth': anchors_depth}

    return image, {'filename': filename, 'shape': shape, 'loc_targets': loc_targets,
                   'cls_targets': cls_targets, 'match_scores': match_scores}

def input_fn():
    out_shape = [FLAGS.train_image_size] * 2

    ssd300_anchor_params = {'layers_shapes': [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
                            'anchor_scales': [(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
                            'extra_anchor_scales': [(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)],
                            'anchor_ratios': [(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                                              (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
                            'layer_steps': [8, 16, 32, 64, 100, 300]}
    ssd512_anchor_params = {'layers_shapes': [(64, 64), (32, 32), (16, 16), (8, 8), (4, 4), (2, 2), (1, 1)],
                            'anchor_scales': [(0.07,), (0.15,), (0.3,), (0.45,), (0.6,), (0.75,), (0.9,)],
                            'extra_anchor_scales': [(0.1025,), (0.2121,), (0.3674,), (0.5196,), (0.6708,), (0.8216,), (0.9721,)],
                            'anchor_ratios': [(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                                              (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
                            'layer_steps': [8, 16, 32, 64, 128, 256, 512]}
    if FLAGS.train_image_size == 512:
        net_params = ssd512_anchor_params
        print('using ssd512 model')
    else:
        net_params = ssd300_anchor_params
        print('using ssd300 model')

    anchor_creator = anchor_manipulator.AnchorCreator(out_shape, **net_params)
    all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

    num_anchors_per_layer = []
    for ind in range(len(all_anchors)):
        num_anchors_per_layer.append(all_num_anchors_depth[ind] * all_num_anchors_spatial[ind])

    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
        allowed_borders=[1.0] * len(net_params['layer_steps']),
        positive_threshold=FLAGS.match_threshold,
        ignore_threshold=FLAGS.neg_threshold,
        prior_scaling=[0.1, 0.1, 0.2, 0.2])

    image_preprocessing_fn = lambda image_, labels_, bboxes_: ssd_preprocessing.preprocess_image(
        image_, labels_, bboxes_, out_shape, is_training=is_training,
        data_format=FLAGS.data_format, output_rgb=False)
    anchor_encoder_fn = lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(
        glabels_, gbboxes_, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)

    image, filename, shape, loc_targets, cls_targets, match_scores = dataset_common.slim_get_batch(
        FLAGS.num_classes, batch_size, ('train' if is_training else 'val'),
        os.path.join(FLAGS.data_dir, dataset_pattern),
        FLAGS.num_readers, FLAGS.num_preprocessing_threads,
        image_preprocessing_fn, anchor_encoder_fn,
        num_epochs=FLAGS.train_epochs, is_training=is_training)

    global global_anchor_info
    global_anchor_info = {'decode_fn': lambda pred: anchor_encoder_decoder.decode_all_anchors(pred, num_anchors_per_layer),
                          'num_anchors_per_layer': num_anchors_per_layer,
                          'all_num_anchors_depth': all_num_anchors_depth}

    return {'image': image, 'filename': filename, 'shape': shape, 'loc_targets': loc_targets,
            'cls_targets': cls_targets, 'match_scores': match_scores}, None

def input_fn():
    out_shape = [FLAGS.train_image_size] * 2

    anchor_creator = anchor_manipulator.AnchorCreator(out_shape,
                        layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
                        anchor_scales=[(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
                        extra_anchor_scales=[(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)],
                        anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                                       (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
                        layer_steps=[8, 16, 32, 64, 100, 300])
    # all_anchors: [[(38x38x1), (38x38x1), (4x1), (4x1)], [(19x19x1), (19x19x1), (4x1), (4x1)], ...]
    # -> records all the anchor information per layer
    all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

    num_anchors_per_layer = []
    for ind in range(len(all_anchors)):
        num_anchors_per_layer.append(all_num_anchors_depth[ind] * all_num_anchors_spatial[ind])

    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders=[1.0] * 6,
                                                              positive_threshold=FLAGS.match_threshold,
                                                              ignore_threshold=FLAGS.neg_threshold,
                                                              prior_scaling=[0.1, 0.1, 0.2, 0.2])

    image_preprocessing_fn = lambda image_, labels_, bboxes_: ssd_preprocessing.preprocess_image(
        image_, labels_, bboxes_, out_shape, is_training=is_training,
        data_format=FLAGS.data_format, output_rgb=False)
    anchor_encoder_fn = lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(
        glabels_, gbboxes_, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)
    anchor_decoder_fn = lambda pred: anchor_encoder_decoder.decode_all_anchors(pred, num_anchors_per_layer)

    image, _, shape, loc_targets, cls_targets, match_scores = dataset_common.slim_get_batch(
        FLAGS.num_classes, batch_size, ('train' if is_training else 'val'),
        os.path.join(FLAGS.data_dir, dataset_pattern),
        FLAGS.num_readers, FLAGS.num_preprocessing_threads,
        image_preprocessing_fn, anchor_encoder_fn,
        num_epochs=FLAGS.train_epochs, is_training=is_training)

    global global_anchor_info
    global_anchor_info = {'decode_fn': anchor_decoder_fn,
                          'num_anchors_per_layer': num_anchors_per_layer,
                          'all_num_anchors_depth': all_num_anchors_depth}

    return image, {'shape': shape, 'loc_targets': loc_targets,
                   'cls_targets': cls_targets, 'match_scores': match_scores}

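# Usage sketch: an input_fn like the ones above is handed to a TF 1.x
# Estimator as a callable. The flag names FLAGS.model_dir and
# FLAGS.max_number_of_steps, and the params dict, are assumptions for
# illustration; ssd_model_fn is the PREDICT/TRAIN-style model_fn defined
# further below in this file.
def train_with_estimator():
    run_config = tf.estimator.RunConfig(model_dir=FLAGS.model_dir,
                                        save_checkpoints_secs=3600)
    ssd_detector = tf.estimator.Estimator(
        model_fn=ssd_model_fn, config=run_config,
        params={'model_scope': FLAGS.model_scope,
                'data_format': FLAGS.data_format,
                'num_classes': FLAGS.num_classes})
    # The Estimator calls input_fn itself to build the dataset in its own graph.
    ssd_detector.train(input_fn=input_fn, max_steps=FLAGS.max_number_of_steps)
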
def slim_get_split(file_pattern='{}_????'):
    # Features in Pascal VOC TFRecords.
    keys_to_features = {
        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
        'image/height': tf.FixedLenFeature([1], tf.int64),
        'image/width': tf.FixedLenFeature([1], tf.int64),
        'image/channels': tf.FixedLenFeature([1], tf.int64),
        'image/shape': tf.FixedLenFeature([3], tf.int64),
        'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64),
    }
    items_to_handlers = {
        'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'),
        'shape': slim.tfexample_decoder.Tensor('image/shape'),
        'object/bbox': slim.tfexample_decoder.BoundingBox(['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),
        'object/label': slim.tfexample_decoder.Tensor('image/object/bbox/label'),
        'object/difficult': slim.tfexample_decoder.Tensor('image/object/bbox/difficult'),
        'object/truncated': slim.tfexample_decoder.Tensor('image/object/bbox/truncated'),
    }
    decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features, items_to_handlers)

    dataset = slim.dataset.Dataset(data_sources=file_pattern,
                                   reader=tf.TFRecordReader,
                                   decoder=decoder,
                                   num_samples=100,
                                   items_to_descriptions=None,
                                   num_classes=21,
                                   labels_to_names=None)

    with tf.name_scope('dataset_data_provider'):
        provider = slim.dataset_data_provider.DatasetDataProvider(dataset,
                                                                  num_readers=2,
                                                                  common_queue_capacity=32,
                                                                  common_queue_min=8,
                                                                  shuffle=True,
                                                                  num_epochs=1)
    [org_image, shape, glabels_raw, gbboxes_raw, isdifficult] = provider.get(
        ['image', 'shape', 'object/label', 'object/bbox', 'object/difficult'])

    image, glabels, gbboxes = ssd_preprocessing.preprocess_image(
        org_image, glabels_raw, gbboxes_raw, [300, 300],
        is_training=True, data_format='channels_last', output_rgb=True)

    anchor_creator = anchor_manipulator.AnchorCreator([300] * 2,
                        layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
                        anchor_scales=[(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
                        extra_anchor_scales=[(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)],
                        anchor_ratios=[(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
                                       (2., 3., .5, 0.3333), (2., .5), (2., .5)],
                        layer_steps=[8, 16, 32, 64, 100, 300])
    all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

    num_anchors_per_layer = []
    for ind in range(len(all_anchors)):
        num_anchors_per_layer.append(all_num_anchors_depth[ind] * all_num_anchors_spatial[ind])

    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders=[1.0] * 6,
                                                              positive_threshold=0.5,
                                                              ignore_threshold=0.5,
                                                              prior_scaling=[0.1, 0.1, 0.2, 0.2])

    gt_targets, gt_labels, gt_scores = anchor_encoder_decoder.encode_all_anchors(
        glabels, gbboxes, all_anchors, all_num_anchors_depth, all_num_anchors_spatial, True)

    anchors = anchor_encoder_decoder._all_anchors
    # split by layers
    gt_targets, gt_labels, gt_scores, anchors = tf.split(gt_targets, num_anchors_per_layer, axis=0), \
                                                tf.split(gt_labels, num_anchors_per_layer, axis=0), \
                                                tf.split(gt_scores, num_anchors_per_layer, axis=0), \
                                                [tf.split(anchor, num_anchors_per_layer, axis=0) for anchor in anchors]

    save_image_op = tf.py_func(save_image_with_bbox,
                               [ssd_preprocessing.unwhiten_image(image),
                                tf.clip_by_value(tf.concat(gt_labels, axis=0), 0, tf.int64.max),
                                tf.concat(gt_scores, axis=0),
                                tf.concat(gt_targets, axis=0)],
                               tf.int64, stateful=True)
    return save_image_op

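# Usage sketch: the slim DatasetDataProvider above is queue-backed, so the
# save_image_op it returns only produces output once queue runners are
# started inside a session. A minimal driver; the TFRecord pattern below is a
# placeholder.
def run_debug_pipeline(file_pattern='./dataset/tfrecords/train_????'):
    save_image_op = slim_get_split(file_pattern)
    with tf.Session() as sess:
        # local_variables_initializer is needed because num_epochs creates a local counter
        sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            for _ in range(10):  # dump ten annotated samples
                sess.run(save_image_op)
        finally:
            coord.request_stop()
            coord.join(threads)
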
def data_mapping_fn(example_proto, is_training, image_preprocessing_fn):
    keys_to_features = {
        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
        'image/filename': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/height': tf.FixedLenFeature([1], tf.int64),
        'image/width': tf.FixedLenFeature([1], tf.int64),
        'image/channels': tf.FixedLenFeature([1], tf.int64),
        'image/shape': tf.FixedLenFeature([3], tf.int64),
        'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64),
    }
    items_to_handlers = {
        'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'),
        'filename': slim.tfexample_decoder.Tensor('image/filename'),
        'shape': slim.tfexample_decoder.Tensor('image/shape'),
        'object/bbox': slim.tfexample_decoder.BoundingBox(['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),
        'object/label': slim.tfexample_decoder.Tensor('image/object/bbox/label'),
        'object/difficult': slim.tfexample_decoder.Tensor('image/object/bbox/difficult'),
        'object/truncated': slim.tfexample_decoder.Tensor('image/object/bbox/truncated'),
    }
    decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features, items_to_handlers)

    [org_image, filename, shape, glabels_raw, gbboxes_raw, isdifficult] = decoder.decode(
        example_proto, ['image', 'filename', 'shape', 'object/label', 'object/bbox', 'object/difficult'])

    if is_training:
        # if all boxes are difficult, keep the first one; otherwise drop the difficult ones
        isdifficult_mask = tf.cond(
            tf.count_nonzero(isdifficult, dtype=tf.int32) < tf.shape(isdifficult)[0],
            lambda: isdifficult < tf.ones_like(isdifficult),
            lambda: tf.one_hot(0, tf.shape(isdifficult)[0], on_value=True, off_value=False, dtype=tf.bool))
        glabels_raw = tf.boolean_mask(glabels_raw, isdifficult_mask)
        gbboxes_raw = tf.boolean_mask(gbboxes_raw, isdifficult_mask)

    # Pre-processing image, labels and bboxes.
    out_shape = [FLAGS.train_image_size] * 2
    anchor_creator = anchor_manipulator.AnchorCreator(out_shape,
                        layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
                        anchor_scales=[(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
                        extra_anchor_scales=[(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)],
                        anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                                       (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
                        layer_steps=[8, 16, 32, 64, 100, 300])
    all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

    num_anchors_per_layer = []
    for ind in range(len(all_anchors)):
        num_anchors_per_layer.append(all_num_anchors_depth[ind] * all_num_anchors_spatial[ind])

    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders=[1.0] * 6,
                                                              positive_threshold=FLAGS.match_threshold,
                                                              ignore_threshold=FLAGS.neg_threshold,
                                                              prior_scaling=[0.1, 0.1, 0.2, 0.2])

    global global_anchor_info
    global_anchor_info = {
        'decode_fn': lambda pred: anchor_encoder_decoder.decode_all_anchors(pred, num_anchors_per_layer),
        'num_anchors_per_layer': num_anchors_per_layer,
        'all_num_anchors_depth': all_num_anchors_depth,
        'encode_fn': lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(
            glabels_, gbboxes_, all_anchors, all_num_anchors_depth, all_num_anchors_spatial),
    }

    if is_training:
        image, glabels, gbboxes = image_preprocessing_fn(org_image, glabels_raw, gbboxes_raw)
    else:
        image = image_preprocessing_fn(org_image, glabels_raw, gbboxes_raw)
        glabels, gbboxes = glabels_raw, gbboxes_raw

    gt_targets, gt_labels, gt_scores = global_anchor_info['encode_fn'](glabels, gbboxes)

    # return [image, filename, shape, gt_targets, gt_labels, gt_scores]
    return image, {'shape': shape, 'loc_targets': gt_targets,
                   'cls_targets': gt_labels, 'match_scores': gt_scores}

def main(_):
    with tf.Graph().as_default():
        out_shape = [FLAGS.train_image_size] * 2

        image_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
        shape_input = tf.placeholder(tf.int32, shape=(2,))

        features = ssd_preprocessing.preprocess_for_eval(image_input, out_shape,
                                                         data_format=FLAGS.data_format, output_rgb=False)
        features = tf.expand_dims(features, axis=0)  # (N, H, W, C)

        anchor_creator = anchor_manipulator.AnchorCreator(out_shape,
                            layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
                            anchor_scales=[(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
                            extra_anchor_scales=[(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)],
                            anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                                           (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
                            #anchor_ratios=[(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
                            #               (2., 3., .5, 0.3333), (2., .5), (2., .5)],
                            layer_steps=[8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders=[1.0] * 6,
                                                                  positive_threshold=None,
                                                                  ignore_threshold=None,
                                                                  prior_scaling=[0.1, 0.1, 0.2, 0.2])
        decode_fn = lambda pred: anchor_encoder_decoder.ext_decode_all_anchors(
            pred, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)

        with tf.variable_scope(FLAGS.model_scope, default_name=None, values=[features], reuse=tf.AUTO_REUSE):
            #backbone = ssd_net.VGG16Backbone(FLAGS.data_format)
            backbone = ssd_net.MobileNetV2Backbone(FLAGS.data_format)
            feature_layers = backbone.forward(features, training=False)
            location_pred, cls_pred = ssd_net.multibox_head(feature_layers, FLAGS.num_classes,
                                                            all_num_anchors_depth, data_format=FLAGS.data_format)
            if FLAGS.data_format == 'channels_first':
                cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred]
                location_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred]

            cls_pred = [tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred]
            location_pred = [tf.reshape(pred, [-1, 4]) for pred in location_pred]

            cls_pred = tf.concat(cls_pred, axis=0)
            location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = decode_fn(location_pred)
            bboxes_pred = tf.concat(bboxes_pred, axis=0)
            selected_bboxes, selected_scores = parse_by_class(cls_pred, bboxes_pred,
                                                              FLAGS.num_classes, FLAGS.select_threshold,
                                                              FLAGS.min_size, FLAGS.keep_topk,
                                                              FLAGS.nms_topk, FLAGS.nms_threshold)
            labels_list = []
            scores_list = []
            bboxes_list = []
            for k, v in selected_scores.items():
                labels_list.append(tf.ones_like(v, tf.int32) * k)
                scores_list.append(v)
                bboxes_list.append(selected_bboxes[k])
            all_labels = tf.concat(labels_list, axis=0)
            all_scores = tf.concat(scores_list, axis=0)
            all_bboxes = tf.concat(bboxes_list, axis=0)

        saver = tf.train.Saver()
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            saver.restore(sess, get_checkpoint())

            while video.isOpened():
                ret, frame = video.read()
                if not ret:
                    break
                timer2 = cv2.getTickCount()
                if undistort == 'y':
                    ############################### Undistortion Part ###############################
                    dim2 = None
                    dim3 = None
                    timer = cv2.getTickCount()
                    dim1 = frame.shape[:2][::-1]  # dim1 is the dimension of the input image to undistort
                    assert dim1[0] / dim1[1] == DIM[0] / DIM[1], \
                        "Image to undistort needs to have same aspect ratio as the ones used in calibration"
                    if not dim2:
                        dim2 = dim1
                    if not dim3:
                        dim3 = dim1
                    scaled_K = K * dim1[0] / DIM[0]  # the values of K scale with the image dimension,
                    scaled_K[2][2] = 1.0             # except that K[2][2] is always 1.0
                    # This is how scaled_K, dim2 and balance are used to determine the final K used to
                    # undistort the image. The OpenCV documentation fails to make this clear!
                    new_K = cv2.fisheye.estimateNewCameraMatrixForUndistortRectify(scaled_K, D, dim2,
                                                                                   np.eye(3), balance=0)
                    map1, map2 = cv2.fisheye.initUndistortRectifyMap(scaled_K, D, np.eye(3), new_K,
                                                                     dim3, cv2.CV_16SC2)
                    frame_r = cv2.remap(frame, map1, map2, interpolation=cv2.INTER_LINEAR,
                                        borderMode=cv2.BORDER_CONSTANT)
                    t = (cv2.getTickCount() - timer) / cv2.getTickFrequency()
                    # frame_r = cv2.resize(dst, (640, 360))
                    # frame_r = cv2.putText(frame_r, "Undistortion processing time: %.3f sec" % t, (10, 40),
                    #                       cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 255), 2)  # (1.0 / (end - start))
                    ##################################################################################

                #np_image = imread('./demo/test.jpg')
                labels_, scores_, bboxes_ = sess.run([all_labels, all_scores, all_bboxes],
                                                     feed_dict={image_input: frame,
                                                                shape_input: frame.shape[:-1]})

                img_to_draw = draw_toolbox.bboxes_draw_on_img(frame, labels_, scores_, bboxes_, thickness=2)
                fps = cv2.getTickFrequency() / (cv2.getTickCount() - timer2)
                img_to_draw = cv2.putText(img_to_draw, "FPS : %.1f" % fps, (10, 20),
                                          cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 255), 2)
                cv2.imshow('Object detector', img_to_draw)  # dst_r
                if cv2.waitKey(1) == ord('q'):
                    break

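# Setup sketch: the main loop above reads `video`, `undistort`, and the
# fisheye calibration constants DIM, K and D from the enclosing scope. One
# plausible way to define them; every calibration value below is a
# placeholder, not a measured result.
undistort = 'y'                     # 'y' switches the fisheye undistortion branch on
video = cv2.VideoCapture(0)         # or cv2.VideoCapture('<path-to-video>')
DIM = (1280, 720)                   # image size used during calibration
K = np.array([[400.0, 0.0, 640.0],  # hypothetical intrinsic camera matrix
              [0.0, 400.0, 360.0],
              [0.0, 0.0, 1.0]])
D = np.array([[0.0], [0.0], [0.0], [0.0]])  # hypothetical fisheye distortion coefficients
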
def ssd(path):
    # def ssd_res(img_path):
    with tf.Graph().as_default():
        out_shape = [FLAGS.train_image_size] * 2

        image_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
        shape_input = tf.placeholder(tf.int32, shape=(2,))

        features = ssd_preprocessing.preprocess_for_eval(image_input, out_shape,
                                                         data_format=FLAGS.data_format, output_rgb=False)
        features = tf.expand_dims(features, axis=0)

        anchor_creator = anchor_manipulator.AnchorCreator(out_shape,
                            layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
                            anchor_scales=[(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
                            extra_anchor_scales=[(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)],
                            anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                                           (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
                            #anchor_ratios=[(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
                            #               (2., 3., .5, 0.3333), (2., .5), (2., .5)],
                            layer_steps=[8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders=[1.0] * 6,
                                                                  positive_threshold=None,
                                                                  ignore_threshold=None,
                                                                  prior_scaling=[0.1, 0.1, 0.2, 0.2])
        decode_fn = lambda pred: anchor_encoder_decoder.ext_decode_all_anchors(
            pred, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)

        with tf.variable_scope(FLAGS.model_scope, default_name=None, values=[features], reuse=tf.AUTO_REUSE):
            backbone = ssd_net.VGG16Backbone(FLAGS.data_format)
            feature_layers = backbone.forward(features, training=False)
            location_pred, cls_pred = ssd_net.multibox_head(feature_layers, FLAGS.num_classes,
                                                            all_num_anchors_depth, data_format=FLAGS.data_format)
            if FLAGS.data_format == 'channels_first':
                cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred]
                location_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred]

            cls_pred = [tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred]
            location_pred = [tf.reshape(pred, [-1, 4]) for pred in location_pred]

            cls_pred = tf.concat(cls_pred, axis=0)
            location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = decode_fn(location_pred)
            bboxes_pred = tf.concat(bboxes_pred, axis=0)
            selected_bboxes, selected_scores = parse_by_class(cls_pred, bboxes_pred,
                                                              FLAGS.num_classes, FLAGS.select_threshold,
                                                              FLAGS.min_size, FLAGS.keep_topk,
                                                              FLAGS.nms_topk, FLAGS.nms_threshold)
            labels_list = []
            scores_list = []
            bboxes_list = []
            for k, v in selected_scores.items():
                labels_list.append(tf.ones_like(v, tf.int32) * k)
                scores_list.append(v)
                bboxes_list.append(selected_bboxes[k])
            all_labels = tf.concat(labels_list, axis=0)
            all_scores = tf.concat(scores_list, axis=0)
            all_bboxes = tf.concat(bboxes_list, axis=0)

        saver = tf.train.Saver()
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            saver.restore(sess, get_checkpoint())

            np_image = imread(path)
            im = Image.open(path)
            print(np_image.shape)

            labels_, scores_, bboxes_ = sess.run([all_labels, all_scores, all_bboxes],
                                                 feed_dict={image_input: np_image,
                                                            shape_input: np_image.shape[:-1]})
            all_bboxes = sess.run([bboxes_pred],
                                  feed_dict={image_input: np_image,
                                             shape_input: np_image.shape[:-1]})

            shape = np_image.shape
            # crop every decoded box out of the image and record its coordinates
            for j in range(len(all_bboxes[0])):
                all_box = all_bboxes[0][j]
                p1 = (int(all_box[0] * shape[0]), int(all_box[1] * shape[1]))
                p2 = (int(all_box[2] * shape[0]), int(all_box[3] * shape[1]))
                if (p2[0] - p1[0] < 1) or (p2[1] - p1[1] < 1):
                    continue
                x1 = p1[1]
                y1 = p1[0]
                x2 = p2[1]
                y2 = p2[0]
                obj = im.crop((x1, y1, x2, y2))
                num_str = str(j).zfill(5)
                obj.save('./res/img/{}.jpg'.format(num_str))
                cor = str(x1) + ',' + str(y1) + ',' + str(x2) + ',' + str(y2)
                with open('./res/cor.txt', 'a') as f2:
                    f2.write(cor + '\n')
                with open('./res/label.txt', 'a') as f:
                    f.write(num_str + ',' + str(0) + '\n')

            # rewrite the label.txt entries for the boxes that survived NMS
            num1 = 0
            for i in range(bboxes_.shape[0]):
                bbox = bboxes_[i]
                p1 = (int(bbox[0] * shape[0]), int(bbox[1] * shape[1]))
                p2 = (int(bbox[2] * shape[0]), int(bbox[3] * shape[1]))
                num1 = num1 + 1
                if (p2[0] - p1[0] < 1) or (p2[1] - p1[1] < 1):
                    continue
                x1 = p1[1]
                y1 = p1[0]
                x2 = p2[1]
                y2 = p2[0]
                cor1 = str(x1) + ',' + str(y1) + ',' + str(x2) + ',' + str(y2)
                num = 0
                with open('./res/cor.txt', 'r') as f11, open('./res/label.txt', '+r') as f22:
                    for line in f11:
                        num = num + 1
                        if cor1 in line:
                            num11 = str(num)
                            print(num11 + '\n')
                            num11 = num11.zfill(5)
                            ber = num11 + ',' + str(0)
                            aft = num11 + ',' + str(labels_[i])
                            t = f22.read()
                            t = t.replace(ber, aft)
                            f22.seek(0, 0)
                            f22.write(t)
            print(num1)

            img_to_draw = draw_toolbox.bboxes_draw_on_img(np_image, labels_, scores_, bboxes_, thickness=2)
            imsave('./demo/out.jpg', img_to_draw)

def main(_):
    with tf.Graph().as_default():
        out_shape = [FLAGS.train_image_size] * 2

        with tf.name_scope('define_input'):
            image_input = tf.placeholder(tf.uint8, shape=(None, None, 3), name='image_input')

        features = ssd_preprocessing.preprocess_for_eval(image_input, out_shape,
                                                         data_format=FLAGS.data_format, output_rgb=False)
        features = tf.expand_dims(features, axis=0)

        anchor_creator = anchor_manipulator.AnchorCreator(out_shape,
                            layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
                            anchor_scales=[(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
                            extra_anchor_scales=[(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)],
                            anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                                           (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
                            #anchor_ratios=[(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
                            #               (2., 3., .5, 0.3333), (2., .5), (2., .5)],
                            layer_steps=[8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders=[1.0] * 6,
                                                                  positive_threshold=None,
                                                                  ignore_threshold=None,
                                                                  prior_scaling=[0.1, 0.1, 0.2, 0.2])

        def decode_fn(pred):
            return anchor_encoder_decoder.ext_decode_all_anchors(pred, all_anchors,
                                                                 all_num_anchors_depth,
                                                                 all_num_anchors_spatial)

        with tf.variable_scope(FLAGS.model_scope, default_name=None, values=[features], reuse=tf.AUTO_REUSE):
            backbone = ssd_net.VGG16Backbone(FLAGS.data_format)
            feature_layers = backbone.forward(features, training=False)
            location_pred, cls_pred = ssd_net.multibox_head(feature_layers, FLAGS.num_classes,
                                                            all_num_anchors_depth, data_format=FLAGS.data_format)
            if FLAGS.data_format == 'channels_first':
                cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred]
                location_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred]

            cls_pred = [tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred]
            location_pred = [tf.reshape(pred, [-1, 4]) for pred in location_pred]

            with tf.variable_scope('cls_pred'):
                cls_pred = tf.concat(cls_pred, axis=0)
            with tf.variable_scope('location_pred'):
                location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = decode_fn(location_pred)
            bboxes_pred = tf.concat(bboxes_pred, axis=0)
            selected_bboxes, selected_scores = parse_by_class(cls_pred, bboxes_pred,
                                                              FLAGS.num_classes, FLAGS.select_threshold,
                                                              FLAGS.min_size, FLAGS.keep_topk,
                                                              FLAGS.nms_topk, FLAGS.nms_threshold)
            labels_list = []
            scores_list = []
            bboxes_list = []
            for k, v in selected_scores.items():
                labels_list.append(tf.ones_like(v, tf.int32) * k)
                scores_list.append(v)
                bboxes_list.append(selected_bboxes[k])
            all_labels = tf.concat(labels_list, axis=0)
            all_scores = tf.concat(scores_list, axis=0)
            all_bboxes = tf.concat(bboxes_list, axis=0)

        saver = tf.train.Saver()
        '''
        config = tf.ConfigProto(allow_soft_placement=True,
                                inter_op_parallelism_threads=1,
                                intra_op_parallelism_threads=1)
        config.mlu_options.data_parallelism = 1
        config.mlu_options.model_parallelism = 1
        config.mlu_options.core_num = 1
        config.mlu_options.core_version = 'MLU270'
        config.mlu_options.precision = 'float'
        with tf.Session(config=config) as sess:
        '''
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            saver.restore(sess, get_checkpoint())

            np_image = imread('demo/test.jpg')
            labels_, scores_, bboxes_ = sess.run([all_labels, all_scores, all_bboxes],
                                                 feed_dict={image_input: np_image})
            #print('labels_', labels_, type(labels_), labels_.shape)
            #print('scores_', scores_, type(scores_), scores_.shape)
            #print('bboxes_', bboxes_, type(bboxes_), bboxes_.shape, bboxes_.shape[0])

            img_to_draw = draw_toolbox.bboxes_draw_on_img(np_image, labels_, scores_, bboxes_, thickness=2)
            imsave('demo/test_out.jpg', img_to_draw)
            saver.save(sess, 'model/ssd300_vgg16/ssd300_vgg16', global_step=0)

def main(_):
    with tf.Graph().as_default():
        out_shape = [FLAGS.train_image_size] * 2

        image_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
        shape_input = tf.placeholder(tf.int32, shape=(2,))

        features, output_shape = ssd_preprocessing.preprocess_for_eval(image_input, out_shape,
                                                                       data_format=FLAGS.data_format,
                                                                       output_rgb=False)
        features = tf.expand_dims(features, axis=0)
        output_shape = tf.expand_dims(output_shape, axis=0)

        all_anchor_scales = [(30.,), (60.,), (112.5,), (165.,), (217.5,), (270.,)]
        all_extra_scales = [(42.43,), (82.17,), (136.23,), (189.45,), (242.34,), (295.08,)]
        all_anchor_ratios = [(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                             (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)]
        #all_anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
        #                     (2., 3., .5, 0.3333), (2., .5), (2., .5)]

        with tf.variable_scope(FLAGS.model_scope, default_name=None, values=[features], reuse=tf.AUTO_REUSE):
            backbone = ssd_net.VGG16Backbone(FLAGS.data_format)
            feature_layers = backbone.forward(features, training=False)

            with tf.device('/cpu:0'):
                anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(positive_threshold=None,
                                                                          ignore_threshold=None,
                                                                          prior_scaling=[0.1, 0.1, 0.2, 0.2])
                if FLAGS.data_format == 'channels_first':
                    all_layer_shapes = [tf.shape(feat)[2:] for feat in feature_layers]
                else:
                    all_layer_shapes = [tf.shape(feat)[1:3] for feat in feature_layers]
                all_layer_strides = [8, 16, 32, 64, 100, 300]
                total_layers = len(all_layer_shapes)

                anchors_height = list()
                anchors_width = list()
                anchors_depth = list()
                for ind in range(total_layers):
                    _anchors_height, _anchors_width, _anchor_depth = anchor_encoder_decoder.get_anchors_width_height(
                        all_anchor_scales[ind], all_extra_scales[ind], all_anchor_ratios[ind],
                        name='get_anchors_width_height{}'.format(ind))
                    anchors_height.append(_anchors_height)
                    anchors_width.append(_anchors_width)
                    anchors_depth.append(_anchor_depth)

                anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax, _ = anchor_encoder_decoder.get_all_anchors(
                    tf.squeeze(output_shape, axis=0), anchors_height, anchors_width, anchors_depth,
                    [0.5] * total_layers, all_layer_shapes, all_layer_strides,
                    [0.] * total_layers, [False] * total_layers)

            location_pred, cls_pred = ssd_net.multibox_head(feature_layers, FLAGS.num_classes,
                                                            anchors_depth, data_format=FLAGS.data_format)
            if FLAGS.data_format == 'channels_first':
                cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred]
                location_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred]

            cls_pred = [tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred]
            location_pred = [tf.reshape(pred, [-1, 4]) for pred in location_pred]

            cls_pred = tf.concat(cls_pred, axis=0)
            location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = anchor_encoder_decoder.decode_anchors(location_pred, anchors_ymin, anchors_xmin,
                                                                anchors_ymax, anchors_xmax)
            selected_bboxes, selected_scores = bbox_util.parse_by_class(tf.squeeze(output_shape, axis=0),
                                                                        cls_pred, bboxes_pred,
                                                                        FLAGS.num_classes, FLAGS.select_threshold,
                                                                        FLAGS.min_size, FLAGS.keep_topk,
                                                                        FLAGS.nms_topk, FLAGS.nms_threshold)
            labels_list = []
            scores_list = []
            bboxes_list = []
            for k, v in selected_scores.items():
                labels_list.append(tf.ones_like(v, tf.int32) * k)
                scores_list.append(v)
                bboxes_list.append(selected_bboxes[k])
            all_labels = tf.concat(labels_list, axis=0)
            all_scores = tf.concat(scores_list, axis=0)
            all_bboxes = tf.concat(bboxes_list, axis=0)

        saver = tf.train.Saver()
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            saver.restore(sess, get_checkpoint())

            np_image = imread('./demo/test.jpg')
            labels_, scores_, bboxes_, output_shape_ = sess.run(
                [all_labels, all_scores, all_bboxes, output_shape],
                feed_dict={image_input: np_image, shape_input: np_image.shape[:-1]})

            # rescale the boxes from network input coordinates back to the original image
            bboxes_[:, 0] = bboxes_[:, 0] * np_image.shape[0] / output_shape_[0, 0]
            bboxes_[:, 1] = bboxes_[:, 1] * np_image.shape[1] / output_shape_[0, 1]
            bboxes_[:, 2] = bboxes_[:, 2] * np_image.shape[0] / output_shape_[0, 0]
            bboxes_[:, 3] = bboxes_[:, 3] * np_image.shape[1] / output_shape_[0, 1]

            img_to_draw = draw_toolbox.bboxes_draw_on_img(np_image, labels_, scores_, bboxes_, thickness=2)
            imsave('./demo/test_out.jpg', img_to_draw)

def input_fn():
    out_shape = [300, 510]  # [FLAGS.train_image_size] * 2

    anchor_creator = anchor_manipulator.AnchorCreator(out_shape,
                        layers_shapes=[(38, 64), (19, 32), (10, 16), (5, 8), (3, 6), (1, 4)],
                        anchor_scales=[(0.05,), (0.1,), (0.2,), (0.3,), (0.4,), (0.5,)],
                        extra_anchor_scales=[(0.07,), (0.1414,), (0.245,), (0.346,), (0.447,), (0.547,)],
                        anchor_ratios=[(1.,), (1.,), (1.,), (1.,), (1.,), (1.,)],
                        #anchor_ratios=[(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
                        #               (2., 3., .5, 0.3333), (2., .5), (2., .5)],
                        layer_steps=[8, 16, 32, 64, 100, 300])
    all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

    num_anchors_per_layer = []
    for ind in range(len(all_anchors)):
        num_anchors_per_layer.append(all_num_anchors_depth[ind] * all_num_anchors_spatial[ind])

    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders=[1.0] * 6,
                                                              positive_threshold=FLAGS.match_threshold,
                                                              ignore_threshold=FLAGS.neg_threshold,
                                                              prior_scaling=[0.1, 0.1, 0.2, 0.2])

    image_preprocessing_fn = lambda image_, labels_, bboxes_: ssd_preprocessing.preprocess_image(
        image_, labels_, bboxes_, out_shape, is_training=is_training,
        data_format=FLAGS.data_format, output_rgb=False)
    anchor_encoder_fn = lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(
        glabels_, gbboxes_, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)

    image, filename, shape, loc_targets, cls_targets, match_scores = dataset_common.slim_get_batch(
        FLAGS.num_classes, batch_size, ('train' if is_training else 'val'),
        os.path.join(FLAGS.data_dir, dataset_pattern),
        FLAGS.num_readers, FLAGS.num_preprocessing_threads,
        image_preprocessing_fn, anchor_encoder_fn,
        num_epochs=FLAGS.train_epochs, is_training=is_training)

    global global_anchor_info
    global_anchor_info = {'decode_fn': lambda pred: anchor_encoder_decoder.decode_all_anchors(pred, num_anchors_per_layer),
                          'num_anchors_per_layer': num_anchors_per_layer,
                          'all_num_anchors_depth': all_num_anchors_depth}

    return {'image': image, 'filename': filename, 'shape': shape, 'loc_targets': loc_targets,
            'cls_targets': cls_targets, 'match_scores': match_scores}, None

def main(_):
    with tf.Graph().as_default():
        target_shape = None

        image_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
        features, output_shape = sfd_preprocessing.preprocess_for_eval(image_input, target_shape,
                                                                       data_format=FLAGS.data_format,
                                                                       output_rgb=False)
        features = tf.expand_dims(features, axis=0)
        output_shape = tf.expand_dims(output_shape, axis=0)

        all_anchor_scales = [(16.,), (32.,), (64.,), (128.,), (256.,), (512.,)]
        all_extra_scales = [(), (), (), (), (), ()]
        all_anchor_ratios = [(1.,), (1.,), (1.,), (1.,), (1.,), (1.,)]
        all_layer_strides = [4, 8, 16, 32, 64, 128]
        offset_list = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5]

        with tf.variable_scope(FLAGS.model_scope, default_name=None, values=[features], reuse=tf.AUTO_REUSE):
            backbone = sfd_net.VGG16Backbone(FLAGS.data_format)
            feature_layers = backbone.get_featmaps(features, training=False)

            with tf.device('/cpu:0'):
                anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(positive_threshold=None,
                                                                          ignore_threshold=None,
                                                                          prior_scaling=[0.1, 0.1, 0.2, 0.2])
                if FLAGS.data_format == 'channels_first':
                    all_layer_shapes = [tf.shape(feat)[2:] for feat in feature_layers]
                else:
                    all_layer_shapes = [tf.shape(feat)[1:3] for feat in feature_layers]
                total_layers = len(all_layer_shapes)

                anchors_height = list()
                anchors_width = list()
                anchors_depth = list()
                for ind in range(total_layers):
                    _anchors_height, _anchors_width, _anchor_depth = anchor_encoder_decoder.get_anchors_width_height(
                        all_anchor_scales[ind], all_extra_scales[ind], all_anchor_ratios[ind],
                        name='get_anchors_width_height{}'.format(ind))
                    anchors_height.append(_anchors_height)
                    anchors_width.append(_anchors_width)
                    anchors_depth.append(_anchor_depth)

                anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax, _ = anchor_encoder_decoder.get_all_anchors(
                    tf.squeeze(output_shape, axis=0), anchors_height, anchors_width, anchors_depth,
                    offset_list, all_layer_shapes, all_layer_strides,
                    [0.] * total_layers, [False] * total_layers)

            location_pred, cls_pred = backbone.multibox_head(feature_layers,
                                                             [1] * len(feature_layers),
                                                             [3] + [1] * (len(feature_layers) - 1),
                                                             anchors_depth)
            if FLAGS.data_format == 'channels_first':
                cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred]
                location_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred]

            cls_pred = [tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred]
            location_pred = [tf.reshape(pred, [-1, 4]) for pred in location_pred]

            cls_pred = tf.nn.softmax(tf.concat(cls_pred, axis=0))[:, -1]
            location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = anchor_encoder_decoder.decode_anchors(location_pred, anchors_ymin, anchors_xmin,
                                                                anchors_ymax, anchors_xmax)

        saver = tf.train.Saver()
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            saver.restore(sess, get_checkpoint())

            os.makedirs(FLAGS.det_dir, exist_ok=True)
            if FLAGS.subset == 'val':
                wider_face = sio.loadmat(os.path.join(FLAGS.data_dir, 'wider_face_split',
                                                      'wider_face_val.mat'))   # val set
            else:
                wider_face = sio.loadmat(os.path.join(FLAGS.data_dir, 'wider_face_split',
                                                      'wider_face_test.mat'))  # test set
            event_list = wider_face['event_list']
            file_list = wider_face['file_list']
            del wider_face

            Path = os.path.join(FLAGS.data_dir,
                                ('WIDER_val' if FLAGS.subset == 'val' else 'WIDER_test'), 'images')
            save_path = os.path.join(FLAGS.det_dir, FLAGS.subset)
            len_event = len(event_list)
            for index, event in enumerate(event_list):
                filelist = file_list[index][0]
                len_files = len(filelist)
                if not os.path.exists(os.path.join(save_path, event[0][0])):
                    os.makedirs(os.path.join(save_path, event[0][0]))

                for num, file in enumerate(filelist):
                    im_name = file[0][0]
                    Image_Path = os.path.join(Path, event[0][0], im_name + '.jpg')
                    image = imread(Image_Path)
                    #image = imread('manymany.jpg')

                    # the max size of the input image (limit carried over from the Caffe version)
                    max_im_shrink = (0x7fffffff / FLAGS.memory_limit / (image.shape[0] * image.shape[1])) ** 0.5
                    #max_im_shrink = (0x7fffffff / 80.0 / (image.shape[0] * image.shape[1])) ** 0.5
                    shrink = max_im_shrink if max_im_shrink < 1 else 1

                    det0 = detect_face([sess, image_input, bboxes_pred, cls_pred], image, shrink)  # origin test
                    det1 = flip_test([sess, image_input, bboxes_pred, cls_pred], image, shrink)    # flip test
                    [det2, det3] = multi_scale_test([sess, image_input, bboxes_pred, cls_pred],
                                                    image, max_im_shrink)                          # multi-scale test

                    # merge all test results via bounding box voting
                    det = np.row_stack((det0, det1, det2, det3))
                    dets = bbox_vote(det)

                    f = open(os.path.join(save_path, event[0][0], im_name + '.txt'), 'w')
                    write_to_txt(f, dets, event, im_name)
                    f.close()

                    if num % FLAGS.log_every_n_steps == 0:
                        img_to_draw = draw_toolbox.bboxes_draw_on_img(image,
                                                                      (dets[:, 4] > 0.2).astype(np.int32),
                                                                      dets[:, 4], dets[:, :4], thickness=2)
                        imsave(os.path.join(FLAGS.debug_dir, '{}.jpg'.format(im_name)), img_to_draw)
                        #imsave(os.path.join('./debug/{}_{}.jpg').format(index, num),
                        #       draw_toolbox.absolute_bboxes_draw_on_img(image, (dets[:, 4] > 0.1).astype(np.int32),
                        #                                                dets[:, 4], dets[:, :4], thickness=2))
                    #break
                    sys.stdout.write('\r>> Predicting event:%d/%d num:%d/%d' %
                                     (index + 1, len_event, num + 1, len_files))
                    sys.stdout.flush()
            sys.stdout.write('\n')
            sys.stdout.flush()

def main(_):
    with tf.Graph().as_default():
        out_shape = [FLAGS.train_image_size] * 2

        image_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
        shape_input = tf.placeholder(tf.int32, shape=(2,))

        features = ssd_preprocessing.preprocess_for_eval(image_input, out_shape,
                                                         data_format=FLAGS.data_format, output_rgb=False)
        features = tf.expand_dims(features, axis=0)

        anchor_creator = anchor_manipulator.AnchorCreator(out_shape,
                            layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
                            anchor_scales=[(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
                            extra_anchor_scales=[(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)],
                            anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                                           (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
                            #anchor_ratios=[(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
                            #               (2., 3., .5, 0.3333), (2., .5), (2., .5)],
                            layer_steps=[8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders=[1.0] * 6,
                                                                  positive_threshold=None,
                                                                  ignore_threshold=None,
                                                                  prior_scaling=[0.1, 0.1, 0.2, 0.2])
        decode_fn = lambda pred: anchor_encoder_decoder.ext_decode_all_anchors(
            pred, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)

        with tf.variable_scope(FLAGS.model_scope, default_name=None, values=[features], reuse=tf.AUTO_REUSE):
            backbone = ssd_net.VGG16Backbone(FLAGS.data_format)
            feature_layers = backbone.forward(features, training=False)
            location_pred, cls_pred = ssd_net.multibox_head(feature_layers, FLAGS.num_classes,
                                                            all_num_anchors_depth, data_format=FLAGS.data_format)
            if FLAGS.data_format == 'channels_first':
                cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred]
                location_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred]

            cls_pred = [tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred]
            location_pred = [tf.reshape(pred, [-1, 4]) for pred in location_pred]

            cls_pred = tf.concat(cls_pred, axis=0)
            location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = decode_fn(location_pred)
            bboxes_pred = tf.concat(bboxes_pred, axis=0)
            selected_bboxes, selected_scores = parse_by_class(cls_pred, bboxes_pred,
                                                              FLAGS.num_classes, FLAGS.select_threshold,
                                                              FLAGS.min_size, FLAGS.keep_topk,
                                                              FLAGS.nms_topk, FLAGS.nms_threshold)
            labels_list = []
            scores_list = []
            bboxes_list = []
            for k, v in selected_scores.items():
                labels_list.append(tf.ones_like(v, tf.int32) * k)
                scores_list.append(v)
                bboxes_list.append(selected_bboxes[k])
            all_labels = tf.concat(labels_list, axis=0)
            all_scores = tf.concat(scores_list, axis=0)
            all_bboxes = tf.concat(bboxes_list, axis=0)

        saver = tf.train.Saver()
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            saver.restore(sess, get_checkpoint())

            for i in range(video_frame_cnt):
                ret, img_ori = vid.read()
                # height_ori, width_ori = img_ori.shape[:2]
                # img = cv2.resize(img_ori, tuple(args.new_size))
                img = cv2.cvtColor(img_ori, cv2.COLOR_BGR2RGB)
                np_image = np.asarray(img, np.float32)

                start_time = time.time()
                labels_, scores_, bboxes_ = sess.run([all_labels, all_scores, all_bboxes],
                                                     feed_dict={image_input: np_image,
                                                                shape_input: np_image.shape[:-1]})
                end_time = time.time()

                img_to_draw = draw_toolbox.bboxes_draw_on_img(np_image, labels_, scores_, bboxes_, thickness=2)
                cv2.putText(img_to_draw, '{:.2f}ms'.format((end_time - start_time) * 1000),
                            (40, 40), 0, fontScale=1, color=(0, 255, 0), thickness=2)
                imsave('./test_out.jpg', img_to_draw)
                new_img = cv2.imread('./test_out.jpg')
                cv2.imshow('image', new_img)
                videoWriter.write(new_img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

            vid.release()
            videoWriter.release()

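# Setup sketch: the loop above expects `vid`, `video_frame_cnt`, and
# `videoWriter` to exist already. One way to create them with OpenCV; the
# file paths and codec below are placeholders.
vid = cv2.VideoCapture('./demo/input.mp4')
video_frame_cnt = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
video_fps = int(vid.get(cv2.CAP_PROP_FPS))
video_size = (int(vid.get(cv2.CAP_PROP_FRAME_WIDTH)),
              int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT)))
videoWriter = cv2.VideoWriter('./demo/output.mp4',
                              cv2.VideoWriter_fourcc(*'mp4v'),
                              video_fps, video_size)
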
def slim_get_split(file_pattern='{}_????'):
    # Features in WIDER FACE TFRecords.
    keys_to_features = {
        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
        'image/filename': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/height': tf.FixedLenFeature([1], tf.int64),
        'image/width': tf.FixedLenFeature([1], tf.int64),
        'image/channels': tf.FixedLenFeature([1], tf.int64),
        'image/shape': tf.FixedLenFeature([3], tf.int64),
        'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/blur': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/expression': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/illumination': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/invalid': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/occlusion': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/pose': tf.VarLenFeature(dtype=tf.int64),
    }
    items_to_handlers = {
        'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'),
        'filename': slim.tfexample_decoder.Tensor('image/filename'),
        'shape': slim.tfexample_decoder.Tensor('image/shape'),
        'object/bbox': slim.tfexample_decoder.BoundingBox(['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),
        'object/blur': slim.tfexample_decoder.Tensor('image/object/bbox/blur'),
        'object/expression': slim.tfexample_decoder.Tensor('image/object/bbox/expression'),
        'object/illumination': slim.tfexample_decoder.Tensor('image/object/bbox/illumination'),
        'object/invalid': slim.tfexample_decoder.Tensor('image/object/bbox/invalid'),
        'object/occlusion': slim.tfexample_decoder.Tensor('image/object/bbox/occlusion'),
        'object/pose': slim.tfexample_decoder.Tensor('image/object/bbox/pose'),
    }
    decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features, items_to_handlers)

    dataset = slim.dataset.Dataset(data_sources=file_pattern,
                                   reader=tf.TFRecordReader,
                                   decoder=decoder,
                                   num_samples=100,
                                   items_to_descriptions=None,
                                   num_classes=21,
                                   labels_to_names=None)

    with tf.name_scope('dataset_data_provider'):
        provider = slim.dataset_data_provider.DatasetDataProvider(dataset,
                                                                  num_readers=2,
                                                                  common_queue_capacity=32,
                                                                  common_queue_min=8,
                                                                  shuffle=True,
                                                                  num_epochs=1)
    [org_image, filename, shape,
     g_bboxes, g_blur, g_expression,
     g_illumination, g_invalid, g_occlusion, g_pose] = provider.get(
        ['image', 'filename', 'shape', 'object/bbox', 'object/blur', 'object/expression',
         'object/illumination', 'object/invalid', 'object/occlusion', 'object/pose'])

    # keep only the boxes that are not flagged invalid
    isinvalid_mask = g_invalid < 1
    g_bboxes = tf.boolean_mask(g_bboxes, isinvalid_mask)
    g_blur = tf.boolean_mask(g_blur, isinvalid_mask)
    g_expression = tf.boolean_mask(g_expression, isinvalid_mask)
    g_illumination = tf.boolean_mask(g_illumination, isinvalid_mask)
    g_invalid = tf.boolean_mask(g_invalid, isinvalid_mask)
    g_occlusion = tf.boolean_mask(g_occlusion, isinvalid_mask)
    g_pose = tf.boolean_mask(g_pose, isinvalid_mask)

    image, gbboxes = dan_preprocessing.preprocess_image(org_image, g_bboxes, [640, 640],
                                                        is_training=True,
                                                        data_format='channels_last',
                                                        output_rgb=True)
    # gbboxes = tf.boolean_mask(gbboxes, small_mask)
    # g_blur = tf.boolean_mask(g_blur, small_mask)
    # g_expression = tf.boolean_mask(g_expression, small_mask)
    # g_illumination = tf.boolean_mask(g_illumination, small_mask)
    # g_invalid = tf.boolean_mask(g_invalid, small_mask)
    # g_occlusion = tf.boolean_mask(g_occlusion, small_mask)
    # g_pose = tf.boolean_mask(g_pose, small_mask)

    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(positive_threshold=0.5,
                                                              ignore_threshold=0.5,
                                                              prior_scaling=[0.1, 0.1, 0.2, 0.2])

    all_anchor_scales = [(16.,), (32.,), (64.,), (128.,), (256.,), (512.,)]
    all_extra_scales = [(), (), (), (), (), ()]
    all_anchor_ratios = [(1.,), (1.,), (1.,), (1.,), (1.,), (1.,)]
    all_layer_shapes = [(160, 160), (80, 80), (40, 40), (20, 20), (10, 10), (5, 5)]
    all_layer_strides = [4, 8, 16, 32, 64, 128]
    offset_list = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
    total_layers = len(all_layer_shapes)

    anchors_height = list()
    anchors_width = list()
    anchors_depth = list()
    for ind in range(total_layers):
        _anchors_height, _anchors_width, _anchor_depth = anchor_encoder_decoder.get_anchors_width_height(
            all_anchor_scales[ind], all_extra_scales[ind], all_anchor_ratios[ind])
        anchors_height.append(_anchors_height)
        anchors_width.append(_anchors_width)
        anchors_depth.append(_anchor_depth)

    anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax, inside_mask = anchor_encoder_decoder.get_all_anchors(
        [640] * 2, anchors_height, anchors_width, anchors_depth,
        offset_list, all_layer_shapes, all_layer_strides,
        [640.] * total_layers, [False] * total_layers)

    gt_targets, gt_labels, gt_scores, _ = anchor_encoder_decoder.encode_anchors(
        gbboxes, anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax, inside_mask, debug=True)

    num_anchors_per_layer = list()
    for ind, layer_shape in enumerate(all_layer_shapes):
        _, _num_anchors_per_layer = anchor_encoder_decoder.get_anchors_count(anchors_depth[ind], layer_shape)
        num_anchors_per_layer.append(_num_anchors_per_layer)

    # split by layers
    all_anchors = tf.stack([anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax], axis=-1)
    gt_targets, gt_labels, gt_scores, anchors = tf.split(gt_targets, num_anchors_per_layer, axis=0), \
                                                tf.split(gt_labels, num_anchors_per_layer, axis=0), \
                                                tf.split(gt_scores, num_anchors_per_layer, axis=0), \
                                                tf.split(all_anchors, num_anchors_per_layer, axis=0)

    save_image_op = tf.py_func(save_image_with_bbox,
                               [dan_preprocessing.unwhiten_image(image),
                                tf.clip_by_value(tf.concat(gt_labels, axis=0), 0, tf.int64.max),
                                tf.concat(gt_scores, axis=0),
                                tf.concat(gt_targets, axis=0)],
                               tf.int64, stateful=True)
    return save_image_op

def slim_get_split(file_pattern='{}_????'):
    # Features in Pascal VOC TFRecords.
    keys_to_features = {
        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
        'image/height': tf.FixedLenFeature([1], tf.int64),
        'image/width': tf.FixedLenFeature([1], tf.int64),
        'image/channels': tf.FixedLenFeature([1], tf.int64),
        'image/shape': tf.FixedLenFeature([3], tf.int64),
        'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64),
    }
    items_to_handlers = {
        'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'),
        'shape': slim.tfexample_decoder.Tensor('image/shape'),
        'object/bbox': slim.tfexample_decoder.BoundingBox(['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),
        'object/label': slim.tfexample_decoder.Tensor('image/object/bbox/label'),
        'object/difficult': slim.tfexample_decoder.Tensor('image/object/bbox/difficult'),
        'object/truncated': slim.tfexample_decoder.Tensor('image/object/bbox/truncated'),
    }
    decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features, items_to_handlers)

    dataset = slim.dataset.Dataset(data_sources=file_pattern,
                                   reader=tf.TFRecordReader,
                                   decoder=decoder,
                                   num_samples=100,
                                   items_to_descriptions=None,
                                   num_classes=21,
                                   labels_to_names=None)

    with tf.name_scope('dataset_data_provider'):
        provider = slim.dataset_data_provider.DatasetDataProvider(dataset,
                                                                  num_readers=2,
                                                                  common_queue_capacity=32,
                                                                  common_queue_min=8,
                                                                  shuffle=True,
                                                                  num_epochs=1)
    [org_image, shape, glabels_raw, gbboxes_raw, isdifficult] = provider.get(
        ['image', 'shape', 'object/label', 'object/bbox', 'object/difficult'])

    image, glabels, gbboxes = ssd_preprocessing.preprocess_image(
        org_image, glabels_raw, gbboxes_raw, [300, 300],
        is_training=True, data_format='channels_last', output_rgb=True)

    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(positive_threshold=0.5,
                                                              ignore_threshold=0.5,
                                                              prior_scaling=[0.1, 0.1, 0.2, 0.2])

    all_anchor_scales = [(30.,), (60.,), (112.5,), (165.,), (217.5,), (270.,)]
    all_extra_scales = [(42.43,), (82.17,), (136.23,), (189.45,), (242.34,), (295.08,)]
    all_anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
                         (2., 3., .5, 0.3333), (2., .5), (2., .5)]
    all_layer_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
    all_layer_strides = [8, 16, 32, 64, 100, 300]
    total_layers = len(all_layer_shapes)

    anchors_height = list()
    anchors_width = list()
    anchors_depth = list()
    for ind in range(total_layers):
        _anchors_height, _anchors_width, _anchor_depth = anchor_encoder_decoder.get_anchors_width_height(
            all_anchor_scales[ind], all_extra_scales[ind], all_anchor_ratios[ind])
        anchors_height.append(_anchors_height)
        anchors_width.append(_anchors_width)
        anchors_depth.append(_anchor_depth)

    anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax, inside_mask = anchor_encoder_decoder.get_all_anchors(
        [300] * 2, anchors_height, anchors_width, anchors_depth,
        [0.5] * total_layers, all_layer_shapes, all_layer_strides,
        [300.] * total_layers, [False] * total_layers)

    gt_targets, gt_labels, gt_scores = anchor_encoder_decoder.encode_anchors(
        glabels, gbboxes, anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax, inside_mask, True)

    num_anchors_per_layer = list()
    for ind, layer_shape in enumerate(all_layer_shapes):
        _, _num_anchors_per_layer = anchor_encoder_decoder.get_anchors_count(anchors_depth[ind], layer_shape)
        num_anchors_per_layer.append(_num_anchors_per_layer)

    # split by layers
    all_anchors = tf.stack([anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax], axis=-1)
    gt_targets, gt_labels, gt_scores, anchors = tf.split(gt_targets, num_anchors_per_layer, axis=0), \
                                                tf.split(gt_labels, num_anchors_per_layer, axis=0), \
                                                tf.split(gt_scores, num_anchors_per_layer, axis=0), \
                                                tf.split(all_anchors, num_anchors_per_layer, axis=0)

    save_image_op = tf.py_func(save_image_with_bbox,
                               [ssd_preprocessing.unwhiten_image(image),
                                tf.clip_by_value(tf.concat(gt_labels, axis=0), 0, tf.int64.max),
                                tf.concat(gt_scores, axis=0),
                                tf.concat(gt_targets, axis=0)],
                               tf.int64, stateful=True)
    return save_image_op

        anchor_scales=[(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
        extra_anchor_scales=[(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)],
        anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                       (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
        #anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
        #                 (2., 3., .5, 0.3333), (2., .5), (2., .5)],
        layer_steps=[8, 16, 32, 64, 100, 300])
    all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders=[1.0] * 6,
                                                              positive_threshold=None,
                                                              ignore_threshold=None,
                                                              prior_scaling=[0.1, 0.1, 0.2, 0.2])

    def decode_fn(pred):
        return anchor_encoder_decoder.ext_decode_all_anchors(pred, all_anchors,
                                                             all_num_anchors_depth,
                                                             all_num_anchors_spatial)

    with tf.name_scope('g2_cls_pred'):
        g2_cls_pred = tf.placeholder(tf.float32, shape=(8732, 21), name='g2_cls_pred')
    with tf.name_scope('g2_location_pred'):
        g2_location_pred = tf.placeholder(tf.float32, shape=(8732, 4),
                                          name='g2_location_pred')
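# The fixed (8732, 21) and (8732, 4) placeholder shapes above come from the
# total SSD300 anchor count: per-location depths of 4, 6, 6, 6, 4, 4 (the
# ratio count plus one extra scale per layer) over the six feature-map grids.
# A quick self-contained check:
layer_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
layer_depths = [4, 6, 6, 6, 4, 4]
assert sum(h * w * d for (h, w), d in zip(layer_shapes, layer_depths)) == 8732
# 38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4 = 8732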
def ssd_model_fn(features, labels, mode, params):
    """model_fn for SSD to be used with our Estimator."""
    filename = features['filename']
    filename = tf.identity(filename, name='filename')
    shape = features['shape']
    output_shape = features['output_shape']
    features = features['image']

    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(positive_threshold=None,
                                                              ignore_threshold=None,
                                                              prior_scaling=[0.1, 0.1, 0.2, 0.2])

    all_anchor_scales = [(30.,), (60.,), (112.5,), (165.,), (217.5,), (270.,)]
    all_extra_scales = [(42.43,), (82.17,), (136.23,), (189.45,), (242.34,), (295.08,)]
    all_anchor_ratios = [(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                         (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)]
    #all_anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
    #                     (2., 3., .5, 0.3333), (2., .5), (2., .5)]

    with tf.variable_scope(params['model_scope'], default_name=None, values=[features], reuse=tf.AUTO_REUSE):
        backbone = ssd_net.VGG16Backbone(params['data_format'])
        # forward features
        feature_layers = backbone.forward(features, training=(mode == tf.estimator.ModeKeys.TRAIN))
        # generate anchors according to the feature map size
        with tf.device('/cpu:0'):
            if params['data_format'] == 'channels_first':
                all_layer_shapes = [tf.shape(feat)[2:] for feat in feature_layers]
            else:
                all_layer_shapes = [tf.shape(feat)[1:3] for feat in feature_layers]
            all_layer_strides = [8, 16, 32, 64, 100, 300]
            total_layers = len(all_layer_shapes)
            anchors_height = list()
            anchors_width = list()
            anchors_depth = list()
            for ind in range(total_layers):
                _anchors_height, _anchors_width, _anchor_depth = anchor_encoder_decoder.get_anchors_width_height(
                    all_anchor_scales[ind], all_extra_scales[ind], all_anchor_ratios[ind],
                    name='get_anchors_width_height{}'.format(ind))
                anchors_height.append(_anchors_height)
                anchors_width.append(_anchors_width)
                anchors_depth.append(_anchor_depth)
            anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax, _ = anchor_encoder_decoder.get_all_anchors(
                tf.squeeze(output_shape, axis=0), anchors_height, anchors_width, anchors_depth,
                [0.5] * total_layers, all_layer_shapes, all_layer_strides,
                [0.] * total_layers, [False] * total_layers)
        # generate predictions based on anchors
        location_pred, cls_pred = ssd_net.multibox_head(feature_layers, params['num_classes'],
                                                        anchors_depth, data_format=params['data_format'])
        if params['data_format'] == 'channels_first':
            cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred]
            location_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred]

        cls_pred = [tf.reshape(pred, [tf.shape(features)[0], -1, params['num_classes']]) for pred in cls_pred]
        location_pred = [tf.reshape(pred, [tf.shape(features)[0], -1, 4]) for pred in location_pred]

        cls_pred = tf.concat(cls_pred, axis=1)
        location_pred = tf.concat(location_pred, axis=1)

        cls_pred = tf.reshape(cls_pred, [-1, params['num_classes']])
        location_pred = tf.reshape(location_pred, [-1, 4])

    # decode predictions
    with tf.device('/cpu:0'):
        bboxes_pred = anchor_encoder_decoder.decode_anchors(location_pred, anchors_ymin, anchors_xmin,
                                                            anchors_ymax, anchors_xmax)
        selected_bboxes, selected_scores = bbox_util.parse_by_class(tf.squeeze(output_shape, axis=0),
                                                                    cls_pred, bboxes_pred,
                                                                    params['num_classes'],
                                                                    params['select_threshold'],
                                                                    params['min_size'],
                                                                    params['keep_topk'],
                                                                    params['nms_topk'],
                                                                    params['nms_threshold'])

    labels_list = []
    scores_list = []
    bboxes_list = []
    for k, v in selected_scores.items():
        labels_list.append(tf.ones_like(v, tf.int32) * k)
        scores_list.append(v)
        bboxes_list.append(selected_bboxes[k])
    all_labels = tf.concat(labels_list, axis=0)
    all_scores = tf.concat(scores_list, axis=0)
    all_bboxes = tf.concat(bboxes_list, axis=0)

    save_image_op = tf.py_func(save_image_with_bbox,
                               [ssd_preprocessing.unwhiten_image(tf.squeeze(features, axis=0), output_rgb=False),
                                all_labels * tf.to_int32(all_scores > 0.3),
                                all_scores, all_bboxes],
                               tf.int64, stateful=True)
    tf.identity(save_image_op, name='save_image_op')

    predictions = {'filename': filename, 'shape': shape, 'output_shape': output_shape}
    for class_ind in range(1, params['num_classes']):
        predictions['scores_{}'.format(class_ind)] = tf.expand_dims(selected_scores[class_ind], axis=0)
        predictions['bboxes_{}'.format(class_ind)] = tf.expand_dims(selected_bboxes[class_ind], axis=0)

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions,
                                          prediction_hooks=None,
                                          loss=None, train_op=None)
    else:
        raise ValueError('This script only supports "PREDICT" mode!')
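# A hedged sketch of wiring ssd_model_fn into an Estimator for prediction.
# Every value in params below is illustrative, and predict_input_fn is a
# hypothetical input_fn yielding the filename/shape/output_shape/image
# features this model_fn expects.
ssd_detector = tf.estimator.Estimator(
    model_fn=ssd_model_fn,
    model_dir='./logs',  # assumed checkpoint directory
    params={'model_scope': 'ssd300', 'data_format': 'channels_last',
            'num_classes': 21, 'select_threshold': 0.2, 'min_size': 0.03,
            'keep_topk': 200, 'nms_topk': 20, 'nms_threshold': 0.45})
for pred in ssd_detector.predict(input_fn=predict_input_fn,
                                 yield_single_examples=False):
    print(pred['filename'], pred['shape'])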
def main(_):
    with tf.Graph().as_default():
        out_shape = [FLAGS.train_image_size] * 2
        anchor_creator = anchor_manipulator.AnchorCreator(out_shape,
            layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
            anchor_scales=[(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
            extra_anchor_scales=[(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)],
            anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                           (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
            #anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
            #                 (2., 3., .5, 0.3333), (2., .5), (2., .5)],
            layer_steps=[8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders=[1.0] * 6,
                                                                  positive_threshold=None,
                                                                  ignore_threshold=None,
                                                                  prior_scaling=[0.1, 0.1, 0.2, 0.2])

        def decode_fn(pred):
            return anchor_encoder_decoder.ext_decode_all_anchors(pred, all_anchors,
                                                                 all_num_anchors_depth,
                                                                 all_num_anchors_spatial)

        with tf.name_scope('define_input'):
            image_input = tf.placeholder(tf.float32, shape=(1, 300, 300, 3), name='image_input')
        print('image_input', image_input)

        with tf.variable_scope(FLAGS.model_scope, default_name=None, values=[image_input], reuse=tf.AUTO_REUSE):
            backbone = ssd_net.VGG16Backbone(FLAGS.data_format)
            feature_layers = backbone.forward(image_input, training=False)
            location_pred, cls_pred = ssd_net.multibox_head(feature_layers, FLAGS.num_classes,
                                                            all_num_anchors_depth,
                                                            data_format=FLAGS.data_format)
            if FLAGS.data_format == 'channels_first':
                cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred]
                location_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred]

            cls_pred = [tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred]
            location_pred = [tf.reshape(pred, [-1, 4]) for pred in location_pred]

            with tf.variable_scope('cls_pred'):
                cls_pred = tf.concat(cls_pred, axis=0)
            with tf.variable_scope('location_pred'):
                location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = decode_fn(location_pred)
            bboxes_pred = tf.concat(bboxes_pred, axis=0)
            selected_bboxes, selected_scores = parse_by_class(cls_pred, bboxes_pred,
                                                              FLAGS.num_classes, FLAGS.select_threshold,
                                                              FLAGS.min_size, FLAGS.keep_topk,
                                                              FLAGS.nms_topk, FLAGS.nms_threshold)
            labels_list = []
            scores_list = []
            bboxes_list = []
            for k, v in selected_scores.items():
                labels_list.append(tf.ones_like(v, tf.int32) * k)
                scores_list.append(v)
                bboxes_list.append(selected_bboxes[k])
            all_labels = tf.concat(labels_list, axis=0)
            all_scores = tf.concat(scores_list, axis=0)
            all_bboxes = tf.concat(bboxes_list, axis=0)

        saver = tf.train.Saver()
        '''
        config = tf.ConfigProto(allow_soft_placement=True,
                                inter_op_parallelism_threads=1,
                                intra_op_parallelism_threads=1)
        config.mlu_options.data_parallelism = 1
        config.mlu_options.model_parallelism = 1
        config.mlu_options.core_num = 1
        config.mlu_options.core_version = 'MLU270'
        config.mlu_options.precision = 'float'
        with tf.Session(config = config) as sess:
        '''
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            saver.restore(sess, get_checkpoint())

            _R_MEAN = 123.68
            _G_MEAN = 116.78
            _B_MEAN = 103.94
            means = [_B_MEAN, _G_MEAN, _R_MEAN]
            np_image = cv2.imread('demo/test.jpg')
            image = cv2.resize(np_image, (FLAGS.train_image_size, FLAGS.train_image_size))
            image = (image - means)  # / 255.0
            image = np.expand_dims(image, axis=0)
            print('image', type(image), image.shape)
            '''
            image = tf.to_float(np_image)
            image = tf.image.resize_images(image, out_shape,
                                           method=tf.image.ResizeMethod.BILINEAR,
                                           align_corners=False)
            image.set_shape(out_shape + [3])
            num_channels = image.get_shape().as_list()[-1]
            channels = tf.split(axis=2, num_or_size_splits=num_channels, value=image)
            for i in range(num_channels):
                channels[i] -= means[i]
            image = tf.concat(axis=2, values=channels)
            image_channels = tf.unstack(image, axis=-1, name='split_rgb')
            image = tf.stack([image_channels[2], image_channels[1], image_channels[0]],
                             axis=-1, name='merge_bgr')
            '''
            labels_, scores_, bboxes_ = sess.run([all_labels, all_scores, all_bboxes],
                                                 feed_dict={image_input: image})
            #print('labels_', labels_, type(labels_), labels_.shape)
            #print('scores_', scores_, type(scores_), scores_.shape)
            #print('bboxes_', bboxes_, type(bboxes_), bboxes_.shape, bboxes_.shape[0])
            img_to_draw = draw_toolbox.bboxes_draw_on_img(np_image, labels_, scores_, bboxes_, thickness=2)
            cv2.imwrite('demo/test_out.jpg', img_to_draw)
            saver.save(sess, 'model/ssd300_vgg16/ssd300_vgg16_short', global_step=0)
def main(_):
    with tf.Graph().as_default():
        out_shape = [FLAGS.train_image_size] * 2

        image_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
        # note: shape_input is fed below but not consumed by any op in this snippet
        shape_input = tf.placeholder(tf.int32, shape=(2,))

        features = ssd_preprocessing.preprocess_for_eval(image_input, out_shape,
                                                         data_format=FLAGS.data_format,
                                                         output_rgb=False)
        features = tf.expand_dims(features, axis=0)

        anchor_creator = anchor_manipulator.AnchorCreator(out_shape,
            layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
            anchor_scales=[(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
            extra_anchor_scales=[(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)],
            anchor_ratios=[(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
                           (2., 3., .5, 0.3333), (2., .5), (2., .5)],
            layer_steps=[8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders=[1.0] * 6,
                                                                  positive_threshold=None,
                                                                  ignore_threshold=None,
                                                                  prior_scaling=[0.1, 0.1, 0.2, 0.2])

        decode_fn = lambda pred: anchor_encoder_decoder.ext_decode_all_anchors(pred, all_anchors,
                                                                               all_num_anchors_depth,
                                                                               all_num_anchors_spatial)

        with tf.variable_scope(FLAGS.model_scope, default_name=None, values=[features], reuse=tf.AUTO_REUSE):
            backbone = ssd_net.VGG16Backbone(FLAGS.data_format)
            feature_layers = backbone.forward(features, training=False)
            location_pred, cls_pred = ssd_net.multibox_head(feature_layers, FLAGS.num_classes,
                                                            all_num_anchors_depth,
                                                            data_format=FLAGS.data_format)
            if FLAGS.data_format == 'channels_first':
                cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred]
                location_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred]

            cls_pred = [tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred]
            location_pred = [tf.reshape(pred, [-1, 4]) for pred in location_pred]

            cls_pred = tf.concat(cls_pred, axis=0)
            location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = decode_fn(location_pred)
            bboxes_pred = tf.concat(bboxes_pred, axis=0)
            selected_bboxes, selected_scores = parse_by_class(cls_pred, bboxes_pred,
                                                              FLAGS.num_classes, FLAGS.select_threshold,
                                                              FLAGS.min_size, FLAGS.keep_topk,
                                                              FLAGS.nms_topk, FLAGS.nms_threshold)
            labels_list = []
            scores_list = []
            bboxes_list = []
            for k, v in selected_scores.items():
                labels_list.append(tf.ones_like(v, tf.int32) * k)
                scores_list.append(v)
                bboxes_list.append(selected_bboxes[k])
            all_labels = tf.concat(labels_list, axis=0)
            all_scores = tf.concat(scores_list, axis=0)
            all_bboxes = tf.concat(bboxes_list, axis=0)

        saver = tf.train.Saver()
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            saver.restore(sess, get_checkpoint())

            np_image = imread('./demo/test.jpg')
            labels_, scores_, bboxes_ = sess.run([all_labels, all_scores, all_bboxes],
                                                 feed_dict={image_input: np_image,
                                                            shape_input: np_image.shape[:-1]})
            img_to_draw = draw_toolbox.bboxes_draw_on_img(np_image, labels_, scores_, bboxes_, thickness=2)
            imsave('./demo/test_out.jpg', img_to_draw)
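# imread/imsave above are presumably the old scipy.misc helpers, which were
# removed in SciPy >= 1.2; if so, imageio offers near drop-in replacements.
# This import shim is an assumption about the (unshown) import block:
try:
    from scipy.misc import imread, imsave
except ImportError:
    from imageio import imread, imsave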
def input_fn():
    # train_image_size = 300 -> target_shape = [300, 300]
    target_shape = [FLAGS.train_image_size] * 2

    # match_threshold: 0.5, neg_threshold: 0.5
    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(positive_threshold=FLAGS.match_threshold,
                                                              ignore_threshold=FLAGS.neg_threshold,
                                                              prior_scaling=[0.1, 0.1, 0.2, 0.2])

    all_anchor_scales = [(30.,), (60.,), (112.5,), (165.,), (217.5,), (270.,)]
    all_extra_scales = [(42.43,), (82.17,), (136.23,), (189.45,), (242.34,), (295.08,)]
    all_anchor_ratios = [(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                         (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)]
    all_layer_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
    all_layer_strides = [8, 16, 32, 64, 100, 300]
    total_layers = len(all_layer_shapes)
    anchors_height = list()
    anchors_width = list()
    anchors_depth = list()
    for ind in range(total_layers):
        # if this layer has n default prior boxes, anchors_height holds the
        # heights of those boxes and _anchor_depth is n
        _anchors_height, _anchors_width, _anchor_depth = anchor_encoder_decoder.get_anchors_width_height(
            all_anchor_scales[ind], all_extra_scales[ind], all_anchor_ratios[ind],
            name='get_anchors_width_height{}'.format(ind))
        anchors_height.append(_anchors_height)
        anchors_width.append(_anchors_width)
        anchors_depth.append(_anchor_depth)
    # anchors_ymin has length 38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4
    anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax, inside_mask = anchor_encoder_decoder.get_all_anchors(
        target_shape, anchors_height, anchors_width, anchors_depth,
        [0.5] * total_layers, all_layer_shapes, all_layer_strides,
        [FLAGS.train_image_size * 1.] * total_layers, [False] * total_layers)

    num_anchors_per_layer = list()
    # all_layer_shapes: [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
    for ind, layer_shape in enumerate(all_layer_shapes):
        # _num_anchors_per_layer = layer_shape[0] * layer_shape[1] * anchors_depth[ind]
        _, _num_anchors_per_layer = anchor_encoder_decoder.get_anchors_count(anchors_depth[ind], layer_shape,
                                                                             name='get_anchor_count{}'.format(ind))
        num_anchors_per_layer.append(_num_anchors_per_layer)
    # num_anchors_per_layer: [38*38*4, 19*19*6, 10*10*6, 5*5*6, 3*3*4, 1*1*4]

    image_preprocessing_fn = lambda image_, labels_, bboxes_: ssd_preprocessing.preprocess_image(
        image_, labels_, bboxes_, target_shape, is_training=is_training,
        data_format=FLAGS.data_format, output_rgb=False)
    anchor_encoder_fn = lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_anchors(
        glabels_, gbboxes_, anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax, inside_mask)

    image, _, shape, loc_targets, cls_targets, match_scores = dataset_common.slim_get_batch(
        FLAGS.num_classes, batch_size, ('train' if is_training else 'val'),
        os.path.join(FLAGS.data_dir, dataset_pattern),
        FLAGS.num_readers, FLAGS.num_preprocessing_threads,
        image_preprocessing_fn, anchor_encoder_fn,
        num_epochs=FLAGS.train_epochs, is_training=is_training)

    global global_anchor_info
    global_anchor_info = {'decode_fn': lambda pred: anchor_encoder_decoder.batch_decode_anchors(
                              pred, anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax),
                          'num_anchors_per_layer': num_anchors_per_layer,
                          'all_num_anchors_depth': anchors_depth}

    return image, {'shape': shape, 'loc_targets': loc_targets,
                   'cls_targets': cls_targets, 'match_scores': match_scores}
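# input_fn closes over is_training, batch_size, and dataset_pattern, none of
# which are defined in the snippet; presumably it is built by a small factory.
# A hedged sketch of that wrapper and its use (names and values assumed):
def input_pipeline(dataset_pattern='train-*', is_training=True, batch_size=32):
    def input_fn():
        ...  # body exactly as above, closing over the three arguments
    return input_fn

# estimator.train(input_fn=input_pipeline('train-*', True, FLAGS.batch_size),
#                 max_steps=FLAGS.max_number_of_steps)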