Example #1
    def input_fn():
        with tf.name_scope('post_forward'):
            out_shape = [FLAGS.train_image_size] * 2
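            # SSD300-style anchor layout: six feature maps from 38x38 down to
            # 1x1, each with one base scale plus one extra (geometric-mean)
            # scale and a small set of aspect ratios.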
            anchor_creator = anchor_manipulator.AnchorCreator(
                out_shape,
                layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3),
                               (1, 1)],
                anchor_scales=[(0.1, ), (0.2, ), (0.375, ), (0.55, ),
                               (0.725, ), (0.9, )],
                extra_anchor_scales=[(0.1414, ), (0.2739, ), (0.4541, ),
                                     (0.6315, ), (0.8078, ), (0.9836, )],
                anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333),
                               (1., 2., 3., .5, 0.3333),
                               (1., 2., 3., .5, 0.3333), (1., 2., .5),
                               (1., 2., .5)],
                layer_steps=[8, 16, 32, 64, 100, 300])
            all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

            num_anchors_per_layer = []
            for ind in range(len(all_anchors)):
                num_anchors_per_layer.append(all_num_anchors_depth[ind] *
                                             all_num_anchors_spatial[ind])

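            # The encoder matches anchors to ground truth by IoU; prior_scaling
            # holds the variance factors applied to the encoded box offsets.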
            anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
                allowed_borders=[1.0] * 6,
                positive_threshold=FLAGS.match_threshold,
                ignore_threshold=FLAGS.neg_threshold,
                prior_scaling=[0.1, 0.1, 0.2, 0.2])

            #             global global_anchor_info
            #             global_anchor_info = {'decode_fn': lambda pred : anchor_encoder_decoder.decode_all_anchors(pred, num_anchors_per_layer),
            #                                 'num_anchors_per_layer': num_anchors_per_layer,
            #                                 'all_num_anchors_depth': all_num_anchors_depth,
            #                                 'encode_fn': lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(glabels_, gbboxes_, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)}

            image_preprocessing_fn = lambda image_, labels_, bboxes_: ssd_preprocessing.preprocess_image(
                image_,
                labels_,
                bboxes_,
                out_shape,
                is_training=is_training,
                data_format=FLAGS.data_format,
                output_rgb=False)

            #             anchor_encoder_fn = lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(glabels_, gbboxes_, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)

            # NOTE: `filenames` and `arga` are unused below; the dataset reads
            # `training_files`, which is defined outside this snippet.
            filenames = tf.placeholder(tf.string, shape=[None])
            arga = tf.constant(True)

            dataset = tf.data.TFRecordDataset(training_files)
            dataset = dataset.map(lambda x: data_mapping_fn(
                x, is_training, image_preprocessing_fn))
            dataset = dataset.repeat()  # repeat the input infinitely
            dataset = dataset.batch(batch_size)  # set the batch size
            iterator = dataset.make_initializable_iterator()  # unused: the dataset itself is returned

            # return image, {'shape': shape, 'loc_targets': loc_targets, 'cls_targets': cls_targets, 'match_scores': match_scores}
            return dataset
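
A minimal consumption sketch (not part of the original examples; it assumes the TF 1.x tf.estimator API, and the model_fn name is a placeholder):

    # Hypothetical usage: Estimator.train can consume the batched, repeating
    # tf.data.Dataset returned by input_fn directly.
    ssd_detector = tf.estimator.Estimator(model_fn=my_model_fn, model_dir='./logs')
    ssd_detector.train(input_fn=input_fn, steps=1000)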
Example #2
    def input_fn():
        target_shape = [FLAGS.train_image_size] * 2

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(positive_threshold = FLAGS.match_threshold,
                                                        ignore_threshold = FLAGS.neg_threshold,
                                                        prior_scaling=[0.1, 0.1, 0.2, 0.2])

        all_anchor_scales = [(16.,), (32.,), (64.,), (128.,), (256.,), (512.,)]
        all_extra_scales = [(), (), (), (), (), ()]
        all_anchor_ratios = [(1.,), (1.,), (1.,), (1.,), (1.,), (1.,)]
        all_layer_shapes = [(160, 160), (80, 80), (40, 40), (20, 20), (10, 10), (5, 5)]
        all_layer_strides = [4, 8, 16, 32, 64, 128]
        offset_list = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
        total_layers = len(all_layer_shapes)
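        # Derive per-layer anchor widths/heights from the scale/ratio settings,
        # then materialize corner coordinates (plus an inside-image mask) for
        # every anchor position.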
        anchors_height = list()
        anchors_width = list()
        anchors_depth = list()
        for ind in range(total_layers):
            _anchors_height, _anchors_width, _anchor_depth = anchor_encoder_decoder.get_anchors_width_height(all_anchor_scales[ind], all_extra_scales[ind], all_anchor_ratios[ind], name='get_anchors_width_height{}'.format(ind))
            anchors_height.append(_anchors_height)
            anchors_width.append(_anchors_width)
            anchors_depth.append(_anchor_depth)
        anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax, inside_mask = anchor_encoder_decoder.get_all_anchors(target_shape, anchors_height, anchors_width, anchors_depth,
                                                                        offset_list, all_layer_shapes, all_layer_strides,
                                                                        [FLAGS.train_image_size * 1.] * total_layers, [False] * total_layers)

        num_anchors_per_layer = list()
        for ind, layer_shape in enumerate(all_layer_shapes):
            _, _num_anchors_per_layer = anchor_encoder_decoder.get_anchors_count(anchors_depth[ind], layer_shape, name='get_anchor_count{}'.format(ind))
            num_anchors_per_layer.append(_num_anchors_per_layer)

        image_preprocessing_fn = lambda image_, bboxes_ : sfd_preprocessing.preprocess_image(image_, bboxes_, target_shape, is_training=is_training, data_format=FLAGS.data_format, output_rgb=False)
        anchor_encoder_fn = lambda gbboxes_: anchor_encoder_decoder.encode_anchors(gbboxes_, anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax, inside_mask, match_mining=True)

        image, filename, shape, loc_targets, cls_targets, match_scores, _ = dataset_common.slim_get_batch(FLAGS.num_classes,
                                                                                batch_size,
                                                                                ('train' if is_training else 'valid'),
                                                                                os.path.join(FLAGS.data_dir, dataset_pattern),
                                                                                FLAGS.num_readers,
                                                                                FLAGS.num_preprocessing_threads,
                                                                                image_preprocessing_fn,
                                                                                anchor_encoder_fn,
                                                                                num_epochs=FLAGS.train_epochs,
                                                                                is_training=is_training)
        global global_anchor_info
        global_anchor_info = {'decode_fn': lambda pred : anchor_encoder_decoder.batch_decode_anchors(pred, anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax),
                            'num_anchors_per_layer': num_anchors_per_layer,
                            'all_num_anchors_depth': anchors_depth }

        return image, {'filename': filename, 'shape': shape, 'loc_targets': loc_targets, 'cls_targets': cls_targets, 'match_scores': match_scores}
Example #3
    def input_fn():
        out_shape = [FLAGS.train_image_size] * 2
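        # Two predefined anchor configurations: SSD300 uses six feature maps
        # (38x38 down to 1x1), SSD512 adds a seventh level with its own scales
        # and steps; the choice below follows FLAGS.train_image_size.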
        ssd300_anchor_params = {'layers_shapes': [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)], 'anchor_scales': [(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
                                                    'extra_anchor_scales': [(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)],
                                                    'anchor_ratios': [(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
                                                    'layer_steps': [8, 16, 32, 64, 100, 300]}
        ssd512_anchor_params = {'layers_shapes': [(64, 64), (32, 32), (16, 16), (8, 8), (4, 4), (2, 2), (1, 1)],
                                                    'anchor_scales': [(0.07,), (0.15,), (0.3,), (0.45,), (0.6,), (0.75,), (0.9,)],
                                                    'extra_anchor_scales': [(0.1025,), (0.2121,), (0.3674,), (0.5196,), (0.6708,), (0.8216,), (0.9721,)],
                                                    'anchor_ratios': [(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
                                                    'layer_steps': [8, 16, 32, 64, 128, 256, 512]}
        if FLAGS.train_image_size == 512:
            net_params = ssd512_anchor_params
            print('using ssd512 model')
        else:
            net_params = ssd300_anchor_params
            print('using ssd300 model')
        anchor_creator = anchor_manipulator.AnchorCreator(out_shape, **net_params)
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

        num_anchors_per_layer = []
        for ind in range(len(all_anchors)):
            num_anchors_per_layer.append(all_num_anchors_depth[ind] * all_num_anchors_spatial[ind])

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders = [1.0] * len(net_params['layer_steps']),
                                                            positive_threshold = FLAGS.match_threshold,
                                                            ignore_threshold = FLAGS.neg_threshold,
                                                            prior_scaling=[0.1, 0.1, 0.2, 0.2])

        image_preprocessing_fn = lambda image_, labels_, bboxes_ : ssd_preprocessing.preprocess_image(image_, labels_, bboxes_, out_shape, is_training=is_training, data_format=FLAGS.data_format, output_rgb=False)
        anchor_encoder_fn = lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(glabels_, gbboxes_, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)

        image, filename, shape, loc_targets, cls_targets, match_scores = dataset_common.slim_get_batch(FLAGS.num_classes,
                                                                                batch_size,
                                                                                ('train' if is_training else 'val'),
                                                                                os.path.join(FLAGS.data_dir, dataset_pattern),
                                                                                FLAGS.num_readers,
                                                                                FLAGS.num_preprocessing_threads,
                                                                                image_preprocessing_fn,
                                                                                anchor_encoder_fn,
                                                                                num_epochs=FLAGS.train_epochs,
                                                                                is_training=is_training)
        global global_anchor_info
        global_anchor_info = {'decode_fn': lambda pred : anchor_encoder_decoder.decode_all_anchors(pred, num_anchors_per_layer),
                            'num_anchors_per_layer': num_anchors_per_layer,
                            'all_num_anchors_depth': all_num_anchors_depth }

        return {'image': image, 'filename': filename, 'shape': shape, 'loc_targets': loc_targets, 'cls_targets': cls_targets, 'match_scores': match_scores}, None
Example #4
    def input_fn():
        out_shape = [FLAGS.train_image_size] * 2
        anchor_creator = anchor_manipulator.AnchorCreator(out_shape,
                                                    layers_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
                                                    anchor_scales = [(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
                                                    extra_anchor_scales = [(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)],
                                                    anchor_ratios = [(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
                                                    layer_steps = [8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()
        # all_anchors: one list entry per layer, e.g. [[(38x38x1), (38x38x1), (4x1), (4x1)], [(19x19x1), (19x19x1), ...], ...]
        # -> the y/x center grids plus the per-depth heights and widths of every anchor layer

        num_anchors_per_layer = []
        for ind in range(len(all_anchors)):
            num_anchors_per_layer.append(all_num_anchors_depth[ind] * all_num_anchors_spatial[ind])

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders = [1.0] * 6,
                                                            positive_threshold = FLAGS.match_threshold,
                                                            ignore_threshold = FLAGS.neg_threshold,
                                                            prior_scaling=[0.1, 0.1, 0.2, 0.2])

        image_preprocessing_fn = lambda image_, labels_, bboxes_ : ssd_preprocessing.preprocess_image(image_, labels_, bboxes_, out_shape, is_training=is_training, data_format=FLAGS.data_format, output_rgb=False)
        
        anchor_encoder_fn = lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(glabels_, gbboxes_, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)
        
        anchor_decoder_fn = lambda pred : anchor_encoder_decoder.decode_all_anchors(pred, num_anchors_per_layer)

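        # slim_get_batch builds a queue-based input pipeline: it reads the
        # TFRecords, applies the preprocessing and anchor-encoding functions,
        # and returns batched tensors.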
        image, _, shape, loc_targets, cls_targets, match_scores = dataset_common.slim_get_batch(FLAGS.num_classes,
                                                                                batch_size,
                                                                                ('train' if is_training else 'val'),
                                                                                os.path.join(FLAGS.data_dir, dataset_pattern),
                                                                                FLAGS.num_readers,
                                                                                FLAGS.num_preprocessing_threads,
                                                                                image_preprocessing_fn,
                                                                                anchor_encoder_fn,
                                                                                num_epochs=FLAGS.train_epochs,
                                                                                is_training=is_training)
        global global_anchor_info
        global_anchor_info = {'decode_fn': anchor_decoder_fn,
                            'num_anchors_per_layer': num_anchors_per_layer,
                            'all_num_anchors_depth': all_num_anchors_depth }

        return image, {'shape': shape, 'loc_targets': loc_targets, 'cls_targets': cls_targets, 'match_scores': match_scores}
Example #5
def slim_get_split(file_pattern='{}_????'):
    # Features in Pascal VOC TFRecords.
    keys_to_features = {
        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
        'image/height': tf.FixedLenFeature([1], tf.int64),
        'image/width': tf.FixedLenFeature([1], tf.int64),
        'image/channels': tf.FixedLenFeature([1], tf.int64),
        'image/shape': tf.FixedLenFeature([3], tf.int64),
        'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64),
    }
    items_to_handlers = {
        'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'),
        'shape': slim.tfexample_decoder.Tensor('image/shape'),
        'object/bbox': slim.tfexample_decoder.BoundingBox(
            ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),
        'object/label': slim.tfexample_decoder.Tensor('image/object/bbox/label'),
        'object/difficult': slim.tfexample_decoder.Tensor('image/object/bbox/difficult'),
        'object/truncated': slim.tfexample_decoder.Tensor('image/object/bbox/truncated'),
    }
    decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features, items_to_handlers)

    dataset = slim.dataset.Dataset(
        data_sources=file_pattern,
        reader=tf.TFRecordReader,
        decoder=decoder,
        num_samples=100,
        items_to_descriptions=None,
        num_classes=21,
        labels_to_names=None)

    with tf.name_scope('dataset_data_provider'):
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            num_readers=2,
            common_queue_capacity=32,
            common_queue_min=8,
            shuffle=True,
            num_epochs=1)

    [org_image, shape, glabels_raw, gbboxes_raw, isdifficult] = provider.get(['image', 'shape',
                                                                              'object/label',
                                                                              'object/bbox',
                                                                              'object/difficult'])
    image, glabels, gbboxes = ssd_preprocessing.preprocess_image(org_image, glabels_raw, gbboxes_raw, [300, 300],
                                                                 is_training=True, data_format='channels_last',
                                                                 output_rgb=True)

    anchor_creator = anchor_manipulator.AnchorCreator([300] * 2,
                                                      layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3),
                                                                     (1, 1)],
                                                      anchor_scales=[(0.1,), (0.2,), (0.375,), (0.55,), (0.725,),
                                                                     (0.9,)],
                                                      extra_anchor_scales=[(0.1414,), (0.2739,), (0.4541,), (0.6315,),
                                                                           (0.8078,), (0.9836,)],
                                                      anchor_ratios=[(2., .5), (2., 3., .5, 0.3333),
                                                                     (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
                                                                     (2., .5), (2., .5)],
                                                      layer_steps=[8, 16, 32, 64, 100, 300])

    all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

    num_anchors_per_layer = []
    for ind in range(len(all_anchors)):
        num_anchors_per_layer.append(all_num_anchors_depth[ind] * all_num_anchors_spatial[ind])

    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders=[1.0] * 6,
                                                              positive_threshold=0.5,
                                                              ignore_threshold=0.5,
                                                              prior_scaling=[0.1, 0.1, 0.2, 0.2])

    gt_targets, gt_labels, gt_scores = anchor_encoder_decoder.encode_all_anchors(glabels, gbboxes, all_anchors,
                                                                                 all_num_anchors_depth,
                                                                                 all_num_anchors_spatial, True)

    anchors = anchor_encoder_decoder._all_anchors
    # split by layers
    gt_targets, gt_labels, gt_scores, anchors = tf.split(gt_targets, num_anchors_per_layer, axis=0), \
                                                tf.split(gt_labels, num_anchors_per_layer, axis=0), \
                                                tf.split(gt_scores, num_anchors_per_layer, axis=0), \
                                                [tf.split(anchor, num_anchors_per_layer, axis=0) for anchor in anchors]

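    # Draw the encoded ground-truth boxes back onto the unwhitened image via
    # tf.py_func, as a visual sanity check of the anchor encoding.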
    save_image_op = tf.py_func(save_image_with_bbox,
                               [ssd_preprocessing.unwhiten_image(image),
                                tf.clip_by_value(tf.concat(gt_labels, axis=0), 0, tf.int64.max),
                                tf.concat(gt_scores, axis=0),
                                tf.concat(gt_targets, axis=0)],
                               tf.int64, stateful=True)
    return save_image_op
Example #6
def data_mapping_fn(example_proto, is_training, image_preprocessing_fn):
    keys_to_features = {
        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format': tf.FixedLenFeature((), tf.string,
                                           default_value='jpeg'),
        'image/filename': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/height': tf.FixedLenFeature([1], tf.int64),
        'image/width': tf.FixedLenFeature([1], tf.int64),
        'image/channels': tf.FixedLenFeature([1], tf.int64),
        'image/shape': tf.FixedLenFeature([3], tf.int64),
        'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64),
    }
    items_to_handlers = {
        'image':
        slim.tfexample_decoder.Image('image/encoded', 'image/format'),
        'filename':
        slim.tfexample_decoder.Tensor('image/filename'),
        'shape':
        slim.tfexample_decoder.Tensor('image/shape'),
        'object/bbox':
        slim.tfexample_decoder.BoundingBox(['ymin', 'xmin', 'ymax', 'xmax'],
                                           'image/object/bbox/'),
        'object/label':
        slim.tfexample_decoder.Tensor('image/object/bbox/label'),
        'object/difficult':
        slim.tfexample_decoder.Tensor('image/object/bbox/difficult'),
        'object/truncated':
        slim.tfexample_decoder.Tensor('image/object/bbox/truncated'),
    }
    decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features,
                                                      items_to_handlers)
    [org_image, filename, shape, glabels_raw, gbboxes_raw,
     isdifficult] = decoder.decode(example_proto, [
         'image', 'filename', 'shape', 'object/label', 'object/bbox',
         'object/difficult'
     ])

    if is_training:
        # if all is difficult, then keep the first one
        isdifficult_mask = tf.cond(
            tf.count_nonzero(isdifficult,
                             dtype=tf.int32) < tf.shape(isdifficult)[0],
            lambda: isdifficult < tf.ones_like(isdifficult),
            lambda: tf.one_hot(0,
                               tf.shape(isdifficult)[0],
                               on_value=True,
                               off_value=False,
                               dtype=tf.bool))

        glabels_raw = tf.boolean_mask(glabels_raw, isdifficult_mask)
        gbboxes_raw = tf.boolean_mask(gbboxes_raw, isdifficult_mask)

    # Pre-processing image, labels and bboxes.
    out_shape = [FLAGS.train_image_size] * 2
    anchor_creator = anchor_manipulator.AnchorCreator(
        out_shape,
        layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
        anchor_scales=[(0.1, ), (0.2, ), (0.375, ), (0.55, ), (0.725, ),
                       (0.9, )],
        extra_anchor_scales=[(0.1414, ), (0.2739, ), (0.4541, ), (0.6315, ),
                             (0.8078, ), (0.9836, )],
        anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333),
                       (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                       (1., 2., .5), (1., 2., .5)],
        layer_steps=[8, 16, 32, 64, 100, 300])

    all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

    num_anchors_per_layer = []
    for ind in range(len(all_anchors)):
        num_anchors_per_layer.append(all_num_anchors_depth[ind] *
                                     all_num_anchors_spatial[ind])

    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
        allowed_borders=[1.0] * 6,
        positive_threshold=FLAGS.match_threshold,
        ignore_threshold=FLAGS.neg_threshold,
        prior_scaling=[0.1, 0.1, 0.2, 0.2])

    global global_anchor_info
    global_anchor_info = {
        'decode_fn':
        lambda pred: anchor_encoder_decoder.decode_all_anchors(
            pred, num_anchors_per_layer),
        'num_anchors_per_layer':
        num_anchors_per_layer,
        'all_num_anchors_depth':
        all_num_anchors_depth,
        'encode_fn':
        lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(
            glabels_, gbboxes_, all_anchors, all_num_anchors_depth,
            all_num_anchors_spatial)
    }

    if is_training:
        image, glabels, gbboxes = image_preprocessing_fn(
            org_image, glabels_raw, gbboxes_raw)
    else:
        image = image_preprocessing_fn(org_image, glabels_raw, gbboxes_raw)
        glabels, gbboxes = glabels_raw, gbboxes_raw

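    # Encode the (possibly difficulty-filtered) ground truth against the full
    # anchor set; these targets/labels/scores feed the SSD losses downstream.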
    gt_targets, gt_labels, gt_scores = global_anchor_info['encode_fn'](glabels,
                                                                       gbboxes)

    # return [image, filename, shape, gt_targets, gt_labels, gt_scores]
    return image, {
        'shape': shape,
        'loc_targets': gt_targets,
        'cls_targets': gt_labels,
        'match_scores': gt_scores
    }
Example #7
def main(_):
    with tf.Graph().as_default():
        out_shape = [FLAGS.train_image_size] * 2

        image_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
        shape_input = tf.placeholder(tf.int32, shape=(2, ))

        features = ssd_preprocessing.preprocess_for_eval(
            image_input,
            out_shape,
            data_format=FLAGS.data_format,
            output_rgb=False)
        features = tf.expand_dims(features, axis=0)  # add batch dim -> (N, H, W, C) or (N, C, H, W)

        anchor_creator = anchor_manipulator.AnchorCreator(
            out_shape,
            layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3),
                           (1, 1)],
            anchor_scales=[(0.1, ), (0.2, ), (0.375, ), (0.55, ), (0.725, ),
                           (0.9, )],
            extra_anchor_scales=[(0.1414, ), (0.2739, ), (0.4541, ),
                                 (0.6315, ), (0.8078, ), (0.9836, )],
            anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333),
                           (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                           (1., 2., .5), (1., 2., .5)],
            #anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., .5), (2., .5)],
            layer_steps=[8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

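        # Both thresholds are None: at inference time the encoder is used only
        # to decode predictions, never to match ground truth.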
        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
            allowed_borders=[1.0] * 6,
            positive_threshold=None,
            ignore_threshold=None,
            prior_scaling=[0.1, 0.1, 0.2, 0.2])

        decode_fn = lambda pred: anchor_encoder_decoder.ext_decode_all_anchors(
            pred, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)

        with tf.variable_scope(FLAGS.model_scope,
                               default_name=None,
                               values=[features],
                               reuse=tf.AUTO_REUSE):
            #backbone = ssd_net.VGG16Backbone(FLAGS.data_format)
            backbone = ssd_net.MobileNetV2Backbone(FLAGS.data_format)
            feature_layers = backbone.forward(features, training=False)
            location_pred, cls_pred = ssd_net.multibox_head(
                feature_layers,
                FLAGS.num_classes,
                all_num_anchors_depth,
                data_format=FLAGS.data_format)

            if FLAGS.data_format == 'channels_first':
                cls_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred
                ]
                location_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred
                ]

            cls_pred = [
                tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred
            ]
            location_pred = [
                tf.reshape(pred, [-1, 4]) for pred in location_pred
            ]

            cls_pred = tf.concat(cls_pred, axis=0)
            location_pred = tf.concat(location_pred, axis=0)

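        # Post-processing on the CPU: decode the box regressions into corner
        # coordinates, then apply per-class score filtering and NMS.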
        with tf.device('/cpu:0'):
            bboxes_pred = decode_fn(location_pred)
            bboxes_pred = tf.concat(bboxes_pred, axis=0)
            selected_bboxes, selected_scores = parse_by_class(
                cls_pred, bboxes_pred, FLAGS.num_classes,
                FLAGS.select_threshold, FLAGS.min_size, FLAGS.keep_topk,
                FLAGS.nms_topk, FLAGS.nms_threshold)

            labels_list = []
            scores_list = []
            bboxes_list = []
            for k, v in selected_scores.items():
                labels_list.append(tf.ones_like(v, tf.int32) * k)
                scores_list.append(v)
                bboxes_list.append(selected_bboxes[k])
            all_labels = tf.concat(labels_list, axis=0)
            all_scores = tf.concat(scores_list, axis=0)
            all_bboxes = tf.concat(bboxes_list, axis=0)

        saver = tf.train.Saver()
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)

            saver.restore(sess, get_checkpoint())

            while video.isOpened():
                ret, frame = video.read()
                if not ret:
                    break
                else:
                    timer2 = cv2.getTickCount()
                    if undistort == 'y':
                        ######################################## Undistortion Parts ########################################
                        dim2 = None
                        dim3 = None

                        timer = cv2.getTickCount()

                        # dim1 is the dimension of the input image to undistort
                        dim1 = frame.shape[:2][::-1]
                        assert dim1[0] / dim1[1] == DIM[0] / DIM[1], \
                            "Image to undistort needs to have same aspect ratio as the ones used in calibration"
                        if not dim2:
                            dim2 = dim1
                        if not dim3:
                            dim3 = dim1
                        scaled_K = K * dim1[0] / DIM[0]  # the values of K scale with the image dimension,
                        scaled_K[2][2] = 1.0  # except K[2][2], which is always 1.0
                        # scaled_K, dim2 and balance determine the final K used to undistort the image (the OpenCV docs do not make this clear).

                        new_K = cv2.fisheye.estimateNewCameraMatrixForUndistortRectify(
                            scaled_K, D, dim2, np.eye(3), balance=0)
                        map1, map2 = cv2.fisheye.initUndistortRectifyMap(
                            scaled_K, D, np.eye(3), new_K, dim3, cv2.CV_16SC2)

                        frame_r = cv2.remap(frame,
                                            map1,
                                            map2,
                                            interpolation=cv2.INTER_LINEAR,
                                            borderMode=cv2.BORDER_CONSTANT)

                        t = (cv2.getTickCount() - timer) / cv2.getTickFrequency()

                        # frame_r = cv2.resize(dst, (640, 360))
                        # frame_r = cv2.putText(frame_r, "Undistortion processing time: %.3f sec" % t, (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.5,(0, 255, 255), 2) #(1.0 / (end - start))
                        # ###############################################################################################

                    #np_image = imread('./demo/test.jpg')
                    labels_, scores_, bboxes_ = sess.run(
                        [all_labels, all_scores, all_bboxes],
                        feed_dict={
                            image_input: frame,
                            shape_input: frame.shape[:-1]
                        })

                    img_to_draw = draw_toolbox.bboxes_draw_on_img(frame,
                                                                  labels_,
                                                                  scores_,
                                                                  bboxes_,
                                                                  thickness=2)
                    fps = cv2.getTickFrequency() / (cv2.getTickCount() -
                                                    timer2)
                    img_to_draw = cv2.putText(img_to_draw, "FPS : %.1f" % fps,
                                              (10, 20),
                                              cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                                              (0, 255, 255), 2)  # dst_r
                    cv2.imshow('Object detector', img_to_draw)  # dst_r
                    if cv2.waitKey(1) == ord('q'):
                        break
Example #8
def ssd(path):
# def ssd_res(img_path):
    with tf.Graph().as_default():
        out_shape = [FLAGS.train_image_size] * 2

        image_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
        shape_input = tf.placeholder(tf.int32, shape=(2,))

        features = ssd_preprocessing.preprocess_for_eval(image_input, out_shape, data_format=FLAGS.data_format, output_rgb=False)
        features = tf.expand_dims(features, axis=0)

        anchor_creator = anchor_manipulator.AnchorCreator(out_shape,
                                                    layers_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
                                                    anchor_scales = [(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
                                                    extra_anchor_scales = [(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)],
                                                    anchor_ratios = [(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
                                                    #anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., .5), (2., .5)],
                                                    layer_steps = [8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders = [1.0] * 6,
                                                            positive_threshold = None,
                                                            ignore_threshold = None,
                                                            prior_scaling=[0.1, 0.1, 0.2, 0.2])

        decode_fn = lambda pred : anchor_encoder_decoder.ext_decode_all_anchors(pred, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)

        with tf.variable_scope(FLAGS.model_scope, default_name=None, values=[features], reuse=tf.AUTO_REUSE):
            backbone = ssd_net.VGG16Backbone(FLAGS.data_format)
            feature_layers = backbone.forward(features, training=False)
            location_pred, cls_pred = ssd_net.multibox_head(feature_layers, FLAGS.num_classes, all_num_anchors_depth, data_format=FLAGS.data_format)
            if FLAGS.data_format == 'channels_first':
                cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred]
                location_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred]

            cls_pred = [tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred]
            location_pred = [tf.reshape(pred, [-1, 4]) for pred in location_pred]

            cls_pred = tf.concat(cls_pred, axis=0)
            location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = decode_fn(location_pred)
            bboxes_pred = tf.concat(bboxes_pred, axis=0)


            selected_bboxes, selected_scores = parse_by_class(cls_pred, bboxes_pred,
                                                            FLAGS.num_classes, FLAGS.select_threshold, FLAGS.min_size,
                                                            FLAGS.keep_topk, FLAGS.nms_topk, FLAGS.nms_threshold)
            labels_list = []
            scores_list = []
            bboxes_list = []
            for k, v in selected_scores.items():
                labels_list.append(tf.ones_like(v, tf.int32) * k)
                scores_list.append(v)
                bboxes_list.append(selected_bboxes[k])
            all_labels = tf.concat(labels_list, axis=0)
            all_scores = tf.concat(scores_list, axis=0)
            all_bboxes = tf.concat(bboxes_list, axis=0)

        saver = tf.train.Saver()
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            saver.restore(sess, get_checkpoint())

            np_image = imread(path)
            im = Image.open(path)
            print(np_image.shape)

            labels_, scores_, bboxes_ = sess.run([all_labels, all_scores, all_bboxes], feed_dict = {image_input : np_image, shape_input : np_image.shape[:-1]})

            all_bboxes = sess.run([bboxes_pred], feed_dict = {image_input : np_image, shape_input : np_image.shape[:-1]})
            shape = np_image.shape
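            # Crop every decoded candidate box, save it under ./res/img/, and
            # append its pixel coordinates to ./res/cor.txt so detections can
            # later be matched back to their crops.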
            for j in range(len(all_bboxes[0])):
                all_box = all_bboxes[0][j]
                p1 = (int(all_box[0] * shape[0]), int(all_box[1] * shape[1]))
                p2 = (int(all_box[2] * shape[0]), int(all_box[3] * shape[1]))
                if (p2[0] - p1[0] < 1) or (p2[1] - p1[1] < 1):
                    continue
                x1 = p1[1]
                y1 = p1[0]
                x2 = p2[1]
                y2 = p2[0]

                obj = im.crop((x1, y1, x2, y2))

                num_str = str(j)
                num_str = num_str.zfill(5)
                obj.save('./res/img/{}.jpg'.format(num_str))

                cor = str(x1) + ',' + str(y1) + ',' + str(x2) + ',' + str(y2)
                # use context managers so the files are closed every iteration
                with open('./res/cor.txt', 'a') as f2:
                    f2.write(cor + '\n')

                zero_str = str(0)
                with open('./res/label.txt', 'a') as f:
                    f.write(num_str + ',' + zero_str + '\n')

            num1 = 0
            for i in range(bboxes_.shape[0]):
                bbox = bboxes_[i]
                p1 = (int(bbox[0] * shape[0]), int(bbox[1] * shape[1]))
                p2 = (int(bbox[2] * shape[0]), int(bbox[3] * shape[1]))
                num1 = num1 + 1

                if (p2[0] - p1[0] < 1) or (p2[1] - p1[1] < 1):
                    continue
                x1 = p1[1]
                y1 = p1[0]
                x2 = p2[1]
                y2 = p2[0]

                cor1 = str(x1) + ',' + str(y1) + ',' + str(x2) + ',' + str(y2)

                num = 0
                with open('./res/cor.txt', 'r') as f11, open('./res/label.txt', 'r+') as f22:
                    for line in f11:
                        num = num + 1
                        if cor1 in line:
                            num11 = str(num)
                            print(num11 + '\n')

                            num11 = num11.zfill(5)
                            ber = num11 + ',' + str(0)
                            aft = num11 + ',' + str(labels_[i])

                            t = f22.read()
                            t = t.replace(ber, aft)
                            f22.seek(0, 0)
                            f22.write(t)
            print(num1)

            img_to_draw = draw_toolbox.bboxes_draw_on_img(np_image, labels_, scores_, bboxes_, thickness=2)
            imsave('./demo/out.jpg', img_to_draw)
Example #9
def main(_):
    with tf.Graph().as_default():
        out_shape = [FLAGS.train_image_size] * 2

        with tf.name_scope('define_input'):
            image_input = tf.placeholder(tf.uint8,
                                         shape=(None, None, 3),
                                         name='image_input')

        features = ssd_preprocessing.preprocess_for_eval(
            image_input,
            out_shape,
            data_format=FLAGS.data_format,
            output_rgb=False)
        features = tf.expand_dims(features, axis=0)

        anchor_creator = anchor_manipulator.AnchorCreator(
            out_shape,
            layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3),
                           (1, 1)],
            anchor_scales=[(0.1, ), (0.2, ), (0.375, ), (0.55, ), (0.725, ),
                           (0.9, )],
            extra_anchor_scales=[(0.1414, ), (0.2739, ), (0.4541, ),
                                 (0.6315, ), (0.8078, ), (0.9836, )],
            anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333),
                           (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                           (1., 2., .5), (1., 2., .5)],
            #anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
            #(2., 3., .5, 0.3333), (2., .5), (2., .5)],
            layer_steps=[8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
            allowed_borders=[1.0] * 6,
            positive_threshold=None,
            ignore_threshold=None,
            prior_scaling=[0.1, 0.1, 0.2, 0.2])

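        # ext_decode_all_anchors converts the per-anchor offset predictions
        # back into box corner coordinates using the precomputed anchor grid.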
        def decode_fn(pred):
            return anchor_encoder_decoder.ext_decode_all_anchors(
                pred, all_anchors, all_num_anchors_depth,
                all_num_anchors_spatial)

        with tf.variable_scope(FLAGS.model_scope,
                               default_name=None,
                               values=[features],
                               reuse=tf.AUTO_REUSE):
            backbone = ssd_net.VGG16Backbone(FLAGS.data_format)
            feature_layers = backbone.forward(features, training=False)
            location_pred, cls_pred = ssd_net.multibox_head(
                feature_layers,
                FLAGS.num_classes,
                all_num_anchors_depth,
                data_format=FLAGS.data_format)
            if FLAGS.data_format == 'channels_first':
                cls_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred
                ]
                location_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred
                ]

            cls_pred = [
                tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred
            ]
            location_pred = [
                tf.reshape(pred, [-1, 4]) for pred in location_pred
            ]

            with tf.variable_scope('cls_pred'):
                cls_pred = tf.concat(cls_pred, axis=0)
            with tf.variable_scope('location_pred'):
                location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = decode_fn(location_pred)
            bboxes_pred = tf.concat(bboxes_pred, axis=0)
            selected_bboxes, selected_scores = parse_by_class(
                cls_pred, bboxes_pred, FLAGS.num_classes,
                FLAGS.select_threshold, FLAGS.min_size, FLAGS.keep_topk,
                FLAGS.nms_topk, FLAGS.nms_threshold)

            labels_list = []
            scores_list = []
            bboxes_list = []
            for k, v in selected_scores.items():
                labels_list.append(tf.ones_like(v, tf.int32) * k)
                scores_list.append(v)
                bboxes_list.append(selected_bboxes[k])
            all_labels = tf.concat(labels_list, axis=0)
            all_scores = tf.concat(scores_list, axis=0)
            all_bboxes = tf.concat(bboxes_list, axis=0)

        saver = tf.train.Saver()
        '''
        config = tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
        config.mlu_options.data_parallelism = 1
        config.mlu_options.model_parallelism = 1
        config.mlu_options.core_num = 1
        config.mlu_options.core_version = 'MLU270'
        config.mlu_options.precision = 'float'
        with tf.Session(config = config) as sess:
        '''
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)

            saver.restore(sess, get_checkpoint())

            np_image = imread('demo/test.jpg')
            labels_, scores_, bboxes_ = sess.run(
                [all_labels, all_scores, all_bboxes],
                feed_dict={image_input: np_image})
            #print('labels_', labels_, type(labels_), labels_.shape)
            #print('scores_', scores_, type(scores_), scores_.shape)
            #print('bboxes_', bboxes_, type(bboxes_), bboxes_.shape, bboxes_.shape[0])

            img_to_draw = draw_toolbox.bboxes_draw_on_img(np_image,
                                                          labels_,
                                                          scores_,
                                                          bboxes_,
                                                          thickness=2)
            imsave('demo/test_out.jpg', img_to_draw)
            saver.save(sess, 'model/ssd300_vgg16/ssd300_vgg16', global_step=0)
Example #10
def main(_):
    with tf.Graph().as_default():
        out_shape = [FLAGS.train_image_size] * 2

        image_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
        shape_input = tf.placeholder(tf.int32, shape=(2, ))

        features, output_shape = ssd_preprocessing.preprocess_for_eval(
            image_input,
            out_shape,
            data_format=FLAGS.data_format,
            output_rgb=False)
        features = tf.expand_dims(features, axis=0)
        output_shape = tf.expand_dims(output_shape, axis=0)

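        # Anchor sizes here are absolute pixels (30..270 with geometric-mean
        # extras), unlike the relative scales used in the earlier examples.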
        all_anchor_scales = [(30., ), (60., ), (112.5, ), (165., ), (217.5, ),
                             (270., )]
        all_extra_scales = [(42.43, ), (82.17, ), (136.23, ), (189.45, ),
                            (242.34, ), (295.08, )]
        all_anchor_ratios = [(1., 2., .5), (1., 2., 3., .5, 0.3333),
                             (1., 2., 3., .5, 0.3333),
                             (1., 2., 3., .5, 0.3333), (1., 2., .5),
                             (1., 2., .5)]
        # all_anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., .5), (2., .5)]

        with tf.variable_scope(FLAGS.model_scope,
                               default_name=None,
                               values=[features],
                               reuse=tf.AUTO_REUSE):
            backbone = ssd_net.VGG16Backbone(FLAGS.data_format)
            feature_layers = backbone.forward(features, training=False)
            with tf.device('/cpu:0'):
                anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
                    positive_threshold=None,
                    ignore_threshold=None,
                    prior_scaling=[0.1, 0.1, 0.2, 0.2])

                if FLAGS.data_format == 'channels_first':
                    all_layer_shapes = [
                        tf.shape(feat)[2:] for feat in feature_layers
                    ]
                else:
                    all_layer_shapes = [
                        tf.shape(feat)[1:3] for feat in feature_layers
                    ]
                all_layer_strides = [8, 16, 32, 64, 100, 300]
                total_layers = len(all_layer_shapes)
                anchors_height = list()
                anchors_width = list()
                anchors_depth = list()
                for ind in range(total_layers):
                    _anchors_height, _anchors_width, _anchor_depth = anchor_encoder_decoder.get_anchors_width_height(
                        all_anchor_scales[ind],
                        all_extra_scales[ind],
                        all_anchor_ratios[ind],
                        name='get_anchors_width_height{}'.format(ind))
                    anchors_height.append(_anchors_height)
                    anchors_width.append(_anchors_width)
                    anchors_depth.append(_anchor_depth)
                anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax, _ = anchor_encoder_decoder.get_all_anchors(
                    tf.squeeze(output_shape, axis=0), anchors_height,
                    anchors_width, anchors_depth, [0.5] * total_layers,
                    all_layer_shapes, all_layer_strides, [0.] * total_layers,
                    [False] * total_layers)
            location_pred, cls_pred = ssd_net.multibox_head(
                feature_layers,
                FLAGS.num_classes,
                anchors_depth,
                data_format=FLAGS.data_format)
            if FLAGS.data_format == 'channels_first':
                cls_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred
                ]
                location_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred
                ]

            cls_pred = [
                tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred
            ]
            location_pred = [
                tf.reshape(pred, [-1, 4]) for pred in location_pred
            ]

            cls_pred = tf.concat(cls_pred, axis=0)
            location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = anchor_encoder_decoder.decode_anchors(
                location_pred, anchors_ymin, anchors_xmin, anchors_ymax,
                anchors_xmax)
            selected_bboxes, selected_scores = bbox_util.parse_by_class(
                tf.squeeze(output_shape, axis=0), cls_pred, bboxes_pred,
                FLAGS.num_classes, FLAGS.select_threshold, FLAGS.min_size,
                FLAGS.keep_topk, FLAGS.nms_topk, FLAGS.nms_threshold)

            labels_list = []
            scores_list = []
            bboxes_list = []
            for k, v in selected_scores.items():
                labels_list.append(tf.ones_like(v, tf.int32) * k)
                scores_list.append(v)
                bboxes_list.append(selected_bboxes[k])
            all_labels = tf.concat(labels_list, axis=0)
            all_scores = tf.concat(scores_list, axis=0)
            all_bboxes = tf.concat(bboxes_list, axis=0)

        saver = tf.train.Saver()
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)

            saver.restore(sess, get_checkpoint())

            np_image = imread('./demo/test.jpg')
            labels_, scores_, bboxes_, output_shape_ = sess.run(
                [all_labels, all_scores, all_bboxes, output_shape],
                feed_dict={
                    image_input: np_image,
                    shape_input: np_image.shape[:-1]
                })
            # Rescale boxes from the network's input resolution back to the
            # original image resolution.
            bboxes_[:, 0] = bboxes_[:, 0] * np_image.shape[0] / output_shape_[0, 0]
            bboxes_[:, 1] = bboxes_[:, 1] * np_image.shape[1] / output_shape_[0, 1]
            bboxes_[:, 2] = bboxes_[:, 2] * np_image.shape[0] / output_shape_[0, 0]
            bboxes_[:, 3] = bboxes_[:, 3] * np_image.shape[1] / output_shape_[0, 1]

            img_to_draw = draw_toolbox.bboxes_draw_on_img(np_image,
                                                          labels_,
                                                          scores_,
                                                          bboxes_,
                                                          thickness=2)
            imsave('./demo/test_out.jpg', img_to_draw)
Example #11
    def input_fn():
        out_shape = [300, 510]  #[FLAGS.train_image_size] * 2
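        # Non-square variant: a 300x510 input with rectangular feature maps
        # and a single 1:1 anchor ratio per layer.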
        anchor_creator = anchor_manipulator.AnchorCreator(
            out_shape,
            layers_shapes=[(38, 64), (19, 32), (10, 16), (5, 8), (3, 6),
                           (1, 4)],
            anchor_scales=[(0.05, ), (0.1, ), (0.2, ), (0.3, ), (0.4, ),
                           (0.5, )],
            extra_anchor_scales=[(0.07, ), (0.1414, ), (0.245, ), (0.346, ),
                                 (0.447, ), (0.547, )],
            anchor_ratios=[(1., ), (1., ), (1., ), (1., ), (1., ), (1., )],
            #anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., .5), (2., .5)],
            layer_steps=[8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

        num_anchors_per_layer = []
        for ind in range(len(all_anchors)):
            num_anchors_per_layer.append(all_num_anchors_depth[ind] *
                                         all_num_anchors_spatial[ind])

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
            allowed_borders=[1.0] * 6,
            positive_threshold=FLAGS.match_threshold,
            ignore_threshold=FLAGS.neg_threshold,
            prior_scaling=[0.1, 0.1, 0.2, 0.2])

        image_preprocessing_fn = lambda image_, labels_, bboxes_: ssd_preprocessing.preprocess_image(
            image_,
            labels_,
            bboxes_,
            out_shape,
            is_training=is_training,
            data_format=FLAGS.data_format,
            output_rgb=False)
        anchor_encoder_fn = lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(
            glabels_, gbboxes_, all_anchors, all_num_anchors_depth,
            all_num_anchors_spatial)

        image, filename, shape, loc_targets, cls_targets, match_scores = dataset_common.slim_get_batch(
            FLAGS.num_classes,
            batch_size, ('train' if is_training else 'val'),
            os.path.join(FLAGS.data_dir, dataset_pattern),
            FLAGS.num_readers,
            FLAGS.num_preprocessing_threads,
            image_preprocessing_fn,
            anchor_encoder_fn,
            num_epochs=FLAGS.train_epochs,
            is_training=is_training)
        global global_anchor_info
        global_anchor_info = {
            'decode_fn':
            lambda pred: anchor_encoder_decoder.decode_all_anchors(
                pred, num_anchors_per_layer),
            'num_anchors_per_layer':
            num_anchors_per_layer,
            'all_num_anchors_depth':
            all_num_anchors_depth
        }

        return {
            'image': image,
            'filename': filename,
            'shape': shape,
            'loc_targets': loc_targets,
            'cls_targets': cls_targets,
            'match_scores': match_scores
        }, None
Example #12
def main(_):
    with tf.Graph().as_default():
        target_shape = None

        image_input = tf.placeholder(tf.uint8, shape=(None, None, 3))

        features, output_shape = sfd_preprocessing.preprocess_for_eval(image_input, target_shape, data_format=FLAGS.data_format, output_rgb=False)
        features = tf.expand_dims(features, axis=0)
        output_shape = tf.expand_dims(output_shape, axis=0)

        all_anchor_scales = [(16.,), (32.,), (64.,), (128.,), (256.,), (512.,)]
        all_extra_scales = [(), (), (), (), (), ()]
        all_anchor_ratios = [(1.,), (1.,), (1.,), (1.,), (1.,), (1.,)]
        all_layer_strides = [4, 8, 16, 32, 64, 128]
        offset_list = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
        with tf.variable_scope(FLAGS.model_scope, default_name=None, values=[features], reuse=tf.AUTO_REUSE):
            backbone = sfd_net.VGG16Backbone(FLAGS.data_format)
            feature_layers = backbone.get_featmaps(features, training=False)
            with tf.device('/cpu:0'):
                anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(positive_threshold=None, ignore_threshold=None, prior_scaling=[0.1, 0.1, 0.2, 0.2])

                if FLAGS.data_format == 'channels_first':
                    all_layer_shapes = [tf.shape(feat)[2:] for feat in feature_layers]
                else:
                    all_layer_shapes = [tf.shape(feat)[1:3] for feat in feature_layers]
                total_layers = len(all_layer_shapes)
                anchors_height = list()
                anchors_width = list()
                anchors_depth = list()
                for ind in range(total_layers):
                    _anchors_height, _anchors_width, _anchor_depth = anchor_encoder_decoder.get_anchors_width_height(all_anchor_scales[ind], all_extra_scales[ind], all_anchor_ratios[ind], name='get_anchors_width_height{}'.format(ind))
                    anchors_height.append(_anchors_height)
                    anchors_width.append(_anchors_width)
                    anchors_depth.append(_anchor_depth)
                anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax, _ = anchor_encoder_decoder.get_all_anchors(tf.squeeze(output_shape, axis=0),
                                                                                anchors_height, anchors_width, anchors_depth,
                                                                                offset_list, all_layer_shapes, all_layer_strides,
                                                                                [0.] * total_layers, [False] * total_layers)
            location_pred, cls_pred = backbone.multibox_head(feature_layers, [1] * len(feature_layers),
                                        [3] + [1] * (len(feature_layers) - 1), anchors_depth)
            if FLAGS.data_format == 'channels_first':
                cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred]
                location_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred]

            cls_pred = [tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred]
            location_pred = [tf.reshape(pred, [-1, 4]) for pred in location_pred]

            cls_pred = tf.nn.softmax(tf.concat(cls_pred, axis=0))[:, -1]
            location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = anchor_encoder_decoder.decode_anchors(location_pred, anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax)

        saver = tf.train.Saver()
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)

            saver.restore(sess, get_checkpoint())

            os.makedirs(FLAGS.det_dir, exist_ok=True)

            if FLAGS.subset == 'val':
                wider_face = sio.loadmat(os.path.join(FLAGS.data_dir, 'wider_face_split', 'wider_face_val.mat'))    # Val set
            else:
                wider_face = sio.loadmat(os.path.join(FLAGS.data_dir, 'wider_face_split', 'wider_face_test.mat'))     # Test set
            event_list = wider_face['event_list']
            file_list = wider_face['file_list']
            del wider_face

            Path = os.path.join(FLAGS.data_dir, ('WIDER_val' if FLAGS.subset == 'val' else 'WIDER_test'), 'images')
            save_path = os.path.join(FLAGS.det_dir, FLAGS.subset)
            len_event = len(event_list)
            for index, event in enumerate(event_list):
                filelist = file_list[index][0]
                len_files = len(filelist)
                if not os.path.exists(os.path.join(save_path, event[0][0])):
                    os.makedirs(os.path.join(save_path, event[0][0]))

                for num, file in enumerate(filelist):
                    im_name = file[0][0]
                    Image_Path = os.path.join(Path, event[0][0], im_name[:]+'.jpg')

                    image = imread(Image_Path)
                    #image = imread('manymany.jpg')

                    max_im_shrink = (0x7fffffff / FLAGS.memory_limit / (image.shape[0] * image.shape[1])) ** 0.5  # cap the resized input so it fits in memory (heuristic carried over from the original Caffe implementation)
                    #max_im_shrink = (0x7fffffff / 80.0 / (image.shape[0] * image.shape[1])) ** 0.5 # the max size of input image for caffe
                    shrink = max_im_shrink if max_im_shrink < 1 else 1

                    det0 = detect_face([sess, image_input, bboxes_pred, cls_pred], image, shrink)  # origin test
                    det1 = flip_test([sess, image_input, bboxes_pred, cls_pred], image, shrink)    # flip test
                    [det2, det3] = multi_scale_test([sess, image_input, bboxes_pred, cls_pred], image, max_im_shrink)  #multi-scale test
                    # merge all test results via bounding box voting
                    det = np.row_stack((det0, det1, det2, det3))
                    dets = bbox_vote(det)

                    f = open(os.path.join(save_path, event[0][0], im_name+'.txt'), 'w')
                    write_to_txt(f, dets, event, im_name)
                    f.close()
                    if num % FLAGS.log_every_n_steps == 0:
                        img_to_draw = draw_toolbox.bboxes_draw_on_img(image, (dets[:, 4] > 0.2).astype(np.int32), dets[:, 4], dets[:, :4], thickness=2)
                        imsave(os.path.join(FLAGS.debug_dir, '{}.jpg'.format(im_name)), img_to_draw)

                    #imsave(os.path.join('./debug/{}_{}.jpg').format(index, num), draw_toolbox.absolute_bboxes_draw_on_img(image, (dets[:, 4]>0.1).astype(np.int32), dets[:, 4], dets[:, :4], thickness=2))
                    #break
                    sys.stdout.write('\r>> Predicting event:%d/%d num:%d/%d' % (index + 1, len_event, num + 1, len_files))
                    sys.stdout.flush()
                sys.stdout.write('\n')
                sys.stdout.flush()
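The four detection passes above are merged by bbox_vote before results are written. A minimal sketch of score-weighted bounding-box voting under the same [xmin, ymin, xmax, ymax, score] row layout; the repository's bbox_vote may differ in its IoU threshold and in how the merged score is aggregated:

import numpy as np

def bbox_vote_sketch(det, iou_thresh=0.3):
    det = det[det[:, 4].argsort()[::-1]]  # highest score first
    voted = []
    while det.shape[0] > 0:
        # IoU of the current best box against every remaining box (itself included)
        xx1 = np.maximum(det[0, 0], det[:, 0])
        yy1 = np.maximum(det[0, 1], det[:, 1])
        xx2 = np.minimum(det[0, 2], det[:, 2])
        yy2 = np.minimum(det[0, 3], det[:, 3])
        inter = np.maximum(0., xx2 - xx1 + 1) * np.maximum(0., yy2 - yy1 + 1)
        area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1)
        iou = inter / (area[0] + area - inter)

        merge, det = det[iou >= iou_thresh], det[iou < iou_thresh]
        # the voted box is the score-weighted average of all overlapping boxes
        weights = merge[:, 4:5]
        box = (merge[:, :4] * weights).sum(axis=0) / weights.sum()
        voted.append(np.hstack([box, [merge[:, 4].max()]]))
    return np.vstack(voted) if voted else np.zeros((0, 5))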
Beispiel #13
0
def main(_):
    with tf.Graph().as_default():
        out_shape = [FLAGS.train_image_size] * 2

        image_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
        shape_input = tf.placeholder(tf.int32, shape=(2, ))

        features = ssd_preprocessing.preprocess_for_eval(
            image_input,
            out_shape,
            data_format=FLAGS.data_format,
            output_rgb=False)
        features = tf.expand_dims(features, axis=0)

        anchor_creator = anchor_manipulator.AnchorCreator(
            out_shape,
            layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3),
                           (1, 1)],
            anchor_scales=[(0.1, ), (0.2, ), (0.375, ), (0.55, ), (0.725, ),
                           (0.9, )],
            extra_anchor_scales=[(0.1414, ), (0.2739, ), (0.4541, ),
                                 (0.6315, ), (0.8078, ), (0.9836, )],
            anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333),
                           (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                           (1., 2., .5), (1., 2., .5)],
            #anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., .5), (2., .5)],
            layer_steps=[8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors(
        )

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
            allowed_borders=[1.0] * 6,
            positive_threshold=None,
            ignore_threshold=None,
            prior_scaling=[0.1, 0.1, 0.2, 0.2])

        decode_fn = lambda pred: anchor_encoder_decoder.ext_decode_all_anchors(
            pred, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)

        with tf.variable_scope(FLAGS.model_scope,
                               default_name=None,
                               values=[features],
                               reuse=tf.AUTO_REUSE):
            backbone = ssd_net.VGG16Backbone(FLAGS.data_format)
            feature_layers = backbone.forward(features, training=False)
            location_pred, cls_pred = ssd_net.multibox_head(
                feature_layers,
                FLAGS.num_classes,
                all_num_anchors_depth,
                data_format=FLAGS.data_format)
            if FLAGS.data_format == 'channels_first':
                cls_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred
                ]
                location_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred
                ]

            cls_pred = [
                tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred
            ]
            location_pred = [
                tf.reshape(pred, [-1, 4]) for pred in location_pred
            ]

            cls_pred = tf.concat(cls_pred, axis=0)
            location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = decode_fn(location_pred)
            bboxes_pred = tf.concat(bboxes_pred, axis=0)
            selected_bboxes, selected_scores = parse_by_class(
                cls_pred, bboxes_pred, FLAGS.num_classes,
                FLAGS.select_threshold, FLAGS.min_size, FLAGS.keep_topk,
                FLAGS.nms_topk, FLAGS.nms_threshold)

            labels_list = []
            scores_list = []
            bboxes_list = []
            for k, v in selected_scores.items():
                labels_list.append(tf.ones_like(v, tf.int32) * k)
                scores_list.append(v)
                bboxes_list.append(selected_bboxes[k])
            all_labels = tf.concat(labels_list, axis=0)
            all_scores = tf.concat(scores_list, axis=0)
            all_bboxes = tf.concat(bboxes_list, axis=0)

        saver = tf.train.Saver()
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)

            saver.restore(sess, get_checkpoint())

            for i in range(video_frame_cnt):
                ret, img_ori = vid.read()

                # height_ori, width_ori = img_ori.shape[:2]
                # img = cv2.resize(img_ori, tuple(args.new_size))
                img = cv2.cvtColor(img_ori, cv2.COLOR_BGR2RGB)
                np_image = np.asarray(img, np.float32)

                start_time = time.time()
                labels_, scores_, bboxes_ = sess.run(
                    [all_labels, all_scores, all_bboxes],
                    feed_dict={
                        image_input: np_image,
                        shape_input: np_image.shape[:-1]
                    })
                end_time = time.time()

                img_to_draw = draw_toolbox.bboxes_draw_on_img(np_image,
                                                              labels_,
                                                              scores_,
                                                              bboxes_,
                                                              thickness=2)
                cv2.putText(img_to_draw,
                            '{:.2f}ms'.format((end_time - start_time) * 1000),
                            (40, 40),
                            0,
                            fontScale=1,
                            color=(0, 255, 0),
                            thickness=2)

                imsave('./test_out.jpg', img_to_draw)

                new_img = cv2.imread('./test_out.jpg')
                cv2.imshow('image', new_img)

                videoWriter.write(new_img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

            vid.release()
            videoWriter.release()
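The display loop above writes test_out.jpg and immediately re-reads it only to obtain a BGR frame for OpenCV. A small in-memory alternative, assuming img_to_draw is an RGB array (float or uint8):

import cv2
import numpy as np

def show_and_record(rgb_image, writer, window='image'):
    # convert RGB to BGR in memory instead of a jpg write/read round-trip
    frame = cv2.cvtColor(np.clip(rgb_image, 0, 255).astype(np.uint8),
                         cv2.COLOR_RGB2BGR)
    cv2.imshow(window, frame)
    writer.write(frame)
    return frame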
Beispiel #14
0
def slim_get_split(file_pattern='{}_????'):
    # Features in Pascal VOC TFRecords.
    keys_to_features = {
        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format': tf.FixedLenFeature((), tf.string,
                                           default_value='jpeg'),
        'image/filename': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/height': tf.FixedLenFeature([1], tf.int64),
        'image/width': tf.FixedLenFeature([1], tf.int64),
        'image/channels': tf.FixedLenFeature([1], tf.int64),
        'image/shape': tf.FixedLenFeature([3], tf.int64),
        'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/blur': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/expression': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/illumination': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/invalid': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/occlusion': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/pose': tf.VarLenFeature(dtype=tf.int64),
    }
    items_to_handlers = {
        'image':
        slim.tfexample_decoder.Image('image/encoded', 'image/format'),
        'filename':
        slim.tfexample_decoder.Tensor('image/filename'),
        'shape':
        slim.tfexample_decoder.Tensor('image/shape'),
        'object/bbox':
        slim.tfexample_decoder.BoundingBox(['ymin', 'xmin', 'ymax', 'xmax'],
                                           'image/object/bbox/'),
        'object/blur':
        slim.tfexample_decoder.Tensor('image/object/bbox/blur'),
        'object/expression':
        slim.tfexample_decoder.Tensor('image/object/bbox/expression'),
        'object/illumination':
        slim.tfexample_decoder.Tensor('image/object/bbox/illumination'),
        'object/invalid':
        slim.tfexample_decoder.Tensor('image/object/bbox/invalid'),
        'object/occlusion':
        slim.tfexample_decoder.Tensor('image/object/bbox/occlusion'),
        'object/pose':
        slim.tfexample_decoder.Tensor('image/object/bbox/pose'),
    }
    decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features,
                                                      items_to_handlers)

    dataset = slim.dataset.Dataset(data_sources=file_pattern,
                                   reader=tf.TFRecordReader,
                                   decoder=decoder,
                                   num_samples=100,
                                   items_to_descriptions=None,
                                   num_classes=21,
                                   labels_to_names=None)

    with tf.name_scope('dataset_data_provider'):
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            num_readers=2,
            common_queue_capacity=32,
            common_queue_min=8,
            shuffle=True,
            num_epochs=1)

    [org_image, filename, shape, \
    g_bboxes, g_blur, g_expression, \
    g_illumination, g_invalid, g_occlusion, g_pose] = provider.get(['image', 'filename', 'shape',
                                                                     'object/bbox', 'object/blur',
                                                                     'object/expression', 'object/illumination',
                                                                     'object/invalid', 'object/occlusion', 'object/pose'])
    # keep only boxes that are not flagged as invalid in the annotations
    isvalid_mask = g_invalid < 1
    g_bboxes = tf.boolean_mask(g_bboxes, isvalid_mask)
    g_blur = tf.boolean_mask(g_blur, isvalid_mask)
    g_expression = tf.boolean_mask(g_expression, isvalid_mask)
    g_illumination = tf.boolean_mask(g_illumination, isvalid_mask)
    g_invalid = tf.boolean_mask(g_invalid, isvalid_mask)
    g_occlusion = tf.boolean_mask(g_occlusion, isvalid_mask)
    g_pose = tf.boolean_mask(g_pose, isvalid_mask)

    image, gbboxes = dan_preprocessing.preprocess_image(
        org_image,
        g_bboxes, [640, 640],
        is_training=True,
        data_format='channels_last',
        output_rgb=True)

    # gbboxes = tf.boolean_mask(gbboxes, small_mask)
    # g_blur = tf.boolean_mask(g_blur, small_mask)
    # g_expression = tf.boolean_mask(g_expression, small_mask)
    # g_illumination = tf.boolean_mask(g_illumination, small_mask)
    # g_invalid = tf.boolean_mask(g_invalid, small_mask)
    # g_occlusion = tf.boolean_mask(g_occlusion, small_mask)
    # g_pose = tf.boolean_mask(g_pose, small_mask)

    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
        positive_threshold=0.5,
        ignore_threshold=0.5,
        prior_scaling=[0.1, 0.1, 0.2, 0.2])

    all_anchor_scales = [(16., ), (32., ), (64., ), (128., ), (256., ),
                         (512., )]
    all_extra_scales = [(), (), (), (), (), ()]
    all_anchor_ratios = [(1., ), (1., ), (1., ), (1., ), (1., ), (1., )]
    all_layer_shapes = [(160, 160), (80, 80), (40, 40), (20, 20), (10, 10),
                        (5, 5)]
    all_layer_strides = [4, 8, 16, 32, 64, 128]
    offset_list = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5]

    total_layers = len(all_layer_shapes)
    anchors_height = list()
    anchors_width = list()
    anchors_depth = list()
    for ind in range(total_layers):
        _anchors_height, _anchors_width, _anchor_depth = anchor_encoder_decoder.get_anchors_width_height(
            all_anchor_scales[ind], all_extra_scales[ind],
            all_anchor_ratios[ind])
        anchors_height.append(_anchors_height)
        anchors_width.append(_anchors_width)
        anchors_depth.append(_anchor_depth)
    anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax, inside_mask = anchor_encoder_decoder.get_all_anchors(
        [640] * 2, anchors_height, anchors_width, anchors_depth, offset_list,
        all_layer_shapes, all_layer_strides, [640.] * total_layers,
        [False] * total_layers)

    gt_targets, gt_labels, gt_scores, _ = anchor_encoder_decoder.encode_anchors(
        gbboxes,
        anchors_ymin,
        anchors_xmin,
        anchors_ymax,
        anchors_xmax,
        inside_mask,
        debug=True)

    num_anchors_per_layer = list()
    for ind, layer_shape in enumerate(all_layer_shapes):
        _, _num_anchors_per_layer = anchor_encoder_decoder.get_anchors_count(
            anchors_depth[ind], layer_shape)
        num_anchors_per_layer.append(_num_anchors_per_layer)

    # split by layers
    all_anchors = tf.stack(
        [anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax], axis=-1)

    gt_targets, gt_labels, gt_scores, anchors = tf.split(gt_targets, num_anchors_per_layer, axis=0),\
                                                tf.split(gt_labels, num_anchors_per_layer, axis=0),\
                                                tf.split(gt_scores, num_anchors_per_layer, axis=0),\
                                                tf.split(all_anchors, num_anchors_per_layer, axis=0)

    save_image_op = tf.py_func(save_image_with_bbox, [
        dan_preprocessing.unwhiten_image(image),
        tf.clip_by_value(tf.concat(gt_labels, axis=0), 0, tf.int64.max),
        tf.concat(gt_scores, axis=0),
        tf.concat(gt_targets, axis=0)
    ],
                               tf.int64,
                               stateful=True)
    return save_image_op
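With a single scale, no extra scales, and a single ratio per layer, anchors_depth is 1 everywhere, so the per-layer counts that drive the tf.split above are simply the cell counts of each feature map:

all_layer_shapes = [(160, 160), (80, 80), (40, 40), (20, 20), (10, 10), (5, 5)]
anchors_depth = [1] * 6  # one scale, no extra scales, one ratio per layer
counts = [h * w * d for (h, w), d in zip(all_layer_shapes, anchors_depth)]
print(counts)       # [25600, 6400, 1600, 400, 100, 25]
print(sum(counts))  # 34125 anchors in total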
Beispiel #15
0
def slim_get_split(file_pattern='{}_????'):
    # Features in Pascal VOC TFRecords.
    keys_to_features = {
        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format': tf.FixedLenFeature((), tf.string,
                                           default_value='jpeg'),
        'image/height': tf.FixedLenFeature([1], tf.int64),
        'image/width': tf.FixedLenFeature([1], tf.int64),
        'image/channels': tf.FixedLenFeature([1], tf.int64),
        'image/shape': tf.FixedLenFeature([3], tf.int64),
        'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64),
    }
    items_to_handlers = {
        'image':
        slim.tfexample_decoder.Image('image/encoded', 'image/format'),
        'shape':
        slim.tfexample_decoder.Tensor('image/shape'),
        'object/bbox':
        slim.tfexample_decoder.BoundingBox(['ymin', 'xmin', 'ymax', 'xmax'],
                                           'image/object/bbox/'),
        'object/label':
        slim.tfexample_decoder.Tensor('image/object/bbox/label'),
        'object/difficult':
        slim.tfexample_decoder.Tensor('image/object/bbox/difficult'),
        'object/truncated':
        slim.tfexample_decoder.Tensor('image/object/bbox/truncated'),
    }
    decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features,
                                                      items_to_handlers)

    dataset = slim.dataset.Dataset(data_sources=file_pattern,
                                   reader=tf.TFRecordReader,
                                   decoder=decoder,
                                   num_samples=100,
                                   items_to_descriptions=None,
                                   num_classes=21,
                                   labels_to_names=None)

    with tf.name_scope('dataset_data_provider'):
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            num_readers=2,
            common_queue_capacity=32,
            common_queue_min=8,
            shuffle=True,
            num_epochs=1)

    [org_image, shape, glabels_raw, gbboxes_raw, isdifficult] = provider.get(
        ['image', 'shape', 'object/label', 'object/bbox', 'object/difficult'])
    image, glabels, gbboxes = ssd_preprocessing.preprocess_image(
        org_image,
        glabels_raw,
        gbboxes_raw, [300, 300],
        is_training=True,
        data_format='channels_last',
        output_rgb=True)

    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
        positive_threshold=0.5,
        ignore_threshold=0.5,
        prior_scaling=[0.1, 0.1, 0.2, 0.2])

    all_anchor_scales = [(30., ), (60., ), (112.5, ), (165., ), (217.5, ),
                         (270., )]
    all_extra_scales = [(42.43, ), (82.17, ), (136.23, ), (189.45, ),
                        (242.34, ), (295.08, )]
    all_anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
                         (2., 3., .5, 0.3333), (2., .5), (2., .5)]
    all_layer_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
    all_layer_strides = [8, 16, 32, 64, 100, 300]
    total_layers = len(all_layer_shapes)
    anchors_height = list()
    anchors_width = list()
    anchors_depth = list()
    for ind in range(total_layers):
        _anchors_height, _anchors_width, _anchor_depth = anchor_encoder_decoder.get_anchors_width_height(
            all_anchor_scales[ind], all_extra_scales[ind],
            all_anchor_ratios[ind])
        anchors_height.append(_anchors_height)
        anchors_width.append(_anchors_width)
        anchors_depth.append(_anchor_depth)
    anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax, inside_mask = anchor_encoder_decoder.get_all_anchors(
        [300] * 2, anchors_height, anchors_width, anchors_depth,
        [0.5] * total_layers, all_layer_shapes, all_layer_strides,
        [300.] * total_layers, [False] * total_layers)

    gt_targets, gt_labels, gt_scores = anchor_encoder_decoder.encode_anchors(
        glabels, gbboxes, anchors_ymin, anchors_xmin, anchors_ymax,
        anchors_xmax, inside_mask, True)

    num_anchors_per_layer = list()
    for ind, layer_shape in enumerate(all_layer_shapes):
        _, _num_anchors_per_layer = anchor_encoder_decoder.get_anchors_count(
            anchors_depth[ind], layer_shape)
        num_anchors_per_layer.append(_num_anchors_per_layer)

    # split by layers
    all_anchors = tf.stack(
        [anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax], axis=-1)

    gt_targets, gt_labels, gt_scores, anchors = tf.split(gt_targets, num_anchors_per_layer, axis=0),\
                                                tf.split(gt_labels, num_anchors_per_layer, axis=0),\
                                                tf.split(gt_scores, num_anchors_per_layer, axis=0),\
                                                tf.split(all_anchors, num_anchors_per_layer, axis=0)

    save_image_op = tf.py_func(save_image_with_bbox, [
        ssd_preprocessing.unwhiten_image(image),
        tf.clip_by_value(tf.concat(gt_labels, axis=0), 0, tf.int64.max),
        tf.concat(gt_scores, axis=0),
        tf.concat(gt_targets, axis=0)
    ],
                               tf.int64,
                               stateful=True)
    return save_image_op
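prior_scaling=[0.1, 0.1, 0.2, 0.2] divides the center offsets and log size ratios produced during encoding. A minimal single-box sketch of that transform in the standard SSD (cy, cx, h, w) order, assuming (ymin, xmin, ymax, xmax) inputs; the repository's encode_anchors additionally performs matching against the positive/ignore thresholds:

import numpy as np

def encode_box(gt, anchor, prior_scaling=(0.1, 0.1, 0.2, 0.2)):
    gt_h, gt_w = gt[2] - gt[0], gt[3] - gt[1]
    an_h, an_w = anchor[2] - anchor[0], anchor[3] - anchor[1]
    gt_cy, gt_cx = gt[0] + gt_h / 2., gt[1] + gt_w / 2.
    an_cy, an_cx = anchor[0] + an_h / 2., anchor[1] + an_w / 2.
    return np.array([
        (gt_cy - an_cy) / an_h / prior_scaling[0],  # scaled center-y offset
        (gt_cx - an_cx) / an_w / prior_scaling[1],  # scaled center-x offset
        np.log(gt_h / an_h) / prior_scaling[2],     # scaled log height ratio
        np.log(gt_w / an_w) / prior_scaling[3],     # scaled log width ratio
    ])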
Beispiel #16
0
        anchor_creator = anchor_manipulator.AnchorCreator(
            out_shape,  # assumed to be defined earlier in this truncated snippet
            layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3),
                           (1, 1)],
            anchor_scales=[(0.1, ), (0.2, ), (0.375, ), (0.55, ), (0.725, ),
                           (0.9, )],
                           (0.9, )],
            extra_anchor_scales=[(0.1414, ), (0.2739, ), (0.4541, ),
                                 (0.6315, ), (0.8078, ), (0.9836, )],
            anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333),
                           (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                           (1., 2., .5), (1., 2., .5)],
            #anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
            #(2., 3., .5, 0.3333), (2., .5), (2., .5)],
            layer_steps=[8, 16, 32, 64, 100, 300])

        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors(
        )
        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
            allowed_borders=[1.0] * 6,
            positive_threshold=None,
            ignore_threshold=None,
            prior_scaling=[0.1, 0.1, 0.2, 0.2])

        def decode_fn(pred):
            return anchor_encoder_decoder.ext_decode_all_anchors(
                pred, all_anchors, all_num_anchors_depth,
                all_num_anchors_spatial)

        with tf.name_scope('g2_cls_pred'):
            g2_cls_pred = tf.placeholder(tf.float32,
                                         shape=(8732, 21),
                                         name='g2_cls_pred')
        with tf.name_scope('g2_location_pred'):
            g2_location_pred = tf.placeholder(tf.float32,
                                              shape=(8732, 4),
                                              name='g2_location_pred')
Beispiel #17
0
def ssd_model_fn(features, labels, mode, params):
    """model_fn for SSD to be used with our Estimator."""
    filename = features['filename']
    filename = tf.identity(filename, name='filename')
    shape = features['shape']
    output_shape = features['output_shape']
    features = features['image']

    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(positive_threshold=None, ignore_threshold=None, prior_scaling=[0.1, 0.1, 0.2, 0.2])
    all_anchor_scales = [(30.,), (60.,), (112.5,), (165.,), (217.5,), (270.,)]
    all_extra_scales = [(42.43,), (82.17,), (136.23,), (189.45,), (242.34,), (295.08,)]
    all_anchor_ratios = [(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)]
    #all_anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., .5), (2., .5)]

    with tf.variable_scope(params['model_scope'], default_name=None, values=[features], reuse=tf.AUTO_REUSE):
        backbone = ssd_net.VGG16Backbone(params['data_format'])
        # forward features
        feature_layers = backbone.forward(features, training=(mode == tf.estimator.ModeKeys.TRAIN))
        # generate anchors according to the feature map size
        with tf.device('/cpu:0'):
            if params['data_format'] == 'channels_first':
                all_layer_shapes = [tf.shape(feat)[2:] for feat in feature_layers]
            else:
                all_layer_shapes = [tf.shape(feat)[1:3] for feat in feature_layers]
            all_layer_strides = [8, 16, 32, 64, 100, 300]
            total_layers = len(all_layer_shapes)
            anchors_height = list()
            anchors_width = list()
            anchors_depth = list()
            for ind in range(total_layers):
                _anchors_height, _anchors_width, _anchor_depth = anchor_encoder_decoder.get_anchors_width_height(all_anchor_scales[ind], all_extra_scales[ind], all_anchor_ratios[ind], name='get_anchors_width_height{}'.format(ind))
                anchors_height.append(_anchors_height)
                anchors_width.append(_anchors_width)
                anchors_depth.append(_anchor_depth)
            anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax, _ = anchor_encoder_decoder.get_all_anchors(tf.squeeze(output_shape, axis=0),
                                                                            anchors_height, anchors_width, anchors_depth,
                                                                            [0.5] * total_layers, all_layer_shapes, all_layer_strides,
                                                                            [0.] * total_layers, [False] * total_layers)
        # generate predictions based on anchors
        location_pred, cls_pred = ssd_net.multibox_head(feature_layers, params['num_classes'], anchors_depth, data_format=params['data_format'])
        if params['data_format'] == 'channels_first':
            cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred]
            location_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred]

        cls_pred = [tf.reshape(pred, [tf.shape(features)[0], -1, params['num_classes']]) for pred in cls_pred]
        location_pred = [tf.reshape(pred, [tf.shape(features)[0], -1, 4]) for pred in location_pred]

        cls_pred = tf.concat(cls_pred, axis=1)
        location_pred = tf.concat(location_pred, axis=1)

        cls_pred = tf.reshape(cls_pred, [-1, params['num_classes']])
        location_pred = tf.reshape(location_pred, [-1, 4])
    # decode predictions
    with tf.device('/cpu:0'):
        bboxes_pred = anchor_encoder_decoder.decode_anchors(location_pred, anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax)
        selected_bboxes, selected_scores = bbox_util.parse_by_class(tf.squeeze(output_shape, axis=0), cls_pred, bboxes_pred,
                                                        params['num_classes'], params['select_threshold'], params['min_size'],
                                                        params['keep_topk'], params['nms_topk'], params['nms_threshold'])

    labels_list = []
    scores_list = []
    bboxes_list = []
    for k, v in selected_scores.items():
        labels_list.append(tf.ones_like(v, tf.int32) * k)
        scores_list.append(v)
        bboxes_list.append(selected_bboxes[k])
    all_labels = tf.concat(labels_list, axis=0)
    all_scores = tf.concat(scores_list, axis=0)
    all_bboxes = tf.concat(bboxes_list, axis=0)
    save_image_op = tf.py_func(save_image_with_bbox,
                        [ssd_preprocessing.unwhiten_image(tf.squeeze(features, axis=0), output_rgb=False),
                        all_labels * tf.to_int32(all_scores > 0.3),
                        all_scores,
                        all_bboxes],
                        tf.int64, stateful=True)
    tf.identity(save_image_op, name='save_image_op')
    predictions = {'filename': filename, 'shape': shape, 'output_shape': output_shape }
    for class_ind in range(1, params['num_classes']):
        predictions['scores_{}'.format(class_ind)] = tf.expand_dims(selected_scores[class_ind], axis=0)
        predictions['bboxes_{}'.format(class_ind)] = tf.expand_dims(selected_bboxes[class_ind], axis=0)

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
              mode=mode,
              predictions=predictions,
              prediction_hooks=None, loss=None, train_op=None)
    else:
        raise ValueError('This script only supports "PREDICT" mode!')
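A hedged sketch of driving ssd_model_fn through the TF 1.x Estimator API in its only supported PREDICT mode; the params keys mirror the ones read above, but the values, model_dir, and eval_input_fn are placeholders:

import tensorflow as tf

detector = tf.estimator.Estimator(
    model_fn=ssd_model_fn,
    model_dir='./logs',  # assumed checkpoint directory
    params={
        'model_scope': 'ssd300',
        'data_format': 'channels_last',
        'num_classes': 21,
        'select_threshold': 0.2,
        'min_size': 0.03,
        'keep_topk': 200,
        'nms_topk': 20,
        'nms_threshold': 0.45,
    })
for pred in detector.predict(input_fn=eval_input_fn):  # eval_input_fn is assumed
    print(pred['filename'], pred['shape'])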
Beispiel #18
0
def main(_):
    with tf.Graph().as_default():
        out_shape = [FLAGS.train_image_size] * 2
        anchor_creator = anchor_manipulator.AnchorCreator(
            out_shape,
            layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3),
                           (1, 1)],
            anchor_scales=[(0.1, ), (0.2, ), (0.375, ), (0.55, ), (0.725, ),
                           (0.9, )],
            extra_anchor_scales=[(0.1414, ), (0.2739, ), (0.4541, ),
                                 (0.6315, ), (0.8078, ), (0.9836, )],
            anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333),
                           (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                           (1., 2., .5), (1., 2., .5)],
            #anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
            #(2., 3., .5, 0.3333), (2., .5), (2., .5)],
            layer_steps=[8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors(
        )

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
            allowed_borders=[1.0] * 6,
            positive_threshold=None,
            ignore_threshold=None,
            prior_scaling=[0.1, 0.1, 0.2, 0.2])

        def decode_fn(pred):
            return anchor_encoder_decoder.ext_decode_all_anchors(
                pred, all_anchors, all_num_anchors_depth,
                all_num_anchors_spatial)

        with tf.name_scope('define_input'):
            image_input = tf.placeholder(tf.float32,
                                         shape=(1, 300, 300, 3),
                                         name='image_input')
        print('image_input', image_input)
        with tf.variable_scope(FLAGS.model_scope,
                               default_name=None,
                               values=[image_input],
                               reuse=tf.AUTO_REUSE):
            backbone = ssd_net.VGG16Backbone(FLAGS.data_format)
            feature_layers = backbone.forward(image_input, training=False)
            location_pred, cls_pred = ssd_net.multibox_head(
                feature_layers,
                FLAGS.num_classes,
                all_num_anchors_depth,
                data_format=FLAGS.data_format)
            if FLAGS.data_format == 'channels_first':
                cls_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred
                ]
                location_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred
                ]

            cls_pred = [
                tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred
            ]
            location_pred = [
                tf.reshape(pred, [-1, 4]) for pred in location_pred
            ]

            with tf.variable_scope('cls_pred'):
                cls_pred = tf.concat(cls_pred, axis=0)
            with tf.variable_scope('location_pred'):
                location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = decode_fn(location_pred)
            bboxes_pred = tf.concat(bboxes_pred, axis=0)
            selected_bboxes, selected_scores = parse_by_class(
                cls_pred, bboxes_pred, FLAGS.num_classes,
                FLAGS.select_threshold, FLAGS.min_size, FLAGS.keep_topk,
                FLAGS.nms_topk, FLAGS.nms_threshold)

            labels_list = []
            scores_list = []
            bboxes_list = []
            for k, v in selected_scores.items():
                labels_list.append(tf.ones_like(v, tf.int32) * k)
                scores_list.append(v)
                bboxes_list.append(selected_bboxes[k])
            all_labels = tf.concat(labels_list, axis=0)
            all_scores = tf.concat(scores_list, axis=0)
            all_bboxes = tf.concat(bboxes_list, axis=0)

        saver = tf.train.Saver()
        '''
        config = tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
        config.mlu_options.data_parallelism = 1
        config.mlu_options.model_parallelism = 1
        config.mlu_options.core_num = 1
        config.mlu_options.core_version = 'MLU270'
        config.mlu_options.precision = 'float'
        with tf.Session(config = config) as sess:
        '''
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)

            saver.restore(sess, get_checkpoint())

            _R_MEAN = 123.68
            _G_MEAN = 116.78
            _B_MEAN = 103.94
            means = [
                _B_MEAN,
                _G_MEAN,
                _R_MEAN,
            ]
            np_image = cv2.imread('demo/test.jpg')
            image = cv2.resize(
                np_image, (FLAGS.train_image_size, FLAGS.train_image_size))
            image = (image - means)  # / 255.0
            image = np.expand_dims(image, axis=0)
            print('image', type(image), image.shape)
            '''
            image = tf.to_float(np_image)
            image = tf.image.resize_images(image, out_shape,
                                           method=tf.image.ResizeMethod.BILINEAR, align_corners=False)
            image.set_shape(out_shape + [3])
            num_channels = image.get_shape().as_list()[-1]
            channels = tf.split(axis=2, num_or_size_splits=num_channels, value=image)
            for i in range(num_channels):
                channels[i] -= means[i]
            image = tf.concat(axis=2, values=channels)
            image_channels = tf.unstack(image, axis=-1, name='split_rgb')
            image = tf.stack([image_channels[2], image_channels[1], image_channels[0]], axis=-1, name='merge_bgr')
            '''

            labels_, scores_, bboxes_ = sess.run(
                [all_labels, all_scores, all_bboxes],
                feed_dict={image_input: image})
            #print('labels_', labels_, type(labels_), labels_.shape)
            #print('scores_', scores_, type(scores_), scores_.shape)
            #print('bboxes_', bboxes_, type(bboxes_), bboxes_.shape, bboxes_.shape[0])

            img_to_draw = draw_toolbox.bboxes_draw_on_img(np_image,
                                                          labels_,
                                                          scores_,
                                                          bboxes_,
                                                          thickness=2)
            cv2.imwrite('demo/test_out.jpg', img_to_draw)
            saver.save(sess,
                       'model/ssd300_vgg16/ssd300_vgg16_short',
                       global_step=0)
Beispiel #19
0
def main(_):
    with tf.Graph().as_default():
        out_shape = [FLAGS.train_image_size] * 2

        image_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
        shape_input = tf.placeholder(tf.int32, shape=(2, ))

        features = ssd_preprocessing.preprocess_for_eval(
            image_input,
            out_shape,
            data_format=FLAGS.data_format,
            output_rgb=False)
        features = tf.expand_dims(features, axis=0)

        anchor_creator = anchor_manipulator.AnchorCreator(
            out_shape,
            layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3),
                           (1, 1)],
            anchor_scales=[(0.1, ), (0.2, ), (0.375, ), (0.55, ), (0.725, ),
                           (0.9, )],
            extra_anchor_scales=[(0.1414, ), (0.2739, ), (0.4541, ),
                                 (0.6315, ), (0.8078, ), (0.9836, )],
            anchor_ratios=[(2., .5),
                           (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
                           (2., 3., .5, 0.3333), (2., .5), (2., .5)],
            layer_steps=[8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors(
        )

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
            allowed_borders=[1.0] * 6,
            positive_threshold=None,
            ignore_threshold=None,
            prior_scaling=[0.1, 0.1, 0.2, 0.2])

        decode_fn = lambda pred: anchor_encoder_decoder.ext_decode_all_anchors(
            pred, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)

        with tf.variable_scope(FLAGS.model_scope,
                               default_name=None,
                               values=[features],
                               reuse=tf.AUTO_REUSE):
            backbone = ssd_net.VGG16Backbone(FLAGS.data_format)
            feature_layers = backbone.forward(features, training=False)
            location_pred, cls_pred = ssd_net.multibox_head(
                feature_layers,
                FLAGS.num_classes,
                all_num_anchors_depth,
                data_format=FLAGS.data_format)
            if FLAGS.data_format == 'channels_first':
                cls_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred
                ]
                location_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred
                ]

            cls_pred = [
                tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred
            ]
            location_pred = [
                tf.reshape(pred, [-1, 4]) for pred in location_pred
            ]

            cls_pred = tf.concat(cls_pred, axis=0)
            location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = decode_fn(location_pred)
            bboxes_pred = tf.concat(bboxes_pred, axis=0)
            selected_bboxes, selected_scores = parse_by_class(
                cls_pred, bboxes_pred, FLAGS.num_classes,
                FLAGS.select_threshold, FLAGS.min_size, FLAGS.keep_topk,
                FLAGS.nms_topk, FLAGS.nms_threshold)

            labels_list = []
            scores_list = []
            bboxes_list = []
            for k, v in selected_scores.items():
                labels_list.append(tf.ones_like(v, tf.int32) * k)
                scores_list.append(v)
                bboxes_list.append(selected_bboxes[k])
            all_labels = tf.concat(labels_list, axis=0)
            all_scores = tf.concat(scores_list, axis=0)
            all_bboxes = tf.concat(bboxes_list, axis=0)

        saver = tf.train.Saver()
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)

            saver.restore(sess, get_checkpoint())

            np_image = imread('./demo/test.jpg')
            labels_, scores_, bboxes_ = sess.run(
                [all_labels, all_scores, all_bboxes],
                feed_dict={
                    image_input: np_image,
                    shape_input: np_image.shape[:-1]
                })

            img_to_draw = draw_toolbox.bboxes_draw_on_img(np_image,
                                                          labels_,
                                                          scores_,
                                                          bboxes_,
                                                          thickness=2)
            imsave('./demo/test_out.jpg', img_to_draw)
Beispiel #20
0
    def input_fn():
        # train_image_size = 300, so target_shape = [300, 300]
        target_shape = [FLAGS.train_image_size] * 2

        # match_threshold: 0.5
        # neg_threshold: 0.5
        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
            positive_threshold=FLAGS.match_threshold,
            ignore_threshold=FLAGS.neg_threshold,
            prior_scaling=[0.1, 0.1, 0.2, 0.2])

        all_anchor_scales = [(30., ), (60., ), (112.5, ), (165., ), (217.5, ),
                             (270., )]
        all_extra_scales = [(42.43, ), (82.17, ), (136.23, ), (189.45, ),
                            (242.34, ), (295.08, )]
        all_anchor_ratios = [(1., 2., .5), (1., 2., 3., .5, 0.3333),
                             (1., 2., 3., .5, 0.3333),
                             (1., 2., 3., .5, 0.3333), (1., 2., .5),
                             (1., 2., .5)]
        all_layer_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3),
                            (1, 1)]
        all_layer_strides = [8, 16, 32, 64, 100, 300]
        total_layers = len(all_layer_shapes)
        anchors_height = list()
        anchors_width = list()
        anchors_depth = list()
        for ind in range(total_layers):
            # if this layer has n default prior boxes, anchors_height holds the heights of those boxes and _anchor_depth is n
            _anchors_height, _anchors_width, _anchor_depth = anchor_encoder_decoder.get_anchors_width_height(
                all_anchor_scales[ind],
                all_extra_scales[ind],
                all_anchor_ratios[ind],
                name='get_anchors_width_height{}'.format(ind))
            anchors_height.append(_anchors_height)
            anchors_width.append(_anchors_width)
            anchors_depth.append(_anchor_depth)
        # anchors_ymin: [38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4]
        anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax, inside_mask = anchor_encoder_decoder.get_all_anchors(
            target_shape, anchors_height, anchors_width, anchors_depth,
            [0.5] * total_layers, all_layer_shapes, all_layer_strides,
            [FLAGS.train_image_size * 1.] * total_layers,
            [False] * total_layers)

        num_anchors_per_layer = list()
        #all_layer_shapes [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
        for ind, layer_shape in enumerate(all_layer_shapes):
            # num_anchors_per_layer = layer_shape[0] * layer_shape[1] * anchors_depth[ind]
            _, _num_anchors_per_layer = anchor_encoder_decoder.get_anchors_count(
                anchors_depth[ind],
                layer_shape,
                name='get_anchor_count{}'.format(ind))
            num_anchors_per_layer.append(_num_anchors_per_layer)
        # num_anchors_per_layer: [38*38*4, 19*19*6, 10*10*6, 5*5*6, 3*3*4, 1*1*4]
        image_preprocessing_fn = lambda image_, labels_, bboxes_: ssd_preprocessing.preprocess_image(
            image_,
            labels_,
            bboxes_,
            target_shape,
            is_training=is_training,
            data_format=FLAGS.data_format,
            output_rgb=False)
        anchor_encoder_fn = lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_anchors(
            glabels_, gbboxes_, anchors_ymin, anchors_xmin, anchors_ymax,
            anchors_xmax, inside_mask)

        image, _, shape, loc_targets, cls_targets, match_scores = dataset_common.slim_get_batch(
            FLAGS.num_classes,
            batch_size, ('train' if is_training else 'val'),
            os.path.join(FLAGS.data_dir, dataset_pattern),
            FLAGS.num_readers,
            FLAGS.num_preprocessing_threads,
            image_preprocessing_fn,
            anchor_encoder_fn,
            num_epochs=FLAGS.train_epochs,
            is_training=is_training)
        global global_anchor_info
        global_anchor_info = {
            'decode_fn':
            lambda pred: anchor_encoder_decoder.batch_decode_anchors(
                pred, anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax),
            'num_anchors_per_layer':
            num_anchors_per_layer,
            'all_num_anchors_depth':
            anchors_depth
        }

        return image, {
            'shape': shape,
            'loc_targets': loc_targets,
            'cls_targets': cls_targets,
            'match_scores': match_scores
        }
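The global_anchor_info dict populated above is what the training-side loss code reads back. An illustrative fragment of that consumer pattern; apart from the dict keys, every name here is an assumption:

import tensorflow as tf

def decode_and_regroup(loc_preds, cls_preds, info):
    # loc_preds: [batch, total_anchors, 4], cls_preds: [batch, total_anchors, num_classes]
    bboxes_pred = info['decode_fn'](loc_preds)  # back to (ymin, xmin, ymax, xmax)
    # regroup the flat per-anchor tensors into per-feature-layer chunks
    per_layer_cls = tf.split(cls_preds, info['num_anchors_per_layer'], axis=1)
    return bboxes_pred, per_layer_cls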