Example #1
    def input_fn():
        with tf.name_scope('post_forward'):
            out_shape = [FLAGS.train_image_size] * 2
            anchor_creator = anchor_manipulator.AnchorCreator(
                out_shape,
                layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3),
                               (1, 1)],
                anchor_scales=[(0.1, ), (0.2, ), (0.375, ), (0.55, ),
                               (0.725, ), (0.9, )],
                extra_anchor_scales=[(0.1414, ), (0.2739, ), (0.4541, ),
                                     (0.6315, ), (0.8078, ), (0.9836, )],
                anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333),
                               (1., 2., 3., .5, 0.3333),
                               (1., 2., 3., .5, 0.3333), (1., 2., .5),
                               (1., 2., .5)],
                layer_steps=[8, 16, 32, 64, 100, 300])
            all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

            num_anchors_per_layer = []
            for ind in range(len(all_anchors)):
                num_anchors_per_layer.append(all_num_anchors_depth[ind] *
                                             all_num_anchors_spatial[ind])

            anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
                allowed_borders=[1.0] * 6,
                positive_threshold=FLAGS.match_threshold,
                ignore_threshold=FLAGS.neg_threshold,
                prior_scaling=[0.1, 0.1, 0.2, 0.2])

            image_preprocessing_fn = lambda image_, labels_, bboxes_: ssd_preprocessing.preprocess_image(
                image_,
                labels_,
                bboxes_,
                out_shape,
                is_training=is_training,
                data_format=FLAGS.data_format,
                output_rgb=False)

            # `training_files` is expected to be defined in the enclosing scope.
            dataset = tf.data.TFRecordDataset(training_files)
            dataset = dataset.map(lambda x: data_mapping_fn(
                x, is_training, image_preprocessing_fn))
            dataset = dataset.repeat()  # repeat the input indefinitely
            dataset = dataset.batch(batch_size)  # set the batch size
            return dataset
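A minimal sketch of how such an input_fn is typically handed to a TF 1.x Estimator; `ssd_model_fn` and the two flag names here are assumptions, not part of this listing:

    # Hypothetical wiring (assumes the listing's usual `import tensorflow as tf`):
    estimator = tf.estimator.Estimator(
        model_fn=ssd_model_fn,        # assumed SSD model_fn defined elsewhere
        model_dir=FLAGS.model_dir)    # assumed flag
    estimator.train(input_fn=input_fn, max_steps=FLAGS.max_number_of_steps)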
Example #2
    def input_fn():
        out_shape = [FLAGS.train_image_size] * 2
        ssd300_anchor_params = {
            'layers_shapes': [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
            'anchor_scales': [(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
            'extra_anchor_scales': [(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)],
            'anchor_ratios': [(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                              (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
            'layer_steps': [8, 16, 32, 64, 100, 300]}
        ssd512_anchor_params = {
            'layers_shapes': [(64, 64), (32, 32), (16, 16), (8, 8), (4, 4), (2, 2), (1, 1)],
            'anchor_scales': [(0.07,), (0.15,), (0.3,), (0.45,), (0.6,), (0.75,), (0.9,)],
            'extra_anchor_scales': [(0.1025,), (0.2121,), (0.3674,), (0.5196,), (0.6708,), (0.8216,), (0.9721,)],
            'anchor_ratios': [(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                              (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
            'layer_steps': [8, 16, 32, 64, 128, 256, 512]}
        if FLAGS.train_image_size == 512:
            net_params = ssd512_anchor_params
            print('using ssd512 model')
        else:
            net_params = ssd300_anchor_params
            print('using ssd300 model')
        anchor_creator = anchor_manipulator.AnchorCreator(out_shape, **net_params)
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

        num_anchors_per_layer = []
        for ind in range(len(all_anchors)):
            num_anchors_per_layer.append(all_num_anchors_depth[ind] * all_num_anchors_spatial[ind])

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders = [1.0] * len(net_params['layer_steps']),
                                                            positive_threshold = FLAGS.match_threshold,
                                                            ignore_threshold = FLAGS.neg_threshold,
                                                            prior_scaling=[0.1, 0.1, 0.2, 0.2])

        image_preprocessing_fn = lambda image_, labels_, bboxes_ : ssd_preprocessing.preprocess_image(image_, labels_, bboxes_, out_shape, is_training=is_training, data_format=FLAGS.data_format, output_rgb=False)
        anchor_encoder_fn = lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(glabels_, gbboxes_, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)

        image, filename, shape, loc_targets, cls_targets, match_scores = dataset_common.slim_get_batch(FLAGS.num_classes,
                                                                                batch_size,
                                                                                ('train' if is_training else 'val'),
                                                                                os.path.join(FLAGS.data_dir, dataset_pattern),
                                                                                FLAGS.num_readers,
                                                                                FLAGS.num_preprocessing_threads,
                                                                                image_preprocessing_fn,
                                                                                anchor_encoder_fn,
                                                                                num_epochs=FLAGS.train_epochs,
                                                                                is_training=is_training)
        global global_anchor_info
        global_anchor_info = {'decode_fn': lambda pred : anchor_encoder_decoder.decode_all_anchors(pred, num_anchors_per_layer),
                            'num_anchors_per_layer': num_anchors_per_layer,
                            'all_num_anchors_depth': all_num_anchors_depth }

        return {'image': image, 'filename': filename, 'shape': shape, 'loc_targets': loc_targets, 'cls_targets': cls_targets, 'match_scores': match_scores}, None
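As a sanity check on the anchor bookkeeping above: in this AnchorCreator the per-layer depth appears to be len(anchor_scales[i]) * len(anchor_ratios[i]) + len(extra_anchor_scales[i]) (an assumption about the implementation, but one that reproduces the well-known SSD300 total):

    # Pure-Python check of the SSD300 anchor count:
    layers = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
    num_ratios = [3, 5, 5, 5, 3, 3]            # len(anchor_ratios[i])
    depths = [1 * r + 1 for r in num_ratios]   # one scale set plus one extra scale
    print(sum(h * w * d for (h, w), d in zip(layers, depths)))  # -> 8732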
Example #3
    def input_fn():
        out_shape = [FLAGS.train_image_size] * 2
        anchor_creator = anchor_manipulator.AnchorCreator(out_shape,
                                                    layers_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
                                                    anchor_scales = [(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
                                                    extra_anchor_scales = [(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)],
                                                    anchor_ratios = [(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
                                                    layer_steps = [8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()
        # all_anchors holds one entry per layer, e.g. [(38x38x1) y-centers, (38x38x1) x-centers,
        # (4x1) heights, (4x1) widths] for the first layer -- all the anchor information.

        num_anchors_per_layer = []
        for ind in range(len(all_anchors)):
            num_anchors_per_layer.append(all_num_anchors_depth[ind] * all_num_anchors_spatial[ind])

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders = [1.0] * 6,
                                                            positive_threshold = FLAGS.match_threshold,
                                                            ignore_threshold = FLAGS.neg_threshold,
                                                            prior_scaling=[0.1, 0.1, 0.2, 0.2])

        image_preprocessing_fn = lambda image_, labels_, bboxes_ : ssd_preprocessing.preprocess_image(image_, labels_, bboxes_, out_shape, is_training=is_training, data_format=FLAGS.data_format, output_rgb=False)
        
        anchor_encoder_fn = lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(glabels_, gbboxes_, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)
        
        anchor_decoder_fn = lambda pred : anchor_encoder_decoder.decode_all_anchors(pred, num_anchors_per_layer)

        image, _, shape, loc_targets, cls_targets, match_scores = dataset_common.slim_get_batch(FLAGS.num_classes,
                                                                                batch_size,
                                                                                ('train' if is_training else 'val'),
                                                                                os.path.join(FLAGS.data_dir, dataset_pattern),
                                                                                FLAGS.num_readers,
                                                                                FLAGS.num_preprocessing_threads,
                                                                                image_preprocessing_fn,
                                                                                anchor_encoder_fn,
                                                                                num_epochs=FLAGS.train_epochs,
                                                                                is_training=is_training)
        global global_anchor_info
        global_anchor_info = {'decode_fn': anchor_decoder_fn,
                            'num_anchors_per_layer': num_anchors_per_layer,
                            'all_num_anchors_depth': all_num_anchors_depth }

        return image, {'shape': shape, 'loc_targets': loc_targets, 'cls_targets': cls_targets, 'match_scores': match_scores}
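The stashed `global_anchor_info` is what the model function reads back later; a hedged sketch of that consumption (the zero tensor merely stands in for the network's location predictions, and the per-layer list shape of the result is inferred from the tf.split usage in Example #6):

    decode_fn = global_anchor_info['decode_fn']
    num_anchors_per_layer = global_anchor_info['num_anchors_per_layer']
    loc_pred = tf.zeros([sum(num_anchors_per_layer), 4])  # stand-in predictions
    bboxes_per_layer = decode_fn(loc_pred)  # presumably one bbox tensor per layer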
Example #4
def main(_):
    with tf.Graph().as_default():
        out_shape = [FLAGS.train_image_size] * 2

        image_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
        shape_input = tf.placeholder(tf.int32, shape=(2, ))

        features = ssd_preprocessing.preprocess_for_eval(
            image_input,
            out_shape,
            data_format=FLAGS.data_format,
            output_rgb=False)
        features = tf.expand_dims(features, axis=0)

        anchor_creator = anchor_manipulator.AnchorCreator(
            out_shape,
            layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3),
                           (1, 1)],
            anchor_scales=[(0.1, ), (0.2, ), (0.375, ), (0.55, ), (0.725, ),
                           (0.9, )],
            extra_anchor_scales=[(0.1414, ), (0.2739, ), (0.4541, ),
                                 (0.6315, ), (0.8078, ), (0.9836, )],
            anchor_ratios=[(2., .5),
                           (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
                           (2., 3., .5, 0.3333), (2., .5), (2., .5)],
            layer_steps=[8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
            allowed_borders=[1.0] * 6,
            positive_threshold=None,
            ignore_threshold=None,
            prior_scaling=[0.1, 0.1, 0.2, 0.2])

        decode_fn = lambda pred: anchor_encoder_decoder.ext_decode_all_anchors(
            pred, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)

        with tf.variable_scope(FLAGS.model_scope,
                               default_name=None,
                               values=[features],
                               reuse=tf.AUTO_REUSE):
            backbone = ssd_net.VGG16Backbone(FLAGS.data_format)
            feature_layers = backbone.forward(features, training=False)
            location_pred, cls_pred = ssd_net.multibox_head(
                feature_layers,
                FLAGS.num_classes,
                all_num_anchors_depth,
                data_format=FLAGS.data_format)
            if FLAGS.data_format == 'channels_first':
                cls_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred
                ]
                location_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred
                ]

            cls_pred = [
                tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred
            ]
            location_pred = [
                tf.reshape(pred, [-1, 4]) for pred in location_pred
            ]

            cls_pred = tf.concat(cls_pred, axis=0)
            location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = decode_fn(location_pred)
            bboxes_pred = tf.concat(bboxes_pred, axis=0)
            selected_bboxes, selected_scores = parse_by_class(
                cls_pred, bboxes_pred, FLAGS.num_classes,
                FLAGS.select_threshold, FLAGS.min_size, FLAGS.keep_topk,
                FLAGS.nms_topk, FLAGS.nms_threshold)

            labels_list = []
            scores_list = []
            bboxes_list = []
            for k, v in selected_scores.items():
                labels_list.append(tf.ones_like(v, tf.int32) * k)
                scores_list.append(v)
                bboxes_list.append(selected_bboxes[k])
            all_labels = tf.concat(labels_list, axis=0)
            all_scores = tf.concat(scores_list, axis=0)
            all_bboxes = tf.concat(bboxes_list, axis=0)

        saver = tf.train.Saver()
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)

            saver.restore(sess, get_checkpoint())

            np_image = imread('./demo/test.jpg')
            labels_, scores_, bboxes_ = sess.run(
                [all_labels, all_scores, all_bboxes],
                feed_dict={
                    image_input: np_image,
                    shape_input: np_image.shape[:-1]
                })

            img_to_draw = draw_toolbox.bboxes_draw_on_img(np_image,
                                                          labels_,
                                                          scores_,
                                                          bboxes_,
                                                          thickness=2)
            imsave('./demo/test_out.jpg', img_to_draw)
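The demos here call imread/imsave without showing their import; they most likely rely on the old scipy.misc API. A shim that also works on newer environments (imageio as a drop-in is an assumption):

    try:
        from scipy.misc import imread, imsave   # SciPy < 1.2
    except ImportError:
        from imageio import imread, imsave      # assumed drop-in replacement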
Example #5
def main(_):
    with tf.Graph().as_default():
        out_shape = [FLAGS.train_image_size] * 2

        with tf.name_scope('define_input'):
            image_input = tf.placeholder(tf.uint8,
                                         shape=(None, None, 3),
                                         name='image_input')

        features = ssd_preprocessing.preprocess_for_eval(
            image_input,
            out_shape,
            data_format=FLAGS.data_format,
            output_rgb=False)
        features = tf.expand_dims(features, axis=0)

        anchor_creator = anchor_manipulator.AnchorCreator(
            out_shape,
            layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3),
                           (1, 1)],
            anchor_scales=[(0.1, ), (0.2, ), (0.375, ), (0.55, ), (0.725, ),
                           (0.9, )],
            extra_anchor_scales=[(0.1414, ), (0.2739, ), (0.4541, ),
                                 (0.6315, ), (0.8078, ), (0.9836, )],
            anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333),
                           (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                           (1., 2., .5), (1., 2., .5)],
            #anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
            #(2., 3., .5, 0.3333), (2., .5), (2., .5)],
            layer_steps=[8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
            allowed_borders=[1.0] * 6,
            positive_threshold=None,
            ignore_threshold=None,
            prior_scaling=[0.1, 0.1, 0.2, 0.2])

        def decode_fn(pred):
            return anchor_encoder_decoder.ext_decode_all_anchors(
                pred, all_anchors, all_num_anchors_depth,
                all_num_anchors_spatial)

        with tf.variable_scope(FLAGS.model_scope,
                               default_name=None,
                               values=[features],
                               reuse=tf.AUTO_REUSE):
            backbone = ssd_net.VGG16Backbone(FLAGS.data_format)
            feature_layers = backbone.forward(features, training=False)
            location_pred, cls_pred = ssd_net.multibox_head(
                feature_layers,
                FLAGS.num_classes,
                all_num_anchors_depth,
                data_format=FLAGS.data_format)
            if FLAGS.data_format == 'channels_first':
                cls_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred
                ]
                location_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred
                ]

            cls_pred = [
                tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred
            ]
            location_pred = [
                tf.reshape(pred, [-1, 4]) for pred in location_pred
            ]

            with tf.variable_scope('cls_pred'):
                cls_pred = tf.concat(cls_pred, axis=0)
            with tf.variable_scope('location_pred'):
                location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = decode_fn(location_pred)
            bboxes_pred = tf.concat(bboxes_pred, axis=0)
            selected_bboxes, selected_scores = parse_by_class(
                cls_pred, bboxes_pred, FLAGS.num_classes,
                FLAGS.select_threshold, FLAGS.min_size, FLAGS.keep_topk,
                FLAGS.nms_topk, FLAGS.nms_threshold)

            labels_list = []
            scores_list = []
            bboxes_list = []
            for k, v in selected_scores.items():
                labels_list.append(tf.ones_like(v, tf.int32) * k)
                scores_list.append(v)
                bboxes_list.append(selected_bboxes[k])
            all_labels = tf.concat(labels_list, axis=0)
            all_scores = tf.concat(scores_list, axis=0)
            all_bboxes = tf.concat(bboxes_list, axis=0)

        saver = tf.train.Saver()
        '''
        config = tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
        config.mlu_options.data_parallelism = 1
        config.mlu_options.model_parallelism = 1
        config.mlu_options.core_num = 1
        config.mlu_options.core_version = 'MLU270'
        config.mlu_options.precision = 'float'
        with tf.Session(config = config) as sess:
        '''
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)

            saver.restore(sess, get_checkpoint())

            np_image = imread('demo/test.jpg')
            labels_, scores_, bboxes_ = sess.run(
                [all_labels, all_scores, all_bboxes],
                feed_dict={image_input: np_image})

            img_to_draw = draw_toolbox.bboxes_draw_on_img(np_image,
                                                          labels_,
                                                          scores_,
                                                          bboxes_,
                                                          thickness=2)
            imsave('demo/test_out.jpg', img_to_draw)
            saver.save(sess, 'model/ssd300_vgg16/ssd300_vgg16', global_step=0)
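For plain CPU/GPU runs, the commented-out MLU block above can be replaced by an ordinary session config, e.g.:

    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True   # allocate GPU memory on demand
    # then: with tf.Session(config=config) as sess: ...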
Example #6
def slim_get_split(file_pattern='{}_????'):
    # Features in Pascal VOC TFRecords.
    keys_to_features = {
        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
        'image/height': tf.FixedLenFeature([1], tf.int64),
        'image/width': tf.FixedLenFeature([1], tf.int64),
        'image/channels': tf.FixedLenFeature([1], tf.int64),
        'image/shape': tf.FixedLenFeature([3], tf.int64),
        'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64),
    }
    items_to_handlers = {
        'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'),
        'shape': slim.tfexample_decoder.Tensor('image/shape'),
        'object/bbox': slim.tfexample_decoder.BoundingBox(
            ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),
        'object/label': slim.tfexample_decoder.Tensor('image/object/bbox/label'),
        'object/difficult': slim.tfexample_decoder.Tensor('image/object/bbox/difficult'),
        'object/truncated': slim.tfexample_decoder.Tensor('image/object/bbox/truncated'),
    }
    decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features, items_to_handlers)

    dataset = slim.dataset.Dataset(
        data_sources=file_pattern,
        reader=tf.TFRecordReader,
        decoder=decoder,
        num_samples=100,
        items_to_descriptions=None,
        num_classes=21,
        labels_to_names=None)

    with tf.name_scope('dataset_data_provider'):
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            num_readers=2,
            common_queue_capacity=32,
            common_queue_min=8,
            shuffle=True,
            num_epochs=1)

    [org_image, shape, glabels_raw, gbboxes_raw, isdifficult] = provider.get(['image', 'shape',
                                                                              'object/label',
                                                                              'object/bbox',
                                                                              'object/difficult'])
    image, glabels, gbboxes = ssd_preprocessing.preprocess_image(org_image, glabels_raw, gbboxes_raw, [300, 300],
                                                                 is_training=True, data_format='channels_last',
                                                                 output_rgb=True)

    anchor_creator = anchor_manipulator.AnchorCreator([300] * 2,
                                                      layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3),
                                                                     (1, 1)],
                                                      anchor_scales=[(0.1,), (0.2,), (0.375,), (0.55,), (0.725,),
                                                                     (0.9,)],
                                                      extra_anchor_scales=[(0.1414,), (0.2739,), (0.4541,), (0.6315,),
                                                                           (0.8078,), (0.9836,)],
                                                      anchor_ratios=[(2., .5), (2., 3., .5, 0.3333),
                                                                     (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
                                                                     (2., .5), (2., .5)],
                                                      layer_steps=[8, 16, 32, 64, 100, 300])

    all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

    num_anchors_per_layer = []
    for ind in range(len(all_anchors)):
        num_anchors_per_layer.append(all_num_anchors_depth[ind] * all_num_anchors_spatial[ind])

    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders=[1.0] * 6,
                                                              positive_threshold=0.5,
                                                              ignore_threshold=0.5,
                                                              prior_scaling=[0.1, 0.1, 0.2, 0.2])

    gt_targets, gt_labels, gt_scores = anchor_encoder_decoder.encode_all_anchors(glabels, gbboxes, all_anchors,
                                                                                 all_num_anchors_depth,
                                                                                 all_num_anchors_spatial, True)

    anchors = anchor_encoder_decoder._all_anchors
    # split by layers
    gt_targets, gt_labels, gt_scores, anchors = tf.split(gt_targets, num_anchors_per_layer, axis=0), \
                                                tf.split(gt_labels, num_anchors_per_layer, axis=0), \
                                                tf.split(gt_scores, num_anchors_per_layer, axis=0), \
                                                [tf.split(anchor, num_anchors_per_layer, axis=0) for anchor in anchors]

    save_image_op = tf.py_func(save_image_with_bbox,
                               [ssd_preprocessing.unwhiten_image(image),
                                tf.clip_by_value(tf.concat(gt_labels, axis=0), 0, tf.int64.max),
                                tf.concat(gt_scores, axis=0),
                                tf.concat(gt_targets, axis=0)],
                               tf.int64, stateful=True)
    return save_image_op
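A hedged usage sketch for slim_get_split: the slim provider is queue-based, so the session needs local-variable initialization (for the num_epochs counter) and running queue runners. The file pattern below is a placeholder:

    save_image_op = slim_get_split('./dataset/tfrecords/train_????')
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(),
                  tf.local_variables_initializer()])
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            for _ in range(10):      # dump a few annotated images
                sess.run(save_image_op)
        finally:
            coord.request_stop()
            coord.join(threads)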
Example #7
def data_mapping_fn(example_proto, is_training, image_preprocessing_fn):
    keys_to_features = {
        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format': tf.FixedLenFeature((), tf.string,
                                           default_value='jpeg'),
        'image/filename': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/height': tf.FixedLenFeature([1], tf.int64),
        'image/width': tf.FixedLenFeature([1], tf.int64),
        'image/channels': tf.FixedLenFeature([1], tf.int64),
        'image/shape': tf.FixedLenFeature([3], tf.int64),
        'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64),
    }
    items_to_handlers = {
        'image':
        slim.tfexample_decoder.Image('image/encoded', 'image/format'),
        'filename':
        slim.tfexample_decoder.Tensor('image/filename'),
        'shape':
        slim.tfexample_decoder.Tensor('image/shape'),
        'object/bbox':
        slim.tfexample_decoder.BoundingBox(['ymin', 'xmin', 'ymax', 'xmax'],
                                           'image/object/bbox/'),
        'object/label':
        slim.tfexample_decoder.Tensor('image/object/bbox/label'),
        'object/difficult':
        slim.tfexample_decoder.Tensor('image/object/bbox/difficult'),
        'object/truncated':
        slim.tfexample_decoder.Tensor('image/object/bbox/truncated'),
    }
    decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features,
                                                      items_to_handlers)
    [org_image, filename, shape, glabels_raw, gbboxes_raw,
     isdifficult] = decoder.decode(example_proto, [
         'image', 'filename', 'shape', 'object/label', 'object/bbox',
         'object/difficult'
     ])

    if is_training:
        # if every box is marked difficult, keep the first one anyway
        isdifficult_mask = tf.cond(
            tf.count_nonzero(isdifficult,
                             dtype=tf.int32) < tf.shape(isdifficult)[0],
            lambda: isdifficult < tf.ones_like(isdifficult),
            lambda: tf.one_hot(0,
                               tf.shape(isdifficult)[0],
                               on_value=True,
                               off_value=False,
                               dtype=tf.bool))

        glabels_raw = tf.boolean_mask(glabels_raw, isdifficult_mask)
        gbboxes_raw = tf.boolean_mask(gbboxes_raw, isdifficult_mask)

    # Pre-processing image, labels and bboxes.
    out_shape = [FLAGS.train_image_size] * 2
    anchor_creator = anchor_manipulator.AnchorCreator(
        out_shape,
        layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
        anchor_scales=[(0.1, ), (0.2, ), (0.375, ), (0.55, ), (0.725, ),
                       (0.9, )],
        extra_anchor_scales=[(0.1414, ), (0.2739, ), (0.4541, ), (0.6315, ),
                             (0.8078, ), (0.9836, )],
        anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333),
                       (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                       (1., 2., .5), (1., 2., .5)],
        layer_steps=[8, 16, 32, 64, 100, 300])

    all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

    num_anchors_per_layer = []
    for ind in range(len(all_anchors)):
        num_anchors_per_layer.append(all_num_anchors_depth[ind] *
                                     all_num_anchors_spatial[ind])

    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
        allowed_borders=[1.0] * 6,
        positive_threshold=FLAGS.match_threshold,
        ignore_threshold=FLAGS.neg_threshold,
        prior_scaling=[0.1, 0.1, 0.2, 0.2])

    global global_anchor_info
    global_anchor_info = {
        'decode_fn':
        lambda pred: anchor_encoder_decoder.decode_all_anchors(
            pred, num_anchors_per_layer),
        'num_anchors_per_layer':
        num_anchors_per_layer,
        'all_num_anchors_depth':
        all_num_anchors_depth,
        'encode_fn':
        lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(
            glabels_, gbboxes_, all_anchors, all_num_anchors_depth,
            all_num_anchors_spatial)
    }

    if is_training:
        image, glabels, gbboxes = image_preprocessing_fn(
            org_image, glabels_raw, gbboxes_raw)
    else:
        image = image_preprocessing_fn(org_image, glabels_raw, gbboxes_raw)
        glabels, gbboxes = glabels_raw, gbboxes_raw

    gt_targets, gt_labels, gt_scores = global_anchor_info['encode_fn'](glabels,
                                                                       gbboxes)

    # return [image, filename, shape, gt_targets, gt_labels, gt_scores]
    return image, {
        'shape': shape,
        'loc_targets': gt_targets,
        'cls_targets': gt_labels,
        'match_scores': gt_scores
    }
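The tf.cond that filters difficult boxes near the top of data_mapping_fn is easy to misread; the same logic checked eagerly on a toy case (illustration only, values made up):

    import numpy as np
    isdifficult = np.array([1, 0, 1])
    if np.count_nonzero(isdifficult) < isdifficult.shape[0]:
        mask = isdifficult < 1                          # keep non-difficult boxes
    else:
        mask = np.arange(isdifficult.shape[0]) == 0     # all difficult: keep first
    print(mask)  # -> [False  True False]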
Example #8
def main(_):
    with tf.Graph().as_default():
        out_shape = [FLAGS.train_image_size] * 2

        image_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
        shape_input = tf.placeholder(tf.int32, shape=(2, ))

        features = ssd_preprocessing.preprocess_for_eval(
            image_input,
            out_shape,
            data_format=FLAGS.data_format,
            output_rgb=False)
        features = tf.expand_dims(features, axis=0)  # add the batch dimension

        anchor_creator = anchor_manipulator.AnchorCreator(
            out_shape,
            layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3),
                           (1, 1)],
            anchor_scales=[(0.1, ), (0.2, ), (0.375, ), (0.55, ), (0.725, ),
                           (0.9, )],
            extra_anchor_scales=[(0.1414, ), (0.2739, ), (0.4541, ),
                                 (0.6315, ), (0.8078, ), (0.9836, )],
            anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333),
                           (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                           (1., 2., .5), (1., 2., .5)],
            #anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., .5), (2., .5)],
            layer_steps=[8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
            allowed_borders=[1.0] * 6,
            positive_threshold=None,
            ignore_threshold=None,
            prior_scaling=[0.1, 0.1, 0.2, 0.2])

        decode_fn = lambda pred: anchor_encoder_decoder.ext_decode_all_anchors(
            pred, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)

        with tf.variable_scope(FLAGS.model_scope,
                               default_name=None,
                               values=[features],
                               reuse=tf.AUTO_REUSE):
            #backbone = ssd_net.VGG16Backbone(FLAGS.data_format)
            backbone = ssd_net.MobileNetV2Backbone(FLAGS.data_format)
            feature_layers = backbone.forward(features, training=False)
            location_pred, cls_pred = ssd_net.multibox_head(
                feature_layers,
                FLAGS.num_classes,
                all_num_anchors_depth,
                data_format=FLAGS.data_format)

            if FLAGS.data_format == 'channels_first':
                cls_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred
                ]
                location_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred
                ]

            cls_pred = [
                tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred
            ]
            location_pred = [
                tf.reshape(pred, [-1, 4]) for pred in location_pred
            ]

            cls_pred = tf.concat(cls_pred, axis=0)
            location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = decode_fn(location_pred)
            bboxes_pred = tf.concat(bboxes_pred, axis=0)
            selected_bboxes, selected_scores = parse_by_class(
                cls_pred, bboxes_pred, FLAGS.num_classes,
                FLAGS.select_threshold, FLAGS.min_size, FLAGS.keep_topk,
                FLAGS.nms_topk, FLAGS.nms_threshold)

            labels_list = []
            scores_list = []
            bboxes_list = []
            for k, v in selected_scores.items():
                labels_list.append(tf.ones_like(v, tf.int32) * k)
                scores_list.append(v)
                bboxes_list.append(selected_bboxes[k])
            all_labels = tf.concat(labels_list, axis=0)
            all_scores = tf.concat(scores_list, axis=0)
            all_bboxes = tf.concat(bboxes_list, axis=0)

        saver = tf.train.Saver()
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)

            saver.restore(sess, get_checkpoint())

            while video.isOpened():
                ret, frame = video.read()
                if not ret:
                    break
                else:
                    timer2 = cv2.getTickCount()
                    if undistort == 'y':
                        ######################################## Undistortion Parts ########################################
                        dim2 = None
                        dim3 = None

                        timer = cv2.getTickCount()

                        # dim1 is the (width, height) of the input image to undistort
                        dim1 = frame.shape[:2][::-1]
                        assert dim1[0] / dim1[1] == DIM[0] / DIM[1], \
                            "Image to undistort needs to have the same aspect ratio as the ones used in calibration"
                        if not dim2:
                            dim2 = dim1
                        if not dim3:
                            dim3 = dim1
                        # K scales with the image dimension, except that K[2][2] is always 1.0
                        scaled_K = K * dim1[0] / DIM[0]
                        scaled_K[2][2] = 1.0
                        # scaled_K, dim2 and balance together determine the final K used to
                        # undistort the image (the OpenCV docs don't make this clear)

                        new_K = cv2.fisheye.estimateNewCameraMatrixForUndistortRectify(
                            scaled_K, D, dim2, np.eye(3), balance=0)
                        map1, map2 = cv2.fisheye.initUndistortRectifyMap(
                            scaled_K, D, np.eye(3), new_K, dim3, cv2.CV_16SC2)

                        frame_r = cv2.remap(frame,
                                            map1,
                                            map2,
                                            interpolation=cv2.INTER_LINEAR,
                                            borderMode=cv2.BORDER_CONSTANT)

                        t = (cv2.getTickCount() -
                             timer) / cv2.getTickFrequency()

                        # frame_r = cv2.resize(dst, (640, 360))
                        # frame_r = cv2.putText(frame_r, "Undistortion processing time: %.3f sec" % t, (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.5,(0, 255, 255), 2) #(1.0 / (end - start))
                        # ###############################################################################################

                    #np_image = imread('./demo/test.jpg')
                    labels_, scores_, bboxes_ = sess.run(
                        [all_labels, all_scores, all_bboxes],
                        feed_dict={
                            image_input: frame,
                            shape_input: frame.shape[:-1]
                        })

                    img_to_draw = draw_toolbox.bboxes_draw_on_img(frame,
                                                                  labels_,
                                                                  scores_,
                                                                  bboxes_,
                                                                  thickness=2)
                    fps = cv2.getTickFrequency() / (cv2.getTickCount() -
                                                    timer2)
                    img_to_draw = cv2.putText(img_to_draw, "FPS : %.1f" % fps,
                                              (10, 20),
                                              cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                                              (0, 255, 255), 2)  # dst_r
                    cv2.imshow('Object detector', img_to_draw)  # dst_r
                    if cv2.waitKey(1) == ord('q'):
                        break
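The undistortion branch above references calibration globals (DIM, K, D, video, undistort) that the snippet never defines; they presumably look roughly like this (all values here are placeholders, not from the listing):

    import numpy as np
    import cv2

    DIM = (1280, 720)                       # calibration image size (w, h)
    K = np.array([[600.,   0., 640.],
                  [  0., 600., 360.],
                  [  0.,   0.,   1.]])      # camera matrix from calibration
    D = np.array([[-0.05], [0.01], [0.], [0.]])  # fisheye distortion coeffs
    video = cv2.VideoCapture(0)             # or a video file path
    undistort = 'y'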
Example #9
def main(_):
    with tf.Graph().as_default():
        out_shape = [FLAGS.train_image_size] * 2
        anchor_creator = anchor_manipulator.AnchorCreator(
            out_shape,
            layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3),
                           (1, 1)],
            anchor_scales=[(0.1, ), (0.2, ), (0.375, ), (0.55, ), (0.725, ),
                           (0.9, )],
            extra_anchor_scales=[(0.1414, ), (0.2739, ), (0.4541, ),
                                 (0.6315, ), (0.8078, ), (0.9836, )],
            anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333),
                           (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                           (1., 2., .5), (1., 2., .5)],
            #anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
            #(2., 3., .5, 0.3333), (2., .5), (2., .5)],
            layer_steps=[8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
            allowed_borders=[1.0] * 6,
            positive_threshold=None,
            ignore_threshold=None,
            prior_scaling=[0.1, 0.1, 0.2, 0.2])

        def decode_fn(pred):
            return anchor_encoder_decoder.ext_decode_all_anchors(
                pred, all_anchors, all_num_anchors_depth,
                all_num_anchors_spatial)

        with tf.name_scope('define_input'):
            image_input = tf.placeholder(tf.float32,
                                         shape=(1, 300, 300, 3),
                                         name='image_input')
        print('image_input', image_input)
        with tf.variable_scope(FLAGS.model_scope,
                               default_name=None,
                               values=[image_input],
                               reuse=tf.AUTO_REUSE):
            backbone = ssd_net.VGG16Backbone(FLAGS.data_format)
            feature_layers = backbone.forward(image_input, training=False)
            location_pred, cls_pred = ssd_net.multibox_head(
                feature_layers,
                FLAGS.num_classes,
                all_num_anchors_depth,
                data_format=FLAGS.data_format)
            if FLAGS.data_format == 'channels_first':
                cls_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred
                ]
                location_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred
                ]

            cls_pred = [
                tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred
            ]
            location_pred = [
                tf.reshape(pred, [-1, 4]) for pred in location_pred
            ]

            with tf.variable_scope('cls_pred'):
                cls_pred = tf.concat(cls_pred, axis=0)
            with tf.variable_scope('location_pred'):
                location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = decode_fn(location_pred)
            bboxes_pred = tf.concat(bboxes_pred, axis=0)
            selected_bboxes, selected_scores = parse_by_class(
                cls_pred, bboxes_pred, FLAGS.num_classes,
                FLAGS.select_threshold, FLAGS.min_size, FLAGS.keep_topk,
                FLAGS.nms_topk, FLAGS.nms_threshold)

            labels_list = []
            scores_list = []
            bboxes_list = []
            for k, v in selected_scores.items():
                labels_list.append(tf.ones_like(v, tf.int32) * k)
                scores_list.append(v)
                bboxes_list.append(selected_bboxes[k])
            all_labels = tf.concat(labels_list, axis=0)
            all_scores = tf.concat(scores_list, axis=0)
            all_bboxes = tf.concat(bboxes_list, axis=0)

        saver = tf.train.Saver()
        '''
        config = tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
        config.mlu_options.data_parallelism = 1
        config.mlu_options.model_parallelism = 1
        config.mlu_options.core_num = 1
        config.mlu_options.core_version = 'MLU270'
        config.mlu_options.precision = 'float'
        with tf.Session(config = config) as sess:
        '''
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)

            saver.restore(sess, get_checkpoint())

            _R_MEAN = 123.68
            _G_MEAN = 116.78
            _B_MEAN = 103.94
            means = [
                _B_MEAN,
                _G_MEAN,
                _R_MEAN,
            ]
            np_image = cv2.imread('demo/test.jpg')
            image = cv2.resize(
                np_image, (FLAGS.train_image_size, FLAGS.train_image_size))
            image = (image - means)  # / 255.0
            image = np.expand_dims(image, axis=0)
            print('image', type(image), image.shape)
            '''
            image = tf.to_float(np_image)
            image = tf.image.resize_images(image, out_shape,
                                           method=tf.image.ResizeMethod.BILINEAR, align_corners=False)
            image.set_shape(out_shape + [3])
            num_channels = image.get_shape().as_list()[-1]
            channels = tf.split(axis=2, num_or_size_splits=num_channels, value=image)
            for i in range(num_channels):
                channels[i] -= means[i]
            image = tf.concat(axis=2, values=channels)
            image_channels = tf.unstack(image, axis=-1, name='split_rgb')
            image = tf.stack([image_channels[2], image_channels[1], image_channels[0]], axis=-1, name='merge_bgr')
            '''

            labels_, scores_, bboxes_ = sess.run(
                [all_labels, all_scores, all_bboxes],
                feed_dict={image_input: image})

            img_to_draw = draw_toolbox.bboxes_draw_on_img(np_image,
                                                          labels_,
                                                          scores_,
                                                          bboxes_,
                                                          thickness=2)
            cv2.imwrite('demo/test_out.jpg', img_to_draw)
            saver.save(sess,
                       'model/ssd300_vgg16/ssd300_vgg16_short',
                       global_step=0)
Example #10
                        # (the listing starts mid-way through parse_by_class; the left-hand
                        # side of this assignment is reconstructed from context)
                        selected_scores[class_ind], selected_bboxes[
                            class_ind] = nms_bboxes(
                                selected_scores[class_ind],
                                selected_bboxes[class_ind], nms_topk,
                                nms_threshold, 'nms_bboxes_{}'.format(class_ind))

                return selected_bboxes, selected_scores

        out_shape = [300] * 2

        anchor_creator = anchor_manipulator.AnchorCreator(
            out_shape,
            layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3),
                           (1, 1)],
            anchor_scales=[(0.1, ), (0.2, ), (0.375, ), (0.55, ), (0.725, ),
                           (0.9, )],
            extra_anchor_scales=[(0.1414, ), (0.2739, ), (0.4541, ),
                                 (0.6315, ), (0.8078, ), (0.9836, )],
            anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333),
                           (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                           (1., 2., .5), (1., 2., .5)],
            #anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333),
            #(2., 3., .5, 0.3333), (2., .5), (2., .5)],
            layer_steps=[8, 16, 32, 64, 100, 300])

        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()
        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
            allowed_borders=[1.0] * 6,
            positive_threshold=None,
            ignore_threshold=None,
            prior_scaling=[0.1, 0.1, 0.2, 0.2])
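The nms_bboxes helper called in the fragment above is not shown; a plausible minimal version built on tf.image.non_max_suppression (a sketch, not necessarily the repo's exact code):

    def nms_bboxes(scores_pred, bboxes_pred, nms_topk, nms_threshold, name):
        with tf.name_scope(name):
            idxes = tf.image.non_max_suppression(
                bboxes_pred, scores_pred, nms_topk, nms_threshold)
            return tf.gather(scores_pred, idxes), tf.gather(bboxes_pred, idxes)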
Example #11
    def input_fn():
        out_shape = [300, 510]  #[FLAGS.train_image_size] * 2
        anchor_creator = anchor_manipulator.AnchorCreator(
            out_shape,
            layers_shapes=[(38, 64), (19, 32), (10, 16), (5, 8), (3, 6),
                           (1, 4)],
            anchor_scales=[(0.05, ), (0.1, ), (0.2, ), (0.3, ), (0.4, ),
                           (0.5, )],
            extra_anchor_scales=[(0.07, ), (0.1414, ), (0.245, ), (0.346, ),
                                 (0.447, ), (0.547, )],
            anchor_ratios=[(1., ), (1., ), (1., ), (1., ), (1., ), (1., )],
            #anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., .5), (2., .5)],
            layer_steps=[8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

        num_anchors_per_layer = []
        for ind in range(len(all_anchors)):
            num_anchors_per_layer.append(all_num_anchors_depth[ind] *
                                         all_num_anchors_spatial[ind])

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
            allowed_borders=[1.0] * 6,
            positive_threshold=FLAGS.match_threshold,
            ignore_threshold=FLAGS.neg_threshold,
            prior_scaling=[0.1, 0.1, 0.2, 0.2])

        image_preprocessing_fn = lambda image_, labels_, bboxes_: ssd_preprocessing.preprocess_image(
            image_,
            labels_,
            bboxes_,
            out_shape,
            is_training=is_training,
            data_format=FLAGS.data_format,
            output_rgb=False)
        anchor_encoder_fn = lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(
            glabels_, gbboxes_, all_anchors, all_num_anchors_depth,
            all_num_anchors_spatial)

        image, filename, shape, loc_targets, cls_targets, match_scores = dataset_common.slim_get_batch(
            FLAGS.num_classes,
            batch_size, ('train' if is_training else 'val'),
            os.path.join(FLAGS.data_dir, dataset_pattern),
            FLAGS.num_readers,
            FLAGS.num_preprocessing_threads,
            image_preprocessing_fn,
            anchor_encoder_fn,
            num_epochs=FLAGS.train_epochs,
            is_training=is_training)
        global global_anchor_info
        global_anchor_info = {
            'decode_fn':
            lambda pred: anchor_encoder_decoder.decode_all_anchors(
                pred, num_anchors_per_layer),
            'num_anchors_per_layer':
            num_anchors_per_layer,
            'all_num_anchors_depth':
            all_num_anchors_depth
        }

        return {
            'image': image,
            'filename': filename,
            'shape': shape,
            'loc_targets': loc_targets,
            'cls_targets': cls_targets,
            'match_scores': match_scores
        }, None
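Anchor bookkeeping for this rectangular 300x510 variant, assuming the same depth rule as before (1 scale x 1 ratio + 1 extra scale = 2 anchors per cell):

    layers = [(38, 64), (19, 32), (10, 16), (5, 8), (3, 6), (1, 4)]
    print(sum(h * w * 2 for h, w in layers))  # -> 6524 anchors in total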
Example #12
def main(_):
    with tf.Graph().as_default():
        out_shape = [FLAGS.train_image_size] * 2

        image_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
        shape_input = tf.placeholder(tf.int32, shape=(2, ))

        features = ssd_preprocessing.preprocess_for_eval(
            image_input,
            out_shape,
            data_format=FLAGS.data_format,
            output_rgb=False)
        features = tf.expand_dims(features, axis=0)

        anchor_creator = anchor_manipulator.AnchorCreator(
            out_shape,
            layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3),
                           (1, 1)],
            anchor_scales=[(0.1, ), (0.2, ), (0.375, ), (0.55, ), (0.725, ),
                           (0.9, )],
            extra_anchor_scales=[(0.1414, ), (0.2739, ), (0.4541, ),
                                 (0.6315, ), (0.8078, ), (0.9836, )],
            anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333),
                           (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                           (1., 2., .5), (1., 2., .5)],
            #anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., .5), (2., .5)],
            layer_steps=[8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
            allowed_borders=[1.0] * 6,
            positive_threshold=None,
            ignore_threshold=None,
            prior_scaling=[0.1, 0.1, 0.2, 0.2])

        decode_fn = lambda pred: anchor_encoder_decoder.ext_decode_all_anchors(
            pred, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)

        with tf.variable_scope(FLAGS.model_scope,
                               default_name=None,
                               values=[features],
                               reuse=tf.AUTO_REUSE):
            backbone = ssd_net.VGG16Backbone(FLAGS.data_format)
            feature_layers = backbone.forward(features, training=False)
            location_pred, cls_pred = ssd_net.multibox_head(
                feature_layers,
                FLAGS.num_classes,
                all_num_anchors_depth,
                data_format=FLAGS.data_format)
            if FLAGS.data_format == 'channels_first':
                cls_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred
                ]
                location_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred
                ]

            cls_pred = [
                tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred
            ]
            location_pred = [
                tf.reshape(pred, [-1, 4]) for pred in location_pred
            ]

            cls_pred = tf.concat(cls_pred, axis=0)
            location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = decode_fn(location_pred)
            bboxes_pred = tf.concat(bboxes_pred, axis=0)
            selected_bboxes, selected_scores = parse_by_class(
                cls_pred, bboxes_pred, FLAGS.num_classes,
                FLAGS.select_threshold, FLAGS.min_size, FLAGS.keep_topk,
                FLAGS.nms_topk, FLAGS.nms_threshold)

            labels_list = []
            scores_list = []
            bboxes_list = []
            for k, v in selected_scores.items():
                labels_list.append(tf.ones_like(v, tf.int32) * k)
                scores_list.append(v)
                bboxes_list.append(selected_bboxes[k])
            all_labels = tf.concat(labels_list, axis=0)
            all_scores = tf.concat(scores_list, axis=0)
            all_bboxes = tf.concat(bboxes_list, axis=0)

        saver = tf.train.Saver()
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)

            saver.restore(sess, get_checkpoint())

            for i in range(video_frame_cnt):
                ret, img_ori = vid.read()

                # height_ori, width_ori = img_ori.shape[:2]
                # img = cv2.resize(img_ori, tuple(args.new_size))
                img = cv2.cvtColor(img_ori, cv2.COLOR_BGR2RGB)
                np_image = np.asarray(img, np.float32)

                start_time = time.time()
                labels_, scores_, bboxes_ = sess.run(
                    [all_labels, all_scores, all_bboxes],
                    feed_dict={
                        image_input: np_image,
                        shape_input: np_image.shape[:-1]
                    })
                end_time = time.time()

                img_to_draw = draw_toolbox.bboxes_draw_on_img(np_image,
                                                              labels_,
                                                              scores_,
                                                              bboxes_,
                                                              thickness=2)
                cv2.putText(img_to_draw,
                            '{:.2f}ms'.format((end_time - start_time) * 1000),
                            (40, 40),
                            0,
                            fontScale=1,
                            color=(0, 255, 0),
                            thickness=2)

                imsave('./test_out.jpg', img_to_draw)

                new_img = cv2.imread('./test_out.jpg')
                cv2.imshow('image', new_img)

                videoWriter.write(new_img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

            vid.release()
            videoWriter.release()
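Example #12 leans on video-capture globals (vid, video_frame_cnt, videoWriter) defined outside the snippet; a plausible setup (paths and codec are placeholders):

    import cv2

    vid = cv2.VideoCapture('./demo/test.mp4')
    video_frame_cnt = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vid.get(cv2.CAP_PROP_FPS)
    size = (int(vid.get(cv2.CAP_PROP_FRAME_WIDTH)),
            int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT)))
    videoWriter = cv2.VideoWriter('./video_out.mp4',
                                  cv2.VideoWriter_fourcc(*'mp4v'), fps, size)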