Example #1
0
def ssd_model_fn(features, labels, mode, params):
    """Build the SSD graph (forward pass, losses, train op) for an Estimator.

    Args:
        features: Batch of input images; its first dimension is used as the
            batch size.
        labels: Dict of encoded ground truth. Reads 'loc_targets' (reshaped
            to rows of 4 regression targets) and 'cls_targets' (per-anchor
            class ids: > 0 is treated as positive and 0 as negative;
            negative ids are presumably "ignore" anchors -- TODO confirm
            against the anchor encoder).
        mode: A `tf.estimator.ModeKeys` value.
        params: Dict of hyper-parameters. Reads 'model_scope', 'data_format',
            'num_classes', 'negative_ratio', 'weight_decay' and, in TRAIN
            mode, 'learning_rate', 'lr_decay_factors', 'decay_boundaries',
            'end_learning_rate' and 'momentum'.

    Returns:
        A `tf.estimator.EstimatorSpec`. In PREDICT mode it only carries the
        predictions; otherwise it also carries the total loss, the train op
        (TRAIN mode only), the classification-accuracy metric and a scaffold
        whose init_fn comes from `get_init_fn()`.
    """
    loc_targets = labels['loc_targets']
    cls_targets = labels['cls_targets']

    # Anchor helpers are read from the module-level `global_anchor_info` dict.
    decode_fn = global_anchor_info['decode_fn']
    num_anchors_per_layer = global_anchor_info['num_anchors_per_layer']
    all_num_anchors_depth = global_anchor_info['all_num_anchors_depth']

    batch_size = tf.shape(features)[0]
    num_classes = params['num_classes']
    data_format = params['data_format']

    with tf.variable_scope(params['model_scope'],
                           default_name=None,
                           values=[features],
                           reuse=tf.AUTO_REUSE):
        backbone = ssd_net.VGG16Backbone(data_format)
        feature_layers = backbone.forward(
            features, training=(mode == tf.estimator.ModeKeys.TRAIN))
        location_pred, cls_pred = ssd_net.multibox_head(
            feature_layers,
            num_classes,
            all_num_anchors_depth,
            data_format=data_format)

        # Move channels last so the reshapes below group values per anchor.
        if data_format == 'channels_first':
            cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred]
            location_pred = [
                tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred
            ]

        # Flatten every feature layer to [batch, anchors_in_layer, ...] and
        # join all layers along the anchor axis.
        cls_pred = [
            tf.reshape(pred, [batch_size, -1, num_classes])
            for pred in cls_pred
        ]
        location_pred = [
            tf.reshape(pred, [batch_size, -1, 4]) for pred in location_pred
        ]
        cls_pred = tf.concat(cls_pred, axis=1)
        location_pred = tf.concat(location_pred, axis=1)

        # One row per (image, anchor) pair.
        cls_pred = tf.reshape(cls_pred, [-1, num_classes])
        location_pred = tf.reshape(location_pred, [-1, 4])

    with tf.device('/cpu:0'):
        with tf.control_dependencies([cls_pred, location_pred]):
            with tf.name_scope('post_forward'):
                # Decode the regression output image by image; decode_fn
                # yields one tensor per entry of num_anchors_per_layer.
                bboxes_pred = tf.map_fn(
                    decode_fn,
                    tf.reshape(location_pred, [batch_size, -1, 4]),
                    dtype=[tf.float32] * len(num_anchors_per_layer),
                    back_prop=False)
                bboxes_pred = [
                    tf.reshape(preds, [-1, 4]) for preds in bboxes_pred
                ]
                bboxes_pred = tf.concat(bboxes_pred, axis=0)

                flaten_cls_targets = tf.reshape(cls_targets, [-1])
                flaten_loc_targets = tf.reshape(loc_targets, [-1, 4])

                # Each positive example has exactly one (> 0) label.
                positive_mask = flaten_cls_targets > 0

                # Count only real positives per image. Counting every
                # non-zero label would also count negative label ids and
                # inflate the number of negatives selected below.
                batch_n_positives = tf.count_nonzero(cls_targets > 0, -1)

                batch_negtive_mask = tf.equal(cls_targets, 0)
                batch_n_negtives = tf.count_nonzero(batch_negtive_mask, -1)

                # Per image: negative_ratio negatives for each positive,
                # capped by the number of negatives actually available.
                batch_n_neg_select = tf.cast(
                    params['negative_ratio'] *
                    tf.cast(batch_n_positives, tf.float32), tf.int32)
                batch_n_neg_select = tf.minimum(
                    batch_n_neg_select, tf.cast(batch_n_negtives, tf.int32))

                # Hard negative mining for classification: rank negatives by
                # the negated background probability, so the negatives the
                # model is least sure about come first.
                predictions_for_bg = tf.nn.softmax(
                    tf.reshape(cls_pred,
                               [batch_size, -1, num_classes]))[:, :, 0]
                prob_for_negtives = tf.where(
                    batch_negtive_mask,
                    0. - predictions_for_bg,
                    # Non-negatives get the lowest possible score (-1).
                    0. - tf.ones_like(predictions_for_bg))
                # k equals the full width, so this is a descending sort.
                topk_prob_for_bg, _ = tf.nn.top_k(
                    prob_for_negtives, k=tf.shape(prob_for_negtives)[1])
                # Per-image threshold: the score at rank
                # (batch_n_neg_select - 1).
                # NOTE(review): for an image without positives this index is
                # -1 -- confirm the input pipeline rules that case out.
                score_at_k = tf.gather_nd(
                    topk_prob_for_bg,
                    tf.stack([tf.range(batch_size), batch_n_neg_select - 1],
                             axis=-1))

                selected_neg_mask = prob_for_negtives >= tf.expand_dims(
                    score_at_k, axis=-1)

                # Keep the selected negatives plus all positives.
                final_mask = tf.stop_gradient(
                    tf.logical_or(
                        tf.reshape(
                            tf.logical_and(batch_negtive_mask,
                                           selected_neg_mask), [-1]),
                        positive_mask))

                # Classification uses positives + mined negatives, while
                # localization uses positives only.
                cls_pred = tf.boolean_mask(cls_pred, final_mask)
                location_pred = tf.boolean_mask(
                    location_pred, tf.stop_gradient(positive_mask))
                flaten_cls_targets = tf.boolean_mask(
                    tf.clip_by_value(flaten_cls_targets, 0, num_classes),
                    final_mask)
                flaten_loc_targets = tf.stop_gradient(
                    tf.boolean_mask(flaten_loc_targets, positive_mask))

                predictions = {
                    'classes':
                    tf.argmax(cls_pred, axis=-1),
                    'probabilities':
                    tf.reduce_max(tf.nn.softmax(cls_pred,
                                                name='softmax_tensor'),
                                  axis=-1),
                    'loc_predict':
                    bboxes_pred
                }

                cls_accuracy = tf.metrics.accuracy(flaten_cls_targets,
                                                   predictions['classes'])
                metrics = {'cls_accuracy': cls_accuracy}

                # Create a tensor named cls_accuracy for logging purposes.
                tf.identity(cls_accuracy[1], name='cls_accuracy')
                tf.summary.scalar('cls_accuracy', cls_accuracy[1])

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Softmax cross entropy over the selected examples, scaled by
    # (negative_ratio + 1).
    cross_entropy = tf.losses.sparse_softmax_cross_entropy(
        labels=flaten_cls_targets,
        logits=cls_pred) * (params['negative_ratio'] + 1.)
    # Create a tensor named cross_entropy_loss for logging purposes.
    tf.identity(cross_entropy, name='cross_entropy_loss')
    tf.summary.scalar('cross_entropy_loss', cross_entropy)

    # Localization loss: summed over the 4 coordinates, averaged over the
    # positive anchors.
    loc_loss = modified_smooth_l1(location_pred, flaten_loc_targets, sigma=1.)
    loc_loss = tf.reduce_mean(tf.reduce_sum(loc_loss, axis=-1),
                              name='location_loss')
    tf.summary.scalar('location_loss', loc_loss)
    tf.losses.add_loss(loc_loss)

    # Add weight decay to the loss. We exclude the batch norm variables because
    # doing so leads to a small improvement in accuracy. Variables whose name
    # contains 'conv4_3_scale' are decayed with a 0.1 factor.
    l2_loss_vars = []
    for trainable_var in tf.trainable_variables():
        if '_bn' in trainable_var.name:
            continue
        var_l2 = tf.nn.l2_loss(trainable_var)
        if 'conv4_3_scale' in trainable_var.name:
            var_l2 = var_l2 * 0.1
        l2_loss_vars.append(var_l2)
    total_loss = tf.add(cross_entropy + loc_loss,
                        tf.multiply(params['weight_decay'],
                                    tf.add_n(l2_loss_vars),
                                    name='l2_loss'),
                        name='total_loss')

    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_or_create_global_step()

        # Piecewise-constant schedule, floored at 'end_learning_rate'.
        lr_values = [
            params['learning_rate'] * decay
            for decay in params['lr_decay_factors']
        ]
        learning_rate = tf.train.piecewise_constant(
            tf.cast(global_step, tf.int32),
            [int(boundary) for boundary in params['decay_boundaries']],
            lr_values)
        truncated_learning_rate = tf.maximum(learning_rate,
                                             tf.constant(
                                                 params['end_learning_rate'],
                                                 dtype=learning_rate.dtype),
                                             name='learning_rate')
        # Create a tensor named learning_rate for logging purposes.
        tf.summary.scalar('learning_rate', truncated_learning_rate)

        optimizer = tf.train.MomentumOptimizer(
            learning_rate=truncated_learning_rate, momentum=params['momentum'])
        optimizer = tf.contrib.estimator.TowerOptimizer(optimizer)

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(total_loss, global_step)
    else:
        train_op = None

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=total_loss,
        train_op=train_op,
        eval_metric_ops=metrics,
        scaffold=tf.train.Scaffold(init_fn=get_init_fn()))
Example #2
0
def main(_):
    """Run SSD detection on './demo/test.jpg' and save the annotated image.

    Builds the inference graph (preprocessing, VGG16 backbone, multibox head,
    box decoding and per-class selection via `parse_by_class`), restores the
    weights from the checkpoint returned by `get_checkpoint()`, runs the demo
    image through it and writes the result to './demo/test_out.jpg'.

    Args:
        _: Unused positional argument.
    """
    with tf.Graph().as_default():
        target_size = [FLAGS.train_image_size] * 2

        image_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
        shape_input = tf.placeholder(tf.int32, shape=(2, ))

        # Preprocess a single image and add the batch dimension.
        net_input = tf.expand_dims(
            ssd_preprocessing.preprocess_for_eval(
                image_input,
                target_size,
                data_format=FLAGS.data_format,
                output_rgb=False),
            axis=0)

        # Anchor layout: one entry per feature layer (six layers).
        layers_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
        anchor_scales = [(0.1, ), (0.2, ), (0.375, ), (0.55, ), (0.725, ),
                         (0.9, )]
        extra_anchor_scales = [(0.1414, ), (0.2739, ), (0.4541, ), (0.6315, ),
                               (0.8078, ), (0.9836, )]
        anchor_ratios = [(1., 2., .5), (1., 2., 3., .5, 0.3333),
                         (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                         (1., 2., .5), (1., 2., .5)]
        layer_steps = [8, 16, 32, 64, 100, 300]

        anchor_creator = anchor_manipulator.AnchorCreator(
            target_size,
            layers_shapes=layers_shapes,
            anchor_scales=anchor_scales,
            extra_anchor_scales=extra_anchor_scales,
            anchor_ratios=anchor_ratios,
            layer_steps=layer_steps)
        (all_anchors, all_num_anchors_depth,
         all_num_anchors_spatial) = anchor_creator.get_all_anchors()

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
            allowed_borders=[1.0] * 6,
            positive_threshold=None,
            ignore_threshold=None,
            prior_scaling=[0.1, 0.1, 0.2, 0.2])

        def decode_fn(pred):
            """Decode location predictions against all anchors."""
            return anchor_encoder_decoder.ext_decode_all_anchors(
                pred, all_anchors, all_num_anchors_depth,
                all_num_anchors_spatial)

        with tf.variable_scope(FLAGS.model_scope,
                               default_name=None,
                               values=[net_input],
                               reuse=tf.AUTO_REUSE):
            backbone = ssd_net.VGG16Backbone(FLAGS.data_format)
            feature_layers = backbone.forward(net_input, training=False)
            location_pred, cls_pred = ssd_net.multibox_head(
                feature_layers,
                FLAGS.num_classes,
                all_num_anchors_depth,
                data_format=FLAGS.data_format)

            # Move channels last so the reshapes below group per anchor.
            if FLAGS.data_format == 'channels_first':
                cls_pred = [
                    tf.transpose(layer_out, [0, 2, 3, 1])
                    for layer_out in cls_pred
                ]
                location_pred = [
                    tf.transpose(layer_out, [0, 2, 3, 1])
                    for layer_out in location_pred
                ]

            # Flatten every layer to one row per anchor and stack the layers.
            cls_pred = [
                tf.reshape(layer_out, [-1, FLAGS.num_classes])
                for layer_out in cls_pred
            ]
            location_pred = [
                tf.reshape(layer_out, [-1, 4]) for layer_out in location_pred
            ]

            cls_pred = tf.concat(cls_pred, axis=0)
            location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = tf.concat(decode_fn(location_pred), axis=0)
            selected_bboxes, selected_scores = parse_by_class(
                cls_pred, bboxes_pred, FLAGS.num_classes,
                FLAGS.select_threshold, FLAGS.min_size, FLAGS.keep_topk,
                FLAGS.nms_topk, FLAGS.nms_threshold)

            # Merge the per-class results into flat label/score/box tensors.
            labels_list, scores_list, bboxes_list = [], [], []
            for class_id, class_scores in selected_scores.items():
                labels_list.append(
                    tf.ones_like(class_scores, tf.int32) * class_id)
                scores_list.append(class_scores)
                bboxes_list.append(selected_bboxes[class_id])
            all_labels = tf.concat(labels_list, axis=0)
            all_scores = tf.concat(scores_list, axis=0)
            all_bboxes = tf.concat(bboxes_list, axis=0)

        saver = tf.train.Saver()
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            saver.restore(sess, get_checkpoint())

            np_image = imread('./demo/test.jpg')
            feed = {
                image_input: np_image,
                shape_input: np_image.shape[:-1]
            }
            labels_, scores_, bboxes_ = sess.run(
                [all_labels, all_scores, all_bboxes], feed_dict=feed)

            img_to_draw = draw_toolbox.bboxes_draw_on_img(np_image,
                                                          labels_,
                                                          scores_,
                                                          bboxes_,
                                                          thickness=2)
            imsave('./demo/test_out.jpg', img_to_draw)
Example #3
0
def ssd_model_fn(features, labels, mode, params):
    """Build the SSD graph (forward pass, losses, train op) for an Estimator.

    Args:
        features: Batch of input images; its first dimension is used as the
            batch size.
        labels: Dict of encoded ground truth. Reads 'loc_targets' (reshaped
            to rows of 4 regression targets) and 'cls_targets' (per-anchor
            class ids: > 0 is treated as positive and 0 as negative;
            negative ids are presumably "ignore" anchors -- TODO confirm
            against the anchor encoder).
        mode: A `tf.estimator.ModeKeys` value.
        params: Dict of hyper-parameters. Reads 'model_scope', 'data_format',
            'num_classes', 'negative_ratio', 'weight_decay' and, in TRAIN
            mode, 'learning_rate', 'lr_decay_factors', 'decay_boundaries',
            'end_learning_rate' and 'momentum'.

    Returns:
        A `tf.estimator.EstimatorSpec`. In PREDICT mode it only carries the
        predictions; otherwise it also carries the total loss, the train op
        (TRAIN mode only), the classification-accuracy metric and a scaffold
        whose init_fn comes from `get_init_fn()`.
    """
    loc_targets = labels['loc_targets']
    cls_targets = labels['cls_targets']

    # Anchor helpers are read from the module-level `global_anchor_info` dict.
    decode_fn = global_anchor_info['decode_fn']
    all_num_anchors_depth = global_anchor_info['all_num_anchors_depth']

    batch_size = tf.shape(features)[0]
    num_classes = params['num_classes']
    data_format = params['data_format']

    with tf.variable_scope(params['model_scope'],
                           default_name=None,
                           values=[features],
                           reuse=tf.AUTO_REUSE):
        backbone = ssd_net.VGG16Backbone(data_format)
        feature_layers = backbone.forward(
            features, training=(mode == tf.estimator.ModeKeys.TRAIN))

        # One location tensor and one class tensor per feature layer.
        location_pred, cls_pred = ssd_net.multibox_head(
            feature_layers,
            num_classes,
            all_num_anchors_depth,
            data_format=data_format)

        # Move channels last so the reshapes below group values per anchor.
        if data_format == 'channels_first':
            cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred]
            location_pred = [
                tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred
            ]

        # Per layer: cls_pred -> [batch_size, anchors_in_layer, num_classes],
        #            location_pred -> [batch_size, anchors_in_layer, 4].
        cls_pred = [
            tf.reshape(pred, [batch_size, -1, num_classes])
            for pred in cls_pred
        ]
        location_pred = [
            tf.reshape(pred, [batch_size, -1, 4]) for pred in location_pred
        ]

        # Join all layers along the anchor axis.
        cls_pred = tf.concat(cls_pred, axis=1)
        location_pred = tf.concat(location_pred, axis=1)

        # cls_pred: [batch_size * num_anchors, num_classes]
        # location_pred: [batch_size * num_anchors, 4]
        # (num_anchors is presumably 8732 for the usual SSD300 anchor layout
        # -- TODO confirm against the anchor configuration.)
        cls_pred = tf.reshape(cls_pred, [-1, num_classes])
        location_pred = tf.reshape(location_pred, [-1, 4])

    with tf.device('/cpu:0'):
        with tf.control_dependencies([cls_pred, location_pred]):
            with tf.name_scope('post_forward'):
                # location_pred holds the predicted offsets for every anchor;
                # decode_fn presumably combines them with the anchor boxes to
                # obtain the predicted boxes -- verify against decode_fn.
                bboxes_pred = decode_fn(
                    tf.reshape(location_pred, [batch_size, -1, 4]))
                # bboxes_pred: [batch_size * num_anchors, 4]; the 4 values are
                # presumably [ymin, xmin, ymax, xmax] -- TODO confirm.
                bboxes_pred = tf.reshape(bboxes_pred, [-1, 4])

                # cls_targets is indexed as [batch_size, num_anchors].
                flaten_cls_targets = tf.reshape(cls_targets, [-1])
                flaten_loc_targets = tf.reshape(loc_targets, [-1, 4])

                # Each positive example has exactly one (> 0) label.
                positive_mask = flaten_cls_targets > 0

                # batch_n_positives: [batch_size], the number of positive
                # anchors in each image.
                batch_n_positives = tf.count_nonzero(cls_targets > 0, -1)

                # batch_negtive_mask: [batch_size, num_anchors], True where
                # the anchor is a negative (label 0).
                batch_negtive_mask = tf.equal(cls_targets, 0)
                # batch_n_negtives: [batch_size], the number of negative
                # anchors in each image.
                batch_n_negtives = tf.count_nonzero(batch_negtive_mask, -1)

                # batch_n_neg_select: [batch_size], the number of negatives
                # to keep per image: negative_ratio negatives for each
                # positive, capped by the negatives actually available.
                batch_n_neg_select = tf.cast(
                    params['negative_ratio'] *
                    tf.cast(batch_n_positives, tf.float32), tf.int32)
                batch_n_neg_select = tf.minimum(
                    batch_n_neg_select, tf.cast(batch_n_negtives, tf.int32))

                # Hard negative mining for classification.
                # predictions_for_bg: [batch_size, num_anchors], the softmax
                # probability of class 0 (background) for every anchor.
                predictions_for_bg = tf.nn.softmax(
                    tf.reshape(cls_pred,
                               [batch_size, -1, num_classes]))[:, :, 0]
                # prob_for_negtives: [batch_size, num_anchors]. Negatives get
                # the negated background probability (negated so that a
                # descending sort puts the worst-classified negatives first);
                # every other anchor gets -1, the lowest possible score.
                prob_for_negtives = tf.where(
                    batch_negtive_mask,
                    0. - predictions_for_bg,
                    # ignore all the positives
                    0. - tf.ones_like(predictions_for_bg))

                # topk_prob_for_bg: [batch_size, num_anchors], each row
                # sorted in descending order (k equals the full row width).
                topk_prob_for_bg, _ = tf.nn.top_k(
                    prob_for_negtives, k=tf.shape(prob_for_negtives)[1])

                # score_at_k: [batch_size], the score at rank
                # (batch_n_neg_select - 1) of each image, i.e. the lowest
                # score among the negatives that are kept.
                # NOTE(review): for an image without positives this index is
                # -1 -- confirm the input pipeline rules that case out.
                score_at_k = tf.gather_nd(
                    topk_prob_for_bg,
                    tf.stack([tf.range(batch_size), batch_n_neg_select - 1],
                             axis=-1))

                # selected_neg_mask: [batch_size, num_anchors], True where
                # the score reaches the per-image threshold.
                selected_neg_mask = prob_for_negtives >= tf.expand_dims(
                    score_at_k, axis=-1)

                # final_mask: flat [batch_size * num_anchors], True for the
                # selected negatives and for all positives.
                final_mask = tf.stop_gradient(
                    tf.logical_or(
                        tf.reshape(
                            tf.logical_and(batch_negtive_mask,
                                           selected_neg_mask), [-1]),
                        positive_mask))

                # With m positives and n selected negatives in the batch:
                # cls_pred: [m + n, num_classes], location_pred: [m, 4].
                cls_pred = tf.boolean_mask(cls_pred, final_mask)
                location_pred = tf.boolean_mask(
                    location_pred, tf.stop_gradient(positive_mask))
                flaten_cls_targets = tf.boolean_mask(
                    tf.clip_by_value(flaten_cls_targets, 0, num_classes),
                    final_mask)
                flaten_loc_targets = tf.stop_gradient(
                    tf.boolean_mask(flaten_loc_targets, positive_mask))

                predictions = {
                    'classes':
                    tf.argmax(cls_pred, axis=-1),
                    'probabilities':
                    tf.reduce_max(tf.nn.softmax(cls_pred,
                                                name='softmax_tensor'),
                                  axis=-1),
                    'loc_predict':
                    bboxes_pred
                }

                cls_accuracy = tf.metrics.accuracy(flaten_cls_targets,
                                                   predictions['classes'])
                metrics = {'cls_accuracy': cls_accuracy}

                # Create a tensor named cls_accuracy for logging purposes.
                tf.identity(cls_accuracy[1], name='cls_accuracy')
                tf.summary.scalar('cls_accuracy', cls_accuracy[1])

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Softmax cross entropy over the selected examples, scaled by
    # (negative_ratio + 1).
    cross_entropy = tf.losses.sparse_softmax_cross_entropy(
        labels=flaten_cls_targets,
        logits=cls_pred) * (params['negative_ratio'] + 1.)
    # Create a tensor named cross_entropy_loss for logging purposes.
    tf.identity(cross_entropy, name='cross_entropy_loss')
    tf.summary.scalar('cross_entropy_loss', cross_entropy)

    # Localization loss: summed over the 4 coordinates, averaged over the
    # positive anchors.
    loc_loss = modified_smooth_l1(location_pred, flaten_loc_targets, sigma=1.)
    loc_loss = tf.reduce_mean(tf.reduce_sum(loc_loss, axis=-1),
                              name='location_loss')
    tf.summary.scalar('location_loss', loc_loss)
    tf.losses.add_loss(loc_loss)

    # Add weight decay to the loss. We exclude the batch norm variables because
    # doing so leads to a small improvement in accuracy. Variables whose name
    # contains 'conv4_3_scale' are decayed with a 0.1 factor.
    l2_loss_vars = []
    for trainable_var in tf.trainable_variables():
        if '_bn' in trainable_var.name:
            continue
        var_l2 = tf.nn.l2_loss(trainable_var)
        if 'conv4_3_scale' in trainable_var.name:
            var_l2 = var_l2 * 0.1
        l2_loss_vars.append(var_l2)
    total_loss = tf.add(cross_entropy + loc_loss,
                        tf.multiply(params['weight_decay'],
                                    tf.add_n(l2_loss_vars),
                                    name='l2_loss'),
                        name='total_loss')

    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_or_create_global_step()

        # Piecewise-constant schedule, floored at 'end_learning_rate'.
        lr_values = [
            params['learning_rate'] * decay
            for decay in params['lr_decay_factors']
        ]
        learning_rate = tf.train.piecewise_constant(
            tf.cast(global_step, tf.int32),
            [int(boundary) for boundary in params['decay_boundaries']],
            lr_values)
        truncated_learning_rate = tf.maximum(learning_rate,
                                             tf.constant(
                                                 params['end_learning_rate'],
                                                 dtype=learning_rate.dtype),
                                             name='learning_rate')
        # Create a tensor named learning_rate for logging purposes.
        tf.summary.scalar('learning_rate', truncated_learning_rate)

        optimizer = tf.train.MomentumOptimizer(
            learning_rate=truncated_learning_rate, momentum=params['momentum'])
        optimizer = tf.contrib.estimator.TowerOptimizer(optimizer)

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(total_loss, global_step)
    else:
        train_op = None

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=total_loss,
        train_op=train_op,
        eval_metric_ops=metrics,
        scaffold=tf.train.Scaffold(init_fn=get_init_fn()))
Example #4
0
def ssd_model_fn(features, labels, mode, params):
    """Build the SSD prediction graph (PREDICT mode only) for an Estimator.

    Besides the per-class detections, the graph also computes the
    classification/localization losses and the classification accuracy so
    they can be written as summaries by a `SummarySaverHook` while
    predicting.

    Args:
        features: Dict with the keys 'image', 'filename', 'shape',
            'loc_targets', 'cls_targets' and 'match_scores'. In
            'cls_targets', ids > 0 are treated as positive and 0 as
            negative; negative ids are presumably "ignore" anchors -- TODO
            confirm against the anchor encoder.
        labels: Unused.
        mode: A `tf.estimator.ModeKeys` value; only PREDICT is supported.
        params: Dict of hyper-parameters. Reads 'model_scope', 'data_format',
            'num_classes', 'select_threshold', 'min_size', 'keep_topk',
            'nms_topk', 'nms_threshold', 'negative_ratio',
            'save_summary_steps' and 'summary_dir'.

    Returns:
        A `tf.estimator.EstimatorSpec` whose predictions hold 'filename',
        'shape' and, for every class id from 1 to num_classes - 1,
        'scores_<id>' and 'bboxes_<id>' with a leading dimension of 1.

    Raises:
        ValueError: If `mode` is not PREDICT.
    """
    # Fail fast, before any graph is built for an unsupported mode.
    if mode != tf.estimator.ModeKeys.PREDICT:
        raise ValueError('This script only support "PREDICT" mode!')

    filename = features['filename']
    shape = features['shape']
    loc_targets = features['loc_targets']
    cls_targets = features['cls_targets']
    images = features['image']

    # Anchor helpers are read from the module-level `global_anchor_info` dict.
    decode_fn = global_anchor_info['decode_fn']
    all_num_anchors_depth = global_anchor_info['all_num_anchors_depth']

    batch_size = tf.shape(images)[0]
    num_classes = params['num_classes']
    data_format = params['data_format']

    with tf.variable_scope(params['model_scope'],
                           default_name=None,
                           values=[images],
                           reuse=tf.AUTO_REUSE):
        backbone = ssd_net.VGG16Backbone(data_format)
        feature_layers = backbone.forward(
            images, training=(mode == tf.estimator.ModeKeys.TRAIN))
        location_pred, cls_pred = ssd_net.multibox_head(
            feature_layers,
            num_classes,
            all_num_anchors_depth,
            data_format=data_format)

        # Move channels last so the reshapes below group values per anchor.
        if data_format == 'channels_first':
            cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred]
            location_pred = [
                tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred
            ]

        # Flatten every feature layer to [batch, anchors_in_layer, ...] and
        # join all layers along the anchor axis.
        cls_pred = [
            tf.reshape(pred, [batch_size, -1, num_classes])
            for pred in cls_pred
        ]
        location_pred = [
            tf.reshape(pred, [batch_size, -1, 4]) for pred in location_pred
        ]
        cls_pred = tf.concat(cls_pred, axis=1)
        location_pred = tf.concat(location_pred, axis=1)

        # One row per (image, anchor) pair.
        cls_pred = tf.reshape(cls_pred, [-1, num_classes])
        location_pred = tf.reshape(location_pred, [-1, 4])

    with tf.device('/cpu:0'):
        # NOTE(review): the flattened predictions are decoded in one call,
        # which presumably assumes a batch size of 1 -- confirm with the
        # input pipeline.
        bboxes_pred = decode_fn(location_pred)
        bboxes_pred = tf.concat(bboxes_pred, axis=0)
        selected_bboxes, selected_scores = parse_by_class(
            cls_pred, bboxes_pred, num_classes, params['select_threshold'],
            params['min_size'], params['keep_topk'], params['nms_topk'],
            params['nms_threshold'])

    # Class 0 is skipped; every other class gets its own score/box entry.
    predictions = {'filename': filename, 'shape': shape}
    for class_ind in range(1, num_classes):
        predictions['scores_{}'.format(class_ind)] = tf.expand_dims(
            selected_scores[class_ind], axis=0)
        predictions['bboxes_{}'.format(class_ind)] = tf.expand_dims(
            selected_bboxes[class_ind], axis=0)

    flaten_cls_targets = tf.reshape(cls_targets, [-1])
    flaten_loc_targets = tf.reshape(loc_targets, [-1, 4])

    # Each positive example has exactly one (> 0) label.
    positive_mask = flaten_cls_targets > 0

    # Count only real positives per image. Counting every non-zero label
    # would also count negative label ids and inflate the number of
    # negatives selected below.
    batch_n_positives = tf.count_nonzero(cls_targets > 0, -1)

    batch_negtive_mask = tf.equal(cls_targets, 0)
    batch_n_negtives = tf.count_nonzero(batch_negtive_mask, -1)

    # Per image: negative_ratio negatives for each positive, capped by the
    # number of negatives actually available.
    batch_n_neg_select = tf.cast(
        params['negative_ratio'] * tf.cast(batch_n_positives, tf.float32),
        tf.int32)
    batch_n_neg_select = tf.minimum(batch_n_neg_select,
                                    tf.cast(batch_n_negtives, tf.int32))

    # Hard negative mining for classification: rank negatives by the negated
    # background probability, so the negatives the model is least sure about
    # come first.
    predictions_for_bg = tf.nn.softmax(
        tf.reshape(cls_pred, [batch_size, -1, num_classes]))[:, :, 0]
    prob_for_negtives = tf.where(
        batch_negtive_mask,
        0. - predictions_for_bg,
        # Non-negatives get the lowest possible score (-1).
        0. - tf.ones_like(predictions_for_bg))
    # k equals the full width, so this is a descending sort.
    topk_prob_for_bg, _ = tf.nn.top_k(prob_for_negtives,
                                      k=tf.shape(prob_for_negtives)[1])
    # Per-image threshold: the score at rank (batch_n_neg_select - 1).
    # NOTE(review): for an image without positives this index is -1 --
    # confirm the input pipeline rules that case out.
    score_at_k = tf.gather_nd(
        topk_prob_for_bg,
        tf.stack([tf.range(batch_size), batch_n_neg_select - 1], axis=-1))

    selected_neg_mask = prob_for_negtives >= tf.expand_dims(score_at_k,
                                                            axis=-1)

    # Keep the selected negatives plus all positives.
    final_mask = tf.stop_gradient(
        tf.logical_or(
            tf.reshape(tf.logical_and(batch_negtive_mask, selected_neg_mask),
                       [-1]), positive_mask))

    # Classification uses positives + mined negatives, while localization
    # uses positives only.
    cls_pred = tf.boolean_mask(cls_pred, final_mask)
    location_pred = tf.boolean_mask(location_pred,
                                    tf.stop_gradient(positive_mask))
    flaten_cls_targets = tf.boolean_mask(
        tf.clip_by_value(flaten_cls_targets, 0, num_classes), final_mask)
    flaten_loc_targets = tf.stop_gradient(
        tf.boolean_mask(flaten_loc_targets, positive_mask))

    # Softmax cross entropy over the selected examples, scaled by
    # (negative_ratio + 1).
    cross_entropy = tf.losses.sparse_softmax_cross_entropy(
        labels=flaten_cls_targets,
        logits=cls_pred) * (params['negative_ratio'] + 1.)
    # Create a tensor named cross_entropy_loss for logging purposes.
    tf.identity(cross_entropy, name='cross_entropy_loss')
    tf.summary.scalar('cross_entropy_loss', cross_entropy)

    # Localization loss: summed over the 4 coordinates, averaged over the
    # positive anchors.
    loc_loss = modified_smooth_l1(location_pred, flaten_loc_targets, sigma=1.)
    loc_loss = tf.reduce_mean(tf.reduce_sum(loc_loss, axis=-1),
                              name='location_loss')
    tf.summary.scalar('location_loss', loc_loss)
    tf.losses.add_loss(loc_loss)

    # Classification + localization loss only (no weight decay is added
    # here), exposed as a tensor named total_loss.
    tf.add(cross_entropy, loc_loss, name='total_loss')

    cls_accuracy = tf.metrics.accuracy(flaten_cls_targets,
                                       tf.argmax(cls_pred, axis=-1))

    # Create a tensor named cls_accuracy for logging purposes.
    tf.identity(cls_accuracy[1], name='cls_accuracy')
    tf.summary.scalar('cls_accuracy', cls_accuracy[1])

    # Write all summaries defined above while predicting.
    summary_hook = tf.train.SummarySaverHook(
        save_steps=params['save_summary_steps'],
        output_dir=params['summary_dir'],
        summary_op=tf.summary.merge_all())
    return tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=predictions,
                                      prediction_hooks=[summary_hook],
                                      loss=None,
                                      train_op=None)