def ssd_model_fn(features, labels, mode, params):
    """model_fn for SSD to be used with our Estimator."""
    shape = labels['shape']
    loc_targets = labels['loc_targets']
    cls_targets = labels['cls_targets']
    match_scores = labels['match_scores']

    global global_anchor_info
    decode_fn = global_anchor_info['decode_fn']
    num_anchors_per_layer = global_anchor_info['num_anchors_per_layer']
    all_num_anchors_depth = global_anchor_info['all_num_anchors_depth']

    # bboxes_pred = decode_fn(loc_targets[0])
    # bboxes_pred = [tf.reshape(preds, [-1, 4]) for preds in bboxes_pred]
    # bboxes_pred = tf.concat(bboxes_pred, axis=0)
    # save_image_op = tf.py_func(save_image_with_bbox,
    #                            [ssd_preprocessing.unwhiten_image(features[0]),
    #                             tf.clip_by_value(cls_targets[0], 0, tf.int64.max),
    #                             match_scores[0],
    #                             bboxes_pred],
    #                            tf.int64, stateful=True)
    # with tf.control_dependencies([save_image_op]):

    #print(all_num_anchors_depth)
    with tf.variable_scope(params['model_scope'],
                           default_name=None,
                           values=[features],
                           reuse=tf.AUTO_REUSE):
        backbone = ssd_net.VGG16Backbone(params['data_format'])
        feature_layers = backbone.forward(
            features, training=(mode == tf.estimator.ModeKeys.TRAIN))
        #print(feature_layers)
        location_pred, cls_pred = ssd_net.multibox_head(
            feature_layers,
            params['num_classes'],
            all_num_anchors_depth,
            data_format=params['data_format'])
        if params['data_format'] == 'channels_first':
            cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred]
            location_pred = [
                tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred
            ]

        cls_pred = [
            tf.reshape(pred,
                       [tf.shape(features)[0], -1, params['num_classes']])
            for pred in cls_pred
        ]
        location_pred = [
            tf.reshape(pred, [tf.shape(features)[0], -1, 4])
            for pred in location_pred
        ]

        cls_pred = tf.concat(cls_pred, axis=1)
        location_pred = tf.concat(location_pred, axis=1)

        cls_pred = tf.reshape(cls_pred, [-1, params['num_classes']])
        location_pred = tf.reshape(location_pred, [-1, 4])

    with tf.device('/cpu:0'):
        with tf.control_dependencies([cls_pred, location_pred]):
            with tf.name_scope('post_forward'):
                #bboxes_pred = decode_fn(location_pred)
                bboxes_pred = tf.map_fn(
                    lambda _preds: decode_fn(_preds),
                    tf.reshape(location_pred, [tf.shape(features)[0], -1, 4]),
                    dtype=[tf.float32] * len(num_anchors_per_layer),
                    back_prop=False)
                #cls_targets = tf.Print(cls_targets, [tf.shape(bboxes_pred[0]),tf.shape(bboxes_pred[1]),tf.shape(bboxes_pred[2]),tf.shape(bboxes_pred[3])])
                bboxes_pred = [
                    tf.reshape(preds, [-1, 4]) for preds in bboxes_pred
                ]
                bboxes_pred = tf.concat(bboxes_pred, axis=0)

                flaten_cls_targets = tf.reshape(cls_targets, [-1])
                flaten_match_scores = tf.reshape(match_scores, [-1])
                flaten_loc_targets = tf.reshape(loc_targets, [-1, 4])

                # each positive example has one label
                positive_mask = flaten_cls_targets > 0
                n_positives = tf.count_nonzero(positive_mask)

                batch_n_positives = tf.count_nonzero(cls_targets, -1)

                batch_negtive_mask = tf.equal(
                    cls_targets, 0
                )  #tf.logical_and(tf.equal(cls_targets, 0), match_scores > 0.)
                batch_n_negtives = tf.count_nonzero(batch_negtive_mask, -1)

                batch_n_neg_select = tf.cast(
                    params['negative_ratio'] *
                    tf.cast(batch_n_positives, tf.float32), tf.int32)
                batch_n_neg_select = tf.minimum(
                    batch_n_neg_select, tf.cast(batch_n_negtives, tf.int32))

                # hard negative mining for classification
                predictions_for_bg = tf.nn.softmax(
                    tf.reshape(
                        cls_pred,
                        [tf.shape(features)[0], -1,
                         params['num_classes']]))[:, :, 0]
                prob_for_negtives = tf.where(
                    batch_negtive_mask,
                    0. - predictions_for_bg,
                    # ignore all the positives
                    0. - tf.ones_like(predictions_for_bg))
                topk_prob_for_bg, _ = tf.nn.top_k(
                    prob_for_negtives, k=tf.shape(prob_for_negtives)[1])
                score_at_k = tf.gather_nd(
                    topk_prob_for_bg,
                    tf.stack(
                        [tf.range(tf.shape(features)[0]),
                         batch_n_neg_select - 1],
                        axis=-1))

                selected_neg_mask = prob_for_negtives >= tf.expand_dims(
                    score_at_k, axis=-1)

                # include both selected negatives and all positive examples
                final_mask = tf.stop_gradient(
                    tf.logical_or(
                        tf.reshape(
                            tf.logical_and(batch_negtive_mask,
                                           selected_neg_mask), [-1]),
                        positive_mask))
                total_examples = tf.count_nonzero(final_mask)

                cls_pred = tf.boolean_mask(cls_pred, final_mask)
                location_pred = tf.boolean_mask(
                    location_pred, tf.stop_gradient(positive_mask))
                flaten_cls_targets = tf.boolean_mask(
                    tf.clip_by_value(flaten_cls_targets, 0,
                                     params['num_classes']), final_mask)
                flaten_loc_targets = tf.stop_gradient(
                    tf.boolean_mask(flaten_loc_targets, positive_mask))

                predictions = {
                    'classes': tf.argmax(cls_pred, axis=-1),
                    'probabilities': tf.reduce_max(
                        tf.nn.softmax(cls_pred, name='softmax_tensor'),
                        axis=-1),
                    'loc_predict': bboxes_pred
                }

                cls_accuracy = tf.metrics.accuracy(flaten_cls_targets,
                                                   predictions['classes'])
                metrics = {'cls_accuracy': cls_accuracy}

    # Create a tensor named train_accuracy for logging purposes.
    tf.identity(cls_accuracy[1], name='cls_accuracy')
    tf.summary.scalar('cls_accuracy', cls_accuracy[1])

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    #cross_entropy = tf.cond(n_positives > 0, lambda: tf.losses.sparse_softmax_cross_entropy(labels=flaten_cls_targets, logits=cls_pred), lambda: 0.)# * (params['negative_ratio'] + 1.)
    #flaten_cls_targets=tf.Print(flaten_cls_targets, [flaten_loc_targets],summarize=50000)
    cross_entropy = tf.losses.sparse_softmax_cross_entropy(
        labels=flaten_cls_targets,
        logits=cls_pred) * (params['negative_ratio'] + 1.)
    # Create a tensor named cross_entropy for logging purposes.
    tf.identity(cross_entropy, name='cross_entropy_loss')
    tf.summary.scalar('cross_entropy_loss', cross_entropy)

    #loc_loss = tf.cond(n_positives > 0, lambda: modified_smooth_l1(location_pred, tf.stop_gradient(flaten_loc_targets), sigma=1.), lambda: tf.zeros_like(location_pred))
    loc_loss = modified_smooth_l1(location_pred, flaten_loc_targets, sigma=1.)
    #loc_loss = modified_smooth_l1(location_pred, tf.stop_gradient(gtargets))
    loc_loss = tf.reduce_mean(tf.reduce_sum(loc_loss, axis=-1),
                              name='location_loss')
    tf.summary.scalar('location_loss', loc_loss)
    tf.losses.add_loss(loc_loss)

    l2_loss_vars = []
    for trainable_var in tf.trainable_variables():
        if '_bn' not in trainable_var.name:
            if 'conv4_3_scale' not in trainable_var.name:
                l2_loss_vars.append(tf.nn.l2_loss(trainable_var))
            else:
                l2_loss_vars.append(tf.nn.l2_loss(trainable_var) * 0.1)
    # Add weight decay to the loss. We exclude the batch norm variables because
    # doing so leads to a small improvement in accuracy.
    total_loss = tf.add(cross_entropy + loc_loss,
                        tf.multiply(params['weight_decay'],
                                    tf.add_n(l2_loss_vars),
                                    name='l2_loss'),
                        name='total_loss')

    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_or_create_global_step()

        lr_values = [
            params['learning_rate'] * decay
            for decay in params['lr_decay_factors']
        ]
        learning_rate = tf.train.piecewise_constant(
            tf.cast(global_step, tf.int32),
            [int(_) for _ in params['decay_boundaries']], lr_values)
        truncated_learning_rate = tf.maximum(
            learning_rate,
            tf.constant(params['end_learning_rate'],
                        dtype=learning_rate.dtype),
            name='learning_rate')
        # Create a tensor named learning_rate for logging purposes.
        tf.summary.scalar('learning_rate', truncated_learning_rate)

        optimizer = tf.train.MomentumOptimizer(
            learning_rate=truncated_learning_rate,
            momentum=params['momentum'])

        optimizer = tf.contrib.estimator.TowerOptimizer(optimizer)

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(total_loss, global_step)
    else:
        train_op = None

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=total_loss,
        train_op=train_op,
        eval_metric_ops=metrics,
        scaffold=tf.train.Scaffold(init_fn=get_init_fn()))
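
# For context, a minimal sketch (TF 1.x) of how a model_fn like the one above is
# typically driven with tf.estimator. `train_input_fn`, the model_dir and the
# params values below are placeholders, not this repository's actual
# configuration; only the params keys are taken from the code above. Because the
# optimizer is wrapped in TowerOptimizer, the real training script presumably
# also passes the model_fn through tf.contrib.estimator.replicate_model_fn for
# multi-GPU training, as hinted at here.
replicated_ssd_model_fn = tf.contrib.estimator.replicate_model_fn(ssd_model_fn)

ssd_detector = tf.estimator.Estimator(
    model_fn=replicated_ssd_model_fn,
    model_dir='./logs',                      # placeholder checkpoint directory
    params={
        'model_scope': 'ssd300',             # placeholder values from here on
        'data_format': 'channels_first',
        'num_classes': 21,
        'negative_ratio': 3.,
        'weight_decay': 5e-4,
        'learning_rate': 1e-3,
        'end_learning_rate': 1e-6,
        'lr_decay_factors': [1., 0.1, 0.01],
        'decay_boundaries': [80000, 100000],
        'momentum': 0.9,
    })
# `train_input_fn` is assumed to return (features, labels) batches in the
# format the model_fn expects (image tensor plus loc/cls targets).
ssd_detector.train(input_fn=train_input_fn, max_steps=120000)
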
def main(_):
    with tf.Graph().as_default():
        out_shape = [FLAGS.train_image_size] * 2

        image_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
        shape_input = tf.placeholder(tf.int32, shape=(2, ))

        features = ssd_preprocessing.preprocess_for_eval(
            image_input,
            out_shape,
            data_format=FLAGS.data_format,
            output_rgb=False)
        features = tf.expand_dims(features, axis=0)

        anchor_creator = anchor_manipulator.AnchorCreator(
            out_shape,
            layers_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3),
                           (1, 1)],
            anchor_scales=[(0.1, ), (0.2, ), (0.375, ), (0.55, ), (0.725, ),
                           (0.9, )],
            extra_anchor_scales=[(0.1414, ), (0.2739, ), (0.4541, ),
                                 (0.6315, ), (0.8078, ), (0.9836, )],
            anchor_ratios=[(1., 2., .5), (1., 2., 3., .5, 0.3333),
                           (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                           (1., 2., .5), (1., 2., .5)],
            #anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., .5), (2., .5)],
            layer_steps=[8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(
            allowed_borders=[1.0] * 6,
            positive_threshold=None,
            ignore_threshold=None,
            prior_scaling=[0.1, 0.1, 0.2, 0.2])

        decode_fn = lambda pred: anchor_encoder_decoder.ext_decode_all_anchors(
            pred, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)

        with tf.variable_scope(FLAGS.model_scope,
                               default_name=None,
                               values=[features],
                               reuse=tf.AUTO_REUSE):
            backbone = ssd_net.VGG16Backbone(FLAGS.data_format)
            feature_layers = backbone.forward(features, training=False)
            location_pred, cls_pred = ssd_net.multibox_head(
                feature_layers,
                FLAGS.num_classes,
                all_num_anchors_depth,
                data_format=FLAGS.data_format)
            if FLAGS.data_format == 'channels_first':
                cls_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred
                ]
                location_pred = [
                    tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred
                ]

            cls_pred = [
                tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred
            ]
            location_pred = [
                tf.reshape(pred, [-1, 4]) for pred in location_pred
            ]

            cls_pred = tf.concat(cls_pred, axis=0)
            location_pred = tf.concat(location_pred, axis=0)

        with tf.device('/cpu:0'):
            bboxes_pred = decode_fn(location_pred)
            bboxes_pred = tf.concat(bboxes_pred, axis=0)
            selected_bboxes, selected_scores = parse_by_class(
                cls_pred, bboxes_pred, FLAGS.num_classes,
                FLAGS.select_threshold, FLAGS.min_size, FLAGS.keep_topk,
                FLAGS.nms_topk, FLAGS.nms_threshold)

            labels_list = []
            scores_list = []
            bboxes_list = []
            for k, v in selected_scores.items():
                labels_list.append(tf.ones_like(v, tf.int32) * k)
                scores_list.append(v)
                bboxes_list.append(selected_bboxes[k])
            all_labels = tf.concat(labels_list, axis=0)
            all_scores = tf.concat(scores_list, axis=0)
            all_bboxes = tf.concat(bboxes_list, axis=0)

        saver = tf.train.Saver()
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)

            saver.restore(sess, get_checkpoint())

            np_image = imread('./demo/test.jpg')
            labels_, scores_, bboxes_ = sess.run(
                [all_labels, all_scores, all_bboxes],
                feed_dict={
                    image_input: np_image,
                    shape_input: np_image.shape[:-1]
                })

            img_to_draw = draw_toolbox.bboxes_draw_on_img(np_image,
                                                          labels_,
                                                          scores_,
                                                          bboxes_,
                                                          thickness=2)
            imsave('./demo/test_out.jpg', img_to_draw)
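
# Sketch of the flag definitions and entry point a demo script like main() above
# would need. The flag names are exactly the ones main() references; the default
# values and help strings are assumptions for illustration only.
tf.app.flags.DEFINE_integer('train_image_size', 300,
                            'Input size of the SSD model (assumed default).')
tf.app.flags.DEFINE_integer('num_classes', 21,
                            'Number of classes, including background (assumed).')
tf.app.flags.DEFINE_string('data_format', 'channels_last',
                           "Either 'channels_first' or 'channels_last'.")
tf.app.flags.DEFINE_string('model_scope', 'ssd300',
                           'Variable scope of the SSD model (assumed).')
tf.app.flags.DEFINE_float('select_threshold', 0.2,
                          'Class score threshold before NMS (assumed).')
tf.app.flags.DEFINE_float('min_size', 0.03,
                          'Minimum box size to keep (assumed).')
tf.app.flags.DEFINE_integer('keep_topk', 200,
                            'Boxes kept per class before NMS (assumed).')
tf.app.flags.DEFINE_integer('nms_topk', 20,
                            'Boxes kept per class after NMS (assumed).')
tf.app.flags.DEFINE_float('nms_threshold', 0.45,
                          'IoU threshold used by NMS (assumed).')
FLAGS = tf.app.flags.FLAGS

if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.app.run()
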
def ssd_model_fn(features, labels, mode, params):
    """model_fn for SSD to be used with our Estimator."""
    shape = labels['shape']
    loc_targets = labels['loc_targets']
    cls_targets = labels['cls_targets']
    match_scores = labels['match_scores']

    global global_anchor_info
    decode_fn = global_anchor_info['decode_fn']
    num_anchors_per_layer = global_anchor_info['num_anchors_per_layer']
    all_num_anchors_depth = global_anchor_info['all_num_anchors_depth']

    # bboxes_pred = decode_fn(loc_targets[0])
    # bboxes_pred = [tf.reshape(preds, [-1, 4]) for preds in bboxes_pred]
    # bboxes_pred = tf.concat(bboxes_pred, axis=0)
    # save_image_op = tf.py_func(save_image_with_bbox,
    #                            [ssd_preprocessing.unwhiten_image(features[0]),
    #                             tf.clip_by_value(cls_targets[0], 0, tf.int64.max),
    #                             match_scores[0],
    #                             bboxes_pred],
    #                            tf.int64, stateful=True)
    # with tf.control_dependencies([save_image_op]):

    #print(all_num_anchors_depth)
    with tf.variable_scope(params['model_scope'],
                           default_name=None,
                           values=[features],
                           reuse=tf.AUTO_REUSE):
        backbone = ssd_net.VGG16Backbone(params['data_format'])
        feature_layers = backbone.forward(
            features, training=(mode == tf.estimator.ModeKeys.TRAIN))
        #print(feature_layers)

        # In channels_first format, each entry of location_pred has shape
        # [batch_size, num_anchors * 4, H, W] and each entry of cls_pred has
        # shape [batch_size, num_anchors * num_classes, H, W], where
        # (H, W, num_anchors) runs over (38, 38, 4), (19, 19, 6), (10, 10, 6),
        # (5, 5, 6), (3, 3, 4) and (1, 1, 4).
        location_pred, cls_pred = ssd_net.multibox_head(
            feature_layers,
            params['num_classes'],
            all_num_anchors_depth,
            data_format=params['data_format'])
        if params['data_format'] == 'channels_first':
            cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred]
            location_pred = [
                tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred
            ]
        # After the transpose: cls_pred[i] is [batch_size, H, W, num_anchors * num_classes],
        # location_pred[i] is [batch_size, H, W, num_anchors * 4].

        cls_pred = [
            tf.reshape(pred,
                       [tf.shape(features)[0], -1, params['num_classes']])
            for pred in cls_pred
        ]
        location_pred = [
            tf.reshape(pred, [tf.shape(features)[0], -1, 4])
            for pred in location_pred
        ]
        # cls_pred[i]: [batch_size, H*W*num_anchors, num_classes]
        # location_pred[i]: [batch_size, H*W*num_anchors, 4]

        cls_pred = tf.concat(cls_pred, axis=1)
        location_pred = tf.concat(location_pred, axis=1)

        cls_pred = tf.reshape(cls_pred, [-1, params['num_classes']])
        location_pred = tf.reshape(location_pred, [-1, 4])
        # cls_pred: [batch_size * (38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4), num_classes]
        # location_pred: [batch_size * (38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4), 4]

    with tf.device('/cpu:0'):
        with tf.control_dependencies([cls_pred, location_pred]):
            with tf.name_scope('post_forward'):
                # location_pred: [batch_size, 8732, 4], the predicted offsets for every prior box.
                # decode_fn: given the coordinates of the 8732 prior boxes and their
                # predicted offsets, recovers the actual positions of the 8732 predicted boxes.
                bboxes_pred = decode_fn(
                    tf.reshape(location_pred,
                               [tf.shape(features)[0], -1, 4]))
                bboxes_pred = tf.reshape(bboxes_pred, [-1, 4])
                # bboxes_pred: [batch_size*8732, 4]; the 4 values are the box [ymin, xmin, ymax, xmax].

                # cls_targets: [batch_size, 8732]
                flaten_cls_targets = tf.reshape(cls_targets, [-1])  # [batch_size*8732]
                flaten_match_scores = tf.reshape(match_scores, [-1])
                flaten_loc_targets = tf.reshape(loc_targets, [-1, 4])  # [batch_size*8732, 4]

                # each positive example has one label
                positive_mask = flaten_cls_targets > 0
                n_positives = tf.count_nonzero(positive_mask)

                # batch_n_positives: [batch_size]; the i-th entry is the number of
                # positive prior boxes in the i-th image.
                batch_n_positives = tf.count_nonzero(cls_targets > 0, -1)

                # batch_negtive_mask: [batch_size, 8732]
                batch_negtive_mask = tf.equal(cls_targets, 0)
                # batch_n_negtives: [batch_size]; the i-th entry is the number of
                # negative prior boxes in the i-th image.
                batch_n_negtives = tf.count_nonzero(batch_negtive_mask, -1)

                # negative_ratio is 3, i.e. three negatives are kept for every positive.
                batch_n_neg_select = tf.to_int32(
                    params['negative_ratio'] * tf.to_float(batch_n_positives))
                batch_n_neg_select = tf.minimum(batch_n_neg_select,
                                                tf.to_int32(batch_n_negtives))
                # batch_n_neg_select: [batch_size]; the i-th entry is the number of
                # negative prior boxes to select for the i-th image.

                # hard negative mining for classification
                # predictions_for_bg: [batch_size, 8732]
                predictions_for_bg = tf.nn.softmax(
                    tf.reshape(
                        cls_pred,
                        [tf.shape(features)[0], -1,
                         params['num_classes']]))[:, :, 0]
                prob_for_negtives = tf.where(
                    batch_negtive_mask,
                    0. - predictions_for_bg,
                    # ignore all the positives
                    0. - tf.ones_like(predictions_for_bg))
                # prob_for_negtives: [batch_size, 8732]. For prior boxes labelled as
                # background (class 0) the negated background score is filled in; all
                # other positions get -1.
                # topk_prob_for_bg: [batch_size, 8732], sorted in descending order along
                # the second dimension.
                topk_prob_for_bg, _ = tf.nn.top_k(
                    prob_for_negtives, k=tf.shape(prob_for_negtives)[1])
                # score_at_k: [batch_size]. If m negatives are selected for the i-th image,
                # the i-th entry is the m-th largest negated background score, i.e. minus the
                # highest background probability among the m hardest negatives. The hardest
                # negatives are the ones predicted most wrongly: they are background, yet
                # their predicted background probability is low. (The minus sign comes from
                # the negation above, added so that the descending top_k puts the hardest
                # negatives first.)
                score_at_k = tf.gather_nd(
                    topk_prob_for_bg,
                    tf.stack(
                        [tf.range(tf.shape(features)[0]),
                         batch_n_neg_select - 1],
                        axis=-1))
                # selected_neg_mask: [batch_size, 8732]; True at the positions of the
                # selected negatives, False elsewhere.
                selected_neg_mask = prob_for_negtives >= tf.expand_dims(
                    score_at_k, axis=-1)

                # include both selected negatives and all positive examples
                # final_mask: [batch_size*8732]; True for the selected positives and
                # negatives, False elsewhere.
                final_mask = tf.stop_gradient(
                    tf.logical_or(
                        tf.reshape(
                            tf.logical_and(batch_negtive_mask,
                                           selected_neg_mask), [-1]),
                        positive_mask))
                total_examples = tf.count_nonzero(final_mask)

                # Suppose the batch contains m positives and n selected negatives in total.
                # cls_pred: [m + n, num_classes]
                cls_pred = tf.boolean_mask(cls_pred, final_mask)
                # location_pred: [m, 4]
                location_pred = tf.boolean_mask(
                    location_pred, tf.stop_gradient(positive_mask))
                flaten_cls_targets = tf.boolean_mask(
                    tf.clip_by_value(flaten_cls_targets, 0,
                                     params['num_classes']), final_mask)
                flaten_loc_targets = tf.stop_gradient(
                    tf.boolean_mask(flaten_loc_targets, positive_mask))

                predictions = {
                    'classes': tf.argmax(cls_pred, axis=-1),
                    'probabilities': tf.reduce_max(
                        tf.nn.softmax(cls_pred, name='softmax_tensor'),
                        axis=-1),
                    'loc_predict': bboxes_pred
                }

                cls_accuracy = tf.metrics.accuracy(flaten_cls_targets,
                                                   predictions['classes'])
                metrics = {'cls_accuracy': cls_accuracy}

    # Create a tensor named train_accuracy for logging purposes.
    tf.identity(cls_accuracy[1], name='cls_accuracy')
    tf.summary.scalar('cls_accuracy', cls_accuracy[1])

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    #cross_entropy = tf.cond(n_positives > 0, lambda: tf.losses.sparse_softmax_cross_entropy(labels=flaten_cls_targets, logits=cls_pred), lambda: 0.)# * (params['negative_ratio'] + 1.)
    #flaten_cls_targets=tf.Print(flaten_cls_targets, [flaten_loc_targets],summarize=50000)
    cross_entropy = tf.losses.sparse_softmax_cross_entropy(
        labels=flaten_cls_targets,
        logits=cls_pred) * (params['negative_ratio'] + 1.)
    # Create a tensor named cross_entropy for logging purposes.
    tf.identity(cross_entropy, name='cross_entropy_loss')
    tf.summary.scalar('cross_entropy_loss', cross_entropy)

    #loc_loss = tf.cond(n_positives > 0, lambda: modified_smooth_l1(location_pred, tf.stop_gradient(flaten_loc_targets), sigma=1.), lambda: tf.zeros_like(location_pred))
    loc_loss = modified_smooth_l1(location_pred, flaten_loc_targets, sigma=1.)
    #loc_loss = modified_smooth_l1(location_pred, tf.stop_gradient(gtargets))
    loc_loss = tf.reduce_mean(tf.reduce_sum(loc_loss, axis=-1),
                              name='location_loss')
    tf.summary.scalar('location_loss', loc_loss)
    tf.losses.add_loss(loc_loss)

    l2_loss_vars = []
    for trainable_var in tf.trainable_variables():
        if '_bn' not in trainable_var.name:
            if 'conv4_3_scale' not in trainable_var.name:
                l2_loss_vars.append(tf.nn.l2_loss(trainable_var))
            else:
                l2_loss_vars.append(tf.nn.l2_loss(trainable_var) * 0.1)
    # Add weight decay to the loss. We exclude the batch norm variables because
    # doing so leads to a small improvement in accuracy.
    total_loss = tf.add(cross_entropy + loc_loss,
                        tf.multiply(params['weight_decay'],
                                    tf.add_n(l2_loss_vars),
                                    name='l2_loss'),
                        name='total_loss')

    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_or_create_global_step()

        lr_values = [
            params['learning_rate'] * decay
            for decay in params['lr_decay_factors']
        ]
        learning_rate = tf.train.piecewise_constant(
            tf.cast(global_step, tf.int32),
            [int(_) for _ in params['decay_boundaries']], lr_values)
        truncated_learning_rate = tf.maximum(
            learning_rate,
            tf.constant(params['end_learning_rate'],
                        dtype=learning_rate.dtype),
            name='learning_rate')
        # Create a tensor named learning_rate for logging purposes.
        tf.summary.scalar('learning_rate', truncated_learning_rate)

        optimizer = tf.train.MomentumOptimizer(
            learning_rate=truncated_learning_rate,
            momentum=params['momentum'])

        optimizer = tf.contrib.estimator.TowerOptimizer(optimizer)

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(total_loss, global_step)
    else:
        train_op = None

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=total_loss,
        train_op=train_op,
        eval_metric_ops=metrics,
        scaffold=tf.train.Scaffold(init_fn=get_init_fn()))
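
# The snippet below is a self-contained numpy sketch of the hard-negative-mining
# rule commented above, on made-up numbers (1 positive and 5 background priors,
# negative_ratio = 3). It is for illustration only and is not part of the
# training code; the variable names mirror the TensorFlow graph above.
import numpy as np

cls_targets = np.array([[1, 0, 0, 0, 0, 0]])             # class label per prior, 0 = background
p_bg = np.array([[0.10, 0.90, 0.20, 0.60, 0.30, 0.80]])  # predicted background probability

negative_mask = cls_targets == 0                         # background priors
n_positives = (cls_targets > 0).sum(axis=-1)             # -> [1]
n_neg_select = np.minimum(3 * n_positives,               # negative_ratio = 3
                          negative_mask.sum(axis=-1))    # -> [3]

# Negate the background probability so that a descending sort puts the hardest
# negatives (background priors with the *lowest* background score) first.
prob_for_negatives = np.where(negative_mask, -p_bg, -np.ones_like(p_bg))
sorted_desc = -np.sort(-prob_for_negatives, axis=-1)
score_at_k = sorted_desc[np.arange(len(sorted_desc)), n_neg_select - 1]  # -> [-0.6]

selected_neg_mask = prob_for_negatives >= score_at_k[:, None]
final_mask = (cls_targets > 0) | (negative_mask & selected_neg_mask)
print(final_mask)  # [[ True False  True  True  True False]]: the positive plus the 3 hardest negatives
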
def ssd_model_fn(features, labels, mode, params):
    """model_fn for SSD to be used with our Estimator."""
    filename = features['filename']
    shape = features['shape']
    loc_targets = features['loc_targets']
    cls_targets = features['cls_targets']
    match_scores = features['match_scores']
    features = features['image']

    global global_anchor_info
    decode_fn = global_anchor_info['decode_fn']
    num_anchors_per_layer = global_anchor_info['num_anchors_per_layer']
    all_num_anchors_depth = global_anchor_info['all_num_anchors_depth']

    with tf.variable_scope(params['model_scope'],
                           default_name=None,
                           values=[features],
                           reuse=tf.AUTO_REUSE):
        backbone = ssd_net.VGG16Backbone(params['data_format'])
        feature_layers = backbone.forward(
            features, training=(mode == tf.estimator.ModeKeys.TRAIN))
        #print(feature_layers)
        location_pred, cls_pred = ssd_net.multibox_head(
            feature_layers,
            params['num_classes'],
            all_num_anchors_depth,
            data_format=params['data_format'])
        if params['data_format'] == 'channels_first':
            cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred]
            location_pred = [
                tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred
            ]

        cls_pred = [
            tf.reshape(pred,
                       [tf.shape(features)[0], -1, params['num_classes']])
            for pred in cls_pred
        ]
        location_pred = [
            tf.reshape(pred, [tf.shape(features)[0], -1, 4])
            for pred in location_pred
        ]

        cls_pred = tf.concat(cls_pred, axis=1)
        location_pred = tf.concat(location_pred, axis=1)

        cls_pred = tf.reshape(cls_pred, [-1, params['num_classes']])
        location_pred = tf.reshape(location_pred, [-1, 4])

    with tf.device('/cpu:0'):
        bboxes_pred = decode_fn(location_pred)
        bboxes_pred = tf.concat(bboxes_pred, axis=0)
        selected_bboxes, selected_scores = parse_by_class(
            cls_pred, bboxes_pred, params['num_classes'],
            params['select_threshold'], params['min_size'],
            params['keep_topk'], params['nms_topk'], params['nms_threshold'])

        predictions = {'filename': filename, 'shape': shape}
        for class_ind in range(1, params['num_classes']):
            predictions['scores_{}'.format(class_ind)] = tf.expand_dims(
                selected_scores[class_ind], axis=0)
            predictions['bboxes_{}'.format(class_ind)] = tf.expand_dims(
                selected_bboxes[class_ind], axis=0)

        flaten_cls_targets = tf.reshape(cls_targets, [-1])
        flaten_match_scores = tf.reshape(match_scores, [-1])
        flaten_loc_targets = tf.reshape(loc_targets, [-1, 4])

        # each positive example has one label
        positive_mask = flaten_cls_targets > 0
        n_positives = tf.count_nonzero(positive_mask)

        batch_n_positives = tf.count_nonzero(cls_targets, -1)

        batch_negtive_mask = tf.equal(
            cls_targets,
            0)  #tf.logical_and(tf.equal(cls_targets, 0), match_scores > 0.)
        batch_n_negtives = tf.count_nonzero(batch_negtive_mask, -1)

        batch_n_neg_select = tf.cast(
            params['negative_ratio'] *
            tf.cast(batch_n_positives, tf.float32), tf.int32)
        batch_n_neg_select = tf.minimum(batch_n_neg_select,
                                        tf.cast(batch_n_negtives, tf.int32))

        # hard negative mining for classification
        predictions_for_bg = tf.nn.softmax(
            tf.reshape(cls_pred,
                       [tf.shape(features)[0], -1,
                        params['num_classes']]))[:, :, 0]
        prob_for_negtives = tf.where(
            batch_negtive_mask,
            0. - predictions_for_bg,
            # ignore all the positives
            0. - tf.ones_like(predictions_for_bg))
        topk_prob_for_bg, _ = tf.nn.top_k(prob_for_negtives,
                                          k=tf.shape(prob_for_negtives)[1])
        score_at_k = tf.gather_nd(
            topk_prob_for_bg,
            tf.stack(
                [tf.range(tf.shape(features)[0]), batch_n_neg_select - 1],
                axis=-1))

        selected_neg_mask = prob_for_negtives >= tf.expand_dims(score_at_k,
                                                                axis=-1)

        # include both selected negatives and all positive examples
        final_mask = tf.stop_gradient(
            tf.logical_or(
                tf.reshape(
                    tf.logical_and(batch_negtive_mask, selected_neg_mask),
                    [-1]), positive_mask))
        total_examples = tf.count_nonzero(final_mask)

        cls_pred = tf.boolean_mask(cls_pred, final_mask)
        location_pred = tf.boolean_mask(location_pred,
                                        tf.stop_gradient(positive_mask))
        flaten_cls_targets = tf.boolean_mask(
            tf.clip_by_value(flaten_cls_targets, 0, params['num_classes']),
            final_mask)
        flaten_loc_targets = tf.stop_gradient(
            tf.boolean_mask(flaten_loc_targets, positive_mask))

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    #cross_entropy = (params['negative_ratio'] + 1.) * tf.cond(n_positives > 0, lambda: tf.losses.sparse_softmax_cross_entropy(labels=glabels, logits=cls_pred), lambda: 0.)
    cross_entropy = tf.losses.sparse_softmax_cross_entropy(
        labels=flaten_cls_targets,
        logits=cls_pred) * (params['negative_ratio'] + 1.)
    # Create a tensor named cross_entropy for logging purposes.
    tf.identity(cross_entropy, name='cross_entropy_loss')
    tf.summary.scalar('cross_entropy_loss', cross_entropy)

    #loc_loss = tf.cond(n_positives > 0, lambda: modified_smooth_l1(location_pred, tf.stop_gradient(flaten_loc_targets), sigma=1.), lambda: tf.zeros_like(location_pred))
    loc_loss = modified_smooth_l1(location_pred, flaten_loc_targets, sigma=1.)
    loc_loss = tf.reduce_mean(tf.reduce_sum(loc_loss, axis=-1),
                              name='location_loss')
    tf.summary.scalar('location_loss', loc_loss)
    tf.losses.add_loss(loc_loss)

    # Add weight decay to the loss. We exclude the batch norm variables because
    # doing so leads to a small improvement in accuracy.
    total_loss = tf.add(cross_entropy, loc_loss, name='total_loss')

    cls_accuracy = tf.metrics.accuracy(flaten_cls_targets,
                                       tf.argmax(cls_pred, axis=-1))

    # Create a tensor named train_accuracy for logging purposes.
    tf.identity(cls_accuracy[1], name='cls_accuracy')
    tf.summary.scalar('cls_accuracy', cls_accuracy[1])

    summary_hook = tf.train.SummarySaverHook(
        save_steps=params['save_summary_steps'],
        output_dir=params['summary_dir'],
        summary_op=tf.summary.merge_all())

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions,
                                          prediction_hooks=[summary_hook],
                                          loss=None,
                                          train_op=None)
    else:
        raise ValueError('This script only support "PREDICT" mode!')
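
# A minimal sketch of how this PREDICT-only model_fn would typically be driven.
# `eval_input_fn`, `process_detections`, the model_dir and the params values are
# placeholders for whatever the surrounding evaluation script actually provides;
# only the params keys are taken from the model_fn above.
ssd_detector = tf.estimator.Estimator(
    model_fn=ssd_model_fn,
    model_dir='./logs',                  # placeholder: directory holding the trained checkpoint
    params={
        'model_scope': 'ssd300',         # placeholder values from here on
        'data_format': 'channels_first',
        'num_classes': 21,
        'negative_ratio': 3.,
        'select_threshold': 0.05,
        'min_size': 0.03,
        'keep_topk': 200,
        'nms_topk': 200,
        'nms_threshold': 0.45,
        'save_summary_steps': 100,
        'summary_dir': './logs/predict',
    })
for pred_dict in ssd_detector.predict(input_fn=eval_input_fn,
                                      yield_single_examples=False):
    # pred_dict carries 'filename', 'shape' and the per-class 'scores_k' /
    # 'bboxes_k' entries built above; downstream code turns these into
    # detection results (e.g. VOC-style output files).
    process_detections(pred_dict)        # placeholder post-processing hook
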