def yolo_loss(args,
              anchors,
              num_classes,
              ignore_thresh=.5,
              label_smoothing=0.1,
              print_loss=False):
    # 一共有三层
    num_layers = len(anchors) // 3

    # 将预测结果和实际ground truth分开,args是[*model_body.output, *y_true]
    # y_true是一个列表,包含三个特征层,shape分别为(m,13,13,3,85),(m,26,26,3,85),(m,52,52,3,85)。
    # yolo_outputs是一个列表,包含三个特征层,shape分别为(m,13,13,255),(m,26,26,255),(m,52,52,255)。
    y_true = args[num_layers:]
    yolo_outputs = args[:num_layers]

    # 先验框
    # 678为142,110,  192,243,  459,401
    # 345为36,75,  76,55,  72,146
    # 012为12,16,  19,36,  40,28
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]
                   ] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]]

    # 得到input_shpae为608,608
    input_shape = K.cast(
        K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))

    loss = 0

    # 取出每一张图片
    # m的值就是batch_size
    m = K.shape(yolo_outputs[0])[0]
    mf = K.cast(m, K.dtype(yolo_outputs[0]))

    # y_true是一个列表,包含三个特征层,shape分别为(m,13,13,3,85),(m,26,26,3,85),(m,52,52,3,85)。
    # yolo_outputs是一个列表,包含三个特征层,shape分别为(m,13,13,255),(m,26,26,255),(m,52,52,255)。
    for l in range(num_layers):
        # 以第一个特征层(m,13,13,3,85)为例子
        # 取出该特征层中存在目标的点的位置。(m,13,13,3,1)
        object_mask = y_true[l][..., 4:5]
        # 取出其对应的种类(m,13,13,3,80)
        true_class_probs = y_true[l][..., 5:]
        if label_smoothing:
            true_class_probs = _smooth_labels(true_class_probs,
                                              label_smoothing)

        # 将yolo_outputs的特征层输出进行处理
        # grid为网格结构(13,13,1,2),raw_pred为尚未处理的预测结果(m,13,13,3,85)
        # 还有解码后的xy,wh,(m,13,13,3,2)
        grid, raw_pred, pred_xy, pred_wh = yolo_head(yolo_outputs[l],
                                                     anchors[anchor_mask[l]],
                                                     num_classes,
                                                     input_shape,
                                                     calc_loss=True)

        # 这个是解码后的预测的box的位置
        # (m,13,13,3,4)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # 找到负样本群组,第一步是创建一个数组,[]
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]),
                                     size=1,
                                     dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        # 对每一张图片计算ignore_mask
        def loop_body(b, ignore_mask):
            # 取出第b副图内,真实存在的所有的box的参数
            # n,4
            true_box = tf.boolean_mask(y_true[l][b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            # 计算预测结果与真实情况的iou
            # pred_box为13,13,3,4
            # 计算的结果是每个pred_box和其它所有真实框的iou
            # 13,13,3,n
            iou = box_iou(pred_box[b], true_box)

            # 13,13,3
            best_iou = K.max(iou, axis=-1)

            # 如果某些预测框和真实框的重合程度大于0.5,则忽略。
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        # 遍历所有的图片
        _, ignore_mask = tf.while_loop(lambda b, *args: b < m, loop_body,
                                       [0, ignore_mask])

        # 将每幅图的内容压缩,进行处理
        ignore_mask = ignore_mask.stack()
        # (m,13,13,3,1)
        ignore_mask = K.expand_dims(ignore_mask, -1)

        box_loss_scale = 2 - y_true[l][..., 2:3] * y_true[l][..., 3:4]

        # Calculate ciou loss as location loss
        raw_true_box = y_true[l][..., 0:4]
        ciou = box_ciou(pred_box, raw_true_box)
        ciou_loss = object_mask * box_loss_scale * (1 - ciou)
        ciou_loss = K.sum(ciou_loss) / mf
        location_loss = ciou_loss

        # 如果该位置本来有框,那么计算1与置信度的交叉熵
        # 如果该位置本来没有框,而且满足best_iou<ignore_thresh,则被认定为负样本
        # best_iou<ignore_thresh用于限制负样本数量
        confidence_loss = object_mask * K.binary_crossentropy(object_mask, raw_pred[..., 4:5], from_logits=True) + \
                          (1 - object_mask) * K.binary_crossentropy(object_mask, raw_pred[..., 4:5],
                                                                    from_logits=True) * ignore_mask

        class_loss = object_mask * K.binary_crossentropy(
            true_class_probs, raw_pred[..., 5:], from_logits=True)

        confidence_loss = K.sum(confidence_loss) / mf
        class_loss = K.sum(class_loss) / mf
        loss += location_loss + confidence_loss + class_loss
        # if print_loss:
        # loss = tf.Print(loss, [loss, location_loss, confidence_loss, class_loss, K.sum(ignore_mask)], message='loss: ')
    return loss
Example #2
0
def yolo_loss(args,
              anchors,
              num_classes,
              ignore_thresh=.5,
              label_smoothing=0.1,
              print_loss=False):
    # num_anchors = 3
    num_layers = len(anchors) // 3

    # yolo_outputs = [shape = (None, h//32, w//32, num_anchors*(5+num_classes)),
    #                 shape = (None, h//16, w//16, num_anchors*(5+num_classes)),
    #                 shape = (None, h//8, w//8, num_anchors*(5+num_classes))]
    yolo_outputs = args[:num_layers]
    # y_true = [shape = (None, h//32, w//32, num_anchors, 5+num_classes),
    #           shape = (None, h//16, w//16, num_anchors, 5+num_classes),
    #           shape = (None, h//8, w//8, num_anchors, 5+num_classes)]
    y_true = args[num_layers:]

    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]
                   ] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]]

    # input_shape = (h, w)
    input_shape = K.cast(
        K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))

    loss = 0
    bs = K.shape(yolo_outputs[0])[0]
    batch_size = K.cast(bs, K.dtype(yolo_outputs[0]))

    # y_true是一个列表,包含三个特征层,shape分别为(bs,13,13,3,85),(bs,26,26,3,85),(bs,52,52,3,85)。
    # yolo_outputs是一个列表,包含三个特征层,shape分别为(bs,13,13,255),(bs,26,26,255),(bs,52,52,255)。
    for i in range(num_layers):
        # 以第一个特征层(bs,13,13,3,85)为例子
        # 取出该特征层中存在目标的点的位置。(bs,13,13,3,1)
        object_mask = y_true[i][..., 4:5]
        # 取出其对应的种类(bs,13,13,3,80)
        true_class_probabilities = y_true[i][..., 5:]
        if label_smoothing:
            true_class_probabilities = _smooth_labels(true_class_probabilities,
                                                      label_smoothing)

        # 将yolo_outputs的特征层输出进行处理
        # grid为网格结构(13,13,1,2),raw_pred为尚未处理的预测结果(bs,13,13,3,85)
        # 还有解码后的xy,wh,(bs,13,13,3,2)
        grid, raw_pred, pred_xy, pred_wh = yolo_head(yolo_outputs[i],
                                                     anchors[anchor_mask[i]],
                                                     num_classes,
                                                     input_shape,
                                                     calc_loss=True)

        # 这个是解码后的预测的box的位置
        # (bs,13,13,3,4)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # 找到负样本群组,第一步是创建一个数组,[]
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]),
                                     size=1,
                                     dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        # 对每一张图片计算ignore_mask
        def loop_body(b, ignore_mask):
            # 取出第b副图内,真实存在的所有的box的参数
            # n,4
            true_box = tf.boolean_mask(y_true[i][b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            # 计算预测结果与真实情况的iou
            # pred_box为13,13,3,4
            # 计算的结果是每个pred_box和其它所有真实框的iou
            # 13,13,3,n
            iou = box_iou(pred_box[b], true_box)

            # 13,13,3
            best_iou = K.max(iou, axis=-1)

            # 如果某些预测框和真实框的重合程度大于0.5,则忽略。
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        # 遍历所有的图片
        _, ignore_mask = K.control_flow_ops.while_loop(lambda b, *args: b < bs,
                                                       loop_body,
                                                       [0, ignore_mask])

        # 将每幅图的内容压缩,进行处理
        ignore_mask = ignore_mask.stack()
        # (bs,13,13,3,1)
        ignore_mask = K.expand_dims(ignore_mask, -1)

        box_loss_scale = 2 - y_true[i][..., 2:3] * y_true[i][..., 3:4]

        # Calculate ciou loss as location loss
        raw_true_box = y_true[i][..., 0:4]
        ciou = box_ciou(pred_box, raw_true_box)
        ciou_loss = object_mask * box_loss_scale * (1 - ciou)
        ciou_loss = K.sum(ciou_loss) / batch_size
        location_loss = ciou_loss

        # 如果该位置本来有框,那么计算1与置信度的交叉熵
        # 如果该位置本来没有框,而且满足best_iou<ignore_thresh,则被认定为负样本
        # best_iou<ignore_thresh用于限制负样本数量
        confidence_loss = object_mask * K.binary_crossentropy(object_mask, raw_pred[..., 4:5], from_logits=True) + \
                          (1 - object_mask) * K.binary_crossentropy(object_mask, raw_pred[..., 4:5],
                                                                    from_logits=True) * ignore_mask

        class_loss = object_mask * K.binary_crossentropy(
            true_class_probabilities, raw_pred[..., 5:], from_logits=True)

        confidence_loss = K.sum(confidence_loss) / batch_size
        class_loss = K.sum(class_loss) / batch_size
        loss += location_loss + confidence_loss + class_loss
        # if print_loss:
        # loss = tf.Print(loss, [loss, location_loss, confidence_loss, class_loss, K.sum(ignore_mask)], message='loss: ')
    return loss
Example #3
0
def yolo_loss(args,
              anchors,
              num_classes,
              ignore_thresh=.5,
              label_smoothing=0.1,
              print_loss=False,
              normalize=True):
    # 一共有两层
    num_layers = len(anchors) // 3

    #---------------------------------------------------------------------------------------------------#
    #   将预测结果和实际ground truth分开,args是[*model_body.output, *y_true]
    #   y_true是一个列表,包含两个特征层,shape分别为(m,13,13,3,85),(m,26,26,3,85)
    #   yolo_outputs是一个列表,包含两个特征层,shape分别为(m,13,13,3,85),(m,26,26,3,85)
    #---------------------------------------------------------------------------------------------------#
    y_true = args[num_layers:]
    yolo_outputs = args[:num_layers]

    #-----------------------------------------------------------#
    #   13x13的特征层对应的anchor是[81,82], [135,169], [344,319]
    #   26x26的特征层对应的anchor是[23,27], [37,58], [81,82]
    #-----------------------------------------------------------#
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]
                   ] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]]

    # 得到input_shpae为416,416
    input_shape = K.cast(
        K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))

    loss = 0
    num_pos = 0
    #-----------------------------------------------------------#
    #   取出每一张图片
    #   m的值就是batch_size
    #-----------------------------------------------------------#
    m = K.shape(yolo_outputs[0])[0]
    mf = K.cast(m, K.dtype(yolo_outputs[0]))

    #---------------------------------------------------------------------------------------------------#
    #   y_true是一个列表,包含两个特征层,shape分别为(m,13,13,3,85),(m,26,26,3,85)
    #   yolo_outputs是一个列表,包含两个特征层,shape分别为(m,13,13,3,85),(m,26,26,3,85)
    #---------------------------------------------------------------------------------------------------#
    for l in range(num_layers):
        #-----------------------------------------------------------#
        #   以第一个特征层(m,13,13,3,85)为例子
        #   取出该特征层中存在目标的点的位置。(m,13,13,3,1)
        #-----------------------------------------------------------#
        object_mask = y_true[l][..., 4:5]
        #-----------------------------------------------------------#
        #   取出其对应的种类(m,13,13,3,80)
        #-----------------------------------------------------------#
        true_class_probs = y_true[l][..., 5:]
        if label_smoothing:
            true_class_probs = _smooth_labels(true_class_probs,
                                              label_smoothing)

        #-----------------------------------------------------------#
        #   将yolo_outputs的特征层输出进行处理、获得四个返回值
        #   其中:
        #   grid        (13,13,1,2) 网格坐标
        #   raw_pred    (m,13,13,3,85) 尚未处理的预测结果
        #   pred_xy     (m,13,13,3,2) 解码后的中心坐标
        #   pred_wh     (m,13,13,3,2) 解码后的宽高坐标
        #-----------------------------------------------------------#
        grid, raw_pred, pred_xy, pred_wh = yolo_head(yolo_outputs[l],
                                                     anchors[anchor_mask[l]],
                                                     num_classes,
                                                     input_shape,
                                                     calc_loss=True)

        #-----------------------------------------------------------#
        #   pred_box是解码后的预测的box的位置
        #   (m,13,13,3,4)
        #-----------------------------------------------------------#
        pred_box = K.concatenate([pred_xy, pred_wh])

        #-----------------------------------------------------------#
        #   找到负样本群组,第一步是创建一个数组,[]
        #-----------------------------------------------------------#
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]),
                                     size=1,
                                     dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        #-----------------------------------------------------------#
        #   对每一张图片计算ignore_mask
        #-----------------------------------------------------------#
        def loop_body(b, ignore_mask):
            #-----------------------------------------------------------#
            #   取出n个真实框:n,4
            #-----------------------------------------------------------#
            true_box = tf.boolean_mask(y_true[l][b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            #-----------------------------------------------------------#
            #   计算预测框与真实框的iou
            #   pred_box    13,13,3,4 预测框的坐标
            #   true_box    n,4 真实框的坐标
            #   iou         13,13,3,n 预测框和真实框的iou
            #-----------------------------------------------------------#
            iou = box_iou(pred_box[b], true_box)

            #-----------------------------------------------------------#
            #   best_iou    13,13,3 每个特征点与真实框的最大重合程度
            #-----------------------------------------------------------#
            best_iou = K.max(iou, axis=-1)

            #-----------------------------------------------------------#
            #   判断预测框和真实框的最大iou小于ignore_thresh
            #   则认为该预测框没有与之对应的真实框
            #   该操作的目的是:
            #   忽略预测结果与真实框非常对应特征点,因为这些框已经比较准了
            #   不适合当作负样本,所以忽略掉。
            #-----------------------------------------------------------#
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        #-----------------------------------------------------------#
        #   在这个地方进行一个循环、循环是对每一张图片进行的
        #-----------------------------------------------------------#
        _, ignore_mask = K.control_flow_ops.while_loop(lambda b, *args: b < m,
                                                       loop_body,
                                                       [0, ignore_mask])

        #-----------------------------------------------------------#
        #   ignore_mask用于提取出作为负样本的特征点
        #   (m,13,13,3)
        #-----------------------------------------------------------#
        ignore_mask = ignore_mask.stack()
        #   (m,13,13,3,1)
        ignore_mask = K.expand_dims(ignore_mask, -1)

        #-----------------------------------------------------------#
        #   真实框越大,比重越小,小框的比重更大。
        #-----------------------------------------------------------#
        box_loss_scale = 2 - y_true[l][..., 2:3] * y_true[l][..., 3:4]

        #-----------------------------------------------------------#
        #   计算Ciou loss
        #-----------------------------------------------------------#
        raw_true_box = y_true[l][..., 0:4]
        ciou = box_ciou(pred_box, raw_true_box)
        ciou_loss = object_mask * box_loss_scale * (1 - ciou)

        #------------------------------------------------------------------------------#
        #   如果该位置本来有框,那么计算1与置信度的交叉熵
        #   如果该位置本来没有框,那么计算0与置信度的交叉熵
        #   在这其中会忽略一部分样本,这些被忽略的样本满足条件best_iou<ignore_thresh
        #   该操作的目的是:
        #   忽略预测结果与真实框非常对应特征点,因为这些框已经比较准了
        #   不适合当作负样本,所以忽略掉。
        #------------------------------------------------------------------------------#
        confidence_loss = object_mask * K.binary_crossentropy(object_mask, raw_pred[...,4:5], from_logits=True)+ \
            (1-object_mask) * K.binary_crossentropy(object_mask, raw_pred[...,4:5], from_logits=True) * ignore_mask

        class_loss = object_mask * K.binary_crossentropy(
            true_class_probs, raw_pred[..., 5:], from_logits=True)

        location_loss = K.sum(
            tf.where(tf.is_nan(ciou_loss), tf.zeros_like(ciou_loss),
                     ciou_loss))
        confidence_loss = K.sum(
            tf.where(tf.is_nan(confidence_loss),
                     tf.zeros_like(confidence_loss), confidence_loss))
        class_loss = K.sum(
            tf.where(tf.is_nan(class_loss), tf.zeros_like(class_loss),
                     class_loss))
        #-----------------------------------------------------------#
        #   计算正样本数量
        #-----------------------------------------------------------#
        num_pos += tf.maximum(K.sum(K.cast(object_mask, tf.float32)), 1)
        loss += location_loss + confidence_loss + class_loss
        # if print_loss:
        #   loss = tf.Print(loss, [loss, location_loss, confidence_loss, class_loss, K.sum(ignore_mask)], message='loss: ')

    if normalize:
        loss = loss / num_pos
    else:
        loss = loss / mf
    return loss
Example #4
0
def yolo_loss(args,
              anchors,
              num_classes,
              ignore_thresh=.5,
              label_smoothing=0.1,
              print_loss=False):
    # three floors
    num_layers = len(anchors) // 3

    y_true = args[num_layers:]
    yolo_outputs = args[:num_layers]

    # priori box
    # 678   142,110,  192,243,  459,401
    # 345   36,75,  76,55,  72,146
    # 012   12,16,  19,36,  40,28
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]
                   ] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]]

    #input_shpae  608,608
    input_shape = K.cast(
        K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))

    loss = 0

    # Take out each picture
    m = K.shape(yolo_outputs[0])[0]
    mf = K.cast(m, K.dtype(yolo_outputs[0]))

    for e in range(num_layers):
        # Extract the position of the point with target in the feature layer
        object_mask = y_true[e][..., 4:5]
        # Take out the corresponding species
        true_class_probs = y_true[e][..., 5:]
        if label_smoothing:
            true_class_probs = _smooth_labels(true_class_probs,
                                              label_smoothing)

        # Process the characteristic layer output of YOLO_outputs
        grid, raw_pred, pred_xy, pred_wh = yolo_head(yolo_outputs[e],
                                                     anchors[anchor_mask[e]],
                                                     num_classes,
                                                     input_shape,
                                                     calc_loss=True)

        # Decoded predicted box position
        # (m,13,13,3,4)
        pred_box = K.concatenate([pred_xy, pred_wh])

        ignore_mask = tf.TensorArray(K.dtype(y_true[0]),
                                     size=1,
                                     dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        # Calculate ignore_mask for each image
        def loop_body(b, ignore_mask):
            true_box = tf.boolean_mask(y_true[e][b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            # Calculate the IOU of the prediction against the real situation
            # 13,13,3,n
            iou = box_iou(pred_box[b], true_box)

            # 13,13,3
            best_iou = K.max(iou, axis=-1)

            # If the degree of overlap between some prediction boxes and real boxes is greater than 0.5, it is ignored.。
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        # Walk through all the images
        _, ignore_mask = K.control_flow_ops.while_loop(lambda b, *args: b < m,
                                                       loop_body,
                                                       [0, ignore_mask])

        # Compress the content of each picture for processing
        ignore_mask = ignore_mask.stack()
        # (m,13,13,3,1)
        ignore_mask = K.expand_dims(ignore_mask, -1)

        box_loss_scale = 2 - y_true[e][..., 2:3] * y_true[e][..., 3:4]

        # Calculate ciou loss as location loss
        raw_true_box = y_true[e][..., 0:4]
        ciou = box_ciou(pred_box, raw_true_box)
        ciou_loss = object_mask * box_loss_scale * (1 - ciou)
        ciou_loss = K.sum(ciou_loss) / mf
        location_loss = ciou_loss

        # Calculate the cross entropy of 1 and confidence if there is a box at the location
        # If the location does not have a box and satisfies best_iou&lt; Ignore_thresh is considered a negative sample
        # best_iou<ignore_thresh     Used to limit the number of negative samples
        confidence_loss = object_mask * \
            K.binary_crossentropy(object_mask, raw_pred[..., 4:5], from_logits=True) + \
            (1 - object_mask) * K.binary_crossentropy(object_mask, raw_pred[..., 4:5], from_logits=True) * ignore_mask

        class_loss = object_mask * K.binary_crossentropy(
            true_class_probs, raw_pred[..., 5:], from_logits=True)

        confidence_loss = K.sum(confidence_loss) / mf
        class_loss = K.sum(class_loss) / mf
        loss += location_loss + confidence_loss + class_loss
        # if print_loss:
        # loss = tf.Print(loss, [loss, location_loss, confidence_loss, class_loss, K.sum(ignore_mask)], message='loss:')
    return loss