Ejemplo n.º 1
0
 def call(self, inputs):
     """Add a projection of ``X`` to a projection of every slice of ``Z``.

     Args:
         inputs: A pair ``(X, Z)``. ``Z`` is sliced as ``Z[:, i]``, i.e.
             along its second axis.

     Returns:
         The output of three further Dense layers applied to the stacked,
         ReLU-activated sums.
     """
     # NOTE(review): the Dense layers below are instantiated on every
     # invocation of ``call``; presumably they should be created once in
     # ``__init__``/``build`` so the weights are reused - confirm.
     X, Z = inputs
     x_proj = kl.Dense(16, activation='linear')(X)
     z_dense = kl.Dense(16, activation='linear')

     # Iterate over axis 1 of Z, the axis that ``Z[:, i]`` indexes. The
     # previous bound, ``Z.shape[0]``, was the size of axis 0 and therefore
     # did not match the slicing.
     combined = [
         K.expand_dims(x_proj + z_dense(Z[:, i]), axis=1)
         for i in range(Z.shape[1])
     ]
     combined = kl.concatenate(combined, axis=1)
     # combined is now shape (batch_size, z_size, 16)
     hidden = ka.relu(combined)
     hidden = kl.Dense(16, activation='relu')(hidden)
     hidden = kl.Dense(16, activation='relu')(hidden)
     return kl.Dense(16, activation='linear')(hidden)
Ejemplo n.º 2
0
 def mask(self, inputs, masks):
     """Offset ``inputs`` by ``self._masking_num`` wherever ``masks`` is set.

     ``masks`` is cast to float32, repeated along axis 0 so that its leading
     size matches that of ``inputs`` (integer ratio of the two leading
     sizes), given a new axis at position 1 and scaled by
     ``self._masking_num`` before being added to ``inputs``.
     """
     float_masks = K.cast(masks, 'float32')
     repeats = K.shape(inputs)[0] // K.shape(float_masks)[0]
     tiled = K.tile(float_masks, [repeats, 1])
     return inputs + K.expand_dims(tiled, 1) * self._masking_num
Ejemplo n.º 3
0
def box_ciou(b1, b2):
    """
    Compute the CIoU between two sets of boxes given as (x, y, w, h).

    Inputs:
    ----------
    b1: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh
    b2: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh

    Returns:
    -------
    ciou: tensor, shape=(batch, feat_w, feat_h, anchor_num, 1)
    """
    eps = K.epsilon()

    # Centre / size of the first box set and its two corners.
    xy_1, wh_1 = b1[..., :2], b1[..., 2:4]
    half_1 = wh_1 / 2.
    mins_1, maxes_1 = xy_1 - half_1, xy_1 + half_1

    # Same decomposition for the second box set.
    xy_2, wh_2 = b2[..., :2], b2[..., 2:4]
    half_2 = wh_2 / 2.
    mins_2, maxes_2 = xy_2 - half_2, xy_2 + half_2

    # Plain IoU: overlap rectangle area over union area.
    overlap_wh = K.maximum(
        K.minimum(maxes_1, maxes_2) - K.maximum(mins_1, mins_2), 0.)
    overlap_area = overlap_wh[..., 0] * overlap_wh[..., 1]
    area_1 = wh_1[..., 0] * wh_1[..., 1]
    area_2 = wh_2[..., 0] * wh_2[..., 1]
    union_area = area_1 + area_2 - overlap_area
    iou = overlap_area / K.maximum(union_area, eps)

    # Squared distance between centres, relative to the squared diagonal of
    # the smallest box enclosing both.
    center_sq = K.sum(K.square(xy_1 - xy_2), axis=-1)
    hull_wh = K.maximum(
        K.maximum(maxes_1, maxes_2) - K.minimum(mins_1, mins_2), 0.0)
    diagonal_sq = K.sum(K.square(hull_wh), axis=-1)
    distance_term = 1.0 * (center_sq) / K.maximum(diagonal_sq, eps)

    # Aspect-ratio term built from the difference of the w/h angles.
    angle_1 = tf.math.atan2(wh_1[..., 0], K.maximum(wh_1[..., 1], eps))
    angle_2 = tf.math.atan2(wh_2[..., 0], K.maximum(wh_2[..., 1], eps))
    v = 4 * K.square(angle_1 - angle_2) / (math.pi * math.pi)
    alpha = v / K.maximum((1.0 - iou + v), eps)

    ciou = iou - distance_term - alpha * v
    return K.expand_dims(ciou, -1)
Ejemplo n.º 4
0
def yolo_loss(args,
              anchors,
              num_classes,
              ignore_thresh=.5,
              label_smoothing=0.1,
              print_loss=False):
    """Build the YOLO training loss from CIoU, confidence and class terms.

    Args:
        args: Sequence ``[*yolo_outputs, *y_true]``; the first ``num_layers``
            entries are network outputs, the remaining entries the matching
            ground-truth tensors.
        anchors: Anchors, indexed per layer through ``anchor_mask``;
            ``len(anchors) // 3`` is used as the number of output layers.
        num_classes: Number of classes, forwarded to ``yolo_head``.
        ignore_thresh: A prediction whose best IoU with the true boxes of its
            image is not below this value is left out of the no-object
            confidence term.
        label_smoothing: When truthy, handed to ``_smooth_labels`` together
            with the true class probabilities.
        print_loss: Not used by the active code (only referenced by the
            commented-out debug print at the end of the loop).

    Returns:
        The loss accumulated over all layers (each term divided by the batch
        size) with one trailing axis added.
    """

    # Number of output layers: one per group of three anchors.
    num_layers = len(anchors) // 3

    # Split predictions from ground truth; args is [*model_body.output, *y_true].
    # Per the original author's notes, y_true holds two feature layers shaped
    # (m,13,13,3,85) and (m,26,26,3,85), and yolo_outputs two layers shaped
    # (m,13,13,255) and (m,26,26,255).
    y_true = args[num_layers:]
    yolo_outputs = args[:num_layers]

    # Anchor indices used by each output layer.
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]
                   ] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]]

    # Network input size: spatial size of the first output multiplied by 32.
    input_shape = K.cast(
        K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))

    loss = 0

    # m is the batch size (as a tensor); mf is the same value cast to the
    # dtype of the predictions so it can be used as a divisor.
    m = K.shape(yolo_outputs[0])[0]
    mf = K.cast(m, K.dtype(yolo_outputs[0]))

    # Accumulate the loss layer by layer.
    for l in range(num_layers):
        # Taking the first feature layer (m,13,13,3,85) as an example:
        # channel 4 marks the cells that contain an object -> (m,13,13,3,1).
        object_mask = y_true[l][..., 4:5]
        # Channels 5.. hold the class targets -> (m,13,13,3,80).
        true_class_probs = y_true[l][..., 5:]
        if label_smoothing:
            true_class_probs = _smooth_labels(true_class_probs,
                                              label_smoothing)

        # Decode this layer's output.
        # Per the original notes: grid is the cell grid (13,13,1,2), raw_pred
        # the unprocessed prediction (m,13,13,3,85), and pred_xy / pred_wh the
        # decoded centre and size (m,13,13,3,2).
        grid, raw_pred, pred_xy, pred_wh = yolo_head(yolo_outputs[l],
                                                     anchors[anchor_mask[l]],
                                                     num_classes,
                                                     input_shape,
                                                     calc_loss=True)

        # Decoded predicted boxes, xy followed by wh -> (m,13,13,3,4).
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Per-image mask selecting the cells that count as negatives; filled
        # one image at a time by the while-loop below.
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]),
                                     size=1,
                                     dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        # Compute the mask entry for image b.
        def loop_body(b, ignore_mask):
            # All ground-truth boxes present in image b -> (n,4).
            true_box = tf.boolean_mask(y_true[l][b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            # IoU of every predicted box of image b with every true box;
            # per the original notes pred_box[b] is (13,13,3,4) and the
            # result (13,13,3,n).
            iou = box_iou(pred_box[b], true_box)

            # Best IoU per predicted box -> (13,13,3).
            best_iou = K.max(iou, axis=-1)

            # 1 where the best IoU is below ignore_thresh, 0 otherwise; the 0
            # entries are later dropped from the no-object confidence term.
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        # Run loop_body for b = 0 .. m-1, i.e. once per image.
        _, ignore_mask = tf.while_loop(lambda b, *args: b < m, loop_body,
                                       [0, ignore_mask])

        # Stack the per-image entries into a single tensor ...
        ignore_mask = ignore_mask.stack()
        # ... and add a trailing axis -> (m,13,13,3,1).
        ignore_mask = K.expand_dims(ignore_mask, -1)

        # Weight 2 - w*h of the true box: larger boxes get a smaller weight.
        box_loss_scale = 2 - y_true[l][..., 2:3] * y_true[l][..., 3:4]

        # Calculate ciou loss as location loss
        raw_true_box = y_true[l][..., 0:4]
        ciou = box_ciou(pred_box, raw_true_box)
        ciou_loss = object_mask * box_loss_scale * (1 - ciou)
        ciou_loss = K.sum(ciou_loss) / mf
        location_loss = ciou_loss

        # Confidence loss: cross-entropy against object_mask. Cells with an
        # object always contribute; cells without an object contribute only
        # where ignore_mask is 1 (best_iou < ignore_thresh), which limits the
        # set of negative samples.
        confidence_loss = object_mask * K.binary_crossentropy(object_mask, raw_pred[...,4:5], from_logits=True)+ \
            (1-object_mask) * K.binary_crossentropy(object_mask, raw_pred[...,4:5], from_logits=True) * ignore_mask

        # Class loss, only for cells that contain an object.
        class_loss = object_mask * K.binary_crossentropy(
            true_class_probs, raw_pred[..., 5:], from_logits=True)

        # Sum each term over all elements and divide by the batch size.
        confidence_loss = K.sum(confidence_loss) / mf
        class_loss = K.sum(class_loss) / mf
        loss += location_loss + confidence_loss + class_loss
        # if print_loss:
        # loss = tf.Print(loss, [loss, confidence_loss, class_loss, location_loss], message='loss: ')
    # Add a trailing axis to the accumulated loss before returning it.
    loss = K.expand_dims(loss, axis=-1)
    return loss
Ejemplo n.º 5
0
 def find_path(argmin_table, best_idx):
     """Look up the next index for each row of ``argmin_table``.

     The first column of ``best_idx[0]`` is used to pick one entry per row
     via ``gather_each_row``; the result gets a trailing axis and is returned
     both on its own and wrapped in a one-element list.
     """
     # NOTE(review): the (output, [state]) return shape looks like a step
     # function for a recurrent scan - confirm against the caller.
     current = best_idx[0][:, 0]
     step = K.expand_dims(gather_each_row(argmin_table, current))
     return step, [step]
Ejemplo n.º 6
0
def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, print_loss=False):
    '''Return yolo_loss tensor

    Parameters
    ----------
    args: list of tensor, ``[*yolo_outputs, *y_true]``; the first
        ``len(anchors) // 3`` entries are the outputs of yolo_body or
        tiny_yolo_body, the rest the output of preprocess_true_boxes
    anchors: array, shape=(N, 2), wh
    num_classes: integer
    ignore_thresh: float, the iou threshold whether to ignore object confidence loss
    print_loss: bool, when true the running loss and its components are
        printed with ``tf.print`` after every layer

    Returns
    -------
    loss: tensor, the sum over all layers of the xy, wh, confidence and
        class terms, each divided by the batch size

    '''
    num_layers = len(anchors) // 3  # default setting
    yolo_outputs = args[:num_layers]
    y_true = args[num_layers:]
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]
                   ] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]]
    # Network input size: spatial size of the first output multiplied by 32.
    input_shape = K.cast(
        K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))
    grid_shapes = [
        K.cast(K.shape(yolo_outputs[l])[1:3], K.dtype(y_true[0]))
        for l in range(num_layers)
    ]
    loss = 0
    m = K.shape(yolo_outputs[0])[0]  # batch size, tensor
    mf = K.cast(m, K.dtype(yolo_outputs[0]))

    for l in range(num_layers):
        object_mask = y_true[l][..., 4:5]
        true_class_probs = y_true[l][..., 5:]

        grid, raw_pred, pred_xy, pred_wh = yolo_head(yolo_outputs[l],
                                                     anchors[anchor_mask[l]],
                                                     num_classes,
                                                     input_shape,
                                                     calc_loss=True)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Darknet raw box to calculate loss.
        raw_true_xy = y_true[l][..., :2] * grid_shapes[l][::-1] - grid
        raw_true_wh = K.log(y_true[l][..., 2:4] / anchors[anchor_mask[l]] *
                            input_shape[::-1])
        raw_true_wh = K.switch(object_mask, raw_true_wh,
                               K.zeros_like(raw_true_wh))  # avoid log(0)=-inf
        box_loss_scale = 2 - y_true[l][..., 2:3] * y_true[l][..., 3:4]

        # Find ignore mask, iterate over each of batch.
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]),
                                     size=1,
                                     dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, ignore_mask):
            # True boxes of image b, best IoU of every prediction against
            # them, and a 1/0 flag for "best IoU below ignore_thresh".
            true_box = tf.boolean_mask(y_true[l][b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            iou = box_iou(pred_box[b], true_box)
            best_iou = K.max(iou, axis=-1)
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        _, ignore_mask = tf.while_loop(lambda b, *args: b < m, loop_body,
                                       [0, ignore_mask])
        ignore_mask = ignore_mask.stack()
        ignore_mask = K.expand_dims(ignore_mask, -1)

        # K.binary_crossentropy is helpful to avoid exp overflow.
        xy_loss = object_mask * box_loss_scale * K.binary_crossentropy(
            raw_true_xy, raw_pred[..., 0:2], from_logits=True)
        wh_loss = object_mask * box_loss_scale * 0.5 * K.square(
            raw_true_wh - raw_pred[..., 2:4])
        confidence_loss = object_mask * K.binary_crossentropy(object_mask, raw_pred[...,4:5], from_logits=True)+ \
            (1-object_mask) * K.binary_crossentropy(object_mask, raw_pred[...,4:5], from_logits=True) * ignore_mask
        class_loss = object_mask * K.binary_crossentropy(
            true_class_probs, raw_pred[..., 5:], from_logits=True)

        xy_loss = K.sum(xy_loss) / mf
        wh_loss = K.sum(wh_loss) / mf
        confidence_loss = K.sum(confidence_loss) / mf
        class_loss = K.sum(class_loss) / mf
        loss += xy_loss + wh_loss + confidence_loss + class_loss
        if print_loss:
            # tf.print takes the values to print positionally, has no
            # ``message`` keyword and does not return its input, so its
            # result must not be assigned to ``loss`` (that was a leftover
            # from the older tf.Print API).
            print_op = tf.print('loss: ', loss, xy_loss, wh_loss,
                                confidence_loss, class_loss,
                                K.sum(ignore_mask))
            # When a graph is being built tf.print returns an op; tie it to
            # ``loss`` so the print runs whenever the loss is evaluated.
            # When executing eagerly it returns None and has already printed.
            if print_op is not None:
                with tf.control_dependencies([print_op]):
                    loss = tf.identity(loss)
    return loss
Ejemplo n.º 7
0
 def call(self, inputs, **kwargs):
     """Return the norm of ``inputs`` over its last axis, kept as a size-1 axis.

     A rank-5 input must have size 1 on its second-to-last axis; that axis
     is squeezed away before the norm is taken.

     Raises:
         AssertionError: if a rank-5 input has a second-to-last axis whose
             static size is not 1.
     """
     static_shape = inputs.get_shape()
     if static_shape.ndims == 5:
         # as_list() yields plain ints (or None), so the check does not
         # depend on shape indexing returning an object with ``.value``.
         assert static_shape.as_list(
         )[-2] == 1, 'Error: Must have num_capsules = 1 going into Length'
         inputs = K.squeeze(inputs, axis=-2)
     return K.expand_dims(tf.norm(inputs, axis=-1), axis=-1)
Ejemplo n.º 8
0
# Sub-models that map the inputs to the latent tensors z_sem and z_etym
# (sem_in_, enc_in_, z_sem and z_etym are defined outside this fragment).
embed_sem = Model(inputs=sem_in_, outputs=z_sem)
embed_etym = Model(inputs=[enc_in_, sem_in_], outputs=z_etym)

# Placeholder inputs of size latent_dim, one per embedding kind.
embed_lang_in_ = Input((latent_dim, ))
embed_POS_in_ = Input((latent_dim, ))
embed_sem_in_ = Input((latent_dim, ))
embed_etym_in_ = Input((latent_dim, ))

# Linear projection of the encoder input to embed_dim features.
embedding = Dense(embed_dim)(enc_in_)

# Encoder / decoder hidden sequences, multiplied by their masks.
# 'concat' is passed as the second argument of Bidirectional.
h_enc = Bidirectional(LSTM(hidden_dim, return_sequences=True),
                      'concat')(embedding) * enc_mask
h_dec = LSTM(hidden_dim, return_sequences=True,
             activation=None)(dec_in_) * dec_mask
#alignment_probs_,emission_probs = monotonic_alignment([h_enc,h_dec,T_x,T_y,Y,hidden_dim])
# Upper-triangular (T_x, T_x) matrix of ones with a leading axis of size 1.
struc_zeros = K.expand_dims(
    K.cast(np.triu(np.ones([T_x, T_x])), dtype='float32'), 0)
# Dot product between the projected encoder states and the decoder states,
# soft-maxed over axis -2.
alignment_probs = K.softmax(
    dot([Dense(hidden_dim)(h_enc), h_dec], axes=-1, normalize=False), -2)
# Repeat encoder states T_y times and decoder states T_x times on new axes,
# then concatenate the two along the last axis.
h_enc_rep = K.tile(K.expand_dims(h_enc, -2), [1, 1, T_y, 1])
h_dec_rep = K.tile(K.expand_dims(h_dec, -3), [1, T_x, 1, 1])
h_rep = K.concatenate([h_enc_rep, h_dec_rep], -1)

alignment_probs_ = []
# NOTE(review): nothing is appended to alignment_probs_ in the visible part
# of this loop, so alignment_probs_[i - 1] depends on code not shown here -
# the fragment looks truncated; confirm against the full file.
for i in range(T_y):
    if i == 0:
        # First step: take slice 0 of the last axis as is.
        align_prev_curr = tf.gather(alignment_probs, i, axis=-1)
    if i > 0:
        # Later steps: outer product of slice i with the previous entry,
        # multiplied element-wise by the upper-triangular matrix.
        align_prev_curr = tf.einsum('nx,ny->nxy',
                                    tf.gather(alignment_probs, i, axis=-1),
                                    alignment_probs_[i - 1])
        align_prev_curr *= struc_zeros
Ejemplo n.º 9
0
 def fit_dimensionality(self, tensor, batch_size):
     """Give ``tensor`` channel and batch axes and repeat it per batch item.

     A trailing axis and a leading axis are added, then the result is tiled
     ``batch_size`` times along the leading axis (the other five axes are
     left unrepeated).
     """
     with_channels = K.expand_dims(tensor)
     with_batch = K.expand_dims(with_channels, axis=0)
     multiples = (batch_size, ) + (1, ) * 5
     return K.tile(with_batch, multiples)
Ejemplo n.º 10
0
def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, label_smoothing=0.1, print_loss=False, normalize=True):
    """Build the YOLO training loss from CIoU, confidence and class terms.

    Args:
        args: Sequence ``[*yolo_outputs, *y_true]``; the first ``num_layers``
            entries are network outputs, the remaining entries the matching
            ground-truth tensors.
        anchors: Anchors, indexed per layer through ``anchor_mask``;
            ``len(anchors) // 3`` is used as the number of output layers.
        num_classes: Number of classes, forwarded to ``yolo_head``.
        ignore_thresh: A prediction whose best IoU with the true boxes of its
            image is not below this value is left out of the no-object
            confidence term.
        label_smoothing: When truthy, handed to ``_smooth_labels`` together
            with the true class probabilities.
        print_loss: Not used in this function.
        normalize: If true the summed loss is divided by the accumulated
            positive-sample count, otherwise by the batch size.

    Returns:
        The normalised loss with one trailing axis added.
    """
    # Number of output layers: one per group of three anchors.
    num_layers = len(anchors) // 3

    # ---------------------------------------------------------------------------------------------------#
    #   Split predictions from ground truth; args is [*model_body.output, *y_true].
    #   Per the original author's notes, y_true and yolo_outputs are each a list of
    #   two feature layers shaped (m,13,13,3,85) and (m,26,26,3,85).
    # ---------------------------------------------------------------------------------------------------#
    y_true = args[num_layers:]
    yolo_outputs = args[:num_layers]

    # -----------------------------------------------------------#
    #   Per the original notes:
    #   the 13x13 layer uses anchors [81,82], [135,169], [344,319]
    #   the 26x26 layer uses anchors [23,27], [37,58], [81,82]
    # -----------------------------------------------------------#
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]]

    # Network input size: spatial size of the first output multiplied by 32
    # (416x416 in the original notes).
    input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))

    loss = 0
    num_pos = 0

    # -----------------------------------------------------------#
    #   m is the batch size (as a tensor); mf is the same value
    #   cast to the dtype of the predictions.
    # -----------------------------------------------------------#
    m = K.shape(yolo_outputs[0])[0]
    mf = K.cast(m, K.dtype(yolo_outputs[0]))

    # ---------------------------------------------------------------------------------------------------#
    #   Accumulate the loss layer by layer.
    # ---------------------------------------------------------------------------------------------------#
    for l in range(num_layers):
        # -----------------------------------------------------------#
        #   Taking the first feature layer (m,13,13,3,85) as an example:
        #   channel 4 marks the cells that contain an object. (m,13,13,3,1)
        # -----------------------------------------------------------#
        object_mask = y_true[l][..., 4:5]
        # -----------------------------------------------------------#
        #   Channels 5.. hold the class targets (m,13,13,3,80)
        # -----------------------------------------------------------#
        true_class_probs = y_true[l][..., 5:]
        if label_smoothing:
            true_class_probs = _smooth_labels(true_class_probs, label_smoothing)

        # -----------------------------------------------------------#
        #   Decode this layer's output into four values.
        #   Per the original notes:
        #   grid        (13,13,1,2) cell coordinates
        #   raw_pred    (m,13,13,3,85) unprocessed prediction
        #   pred_xy     (m,13,13,3,2) decoded centre coordinates
        #   pred_wh     (m,13,13,3,2) decoded width and height
        # -----------------------------------------------------------#
        grid, raw_pred, pred_xy, pred_wh = yolo_head(yolo_outputs[l],
                                                     anchors[anchor_mask[l]], num_classes, input_shape, calc_loss=True)

        # -----------------------------------------------------------#
        #   pred_box: decoded predicted boxes, xy followed by wh
        #   (m,13,13,3,4)
        # -----------------------------------------------------------#
        pred_box = K.concatenate([pred_xy, pred_wh])

        # -----------------------------------------------------------#
        #   Per-image mask selecting the cells that count as
        #   negatives; filled by the while-loop below.
        # -----------------------------------------------------------#
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1, dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        # -----------------------------------------------------------#
        #   Compute the mask entry for image b.
        # -----------------------------------------------------------#
        def loop_body(b, ignore_mask):
            # -----------------------------------------------------------#
            #   The n ground-truth boxes of image b: n,4
            # -----------------------------------------------------------#
            true_box = tf.boolean_mask(y_true[l][b, ..., 0:4], object_mask_bool[b, ..., 0])
            # -----------------------------------------------------------#
            #   IoU between predicted and true boxes.
            #   Per the original notes:
            #   pred_box    13,13,3,4 predicted box coordinates
            #   true_box    n,4 true box coordinates
            #   iou         13,13,3,n IoU of each prediction with each true box
            # -----------------------------------------------------------#
            iou = box_iou(pred_box[b], true_box)

            # -----------------------------------------------------------#
            #   best_iou    13,13,3 best overlap of each prediction
            #   with any true box
            # -----------------------------------------------------------#
            best_iou = K.max(iou, axis=-1)

            # -----------------------------------------------------------#
            #   Write 1 where the best IoU is below ignore_thresh (the
            #   prediction has no matching true box) and 0 otherwise.
            #   Predictions that already overlap a true box well are
            #   unsuitable as negative samples, so they get 0 and are
            #   dropped from the no-object confidence term.
            # -----------------------------------------------------------#
            ignore_mask = ignore_mask.write(b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        # -----------------------------------------------------------#
        #   Run loop_body for b = 0 .. m-1, i.e. once per image.
        # -----------------------------------------------------------#
        _, ignore_mask = tf.while_loop(lambda b, *args: b < m, loop_body, [0, ignore_mask])

        # -----------------------------------------------------------#
        #   Stack the per-image entries; the result marks the cells
        #   used as negative samples.
        #   (m,13,13,3)
        # -----------------------------------------------------------#
        ignore_mask = ignore_mask.stack()
        #   (m,13,13,3,1)
        ignore_mask = K.expand_dims(ignore_mask, -1)

        # -----------------------------------------------------------#
        #   Weight 2 - w*h of the true box: larger boxes get a
        #   smaller weight, smaller boxes a larger one.
        # -----------------------------------------------------------#
        box_loss_scale = 2 - y_true[l][..., 2:3] * y_true[l][..., 3:4]

        # -----------------------------------------------------------#
        #   CIoU loss
        # -----------------------------------------------------------#
        raw_true_box = y_true[l][..., 0:4]
        ciou = box_ciou(pred_box, raw_true_box)
        ciou_loss = object_mask * box_loss_scale * (1 - ciou)

        # ------------------------------------------------------------------------------#
        #   Confidence loss: cross-entropy against object_mask.
        #   Cells with an object always contribute (target 1).
        #   Cells without an object contribute (target 0) only where ignore_mask is 1,
        #   i.e. best_iou < ignore_thresh; the remaining no-object cells are ignored
        #   because their predictions already match a true box well and are therefore
        #   unsuitable as negative samples.
        # ------------------------------------------------------------------------------#
        confidence_loss = object_mask * K.binary_crossentropy(object_mask, raw_pred[..., 4:5], from_logits=True) + \
                          (1 - object_mask) * K.binary_crossentropy(object_mask, raw_pred[..., 4:5],
                                                                    from_logits=True) * ignore_mask

        # Class loss, only for cells that contain an object.
        class_loss = object_mask * K.binary_crossentropy(true_class_probs, raw_pred[..., 5:], from_logits=True)

        # Replace NaN entries with zero before summing each term.
        location_loss = K.sum(tf.where(tf.math.is_nan(ciou_loss), tf.zeros_like(ciou_loss), ciou_loss))
        confidence_loss = K.sum(
            tf.where(tf.math.is_nan(confidence_loss), tf.zeros_like(confidence_loss), confidence_loss))
        class_loss = K.sum(tf.where(tf.math.is_nan(class_loss), tf.zeros_like(class_loss), class_loss))
        # -----------------------------------------------------------#
        #   Count the positive samples of this layer (at least 1, so
        #   the later division cannot be by zero).
        # -----------------------------------------------------------#
        num_pos += tf.maximum(K.sum(K.cast(object_mask, tf.float32)), 1)
        loss += location_loss + confidence_loss + class_loss

    # Add a trailing axis to the accumulated loss.
    loss = K.expand_dims(loss, axis=-1)

    # Normalise by the positive-sample count or by the batch size.
    if normalize:
        loss = loss / num_pos
    else:
        loss = loss / mf
    return loss
Ejemplo n.º 11
0
def customized_yolo_loss(y_true,
                         y_pred,
                         input_shape,
                         candidate_anchors,
                         grid_shape,
                         num_classes,
                         ignore_thresh=.5):
    """Return the YOLO loss for a single output layer.

    The result is the sum of the xy, wh, confidence and class terms, each
    summed over all elements and divided by the batch size.

    Args:
        y_true: Ground truth; channels 0:4 are the box, channel 4 the object
            flag and channels 5.. the class targets.
        y_pred: Raw network output for this layer, decoded by ``yolo_head``.
        input_shape: Network input size, forwarded to ``yolo_head`` and used
            (reversed) to scale the true width/height.
        candidate_anchors: Anchors used for this layer.
        grid_shape: Shape whose entries 1:3 give the grid size.
        num_classes: Number of classes, forwarded to ``yolo_head``.
        ignore_thresh: Predictions whose best IoU with the true boxes is not
            below this value are excluded from the no-object confidence term.
    """
    from .base import yolo_head

    batch = K.shape(y_pred)[0]  # batch size, tensor
    batch_f = K.cast(batch, K.dtype(y_pred))
    grid_hw = K.cast(grid_shape[1:3], K.dtype(y_true))

    obj = y_true[..., 4:5]
    class_targets = y_true[..., 5:]

    grid, raw_pred, pred_xy, pred_wh = yolo_head(
        y_pred, candidate_anchors, num_classes, input_shape, calc_loss=True)
    pred_box = K.concatenate([pred_xy, pred_wh])

    # Targets expressed in the raw (darknet) parameterisation.
    xy_target = y_true[..., :2] * grid_hw[::-1] - grid
    wh_target = K.log(
        y_true[..., 2:4] / candidate_anchors * input_shape[::-1] + 1e-10)
    # Zero the wh target where there is no object, avoiding log(0) = -inf.
    wh_target = K.switch(obj, wh_target, K.zeros_like(wh_target))
    scale = 2 - y_true[..., 2:3] * y_true[..., 3:4]

    # Build the ignore mask one image at a time.
    mask_array = tf.TensorArray(K.dtype(y_true[0]), size=1, dynamic_size=True)
    obj_bool = K.cast(obj, 'bool')

    def keep_going(b, *_):
        return b < batch

    def per_image(b, array):
        gt_boxes = tf.boolean_mask(y_true[b, ..., 0:4], obj_bool[b, ..., 0])
        best = K.max(box_iou(pred_box[b], gt_boxes), axis=-1)
        flags = K.cast(best < ignore_thresh, K.dtype(gt_boxes))
        return b + 1, array.write(b, flags)

    _, mask_array = tf.while_loop(keep_going, per_image, [0, mask_array])
    ignore_mask = K.expand_dims(mask_array.stack(), -1)

    # K.binary_crossentropy is helpful to avoid exp overflow.
    xy_loss = obj * scale * K.binary_crossentropy(
        xy_target, raw_pred[..., 0:2], from_logits=True)
    wh_loss = obj * scale * 0.5 * K.square(wh_target - raw_pred[..., 2:4])

    positive_conf = obj * K.binary_crossentropy(
        obj, raw_pred[..., 4:5], from_logits=True)
    negative_conf = (1 - obj) * K.binary_crossentropy(
        obj, raw_pred[..., 4:5], from_logits=True) * ignore_mask
    confidence_loss = positive_conf + negative_conf

    class_loss = obj * K.binary_crossentropy(
        class_targets, raw_pred[..., 5:], from_logits=True)

    xy_term = K.sum(xy_loss) / batch_f
    wh_term = K.sum(wh_loss) / batch_f
    confidence_term = K.sum(confidence_loss) / batch_f
    class_term = K.sum(class_loss) / batch_f

    return xy_term + wh_term + confidence_term + class_term
Ejemplo n.º 12
0
 def call(self, inputs):
     """Pool a ``(means, covariances)`` pair with ``self.pool_function``.

     Both tensors get extra size-1 axes so that the pooling call can be used
     on them, and the pooled results are reshaped to the shapes reported by
     ``self.compute_output_shape``. How the covariances are handled depends
     on ``self.mode`` ("diag", "half" or "full").

     Returns:
         A two-element list ``[pooled_means, pooled_covariances]``.
     """
     input_shapes = nest.map_structure(lambda x: x.shape, inputs)
     output_shapes = self.compute_output_shape(input_shapes)
     means, covariances = inputs
     # there is no 1d pooling until tf-1.14, so we use 2d pooling instead
     if self.data_format == "channels_last":
         # Insert the dummy axes the pooling call needs; where they go
         # depends on the covariance mode.
         means = K.expand_dims(means, 1)
         if self.mode == "diag":
             covariances = K.expand_dims(covariances, 1)
         elif self.mode == "half":
             covariances = K.expand_dims(covariances, 2)
         elif self.mode == "full":
             covariances = K.expand_dims(covariances, 1)
             covariances = K.expand_dims(covariances, 4)
         pool_shape = list((1, ) + self.pool_size)
         strides = list((1, ) + self.strides)
         data_format = "NHWC"
     else:
         # Same as above, with the dummy axes shifted by one position.
         means = K.expand_dims(means, 2)
         if self.mode == "diag":
             covariances = K.expand_dims(covariances, 2)
         elif self.mode == "half":
             covariances = K.expand_dims(covariances, 3)
         elif self.mode == "full":
             covariances = K.expand_dims(covariances, 2)
             covariances = K.expand_dims(covariances, 5)
         pool_shape = list((1, ) + self.pool_size)
         strides = list((1, ) + self.strides)
         data_format = "NCHW"
     # Placeholders, overwritten below: [pooled means, pooled covariances].
     outputs = [[], []]
     # Pool the means and reshape to the expected output shape.
     outputs[0] = K.reshape(
         self.pool_function(
             means,
             ksize=pool_shape,
             strides=strides,
             padding=self.padding.upper(),
             data_format=data_format,
         ),
         [-1] + output_shapes[0].as_list()[1:],
     )
     if self.mode == "diag":
         # The covariances are divided by the product of the pooling window
         # sizes before being pooled.
         outputs[1] = K.reshape(
             self.pool_function(
                 covariances / np.prod(pool_shape),
                 ksize=pool_shape,
                 strides=strides,
                 padding=self.padding.upper(),
                 data_format=data_format,
             ),
             [-1] + output_shapes[1].as_list()[1:],
         )
     elif self.mode == "half":
         # Drop the first two axes from the target shape (they are folded
         # into the leading -1), pool once, then restore the output shape.
         cov_shape = covariances.get_shape().as_list()
         covariances = K.reshape(covariances, [-1] + cov_shape[2:])
         outputs[1] = K.reshape(
             self.pool_function(
                 covariances,
                 ksize=pool_shape,
                 strides=strides,
                 padding=self.padding.upper(),
                 data_format=data_format,
             ),
             [-1] + output_shapes[1].as_list()[1:],
         )
     elif self.mode == "full":
         cov_shape = covariances.get_shape().as_list()
         out_shape = output_shapes[1].as_list()
         # Insert the same dummy axes into the target output shape that were
         # inserted into the covariances above.
         if self.data_format == "channels_last":
             out_shape = (out_shape[:1] + [1] + out_shape[1:3] + [1] +
                          out_shape[3:])
         elif self.data_format == "channels_first":
             out_shape = (out_shape[:2] + [1] + out_shape[2:4] + [1] +
                          out_shape[4:])
         # First pooling pass: fold the leading axes away, pool over the
         # trailing three axes, then unfold.
         covariances = K.reshape(covariances, [-1] + cov_shape[4:])
         covariances = K.reshape(
             self.pool_function(
                 covariances,
                 ksize=pool_shape,
                 strides=strides,
                 padding=self.padding.upper(),
                 data_format=data_format,
             ),
             ([-1] + cov_shape[1:4] + out_shape[-3:]),
         )
         # Swap the two groups of three axes so the not-yet-pooled group
         # becomes the trailing one.
         covariances = K.permute_dimensions(
             covariances,
             ([0] + list(range(4, 7)) + list(range(1, 4))),
         )
         # Second pooling pass over the other group of axes.
         covariances = K.reshape(covariances, [-1] + cov_shape[1:4])
         covariances = K.reshape(
             self.pool_function(
                 covariances,
                 ksize=pool_shape,
                 strides=strides,
                 padding=self.padding.upper(),
                 data_format=data_format,
             ),
             ([-1] + out_shape[-3:] + out_shape[1:4]),
         )
         # Swap the axis groups back and reshape to the final output shape.
         outputs[1] = K.reshape(
             K.permute_dimensions(
                 covariances,
                 ([0] + list(range(4, 7)) + list(range(1, 4))),
             ),
             [-1] + output_shapes[1].as_list()[1:],
         )
     return outputs
def create_model(linear_feature_columns, dnn_feature_columns, fm_group=[DEFAULT_GROUP_NAME], dnn_hidden_units=(128, 128),
                 l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_dnn=0, seed=1024, dnn_dropout=0,
                 dnn_activation='relu', dnn_use_bn=False, task='binary'):
    """Assemble a Keras model that feeds three feature branches into a linear + FM + DNN head.

    The function clears the Keras session and then builds:

    1. a cross branch: frozen embeddings of the columns in the module-level
       ``cross_arr_name_list``; every column whose name does not end in ``_i``
       is combined by ``cross_net`` with the most recent ``_i`` column and the
       ``slotid_nettype`` input;
    2. a sequence branch: embeddings of the columns in ``arr_name_list``
       (optionally followed by a kernel-size-1 ``Conv1D``), concatenated,
       passed through ``trans_net`` and summarized by global max and average
       pooling;
    3. a feature-column branch: linear logit, FM logit and the combined DNN
       input built from ``linear_feature_columns`` / ``dnn_feature_columns``.

    The outputs of branches 1 and 2 are concatenated with the DNN input of
    branch 3, passed through ``DNN`` and a bias-free ``Dense(1)``, and the
    resulting logit is added to the linear and FM logits before
    ``PredictionLayer(task)``.

    Args:
        linear_feature_columns: feature columns used for the linear logit.
        dnn_feature_columns: feature columns used for embeddings / DNN input.
        fm_group: embedding group names whose embeddings go through ``FM``.
            The default list object is shared between calls; it is only read
            here, never mutated.
        dnn_hidden_units: hidden layer sizes forwarded to ``DNN``.
        l2_reg_linear: L2 regularization forwarded to ``get_linear_logit``.
        l2_reg_embedding: L2 regularization forwarded to
            ``input_from_feature_columns``.
        l2_reg_dnn: L2 regularization forwarded to ``DNN``.
        seed: random seed forwarded to the helper layers.
        dnn_dropout: dropout rate forwarded to ``DNN``.
        dnn_activation: activation forwarded to ``DNN``.
        dnn_use_bn: batch-normalization flag forwarded to ``DNN``.
        task: task name forwarded to ``PredictionLayer``.

    Returns:
        The constructed ``Model``. Its ``inputs`` are the ``slotid_nettype``
        input, the cross inputs, the sequence inputs and, as a final list
        element, the feature-column input dict.

    Note:
        If the first entry of ``cross_arr_name_list`` does not end in ``_i``
        there is no item-side embedding to cross with yet and the loop fails
        on an unbound ``cross_user_item_i``.
    """
    K.clear_session()

    # ---- dense context input ------------------------------------------
    inputs_all = [get_input_feature_layer(
        name='slotid_nettype', feature_shape=dense_feature_size)]
    # Insert a length-1 axis at position 1 before handing it to cross_net.
    layer_slotid_nettype = K.expand_dims(inputs_all[0], 1)

    # ---- cross branch ---------------------------------------------------
    seq_inputs_dict = get_cross_seq_input_layers(cols=cross_arr_name_list)
    inputs_all = inputs_all + list(seq_inputs_dict.values())  # input layers used for crossing

    cross_emb_out = []
    last_col = ''
    for index, col in enumerate(cross_arr_name_list):
        emb_layer = get_emb_layer(
            col, trainable=False, emb_matrix=dict_cross_emb_all[col])
        # +1 skips the 'slotid_nettype' input stored at position 0.
        # Assumes the dict above yields its inputs in cross_arr_name_list
        # order -- TODO confirm against get_cross_seq_input_layers.
        x = emb_layer(inputs_all[1 + index])
        if col.split('_')[-1] == 'i':
            # Remember the "_i" column; later columns are crossed with it.
            cross_user_item_i = x
            last_col = col
            continue
        print(f'crossing net add {last_col} and {col}')
        cross_emb_out.append(
            cross_net(cross_user_item_i, x, layer_slotid_nettype, hidden_unit=4))
    cross_emb_out = tf.keras.layers.concatenate(cross_emb_out)
    cross_emb_out = tf.squeeze(cross_emb_out, [1])

    # ---- sequence branch ------------------------------------------------
    seq_inputs_dict = get_seq_input_layers(cols=arr_name_list)
    inputs_all = inputs_all + list(seq_inputs_dict.values())  # all sequence input layers
    # Positions where task_id equals 0 are flagged and passed to trans_net.
    masks = tf.equal(seq_inputs_dict['task_id'], 0)
    # Plain sequences plus label sequences.
    layers2concat = []
    for col in arr_name_list:
        print(col, 'get embedding!')
        emb_layer = get_emb_layer(
            col, trainable=TRAINABLE_DICT[col], emb_matrix=id_list_dict_emb_all[col][1])
        x = emb_layer(seq_inputs_dict[col])
        if conv1d_info_dict[col] > -1:
            # A configured filter count enables a pointwise projection.
            cov_layer = tf.keras.layers.Conv1D(filters=conv1d_info_dict[col],
                                               kernel_size=1,
                                               activation='relu')
            x = cov_layer(x)
        layers2concat.append(x)
    x = tf.keras.layers.concatenate(layers2concat)

    # ---- mix 1: sequence encoder + pooling -----------------------------
    x = trans_net(x, masks, hidden_unit=256)
    max_pool = tf.keras.layers.GlobalMaxPooling1D()
    average_pool = tf.keras.layers.GlobalAveragePooling1D()
    xmaxpool = max_pool(x)
    xmeanpool = average_pool(x)

    trans_output = tf.keras.layers.concatenate([xmaxpool, xmeanpool])

    # ---- mix 2: feature-column (linear + FM + DNN) part ----------------
    features = build_input_features(
        linear_feature_columns + dnn_feature_columns)

    linear_logit = get_linear_logit(features, linear_feature_columns, seed=seed, prefix='linear',
                                    l2_reg=l2_reg_linear)

    group_embedding_dict, dense_value_list = input_from_feature_columns(features, dnn_feature_columns, l2_reg_embedding,
                                                                        seed, support_group=True)

    fm_logit = add_func([FM()(concat_func(v, axis=1))
                         for k, v in group_embedding_dict.items() if k in fm_group])

    dnn_input = combined_dnn_input(list(chain.from_iterable(
        group_embedding_dict.values())), dense_value_list)

    # Join all three branches before the shared DNN.
    mix = concatenate([cross_emb_out, trans_output,
                       dnn_input], axis=-1)

    dnn_output = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
                     dnn_use_bn, seed)(mix)

    dnn_logit = tf.keras.layers.Dense(
        1, use_bias=False, activation=None)(dnn_output)

    final_logit = add_func([linear_logit, fm_logit, dnn_logit])
    output = PredictionLayer(task)(final_logit)

    # The feature dict is passed as one (nested) element of the input list.
    model = Model(inputs=inputs_all + [features],
                  outputs=[output])
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print().
    model.summary()
    return model
def _get_best_anchor(y_true, anchors, width, height):
    """
    Rank the anchors for every ground-truth box by IOU and return the indexes
    of the best matching ones.

    Each anchor is turned into a box centered on the ground-truth box center,
    using the anchor's width/height divided by ``width``/``height``. The IOU
    between every such anchor box and the ground-truth box is computed with
    ``compute_iou``; anchors are then ranked per box with ``tf.math.top_k``.

    Args:
        y_true: rank-2 tensor of ground-truth boxes (the 3-element tile
            multiples below require exactly two axes); columns 0:2 are read as
            the box center and columns 0:4 as the full box.
        anchors: list or tensor of anchor sizes, read as ``[..., 0]`` (scaled
            by ``width``) and ``[..., 1]`` (scaled by ``height``).
        width: value the anchor x-extent is divided by (cast to float32).
        height: value the anchor y-extent is divided by (cast to float32).
    return:
        tf.Tensor (float32): per ground-truth box, 5 anchor indexes sorted by
        decreasing IOU. Column 0 is always the best anchor; in the remaining
        columns, entries whose IOU is not above 0.213 and padding entries are
        -1.

    Note:
        ``if num_k <= 0`` and ``while num_k < 5`` branch on a tensor value, so
        this presumably relies on eager execution or AutoGraph conversion --
        NOTE(review): confirm how the function is invoked.
    """
    with tf.name_scope("get_anchor"):
        width = tf.cast(width, dtype=tf.float32)
        height = tf.cast(height, dtype=tf.float32)

        anchor_xy = y_true[..., 0:2]
        # NOTE(review): true_wh is never used below.
        true_wh = y_true[..., 2:4]

        # scale the anchor sizes by the given width and height
        anchors = tf.convert_to_tensor(anchors, dtype=tf.float32)
        anchors_x = anchors[..., 0] / width
        anchors_y = anchors[..., 1] / height
        anchors = tf.stack([anchors_x, anchors_y], axis=-1)

        # build a matrix of anchor boxes: with N boxes and A anchors, both
        # tensors below end up with shape (N, 2, A)
        anchors = tf.transpose(anchors, perm=[1, 0])
        anchor_xy = tf.tile(tf.expand_dims(anchor_xy, axis=-1),
                            [1, 1, tf.shape(anchors)[-1]])
        anchors = tf.tile(tf.expand_dims(anchors, axis=0),
                          [tf.shape(anchor_xy)[0], 1, 1])

        # stack the xy so each anchor is associated once with each center from
        # the ground truth input: (N, 4, A), then reordered to (A, N, 4)
        anchors = K.concatenate([anchor_xy, anchors], axis=1)
        anchors = tf.transpose(anchors, perm=[2, 0, 1])

        # copy the gt A times so that each anchor from above can be compared
        # to the input ground truth; same (A, N, 4) layout as the anchors
        truth_comp = tf.tile(tf.expand_dims(y_true[..., 0:4], axis=-1),
                             [1, 1, tf.shape(anchors)[0]])
        truth_comp = tf.transpose(truth_comp, perm=[2, 0, 1])

        # compute intersection over union between every anchor box and every
        # ground-truth box; the rank-2 transposes below treat the result as
        # (A, N) -- presumably one IOU per anchor/box pair, verify compute_iou
        iou_raw = compute_iou(truth_comp, anchors)

        # 1 where an anchor clears the 0.213 IOU threshold, else 0
        gt_mask = tf.cast(iou_raw > 0.213, dtype=iou_raw.dtype)

        # largest number of above-threshold anchors for any single box; this
        # is the k used for top_k, with a floor of 1
        num_k = tf.reduce_max(
            tf.reduce_sum(tf.transpose(gt_mask, perm=[1, 0]), axis=1))
        if num_k <= 0:
            num_k = 1.0

        values, indexes = tf.math.top_k(tf.transpose(iou_raw, perm=[1, 0]),
                                        k=tf.cast(num_k, dtype=tf.int32),
                                        sorted=True)
        ind_mask = tf.cast(values > 0.213, dtype=indexes.dtype)
        # keep the best index unconditionally; for the other columns the
        # (index + 1) * mask - 1 trick maps below-threshold entries to -1
        iou_index = tf.concat([
            K.expand_dims(indexes[..., 0], axis=-1),
            ((indexes[..., 1:] + 1) * ind_mask[..., 1:]) - 1
        ],
                              axis=-1)

        # a single column of -1 used as padding
        stack = tf.zeros(
            [tf.shape(iou_index)[0],
             tf.cast(1, dtype=iou_index.dtype)],
            dtype=iou_index.dtype) - 1
        #tf.print(tf.shape(iou_index))
        # pad with -1 columns until there are at least 5, then keep exactly 5
        while num_k < 5:
            iou_index = tf.concat([iou_index, stack], axis=-1)
            num_k += 1
        iou_index = iou_index[..., :5]

        # zero out the IOU values of below-threshold entries (first column is
        # kept as is); NOTE(review): this tensor is not returned
        values = tf.concat([
            K.expand_dims(values[..., 0], axis=-1),
            ((values[..., 1:]) * tf.cast(ind_mask[..., 1:], dtype=tf.float32))
        ],
                           axis=-1)
        # iou_anchors = K.argmax(iou_raw, axis = 0)
        # iou_anchors = K.expand_dims(tf.cast(iou_anchors, dtype = tf.float32), axis = -1)
        # tf.print(iou_index, values)
        # flatten the list from above and attach to the end of input y_true, then return it
        #y_true = K.concatenate([y_true, K.expand_dims(iou_anchors, axis = -1)], axis = -1)
    return tf.cast(iou_index, dtype=tf.float32)
Ejemplo n.º 15
0
    def __call__(self, y_true, y_pred):
        """Compute the detection loss and two monitoring metrics for one output layer.

        ``y_pred`` is reshaped to ``[batch, width, height, self._num, -1]``
        (``width``/``height`` being axes 1 and 2 of the incoming tensor) and
        cast to float32. Along the last axis, channels 0:4 are the box, channel
        4 the objectness logit and channels 5: the class logits. ``y_true`` is
        replaced by the third value returned by ``self._get_label_attributes``
        and sliced with the same channel layout.

        The box loss depends on ``self._loss_type``: ``"giou"`` or ``"ciou"``
        use ``1 - giou`` / ``1 - ciou``; any other value uses a squared-error
        loss on xy and wh.

        Args:
            y_true: ground truth handed to ``self._get_label_attributes``.
            y_pred: raw layer output with at least three axes.

        Returns:
            Tuple ``(loss, loss_box, conf_loss, class_loss, avg_iou,
            recall50)``, where ``loss`` is the sum of the three loss terms and
            each loss term is summed over axes (1, 2, 3) and averaged over the
            batch.
        """
        # 1. generate and store constants and format output
        shape = tf.shape(y_pred)
        batch_size, width, height = shape[0], shape[1], shape[2]
        y_pred = tf.cast(
            tf.reshape(y_pred, [batch_size, width, height, self._num, -1]),
            tf.float32)
        grid_points, anchor_grid, y_true = self._get_label_attributes(
            width, height, batch_size, y_true, y_pred, y_pred.dtype)

        fwidth = tf.cast(width, y_pred.dtype)
        fheight = tf.cast(height, y_pred.dtype)

        # 2. split up layer output into components, xy, wh, confidence, class -> then apply activations to the correct items
        pred_xy, pred_wh, pred_box = self._get_predicted_box(
            fwidth, fheight, y_pred[..., 0:4], anchor_grid, grid_points)
        pred_conf = tf.expand_dims(tf.math.sigmoid(y_pred[..., 4]), axis=-1)
        pred_conf = self.rm_nan_inf(pred_conf)
        pred_class = tf.math.sigmoid(y_pred[..., 5:])
        self.print_error(pred_box)

        # 3. split up ground truth into components: box, confidence, class
        true_box = y_true[..., 0:4]
        true_conf = y_true[..., 4]
        true_class = y_true[..., 5:]

        # 5. apply generalized IOU, complete IOU or mse to the box predictions -> only the indexes where an object exists affect the total loss -> selected via true_conf from the ground truth
        # In every branch mask_iou is 1 where the IOU is below self._ignore_thresh.
        if self._loss_type == "giou":
            iou, giou = iou_ops.compute_giou(true_box, pred_box)
            mask_iou = tf.cast(iou < self._ignore_thresh, dtype=y_pred.dtype)
            loss_box = (1 - giou) * self._iou_normalizer * true_conf
            #loss_box = tf.math.minimum(loss_box, self._max_value)
        elif self._loss_type == "ciou":
            iou, ciou = iou_ops.compute_ciou(true_box, pred_box)
            mask_iou = tf.cast(iou < self._ignore_thresh, dtype=y_pred.dtype)
            loss_box = (1 - ciou) * self._iou_normalizer * true_conf
            #loss_box = tf.math.minimum(loss_box, self._max_value)
        else:
            # iou mask computation
            iou = iou_ops.compute_iou(true_box, pred_box)
            mask_iou = tf.cast(iou < self._ignore_thresh, dtype=y_pred.dtype)

            # mse loss computation :: yolo_layer.c: scale = (2-truth.w*truth.h)
            scale = (
                2 - true_box[..., 2] * true_box[..., 3]) * self._iou_normalizer
            true_xy, true_wh = self._scale_ground_truth_box(
                true_box, fwidth, fheight, anchor_grid, grid_points,
                y_pred.dtype)
            loss_xy = tf.reduce_sum(K.square(true_xy - pred_xy), axis=-1)
            loss_wh = tf.reduce_sum(K.square(true_wh - pred_wh), axis=-1)
            loss_box = (loss_wh + loss_xy) * true_conf * scale
            #loss_box = tf.math.minimum(loss_box, self._max_value)

        # 6. apply binary cross entropy (bce) to class attributes -> only the indexes where an object exists affect the total loss -> selected via true_conf from the ground truth
        # The extra trailing axis makes bce treat every class as its own
        # one-element problem before the per-class values are summed.
        class_loss = self._cls_normalizer * tf.reduce_sum(
            ks.losses.binary_crossentropy(K.expand_dims(true_class, axis=-1),
                                          K.expand_dims(pred_class, axis=-1)),
            axis=-1) * true_conf

        # 7. apply bce to confidence at all points; locations without an object only contribute where the IOU is below the ignore threshold (mask_iou)
        bce = ks.losses.binary_crossentropy(K.expand_dims(true_conf, axis=-1),
                                            pred_conf)
        conf_loss = (true_conf + (1 - true_conf) * mask_iou) * bce

        # 8. sum each loss over axes (1, 2, 3) per sample, then average the per-sample sums over the batch
        loss_box = tf.reduce_mean(
            tf.cast(tf.reduce_sum(loss_box, axis=(1, 2, 3)),
                    dtype=y_pred.dtype))
        conf_loss = tf.reduce_mean(
            tf.cast(tf.reduce_sum(conf_loss, axis=(1, 2, 3)),
                    dtype=y_pred.dtype))
        class_loss = tf.reduce_mean(
            tf.cast(tf.reduce_sum(class_loss, axis=(1, 2, 3)),
                    dtype=y_pred.dtype))

        # 9. the total loss is the unweighted sum of the three components
        loss = class_loss + conf_loss + loss_box

        # 10. store values for use in metrics
        # recall50: per sample, the share of ground-truth object locations
        # whose predicted confidence exceeds 0.5, averaged over the batch.
        recall50 = tf.reduce_mean(
            tf.math.divide_no_nan(
                tf.reduce_sum(tf.cast(tf.squeeze(pred_conf, axis=-1) > 0.5,
                                      dtype=true_conf.dtype) * true_conf,
                              axis=(1, 2, 3)),
                (tf.reduce_sum(true_conf, axis=(1, 2, 3)))))
        # avg_iou: sum of all IOU values divided by the number of positive ones.
        avg_iou = tf.math.divide_no_nan(
            tf.reduce_sum(iou),
            tf.cast(tf.math.count_nonzero(tf.cast(iou > 0,
                                                  dtype=y_pred.dtype)),
                    dtype=y_pred.dtype))
        return loss, loss_box, conf_loss, class_loss, avg_iou, recall50
Ejemplo n.º 16
0
 def call(self, x):
     """Weight the input with ``self.kernel`` and reduce over axes 2 and 3.

     A new axis is inserted at position 1 of ``x`` so it can broadcast
     against ``self.kernel``; the element-wise product is summed over axes
     2 and 3 and, when ``self.activation`` is set, passed through it.
     """
     activation = self.activation
     weighted = K.expand_dims(x, axis=1) * self.kernel
     reduced = K.sum(weighted, axis=(2, 3))
     return reduced if activation is None else activation(reduced)
Ejemplo n.º 17
0
    def __init__(self, model, upsample_size=UPSAMPLE_SIZE):
        """Build the symbolic graph that optimizes a mask and a pattern.

        Two zero-initialized variables are created: a single-channel mask on a
        ``ceil(32 / upsample_size)`` grid and a 32x32x3 pattern. Both are
        squashed with ``tanh(v) / (2 - epsilon) + 0.5``; the mask is repeated
        to 3 channels, upsampled and cropped back to 32x32, the pattern is
        scaled by 255. The pattern is blended into a ``(None, 32, 32, 3)``
        input placeholder through the mask, the blend is fed to ``model`` and
        the resulting loss is minimized with Adam over the two variables.

        Attributes set: ``mask_size``, ``mask_tanh_tensor``, ``mask_tensor``,
        ``mask_upsample_tensor``, ``pattern_tanh_tensor``,
        ``pattern_raw_tensor``, ``loss_ce``, ``hyperparameters``,
        ``loss_reg``, ``loss``, ``opt``, ``updates`` and ``train``.

        Args:
            model: callable applied to the blended input tensor; its output is
                compared with a ``(None, 43)`` target placeholder.
            upsample_size: upsampling factor between the mask grid and the
                32x32 image.
        """
        # Low-resolution mask grid size.
        self.mask_size = np.ceil(
            np.array((32, 32), dtype=float) / upsample_size).astype(int)

        # Zero initial values for both tanh-space variables.
        mask_init = np.zeros_like(
            np.expand_dims(np.zeros(self.mask_size), axis=2))
        pattern_init = np.zeros_like(np.zeros((32, 32, 3)))

        # ---- mask: variable -> squash -> 3 channels -> batch axis --------
        self.mask_tanh_tensor = K.variable(mask_init)
        squashed_mask = K.tanh(self.mask_tanh_tensor) / (2 - K.epsilon()) + 0.5
        self.mask_tensor = K.expand_dims(
            K.repeat_elements(squashed_mask, rep=3, axis=2), axis=0)

        # Upsample, then crop the surplus rows/columns back to 32x32.
        upsampled_mask = UpSampling2D(
            size=(upsample_size, upsample_size))(self.mask_tensor)
        up_rows, up_cols = K.int_shape(upsampled_mask)[1:3]
        crop = Cropping2D(cropping=((0, up_rows - 32), (0, up_cols - 32)))
        self.mask_upsample_tensor = crop(upsampled_mask)
        # self.mask_upsample_tensor = K.round(self.mask_upsample_tensor)
        inverse_mask = (K.ones_like(self.mask_upsample_tensor) -
                        self.mask_upsample_tensor)

        # ---- pattern: variable -> squash -> scale by 255 -----------------
        self.pattern_tanh_tensor = K.variable(pattern_init)
        self.pattern_raw_tensor = (
            (K.tanh(self.pattern_tanh_tensor) / (2 - K.epsilon()) + 0.5) *
            255.0)

        # ---- input image -------------------------------------------------
        # No clipping is applied: the input is used exactly as fed.
        input_tensor = K.placeholder((None, 32, 32, 3))

        # Blend: original pixels outside the mask, pattern pixels inside it.
        blended = (
            inverse_mask * input_tensor +
            self.mask_upsample_tensor * self.pattern_raw_tensor)

        output_tensor = model(blended)
        y_target_tensor = K.placeholder((None, 43))
        y_true_tensor = K.placeholder((None, 43))

        # NOTE(review): the model output is passed as the first argument and
        # the target as the second -- confirm this matches the argument order
        # of the imported categorical_crossentropy.
        self.loss_ce = categorical_crossentropy(output_tensor, y_target_tensor)

        # Column vector of weights for the six regularization terms.
        self.hyperparameters = K.reshape(K.constant(np.array([1e-2, 1e-5, 1e-7, 1e-8, 0, 1e-2])), shape=(6, 1))
        self.loss_reg = self.build_tabor_regularization(
            input_tensor, model, y_target_tensor, y_true_tensor)
        # Weighted sum of the regularization terms.
        self.loss_reg = K.dot(K.reshape(self.loss_reg, shape=(1, 6)), self.hyperparameters)
        self.loss = K.mean(self.loss_ce) + self.loss_reg

        # Only the pattern and mask variables are optimized.
        self.opt = Adam(lr=1e-3, beta_1=0.5, beta_2=0.9)
        self.updates = self.opt.get_updates(
            params=[self.pattern_tanh_tensor, self.mask_tanh_tensor],
            loss=self.loss)
        self.train = K.function(
            [input_tensor, y_true_tensor, y_target_tensor],
            [self.loss_ce, self.loss_reg, self.loss],
            updates=self.updates)
Ejemplo n.º 18
0
    def call(self, inputs, val_mode=False, dropout=False):
        # Train or validation mode ##############################################
        if val_mode:
            logging.debug("MODEL DRAKE NESTED CALL - Train mode")
        else:
            logging.debug("MODEL DRAKE NESTED CALL - Validation mode")

        # STEP 0: Process Inputs ################################################
        # Input               | Encoder input       | batch_size=None x         #
        #                     |                     | feature_map_wxh=None(64) x#
        #                     |                     | image_embedding_dim=      #
        #                     |                     | image_embedding_dim       #
        #                     | Token input         | batch_size = None x       #
        #                     |                     | token_seq_len =  x        #
        #                     |                     | token_seq_len             #
        #_____________________|_____________________|___________________________#
        input_image = inputs[0]
        input_tokens = inputs[1]
        self.batch_size = input_tokens.shape[0]
        batch_token_seq_len = input_tokens.shape[1]

        # Logging, Debug & Assert
        logging.debug("MODEL DRAKE NESTED CALL - Step 0 - Process inputs - "
                      "batch_size {}".format(self.batch_size))
        logging.debug("MODEL DRAKE NESTED CALL - Step 0 - Process inputs - "
                      "input_image shape {}".format(K.int_shape(input_image)))
        logging.debug("MODEL DRAKE NESTED CALL - Step 0 - Process inputs - "
                      "input_tokens shape {}".format(
                          K.int_shape(input_tokens)))
        if self.image_encoder == "inceptionv3":
            tf.compat.v1.debugging.assert_equal(K.int_shape(input_image),
                                                (self.batch_size, 299, 299, 3))
        else:
            tf.compat.v1.debugging.assert_equal(K.int_shape(input_image),
                                                (self.batch_size, 64, 2048))
        tf.compat.v1.debugging.assert_equal(
            K.int_shape(input_tokens), (self.batch_size, batch_token_seq_len))

        # STEP 1: Reset Decoder Hidden State ####################################
        # Zeroes              | Initial decoder     | batch_size=None x         #
        #                     | hidden state        | decoder_hidden_dim=       #
        #                     |                     | decoder_hidden_dim        #
        #                     | Initial outer       | batch_size=None x         #
        #                     | decoder hidden      | decoder_hidden_dim=       #
        #                     | state               | decoder_hidden_dim        #
        #_____________________|_____________________|___________________________#
        decoder_hidden_state = \
          keras.backend.zeros(shape=(self.batch_size, self.decoder_hidden_dim))

        # Logging, Debug & Assert
        logging.debug(
            "MODEL DRAKE NESTED CALL - Step 1 - Reset decoder hidden "
            "state - decoder_hidden_state shape {}".format(
                K.int_shape(decoder_hidden_state)))
        tf.compat.v1.debugging.assert_equal(
            K.int_shape(decoder_hidden_state),
            (self.batch_size, self.decoder_hidden_dim))

        # STEP 2: Image Encoding ################################################
        # Dense + Activations | Image encoder       | batch_size=None x         #
        #                     | output              | feature_map_wxh=None(64)  #
        #                     |                     | image_embedding_dim=      #
        #                     |                     | image_embedding_dim       #
        #_____________________|_____________________|___________________________#
        input_image_features = \
          self.model1_image_encoding([input_image],
                                      dropout=dropout)

        # Logging, Debug & Assert
        logging.debug(
            "MODEL DRAKE NESTED CALL - Step 2 - Image encoding dense"
            " and activations - input_image_features shape {}".format(
                K.int_shape(input_image_features)))
        tf.compat.v1.debugging.assert_equal(
            K.int_shape(input_image_features),
            (self.batch_size, 64, self.image_embedding_dim))

        # STEP 3: Token Embedding for all batch input sequences #################
        # Embedding           | Token embedding     | batch_size=None x         #
        #                     |                     | token_seq_len =           #
        #                     |                     | token_seq_len x           #
        #                     |                     | token_embedding_dim=      #
        #                     |                     | token_embedding_dim       #
        # ____________________|_____________________|___________________________#
        input_token_embeddings = \
          self.model2_token_embedding([input_tokens],
                                      dropout=dropout)

        # Logging, Debug & Assert
        logging.debug("MODEL DRAKE NESTED CALL - Step 3 - Token embeddings - "
                      "target_token_embeddings shape {}".format(
                          keras.backend.int_shape(input_token_embeddings)))
        tf.compat.v1.debugging.assert_equal(
            keras.backend.int_shape(input_token_embeddings),
            (self.batch_size, batch_token_seq_len, self.token_embedding_dim))

        # STEP 4: Decoder inputs is a 'GO' ######################################
        # Slice + Expand dims | GO column           | batch_size=None x         #
        #                     |                     | token_seq_len = 1 x       #
        #                     |                     | token_embedding_dim=      #
        #                     |                     | token_embedding_dim       #
        # ____________________|_____________________|___________________________#
        # For first character input is always  GO = 1 at index 0
        # Both for teaching forcing mode and validation mode
        decoder_token_input = \
          K.expand_dims(input_token_embeddings[:, 0], 1)

        # Logging, Debug, & Assert
        logging.debug("MODEL DRAKE NESTED CALL - Step 4 - Decoder inputs - "
                      "decoder_teaching_forcing_inputs shape {}".format(
                          K.int_shape(decoder_token_input)))
        tf.compat.v1.debugging.assert_equal(
            K.int_shape(decoder_token_input),
            (self.batch_size, 1, self.token_embedding_dim))

        # STEP 5: Loop through token sequence ###################################
        batch_loss = 0
        batch_mean_edit_distance = 0
        if val_mode:
            list_predictions = []
        for i in range(1, batch_token_seq_len):
            # STEP 5.1: Outer attention ###########################################
            # Summed weights    | Outer context       | batch_size=None x         #
            #                   | vector              | decoder_hidden_dim=       #
            #                   |                     | decoder_hidden_dim        #
            # __________________|_____________________|___________________________#
            outer_attention_map, outer_attention_weights = \
              self.model5_attention_map([input_image_features,
                                         decoder_hidden_state],
                                        dropout=dropout)

            # Logging, Debug & Assert
            logging.debug(
                "MODEL DRAKE NESTED CALL - Step 5.1 - Outer attention - "
                "Context vector shape {}".format(
                    K.int_shape(outer_attention_map)))
            tf.compat.v1.debugging.assert_equal(
                K.int_shape(outer_attention_map),
                (self.batch_size, 64, self.decoder_hidden_dim))

            # STEP 5.4: Inner attention ###########################################
            # Summed weights    | Context vector      | batch_size=None x         #
            #                   |                     | decoder_hidden_dim=       #
            #                   |                     | decoder_hidden_dim        #
            # __________________|_____________________|___________________________#
            context_vector, attention_weights = \
              self.model3_attention([outer_attention_map,
                                     decoder_hidden_state],
                                    dropout=dropout)

            # Logging, Debug & Assert
            logging.debug(
                "MODEL DRAKE NESTED CALL - Step 5.4 - Inner attention - "
                "Context vector shape {}".format(K.int_shape(context_vector)))
            tf.compat.v1.debugging.assert_equal(
                K.int_shape(context_vector),
                (self.batch_size, self.decoder_hidden_dim))

            # STEP 5.5: LSTM Input ################################################
            # Expand +          | LSTM input          | batch_size=None x         #
            # Concatenate       |                     | token_seq_len=1 x         #
            #                   |                     | lstm_input_dim=           #
            #                   |                     | decoder_hidden_dim +      #
            #                   |                     | token_embedding_dim       #
            # __________________|_____________________|___________________________#
            context_vector_expanded = self.layer1_expand_dims(
                context_vector, 1)

            # Logging, Debug & Assert
            logging.debug(
                "MODEL DRAKE NESTED CALL - Step 5.5 - Expand context "
                "vector - context_vector_expanded shape {}".format(
                    K.int_shape(context_vector_expanded)))
            tf.compat.v1.debugging.assert_equal(
                K.int_shape(context_vector_expanded),
                (self.batch_size, 1, self.decoder_hidden_dim))

            lstm_input = self.layer2_concatenate(
                [context_vector_expanded, decoder_token_input], axis=-1)

            # Logging, Debug & Assert
            logging.debug(
                "MODEL DRAKE NESTED CALL - Step 5.5 - Concat context"
                "vector and token embedding - lstm_input shape {}".format(
                    K.int_shape(lstm_input)))
            tf.compat.v1.debugging.assert_equal(
                K.int_shape(lstm_input),
                (self.batch_size, 1,
                 self.token_embedding_dim + self.decoder_hidden_dim))

            # STEP 5.6: LSTM ######################################################
            # LSTM return       | LSTM Output         | batch_size=None x         #
            # sequences and     |                     | token_seq_len=1 x         #
            # state             |                     | decoder_hidden_dim=       #
            #                   |                     | decoder_hidden_dim        #
            #                   |                     |                           #
            #                   | LSTM Hidden State   | batch_size=None x         #
            #                   |                     | decoder_hidden_dim=       #
            #                   |                     | decoder_hidden_dim        #
            #                   |                     |                           #
            #                   | LSTM Cell State     | batch_size=None x         #
            #                   |                     | decoder_hidden_dim=       #
            #                   |                     | decoder_hidden_dim        #
            # __________________|_____________________|___________________________#
            lstm_output, decoder_hidden_state, decoder_cell_state = \
              self.layer3_lstm(lstm_input)

            # Logging, Debug & Assert
            logging.debug("MODEL DRAKE NESTED CALL - Step 5.6 - LSTM output - "
                          "lstm_output shape {}".format(
                              K.int_shape(lstm_output)))
            logging.debug("MODEL DRAKE NESTED CALL - Step 5.6 - LSTM output - "
                          "decoder_hidden_state shape {}".format(
                              K.int_shape(decoder_hidden_state)))
            logging.debug("MODEL DRAKE NESTED CALL - Step 5.6 - LSTM output - "
                          "decoder_cell_state shape {}".format(
                              K.int_shape(decoder_cell_state)))
            tf.compat.v1.debugging.assert_equal(
                K.int_shape(lstm_output),
                (self.batch_size, 1, self.decoder_hidden_dim))
            tf.compat.v1.debugging.assert_equal(
                K.int_shape(decoder_hidden_state),
                (self.batch_size, self.decoder_hidden_dim))
            tf.compat.v1.debugging.assert_equal(
                K.int_shape(decoder_cell_state),
                (self.batch_size, self.decoder_hidden_dim))

            # STEP 5.7: MLP #######################################################
            # Dense             | Predicted token     | batch_size=None x         #
            #                   |                     | token_vocab_size x        #
            #                   |                     | token_vocab_size          #
            #___________________|_____________________|___________________________#
            mlp_input = self.layer4_concatenate(
                [context_vector_expanded, lstm_output], axis=-1)

            single_token_prediction = self.model4_mlp([mlp_input],
                                                      dropout=dropout)

            # Logging. Debug & Assert
            logging.debug("MODEL DRAKE NESTED CALL - Step 5.7 - MLP output - "
                          "single_token_prediction shape {}".format(
                              K.int_shape(single_token_prediction)))
            tf.compat.v1.debugging.assert_equal(
                K.int_shape(single_token_prediction),
                (self.batch_size, self.token_vocab_size))

            # STEP 5.8: Calculate loss ############################################
            # Loss              | Single token loss   | int                       #
            #___________________|_____________________|___________________________#
            batch_loss += masked_ce_loss_fn(
                target=input_tokens[:, i],
                prediction=single_token_prediction,
                batch_size=self.batch_size,
                token_vocab_size=self.token_vocab_size)

            # Logging, Debug & Assert
            logging.debug("MODEL DRAKE NESTED CALL - Step 5.8 - "
                          "Single prediction loss {}".format(batch_loss))

            # STEP 5.9 Update decoder input #######################################
            # Decoder input     | New decoder         | batch_size=None x         #
            #                   | hidden state        | decoder_hidden_dim=       #
            #                   |                     | decoder_hidden_dim        #
            #___________________|_____________________|___________________________#
            if val_mode:
                # In validation mode use argmax output from decoder
                argmax_prediction = tf.argmax(single_token_prediction,
                                              axis=1,
                                              output_type=tf.dtypes.int32)
                list_predictions.append(argmax_prediction)
                argmax_prediction_expanded = K.expand_dims(argmax_prediction)
                decoder_token_input = \
                  self.model2_token_embedding([argmax_prediction_expanded])
            else:
                # In training mode use teacher forcing inputs
                decoder_token_input = \
                  K.expand_dims(input_token_embeddings[:, i], 1)

            # Logging, Debug & Assert
            logging.debug(
                "MODEL DRAKE NESTED CALL - Step 5.9 - Update decoder "
                " input - decoder_token_input shape {}".format(
                    K.int_shape(decoder_token_input)))
            tf.compat.v1.debugging.assert_equal(
                K.int_shape(decoder_token_input),
                (self.batch_size, 1, self.token_embedding_dim))

        # STEP 6: Calculate levenstein distance
        if val_mode:
            stack_predictions = tf.stack(list_predictions, axis=1)
            stack_predictions_len = stack_predictions.shape[1]

            # Logging, Debug & Assert
            logging.debug(
                "MODEL DRAKE NESTED CALL - Step 6 - Stack predictions "
                "shape {}".format(K.int_shape(stack_predictions)))
            tf.compat.v1.debugging.assert_equal(
                K.int_shape(stack_predictions),
                (self.batch_size, stack_predictions_len))

            batch_mean_edit_distance = \
              edit_distance_metric(
                target=input_tokens[:, 1:stack_predictions_len +1],
                prediction=stack_predictions,
                predictions_file=self.predictions_file)

        # STEP 7: Return word sequence batch loss ###############################
        return batch_loss, batch_mean_edit_distance
Ejemplo n.º 19
0
    def yolov2_loss(self, detector_mask, matching_true_boxes, class_one_hot, true_boxes_grid, y_pred, info=False):
        '''
        Calculate YOLO V2 loss from prediction (y_pred) and ground truth tensors (detector_mask,
        matching_true_boxes, class_one_hot, true_boxes_grid).

        The loss is the sum of three terms: coordinate loss (xy + wh), class loss and
        confidence loss (object + no-object). Each term is normalised by the number of
        active cells of its mask (plus 1e-6 to avoid a division by zero).

        Parameters
        ----------
        - detector_mask : tensor, shape (batch_size, GRID_W, GRID_H, anchors_count, 1)
            1 if bounding box detected by grid cell, else 0
        - matching_true_boxes : tensor, shape (batch_size, GRID_W, GRID_H, anchors_count, 5)
            Contains adjusted coords of bounding box in YOLO format
        - class_one_hot : tensor, shape (batch_size, GRID_W, GRID_H, anchors_count, class_count)
            One hot representation of bounding box label
        - true_boxes_grid : annotations : tensor (shape : batch_size, max annot, 5)
            true_boxes_grid format : x, y, w, h, c (coords unit : grid cell)
        - y_pred : prediction from model. tensor (shape : batch_size, GRID_W, GRID_H, anchors count, (5 + labels count)
        - info : boolean. True to print the loss values and plot the masks of the first
            sample of the batch for each anchor

        Returns
        -------
        - loss : scalar
        - sub_loss : sub loss list : [conf loss, class loss, coords loss], each a scalar

        Notes
        -----
        - The batch size is read from the static shape of y_pred (y_pred.shape[0]) and the
          static shape of true_boxes_grid, so both must be known.
        - coord_x and its transpose are concatenated on the last axis, which requires
          GRID_W == GRID_H.
        '''

        # anchors tensor : one (w, h) row per anchor
        anchors = np.array(ANCHORS)
        anchors = anchors.reshape(len(anchors) // 2, 2)
        anchors_count = len(anchors)

        # grid coords tensor, repeated for every sample and every anchor.
        # The anchor repeat count follows ANCHORS instead of a hard-coded 5.
        coord_x = tf.cast(tf.reshape(tf.tile(tf.range(GRID_W), [GRID_H]), (1, GRID_H, GRID_W, 1, 1)), tf.float32)
        coord_y = tf.transpose(coord_x, (0, 2, 1, 3, 4))
        coords = tf.tile(tf.concat([coord_x, coord_y], -1), [y_pred.shape[0], 1, 1, anchors_count, 1])

        # coordinate loss
        pred_xy = K.sigmoid(y_pred[:, :, :, :, 0:2])  # adjust coords between 0 and 1
        pred_xy = (pred_xy + coords)  # add cell coord for comparison with ground truth. New coords in grid cell unit
        # adjust width and height for comparison with ground truth. New coords in grid cell unit
        pred_wh = K.exp(y_pred[:, :, :, :, 2:4]) * anchors
        nb_detector_mask = K.sum(tf.cast(detector_mask > 0.0, tf.float32))
        xy_loss = LAMBDA_COORD * K.sum(detector_mask * K.square(matching_true_boxes[..., :2] - pred_xy)) / (
                nb_detector_mask + 1e-6)
        wh_loss = LAMBDA_COORD * K.sum(detector_mask * K.square(K.sqrt(matching_true_boxes[..., 2:4]) -
                                                                K.sqrt(pred_wh))) / (nb_detector_mask + 1e-6)
        coord_loss = xy_loss + wh_loss

        # class loss : y_pred class scores are treated as logits
        pred_box_class = y_pred[..., 5:]
        true_box_class = tf.argmax(class_one_hot, -1)
        class_loss = K.sparse_categorical_crossentropy(target=true_box_class, output=pred_box_class, from_logits=True)
        class_loss = K.expand_dims(class_loss, -1) * detector_mask
        class_loss = LAMBDA_CLASS * K.sum(class_loss) / (nb_detector_mask + 1e-6)

        # confidence loss
        pred_conf = K.sigmoid(y_pred[..., 4:5])
        # for each detector : iou between prediction and ground truth
        x1 = matching_true_boxes[..., 0]
        y1 = matching_true_boxes[..., 1]
        w1 = matching_true_boxes[..., 2]
        h1 = matching_true_boxes[..., 3]
        x2 = pred_xy[..., 0]
        y2 = pred_xy[..., 1]
        w2 = pred_wh[..., 0]
        h2 = pred_wh[..., 1]
        ious = self.iou(x1, y1, w1, h1, x2, y2, w2, h2)
        ious = K.expand_dims(ious, -1)

        # for each detector : best ious between prediction and true_boxes (every bounding box of image)
        pred_xy = K.expand_dims(pred_xy, 4)  # shape : m, GRID_W, GRID_H, BOX, 1, 2
        pred_wh = K.expand_dims(pred_wh, 4)  # shape : m, GRID_W, GRID_H, BOX, 1, 2
        pred_wh_half = pred_wh / 2.
        pred_mins = pred_xy - pred_wh_half
        pred_maxes = pred_xy + pred_wh_half
        true_boxe_shape = K.int_shape(true_boxes_grid)
        # shape : m, 1, 1, 1, max_annot, 5 so that it broadcasts against the predictions
        true_boxes_grid = K.reshape(true_boxes_grid,
                                    [true_boxe_shape[0], 1, 1, 1, true_boxe_shape[1], true_boxe_shape[2]])
        true_xy = true_boxes_grid[..., 0:2]
        true_wh = true_boxes_grid[..., 2:4]
        true_wh_half = true_wh * 0.5
        true_mins = true_xy - true_wh_half
        true_maxes = true_xy + true_wh_half
        intersect_mins = K.maximum(pred_mins, true_mins)  # shape : m, GRID_W, GRID_H, BOX, max_annot, 2
        intersect_maxes = K.minimum(pred_maxes, true_maxes)  # shape : m, GRID_W, GRID_H, BOX, max_annot, 2
        intersect_wh = K.maximum(intersect_maxes - intersect_mins, 0.)  # shape : m, GRID_W, GRID_H, BOX, max_annot, 2
        intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]  # shape : m, GRID_W, GRID_H, BOX, max_annot
        pred_areas = pred_wh[..., 0] * pred_wh[..., 1]  # shape : m, GRID_W, GRID_H, BOX, 1
        true_areas = true_wh[..., 0] * true_wh[..., 1]  # shape : m, 1, 1, 1, max_annot
        union_areas = pred_areas + true_areas - intersect_areas
        iou_scores = intersect_areas / union_areas  # shape : m, GRID_W, GRID_H, BOX, max_annot
        best_ious = K.max(iou_scores, axis=4)  # Best IOU score over all annotations.
        best_ious = K.expand_dims(best_ious)  # shape : m, GRID_W, GRID_H, BOX, 1

        # no object confidence loss : only detectors whose best IOU is below 0.6
        # and that are not responsible for a ground truth box are penalised
        no_object_detection = K.cast(best_ious < 0.6, K.dtype(best_ious))
        noobj_mask = no_object_detection * (1 - detector_mask)
        nb_noobj_mask = K.sum(tf.cast(noobj_mask > 0.0, tf.float32))

        noobject_loss = LAMBDA_NOOBJECT * K.sum(noobj_mask * K.square(-pred_conf)) / (nb_noobj_mask + 1e-6)
        # object confidence loss : the confidence target is the IOU with the matching true box
        object_loss = LAMBDA_OBJECT * K.sum(detector_mask * K.square(ious - pred_conf)) / (nb_detector_mask + 1e-6)
        # total confidence loss
        conf_loss = noobject_loss + object_loss

        # total loss
        loss = conf_loss + class_loss + coord_loss
        sub_loss = [conf_loss, class_loss, coord_loss]

        if info:
            print('conf_loss   : {:.4f}'.format(conf_loss))
            print('class_loss  : {:.4f}'.format(class_loss))
            print('coord_loss  : {:.4f}'.format(coord_loss))
            print('    xy_loss : {:.4f}'.format(xy_loss))
            print('    wh_loss : {:.4f}'.format(wh_loss))
            print('--------------------')
            print('total loss  : {:.4f}'.format(loss))

            # display masks for each anchor (first sample of the batch only)
            for i in range(anchors_count):
                f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10, 5))
                f.tight_layout()
                f.suptitle('MASKS FOR ANCHOR {} :'.format(anchors[i, ...]))

                ax1.matshow((K.sum(detector_mask[0, :, :, i], axis=2)), cmap='Greys', vmin=0, vmax=1)
                ax1.set_title(
                    'detector_mask, count : {}'.format(K.sum(tf.cast(detector_mask[0, :, :, i] > 0., tf.int32))))
                ax1.xaxis.set_ticks_position('bottom')

                ax2.matshow((K.sum(no_object_detection[0, :, :, i], axis=2)), cmap='Greys', vmin=0, vmax=1)
                ax2.set_title('no_object_detection mask')
                ax2.xaxis.set_ticks_position('bottom')

                ax3.matshow((K.sum(noobj_mask[0, :, :, i], axis=2)), cmap='Greys', vmin=0, vmax=1)
                ax3.set_title('noobj_mask')
                ax3.xaxis.set_ticks_position('bottom')
                plt.show()
        return loss, sub_loss
Ejemplo n.º 20
0
 def call(self, inputs):
     """Return ``inputs`` with a new axis of size 1 inserted at position 1."""
     return K.expand_dims(inputs, 1)
Ejemplo n.º 21
0
def yolo_loss(y_true, y_pred):
    """Compute a YOLO-v1-style loss on a 7 x 7 grid with two predicted boxes per cell.

    The loss is the sum of a confidence term, a class term and a box term,
    each summed over every element of the batch (no averaging).

    Channel layout read from the last axis:
        y_true: [0] class, [1:5] box (BB1), [5] response mask (1 where the
                cell holds an object).
        y_pred: [0] class, [1:3] confidences of BB1 and BB2, [3:] the
                coordinates of BB1 and BB2 (reshaped to 2 x 4).

    Args:
        y_true: ground-truth tensor, last axis laid out as above.
        y_pred: prediction tensor, last axis laid out as above.

    Returns:
        Scalar tensor: confidence_loss + class_loss + box_loss.
    """
    label_class = y_true[..., :1]  # ? * 7 * 7 * 1, class
    label_box = y_true[..., 1:5]  # ? * 7 * 7 * 4, coordinates of BB1
    response_mask = y_true[..., 5]  # ? * 7 * 7, confidence of BB1
    response_mask = K.expand_dims(response_mask)  # ? * 7 * 7 * 1

    predict_class = y_pred[..., :1]  # ? * 7 * 7 * 1, class
    predict_trust = y_pred[..., 1:3]  # ? * 7 * 7 * 2, confidences of BB1 and BB2
    predict_box = y_pred[..., 3:]  # ? * 7 * 7 * 8, coordinates of BB1 and BB2

    _label_box = K.reshape(label_box, [-1, 7, 7, 1, 4])
    _predict_box = K.reshape(predict_box, [-1, 7, 7, 2, 4])

    # Decode the boxes once; the un-expanded results are reused for the box
    # loss below instead of reshaping and calling yolo_head a second time.
    # NOTE(review): relies on yolo_head returning the same tensors for the
    # same arguments (no hidden state) - confirm against its definition.
    label_xy, label_wh = yolo_head(_label_box, img_size=224)  # ? * 7 * 7 * 1 * 2, ? * 7 * 7 * 1 * 2
    predict_xy, predict_wh = yolo_head(_predict_box, img_size=224)  # ? * 7 * 7 * 2 * 2, ? * 7 * 7 * 2 * 2

    # Extra axes are only needed so label and prediction boxes broadcast
    # against each other inside the IoU computation.
    label_xy_min, label_xy_max = X_Y_W_H_To_Min_Max(
        K.expand_dims(label_xy, 3),  # ? * 7 * 7 * 1 * 1 * 2
        K.expand_dims(label_wh, 3))  # ? * 7 * 7 * 1 * 1 * 2
    predict_xy_min, predict_xy_max = X_Y_W_H_To_Min_Max(
        K.expand_dims(predict_xy, 4),  # ? * 7 * 7 * 2 * 1 * 2
        K.expand_dims(predict_wh, 4))  # ? * 7 * 7 * 2 * 1 * 2

    iou_scores = iou(predict_xy_min, predict_xy_max, label_xy_min, label_xy_max)  # ? * 7 * 7 * 2 * 1
    best_ious = K.max(iou_scores, axis=4)  # ? * 7 * 7 * 2
    best_box = K.max(best_ious, axis=3, keepdims=True)  # ? * 7 * 7 * 1

    # 1 for the predicted box(es) reaching the best IoU in the cell, else 0
    box_mask = K.cast(best_ious >= best_box, K.dtype(best_ious))  # ? * 7 * 7 * 2

    # Confidence: the selected box of an object cell is pushed towards 1,
    # every other box towards 0 with a 0.5 weight.
    no_object_loss = 0.5 * (1 - box_mask * response_mask) * K.square(0 - predict_trust)
    object_loss = box_mask * response_mask * K.square(1 - predict_trust)
    confidence_loss = no_object_loss + object_loss
    confidence_loss = K.sum(confidence_loss)

    # Class: squared error, counted only in cells that hold an object
    class_loss = response_mask * K.square(label_class - predict_class)
    class_loss = K.sum(class_loss)

    # Add a trailing axis so both masks broadcast over the (x, y) / (w, h) pair
    box_mask = K.expand_dims(box_mask)
    response_mask = K.expand_dims(response_mask)

    # Box: squared error on centres and on sqrt of sizes, scaled by 1 / 224
    # and weighted by 5, for the selected box of object cells only.
    box_loss = 5 * box_mask * response_mask * K.square((label_xy - predict_xy) / 224)
    box_loss += 5 * box_mask * response_mask * K.square((K.sqrt(label_wh) - K.sqrt(predict_wh)) / 224)
    box_loss = K.sum(box_loss)

    loss = confidence_loss + class_loss + box_loss

    return loss
Ejemplo n.º 22
0
    def recursion(self,
                  input_energy,
                  mask=None,
                  go_backwards=False,
                  return_sequences=True,
                  return_logZ=True,
                  input_length=None):
        r"""Forward (alpha) or backward (beta) recursion.

        If `return_logZ = True`, compute the logZ, the normalization constant:

        \[ Z = \sum_{y1, y2, y3} exp(-E)  # energy
             = \sum_{y1, y2, y3} exp(-(u1' y1 + y1' W y2 + u2' y2 + y2' W y3 + u3' y3))
             = \sum_{y2, y3} (exp(-(u2' y2 + y2' W y3 + u3' y3))
               \sum_{y1} exp(-(u1' y1 + y1' W y2))) \]

        Denote:
            \[ S(y2) := \sum_{y1} exp(-(u1' y1 + y1' W y2)), \]
            \[ Z = \sum_{y2, y3} exp(log S(y2) - (u2' y2 + y2' W y3 + u3' y3)) \]
            \[ logS(y2) = log S(y2) = log_sum_exp(-(u1' y1 + y1' W y2)) \]
        Note that:
            yi's are one-hot vectors
            u1, u3: boundary energies have been merged

        If `return_logZ = False`, compute the Viterbi's best path lookup table.

        Args:
            input_energy: energy tensor indexed as [batch, time, feature].
            mask: optional mask indexed as [batch, time]. It is extended by
                one zero column on the time axis, cast to floatx and handed
                to the step function as a constant.
            go_backwards: if True, `input_energy` (and `mask`) are reversed
                along the time axis before the recursion, and the returned
                sequence is reversed back.
            return_sequences: if True return the per-timestep values,
                otherwise only the last one.
            return_logZ: forwarded to `self.step`.
            input_length: forwarded to `K.rnn`.

        Returns:
            The sequence of target values when `return_sequences` is True,
            otherwise the last target value.
        """
        chain_energy = self.chain_kernel
        # shape=(1, F, F): F=num of output features. 1st F is for t-1, 2nd F for t
        chain_energy = K.expand_dims(chain_energy, 0)
        # shape=(B, F), dtype=float32
        prev_target_val = K.zeros_like(input_energy[:, 0, :])

        if go_backwards:
            input_energy = K.reverse(input_energy, 1)
            if mask is not None:
                mask = K.reverse(mask, 1)

        # Second state is a (B, 1) zero tensor; its meaning is defined by self.step.
        initial_states = [
            prev_target_val,
            K.zeros_like(prev_target_val[:, :1])
        ]
        constants = [chain_energy]

        if mask is not None:
            # Pad the mask with one trailing zero timestep before passing it on.
            mask2 = K.cast(
                K.concatenate([mask, K.zeros_like(mask[:, :1])], axis=1),
                K.floatx())
            constants.append(mask2)

        def _step(input_energy_i, states):
            return self.step(input_energy_i, states, return_logZ)

        target_val_last, target_val_seq, _ = K.rnn(_step,
                                                   input_energy,
                                                   initial_states,
                                                   constants=constants,
                                                   input_length=input_length,
                                                   unroll=self.unroll)

        if return_sequences:
            if go_backwards:
                # Restore the original time order.
                target_val_seq = K.reverse(target_val_seq, 1)
            return target_val_seq
        else:
            return target_val_last
Ejemplo n.º 23
0
 def call(self, inputs):
     """Build a one-hot "segment differs" tensor and return it with the embeddings.

     ``inputs`` is a ``(segment, memory)`` pair. A block of zeros shaped like
     ``memory[:, :, 0]`` is prepended to ``segment`` along axis 1; every
     ``segment`` entry is then compared with every entry of that extended
     sequence, and the not-equal flag is one-hot encoded with depth 2.

     Returns a two-element list: the one-hot tensor and ``self.embeddings + 0.0``.
     """
     segment, memory = inputs
     memory_pad = K.zeros_like(memory[:, :, 0])
     padded = K.concatenate([memory_pad, segment], axis=1)
     query_ids = K.expand_dims(segment, axis=-1)
     key_ids = K.expand_dims(padded, axis=1)
     differs = K.cast(K.not_equal(query_ids, key_ids), 'uint8')
     # NOTE(review): "+ 0.0" presumably turns the weight into a plain tensor - confirm.
     return [K.one_hot(differs, 2), self.embeddings + 0.0]
Ejemplo n.º 24
0
def atrous_spatial_pyramid_pooling(input_layer,
                                   global_image_pooling_upsampling_factor=None
                                   ):
    """Apply an atrous spatial pyramid pooling (ASPP) head to ``input_layer``.

    Four parallel branches with 256 filters each are always built: a 1x1
    convolution and three 3x3 separable convolutions with dilation rates 6,
    12 and 18 (each followed by batch normalization and ReLU). When
    ``global_image_pooling_upsampling_factor`` is given, a fifth branch is
    added: global average pooling, 1x1 convolution, batch normalization,
    ReLU, then bilinear upsampling by that factor.

    Args:
        input_layer: feature map tensor fed to every branch.
        global_image_pooling_upsampling_factor: upsampling size handed to
            ``UpSampling2D`` for the image-pooling branch, or ``None`` to
            leave that branch out.

    Returns:
        The concatenation of all branch outputs.
    """

    def _dilated_branch(rate, conv_name, bn_name):
        # 3x3 separable conv at the given dilation rate -> batch norm -> ReLU
        features = SeparableConv2D(filters=256,
                                   kernel_size=3,
                                   padding='same',
                                   dilation_rate=rate,
                                   use_bias=False,
                                   name=conv_name)(input_layer)
        features = BatchNormalization(name=bn_name, epsilon=1e-5)(features)
        return ReLU()(features)

    # branch: 1x1 conv
    branches = [
        _Conv2D(input_layer,
                filters=256,
                kernel_size=1,
                name='aspp_0_conv',
                bn_epsilon=1e-5)
    ]

    # branches: 3x3 conv at rates 6, 12 and 18 (built in that order)
    dilated_specs = (
        (6, 'aspp_1_sepconv', 'aspp_1_sepconv_bn'),
        (12, 'aspp_2_sepconv', 'aspp_2_sepconv_bn'),
        (18, 'pyramid_3x3sepconv', 'pyramid_3x3sepconv_bn'),
    )
    for rate, conv_name, bn_name in dilated_specs:
        branches.append(_dilated_branch(rate, conv_name, bn_name))

    if global_image_pooling_upsampling_factor is not None:
        # branch: global image pooling
        pooled = GlobalAveragePooling2D(name='pyramid_img_pool')(input_layer)
        # (batch size x channels)->(batch size x 1 x 1 x channels)
        pooled = Lambda(
            lambda x: K.expand_dims(K.expand_dims(x, 1), 1))(pooled)
        pooled = Conv2D(filters=256,
                        kernel_size=1,
                        padding='same',
                        use_bias=False,
                        name='pyramid_img_pool_conv')(pooled)
        pooled = BatchNormalization(name='pyramid_img_pool_conv_bn')(pooled)
        pooled = ReLU()(pooled)
        pooled = UpSampling2D(global_image_pooling_upsampling_factor,
                              interpolation='bilinear')(pooled)
        branches.append(pooled)

    return Concatenate()(branches)
Ejemplo n.º 25
0
def yolo2_loss(args,
               anchors,
               num_classes,
               label_smoothing=0,
               use_crossentropy_loss=False,
               use_crossentropy_obj_loss=False,
               rescore_confidence=False,
               use_diou_loss=False):
    """YOLOv2 loss function.

    Parameters
    ----------
    args : tuple
        ``(yolo_output, true_boxes, y_true)`` where

        yolo_output : tensor
            Final convolutional layer features.
        true_boxes : tensor
            Ground truth boxes tensor with shape [batch, num_true_boxes, 5]
            containing box x_center, y_center, width, height, and class.
        y_true : array
            output of preprocess_true_boxes, with shape
            [conv_height, conv_width, num_anchors, 6]

    anchors : tensor
        Anchor boxes for model.

    num_classes : int
        Number of object classes.

    label_smoothing : float, default=0
        When truthy, the one-hot class targets are passed through
        ``_smooth_labels`` with this value.

    use_crossentropy_loss : bool, default=False
        Use categorical cross-entropy instead of squared error for the
        classification term.

    use_crossentropy_obj_loss : bool, default=False
        Use binary cross-entropy instead of squared error for the
        confidence term.

    rescore_confidence : bool, default=False
        If true then set confidence target to IOU of best predicted box with
        the closest matching ground truth box.

    use_diou_loss : bool, default=False
        Use ``1 - DIoU`` as the location term instead of the squared error
        on the raw box predictions.

    Returns
    -------
    total_loss : tensor
        Half the sum of the three component losses, with one trailing axis
        added.
    location_loss_sum, confidence_loss_sum, classification_loss_sum : tensor
        Each component summed over all elements and divided by the batch
        size.
    """
    yolo_output, true_boxes, y_true = args
    num_anchors = len(anchors)
    output_shape = K.shape(yolo_output)
    # NOTE(review): the factor 32 is presumably the network stride - confirm.
    input_shape = output_shape[1:3] * 32
    # batch size, float tensor
    batch_size_f = K.cast(output_shape[0], K.dtype(yolo_output))

    # Relative weights of the loss components.
    object_scale = 5
    no_object_scale = 1
    class_scale = 1
    location_scale = 1

    pred_xy, pred_wh, pred_confidence, pred_class_prob = yolo2_head(
        yolo_output, anchors, num_classes, input_shape)

    object_mask = y_true[..., 4:5]

    # Insert an axis into the predictions and reshape the ground truth so
    # both follow the layout
    # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params
    pred_boxes = K.expand_dims(K.concatenate([pred_xy, pred_wh]), 4)
    gt_shape = K.shape(true_boxes)
    true_boxes = K.reshape(
        true_boxes, [gt_shape[0], 1, 1, 1, gt_shape[1], gt_shape[2]])

    iou_scores = K.squeeze(box_iou(pred_boxes, true_boxes), axis=0)

    # Best IOU over the true boxes for each location, with a trailing axis.
    best_ious = K.expand_dims(K.max(iou_scores, axis=4))

    # A detector has found an object if IOU > thresh for some true box.
    object_detections = K.cast(best_ious > 0.6, K.dtype(best_ious))

    # Confidence loss: no-object weights cover detectors that neither found
    # an object nor are assigned one.
    # NOTE: YOLOv2 does not use binary cross-entropy. Here we try it.
    no_object_weights = (no_object_scale * (1 - object_detections) *
                         (1 - object_mask))
    if use_crossentropy_obj_loss:
        no_object_error = K.binary_crossentropy(
            K.zeros(K.shape(pred_confidence)),
            pred_confidence,
            from_logits=False)
        if rescore_confidence:
            object_target = best_ious
        else:
            object_target = K.ones(K.shape(pred_confidence))
        object_error = K.binary_crossentropy(
            object_target, pred_confidence, from_logits=False)
    else:
        no_object_error = K.square(-pred_confidence)
        object_target = best_ious if rescore_confidence else 1
        object_error = K.square(object_target - pred_confidence)
    no_objects_loss = no_object_weights * no_object_error
    objects_loss = object_scale * object_mask * object_error
    confidence_loss = objects_loss + no_objects_loss

    # Classification loss for matching detections.
    # NOTE: YOLOv2 does not use categorical cross-entropy loss.
    #       Here we try it.
    matching_classes = K.one_hot(K.cast(y_true[..., 5], 'int32'), num_classes)
    if label_smoothing:
        matching_classes = _smooth_labels(matching_classes, label_smoothing)

    if use_crossentropy_loss:
        class_error = K.expand_dims(
            K.categorical_crossentropy(
                matching_classes, pred_class_prob, from_logits=False),
            axis=-1)
    else:
        class_error = K.square(matching_classes - pred_class_prob)
    classification_loss = class_scale * object_mask * class_error

    # Location loss.
    if use_diou_loss:
        # Calculate DIoU loss as location loss
        diou = K.squeeze(box_diou(pred_boxes, true_boxes), axis=-1)
        location_loss = location_scale * object_mask * (1 - diou)
    else:
        # YOLOv2 location loss for matching detection boxes.
        matching_boxes = y_true[..., 0:4]
        feats = K.reshape(yolo_output, [
            -1, output_shape[1], output_shape[2], num_anchors,
            num_classes + 5
        ])
        # Unadjusted box predictions for loss.
        # TODO: Remove extra computation shared with yolo2_head.
        raw_pred_boxes = K.concatenate(
            (K.sigmoid(feats[..., 0:2]), feats[..., 2:4]), axis=-1)
        location_loss = (location_scale * object_mask *
                         K.square(matching_boxes - raw_pred_boxes))

    confidence_loss_sum = K.sum(confidence_loss) / batch_size_f
    classification_loss_sum = K.sum(classification_loss) / batch_size_f
    location_loss_sum = K.sum(location_loss) / batch_size_f
    total_loss = 0.5 * (confidence_loss_sum + classification_loss_sum +
                        location_loss_sum)

    # Fit for tf 2.0.0 loss shape
    total_loss = K.expand_dims(total_loss, axis=-1)

    return total_loss, location_loss_sum, confidence_loss_sum, classification_loss_sum
Ejemplo n.º 26
0
    def call(self, inputs, mask=None, **kwargs):
        """Apply self-attention over the sequence axis of ``inputs``.

        Args:
            inputs: either a single tensor indexed as [batch, time, feature],
                or a two-element list ``[inputs, positions]``. When positions
                are given they are cast to int32 and used to gather rows of
                the output (and of the attention matrix) per batch entry.
            mask: optional mask. When ``inputs`` is a list, ``mask`` is
                expected to be indexable and its second entry is used; a
                ``None`` mask is accepted in both cases.
            **kwargs: accepted and ignored.

        Returns:
            The attended values ``v``; or ``[v, a]`` with the attention
            weights ``a`` when ``self.return_attention`` is set.
        """
        if isinstance(inputs, list):
            inputs, positions = inputs
            positions = K.cast(positions, 'int32')
            # No upstream mask means mask stays None; indexing None would
            # raise a TypeError, so only unpack when a mask was supplied.
            if mask is not None:
                mask = mask[1]
        else:
            positions = None

        input_len = K.shape(inputs)[1]

        if self.attention_type == SeqSelfAttention.ATTENTION_TYPE_ADD:
            e = self._call_additive_emission(inputs)
        elif self.attention_type == SeqSelfAttention.ATTENTION_TYPE_MUL:
            e = self._call_multiplicative_emission(inputs)

        if self.attention_activation is not None:
            e = self.attention_activation(e)
        # Subtract the row maximum before exponentiating for numerical stability.
        e = K.exp(e - K.max(e, axis=-1, keepdims=True))
        if self.attention_width is not None:
            # Restrict attention to a band around the diagonal.
            ones = tf.ones((input_len, input_len))
            if self.history_only:
                # Lower band only: current and previous positions.
                local = tf.linalg.band_part(
                    ones,
                    K.minimum(input_len, self.attention_width - 1),
                    0,
                )
            else:
                local = tf.linalg.band_part(
                    ones,
                    K.minimum(input_len, self.attention_width // 2),
                    K.minimum(input_len, (self.attention_width - 1) // 2),
                )
            e = e * K.expand_dims(local, 0)
        if mask is not None:
            # Zero out masked positions along both the rows and the columns.
            mask = K.cast(mask, K.floatx())
            mask = K.expand_dims(mask)
            e = K.permute_dimensions(
                K.permute_dimensions(e * mask, (0, 2, 1)) * mask, (0, 2, 1))

        # a_{t} = \text{softmax}(e_t)
        # keepdims lets the row sums broadcast over the last axis.
        a = e / (K.sum(e, axis=-1, keepdims=True) + K.epsilon())

        # l_t = \sum_{t'} a_{t, t'} x_{t'}
        v = K.batch_dot(a, inputs)
        if self.attention_regularizer_weight > 0.0:
            self.add_loss(self._attention_regularizer(a))

        if positions is not None:
            # Pair every requested position with its batch index for gather_nd.
            pos_num = K.shape(positions)[1]
            batch_indices = K.tile(
                K.expand_dims(K.arange(K.shape(inputs)[0]), axis=-1),
                K.stack([1, pos_num]))
            pos_indices = K.stack([batch_indices, positions], axis=-1)
            v = tf.gather_nd(v, pos_indices)
            a = tf.gather_nd(a, pos_indices)

        if self.return_attention:
            return [v, a]
        return v
Ejemplo n.º 27
0
def yolo_loss(args,
              input_shape,
              anchors,
              anchors_mask,
              num_classes,
              ignore_thresh=0.5,
              balance=(0.4, 1.0, 4),
              box_ratio=0.05,
              obj_ratio=1,
              cls_ratio=0.5 / 4,
              label_smoothing=0.1,
              focal_loss=False,
              focal_loss_ratio=10,
              gamma=2,
              alpha=0.25,
              print_loss=False):
    """Compute the YOLO training loss accumulated over all feature layers.

    For each feature layer three terms are added to the total:
    a CIoU box-regression term, an objectness (confidence) term and a
    per-class binary cross-entropy term.

    Args:
        args: Sequence laid out as ``[*yolo_outputs, *y_true]``. The first
            ``len(anchors_mask)`` entries are the raw network outputs, the
            remaining ones the ground-truth tensors. The last axis of every
            ``y_true`` tensor is sliced as ``0:4`` box, ``4:5`` objectness,
            ``5:`` class targets.
        input_shape: Network input size; cast to the dtype of ``y_true[0]``
            and forwarded to ``get_anchors_and_decode``.
        anchors: Anchor table indexed with ``anchors[anchors_mask[l]]``
            (presumably a NumPy array so that list indexing works - confirm
            against the caller).
        anchors_mask: One entry per feature layer selecting that layer's
            anchors; its length defines the number of layers.
        num_classes: Number of classes; forwarded to the decoder and used to
            normalise the class loss.
        ignore_thresh: Predictions whose best IoU with any ground-truth box
            is at or above this value are excluded from the negative samples.
        balance: Per-layer weight of the confidence loss, indexed by layer.
        box_ratio: Weight of the CIoU location loss.
        obj_ratio: Weight of the confidence loss.
        cls_ratio: Weight of the class loss.
        label_smoothing: If truthy, class targets are passed through
            ``_smooth_labels`` with this value.
        focal_loss: If true, use the focal-weighted confidence loss.
        focal_loss_ratio: Multiplier applied to the focal confidence loss.
        gamma: Focal-loss exponent.
        alpha: Focal-loss weight for positives (``1 - alpha`` for negatives).
        print_loss: If true, wrap the running loss in ``tf.Print`` after
            every layer.

    Returns:
        The accumulated loss (a scalar tensor; the Python int ``0`` when
        ``anchors_mask`` is empty).
    """
    num_layers = len(anchors_mask)
    #---------------------------------------------------------------------------------------------------#
    #   Split predictions from ground truth; args is [*model_body.output, *y_true].
    #   Per the original notes, both lists hold one tensor per feature layer,
    #   e.g. for three layers:
    #   (m,13,13,3,85)
    #   (m,26,26,3,85)
    #   (m,52,52,3,85)
    #---------------------------------------------------------------------------------------------------#
    y_true = args[num_layers:]
    yolo_outputs = args[:num_layers]

    #-----------------------------------------------------------#
    #   Bring input_shape (e.g. 416,416) to the dtype of y_true.
    #-----------------------------------------------------------#
    input_shape = K.cast(input_shape, K.dtype(y_true[0]))

    #-----------------------------------------------------------#
    #   m is the (dynamic) batch size.
    #-----------------------------------------------------------#
    m = K.shape(yolo_outputs[0])[0]

    loss = 0
    for l in range(num_layers):
        #-----------------------------------------------------------#
        #   Taking the first layer (m,13,13,3,85) as the example:
        #   objectness flag of every cell/anchor, (m,13,13,3,1).
        #-----------------------------------------------------------#
        object_mask = y_true[l][..., 4:5]
        #-----------------------------------------------------------#
        #   Class targets, (m,13,13,3,80) in the example.
        #-----------------------------------------------------------#
        true_class_probs = y_true[l][..., 5:]
        if label_smoothing:
            true_class_probs = _smooth_labels(true_class_probs,
                                              label_smoothing)

        #-----------------------------------------------------------#
        #   Decode this layer's output. Per the original notes:
        #   grid        (13,13,1,2)     grid coordinates
        #   raw_pred    (m,13,13,3,85)  unprocessed predictions
        #   pred_xy     (m,13,13,3,2)   decoded box centres
        #   pred_wh     (m,13,13,3,2)   decoded box width/height
        #-----------------------------------------------------------#
        grid, raw_pred, pred_xy, pred_wh = get_anchors_and_decode(
            yolo_outputs[l],
            anchors[anchors_mask[l]],
            num_classes,
            input_shape,
            calc_loss=True)

        #-----------------------------------------------------------#
        #   Decoded predicted boxes, (m,13,13,3,4).
        #-----------------------------------------------------------#
        pred_box = K.concatenate([pred_xy, pred_wh])

        #-----------------------------------------------------------#
        #   Growable array that collects one ignore mask per image.
        #-----------------------------------------------------------#
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]),
                                     size=1,
                                     dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        #-----------------------------------------------------------#
        #   Compute the ignore mask of image b.
        #-----------------------------------------------------------#
        def loop_body(b, ignore_mask):
            #   Ground-truth boxes of this image: (n,4).
            true_box = tf.boolean_mask(y_true[l][b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            #-----------------------------------------------------------#
            #   IoU between every prediction and every ground-truth box:
            #   pred_box    13,13,3,4
            #   true_box    n,4
            #   iou         13,13,3,n
            #-----------------------------------------------------------#
            iou = box_iou(pred_box[b], true_box)

            #   Best overlap of each prediction with any ground truth: 13,13,3.
            best_iou = K.max(iou, axis=-1)

            #-----------------------------------------------------------#
            #   Mask is 1 where best_iou < ignore_thresh, i.e. the prediction
            #   matches no ground-truth box and may serve as a negative.
            #   Predictions that already overlap a ground-truth box well get 0
            #   and are left out of the negatives.
            #-----------------------------------------------------------#
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        #-----------------------------------------------------------#
        #   Iterate over the images of the batch.
        #-----------------------------------------------------------#
        _, ignore_mask = tf.while_loop(lambda b, *_: b < m, loop_body,
                                       [0, ignore_mask])

        #-----------------------------------------------------------#
        #   Stack to (m,13,13,3), then add a trailing axis: (m,13,13,3,1).
        #-----------------------------------------------------------#
        ignore_mask = ignore_mask.stack()
        ignore_mask = K.expand_dims(ignore_mask, -1)

        # NOTE: the former size-dependent box_loss_scale weighting was
        # computed here but never used (the CIoU loss does not need it),
        # so it has been removed.

        #-----------------------------------------------------------#
        #   CIoU location loss on the positive cells.
        #-----------------------------------------------------------#
        raw_true_box = y_true[l][..., 0:4]
        ciou = box_ciou(pred_box, raw_true_box)
        ciou_loss = object_mask * (1 - ciou)
        location_loss = K.sum(ciou_loss)

        #------------------------------------------------------------------------------#
        #   Confidence loss: cross-entropy of the objectness logit against 1 where
        #   an object exists and against 0 elsewhere. Negatives are multiplied by
        #   ignore_mask, so predictions with best_iou >= ignore_thresh contribute
        #   nothing to the negative part.
        #------------------------------------------------------------------------------#
        conf_logits = raw_pred[..., 4:5]
        conf_bce = K.binary_crossentropy(object_mask,
                                         conf_logits,
                                         from_logits=True)
        if focal_loss:
            conf_prob = tf.sigmoid(conf_logits)
            positive_term = (object_mask *
                             (tf.ones_like(conf_logits) - conf_prob) ** gamma *
                             alpha * conf_bce)
            negative_term = ((1 - object_mask) * ignore_mask *
                             conf_prob ** gamma * (1 - alpha) * conf_bce)
            confidence_loss = (positive_term + negative_term) * focal_loss_ratio
        else:
            confidence_loss = (object_mask * conf_bce +
                               (1 - object_mask) * conf_bce * ignore_mask)

        class_loss = object_mask * K.binary_crossentropy(
            true_class_probs, raw_pred[..., 5:], from_logits=True)

        #-----------------------------------------------------------#
        #   Number of positive / retained negative samples (at least 1
        #   each, to avoid division by zero).
        #-----------------------------------------------------------#
        num_pos = tf.maximum(K.sum(K.cast(object_mask, tf.float32)), 1)
        num_neg = tf.maximum(
            K.sum(K.cast((1 - object_mask) * ignore_mask, tf.float32)), 1)

        #-----------------------------------------------------------#
        #   Weight, normalise and accumulate the three terms.
        #-----------------------------------------------------------#
        location_loss = location_loss * box_ratio / num_pos
        confidence_loss = K.sum(confidence_loss) * balance[l] * obj_ratio / (
            num_pos + num_neg)
        class_loss = K.sum(class_loss) * cls_ratio / num_pos / num_classes

        loss += location_loss + confidence_loss + class_loss
        if print_loss:
            loss = tf.Print(loss, [
                loss, location_loss, confidence_loss, class_loss,
                tf.shape(ignore_mask)
            ],
                            summarize=100,
                            message='loss: ')
    return loss