def call(self, inputs):
    """Fuse a projection of ``X`` with a projection of every slice of ``Z``.

    Args:
        inputs: a pair ``(X, Z)``. ``Z`` is sliced along axis 1
            (``Z[:, i]``); assumes X is (batch, features) and Z is
            (batch, z_size, features) -- TODO confirm against the caller.

    Returns:
        Tensor produced by three further Dense(16) layers applied to the
        stacked, ReLU-activated sums; per the in-code note the stacked
        tensor is (batch_size, z_size, 16).
    """
    X, Z = inputs

    # NOTE(review): the Dense layers in this method are instantiated on
    # every invocation; if this is a Layer.call that runs more than once,
    # consider creating them in __init__/build so weights are reused.
    x_proj = kl.Dense(16, activation='linear')(X)
    z_dense = kl.Dense(16, activation='linear')

    # Z is indexed along axis 1, so the loop bound must be Z.shape[1].
    # Iterating over Z.shape[0] walked the batch axis instead (and fails
    # outright when the batch size is unknown).
    combined = [
        K.expand_dims(x_proj + z_dense(Z[:, i]), axis=1)
        for i in range(Z.shape[1])
    ]
    # combined is now shape (batch_size, z_size, 16)
    combined = kl.concatenate(combined, axis=1)

    hidden = ka.relu(combined)
    hidden = kl.Dense(16, activation='relu')(hidden)
    hidden = kl.Dense(16, activation='relu')(hidden)
    return kl.Dense(16, activation='linear')(hidden)
def mask(self, inputs, masks):
    """Add ``masks`` scaled by ``self._masking_num`` onto ``inputs``.

    ``masks`` is cast to float32, tiled along its first axis so that its
    leading dimension matches that of ``inputs`` (integer ratio of the two
    leading sizes), given a new axis at position 1, and then added.
    """
    float_masks = K.cast(masks, 'float32')
    repeats = K.shape(inputs)[0] // K.shape(float_masks)[0]
    tiled = K.tile(float_masks, [repeats, 1])
    return inputs + K.expand_dims(tiled, 1) * self._masking_num
def box_ciou(b1, b2):
    """
    Compute the Complete-IoU (CIoU) between two sets of boxes.

    Parameters
    ----------
    b1: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh
    b2: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh

    Returns
    -------
    ciou: tensor, shape=(batch, feat_w, feat_h, anchor_num, 1)
    """
    def _decompose(box):
        # Split an xywh box into centre, size and its two opposite corners.
        center = box[..., :2]
        size = box[..., 2:4]
        half = size / 2.
        return center, size, center - half, center + half

    b1_xy, b1_wh, b1_mins, b1_maxes = _decompose(b1)
    b2_xy, b2_wh, b2_mins, b2_maxes = _decompose(b2)

    # Plain IoU: overlap area over union area (denominator clamped).
    overlap_mins = K.maximum(b1_mins, b2_mins)
    overlap_maxes = K.minimum(b1_maxes, b2_maxes)
    overlap_wh = K.maximum(overlap_maxes - overlap_mins, 0.)
    overlap_area = overlap_wh[..., 0] * overlap_wh[..., 1]
    area_1 = b1_wh[..., 0] * b1_wh[..., 1]
    area_2 = b2_wh[..., 0] * b2_wh[..., 1]
    union_area = area_1 + area_2 - overlap_area
    iou = overlap_area / K.maximum(union_area, K.epsilon())

    # Squared distance between the two box centres.
    center_distance = K.sum(K.square(b1_xy - b2_xy), axis=-1)

    # Squared diagonal of the smallest box enclosing both inputs.
    hull_mins = K.minimum(b1_mins, b2_mins)
    hull_maxes = K.maximum(b1_maxes, b2_maxes)
    hull_wh = K.maximum(hull_maxes - hull_mins, 0.0)
    enclose_diagonal = K.sum(K.square(hull_wh), axis=-1)

    # Aspect-ratio consistency term and its weight.
    angle_1 = tf.math.atan2(b1_wh[..., 0], K.maximum(b1_wh[..., 1], K.epsilon()))
    angle_2 = tf.math.atan2(b2_wh[..., 0], K.maximum(b2_wh[..., 1], K.epsilon()))
    v = 4 * K.square(angle_1 - angle_2) / (math.pi * math.pi)
    alpha = v / K.maximum((1.0 - iou + v), K.epsilon())

    distance_penalty = 1.0 * (center_distance) / K.maximum(enclose_diagonal, K.epsilon())
    ciou = iou - distance_penalty
    ciou = ciou - alpha * v
    return K.expand_dims(ciou, -1)
def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, label_smoothing=0.1, print_loss=False):
    """Build the CIoU-based YOLO training loss.

    Parameters
    ----------
    args: list of tensors laid out as ``[*yolo_outputs, *y_true]``; the
        first ``len(anchors) // 3`` entries are network outputs, the rest
        are the matching ground-truth tensors.
    anchors: array of anchor sizes, selected per layer via ``anchor_mask``.
    num_classes: integer, forwarded to ``yolo_head``.
    ignore_thresh: float; a prediction only counts as a negative sample in
        the confidence loss when its best IoU with every true box of the
        image is below this value.
    label_smoothing: float; when truthy, class targets are passed through
        ``_smooth_labels``.
    print_loss: unused, kept for interface compatibility.

    Returns
    -------
    loss: tensor, the accumulated loss with one trailing axis added.
    """
    num_layers = len(anchors) // 3
    # args is [*model_body.output, *y_true]: split predictions from truth.
    yolo_outputs = args[:num_layers]
    y_true = args[num_layers:]

    if num_layers == 3:
        anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    else:
        anchor_mask = [[3, 4, 5], [1, 2, 3]]

    first_output = yolo_outputs[0]
    truth_dtype = K.dtype(y_true[0])
    # Network input size recovered as 32x the first feature map's size.
    input_shape = K.cast(K.shape(first_output)[1:3] * 32, truth_dtype)
    # m is the batch size (as a tensor); mf is the same value as a float.
    m = K.shape(first_output)[0]
    mf = K.cast(m, K.dtype(first_output))

    loss = 0
    for l in range(num_layers):
        layer_truth = y_true[l]
        # Channel 4 flags cells holding an object; channels 5+ are classes.
        object_mask = layer_truth[..., 4:5]
        true_class_probs = layer_truth[..., 5:]
        if label_smoothing:
            true_class_probs = _smooth_labels(true_class_probs, label_smoothing)

        # Decode the raw feature map into grid, raw predictions and boxes.
        grid, raw_pred, pred_xy, pred_wh = yolo_head(
            yolo_outputs[l], anchors[anchor_mask[l]], num_classes,
            input_shape, calc_loss=True)
        pred_box = K.concatenate([pred_xy, pred_wh])

        mask_array = tf.TensorArray(truth_dtype, size=1, dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def per_image_step(b, mask_array):
            # True boxes of image b, selected by the object mask.
            true_box = tf.boolean_mask(layer_truth[b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            # Best IoU of every predicted box against any true box.
            best_iou = K.max(box_iou(pred_box[b], true_box), axis=-1)
            # 1 where the prediction overlaps no true box well enough.
            below_thresh = K.cast(best_iou < ignore_thresh, K.dtype(true_box))
            return b + 1, mask_array.write(b, below_thresh)

        # Walk over every image of the batch.
        _, mask_array = tf.while_loop(lambda b, *_: b < m, per_image_step,
                                      [0, mask_array])
        ignore_mask = K.expand_dims(mask_array.stack(), -1)

        box_loss_scale = 2 - layer_truth[..., 2:3] * layer_truth[..., 3:4]

        # CIoU serves as the localisation loss.
        ciou = box_ciou(pred_box, layer_truth[..., 0:4])
        location_loss = K.sum(object_mask * box_loss_scale * (1 - ciou)) / mf

        # Object cells are scored against 1; empty cells against 0, but
        # only where ignore_mask marks them as genuine negatives.
        objectness_bce = K.binary_crossentropy(object_mask, raw_pred[..., 4:5],
                                               from_logits=True)
        confidence_loss = (object_mask * objectness_bce
                           + (1 - object_mask) * objectness_bce * ignore_mask)
        class_loss = object_mask * K.binary_crossentropy(
            true_class_probs, raw_pred[..., 5:], from_logits=True)

        confidence_loss = K.sum(confidence_loss) / mf
        class_loss = K.sum(class_loss) / mf
        loss += location_loss + confidence_loss + class_loss

    return K.expand_dims(loss, axis=-1)
def find_path(argmin_table, best_idx):
    """Pick, per row of ``argmin_table``, the entry indexed by the current state.

    The row indices come from the first column of ``best_idx[0]``. The
    gathered values get a trailing axis and are returned twice: once as
    the step output and once as the single-element new state list.
    """
    # presumably a step function for a backend RNN loop -- verify against caller
    current = best_idx[0][:, 0]
    selected = K.expand_dims(gather_each_row(argmin_table, current))
    return selected, [selected]
def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, print_loss=False):
    '''Return yolo_loss tensor

    Parameters
    ----------
    args: list of tensor, ``[*yolo_outputs, *y_true]`` where
        yolo_outputs is the output of yolo_body or tiny_yolo_body and
        y_true is the output of preprocess_true_boxes
    anchors: array, shape=(N, 2), wh
    num_classes: integer
    ignore_thresh: float, the iou threshold whether to ignore object confidence loss
    print_loss: bool, print the running loss and its components per layer

    Returns
    -------
    loss: scalar tensor (sum of xy, wh, confidence and class losses over
        all layers, each divided by the batch size)

    '''
    num_layers = len(anchors) // 3  # default setting
    yolo_outputs = args[:num_layers]
    y_true = args[num_layers:]
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]
                   ] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]]
    input_shape = K.cast(
        K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))
    grid_shapes = [
        K.cast(K.shape(yolo_outputs[l])[1:3], K.dtype(y_true[0]))
        for l in range(num_layers)
    ]
    loss = 0
    m = K.shape(yolo_outputs[0])[0]  # batch size, tensor
    mf = K.cast(m, K.dtype(yolo_outputs[0]))

    for l in range(num_layers):
        object_mask = y_true[l][..., 4:5]
        true_class_probs = y_true[l][..., 5:]

        grid, raw_pred, pred_xy, pred_wh = yolo_head(
            yolo_outputs[l], anchors[anchor_mask[l]], num_classes,
            input_shape, calc_loss=True)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Darknet raw box to calculate loss.
        raw_true_xy = y_true[l][..., :2] * grid_shapes[l][::-1] - grid
        raw_true_wh = K.log(y_true[l][..., 2:4] / anchors[anchor_mask[l]]
                            * input_shape[::-1])
        raw_true_wh = K.switch(object_mask, raw_true_wh,
                               K.zeros_like(raw_true_wh))  # avoid log(0)=-inf
        box_loss_scale = 2 - y_true[l][..., 2:3] * y_true[l][..., 3:4]

        # Find ignore mask, iterate over each of batch.
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1,
                                     dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, ignore_mask):
            true_box = tf.boolean_mask(y_true[l][b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            iou = box_iou(pred_box[b], true_box)
            best_iou = K.max(iou, axis=-1)
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        _, ignore_mask = tf.while_loop(lambda b, *args: b < m, loop_body,
                                       [0, ignore_mask])
        ignore_mask = ignore_mask.stack()
        ignore_mask = K.expand_dims(ignore_mask, -1)

        # K.binary_crossentropy is helpful to avoid exp overflow.
        xy_loss = object_mask * box_loss_scale * K.binary_crossentropy(
            raw_true_xy, raw_pred[..., 0:2], from_logits=True)
        wh_loss = object_mask * box_loss_scale * 0.5 * K.square(
            raw_true_wh - raw_pred[..., 2:4])
        confidence_loss = object_mask * K.binary_crossentropy(object_mask, raw_pred[..., 4:5], from_logits=True) + \
            (1 - object_mask) * K.binary_crossentropy(object_mask, raw_pred[..., 4:5], from_logits=True) * ignore_mask
        class_loss = object_mask * K.binary_crossentropy(
            true_class_probs, raw_pred[..., 5:], from_logits=True)

        xy_loss = K.sum(xy_loss) / mf
        wh_loss = K.sum(wh_loss) / mf
        confidence_loss = K.sum(confidence_loss) / mf
        class_loss = K.sum(class_loss) / mf
        loss += xy_loss + wh_loss + confidence_loss + class_loss

        if print_loss:
            # tf.print (unlike the legacy tf.Print) takes no ``message``
            # keyword and does not return its input, so the result must
            # not be assigned to ``loss``. In graph mode it returns an op
            # that is attached through a control dependency; in eager
            # mode it prints immediately and returns None.
            print_op = tf.print(
                'loss: ', loss, xy_loss, wh_loss, confidence_loss,
                class_loss, K.sum(ignore_mask))
            if print_op is not None:
                with tf.control_dependencies([print_op]):
                    loss = tf.identity(loss)
    return loss
def call(self, inputs, **kwargs):
    """Return the Euclidean norm of ``inputs`` along its last axis.

    A rank-5 input must have size 1 on its second-to-last axis; that axis
    is squeezed away first. The norm keeps a trailing axis of size 1.

    Raises:
        AssertionError: if a rank-5 input has a second-to-last axis whose
            size is not 1.
    """
    static_shape = inputs.get_shape()
    if static_shape.ndims == 5:
        # as_list() yields plain ints (or None) whether the entries are
        # TF1 Dimension objects or TF2 ints; ``.value`` exists only on
        # the former and raises AttributeError on TF2.
        assert static_shape.as_list()[-2] == 1, \
            'Error: Must have num_capsules = 1 going into Length'
        inputs = K.squeeze(inputs, axis=-2)
    return K.expand_dims(tf.norm(inputs, axis=-1), axis=-1)
# Stand-alone models exposing the z_sem and z_etym tensors from their inputs.
embed_sem = Model(inputs=sem_in_, outputs=z_sem)
embed_etym = Model(inputs=[enc_in_, sem_in_], outputs=z_etym)

# One latent_dim-sized input placeholder per embedding type.
embed_lang_in_ = Input((latent_dim, ))
embed_POS_in_ = Input((latent_dim, ))
embed_sem_in_ = Input((latent_dim, ))
embed_etym_in_ = Input((latent_dim, ))

# Encoder: dense projection followed by a bidirectional LSTM whose two
# directions are concatenated; outputs are multiplied by enc_mask.
embedding = Dense(embed_dim)(enc_in_)
h_enc = Bidirectional(LSTM(hidden_dim, return_sequences=True),
                      'concat')(embedding) * enc_mask
# Decoder: LSTM without activation; outputs are multiplied by dec_mask.
h_dec = LSTM(hidden_dim, return_sequences=True,
             activation=None)(dec_in_) * dec_mask
#alignment_probs_,emission_probs = monotonic_alignment([h_enc,h_dec,T_x,T_y,Y,hidden_dim])

# Upper-triangular (T_x, T_x) matrix of ones with a leading axis of size 1.
struc_zeros = K.expand_dims(
    K.cast(np.triu(np.ones([T_x, T_x])), dtype='float32'), 0)
# Dot product of projected encoder states with decoder states, softmaxed
# over axis -2.
alignment_probs = K.softmax(
    dot([Dense(hidden_dim)(h_enc), h_dec], axes=-1, normalize=False), -2)

# Tile encoder and decoder states against each other and concatenate
# them along the last axis.
# assumes h_enc is (batch, T_x, .) and h_dec is (batch, T_y, .) -- TODO confirm
h_enc_rep = K.tile(K.expand_dims(h_enc, -2), [1, 1, T_y, 1])
h_dec_rep = K.tile(K.expand_dims(h_dec, -3), [1, T_x, 1, 1])
h_rep = K.concatenate([h_enc_rep, h_dec_rep], -1)

alignment_probs_ = []
# NOTE(review): this loop looks truncated in the visible source: nothing
# is appended to alignment_probs_ although alignment_probs_[i - 1] is read.
for i in range(T_y):
    if i == 0:
        align_prev_curr = tf.gather(alignment_probs, i, axis=-1)
    if i > 0:
        # Outer product of the current column with the previous step.
        align_prev_curr = tf.einsum('nx,ny->nxy',
                                    tf.gather(alignment_probs, i, axis=-1),
                                    alignment_probs_[i - 1])
        # NOTE(review): nesting under ``i > 0`` is inferred from the
        # operand ranks -- confirm against the original layout.
        align_prev_curr *= struc_zeros
def fit_dimensionality(self, tensor, batch_size):
    """Add a trailing and a leading axis to ``tensor`` and repeat it per batch.

    The tile multiples are ``(batch_size, 1, 1, 1, 1, 1)``, so the expanded
    tensor must have rank 6 (i.e. ``tensor`` itself has rank 4).
    """
    with_channels = K.expand_dims(tensor)  # channels
    with_batch = K.expand_dims(with_channels, axis=0)  # batches
    multiples = (batch_size, ) + (1, ) * 5
    return K.tile(with_batch, multiples)  # repeat over batches
def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, label_smoothing=0.1, print_loss=False, normalize=True):
    """Build the CIoU-based YOLO loss with NaN filtering and normalisation.

    Parameters
    ----------
    args: list of tensors laid out as ``[*yolo_outputs, *y_true]``; the
        first ``len(anchors) // 3`` entries are network outputs, the rest
        are the matching ground-truth tensors.
    anchors: array of anchor sizes, selected per layer via ``anchor_mask``.
    num_classes: integer, forwarded to ``yolo_head``.
    ignore_thresh: float; a prediction only counts as a negative sample in
        the confidence loss when its best IoU with every true box of the
        image is below this value.
    label_smoothing: float; when truthy, class targets are passed through
        ``_smooth_labels``.
    print_loss: unused, kept for interface compatibility.
    normalize: when true the total is divided by the number of positive
        samples (at least 1 per layer), otherwise by the batch size.

    Returns
    -------
    loss: tensor, the normalised loss with one trailing axis.
    """
    def _sum_ignoring_nan(values):
        # Replace NaN entries by zero before reducing to a scalar.
        return K.sum(tf.where(tf.math.is_nan(values),
                              tf.zeros_like(values), values))

    num_layers = len(anchors) // 3
    # args is [*model_body.output, *y_true]: split predictions from truth.
    yolo_outputs = args[:num_layers]
    y_true = args[num_layers:]

    if num_layers == 3:
        anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    else:
        anchor_mask = [[3, 4, 5], [1, 2, 3]]

    first_output = yolo_outputs[0]
    truth_dtype = K.dtype(y_true[0])
    # Network input size recovered as 32x the first feature map's size.
    input_shape = K.cast(K.shape(first_output)[1:3] * 32, truth_dtype)
    # m is the batch size (as a tensor); mf is the same value as a float.
    m = K.shape(first_output)[0]
    mf = K.cast(m, K.dtype(first_output))

    loss = 0
    num_pos = 0
    for l in range(num_layers):
        layer_truth = y_true[l]
        # Channel 4 flags cells holding an object; channels 5+ are classes.
        object_mask = layer_truth[..., 4:5]
        true_class_probs = layer_truth[..., 5:]
        if label_smoothing:
            true_class_probs = _smooth_labels(true_class_probs, label_smoothing)

        # yolo_head returns the grid, the raw predictions and the decoded
        # box centres and sizes for this feature layer.
        grid, raw_pred, pred_xy, pred_wh = yolo_head(
            yolo_outputs[l], anchors[anchor_mask[l]], num_classes,
            input_shape, calc_loss=True)
        pred_box = K.concatenate([pred_xy, pred_wh])

        mask_array = tf.TensorArray(truth_dtype, size=1, dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def per_image_step(b, mask_array):
            # True boxes of image b, selected by the object mask.
            true_box = tf.boolean_mask(layer_truth[b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            # Best IoU of every predicted box against any true box.
            best_iou = K.max(box_iou(pred_box[b], true_box), axis=-1)
            # 1 where the best IoU is below the threshold, i.e. the
            # prediction matches no true box and is a usable negative.
            # Predictions that already match a true box well get 0 and
            # are therefore left out of the negative-sample term.
            below_thresh = K.cast(best_iou < ignore_thresh, K.dtype(true_box))
            return b + 1, mask_array.write(b, below_thresh)

        # Walk over every image of the batch.
        _, mask_array = tf.while_loop(lambda b, *_: b < m, per_image_step,
                                      [0, mask_array])
        ignore_mask = K.expand_dims(mask_array.stack(), -1)

        # Larger true boxes receive a smaller weight, smaller boxes a larger one.
        box_loss_scale = 2 - layer_truth[..., 2:3] * layer_truth[..., 3:4]

        # CIoU localisation loss.
        ciou = box_ciou(pred_box, layer_truth[..., 0:4])
        ciou_loss = object_mask * box_loss_scale * (1 - ciou)

        # Object cells are scored against 1; empty cells against 0, but
        # only where ignore_mask marks them as genuine negatives.
        objectness_bce = K.binary_crossentropy(object_mask, raw_pred[..., 4:5],
                                               from_logits=True)
        confidence_loss = (object_mask * objectness_bce
                           + (1 - object_mask) * objectness_bce * ignore_mask)
        class_loss = object_mask * K.binary_crossentropy(
            true_class_probs, raw_pred[..., 5:], from_logits=True)

        location_loss = _sum_ignoring_nan(ciou_loss)
        confidence_loss = _sum_ignoring_nan(confidence_loss)
        class_loss = _sum_ignoring_nan(class_loss)

        # Count positive samples, never letting a layer contribute less than 1.
        num_pos += tf.maximum(K.sum(K.cast(object_mask, tf.float32)), 1)
        loss += location_loss + confidence_loss + class_loss

    loss = K.expand_dims(loss, axis=-1)
    divisor = num_pos if normalize else mf
    return loss / divisor
def customized_yolo_loss(y_true, y_pred, input_shape, candidate_anchors, grid_shape, num_classes, ignore_thresh=.5):
    """Single-layer YOLO loss: xy + wh + confidence + class terms.

    Parameters
    ----------
    y_true: ground-truth tensor; channels 0:4 are the box, 4 the object
        flag and 5+ the class targets.
    y_pred: raw network output, decoded through ``yolo_head``.
    input_shape: network input size, forwarded to ``yolo_head`` and used
        (reversed) to rescale the true box sizes.
    candidate_anchors: anchors for this layer.
    grid_shape: shape whose entries 1:3 give the grid size.
    num_classes: integer, forwarded to ``yolo_head``.
    ignore_thresh: float; a prediction only counts as a negative sample in
        the confidence loss when its best IoU with every true box of the
        image is below this value.

    Returns
    -------
    Scalar tensor: the four summed terms, each divided by the batch size.
    """
    from .base import yolo_head

    m = K.shape(y_pred)[0]  # batch size, tensor
    mf = K.cast(m, K.dtype(y_pred))
    grid_shape = K.cast(grid_shape[1:3], K.dtype(y_true))

    object_mask = y_true[..., 4:5]
    true_class_probs = y_true[..., 5:]

    grid, raw_pred, pred_xy, pred_wh = yolo_head(
        y_pred, candidate_anchors, num_classes, input_shape, calc_loss=True)
    pred_box = K.concatenate([pred_xy, pred_wh])

    # Darknet raw box to calculate loss.
    raw_true_xy = y_true[..., :2] * grid_shape[::-1] - grid
    log_wh = K.log(y_true[..., 2:4] / candidate_anchors * input_shape[::-1] + 1e-10)
    # Zero the log term wherever there is no object (avoid log(0)=-inf).
    raw_true_wh = K.switch(object_mask, log_wh, K.zeros_like(log_wh))
    box_loss_scale = 2 - y_true[..., 2:3] * y_true[..., 3:4]

    # Find ignore mask, iterate over each of batch.
    mask_array = tf.TensorArray(K.dtype(y_true[0]), size=1, dynamic_size=True)
    object_mask_bool = K.cast(object_mask, 'bool')

    def per_image_step(b, mask_array):
        true_box = tf.boolean_mask(y_true[b, ..., 0:4],
                                   object_mask_bool[b, ..., 0])
        best_iou = K.max(box_iou(pred_box[b], true_box), axis=-1)
        below_thresh = K.cast(best_iou < ignore_thresh, K.dtype(true_box))
        return b + 1, mask_array.write(b, below_thresh)

    _, mask_array = tf.while_loop(lambda b, *_: b < m, per_image_step,
                                  [0, mask_array])
    ignore_mask = K.expand_dims(mask_array.stack(), -1)

    # K.binary_crossentropy is helpful to avoid exp overflow.
    xy_term = object_mask * box_loss_scale * K.binary_crossentropy(
        raw_true_xy, raw_pred[..., 0:2], from_logits=True)
    wh_term = object_mask * box_loss_scale * 0.5 * K.square(
        raw_true_wh - raw_pred[..., 2:4])
    objectness_bce = K.binary_crossentropy(object_mask, raw_pred[..., 4:5],
                                           from_logits=True)
    conf_term = object_mask * objectness_bce
    conf_term += (1 - object_mask) * objectness_bce * ignore_mask
    class_term = object_mask * K.binary_crossentropy(
        true_class_probs, raw_pred[..., 5:], from_logits=True)

    xy_loss = K.sum(xy_term) / mf
    wh_loss = K.sum(wh_term) / mf
    confidence_loss = K.sum(conf_term) / mf
    class_loss = K.sum(class_term) / mf
    return xy_loss + wh_loss + confidence_loss + class_loss
def call(self, inputs):
    """Pool a ``(means, covariances)`` pair with ``self.pool_function``.

    Extra axes of size 1 are inserted so the pooling function can be run
    as a 2D pooling op; each result is reshaped to the corresponding shape
    reported by ``self.compute_output_shape``.

    Args:
        inputs: pair ``(means, covariances)``. How the covariances are laid
            out depends on ``self.mode`` ("diag", "half" or "full").

    Returns:
        Two-element list ``[pooled_means, pooled_covariances]``.
    """
    input_shapes = nest.map_structure(lambda x: x.shape, inputs)
    output_shapes = self.compute_output_shape(input_shapes)
    means, covariances = inputs
    # there is no 1d pooling until tf-1.14, so we use 2d pooling instead
    if self.data_format == "channels_last":
        # Insert the dummy spatial axis right after the batch axis.
        means = K.expand_dims(means, 1)
        if self.mode == "diag":
            covariances = K.expand_dims(covariances, 1)
        elif self.mode == "half":
            covariances = K.expand_dims(covariances, 2)
        elif self.mode == "full":
            # Two dummy axes: the covariance is pooled twice below.
            covariances = K.expand_dims(covariances, 1)
            covariances = K.expand_dims(covariances, 4)
        pool_shape = list((1, ) + self.pool_size)
        strides = list((1, ) + self.strides)
        data_format = "NHWC"
    else:
        # Same as above with every inserted axis shifted one position right.
        means = K.expand_dims(means, 2)
        if self.mode == "diag":
            covariances = K.expand_dims(covariances, 2)
        elif self.mode == "half":
            covariances = K.expand_dims(covariances, 3)
        elif self.mode == "full":
            covariances = K.expand_dims(covariances, 2)
            covariances = K.expand_dims(covariances, 5)
        pool_shape = list((1, ) + self.pool_size)
        strides = list((1, ) + self.strides)
        data_format = "NCHW"
    outputs = [[], []]
    # Pool the means and restore the declared output shape.
    outputs[0] = K.reshape(
        self.pool_function(
            means,
            ksize=pool_shape,
            strides=strides,
            padding=self.padding.upper(),
            data_format=data_format,
        ),
        [-1] + output_shapes[0].as_list()[1:],
    )
    if self.mode == "diag":
        # Covariances are divided by the window size before pooling.
        # NOTE(review): presumably this yields the variance of a window
        # mean when pool_function averages -- confirm.
        outputs[1] = K.reshape(
            self.pool_function(
                covariances / np.prod(pool_shape),
                ksize=pool_shape,
                strides=strides,
                padding=self.padding.upper(),
                data_format=data_format,
            ),
            [-1] + output_shapes[1].as_list()[1:],
        )
    elif self.mode == "half":
        # Merge the two leading axes so the pooling op sees a 4D tensor.
        cov_shape = covariances.get_shape().as_list()
        covariances = K.reshape(covariances, [-1] + cov_shape[2:])
        outputs[1] = K.reshape(
            self.pool_function(
                covariances,
                ksize=pool_shape,
                strides=strides,
                padding=self.padding.upper(),
                data_format=data_format,
            ),
            [-1] + output_shapes[1].as_list()[1:],
        )
    elif self.mode == "full":
        cov_shape = covariances.get_shape().as_list()
        out_shape = output_shapes[1].as_list()
        # Give the output shape the same two dummy axes as the input.
        if self.data_format == "channels_last":
            out_shape = (out_shape[:1] + [1] + out_shape[1:3] + [1] +
                         out_shape[3:])
        elif self.data_format == "channels_first":
            out_shape = (out_shape[:2] + [1] + out_shape[2:4] + [1] +
                         out_shape[4:])
        # First pass: pool over the trailing three axes.
        covariances = K.reshape(covariances, [-1] + cov_shape[4:])
        covariances = K.reshape(
            self.pool_function(
                covariances,
                ksize=pool_shape,
                strides=strides,
                padding=self.padding.upper(),
                data_format=data_format,
            ),
            ([-1] + cov_shape[1:4] + out_shape[-3:]),
        )
        # Swap the two axis triples so the unpooled one becomes trailing.
        covariances = K.permute_dimensions(
            covariances,
            ([0] + list(range(4, 7)) + list(range(1, 4))),
        )
        # Second pass: pool over what was the leading axis triple.
        covariances = K.reshape(covariances, [-1] + cov_shape[1:4])
        covariances = K.reshape(
            self.pool_function(
                covariances,
                ksize=pool_shape,
                strides=strides,
                padding=self.padding.upper(),
                data_format=data_format,
            ),
            ([-1] + out_shape[-3:] + out_shape[1:4]),
        )
        # Swap the triples back and drop the dummy axes via the final reshape.
        outputs[1] = K.reshape(
            K.permute_dimensions(
                covariances,
                ([0] + list(range(4, 7)) + list(range(1, 4))),
            ),
            [-1] + output_shapes[1].as_list()[1:],
        )
    return outputs
def create_model(linear_feature_columns,
                 dnn_feature_columns,
                 fm_group=(DEFAULT_GROUP_NAME, ),
                 dnn_hidden_units=(128, 128),
                 l2_reg_linear=0.00001,
                 l2_reg_embedding=0.00001,
                 l2_reg_dnn=0,
                 seed=1024,
                 dnn_dropout=0,
                 dnn_activation='relu',
                 dnn_use_bn=False,
                 task='binary'):
    """Build a model mixing a cross-embedding branch, a sequence branch and an FM/DNN branch.

    The Keras session is cleared first. Besides its arguments the function
    reads several names from the enclosing module (``dense_feature_size``,
    ``cross_arr_name_list``, ``dict_cross_emb_all``, ``arr_name_list``,
    ``TRAINABLE_DICT``, ``id_list_dict_emb_all``, ``conv1d_info_dict``).

    Args:
        linear_feature_columns: feature columns for the linear logit.
        dnn_feature_columns: feature columns for the embedding/DNN part.
        fm_group: embedding group names fed to the FM layers. Any container
            supporting ``in`` works (list, tuple, set).
        dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
            dnn_use_bn, seed: forwarded to ``DNN``.
        l2_reg_linear: forwarded to ``get_linear_logit``.
        l2_reg_embedding: forwarded to ``input_from_feature_columns``.
        task: forwarded to ``PredictionLayer``.

    Returns:
        The assembled ``Model``; its summary is printed as a side effect.
    """
    K.clear_session()

    # --- dense "slotid_nettype" input, given an axis at position 1 -------
    inputs_all = [get_input_feature_layer(
        name='slotid_nettype', feature_shape=dense_feature_size)]
    layer_slotid_nettype = K.expand_dims(inputs_all[0], 1)

    # --- cross branch: pair every non-"_i" column with the last "_i" one -
    seq_inputs_dict = get_cross_seq_input_layers(cols=cross_arr_name_list)
    inputs_all = inputs_all + list(seq_inputs_dict.values())  # input layers used for crossing
    cross_emb_out = []
    last_col = ''
    for index, col in enumerate(cross_arr_name_list):
        emb_layer = get_emb_layer(
            col, trainable=False, emb_matrix=dict_cross_emb_all[col])
        # Offset by 1: inputs_all[0] is the slotid_nettype input.
        x = emb_layer(inputs_all[1 + index])
        if col.split('_')[-1] == 'i':
            # Remember the item-side embedding for the columns that follow.
            # NOTE(review): a non-"_i" column appearing before any "_i"
            # column would hit an unbound cross_user_item_i -- confirm the
            # ordering of cross_arr_name_list guarantees this never occurs.
            cross_user_item_i = x
            last_col = col
            continue
        print(f'crossing net add {last_col} and {col}')
        cross_emb_out.append(
            cross_net(cross_user_item_i, x, layer_slotid_nettype,
                      hidden_unit=4))
    cross_emb_out = tf.keras.layers.concatenate(cross_emb_out)
    cross_emb_out = tf.squeeze(cross_emb_out, [1])

    # --- sequence branch --------------------------------------------------
    seq_inputs_dict = get_seq_input_layers(cols=arr_name_list)
    inputs_all = inputs_all + list(seq_inputs_dict.values())  # input-layer list
    masks = tf.equal(seq_inputs_dict['task_id'], 0)
    # regular sequences + label sequences
    layers2concat = []
    for col in arr_name_list:
        print(col, 'get embedding!')
        emb_layer = get_emb_layer(
            col, trainable=TRAINABLE_DICT[col],
            emb_matrix=id_list_dict_emb_all[col][1])
        x = emb_layer(seq_inputs_dict[col])
        if conv1d_info_dict[col] > -1:
            # Optional 1x1 convolution to change the embedding width.
            cov_layer = tf.keras.layers.Conv1D(
                filters=conv1d_info_dict[col], kernel_size=1,
                activation='relu')
            x = cov_layer(x)
        layers2concat.append(x)
    x = tf.keras.layers.concatenate(layers2concat)

    # mix1: trans_net over the sequences, then max + average pooling
    x = trans_net(x, masks, hidden_unit=256)
    xmaxpool = tf.keras.layers.GlobalMaxPooling1D()(x)
    xmeanpool = tf.keras.layers.GlobalAveragePooling1D()(x)
    trans_output = tf.keras.layers.concatenate([xmaxpool, xmeanpool])

    # mix2: linear + FM logits and the DNN input from the feature columns
    features = build_input_features(
        linear_feature_columns + dnn_feature_columns)
    linear_logit = get_linear_logit(features, linear_feature_columns,
                                    seed=seed, prefix='linear',
                                    l2_reg=l2_reg_linear)
    group_embedding_dict, dense_value_list = input_from_feature_columns(
        features, dnn_feature_columns, l2_reg_embedding, seed,
        support_group=True)
    fm_logit = add_func([FM()(concat_func(v, axis=1))
                         for k, v in group_embedding_dict.items()
                         if k in fm_group])
    dnn_input = combined_dnn_input(
        list(chain.from_iterable(group_embedding_dict.values())),
        dense_value_list)

    # mix: all three branches feed one DNN
    mix = concatenate([cross_emb_out, trans_output, dnn_input], axis=-1)
    dnn_output = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn,
                     dnn_dropout, dnn_use_bn, seed)(mix)
    dnn_logit = tf.keras.layers.Dense(
        1, use_bias=False, activation=None)(dnn_output)
    final_logit = add_func([linear_logit, fm_logit, dnn_logit])
    output = PredictionLayer(task)(final_logit)

    # ``features`` is passed as a nested structure, exactly as before.
    model = Model(inputs=inputs_all + [features], outputs=[output])
    # summary() prints by itself and returns None; wrapping it in print()
    # only emitted a stray "None" line.
    model.summary()
    return model
def _get_best_anchor(y_true, anchors, width, height):
    """
    Get the anchors associated with each box, using the IOU between the
    input anchors and the ground truth.

    Args:
        y_true: tf.Tensor of bounding boxes in the yolo format; channels
            0:2 are read as the box centre and 2:4 as its size
        anchors: list or tensor of anchor boxes, found via Kmeans; their
            first component is divided by ``width`` and the second by
            ``height``
        width: image width used to scale the anchors
        height: image height used to scale the anchors

    Returns:
        tf.Tensor (float32) of anchor indices per ground-truth box, sorted
        by decreasing IOU and limited to 5 columns. Column 0 is always the
        best anchor; later columns hold -1 when the IOU is not above 0.213
        or when fewer than 5 candidates were selected.
    """
    with tf.name_scope("get_anchor"):
        width = tf.cast(width, dtype=tf.float32)
        height = tf.cast(height, dtype=tf.float32)
        anchor_xy = y_true[..., 0:2]
        true_wh = y_true[..., 2:4]

        # scale the boxes
        anchors = tf.convert_to_tensor(anchors, dtype=tf.float32)
        anchors_x = anchors[..., 0] / width
        anchors_y = anchors[..., 1] / height
        anchors = tf.stack([anchors_x, anchors_y], axis=-1)

        # build a matrix of anchor boxes
        anchors = tf.transpose(anchors, perm=[1, 0])
        anchor_xy = tf.tile(tf.expand_dims(anchor_xy, axis=-1),
                            [1, 1, tf.shape(anchors)[-1]])
        anchors = tf.tile(tf.expand_dims(anchors, axis=0),
                          [tf.shape(anchor_xy)[0], 1, 1])

        # stack the xy so each anchor is associated once with each center
        # from the ground truth input
        anchors = K.concatenate([anchor_xy, anchors], axis=1)
        anchors = tf.transpose(anchors, perm=[2, 0, 1])

        # copy the gt n times so that each anchor from above can be
        # compared to the input ground truth
        truth_comp = tf.tile(tf.expand_dims(y_true[..., 0:4], axis=-1),
                             [1, 1, tf.shape(anchors)[0]])
        truth_comp = tf.transpose(truth_comp, perm=[2, 0, 1])

        # compute intersection over union of the boxes and keep, for each
        # box, the anchors whose IOU exceeds 0.213, best first
        iou_raw = compute_iou(truth_comp, anchors)

        gt_mask = tf.cast(iou_raw > 0.213, dtype=iou_raw.dtype)

        # Largest number of above-threshold anchors over all boxes.
        num_k = tf.reduce_max(
            tf.reduce_sum(tf.transpose(gt_mask, perm=[1, 0]), axis=1))
        # NOTE(review): Python ``if``/``while`` on a tensor needs eager
        # execution or autograph -- confirm how this function is traced.
        if num_k <= 0:
            num_k = 1.0

        values, indexes = tf.math.top_k(tf.transpose(iou_raw, perm=[1, 0]),
                                        k=tf.cast(num_k, dtype=tf.int32),
                                        sorted=True)
        ind_mask = tf.cast(values > 0.213, dtype=indexes.dtype)
        # Keep the best index unconditionally; for the rest, (idx + 1) *
        # mask - 1 maps below-threshold entries to -1 and leaves others.
        iou_index = tf.concat([
            K.expand_dims(indexes[..., 0], axis=-1),
            ((indexes[..., 1:] + 1) * ind_mask[..., 1:]) - 1
        ],
                              axis=-1)

        # A single column of -1 used as padding.
        stack = tf.zeros(
            [tf.shape(iou_index)[0],
             tf.cast(1, dtype=iou_index.dtype)],
            dtype=iou_index.dtype) - 1
        # Pad with -1 columns until 5 candidates exist, then truncate to 5.
        while num_k < 5:
            iou_index = tf.concat([iou_index, stack], axis=-1)
            num_k += 1
        iou_index = iou_index[..., :5]

        # Masked IOU values; computed but not part of the return value.
        values = tf.concat([
            K.expand_dims(values[..., 0], axis=-1),
            ((values[..., 1:]) * tf.cast(ind_mask[..., 1:], dtype=tf.float32))
        ],
                           axis=-1)
    return tf.cast(iou_index, dtype=tf.float32)
def __call__(self, y_true, y_pred):
    """Compute the YOLO loss and its components for one output layer.

    Args:
        y_true: ground truth, converted by ``self._get_label_attributes``
            before use.
        y_pred: raw layer output; reshaped to
            ``[batch, width, height, self._num, -1]`` and cast to float32.

    Returns:
        Tuple ``(loss, loss_box, conf_loss, class_loss, avg_iou,
        recall50)`` where ``loss`` is the sum of the three loss terms.
    """
    # 1. generate and store constants and format output
    shape = tf.shape(y_pred)
    batch_size, width, height = shape[0], shape[1], shape[2]
    y_pred = tf.cast(
        tf.reshape(y_pred, [batch_size, width, height, self._num, -1]),
        tf.float32)
    grid_points, anchor_grid, y_true = self._get_label_attributes(
        width, height, batch_size, y_true, y_pred, y_pred.dtype)

    fwidth = tf.cast(width, y_pred.dtype)
    fheight = tf.cast(height, y_pred.dtype)

    # 2. split up layer output into components (xy, wh, confidence, class)
    #    and apply the activations to the correct items
    pred_xy, pred_wh, pred_box = self._get_predicted_box(
        fwidth, fheight, y_pred[..., 0:4], anchor_grid, grid_points)
    pred_conf = tf.expand_dims(tf.math.sigmoid(y_pred[..., 4]), axis=-1)
    pred_conf = self.rm_nan_inf(pred_conf)
    pred_class = tf.math.sigmoid(y_pred[..., 5:])
    self.print_error(pred_box)

    # 3. split up ground truth into components (box, confidence, class)
    true_box = y_true[..., 0:4]
    true_conf = y_true[..., 4]
    true_class = y_true[..., 5:]

    # 5. apply generalized IOU, complete IOU or mse to the box predictions;
    #    only indexes where an object exists (true_conf) affect the total
    #    loss. mask_iou marks predictions whose IOU with the ground truth
    #    is below self._ignore_thresh.
    if self._loss_type == "giou":
        iou, giou = iou_ops.compute_giou(true_box, pred_box)
        mask_iou = tf.cast(iou < self._ignore_thresh, dtype=y_pred.dtype)
        loss_box = (1 - giou) * self._iou_normalizer * true_conf
    elif self._loss_type == "ciou":
        iou, ciou = iou_ops.compute_ciou(true_box, pred_box)
        mask_iou = tf.cast(iou < self._ignore_thresh, dtype=y_pred.dtype)
        loss_box = (1 - ciou) * self._iou_normalizer * true_conf
    else:
        # iou mask computation
        iou = iou_ops.compute_iou(true_box, pred_box)
        mask_iou = tf.cast(iou < self._ignore_thresh, dtype=y_pred.dtype)

        # mse loss computation :: yolo_layer.c: scale = (2-truth.w*truth.h)
        scale = (
            2 - true_box[..., 2] * true_box[..., 3]) * self._iou_normalizer
        true_xy, true_wh = self._scale_ground_truth_box(
            true_box, fwidth, fheight, anchor_grid, grid_points,
            y_pred.dtype)
        loss_xy = tf.reduce_sum(K.square(true_xy - pred_xy), axis=-1)
        loss_wh = tf.reduce_sum(K.square(true_wh - pred_wh), axis=-1)
        loss_box = (loss_wh + loss_xy) * true_conf * scale

    # 6. apply binary cross entropy (bce) to the class attributes; only
    #    indexes where an object exists affect the total loss
    class_loss = self._cls_normalizer * tf.reduce_sum(
        ks.losses.binary_crossentropy(K.expand_dims(true_class, axis=-1),
                                      K.expand_dims(pred_class, axis=-1)),
        axis=-1) * true_conf

    # 7. apply bce to the confidence at all points; empty cells only
    #    contribute where mask_iou is set
    bce = ks.losses.binary_crossentropy(K.expand_dims(true_conf, axis=-1),
                                        pred_conf)
    conf_loss = (true_conf + (1 - true_conf) * mask_iou) * bce

    # 8. sum each term over axes 1-3, then average over the batch axis
    loss_box = tf.reduce_mean(
        tf.cast(tf.reduce_sum(loss_box, axis=(1, 2, 3)),
                dtype=y_pred.dtype))
    conf_loss = tf.reduce_mean(
        tf.cast(tf.reduce_sum(conf_loss, axis=(1, 2, 3)),
                dtype=y_pred.dtype))
    class_loss = tf.reduce_mean(
        tf.cast(tf.reduce_sum(class_loss, axis=(1, 2, 3)),
                dtype=y_pred.dtype))

    # 9. total loss is the plain sum of the three batch-averaged terms
    loss = class_loss + conf_loss + loss_box

    # 10. values for use in metrics
    # recall50: per-sample fraction of object cells whose predicted
    # confidence exceeds 0.5, averaged over the batch.
    recall50 = tf.reduce_mean(
        tf.math.divide_no_nan(
            tf.reduce_sum(tf.cast(tf.squeeze(pred_conf, axis=-1) > 0.5,
                                  dtype=true_conf.dtype) * true_conf,
                          axis=(1, 2, 3)),
            (tf.reduce_sum(true_conf, axis=(1, 2, 3)))))
    # avg_iou: sum of all IOUs divided by the number of strictly positive ones.
    avg_iou = tf.math.divide_no_nan(
        tf.reduce_sum(iou),
        tf.cast(tf.math.count_nonzero(tf.cast(iou > 0,
                                              dtype=y_pred.dtype)),
                dtype=y_pred.dtype))
    return loss, loss_box, conf_loss, class_loss, avg_iou, recall50
def call(self, x):
    """Contract the input against ``self.kernel`` and apply the activation.

    A new axis is inserted at position 1 of ``x`` so the tensor broadcasts
    against ``self.kernel``; the element-wise product is then summed over
    axes 2 and 3. When ``self.activation`` is set it is applied to the
    reduced tensor, otherwise the reduced tensor is returned unchanged.
    """
    expanded = K.expand_dims(x, axis=1)
    reduced = K.sum(expanded * self.kernel, axis=(2, 3))
    if self.activation is None:
        return reduced
    return self.activation(reduced)
def __init__(self, model, upsample_size=UPSAMPLE_SIZE):
    """Build the trainable mask/pattern graph and its training function.

    A mask of size ``ceil(32 / upsample_size)`` per side and a 32x32x3
    pattern are stored as zero-initialised backend variables in tanh space.
    The blended image ``(1 - mask) * input + mask * pattern`` is fed to
    ``model``; the cross-entropy term plus the weighted regularisation terms
    returned by ``self.build_tabor_regularization`` form ``self.loss``.
    Only the two tanh-space variables are updated by the optimiser.

    Args:
        model: callable applied to the blended image tensor. The
            placeholders below fix its input to (None, 32, 32, 3) and its
            output to (None, 43).
        upsample_size: factor by which the low-resolution mask is upsampled
            before being cropped back to 32x32.
    """
    # Low-resolution mask size: ceil(32 / upsample_size) along each side.
    mask_size = np.ceil(np.array((32, 32), dtype=float) / upsample_size)
    mask_size = mask_size.astype(int)
    self.mask_size = mask_size
    mask = np.zeros(self.mask_size)
    pattern = np.zeros((32, 32, 3))
    mask = np.expand_dims(mask, axis=2)

    # Both trainable variables start at zero in tanh space.
    mask_tanh = np.zeros_like(mask)
    pattern_tanh = np.zeros_like(pattern)

    # prepare mask related tensors
    self.mask_tanh_tensor = K.variable(mask_tanh)
    # tanh(x) / (2 - eps) + 0.5 squashes the variable into roughly (0, 1).
    mask_tensor_unrepeat = (K.tanh(self.mask_tanh_tensor) \
        / (2 - K.epsilon()) + 0.5)
    # Replicate the single mask channel to 3 channels, then add a batch axis.
    mask_tensor_unexpand = K.repeat_elements(
        mask_tensor_unrepeat, rep=3, axis=2)
    self.mask_tensor = K.expand_dims(mask_tensor_unexpand, axis=0)
    upsample_layer = UpSampling2D(
        size=(upsample_size, upsample_size))
    mask_upsample_tensor_uncrop = upsample_layer(self.mask_tensor)
    uncrop_shape = K.int_shape(mask_upsample_tensor_uncrop)[1:]
    # Upsampling may overshoot 32 pixels; crop the excess at bottom/right.
    cropping_layer = Cropping2D(
        cropping=((0, uncrop_shape[0] - 32), (0, uncrop_shape[1] - 32)))
    self.mask_upsample_tensor = cropping_layer(
        mask_upsample_tensor_uncrop)
    # self.mask_upsample_tensor = K.round(self.mask_upsample_tensor)
    reverse_mask_tensor = (K.ones_like(self.mask_upsample_tensor) -
                           self.mask_upsample_tensor)

    # prepare pattern related tensors
    self.pattern_tanh_tensor = K.variable(pattern_tanh)
    # Same squashing as the mask, scaled by 255.
    self.pattern_raw_tensor = (
        (K.tanh(self.pattern_tanh_tensor) / (2 - K.epsilon()) + 0.5) *
        255.0)

    # prepare input image related tensors
    # ignore clip operation here
    # assume input image is already clipped into valid color range
    input_tensor = K.placeholder((None,32,32,3))
    input_raw_tensor = input_tensor

    # IMPORTANT: MASK OPERATION IN RAW DOMAIN
    X_adv_raw_tensor = (
        reverse_mask_tensor * input_raw_tensor +
        self.mask_upsample_tensor * self.pattern_raw_tensor)

    X_adv_tensor = X_adv_raw_tensor

    output_tensor = model(X_adv_tensor)
    y_target_tensor = K.placeholder((None,43))
    y_true_tensor = K.placeholder((None,43))

    # NOTE(review): the model output is passed first and the target second.
    # If `categorical_crossentropy` is the Keras loss, its signature is
    # (y_true, y_pred) -- confirm this ordering is intentional.
    self.loss_ce = categorical_crossentropy(output_tensor, y_target_tensor)

    # One weight per regularisation term, as a (6, 1) column so it can be
    # dotted with the (1, 6) row of regularisation values below.
    self.hyperparameters = K.reshape(K.constant(np.array([1e-2, 1e-5, 1e-7, 1e-8, 0, 1e-2])), shape=(6, 1))
    self.loss_reg = self.build_tabor_regularization(input_raw_tensor,
                                                    model, y_target_tensor,
                                                    y_true_tensor)
    self.loss_reg = K.dot(K.reshape(self.loss_reg, shape=(1, 6)), self.hyperparameters)
    self.loss = K.mean(self.loss_ce) + self.loss_reg

    self.opt = Adam(lr=1e-3, beta_1=0.5, beta_2=0.9)
    # Only the pattern and mask variables are passed to the optimiser.
    self.updates = self.opt.get_updates(
        params=[self.pattern_tanh_tensor, self.mask_tanh_tensor],
        loss=self.loss)
    # One call performs an optimisation step and returns the three losses.
    self.train = K.function(
        [input_tensor, y_true_tensor, y_target_tensor],
        [self.loss_ce, self.loss_reg, self.loss],
        updates=self.updates)
def call(self, inputs, val_mode=False, dropout=False):
    """Decode one batch token by token and accumulate its loss.

    Args:
        inputs: sequence ``(input_image, input_tokens)``. ``input_image`` is
            asserted to be ``(batch, 299, 299, 3)`` when
            ``self.image_encoder == "inceptionv3"`` and ``(batch, 64, 2048)``
            otherwise; ``input_tokens`` is ``(batch, token_seq_len)``.
        val_mode: when True the decoder is fed its own argmax prediction at
            each step and the edit distance against the targets is computed;
            when False the ground-truth token embedding is fed instead
            (teacher forcing).
        dropout: forwarded to the sub-models that accept it.

    Returns:
        Tuple ``(batch_loss, batch_mean_edit_distance)``. The edit distance
        stays ``0`` unless ``val_mode`` is True.

    Side effects:
        Sets ``self.batch_size`` from the token batch.
    """
    # Fix: the two messages were swapped (val_mode logged "Train mode").
    if val_mode:
        logging.debug("MODEL DRAKE NESTED CALL - Validation mode")
    else:
        logging.debug("MODEL DRAKE NESTED CALL - Train mode")

    # STEP 0: Process inputs
    input_image = inputs[0]
    input_tokens = inputs[1]
    self.batch_size = input_tokens.shape[0]
    batch_token_seq_len = input_tokens.shape[1]

    logging.debug("MODEL DRAKE NESTED CALL - Step 0 - Process inputs - "
                  "batch_size {}".format(self.batch_size))
    logging.debug("MODEL DRAKE NESTED CALL - Step 0 - Process inputs - "
                  "input_image shape {}".format(K.int_shape(input_image)))
    logging.debug("MODEL DRAKE NESTED CALL - Step 0 - Process inputs - "
                  "input_tokens shape {}".format(
                      K.int_shape(input_tokens)))
    if self.image_encoder == "inceptionv3":
        tf.compat.v1.debugging.assert_equal(K.int_shape(input_image),
                                            (self.batch_size, 299, 299, 3))
    else:
        tf.compat.v1.debugging.assert_equal(K.int_shape(input_image),
                                            (self.batch_size, 64, 2048))
    tf.compat.v1.debugging.assert_equal(
        K.int_shape(input_tokens),
        (self.batch_size, batch_token_seq_len))

    # STEP 1: Reset decoder hidden state to zeros,
    # shape (batch_size, decoder_hidden_dim).
    decoder_hidden_state = \
        keras.backend.zeros(shape=(self.batch_size,
                                   self.decoder_hidden_dim))

    logging.debug(
        "MODEL DRAKE NESTED CALL - Step 1 - Reset decoder hidden "
        "state - decoder_hidden_state shape {}".format(
            K.int_shape(decoder_hidden_state)))
    tf.compat.v1.debugging.assert_equal(
        K.int_shape(decoder_hidden_state),
        (self.batch_size, self.decoder_hidden_dim))

    # STEP 2: Image encoding,
    # asserted shape (batch_size, 64, image_embedding_dim).
    input_image_features = \
        self.model1_image_encoding([input_image], dropout=dropout)

    logging.debug(
        "MODEL DRAKE NESTED CALL - Step 2 - Image encoding dense"
        " and activations - input_image_features shape {}".format(
            K.int_shape(input_image_features)))
    tf.compat.v1.debugging.assert_equal(
        K.int_shape(input_image_features),
        (self.batch_size, 64, self.image_embedding_dim))

    # STEP 3: Token embedding for the whole input sequence,
    # asserted shape (batch_size, token_seq_len, token_embedding_dim).
    input_token_embeddings = \
        self.model2_token_embedding([input_tokens], dropout=dropout)

    logging.debug("MODEL DRAKE NESTED CALL - Step 3 - Token embeddings - "
                  "target_token_embeddings shape {}".format(
                      keras.backend.int_shape(input_token_embeddings)))
    tf.compat.v1.debugging.assert_equal(
        keras.backend.int_shape(input_token_embeddings),
        (self.batch_size, batch_token_seq_len, self.token_embedding_dim))

    # STEP 4: The first decoder input is the embedding of the token at
    # index 0 (the GO token), in both training and validation mode.
    # Asserted shape (batch_size, 1, token_embedding_dim).
    decoder_token_input = \
        K.expand_dims(input_token_embeddings[:, 0], 1)

    logging.debug("MODEL DRAKE NESTED CALL - Step 4 - Decoder inputs - "
                  "decoder_teaching_forcing_inputs shape {}".format(
                      K.int_shape(decoder_token_input)))
    tf.compat.v1.debugging.assert_equal(
        K.int_shape(decoder_token_input),
        (self.batch_size, 1, self.token_embedding_dim))

    # STEP 5: Loop through the token sequence
    batch_loss = 0
    batch_mean_edit_distance = 0
    if val_mode:
        list_predictions = []
    for i in range(1, batch_token_seq_len):

        # STEP 5.1: Outer attention,
        # asserted shape (batch_size, 64, decoder_hidden_dim).
        outer_attention_map, outer_attention_weights = \
            self.model5_attention_map([input_image_features,
                                       decoder_hidden_state],
                                      dropout=dropout)

        logging.debug(
            "MODEL DRAKE NESTED CALL - Step 5.1 - Outer attention - "
            "Context vector shape {}".format(
                K.int_shape(outer_attention_map)))
        tf.compat.v1.debugging.assert_equal(
            K.int_shape(outer_attention_map),
            (self.batch_size, 64, self.decoder_hidden_dim))

        # STEP 5.4: Inner attention -> context vector,
        # asserted shape (batch_size, decoder_hidden_dim).
        context_vector, attention_weights = \
            self.model3_attention([outer_attention_map,
                                   decoder_hidden_state],
                                  dropout=dropout)

        logging.debug(
            "MODEL DRAKE NESTED CALL - Step 5.4 - Inner attention - "
            "Context vector shape {}".format(K.int_shape(context_vector)))
        tf.compat.v1.debugging.assert_equal(
            K.int_shape(context_vector),
            (self.batch_size, self.decoder_hidden_dim))

        # STEP 5.5: LSTM input = [context vector, token embedding],
        # asserted shape
        # (batch_size, 1, token_embedding_dim + decoder_hidden_dim).
        context_vector_expanded = self.layer1_expand_dims(
            context_vector, 1)

        logging.debug(
            "MODEL DRAKE NESTED CALL - Step 5.5 - Expand context "
            "vector - context_vector_expanded shape {}".format(
                K.int_shape(context_vector_expanded)))
        tf.compat.v1.debugging.assert_equal(
            K.int_shape(context_vector_expanded),
            (self.batch_size, 1, self.decoder_hidden_dim))

        lstm_input = self.layer2_concatenate(
            [context_vector_expanded, decoder_token_input], axis=-1)

        logging.debug(
            "MODEL DRAKE NESTED CALL - Step 5.5 - Concat context"
            "vector and token embedding - lstm_input shape {}".format(
                K.int_shape(lstm_input)))
        tf.compat.v1.debugging.assert_equal(
            K.int_shape(lstm_input),
            (self.batch_size, 1,
             self.token_embedding_dim + self.decoder_hidden_dim))

        # STEP 5.6: LSTM returning the output sequence plus the hidden and
        # cell states. The hidden state feeds the next step's attention.
        lstm_output, decoder_hidden_state, decoder_cell_state = \
            self.layer3_lstm(lstm_input)

        logging.debug("MODEL DRAKE NESTED CALL - Step 5.6 - LSTM output - "
                      "lstm_output shape {}".format(
                          K.int_shape(lstm_output)))
        logging.debug("MODEL DRAKE NESTED CALL - Step 5.6 - LSTM output - "
                      "decoder_hidden_state shape {}".format(
                          K.int_shape(decoder_hidden_state)))
        logging.debug("MODEL DRAKE NESTED CALL - Step 5.6 - LSTM output - "
                      "decoder_cell_state shape {}".format(
                          K.int_shape(decoder_cell_state)))
        tf.compat.v1.debugging.assert_equal(
            K.int_shape(lstm_output),
            (self.batch_size, 1, self.decoder_hidden_dim))
        tf.compat.v1.debugging.assert_equal(
            K.int_shape(decoder_hidden_state),
            (self.batch_size, self.decoder_hidden_dim))
        tf.compat.v1.debugging.assert_equal(
            K.int_shape(decoder_cell_state),
            (self.batch_size, self.decoder_hidden_dim))

        # STEP 5.7: MLP -> predicted token scores,
        # asserted shape (batch_size, token_vocab_size).
        mlp_input = self.layer4_concatenate(
            [context_vector_expanded, lstm_output], axis=-1)
        single_token_prediction = self.model4_mlp([mlp_input],
                                                  dropout=dropout)

        logging.debug("MODEL DRAKE NESTED CALL - Step 5.7 - MLP output - "
                      "single_token_prediction shape {}".format(
                          K.int_shape(single_token_prediction)))
        tf.compat.v1.debugging.assert_equal(
            K.int_shape(single_token_prediction),
            (self.batch_size, self.token_vocab_size))

        # STEP 5.8: Accumulate the loss of this step against target token i.
        batch_loss += masked_ce_loss_fn(
            target=input_tokens[:, i],
            prediction=single_token_prediction,
            batch_size=self.batch_size,
            token_vocab_size=self.token_vocab_size)

        logging.debug("MODEL DRAKE NESTED CALL - Step 5.8 - "
                      "Single prediction loss {}".format(batch_loss))

        # STEP 5.9: Choose the decoder input for the next step.
        if val_mode:
            # In validation mode use argmax output from decoder
            argmax_prediction = tf.argmax(single_token_prediction,
                                          axis=1,
                                          output_type=tf.dtypes.int32)
            list_predictions.append(argmax_prediction)
            argmax_prediction_expanded = K.expand_dims(argmax_prediction)
            decoder_token_input = \
                self.model2_token_embedding([argmax_prediction_expanded])
        else:
            # In training mode use teacher forcing inputs
            decoder_token_input = \
                K.expand_dims(input_token_embeddings[:, i], 1)

        logging.debug(
            "MODEL DRAKE NESTED CALL - Step 5.9 - Update decoder "
            " input - decoder_token_input shape {}".format(
                K.int_shape(decoder_token_input)))
        tf.compat.v1.debugging.assert_equal(
            K.int_shape(decoder_token_input),
            (self.batch_size, 1, self.token_embedding_dim))

    # STEP 6: Edit distance between the stacked predictions and the
    # target tokens (validation mode only).
    if val_mode:
        stack_predictions = tf.stack(list_predictions, axis=1)
        stack_predictions_len = stack_predictions.shape[1]

        logging.debug(
            "MODEL DRAKE NESTED CALL - Step 6 - Stack predictions "
            "shape {}".format(K.int_shape(stack_predictions)))
        tf.compat.v1.debugging.assert_equal(
            K.int_shape(stack_predictions),
            (self.batch_size, stack_predictions_len))

        # Targets are shifted by one because index 0 is the GO token.
        batch_mean_edit_distance = \
            edit_distance_metric(
                target=input_tokens[:, 1:stack_predictions_len +1],
                prediction=stack_predictions,
                predictions_file=self.predictions_file)

    # STEP 7: Return the accumulated batch loss and the edit distance
    return batch_loss, batch_mean_edit_distance
def yolov2_loss(self, detector_mask, matching_true_boxes, class_one_hot, true_boxes_grid, y_pred, info = False):
    '''
    Calculate YOLO V2 loss from prediction (y_pred) and ground truth tensors
    (detector_mask, matching_true_boxes, class_one_hot, true_boxes_grid).

    Parameters
    ----------
    - detector_mask : tensor, shape (batch_size, GRID_W, GRID_H, anchors_count, 1)
        1 if bounding box detected by grid cell, else 0
    - matching_true_boxes : tensor, shape (batch_size, GRID_W, GRID_H, anchors_count, 5)
        Contains adjusted coords of bounding box in YOLO format
    - class_one_hot : tensor, shape (batch_size, GRID_W, GRID_H, anchors_count, class_count)
        One hot representation of bounding box label
    - true_boxes_grid : annotations : tensor (shape : batch_size, max annot, 5)
        true_boxes_grid format : x, y, w, h, c (coords unit : grid cell)
    - y_pred : prediction from model. tensor
        (shape : batch_size, GRID_W, GRID_H, anchors count, (5 + labels count))
    - info : boolean. True to print the loss values and plot the masks

    Returns
    -------
    - loss : scalar
    - sub_loss : list [conf_loss, class_loss, coord_loss], each a scalar
    '''

    # anchors tensor, reshaped to (anchors_count, 2)
    anchors = np.array(ANCHORS)
    anchors = anchors.reshape(len(anchors) // 2, 2)
    # Derived from ANCHORS instead of the previously hard-coded 5, so the
    # grid offsets follow the configured number of anchors.
    anchors_count = anchors.shape[0]

    # grid coords tensor
    # NOTE(review): coord_y is obtained by transposing coord_x, which looks
    # like it assumes GRID_W == GRID_H -- confirm.
    coord_x = tf.cast(tf.reshape(tf.tile(tf.range(GRID_W), [GRID_H]), (1, GRID_H, GRID_W, 1, 1)), tf.float32)
    coord_y = tf.transpose(coord_x, (0, 2, 1, 3, 4))
    coords = tf.tile(tf.concat([coord_x, coord_y], -1), [y_pred.shape[0], 1, 1, anchors_count, 1])

    # coordinate loss
    pred_xy = K.sigmoid(y_pred[:, :, :, :, 0:2])  # adjust coords between 0 and 1
    pred_xy = (pred_xy + coords)  # add cell coord for comparison with ground truth. New coords in grid cell unit
    pred_wh = K.exp(y_pred[:, :, :, :, 2:4]) * anchors  # adjust width and height for comparison with ground truth. New coords in grid cell unit
    nb_detector_mask = K.sum(tf.cast(detector_mask > 0.0, tf.float32))
    xy_loss = LAMBDA_COORD * K.sum(detector_mask * K.square(matching_true_boxes[..., :2] - pred_xy)) / (
                nb_detector_mask + 1e-6)  # Non /2
    wh_loss = LAMBDA_COORD * K.sum(detector_mask * K.square(K.sqrt(matching_true_boxes[..., 2:4]) -
                                                            K.sqrt(pred_wh))) / (nb_detector_mask + 1e-6)
    coord_loss = xy_loss + wh_loss

    # class loss
    pred_box_class = y_pred[..., 5:]
    true_box_class = tf.argmax(class_one_hot, -1)
    class_loss = K.sparse_categorical_crossentropy(target=true_box_class, output=pred_box_class, from_logits=True)
    class_loss = K.expand_dims(class_loss, -1) * detector_mask
    class_loss = LAMBDA_CLASS * K.sum(class_loss) / (nb_detector_mask + 1e-6)

    # confidence loss
    pred_conf = K.sigmoid(y_pred[..., 4:5])
    # for each detector : iou between prediction and ground truth
    x1 = matching_true_boxes[..., 0]
    y1 = matching_true_boxes[..., 1]
    w1 = matching_true_boxes[..., 2]
    h1 = matching_true_boxes[..., 3]
    x2 = pred_xy[..., 0]
    y2 = pred_xy[..., 1]
    w2 = pred_wh[..., 0]
    h2 = pred_wh[..., 1]
    ious = self.iou(x1, y1, w1, h1, x2, y2, w2, h2)
    ious = K.expand_dims(ious, -1)

    # for each detector : best ious between prediction and true_boxes (every bounding box of image)
    pred_xy = K.expand_dims(pred_xy, 4)  # shape : m, GRID_W, GRID_H, BOX, 1, 2
    pred_wh = K.expand_dims(pred_wh, 4)
    pred_wh_half = pred_wh / 2.
    pred_mins = pred_xy - pred_wh_half
    pred_maxes = pred_xy + pred_wh_half
    true_boxe_shape = K.int_shape(true_boxes_grid)
    true_boxes_grid = K.reshape(true_boxes_grid, [true_boxe_shape[0], 1, 1, 1, true_boxe_shape[1], true_boxe_shape[2]])
    true_xy = true_boxes_grid[..., 0:2]
    true_wh = true_boxes_grid[..., 2:4]
    true_wh_half = true_wh * 0.5
    true_mins = true_xy - true_wh_half
    true_maxes = true_xy + true_wh_half
    intersect_mins = K.maximum(pred_mins, true_mins)  # shape : m, GRID_W, GRID_H, BOX, max_annot, 2
    intersect_maxes = K.minimum(pred_maxes, true_maxes)  # shape : m, GRID_W, GRID_H, BOX, max_annot, 2
    intersect_wh = K.maximum(intersect_maxes - intersect_mins, 0.)  # shape : m, GRID_W, GRID_H, BOX, max_annot, 2
    # Indexing the last axis drops it, so the areas have one axis less.
    intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]  # shape : m, GRID_W, GRID_H, BOX, max_annot
    pred_areas = pred_wh[..., 0] * pred_wh[..., 1]  # shape : m, GRID_W, GRID_H, BOX, 1
    true_areas = true_wh[..., 0] * true_wh[..., 1]  # shape : m, 1, 1, 1, max_annot
    union_areas = pred_areas + true_areas - intersect_areas
    iou_scores = intersect_areas / union_areas  # shape : m, GRID_W, GRID_H, BOX, max_annot
    best_ious = K.max(iou_scores, axis=4)  # Best IOU scores.
    best_ious = K.expand_dims(best_ious)  # shape : m, GRID_W, GRID_H, BOX, 1

    # no object confidence loss: detectors without an assigned box whose
    # best IOU with any true box is below 0.6 are pushed towards confidence 0
    no_object_detection = K.cast(best_ious < 0.6, K.dtype(best_ious))
    noobj_mask = no_object_detection * (1 - detector_mask)
    nb_noobj_mask = K.sum(tf.cast(noobj_mask > 0.0, tf.float32))
    noobject_loss = LAMBDA_NOOBJECT * K.sum(noobj_mask * K.square(-pred_conf)) / (nb_noobj_mask + 1e-6)
    # object confidence loss: confidence is regressed towards the IOU
    object_loss = LAMBDA_OBJECT * K.sum(detector_mask * K.square(ious - pred_conf)) / (nb_detector_mask + 1e-6)
    # total confidence loss
    conf_loss = noobject_loss + object_loss

    # total loss
    loss = conf_loss + class_loss + coord_loss
    sub_loss = [conf_loss, class_loss, coord_loss]

    if info:
        print('conf_loss   : {:.4f}'.format(conf_loss))
        print('class_loss  : {:.4f}'.format(class_loss))
        print('coord_loss  : {:.4f}'.format(coord_loss))
        print('    xy_loss : {:.4f}'.format(xy_loss))
        print('    wh_loss : {:.4f}'.format(wh_loss))
        print('--------------------')
        print('total loss  : {:.4f}'.format(loss))

        # display masks for each anchors
        for i in range(len(anchors)):
            f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10, 5))
            f.tight_layout()
            f.suptitle('MASKS FOR ANCHOR {} :'.format(anchors[i, ...]))

            ax1.matshow((K.sum(detector_mask[0, :, :, i], axis=2)), cmap='Greys', vmin=0, vmax=1)
            ax1.set_title(
                'detector_mask, count : {}'.format(K.sum(tf.cast(detector_mask[0, :, :, i] > 0., tf.int32))))
            ax1.xaxis.set_ticks_position('bottom')

            ax2.matshow((K.sum(no_object_detection[0, :, :, i], axis=2)), cmap='Greys', vmin=0, vmax=1)
            ax2.set_title('no_object_detection mask')
            ax2.xaxis.set_ticks_position('bottom')

            ax3.matshow((K.sum(noobj_mask[0, :, :, i], axis=2)), cmap='Greys', vmin=0, vmax=1)
            ax3.set_title('noobj_mask')
            ax3.xaxis.set_ticks_position('bottom')
            plt.show()

    return loss, sub_loss
def call(self, inputs):
    """Return ``inputs`` with a new axis inserted at position 1."""
    return K.expand_dims(inputs, 1)
def yolo_loss(y_true, y_pred):
    """Sum of confidence, class and box losses on a 7x7 grid, two boxes per cell.

    ``y_true`` is sliced as [class (1), box (4), response flag (1)] along its
    last axis and ``y_pred`` as [class (1), confidences (2), boxes (8)].
    Within each cell the predicted box with the highest IOU against the label
    box is treated as responsible for it.
    """
    label_class = y_true[..., :1]                    # class target
    label_box = y_true[..., 1:5]                     # box target
    response_mask = K.expand_dims(y_true[..., 5])    # object-present flag, trailing axis added

    predict_class = y_pred[..., :1]                  # class prediction
    predict_trust = y_pred[..., 1:3]                 # confidence of both boxes
    predict_box = y_pred[..., 3:]                    # coordinates of both boxes

    def _decode_boxes():
        # Reshape to one label box / two predicted boxes per cell and decode
        # each with yolo_head into (xy, wh) pairs.
        label_5d = K.reshape(label_box, [-1, 7, 7, 1, 4])
        predict_5d = K.reshape(predict_box, [-1, 7, 7, 2, 4])
        lbl_xy, lbl_wh = yolo_head(label_5d, img_size=224)
        prd_xy, prd_wh = yolo_head(predict_5d, img_size=224)
        return lbl_xy, lbl_wh, prd_xy, prd_wh

    # ---- IOU between every predicted box and the label box ----
    label_xy, label_wh, predict_xy, predict_wh = _decode_boxes()
    label_xy_min, label_xy_max = X_Y_W_H_To_Min_Max(
        K.expand_dims(label_xy, 3), K.expand_dims(label_wh, 3))
    predict_xy_min, predict_xy_max = X_Y_W_H_To_Min_Max(
        K.expand_dims(predict_xy, 4), K.expand_dims(predict_wh, 4))

    iou_scores = iou(predict_xy_min, predict_xy_max, label_xy_min, label_xy_max)
    best_ious = K.max(iou_scores, axis=4)
    best_box = K.max(best_ious, axis=3, keepdims=True)
    # 1 for the box whose IOU equals the per-cell maximum, else 0.
    box_mask = K.cast(best_ious >= best_box, K.dtype(best_ious))

    # ---- confidence loss ----
    responsible = box_mask * response_mask
    no_object_loss = 0.5 * (1 - responsible) * K.square(0 - predict_trust)
    object_loss = responsible * K.square(1 - predict_trust)
    confidence_loss = K.sum(no_object_loss + object_loss)

    # ---- class loss ----
    class_loss = K.sum(response_mask * K.square(label_class - predict_class))

    # ---- box loss (decoded again, without the extra broadcast axes) ----
    label_xy, label_wh, predict_xy, predict_wh = _decode_boxes()
    box_mask = K.expand_dims(box_mask)
    response_mask = K.expand_dims(response_mask)

    xy_term = 5 * box_mask * response_mask * K.square((label_xy - predict_xy) / 224)
    wh_term = 5 * box_mask * response_mask * K.square((K.sqrt(label_wh) - K.sqrt(predict_wh)) / 224)
    box_loss = K.sum(xy_term + wh_term)

    return confidence_loss + class_loss + box_loss
def recursion(self, input_energy, mask=None, go_backwards=False, return_sequences=True, return_logZ=True, input_length=None):
    r"""Forward (alpha) or backward (beta) recursion

    If `return_logZ = True`, compute the logZ, the normalization constant:

    \[ Z = \sum_{y1, y2, y3} exp(-E) # energy
      = \sum_{y1, y2, y3} exp(-(u1' y1 + y1' W y2 + u2' y2 + y2' W y3 + u3' y3))
      = sum_{y2, y3} (exp(-(u2' y2 + y2' W y3 + u3' y3))
      sum_{y1} exp(-(u1' y1' + y1' W y2))) \]

    Denote:
        \[ S(y2) := sum_{y1} exp(-(u1' y1 + y1' W y2)), \]
        \[ Z = sum_{y2, y3} exp(log S(y2) - (u2' y2 + y2' W y3 + u3' y3)) \]
        \[ logS(y2) = log S(y2) = log_sum_exp(-(u1' y1' + y1' W y2)) \]
    Note that:
          yi's are one-hot vectors
          u1, u3: boundary energies have been merged

    If `return_logZ = False`, compute the Viterbi's best path lookup table.

    Args:
        input_energy: energy tensor indexed as [batch, time, feature].
        mask: optional mask indexed as [batch, time]; reversed together with
            `input_energy` when `go_backwards` is set.
        go_backwards: run the recursion over the time-reversed sequence.
        return_sequences: return the value at every step instead of only
            the last one.
        return_logZ: forwarded to `self.step` (see above).
        input_length: forwarded to `K.rnn`.

    Returns:
        The per-step values (restored to the original time order when
        `go_backwards` is set) if `return_sequences`, else the last value.
    """
    # The docstring is raw so that `\[`, `\s` and `\]` are not parsed as
    # (invalid) escape sequences; the resulting text is unchanged.
    chain_energy = self.chain_kernel
    # shape=(1, F, F): F=num of output features. 1st F is for t-1, 2nd F for t
    chain_energy = K.expand_dims(chain_energy, 0)
    # shape=(B, F), dtype=float32
    prev_target_val = K.zeros_like(input_energy[:, 0, :])

    if go_backwards:
        input_energy = K.reverse(input_energy, 1)
        if mask is not None:
            mask = K.reverse(mask, 1)

    # Second state is a (B, 1) zero tensor carried alongside the values.
    initial_states = [
        prev_target_val,
        K.zeros_like(prev_target_val[:, :1])
    ]
    constants = [chain_energy]

    if mask is not None:
        # Pad the mask with one trailing zero step before handing it to the
        # step function as a constant.
        mask2 = K.cast(
            K.concatenate([mask, K.zeros_like(mask[:, :1])], axis=1),
            K.floatx())
        constants.append(mask2)

    def _step(input_energy_i, states):
        return self.step(input_energy_i, states, return_logZ)

    target_val_last, target_val_seq, _ = K.rnn(_step,
                                               input_energy,
                                               initial_states,
                                               constants=constants,
                                               input_length=input_length,
                                               unroll=self.unroll)

    if return_sequences:
        if go_backwards:
            target_val_seq = K.reverse(target_val_seq, 1)
        return target_val_seq
    else:
        return target_val_last
def call(self, inputs):
    """Build the one-hot "different segment" indicator and return the embeddings.

    ``inputs`` is the pair ``(segment, memory)``. Zeros shaped like
    ``memory[:, :, 0]`` are prepended to ``segment`` along axis 1, then every
    position of ``segment`` is compared against every position of that
    extended sequence. The inequality is one-hot encoded with depth 2.

    Returns:
        ``[relative, self.embeddings + 0.0]``.
    """
    segment, memory = inputs

    memory_ids = K.zeros_like(memory[:, :, 0])
    extended = K.concatenate([memory_ids, segment], axis=1)

    query_ids = K.expand_dims(segment, axis=-1)
    key_ids = K.expand_dims(extended, axis=1)
    differs = K.cast(K.not_equal(query_ids, key_ids), 'uint8')

    return [K.one_hot(differs, 2), self.embeddings + 0.0]
def atrous_spatial_pyramid_pooling(input_layer,
                                   global_image_pooling_upsampling_factor=None
                                   ):
    """Build the ASPP branches on ``input_layer`` and concatenate them.

    Branches: a 1x1 convolution, three 3x3 separable convolutions with
    dilation rates 6, 12 and 18 (each followed by batch norm and ReLU), and,
    when ``global_image_pooling_upsampling_factor`` is given, a global
    average pooling branch that is upsampled bilinearly by that factor.
    """
    # branch: 1x1 conv
    branches = [
        _Conv2D(input_layer,
                filters=256,
                kernel_size=1,
                name='aspp_0_conv',
                bn_epsilon=1e-5)
    ]

    # branches: 3x3 separable conv at rates 6, 12, 18 (layer names kept)
    dilated_specs = (
        (6, 'aspp_1_sepconv'),
        (12, 'aspp_2_sepconv'),
        (18, 'pyramid_3x3sepconv'),
    )
    for rate, layer_name in dilated_specs:
        branch = SeparableConv2D(filters=256,
                                 kernel_size=3,
                                 padding='same',
                                 dilation_rate=rate,
                                 use_bias=False,
                                 name=layer_name)(input_layer)
        branch = BatchNormalization(name=layer_name + '_bn',
                                    epsilon=1e-5)(branch)
        branches.append(ReLU()(branch))

    if global_image_pooling_upsampling_factor is not None:
        # branch: global image pooling
        pooled = GlobalAveragePooling2D(name='pyramid_img_pool')(input_layer)
        # (batch size x channels)->(batch size x 1 x 1 x channels)
        pooled = Lambda(
            lambda x: K.expand_dims(K.expand_dims(x, 1), 1))(pooled)
        pooled = Conv2D(filters=256,
                        kernel_size=1,
                        padding='same',
                        use_bias=False,
                        name='pyramid_img_pool_conv')(pooled)
        pooled = BatchNormalization(name='pyramid_img_pool_conv_bn')(pooled)
        pooled = ReLU()(pooled)
        pooled = UpSampling2D(global_image_pooling_upsampling_factor,
                              interpolation='bilinear')(pooled)
        branches.append(pooled)

    return Concatenate()(branches)
def yolo2_loss(args, anchors, num_classes, label_smoothing=0, use_crossentropy_loss=False, use_crossentropy_obj_loss=False, rescore_confidence=False, use_diou_loss=False):
    """YOLOv2 loss function.

    Parameters
    ----------
    args : sequence
        ``(yolo_output, true_boxes, y_true)``:

        yolo_output : tensor
            Final convolutional layer features.
        true_boxes : tensor
            Ground truth boxes tensor with shape [batch, num_true_boxes, 5]
            containing box x_center, y_center, width, height, and class.
        y_true : array
            output of preprocess_true_boxes, with shape
            [conv_height, conv_width, num_anchors, 6]. Read here as
            box (0:4), object flag (4:5) and class index (5).
    anchors : tensor
        Anchor boxes for model.
    num_classes : int
        Number of object classes.
    label_smoothing : float, default=0
        When truthy, the one-hot class targets are passed through
        ``_smooth_labels`` with this value.
    use_crossentropy_loss : bool, default=False
        Use categorical cross-entropy instead of squared error for the
        classification term.
    use_crossentropy_obj_loss : bool, default=False
        Use binary cross-entropy instead of squared error for the
        confidence terms.
    rescore_confidence : bool, default=False
        If true then set confidence target to IOU of best predicted box with
        the closest matching ground truth box.
    use_diou_loss : bool, default=False
        Use ``1 - DIoU`` as the location term instead of the squared error
        on raw box predictions.

    Returns
    -------
    total_loss : tensor
        Half the sum of the three batch-averaged terms, with one trailing
        axis added.
    location_loss_sum, confidence_loss_sum, classification_loss_sum : tensor
        The individual terms, each summed and divided by the batch size.
    """
    (yolo_output, true_boxes, y_true) = args
    num_anchors = len(anchors)
    yolo_output_shape = K.shape(yolo_output)
    # The input size is taken as 32x the spatial size of the output map.
    input_shape = yolo_output_shape[1:3] * 32
    batch_size_f = K.cast(yolo_output_shape[0], K.dtype(yolo_output)) # batch size, float tensor
    # Fixed weights of the individual loss terms.
    object_scale = 5
    no_object_scale = 1
    class_scale = 1
    location_scale = 1
    pred_xy, pred_wh, pred_confidence, pred_class_prob = yolo2_head(
        yolo_output, anchors, num_classes, input_shape)

    object_mask = y_true[..., 4:5]

    # Expand pred x,y,w,h to allow comparison with ground truth.
    # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params
    pred_boxes = K.concatenate([pred_xy, pred_wh])
    pred_boxes = K.expand_dims(pred_boxes, 4)

    # reshape true_boxes to:
    # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params
    true_boxes_shape = K.shape(true_boxes)
    true_boxes = K.reshape(true_boxes, [
        true_boxes_shape[0], 1, 1, 1, true_boxes_shape[1], true_boxes_shape[2]
    ])

    iou_scores = box_iou(pred_boxes, true_boxes)
    # NOTE(review): presumably box_iou returns a leading size-1 axis that is
    # removed here -- verify against box_iou.
    iou_scores = K.squeeze(iou_scores, axis=0)

    # Best IOUs for each location.
    best_ious = K.max(iou_scores, axis=4)  # Best IOU scores.
    best_ious = K.expand_dims(best_ious)

    # A detector has found an object if IOU > thresh for some true box.
    object_detections = K.cast(best_ious > 0.6, K.dtype(best_ious))

    # Determine confidence weights from object and no_object weights.
    # NOTE: YOLOv2 does not use binary cross-entropy. Here we try it.
    no_object_weights = (no_object_scale * (1 - object_detections) *
                         (1 - object_mask))
    if use_crossentropy_obj_loss:
        no_objects_loss = no_object_weights * K.binary_crossentropy(
            K.zeros(K.shape(pred_confidence)), pred_confidence, from_logits=False)

        if rescore_confidence:
            objects_loss = (object_scale * object_mask *
                            K.binary_crossentropy(
                                best_ious, pred_confidence, from_logits=False))
        else:
            objects_loss = (
                object_scale * object_mask *
                K.binary_crossentropy(K.ones(K.shape(pred_confidence)),
                                      pred_confidence, from_logits=False))
    else:
        no_objects_loss = no_object_weights * K.square(-pred_confidence)

        if rescore_confidence:
            objects_loss = (object_scale * object_mask *
                            K.square(best_ious - pred_confidence))
        else:
            objects_loss = (object_scale * object_mask *
                            K.square(1 - pred_confidence))
    confidence_loss = objects_loss + no_objects_loss

    # Classification loss for matching detections.
    # NOTE: YOLOv2 does not use categorical cross-entropy loss.
    #       Here we try it.
    matching_classes = K.cast(y_true[..., 5], 'int32')
    matching_classes = K.one_hot(matching_classes, num_classes)

    if label_smoothing:
        matching_classes = _smooth_labels(matching_classes, label_smoothing)

    if use_crossentropy_loss:
        classification_loss = (
            class_scale * object_mask * K.expand_dims(K.categorical_crossentropy(
                matching_classes, pred_class_prob, from_logits=False), axis=-1))
    else:
        classification_loss = (class_scale * object_mask *
                               K.square(matching_classes - pred_class_prob))

    if use_diou_loss:
        # Calculate DIoU loss as location loss
        diou = box_diou(pred_boxes, true_boxes)
        diou = K.squeeze(diou, axis=-1)
        diou_loss = location_scale * object_mask * (1 - diou)
        location_loss = diou_loss
    else:
        # YOLOv2 location loss for matching detection boxes.
        matching_boxes = y_true[..., 0:4]

        feats = K.reshape(yolo_output, [
            -1, yolo_output_shape[1], yolo_output_shape[2], num_anchors,
            num_classes + 5
        ])

        # Unadjusted box predictions for loss.
        # TODO: Remove extra computation shared with yolo2_head.
        raw_pred_boxes = K.concatenate(
            (K.sigmoid(feats[..., 0:2]), feats[..., 2:4]), axis=-1)

        location_loss = (location_scale * object_mask *
                         K.square(matching_boxes - raw_pred_boxes))

    # Each term is summed over all elements and averaged over the batch.
    confidence_loss_sum = K.sum(confidence_loss) / batch_size_f
    classification_loss_sum = K.sum(classification_loss) / batch_size_f
    location_loss_sum = K.sum(location_loss) / batch_size_f
    total_loss = 0.5 * (
        confidence_loss_sum + classification_loss_sum + location_loss_sum)

    # Fit for tf 2.0.0 loss shape
    total_loss = K.expand_dims(total_loss, axis=-1)

    return total_loss, location_loss_sum, confidence_loss_sum, classification_loss_sum
def call(self, inputs, mask=None, **kwargs):
    """Compute self-attention over a sequence and return the attended values.

    Args:
        inputs: Either a single tensor, or a two-element list
            ``[inputs, positions]``. The code indexes ``K.shape(inputs)[1]``
            as the sequence length and permutes the emission with
            ``(0, 2, 1)``, so ``inputs`` is presumably
            ``(batch, seq_len, features)`` -- TODO confirm against callers.
            ``positions`` is cast to int32 and used to gather rows from the
            outputs.
        mask: Optional mask. For list inputs, element ``1`` of the mask list
            is used. May be ``None`` (the default) in either case.
        **kwargs: Accepted for interface compatibility; not read here.

    Returns:
        The attended values ``v``, or ``[v, a]`` (values and attention
        weights) when ``self.return_attention`` is truthy.
    """
    if isinstance(inputs, list):
        inputs, positions = inputs
        positions = K.cast(positions, 'int32')
        # ``mask`` keeps its default of None when no mask is supplied;
        # indexing it unconditionally raised TypeError in that case.
        if mask is not None:
            mask = mask[1]
    else:
        positions = None
    input_len = K.shape(inputs)[1]

    # Raw attention scores (emissions) for every pair of time steps.
    if self.attention_type == SeqSelfAttention.ATTENTION_TYPE_ADD:
        e = self._call_additive_emission(inputs)
    elif self.attention_type == SeqSelfAttention.ATTENTION_TYPE_MUL:
        e = self._call_multiplicative_emission(inputs)

    if self.attention_activation is not None:
        e = self.attention_activation(e)

    # Subtract the row-wise max before exponentiating so exp() cannot
    # overflow; the shift cancels out in the normalisation below.
    e = K.exp(e - K.max(e, axis=-1, keepdims=True))

    if self.attention_width is not None:
        # Band matrix that zeroes scores outside the local attention window.
        ones = tf.ones((input_len, input_len))
        if self.history_only:
            # Keep only the lower band: current and previous positions.
            local = tf.linalg.band_part(
                ones,
                K.minimum(input_len, self.attention_width - 1),
                0,
            )
        else:
            local = tf.linalg.band_part(
                ones,
                K.minimum(input_len, self.attention_width // 2),
                K.minimum(input_len, (self.attention_width - 1) // 2),
            )
        e = e * K.expand_dims(local, 0)

    if mask is not None:
        mask = K.cast(mask, K.floatx())
        mask = K.expand_dims(mask)
        # Apply the mask along both the query axis and (via the transposes)
        # the key axis.
        e = K.permute_dimensions(
            K.permute_dimensions(e * mask, (0, 2, 1)) * mask, (0, 2, 1))

    # a_{t} = \text{softmax}(e_t)
    # keepdims=True lets the row sum broadcast against ``e`` directly.
    s = K.sum(e, axis=-1, keepdims=True)
    a = e / (s + K.epsilon())

    # l_t = \sum_{t'} a_{t, t'} x_{t'}
    v = K.batch_dot(a, inputs)

    if self.attention_regularizer_weight > 0.0:
        self.add_loss(self._attention_regularizer(a))

    if positions is not None:
        # Build (batch_index, position) pairs and gather only those rows.
        pos_num = K.shape(positions)[1]
        batch_indices = K.tile(
            K.expand_dims(K.arange(K.shape(inputs)[0]), axis=-1),
            K.stack([1, pos_num]))
        pos_indices = K.stack([batch_indices, positions], axis=-1)
        v = tf.gather_nd(v, pos_indices)
        a = tf.gather_nd(a, pos_indices)

    if self.return_attention:
        return [v, a]
    return v
def yolo_loss(args,
              input_shape,
              anchors,
              anchors_mask,
              num_classes,
              ignore_thresh=0.5,
              balance=(0.4, 1.0, 4),
              box_ratio=0.05,
              obj_ratio=1,
              cls_ratio=0.5 / 4,
              label_smoothing=0.1,
              focal_loss=False,
              focal_loss_ratio=10,
              gamma=2,
              alpha=0.25,
              print_loss=False):
    """Compute the YOLO training loss summed over all feature layers.

    For each feature layer the loss is the sum of a CIoU box-regression term,
    an objectness (confidence) term and a per-class term.

    Args:
        args: Sequence laid out as ``[*model_outputs, *y_true]``; the first
            ``len(anchors_mask)`` entries are predictions, the rest are the
            matching ground-truth tensors. Each ground-truth tensor is read
            as ``[..., 0:4]`` box, ``[..., 4:5]`` objectness, ``[..., 5:]``
            class probabilities (e.g. shape ``(m, 13, 13, 3, 85)``).
        input_shape: Network input size (e.g. 416, 416); cast to the dtype
            of ``y_true[0]`` and forwarded to ``get_anchors_and_decode``.
        anchors: Anchor collection indexed with ``anchors_mask[layer]`` --
            presumably a NumPy array; TODO confirm.
        anchors_mask: Per-layer anchor index groups; its length defines the
            number of feature layers.
        num_classes: Number of object classes.
        ignore_thresh: Predictions whose best IoU with any ground-truth box
            is below this value are kept as negatives; the others are
            excluded from the negative confidence loss.
        balance: Per-layer weight applied to the confidence loss.
        box_ratio: Weight of the box-regression loss.
        obj_ratio: Weight of the confidence loss.
        cls_ratio: Weight of the classification loss.
        label_smoothing: If truthy, passed to ``_smooth_labels`` together
            with the ground-truth class probabilities.
        focal_loss: Use the focal-loss form of the confidence term.
        focal_loss_ratio: Multiplier for the focal confidence loss.
        gamma: Focal-loss focusing exponent.
        alpha: Focal-loss weight of positives (negatives get ``1 - alpha``).
        print_loss: If true, wrap the running loss in a print op that logs
            the loss components of each layer.

    Returns:
        The accumulated loss over all feature layers.
    """
    num_layers = len(anchors_mask)

    # Split predictions from ground truth.
    y_true = args[num_layers:]
    yolo_outputs = args[:num_layers]

    input_shape = K.cast(input_shape, K.dtype(y_true[0]))

    # m is the batch size.
    m = K.shape(yolo_outputs[0])[0]

    loss = 0
    for layer in range(num_layers):
        # 1 where a ground-truth object is assigned, e.g. (m, 13, 13, 3, 1).
        object_mask = y_true[layer][..., 4:5]
        # Ground-truth class probabilities, e.g. (m, 13, 13, 3, 80).
        true_class_probs = y_true[layer][..., 5:]
        if label_smoothing:
            true_class_probs = _smooth_labels(true_class_probs,
                                              label_smoothing)

        # Decode the raw layer output. Per the original notes the helper
        # returns (grid, raw_pred, pred_xy, pred_wh); the grid is unused.
        _, raw_pred, pred_xy, pred_wh = get_anchors_and_decode(
            yolo_outputs[layer], anchors[anchors_mask[layer]], num_classes,
            input_shape, calc_loss=True)

        # Decoded predicted boxes, e.g. (m, 13, 13, 3, 4).
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Collect one ignore mask per image in a dynamically sized array.
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1,
                                     dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, ignore_mask):
            """Write the ignore mask of image ``b`` into the tensor array."""
            # Ground-truth boxes present in this image: (n, 4).
            true_box = tf.boolean_mask(y_true[layer][b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            # IoU of every predicted box with every ground-truth box.
            iou = box_iou(pred_box[b], true_box)
            # Best overlap of each predicted box with any ground-truth box.
            best_iou = K.max(iou, axis=-1)
            # 1 -> low overlap, usable as a negative sample.
            # 0 -> already overlaps a ground-truth box well, so it is not
            #      penalised as background.
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        # Iterate over the images of the batch.
        _, ignore_mask = tf.while_loop(lambda b, *_: b < m, loop_body,
                                       [0, ignore_mask])

        # Stack to (m, 13, 13, 3), then add a trailing axis.
        ignore_mask = ignore_mask.stack()
        ignore_mask = K.expand_dims(ignore_mask, -1)

        # CIoU box-regression loss, counted only where an object exists.
        raw_true_box = y_true[layer][..., 0:4]
        ciou = box_ciou(pred_box, raw_true_box)
        ciou_loss = object_mask * (1 - ciou)
        location_loss = K.sum(ciou_loss)

        # Objectness loss: cross-entropy against 1 where an object exists
        # and against 0 elsewhere, with the negative part restricted to the
        # cells selected by ignore_mask.
        conf_logits = raw_pred[..., 4:5]
        conf_bce = K.binary_crossentropy(object_mask, conf_logits,
                                         from_logits=True)
        if focal_loss:
            conf_prob = tf.sigmoid(conf_logits)
            positive_term = (object_mask *
                             (tf.ones_like(conf_logits) - conf_prob) ** gamma *
                             alpha * conf_bce)
            negative_term = ((1 - object_mask) * ignore_mask *
                             conf_prob ** gamma * (1 - alpha) * conf_bce)
            confidence_loss = (positive_term + negative_term) * focal_loss_ratio
        else:
            confidence_loss = (object_mask * conf_bce +
                               (1 - object_mask) * conf_bce * ignore_mask)

        class_loss = object_mask * K.binary_crossentropy(
            true_class_probs, raw_pred[..., 5:], from_logits=True)

        # Positive / negative sample counts, clamped to at least 1 so the
        # divisions below are always defined.
        num_pos = tf.maximum(K.sum(K.cast(object_mask, tf.float32)), 1)
        num_neg = tf.maximum(
            K.sum(K.cast((1 - object_mask) * ignore_mask, tf.float32)), 1)

        # Weight and normalise each term, then accumulate.
        location_loss = location_loss * box_ratio / num_pos
        confidence_loss = K.sum(confidence_loss) * balance[layer] * obj_ratio / (
            num_pos + num_neg)
        class_loss = K.sum(class_loss) * cls_ratio / num_pos / num_classes

        loss += location_loss + confidence_loss + class_loss

        if print_loss:
            # tf.Print is absent from the TF 2.x top-level namespace; use
            # the compat.v1 alias there.
            print_op = tf.Print if hasattr(tf, 'Print') else tf.compat.v1.Print
            loss = print_op(loss, [
                loss, location_loss, confidence_loss, class_loss,
                tf.shape(ignore_mask)
            ], summarize=100, message='loss: ')
    return loss