def yolo_head(feats, anchors, num_classes): # convert anchors to shape 1 , 1 , 1 , len of anchors , 2 num_anchors = len(anchors) anchors_tensor = K.reshape(K.variable(anchors), [1, 1, 1, num_anchors, 2]) # conv_dims , width and hight of the grid _, conv_height, conv_width, _ = K.int_shape(feats) conv_dims = K.variable([conv_width, conv_height]) # reshape yolo network output to None , grid_width , grid_hight , num of amnchors , num of classes + 5 feats = K.reshape( feats, [-1, conv_dims[0], conv_dims[1], num_anchors, num_classes + 5]) # convert conv_dims after casting it to feats datatype to 1 , 1 , 1 , 2 conv_dims = K.cast(K.reshape(conv_dims, [1, 1, 1, 1, 2]), K.dtype(feats)) # create grid from (0 , 0 ) to (width , hight) conv_index = np.array([_ for _ in np.ndindex(conv_width, conv_height)]) conv_index = conv_index[:, [1, 0]] # swap columns for YOLO ordering. conv_index = K.variable( conv_index.reshape(1, conv_height, conv_width, 1, 2)) box_confidence = K.sigmoid(feats[..., 4:5]) box_xy = K.sigmoid(feats[..., :2]) box_wh = K.exp(feats[..., 2:4]) box_class_probs = K.softmax(feats[..., 5:]) box_xy = (box_xy + conv_index) / conv_dims box_wh = box_wh * anchors_tensor / conv_dims return box_confidence, box_xy, box_wh, box_class_probs
def yolo_head(feats, anchors, num_classes, input_shape): """Convert final layer features to bounding box parameters.""" num_anchors = len(anchors) # Reshape to batch, height, width, num_anchors, box_params. anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2]) grid_shape = K.shape(feats)[1:3] # height, width grid_y = K.tile(K.reshape(K.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1]), [1, grid_shape[1], 1, 1]) grid_x = K.tile(K.reshape(K.arange(0, stop=grid_shape[1]), [1, -1, 1, 1]), [grid_shape[0], 1, 1, 1]) grid = K.concatenate([grid_x, grid_y]) grid = K.cast(grid, K.dtype(feats)) feats = K.reshape( feats, [-1, grid_shape[0], grid_shape[1], num_anchors, num_classes + 5]) box_xy = K.sigmoid(feats[..., :2]) box_wh = K.exp(feats[..., 2:4]) box_confidence = K.sigmoid(feats[..., 4:5]) box_class_probs = K.sigmoid(feats[..., 5:]) # Adjust preditions to each spatial grid point and anchor size. box_xy = (box_xy + grid) / K.cast(grid_shape[::-1], K.dtype(feats)) box_wh = box_wh * anchors_tensor / K.cast(input_shape[::-1], K.dtype(feats)) return box_xy, box_wh, box_confidence, box_class_probs
def call(self, inputs, output_shape=None): updates, mask = inputs[0], inputs[1] with tf.compat.v1.variable_scope(self.name): mask = K.cast(mask, "int32") input_shape = tf.shape(updates, out_type="int32") # calculation new shape if output_shape is None: output_shape = ( input_shape[0], input_shape[1] * self.size[0], input_shape[2] * self.size[1], input_shape[3], ) self.output_shape1 = output_shape # calculation indices for batch, height, width and feature maps one_like_mask = K.ones_like(mask, dtype="int32") batch_shape = K.concatenate([[input_shape[0]], [1], [1], [1]], axis=0) batch_range = K.reshape(tf.range(output_shape[0], dtype="int32"), shape=batch_shape) b = one_like_mask * batch_range y = mask // (output_shape[2] * output_shape[3]) x = (mask // output_shape[3]) % output_shape[2] feature_range = tf.range(output_shape[3], dtype="int32") f = one_like_mask * feature_range # transpose indices & reshape update values to one dimension updates_size = tf.size(updates) indices = K.transpose( K.reshape(K.stack([b, y, x, f]), [4, updates_size])) values = K.reshape(updates, [updates_size]) ret = tf.scatter_nd(indices, values, output_shape) return ret
def call(self, QKVs): """Core logic of multi-head self attention. Args: QKVs (list): inputs of multi-head self attention i.e. query, key and value. Returns: object: ouput tensors. """ if len(QKVs) == 3: Q_seq, K_seq, V_seq = QKVs Q_len, V_len = None, None elif len(QKVs) == 5: Q_seq, K_seq, V_seq, Q_len, V_len = QKVs Q_seq = K.dot(Q_seq, self.WQ) Q_seq = K.reshape(Q_seq, shape=(-1, K.shape(Q_seq)[1], self.multiheads, self.head_dim)) Q_seq = K.permute_dimensions(Q_seq, pattern=(0, 2, 1, 3)) K_seq = K.dot(K_seq, self.WK) K_seq = K.reshape(K_seq, shape=(-1, K.shape(K_seq)[1], self.multiheads, self.head_dim)) K_seq = K.permute_dimensions(K_seq, pattern=(0, 2, 1, 3)) V_seq = K.dot(V_seq, self.WV) V_seq = K.reshape(V_seq, shape=(-1, K.shape(V_seq)[1], self.multiheads, self.head_dim)) V_seq = K.permute_dimensions(V_seq, pattern=(0, 2, 1, 3)) A = einsum("abij, abkj -> abik", Q_seq, K_seq) / K.sqrt( K.cast(self.head_dim, dtype="float32")) A = K.permute_dimensions( A, pattern=(0, 3, 2, 1) ) # A.shape=[batch_size,K_sequence_length,Q_sequence_length,self.multiheads] A = self.Mask(A, V_len, "add") A = K.permute_dimensions(A, pattern=(0, 3, 2, 1)) if self.mask_right: ones = K.ones_like(A[:1, :1]) lower_triangular = K.tf.matrix_band_part(ones, num_lower=-1, num_upper=0) mask = (ones - lower_triangular) * 1e12 A = A - mask A = K.softmax(A) O_seq = einsum("abij, abjk -> abik", A, V_seq) O_seq = K.permute_dimensions(O_seq, pattern=(0, 2, 1, 3)) O_seq = K.reshape(O_seq, shape=(-1, K.shape(O_seq)[1], self.output_dim)) O_seq = self.Mask(O_seq, Q_len, "mul") return O_seq
def yolo_boxes_and_scores(feats, anchors, num_classes, input_shape, image_shape): '''Process Conv layer output''' box_xy, box_wh, box_confidence, box_class_probs = yolo_head(feats, anchors, num_classes, input_shape) boxes = yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape) boxes = K.reshape(boxes, [-1, 4]) box_scores = box_confidence * box_class_probs box_scores = K.reshape(box_scores, [-1, num_classes]) return boxes, box_scores
def _boxes_and_scores(self, feats, anchors, num_classes, input_shape, image_shape): """Process Convolutional layer output""" box_xy, box_wh, box_confidence, box_class_probs = self._yolo_head( feats, anchors, num_classes, input_shape) boxes = self._correct_boxes(box_xy, box_wh, input_shape, image_shape) boxes = K.reshape(boxes, [-1, 4]) box_scores = box_confidence * box_class_probs box_scores = K.reshape(box_scores, [-1, num_classes]) return boxes, box_scores
def yolo_eval(yolo_outputs, image_shape, max_boxes=10, score_threshold=.6, iou_threshold=.5): """Evaluate YOLO model on given input batch and return filtered boxes.""" box_xy, box_wh, box_confidence, box_class_probs = yolo_outputs boxes = yolo_boxes_to_corners(box_xy, box_wh) boxes, scores, classes = yolo_filter_boxes( boxes, box_confidence, box_class_probs, threshold=score_threshold) # Scale boxes back to original image shape. height = image_shape[0] width = image_shape[1] image_dims = K.stack([height, width, height, width]) image_dims = K.reshape(image_dims, [1, 4]) boxes = boxes * image_dims # TODO: Something must be done about this ugly hack! max_boxes_tensor = K.variable(max_boxes, dtype='int32') K.get_session().run(tf.compat.v1.variables_initializer([max_boxes_tensor])) nms_index = tf.image.non_max_suppression( boxes, scores, max_boxes_tensor, iou_threshold=iou_threshold) boxes = K.gather(boxes, nms_index) scores = K.gather(scores, nms_index) classes = K.gather(classes, nms_index) return boxes, scores, classes
def path_energy0(y, x, U, mask=None): """Path energy without boundary potential handling.""" n_classes = K.shape(x)[2] y_one_hot = K.one_hot(y, n_classes) # Tag path energy energy = K.sum(x * y_one_hot, 2) energy = K.sum(energy, 1) # Transition energy y_t = y[:, :-1] y_tp1 = y[:, 1:] U_flat = K.reshape(U, [-1]) # Convert 2-dim indices (y_t, y_tp1) of U to 1-dim indices of U_flat: flat_indices = y_t * n_classes + y_tp1 U_y_t_tp1 = K.gather(U_flat, flat_indices) if mask is not None: mask = K.cast(mask, K.floatx()) y_t_mask = mask[:, :-1] y_tp1_mask = mask[:, 1:] U_y_t_tp1 *= y_t_mask * y_tp1_mask energy += K.sum(U_y_t_tp1, axis=1) return energy
def scale_boxes(boxes, image_shape): """ Scales the predicted boxes in order to be drawable on the image""" height = image_shape[0] width = image_shape[1] image_dims = K.stack([height, width, height, width]) image_dims = K.reshape(image_dims, [1, 4]) boxes = boxes * image_dims return boxes
def yolo_head(feats, anchors, num_classes, input_shape, calc_loss=False): """Convert final layer features to bounding box parameters.""" num_anchors = anchors_per_level # Reshape to batch, height, width, num_anchors, box_params. anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2]) grid_shape = K.shape(feats)[1:3] # height, width grid_y = K.tile( tf.reshape(K.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1], name='yolo_head/tile/reshape/grid_y'), [1, grid_shape[1], 1, 1]) grid_x = K.tile( tf.reshape(K.arange(0, stop=grid_shape[1]), [1, -1, 1, 1], name='yolo_head/tile/reshape/grid_x'), [grid_shape[0], 1, 1, 1]) grid = tf.concat([grid_x, grid_y], axis=-1, name='yolo_head/concatenate/grid') grid = K.cast(grid, K.dtype(feats)) feats = tf.reshape(feats, [ -1, grid_shape[0], grid_shape[1], num_anchors, num_classes + 5 + NUM_ANGLES3 ], name='yolo_head/reshape/feats') # Adjust predictions to each spatial grid point and anchor size. box_xy = (K.sigmoid(feats[..., :2]) + grid) / K.cast( grid_shape[..., ::-1], K.dtype(feats)) box_wh = K.exp(feats[..., 2:4]) * anchors_tensor / K.cast( input_shape[..., ::-1], K.dtype(feats)) box_confidence = K.sigmoid(feats[..., 4:5]) box_class_probs = K.sigmoid(feats[..., 5:5 + num_classes]) polygons_confidence = K.sigmoid(feats[..., 5 + num_classes + 2:5 + num_classes + NUM_ANGLES3:3]) polygons_x = K.exp(feats[..., 5 + num_classes:num_classes + 5 + NUM_ANGLES3:3]) dx = K.square(anchors_tensor[..., 0:1] / 2) dy = K.square(anchors_tensor[..., 1:2] / 2) d = K.cast(K.sqrt(dx + dy), K.dtype(polygons_x)) a = K.pow(input_shape[..., ::-1], 2) a = K.cast(a, K.dtype(feats)) b = K.sum(a) diagonal = K.cast(K.sqrt(b), K.dtype(feats)) polygons_x = polygons_x * d / diagonal polygons_y = feats[..., 5 + num_classes + 1:num_classes + 5 + NUM_ANGLES3:3] polygons_y = K.sigmoid(polygons_y) if calc_loss == True: return grid, feats, box_xy, box_wh, polygons_confidence return box_xy, box_wh, box_confidence, box_class_probs, polygons_x, polygons_y, polygons_confidence
def _yolo_head(self, feats, anchors, num_classes, input_shape, calc_loss=False): """Convert final layer features to bounding box parameters""" num_anchors = len(anchors) anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2]) # height, width grid_shape = K.shape(feats)[1:3] grid_y = K.tile( K.reshape(K.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1]), [1, grid_shape[1], 1, 1], ) grid_x = K.tile( K.reshape(K.arange(0, stop=grid_shape[1]), [1, -1, 1, 1]), [grid_shape[0], 1, 1, 1], ) grid = K.concatenate([grid_x, grid_y]) grid = K.cast(grid, K.dtype(feats)) feats = K.reshape( feats, [-1, grid_shape[0], grid_shape[1], num_anchors, num_classes + 5]) # Adjust preditions to each spatial grid point and anchor size. box_xy = (K.sigmoid(feats[..., :2]) + grid) / K.cast( grid_shape[::-1], K.dtype(feats)) box_wh = (K.exp(feats[..., 2:4]) * anchors_tensor / K.cast(input_shape[::-1], K.dtype(feats))) box_confidence = K.sigmoid(feats[..., 4:5]) box_class_probs = K.sigmoid(feats[..., 5:]) if calc_loss == True: return grid, feats, box_xy, box_wh return box_xy, box_wh, box_confidence, box_class_probs
def call(self, x, mask=None): features_dim = self.features_dim step_dim = self.step_dim e = K.reshape( K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim)) # e = K.dot(x, self.W) if self.bias: e += self.b e = K.tanh(e) a = K.exp(e) # apply mask after the exp. will be re-normalized next if mask is not None: # cast the mask to floatX to avoid float64 upcasting in theano a *= K.cast(mask, K.floatx()) # in some cases especially in the early stages of training the sum may be almost zero # and this results in NaN's. A workaround is to add a very small positive number ε to the sum. a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) a = K.expand_dims(a) c = K.sum(a * x, axis=1) return c
def yolo_head(feats, anchors, num_classes): """Convert final layer features to bounding box parameters. Parameters ---------- feats : tensor Final convolutional layer features. anchors : array-like Anchor box widths and heights. num_classes : int Number of target classes. Returns ------- box_xy : tensor x, y box predictions adjusted by spatial location in conv layer. box_wh : tensor w, h box predictions adjusted by anchors and conv spatial resolution. box_conf : tensor Probability estimate for whether each box contains any object. box_class_pred : tensor Probability distribution estimate for each box over class labels. """ num_anchors = len(anchors) # Reshape to batch, height, width, num_anchors, box_params. anchors_tensor = K.reshape(K.variable(anchors), [1, 1, 1, num_anchors, 2]) # Static implementation for fixed models. # TODO: Remove or add option for static implementation. # _, conv_height, conv_width, _ = K.int_shape(feats) # conv_dims = K.variable([conv_width, conv_height]) # Dynamic implementation of conv dims for fully convolutional model. conv_dims = K.shape(feats)[1:3] # assuming channels last # In YOLO the height index is the inner most iteration. conv_height_index = K.arange(0, stop=conv_dims[0]) conv_width_index = K.arange(0, stop=conv_dims[1]) conv_height_index = K.tile(conv_height_index, [conv_dims[1]]) # TODO: Repeat_elements and tf.split doesn't support dynamic splits. # conv_width_index = K.repeat_elements(conv_width_index, conv_dims[1], axis=0) conv_width_index = K.tile( K.expand_dims(conv_width_index, 0), [conv_dims[0], 1]) conv_width_index = K.flatten(K.transpose(conv_width_index)) conv_index = K.transpose(K.stack([conv_height_index, conv_width_index])) conv_index = K.reshape(conv_index, [1, conv_dims[0], conv_dims[1], 1, 2]) conv_index = K.cast(conv_index, K.dtype(feats)) feats = K.reshape( feats, [-1, conv_dims[0], conv_dims[1], num_anchors, num_classes + 5]) conv_dims = K.cast(K.reshape(conv_dims, [1, 1, 1, 1, 2]), K.dtype(feats)) # Static generation of conv_index: # conv_index = np.array([_ for _ in np.ndindex(conv_width, conv_height)]) # conv_index = conv_index[:, [1, 0]] # swap columns for YOLO ordering. # conv_index = K.variable( # conv_index.reshape(1, conv_height, conv_width, 1, 2)) # feats = Reshape( # (conv_dims[0], conv_dims[1], num_anchors, num_classes + 5))(feats) box_xy = K.sigmoid(feats[..., :2]) box_wh = K.exp(feats[..., 2:4]) box_confidence = K.sigmoid(feats[..., 4:5]) box_class_probs = K.softmax(feats[..., 5:]) # Adjust preditions to each spatial grid point and anchor size. # Note: YOLO iterates over height index before width index. box_xy = (box_xy + conv_index) / conv_dims box_wh = box_wh * anchors_tensor / conv_dims return box_xy, box_wh, box_confidence, box_class_probs
def yolo_loss(args, anchors, num_classes, rescore_confidence=False, print_loss=False): """YOLO localization loss function. Parameters ---------- yolo_output : tensor Final convolutional layer features. true_boxes : tensor Ground truth boxes tensor with shape [batch, num_true_boxes, 5] containing box x_center, y_center, width, height, and class. detectors_mask : array 0/1 mask for detector positions where there is a matching ground truth. matching_true_boxes : array Corresponding ground truth boxes for positive detector positions. Already adjusted for conv height and width. anchors : tensor Anchor boxes for model. num_classes : int Number of object classes. rescore_confidence : bool, default=False If true then set confidence target to IOU of best predicted box with the closest matching ground truth box. print_loss : bool, default=False If True then use a tf.Print() to print the loss components. Returns ------- mean_loss : float mean localization loss across minibatch """ (yolo_output, true_boxes, detectors_mask, matching_true_boxes) = args num_anchors = len(anchors) object_scale = 5 no_object_scale = 1 class_scale = 1 coordinates_scale = 1 pred_xy, pred_wh, pred_confidence, pred_class_prob = yolo_head( yolo_output, anchors, num_classes) # Unadjusted box predictions for loss. # TODO: Remove extra computation shared with yolo_head. yolo_output_shape = K.shape(yolo_output) feats = K.reshape(yolo_output, [ -1, yolo_output_shape[1], yolo_output_shape[2], num_anchors, num_classes + 5 ]) pred_boxes = K.concatenate( (K.sigmoid(feats[..., 0:2]), feats[..., 2:4]), axis=-1) # TODO: Adjust predictions by image width/height for non-square images? # IOUs may be off due to different aspect ratio. # Expand pred x,y,w,h to allow comparison with ground truth. # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params pred_xy = K.expand_dims(pred_xy, 4) pred_wh = K.expand_dims(pred_wh, 4) pred_wh_half = pred_wh / 2. pred_mins = pred_xy - pred_wh_half pred_maxes = pred_xy + pred_wh_half true_boxes_shape = K.shape(true_boxes) # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params true_boxes = K.reshape(true_boxes, [ true_boxes_shape[0], 1, 1, 1, true_boxes_shape[1], true_boxes_shape[2] ]) true_xy = true_boxes[..., 0:2] true_wh = true_boxes[..., 2:4] # Find IOU of each predicted box with each ground truth box. true_wh_half = true_wh / 2. true_mins = true_xy - true_wh_half true_maxes = true_xy + true_wh_half intersect_mins = K.maximum(pred_mins, true_mins) intersect_maxes = K.minimum(pred_maxes, true_maxes) intersect_wh = K.maximum(intersect_maxes - intersect_mins, 0.) intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1] pred_areas = pred_wh[..., 0] * pred_wh[..., 1] true_areas = true_wh[..., 0] * true_wh[..., 1] union_areas = pred_areas + true_areas - intersect_areas iou_scores = intersect_areas / union_areas # Best IOUs for each location. best_ious = K.max(iou_scores, axis=4) # Best IOU scores. best_ious = K.expand_dims(best_ious) # A detector has found an object if IOU > thresh for some true box. object_detections = K.cast(best_ious > 0.6, K.dtype(best_ious)) # TODO: Darknet region training includes extra coordinate loss for early # training steps to encourage predictions to match anchor priors. # Determine confidence weights from object and no_object weights. # NOTE: YOLO does not use binary cross-entropy here. no_object_weights = (no_object_scale * (1 - object_detections) * (1 - detectors_mask)) no_objects_loss = no_object_weights * K.square(-pred_confidence) if rescore_confidence: objects_loss = (object_scale * detectors_mask * K.square(best_ious - pred_confidence)) else: objects_loss = (object_scale * detectors_mask * K.square(1 - pred_confidence)) confidence_loss = objects_loss + no_objects_loss # Classification loss for matching detections. # NOTE: YOLO does not use categorical cross-entropy loss here. matching_classes = K.cast(matching_true_boxes[..., 4], 'int32') matching_classes = K.one_hot(matching_classes, num_classes) classification_loss = (class_scale * detectors_mask * K.square(matching_classes - pred_class_prob)) # Coordinate loss for matching detection boxes. matching_boxes = matching_true_boxes[..., 0:4] coordinates_loss = (coordinates_scale * detectors_mask * K.square(matching_boxes - pred_boxes)) confidence_loss_sum = K.sum(confidence_loss) classification_loss_sum = K.sum(classification_loss) coordinates_loss_sum = K.sum(coordinates_loss) total_loss = 0.5 * ( confidence_loss_sum + classification_loss_sum + coordinates_loss_sum) if print_loss: total_loss = tf.Print( total_loss, [ total_loss, confidence_loss_sum, classification_loss_sum, coordinates_loss_sum ], message='yolo_loss, conf_loss, class_loss, box_coord_loss:') return total_loss