def __init__(self, anno_file, anno_file_type, img_size=416):
    """
    Initialise the dataset object.

    Selects the class-name list that matches ``cfg.TRAIN.DATA_TYPE``, builds
    the class-name -> index lookup, loads the annotations and stores the
    augmentation / label hyper-parameters on the instance.

    :param anno_file: passed unchanged to the annotation loader.
    :param anno_file_type: passed unchanged to the annotation loader.
    :param img_size: stored as ``self.img_size`` (default 416).
    """
    # Kept on the instance for multi-training (original author's note;
    # presumably multi-scale training - confirm against the training loop).
    self.img_size = img_size

    # 'VOC' and 'COCO' have dedicated class lists; anything else falls back
    # to the generic dataset section of the config.
    data_type = cfg.TRAIN.DATA_TYPE
    if data_type == 'VOC':
        class_names = cfg.VOC_DATA.CLASSES
    elif data_type == 'COCO':
        class_names = cfg.COCO_DATA.CLASSES
    else:
        class_names = cfg.DATASET.CLASSES
    self.classes = class_names

    self.cross_offset = 0.2
    self.num_classes = len(class_names)
    # Class name -> integer index, in the order the classes are listed.
    self.class_to_id = {name: index for index, name in enumerate(class_names)}

    self.__annotations = self.__load_annotations(anno_file, anno_file_type)

    # Colour-jitter magnitudes.
    self.hue_jitter = 0.005
    self.bright_jitter = 0.25
    self.sat_jitter = 0.25

    self.label_smooth = dataAug.LabelSmooth()
    self.bbox_minsize = 40
def __creat_label(self, bboxes):
    """
    Label assignment: map every ground-truth box of one image to anchors and
    build the per-scale training targets.

    For each box:
      1. Convert its coordinates from "xyxy" to "xywh" and divide them by
         each stride to express the box in grid units of every scale.
      2. On each detection scale, compute the IoU between the scaled box and
         that scale's anchors, centred on the cell containing the box centre.
         Every anchor whose IoU is greater than 0.3 becomes responsible for
         the box.
      3. If no anchor on any scale exceeds 0.3, the single anchor with the
         largest IoU over all scales is used instead.

    Notes:
      * The same box may be assigned to several anchors, on the same or on
        different scales, so the number of recorded boxes may exceed the
        number of input boxes.
      * Known limitation: when several boxes map to the same anchor of the
        same cell, the anchor ends up assigned to the last of them.
      * Known limitation: 150 box slots per scale is a fixed prior value
        (memory-hungry; the original note says Darknet uses 30). Once a
        scale holds more than 150 boxes the oldest slots are overwritten.
      * Cell indices are clamped to the grid, so a box centre lying outside
        the image is assigned to the nearest border cell instead of wrapping
        around to the opposite edge (negative index) or raising IndexError.

    :param bboxes: iterable of rows ``[x1, y1, x2, y2, class_index, mix]``.
        Rows must support element-wise arithmetic on slices (i.e. NumPy
        arrays).
    :return: ``(label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes,
        lbboxes)``. Each ``label_*`` array has shape
        ``(S, S, anchors_per_scale, 6 + num_classes)`` with
        ``S = int(img_size / stride)`` and the last axis laid out as
        ``[x, y, w, h (unscaled), objectness, mix, smoothed one-hot]``.
        Each ``*bboxes`` array has shape ``(150, 4)`` and lists the unscaled
        xywh boxes assigned to that scale. Scales follow the order of
        ``cfg.MODEL["STRIDES"]``.
    """
    anchors = np.array(cfg.MODEL["ANCHORS"])
    strides = np.array(cfg.MODEL["STRIDES"])
    anchors_per_scale = cfg.MODEL["ANCHORS_PER_SCLAE"]
    train_output_size = self.img_size / strides

    num_scales = 3  # the 6-tuple return value fixes this to three scales
    max_bboxes = 150  # box slots kept per scale
    iou_threshold = 0.3

    label = [
        np.zeros(
            (
                int(train_output_size[i]),
                int(train_output_size[i]),
                anchors_per_scale,
                6 + self.num_classes,
            )
        )
        for i in range(num_scales)
    ]
    for scale_label in label:
        scale_label[..., 5] = 1.0  # mix weight defaults to 1 everywhere

    bboxes_xywh = [np.zeros((max_bboxes, 4)) for _ in range(num_scales)]
    bbox_count = [0] * num_scales

    # Loop-invariant: build the smoothing callable once, not once per box.
    label_smooth = dataAug.LabelSmooth()

    def _assign(scale, anchor_sel, cell, box_xywh, mix, class_target):
        # Write one box into the target tensor and the box list of `scale`.
        # `anchor_sel` is either a boolean mask over anchors or one index.
        target = label[scale]
        rows, cols = target.shape[:2]
        # Clamp so an out-of-image centre cannot wrap around or overflow.
        xind = min(max(int(cell[0]), 0), cols - 1)
        yind = min(max(int(cell[1]), 0), rows - 1)

        target[yind, xind, anchor_sel, 0:4] = box_xywh
        target[yind, xind, anchor_sel, 4:5] = 1.0
        target[yind, xind, anchor_sel, 5:6] = mix
        target[yind, xind, anchor_sel, 6:] = class_target

        slot = bbox_count[scale] % max_bboxes
        bboxes_xywh[scale][slot, :4] = box_xywh
        bbox_count[scale] += 1

    for bbox in bboxes:
        bbox_coor = bbox[:4]
        bbox_class_ind = int(bbox[4])
        bbox_mix = bbox[5]

        # Smoothed one-hot class target.
        one_hot = np.zeros(self.num_classes, dtype=np.float32)
        one_hot[bbox_class_ind] = 1.0
        one_hot_smooth = label_smooth(one_hot, self.num_classes)

        # Convert "xyxy" to "xywh".
        bbox_xywh = np.concatenate(
            [
                (bbox_coor[2:] + bbox_coor[:2]) * 0.5,
                bbox_coor[2:] - bbox_coor[:2],
            ],
            axis=-1,
        )
        # One row per scale: the box expressed in that scale's grid units.
        bbox_xywh_scaled = (
            1.0 * bbox_xywh[np.newaxis, :] / strides[:, np.newaxis]
        )
        # Integer (x, y) cell containing the box centre on every scale.
        cells = np.floor(bbox_xywh_scaled[:, 0:2]).astype(np.int32)

        iou = []
        exist_positive = False
        for i in range(num_scales):
            anchors_xywh = np.zeros((anchors_per_scale, 4))
            anchors_xywh[:, 0:2] = cells[i] + 0.5  # anchor sits at the cell centre
            anchors_xywh[:, 2:4] = anchors[i]

            iou_scale = tools.iou_xywh_numpy(
                bbox_xywh_scaled[i][np.newaxis, :], anchors_xywh
            )
            iou.append(iou_scale)
            iou_mask = iou_scale > iou_threshold

            if np.any(iou_mask):
                _assign(i, iou_mask, cells[i], bbox_xywh, bbox_mix, one_hot_smooth)
                exist_positive = True

        if not exist_positive:
            # No anchor passed the threshold: fall back to the best one.
            best_anchor_ind = int(np.argmax(np.array(iou).reshape(-1), axis=-1))
            best_detect, best_anchor = divmod(
                best_anchor_ind, int(anchors_per_scale)
            )
            _assign(
                best_detect,
                best_anchor,
                cells[best_detect],
                bbox_xywh,
                bbox_mix,
                one_hot_smooth,
            )

    label_sbbox, label_mbbox, label_lbbox = label
    sbboxes, mbboxes, lbboxes = bboxes_xywh

    return label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes
def creat_label(self, bboxes):
    """
    Label assignment: map every ground-truth box of one image to anchors and
    build the per-scale training targets.

    For each box:
      1. Convert its coordinates from "xyxy" to "xywh" and divide them by
         each stride to express the box in grid units of every scale.
      2. On each detection scale, compute the IoU between the scaled box and
         that scale's anchors, centred on the cell containing the box centre.
         Every anchor whose IoU is greater than 0.3 becomes responsible for
         the box (all qualifying anchors of the cell, not just the best one).
      3. If no anchor on any scale exceeds 0.3, the single anchor with the
         largest IoU over all scales is used instead.

    Notes:
      * The same box may be assigned to several anchors, on the same or on
        different scales, and even within the same cell, so the number of
        recorded boxes may exceed the number of input boxes.
      * Known limitation: when several boxes map to the same anchor of the
        same cell, the anchor ends up assigned to the last of them.
      * Known limitation: 150 box slots per scale is a fixed prior value
        (memory-hungry; the original note says Darknet uses 30). Once a
        scale holds more than 150 boxes the oldest slots are overwritten.
      * Cell indices are clamped to the grid, so a box centre lying outside
        the image is assigned to the nearest border cell instead of wrapping
        around to the opposite edge (negative index) or raising IndexError.

    :param bboxes: iterable of rows ``[x1, y1, x2, y2, class_index, mix]``.
        Rows must support element-wise arithmetic on slices (i.e. NumPy
        arrays).
    :return: ``(label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes,
        lbboxes)``. Each ``label_*`` array has shape
        ``(S, S, anchors_per_scale, 6 + num_classes)`` with
        ``S = int(img_size / stride)`` and the last axis laid out as
        ``[x, y, w, h (unscaled), objectness, mix, smoothed one-hot]``.
        Each ``*bboxes`` array has shape ``(150, 4)`` and lists the unscaled
        xywh boxes assigned to that scale. Scales follow the order of
        ``self.cfg_MODEL["STRIDES"]``.
    """
    anchors = np.array(self.cfg_MODEL["ANCHORS"])
    strides = np.array(self.cfg_MODEL["STRIDES"])
    anchors_per_scale = self.cfg_MODEL["ANCHORS_PER_SCLAE"]
    train_output_size = self.img_size / strides

    num_scales = 3  # the 6-tuple return value fixes this to three scales
    max_bboxes = 150  # box slots kept per scale
    iou_threshold = 0.3

    # One target tensor per scale:
    # (grid, grid, anchors_per_scale, 6 + num_classes).
    label = [
        np.zeros(
            (
                int(train_output_size[i]),
                int(train_output_size[i]),
                anchors_per_scale,
                6 + self.num_classes,
            )
        )
        for i in range(num_scales)
    ]
    for scale_label in label:
        scale_label[..., 5] = 1.0  # mix weight defaults to 1 everywhere

    bboxes_xywh = [np.zeros((max_bboxes, 4)) for _ in range(num_scales)]
    bbox_count = [0] * num_scales

    # Loop-invariant: build the smoothing callable once, not once per box.
    label_smooth = dataAug.LabelSmooth()

    def _assign(scale, anchor_sel, cell, box_xywh, mix, class_target):
        # Write one box into the target tensor and the box list of `scale`.
        # `anchor_sel` is either a boolean mask over anchors or one index.
        target = label[scale]
        rows, cols = target.shape[:2]
        # Clamp so an out-of-image centre cannot wrap around or overflow.
        xind = min(max(int(cell[0]), 0), cols - 1)
        yind = min(max(int(cell[1]), 0), rows - 1)

        target[yind, xind, anchor_sel, 0:4] = box_xywh
        # Objectness / confidence target of a matched anchor is 1.
        target[yind, xind, anchor_sel, 4:5] = 1.0
        target[yind, xind, anchor_sel, 5:6] = mix
        target[yind, xind, anchor_sel, 6:] = class_target

        slot = bbox_count[scale] % max_bboxes
        bboxes_xywh[scale][slot, :4] = box_xywh
        bbox_count[scale] += 1

    # Process the ground-truth boxes one by one.
    for bbox in bboxes:
        bbox_coor = bbox[:4]
        bbox_class_ind = int(bbox[4])
        bbox_mix = bbox[5]

        # Smoothed one-hot class target.
        one_hot = np.zeros(self.num_classes, dtype=np.float32)
        one_hot[bbox_class_ind] = 1.0
        one_hot_smooth = label_smooth(one_hot, self.num_classes)

        # Convert "xyxy" to "xywh".
        bbox_xywh = np.concatenate(
            [
                (bbox_coor[2:] + bbox_coor[:2]) * 0.5,
                bbox_coor[2:] - bbox_coor[:2],
            ],
            axis=-1,
        )
        # Scale the box to grid units: one row per scale (the original note
        # gives a shape of [3, 4] and cell sizes of 8, 16 and 32 pixels).
        bbox_xywh_scaled = (
            1.0 * bbox_xywh[np.newaxis, :] / strides[:, np.newaxis]
        )
        # Integer (x, y) cell containing the box centre on every scale.
        cells = np.floor(bbox_xywh_scaled[:, 0:2]).astype(np.int32)

        iou = []
        exist_positive = False
        for i in range(num_scales):
            anchors_xywh = np.zeros((anchors_per_scale, 4))
            anchors_xywh[:, 0:2] = cells[i] + 0.5  # anchor sits at the cell centre
            anchors_xywh[:, 2:4] = anchors[i]

            iou_scale = tools.iou_xywh_numpy(
                bbox_xywh_scaled[i][np.newaxis, :], anchors_xywh
            )
            iou.append(iou_scale)
            iou_mask = iou_scale > iou_threshold

            if np.any(iou_mask):
                _assign(i, iou_mask, cells[i], bbox_xywh, bbox_mix, one_hot_smooth)
                exist_positive = True

        if not exist_positive:
            # No anchor passed the threshold: fall back to the best one.
            best_anchor_ind = int(np.argmax(np.array(iou).reshape(-1), axis=-1))
            best_detect, best_anchor = divmod(
                best_anchor_ind, int(anchors_per_scale)
            )
            _assign(
                best_detect,
                best_anchor,
                cells[best_detect],
                bbox_xywh,
                bbox_mix,
                one_hot_smooth,
            )

    label_sbbox, label_mbbox, label_lbbox = label
    sbboxes, mbboxes, lbboxes = bboxes_xywh

    return label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes