def encode(self, boxes, labels):
    '''Encode target bounding boxes and class labels.

    SSD coding rules (this variant applies no variance scaling):
      tx = (x - anchor_x) / anchor_w
      ty = (y - anchor_y) / anchor_h
      tw = log(w / anchor_w)
      th = log(h / anchor_h)

    Args:
      boxes: (tensor) bounding boxes of (xmin,ymin,xmax,ymax), sized [#obj,4].
      labels: (tensor) object class labels, sized [#obj,].

    Returns:
      loc_targets: (tensor) encoded bounding boxes, sized [#anchors,4].
      cls_targets: (tensor) encoded class labels, sized [#anchors,].
    '''
    anchor_boxes = self.anchor_boxes
    ious = bbox_iou(anchor_boxes, boxes)
    max_ious, max_ids = ious.max(1)
    boxes = boxes[max_ids]  # for every anchor, the best-overlapping gt box

    boxes = change_bbox_order(boxes, 'xyxy2xywh')
    anchor_boxes = change_bbox_order(anchor_boxes, 'xyxy2xywh')

    loc_xy = (boxes[:, :2] - anchor_boxes[:, :2]) / anchor_boxes[:, 2:]
    loc_wh = torch.log(boxes[:, 2:] / anchor_boxes[:, 2:])
    loc_targets = torch.cat([loc_xy, loc_wh], 1)
    cls_targets = 1 + labels[max_ids]

    # cls_targets[max_ious<0.5] = 0
    # ignore = (max_ious>0.4) & (max_ious<0.5)  # ignore ious between [0.4,0.5]
    # cls_targets[ignore] = -1                  # mark ignored to -1
    return loc_targets, cls_targets
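# Every snippet in this section calls a project-specific bbox_iou helper. As a
# point of reference only (not any one project's implementation), a minimal
# pairwise IoU for (xmin,ymin,xmax,ymax) torch tensors could look like this:
import torch

def bbox_iou_reference(box1, box2):
    """IoU between box1 [N,4] and box2 [M,4]; returns an [N,M] tensor."""
    lt = torch.max(box1[:, None, :2], box2[None, :, :2])  # intersection top-left
    rb = torch.min(box1[:, None, 2:], box2[None, :, 2:])  # intersection bottom-right
    wh = (rb - lt).clamp(min=0)                           # zero for disjoint boxes
    inter = wh[:, :, 0] * wh[:, :, 1]
    area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])
    area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])
    return inter / (area1[:, None] + area2[None, :] - inter)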
def tracker2object(self, boxes: [OBSTACLE], th=0.5):
    # Greedily match tracked obstacles to new detections by IoU.
    n_b = len(boxes)
    n_o = len(self.obstacles)
    iou_mat = np.zeros((n_o, n_b))
    for i in range(n_o):
        for j in range(n_b):
            iou_mat[i, j] = bbox_iou(self.obstacles[i], boxes[j])

    count = min(n_b, n_o)
    used = []
    idmax = 0
    obstacles = []
    while count > 0:
        # pick the highest-IoU (obstacle, detection) pair still available
        r, k = np.unravel_index(np.argmax(iou_mat, axis=None), iou_mat.shape)
        if iou_mat[r, k] > th:
            used.append(k)
            obstacle = self.obstacles[r]
            box = boxes[k]
            if idmax < obstacle._id:
                idmax = obstacle._id
            obstacle.update_box(box)
            obstacles.append(obstacle)
        # invalidate the matched row/column so they cannot be picked again
        iou_mat[r, :] = -99
        iou_mat[:, k] = -99
        count = count - 1

    # every detection left unmatched becomes a new obstacle with a fresh id
    idx = [elem for elem in range(n_b) if elem not in used]
    self.obstacles = obstacles
    for i, c in enumerate(idx):
        # dst = self.calculate_position(boxes[c])
        obstacle = OBSTACLE(boxes[c], i + idmax + 1)
        self.obstacles.append(obstacle)
    return
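# The while-loop above is a greedy IoU matching. With scipy available, a
# globally optimal assignment is one call; this is an alternative sketch,
# not what this tracker uses:
import numpy as np
from scipy.optimize import linear_sum_assignment

def match_pairs(iou_mat, th):
    rows, cols = linear_sum_assignment(-iou_mat)  # maximize total IoU
    return [(r, c) for r, c in zip(rows, cols) if iou_mat[r, c] > th]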
def encode(self, boxes, labels):
    '''Encode target bounding boxes and class labels.

    SSD coding rules:
      tx = (x - default_x) / (variance[0]*default_w)
      ty = (y - default_y) / (variance[0]*default_h)
      tw = log(w / default_w) / variance[1]
      th = log(h / default_h) / variance[1]

    Args:
      boxes: (tensor) bounding boxes of (xmin,ymin,xmax,ymax), sized [#obj,4].
      labels: (tensor) object class labels, sized [#obj,].

    Returns:
      loc_targets: (tensor) encoded bounding boxes, sized [#anchors,4].
      cls_targets: (tensor) encoded class labels, sized [#anchors,].
    '''
    def argmax(x):
        # (i,j) index of the largest element of a 2-D tensor
        v, i = x.max(0)
        j = v.max(0)[1].item()
        return (i[j], j)

    default_boxes = self.default_boxes
    default_boxes = change_bbox_order(default_boxes, 'xywh2xyxy')

    ious = bbox_iou(default_boxes, boxes)
    index = torch.empty(len(default_boxes), dtype=torch.long).fill_(-1)
    masked_ious = ious.clone()

    # first pass: greedily give every ground-truth box its best default box
    while True:
        i, j = argmax(masked_ious)
        if masked_ious[i, j] < 1e-6:
            break
        index[i] = j
        masked_ious[i, :] = 0
        masked_ious[:, j] = 0

    # second pass: any unassigned default box with IoU > 0.5 also matches
    mask = (index < 0) & (ious.max(1)[0] > 0.5)
    if mask.any():
        index[mask] = ious[mask].max(1)[1]

    boxes = boxes[index.clamp(min=0)]
    boxes = change_bbox_order(boxes, 'xyxy2xywh')
    default_boxes = change_bbox_order(default_boxes, 'xyxy2xywh')

    loc_xy = (boxes[:, :2] - default_boxes[:, :2]) / default_boxes[:, 2:] / self.variances[0]
    loc_wh = torch.log(boxes[:, 2:] / default_boxes[:, 2:]) / self.variances[1]
    loc_targets = torch.cat([loc_xy, loc_wh], dim=1)
    cls_targets = 1 + labels[index.clamp(min=0)]
    cls_targets[index < 0] = 0  # unmatched default boxes are background
    return loc_targets, cls_targets
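# For symmetry, the inverse (decode) of the encoding above under the same
# variance convention; this sketch assumes default_boxes in xywh order and is
# not part of the original snippet:
import torch

def decode_loc(loc, default_boxes, variances=(0.1, 0.2)):
    xy = loc[:, :2] * variances[0] * default_boxes[:, 2:] + default_boxes[:, :2]
    wh = torch.exp(loc[:, 2:] * variances[1]) * default_boxes[:, 2:]
    return torch.cat([xy - wh / 2, xy + wh / 2], dim=1)  # back to xyxy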
def compare(self, data1, data2, thresh_iou):
    # containment in either direction counts as a match
    if data2['xmin'] <= data1['xmin'] <= data1['xmax'] <= data2['xmax'] \
            and data2['ymin'] <= data1['ymin'] <= data1['ymax'] <= data2['ymax']:
        return True
    if data1['xmin'] <= data2['xmin'] <= data2['xmax'] <= data1['xmax'] \
            and data1['ymin'] <= data2['ymin'] <= data2['ymax'] <= data1['ymax']:
        return True

    box1 = BoundBox(data1['xmin'], data1['ymin'], data1['xmax'], data1['ymax'])
    box2 = BoundBox(data2['xmin'], data2['ymin'], data2['xmax'], data2['ymax'])
    return bbox_iou(box1, box2) > thresh_iou
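# Illustrative call (values invented): the containment shortcut fires before
# any IoU computation when one box fully encloses the other.
# inner = {'xmin': 10, 'ymin': 10, 'xmax': 20, 'ymax': 20}
# outer = {'xmin': 0, 'ymin': 0, 'xmax': 50, 'ymax': 50}
# self.compare(inner, outer, thresh_iou=0.5)  # -> True via containment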
def random_crop(img, boxes, labels, min_scale=0.3, max_aspect_ratio=2.):
    '''Randomly crop a PIL image, keeping boxes whose centers stay inside.

    :param img: (PIL.Image)
    :param boxes: (tensor) [N,4]
    :param labels: (tensor) [N,]
    :param min_scale: (float) minimal image width/height scale
    :param max_aspect_ratio: (float) maximum width/height aspect ratio
    :return: img, boxes, labels
    '''
    imw, imh = img.size
    params = [(0, 0, imw, imh)]  # candidate crop ROIs (x,y,w,h); full image as fallback
    for min_iou in (0, 0.1, 0.3, 0.5, 0.7, 0.9):
        for _ in range(100):
            scale = random.uniform(min_scale, 1)
            aspect_ratio = random.uniform(
                max(1 / max_aspect_ratio, scale * scale),
                min(max_aspect_ratio, 1 / (scale * scale)))
            w = int(imw * scale * math.sqrt(aspect_ratio))
            h = int(imh * scale / math.sqrt(aspect_ratio))
            # +1 keeps randrange valid when the crop spans the full image
            x = random.randrange(imw - w + 1)
            y = random.randrange(imh - h + 1)
            roi = torch.FloatTensor([[x, y, x + w, y + h]])

            ious = bbox_iou(boxes, roi)
            if ious.min() >= min_iou:
                params.append((x, y, w, h))
                break

    x, y, w, h = random.choice(params)
    img = img.crop((x, y, x + w, y + h))

    # keep only boxes whose centers fall inside the crop
    center = (boxes[:, :2] + boxes[:, 2:]) / 2
    mask = (center[:, 0] >= x) & (center[:, 0] <= x + w) \
        & (center[:, 1] >= y) & (center[:, 1] <= y + h)
    if mask.any():
        boxes = boxes[mask] - torch.FloatTensor([x, y, x, y])
        boxes = bbox_clamp(boxes, 0, 0, w, h)
        labels = labels[mask]
    else:
        boxes = torch.FloatTensor([[0, 0, 0, 0]])
        labels = torch.LongTensor([0])
    return img, boxes, labels
def do_nms(boxes, nms_thresh):
    # per-class NMS: suppressed boxes get their class score zeroed
    if len(boxes) > 0:
        nb_class = len(boxes[0].classes)
    else:
        return

    for c in range(nb_class):
        sorted_indices = np.argsort([-box.classes[c] for box in boxes])
        for i in range(len(sorted_indices)):
            index_i = sorted_indices[i]
            if boxes[index_i].classes[c] == 0:
                continue
            for j in range(i + 1, len(sorted_indices)):
                index_j = sorted_indices[j]
                if bbox_iou(boxes[index_i], boxes[index_j]) >= nms_thresh:
                    boxes[index_j].classes[c] = 0
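# Note that do_nms suppresses by zeroing per-class scores instead of deleting
# boxes, so callers filter afterwards. A plausible follow-up (obj_thresh is a
# caller-chosen value, not defined in this snippet):
# do_nms(boxes, nms_thresh=0.45)
# kept = [b for b in boxes if max(b.classes) > obj_thresh]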
def update(self, pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels,
           gt_difficults=None):
    """Update internal buffer with latest prediction and gt pairs.

    Parameters
    ----------
    pred_bboxes : list of mxnet.NDArray or numpy.ndarray
        Prediction bounding boxes with shape `B, N, 4`.
        Where B is the size of mini-batch, N is the number of bboxes.
    pred_labels : list of mxnet.NDArray or numpy.ndarray
        Prediction bounding box labels with shape `B, N`.
    pred_scores : list of mxnet.NDArray or numpy.ndarray
        Prediction bounding box scores with shape `B, N`.
    gt_bboxes : list of mxnet.NDArray or numpy.ndarray
        Ground-truth bounding boxes with shape `B, M, 4`.
        Where B is the size of mini-batch, M is the number of ground-truths.
    gt_labels : list of mxnet.NDArray or numpy.ndarray
        Ground-truth bounding box labels with shape `B, M`.
    gt_difficults : list of mxnet.NDArray or numpy.ndarray, optional, default is None
        Ground-truth bounding box difficulty labels with shape `B, M`.
    """

    def as_numpy(a):
        """Convert a (list of) mx.NDArray into numpy.ndarray"""
        if isinstance(a, (list, tuple)):
            out = [
                x.asnumpy() if isinstance(x, mx.nd.NDArray) else x for x in a
            ]
            try:
                out = np.concatenate(out, axis=0)
            except ValueError:
                out = np.array(out)
            return out
        elif isinstance(a, mx.nd.NDArray):
            a = a.asnumpy()
        return a

    if gt_difficults is None:
        gt_difficults = [None for _ in as_numpy(gt_labels)]

    if isinstance(gt_labels, list):
        if len(gt_difficults) != len(gt_labels) * gt_labels[0].shape[0]:
            gt_difficults = [None] * len(gt_labels) * gt_labels[0].shape[0]

    for pred_bbox, pred_label, pred_score, gt_bbox, gt_label, gt_difficult in zip(
            *[
                as_numpy(x) for x in [
                    pred_bboxes, pred_labels, pred_scores, gt_bboxes,
                    gt_labels, gt_difficults
                ]
            ]):
        # strip padding -1 for pred and gt
        valid_pred = np.where(pred_label.flat >= 0)[0]
        pred_bbox = pred_bbox[valid_pred, :]
        pred_label = pred_label.flat[valid_pred].astype(int)
        pred_score = pred_score.flat[valid_pred]
        valid_gt = np.where(gt_label.flat >= 0)[0]
        gt_bbox = gt_bbox[valid_gt, :]
        gt_label = gt_label.flat[valid_gt].astype(int)
        if gt_difficult is None:
            gt_difficult = np.zeros(gt_bbox.shape[0])
        else:
            gt_difficult = gt_difficult.flat[valid_gt]

        for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)):
            pred_mask_l = pred_label == l
            pred_bbox_l = pred_bbox[pred_mask_l]
            pred_score_l = pred_score[pred_mask_l]
            # sort by score
            order = pred_score_l.argsort()[::-1]
            pred_bbox_l = pred_bbox_l[order]
            pred_score_l = pred_score_l[order]

            # added by smher, 2019.11.19: drop low-score predictions
            valid_pred_score = np.where(pred_score_l > self._score_thresh)
            pred_bbox_l = pred_bbox_l[valid_pred_score]
            pred_score_l = pred_score_l[valid_pred_score]

            gt_mask_l = gt_label == l
            gt_bbox_l = gt_bbox[gt_mask_l]
            gt_difficult_l = gt_difficult[gt_mask_l]

            self._n_pos[l] += np.logical_not(gt_difficult_l).sum()
            self._score[l].extend(pred_score_l)

            if len(pred_bbox_l) == 0:
                continue
            if len(gt_bbox_l) == 0:
                self._match[l].extend((0,) * pred_bbox_l.shape[0])
                continue

            # VOC evaluation follows integer typed bounding boxes.
            pred_bbox_l = pred_bbox_l.copy()
            pred_bbox_l[:, 2:] += 1
            gt_bbox_l = gt_bbox_l.copy()
            gt_bbox_l[:, 2:] += 1

            iou = bbox_iou(pred_bbox_l, gt_bbox_l)
            gt_index = iou.argmax(axis=1)
            # set -1 if there is no matching ground truth
            gt_index[iou.max(axis=1) < self.iou_thresh] = -1
            del iou

            selec = np.zeros(gt_bbox_l.shape[0], dtype=bool)
            for gt_idx in gt_index:
                if gt_idx >= 0:
                    if gt_difficult_l[gt_idx]:
                        self._match[l].append(-1)
                    else:
                        if not selec[gt_idx]:
                            self._match[l].append(1)
                        else:
                            self._match[l].append(0)
                    selec[gt_idx] = True
                else:
                    self._match[l].append(0)
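# The _score/_match/_n_pos buffers filled above feed the usual VOC AP
# computation. A standalone sketch of the precision/recall step for one class
# (the metric's own accumulator may differ in detail); entries matched as -1
# (difficult) count as neither true nor false positives:
import numpy as np

def pr_curve(scores, matches, n_pos):
    order = np.argsort(scores)[::-1]
    match = np.asarray(matches)[order]
    tp = np.cumsum(match == 1)
    fp = np.cumsum(match == 0)
    recall = tp / n_pos
    precision = tp / np.maximum(tp + fp, 1)
    return precision, recall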
def __getitem__(self, idx):
    # get image input size; it changes every 10 batches
    net_h, net_w = self._get_net_size(idx)
    base_grid_h, base_grid_w = net_h // self.downsample, net_w // self.downsample

    # determine the first and the last indices of the batch
    l_bound = idx * self.batch_size
    r_bound = (idx + 1) * self.batch_size
    if r_bound > len(self.instances):
        r_bound = len(self.instances)
        l_bound = r_bound - self.batch_size

    x_batch = np.zeros((r_bound - l_bound, net_h, net_w, 3))  # input images
    t_batch = np.zeros((r_bound - l_bound, 1, 1, 1, self.max_box_per_image, 4))  # list of groundtruth boxes

    # initialize the outputs, one tensor per scale
    yolo_1 = np.zeros((r_bound - l_bound, 1 * base_grid_h, 1 * base_grid_w, 3, 4 + 1 + self.objects))  # desired network output 1
    yolo_2 = np.zeros((r_bound - l_bound, 2 * base_grid_h, 2 * base_grid_w, 3, 4 + 1 + self.objects))  # desired network output 2
    yolo_3 = np.zeros((r_bound - l_bound, 4 * base_grid_h, 4 * base_grid_w, 3, 4 + 1 + self.objects))  # desired network output 3
    yolos = [yolo_1, yolo_2, yolo_3]

    instance_count = 0
    true_box_index = 0

    # fill in the inputs and the outputs
    for train_instance in self.instances[l_bound:r_bound]:
        # augment input image and fix object's position and size
        img, all_objs = self._aug_image(train_instance, net_h, net_w)

        for obj in all_objs:
            # find the best anchor box for this object
            max_anchor = None
            max_index = -1
            max_iou = -1
            shifted_box = BoundBox(0, 0, obj['xmax'] - obj['xmin'], obj['ymax'] - obj['ymin'])
            for i in range(len(ANC_VALS)):
                anchor = BoundBox(0, 0, ANC_VALS[i][0], ANC_VALS[i][1])
                iou = bbox_iou(shifted_box, anchor)
                if max_iou < iou:
                    max_anchor = anchor
                    max_index = i
                    max_iou = iou

            # determine the yolo output responsible for this bounding box
            yolo = yolos[max_index // 3]
            grid_h, grid_w = yolo.shape[1:3]

            # determine the position of the bounding box on the grid
            center_x = .5 * (obj['xmin'] + obj['xmax'])
            g_center_x = center_x / float(net_w) * grid_w  # sigma(t_x) + c_x
            center_y = .5 * (obj['ymin'] + obj['ymax'])
            g_center_y = center_y / float(net_h) * grid_h  # sigma(t_y) + c_y

            # determine the sizes of the bounding box
            w = obj['xmax'] - obj['xmin']
            h = obj['ymax'] - obj['ymin']
            box = [center_x, center_y, w, h]

            # determine the index of the label
            obj_indx = self.labels.index(obj['name'])

            # determine the location of the cell responsible for this object
            grid_x = int(np.floor(g_center_x))
            grid_y = int(np.floor(g_center_y))

            # assign ground truth x, y, w, h, confidence and class probs to y_batch
            yolo[instance_count, grid_y, grid_x, max_index % 3, 0:4] = box
            yolo[instance_count, grid_y, grid_x, max_index % 3, 4] = 1.
            yolo[instance_count, grid_y, grid_x, max_index % 3, 5 + obj_indx] = 1

        # assign input image to x_batch
        x_batch[instance_count] = img / 255.

        # increase instance counter in the current batch
        instance_count += 1

    return x_batch, [yolo_1, yolo_2, yolo_3]
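# The best-anchor loop above compares only widths and heights (both boxes are
# shifted to the origin), so the IoU reduces to a closed form. A vectorized
# numpy equivalent, assuming anchors as an (N,2) array of (w,h) pairs:
import numpy as np

def best_anchor_index(box_w, box_h, anchors):
    inter = np.minimum(box_w, anchors[:, 0]) * np.minimum(box_h, anchors[:, 1])
    union = box_w * box_h + anchors[:, 0] * anchors[:, 1] - inter
    return int(np.argmax(inter / union))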
def __getitem__(self, idx):
    net_h, net_w = self._get_net_size(idx)
    base_grid_h, base_grid_w = net_h // self.downsample, net_w // self.downsample

    l_bound = idx * self.batch_size
    r_bound = (idx + 1) * self.batch_size
    if r_bound > len(self.train_list):
        r_bound = len(self.train_list)
        l_bound = r_bound - self.batch_size

    x_batch = np.zeros((self.batch_size, net_h, net_w, 3))
    t_batch = np.zeros((self.batch_size, 1, 1, 1, self.max_box_per_image, 4))

    yolo_1 = np.zeros((self.batch_size, 1 * base_grid_h, 1 * base_grid_w,
                       len(self.anchors) // 3, 4 + 1 + len(self.label_list)))
    yolo_2 = np.zeros((self.batch_size, 2 * base_grid_h, 2 * base_grid_w,
                       len(self.anchors) // 3, 4 + 1 + len(self.label_list)))
    yolo_3 = np.zeros((self.batch_size, 4 * base_grid_h, 4 * base_grid_w,
                       len(self.anchors) // 3, 4 + 1 + len(self.label_list)))
    yolos = [yolo_3, yolo_2, yolo_1]

    dummy_yolo_1 = np.zeros((self.batch_size, 1))
    dummy_yolo_2 = np.zeros((self.batch_size, 1))
    dummy_yolo_3 = np.zeros((self.batch_size, 1))

    true_box_index = 0

    for instance_count, train_instance in enumerate(self.train_list[l_bound:r_bound]):
        aug_img, aug_objs = self.augmentation(train_instance, net_h, net_w)

        for obj in aug_objs:
            # find the best-matching anchor for this object
            max_anchor = None
            max_index = -1
            max_iou = -1
            shifted_box = BoundBox(0, 0, obj['xmax'] - obj['xmin'], obj['ymax'] - obj['ymin'])
            for i in range(len(self.anchors)):
                anchor = self.anchors[i]
                iou = bbox_iou(shifted_box, anchor)
                if max_iou < iou:
                    max_anchor = anchor
                    max_index = i
                    max_iou = iou

            yolo = yolos[max_index // 3]
            grid_h, grid_w = yolo.shape[1:3]

            # box center on the grid: sigma(t_x) + c_x, sigma(t_y) + c_y
            center_x = .5 * (obj['xmin'] + obj['xmax'])
            center_x = center_x / float(net_w) * grid_w
            center_y = .5 * (obj['ymin'] + obj['ymax'])
            center_y = center_y / float(net_h) * grid_h

            # log-scale width/height targets relative to the anchor: t_w, t_h
            w = np.log((obj['xmax'] - obj['xmin']) / float(max_anchor.xmax))
            h = np.log((obj['ymax'] - obj['ymin']) / float(max_anchor.ymax))
            box = [center_x, center_y, w, h]

            obj_indx = self.label_list.index(obj['name'])
            grid_x = int(np.floor(center_x))
            grid_y = int(np.floor(center_y))

            yolo[instance_count, grid_y, grid_x, max_index % 3] = 0
            yolo[instance_count, grid_y, grid_x, max_index % 3, 0:4] = box
            yolo[instance_count, grid_y, grid_x, max_index % 3, 4] = 1.
            yolo[instance_count, grid_y, grid_x, max_index % 3, 5 + obj_indx] = 1

            true_box = [center_x, center_y,
                        obj['xmax'] - obj['xmin'], obj['ymax'] - obj['ymin']]
            t_batch[instance_count, 0, 0, 0, true_box_index] = true_box
            true_box_index += 1
            true_box_index = true_box_index % self.max_box_per_image

        x_batch[instance_count] = normalize(aug_img)

    return [x_batch, t_batch, yolo_1, yolo_2, yolo_3], \
           [dummy_yolo_1, dummy_yolo_2, dummy_yolo_3]
def random_crop_with_constraints(bbox, size, min_scale=0.3, max_scale=1,
                                 max_aspect_ratio=2, constraints=None,
                                 max_trial=50):
    """Crop an image randomly with bounding box constraints.

    This data augmentation is used in training of
    Single Shot Multibox Detector [#]_. More details can be found in
    data augmentation section of the original paper.

    .. [#] Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy,
       Scott Reed, Cheng-Yang Fu, Alexander C. Berg.
       SSD: Single Shot MultiBox Detector. ECCV 2016.

    Parameters
    ----------
    bbox : numpy.ndarray
        Numpy.ndarray with shape (N, 4+) where N is the number of bounding boxes.
        The second axis represents attributes of the bounding box.
        Specifically, these are :math:`(x_{min}, y_{min}, x_{max}, y_{max})`;
        we allow additional attributes other than coordinates, which stay intact
        during bounding box transformations.
    size : tuple
        Tuple of length 2 of image shape as (width, height).
    min_scale : float
        The minimum ratio between a cropped region and the original image.
        The default value is :obj:`0.3`.
    max_scale : float
        The maximum ratio between a cropped region and the original image.
        The default value is :obj:`1`.
    max_aspect_ratio : float
        The maximum aspect ratio of cropped region.
        The default value is :obj:`2`.
    constraints : iterable of tuples
        An iterable of constraints. Each constraint should be in
        :obj:`(min_iou, max_iou)` format. Setting :obj:`min_iou` or
        :obj:`max_iou` to :obj:`None` means no constraint.
        If this argument is :obj:`None`, :obj:`((0.1, None), (0.3, None),
        (0.5, None), (0.7, None), (0.9, None), (None, 1))` will be used.
    max_trial : int
        Maximum number of trials for each constraint before exit no matter what.

    Returns
    -------
    numpy.ndarray
        Cropped bounding boxes with shape :obj:`(M, 4+)` where M <= N.
    tuple
        Tuple of length 4 as (x_offset, y_offset, new_width, new_height).
    """
    # default params in paper
    if constraints is None:
        constraints = (
            (0.1, None),
            (0.3, None),
            (0.5, None),
            (0.7, None),
            (0.9, None),
            (None, 1),
        )

    w, h = size
    candidates = [(0, 0, w, h)]
    for min_iou, max_iou in constraints:
        min_iou = -np.inf if min_iou is None else min_iou
        max_iou = np.inf if max_iou is None else max_iou

        for _ in range(max_trial):
            scale = random.uniform(min_scale, max_scale)
            aspect_ratio = random.uniform(
                max(1 / max_aspect_ratio, scale * scale),
                min(max_aspect_ratio, 1 / (scale * scale)))
            crop_h = int(h * scale / np.sqrt(aspect_ratio))
            crop_w = int(w * scale * np.sqrt(aspect_ratio))

            crop_t = random.randrange(h - crop_h)
            crop_l = random.randrange(w - crop_w)
            crop_bb = np.array((crop_l, crop_t, crop_l + crop_w, crop_t + crop_h))

            if len(bbox) == 0:
                top, bottom = crop_t, crop_t + crop_h
                left, right = crop_l, crop_l + crop_w
                return bbox, (left, top, right - left, bottom - top)

            iou = bbox_iou(bbox, crop_bb[np.newaxis])
            if min_iou <= iou.min() and iou.max() <= max_iou:
                top, bottom = crop_t, crop_t + crop_h
                left, right = crop_l, crop_l + crop_w
                candidates.append((left, top, right - left, bottom - top))
                break

    # randomly select one candidate crop that keeps at least one box
    while candidates:
        crop = candidates.pop(np.random.randint(0, len(candidates)))
        new_bbox = bbox_crop(bbox, crop, allow_outside_center=False)
        if new_bbox.size < 1:
            continue
        new_crop = (crop[0], crop[1], crop[2], crop[3])
        return new_bbox, new_crop
    return bbox, (0, 0, w, h)
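# Typical call pattern (shapes and values illustrative; the image itself is
# cropped by the caller using the returned offsets):
# bbox = np.array([[48., 240., 195., 371., 0.]])  # (xmin, ymin, xmax, ymax, label)
# new_bbox, (x0, y0, cw, ch) = random_crop_with_constraints(bbox, (img_w, img_h))
# img = img[y0:y0 + ch, x0:x0 + cw]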
def __getitem__(self, idx):
    base_grid_h, base_grid_w = self.net_h // self.downsample, self.net_w // self.downsample

    # determine the first and the last indices of the batch
    l_bound = idx * self.batch_size
    r_bound = (idx + 1) * self.batch_size
    if r_bound > len(self.instances):
        r_bound = len(self.instances)
        l_bound = r_bound - self.batch_size

    x_batch = np.zeros((r_bound - l_bound, self.net_h, self.net_w, 3))  # input images

    # initialize the outputs, one tensor per scale
    yolo_1 = np.zeros((r_bound - l_bound, 1 * base_grid_h, 1 * base_grid_w, 3, 4 + 1 + self.objects))  # desired network output 1
    yolo_2 = np.zeros((r_bound - l_bound, 2 * base_grid_h, 2 * base_grid_w, 3, 4 + 1 + self.objects))  # desired network output 2
    yolo_3 = np.zeros((r_bound - l_bound, 4 * base_grid_h, 4 * base_grid_w, 3, 4 + 1 + self.objects))  # desired network output 3
    yolos = [yolo_1, yolo_2, yolo_3]

    instance_count = 0
    true_box_index = 0

    # fill in the inputs and the outputs
    for train_instance in self.instances[l_bound:r_bound]:
        # augment input image and fix object's position and size
        img, all_objs = self._aug_image(train_instance, self.net_h, self.net_w)

        for obj in all_objs:
            # find the best anchor box for this object
            max_anchor = None
            max_index = -1
            max_iou = -1
            shifted_box = BoundBox(0, 0, obj['xmax'] - obj['xmin'], obj['ymax'] - obj['ymin'])
            for i in range(len(ANC_VALS)):
                anchor = BoundBox(0, 0, ANC_VALS[i][0], ANC_VALS[i][1])
                iou = bbox_iou(shifted_box, anchor)
                if max_iou < iou:
                    max_anchor = anchor
                    max_index = i
                    max_iou = iou

            # determine the yolo output responsible for this bounding box
            yolo = yolos[max_index // 3]
            grid_h, grid_w = yolo.shape[1:3]

            # determine the position of the bounding box on the grid
            center_x = .5 * (obj['xmin'] + obj['xmax'])
            g_center_x = center_x / float(self.net_w) * grid_w  # sigma(t_x) + c_x
            center_y = .5 * (obj['ymin'] + obj['ymax'])
            g_center_y = center_y / float(self.net_h) * grid_h  # sigma(t_y) + c_y

            # determine the sizes of the bounding box
            w = obj['xmax'] - obj['xmin']
            h = obj['ymax'] - obj['ymin']
            box = [center_x, center_y, w, h]

            # determine the index of the label
            obj_indx = self.labels.index(obj['name'])

            # determine the location of the cell responsible for this object
            grid_x = int(np.floor(g_center_x))
            grid_y = int(np.floor(g_center_y))

            # assign ground truth x, y, w, h, confidence and class probs to y_batch
            yolo[instance_count, grid_y, grid_x, max_index % 3, 0:4] = box
            yolo[instance_count, grid_y, grid_x, max_index % 3, 4] = 1.
            yolo[instance_count, grid_y, grid_x, max_index % 3, 5 + obj_indx] = 1

        # assign input image to x_batch
        x_batch[instance_count] = img / 255.

        # increase instance counter in the current batch
        instance_count += 1

    return x_batch, [yolo_1, yolo_2, yolo_3]
def encode(self, boxes, labels, iou_threshold=0.5):
    target_list = list()
    objmask_list = list()
    noobjmask_list = list()
    for i, ori_anchors in enumerate(self.anchors):
        in_h = in_w = int(self.fm_size[i])
        # stride of this feature map relative to the input resolution
        # (for tuple input_size: self.input_size[0] / in_w, self.input_size[1] / in_h)
        w_fm_stride, h_fm_stride = self.input_size / in_w, self.input_size / in_h
        anchors = [(a_w / w_fm_stride, a_h / h_fm_stride) for a_w, a_h in ori_anchors]
        num_anchors = len(anchors)

        obj_mask = torch.zeros(num_anchors, in_h, in_w)
        noobj_mask = torch.ones(num_anchors, in_h, in_w)
        tx = torch.zeros(num_anchors, in_h, in_w)
        ty = torch.zeros(num_anchors, in_h, in_w)
        tw = torch.zeros(num_anchors, in_h, in_w)
        th = torch.zeros(num_anchors, in_h, in_w)
        tconf = torch.zeros(num_anchors, in_h, in_w)
        tcls = torch.zeros(num_anchors, in_h, in_w, self.num_classes)

        for t in range(boxes.size(0)):
            # convert to center/size coordinates on this feature map
            gx = (boxes[t, 0].item() + boxes[t, 2].item()) / (2.0 * self.input_size) * in_w
            gy = (boxes[t, 1].item() + boxes[t, 3].item()) / (2.0 * self.input_size) * in_h
            gw = (boxes[t, 2].item() - boxes[t, 0].item()) / self.input_size * in_w
            gh = (boxes[t, 3].item() - boxes[t, 1].item()) / self.input_size * in_h
            if gw * gh == 0 or gx >= in_w or gy >= in_h:
                continue

            # grid cell indices
            gi = int(gx)
            gj = int(gy)
            # shape of the gt box, anchored at the origin
            gt_box = torch.FloatTensor([0, 0, gw, gh]).unsqueeze(0)
            # shapes of the anchor boxes, anchored at the origin
            anchor_shapes = torch.FloatTensor(
                np.concatenate((np.zeros((num_anchors, 2)), np.array(anchors)), 1))
            # iou between gt and anchor shapes
            anch_ious = bbox_iou(gt_box, anchor_shapes)
            # where the overlap exceeds the threshold, ignore those anchors at
            # this cell in the no-object mask
            noobj_mask[anch_ious[0] > iou_threshold, gj, gi] = 0
            # find the best matching anchor box (scalar index, so it can be
            # used to index the python list of anchors below)
            best_n = int(np.argmax(anch_ious))

            # masks
            obj_mask[best_n, gj, gi] = 1
            # coordinates
            tx[best_n, gj, gi] = gx - gi
            ty[best_n, gj, gi] = gy - gj
            # width and height
            tw[best_n, gj, gi] = math.log(gw / anchors[best_n][0] + 1e-16)
            th[best_n, gj, gi] = math.log(gh / anchors[best_n][1] + 1e-16)
            # objectness
            tconf[best_n, gj, gi] = 1
            # one-hot encoding of label
            tcls[best_n, gj, gi, int(labels[t])] = 1

        obj_mask = obj_mask.view(-1, 1)
        noobj_mask = noobj_mask.view(-1, 1)
        tx = tx.view(-1, 1)
        ty = ty.view(-1, 1)
        tw = tw.view(-1, 1)
        th = th.view(-1, 1)
        tconf = tconf.view(-1, 1)
        tcls = tcls.view(-1, self.num_classes)
        target = torch.cat((tx, ty, tw, th, tconf, tcls), -1)

        target_list.append(target)
        objmask_list.append(obj_mask)
        noobjmask_list.append(noobj_mask)

    target = torch.cat(target_list, 0)
    obj_mask = torch.cat(objmask_list, 0)
    noobj_mask = torch.cat(noobjmask_list, 0)
    return target, torch.cat([obj_mask, noobj_mask], dim=1)
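# At inference time the targets built above are inverted per grid cell. A
# sketch of the corresponding decode for one cell (gi, gj), with the anchor in
# feature-map units as in the encoder:
import math

def decode_cell(tx, ty, tw, th, gi, gj, anchor_w, anchor_h):
    cx = gi + tx                 # inverts tx = gx - gi
    cy = gj + ty                 # inverts ty = gy - gj
    w = anchor_w * math.exp(tw)  # inverts tw = log(gw / anchor_w)
    h = anchor_h * math.exp(th)  # inverts th = log(gh / anchor_h)
    return cx, cy, w, h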
def __getitem__(self, idx):
    # get image input size; net_h, net_w change randomly every 10 batches
    net_h, net_w = self._get_net_size(idx)
    # grid size of the 32x-downsampled feature map
    base_grid_h, base_grid_w = net_h // self.downsample, net_w // self.downsample

    # determine the first and the last indices of the batch
    l_bound = idx * self.batch_size
    r_bound = (idx + 1) * self.batch_size
    # note: this re-reads the tail of the previous batch to keep the batch full,
    # which feels questionable but matches the other generators in this section
    if r_bound > len(self.instances):
        r_bound = len(self.instances)
        l_bound = r_bound - self.batch_size

    # one batch of input images
    x_batch = np.zeros((r_bound - l_bound, net_h, net_w, 3))  # input images
    # all object boxes per image: shape=(batch,1,1,1,max boxes per image,4 coords)
    t_batch = np.zeros((r_bound - l_bound, 1, 1, 1, self.max_box_per_image, 4))  # list of groundtruth boxes

    # initialize the outputs for the 32x-, 16x- and 8x-downsampled feature maps:
    # [batch_size, grid_h, grid_w, 3 anchors, 4 box coords + 1 confidence + num classes]
    yolo_1 = np.zeros((r_bound - l_bound, 1 * base_grid_h, 1 * base_grid_w,
                       len(self.anchors) // 3, 4 + 1 + len(self.labels)))  # desired network output 1
    yolo_2 = np.zeros((r_bound - l_bound, 2 * base_grid_h, 2 * base_grid_w,
                       len(self.anchors) // 3, 4 + 1 + len(self.labels)))  # desired network output 2
    yolo_3 = np.zeros((r_bound - l_bound, 4 * base_grid_h, 4 * base_grid_w,
                       len(self.anchors) // 3, 4 + 1 + len(self.labels)))  # desired network output 3
    # 8x, 16x, 32x downsampling map to the priors
    # [55,69, 75,234, 133,240, 136,129, 142,363, 203,290, 228,184, 285,359, 341,260]
    yolos = [yolo_3, yolo_2, yolo_1]

    dummy_yolo_1 = np.zeros((r_bound - l_bound, 1))
    dummy_yolo_2 = np.zeros((r_bound - l_bound, 1))
    dummy_yolo_3 = np.zeros((r_bound - l_bound, 1))

    instance_count = 0  # index of the image within the batch
    true_box_index = 0  # index of the object within the image

    # fill in the inputs and the outputs
    for train_instance in self.instances[l_bound:r_bound]:
        # augment input image and fix object's position and size
        img, all_objs = self._aug_image(train_instance, net_h, net_w)

        for obj in all_objs:
            # find the best anchor box for this object
            max_anchor = None  # the anchor with the highest IoU
            max_index = -1     # the index of that anchor
            max_iou = -1
            shifted_box = BoundBox(0, 0, obj['xmax'] - obj['xmin'], obj['ymax'] - obj['ymin'])
            for i in range(len(self.anchors)):
                anchor = self.anchors[i]
                iou = bbox_iou(shifted_box, anchor)
                if max_iou < iou:
                    max_anchor = anchor
                    max_index = i
                    max_iou = iou

            # determine the yolo output responsible for this bounding box:
            # of the 3 scales, the one owning the anchor that best matches this object
            yolo = yolos[max_index // 3]
            grid_h, grid_w = yolo.shape[1:3]

            # determine the position of the bounding box on the grid: the box
            # center is mapped onto the feature-map grid, so its value equals
            # the expected prediction sigma(t_x) + c_x, sigma(t_y) + c_y
            center_x = .5 * (obj['xmin'] + obj['xmax'])
            center_x = center_x / float(net_w) * grid_w  # expected sigma(t_x) + c_x
            center_y = .5 * (obj['ymin'] + obj['ymax'])
            center_y = center_y / float(net_h) * grid_h  # expected sigma(t_y) + c_y

            # determine the sizes of the bounding box
            w = np.log((obj['xmax'] - obj['xmin']) / float(max_anchor.xmax))  # t_w; truth_w = anchor_w * exp(t_w)
            h = np.log((obj['ymax'] - obj['ymin']) / float(max_anchor.ymax))  # t_h; truth_h = anchor_h * exp(t_h)
            box = [center_x, center_y, w, h]

            # determine the index of the label
            obj_indx = self.labels.index(obj['name'])

            # determine the location of the cell responsible for this object
            grid_x = int(np.floor(center_x))
            grid_y = int(np.floor(center_y))

            # assign ground truth x, y, w, h, confidence and class probs to y_batch;
            # max_index%3 selects the best-matching anchor: exactly one anchor is
            # responsible for detecting each object
            yolo[instance_count, grid_y, grid_x, max_index % 3] = 0
            yolo[instance_count, grid_y, grid_x, max_index % 3, 0:4] = box  # box coordinates
            yolo[instance_count, grid_y, grid_x, max_index % 3, 4] = 1.  # box confidence
            yolo[instance_count, grid_y, grid_x, max_index % 3, 5 + obj_indx] = 1  # object class

            # assign the true box to t_batch; true_box x,y are feature-map
            # coordinates (e.g. on a 13*13 grid), w,h are in original image pixels
            true_box = [center_x, center_y, obj['xmax'] - obj['xmin'], obj['ymax'] - obj['ymin']]
            t_batch[instance_count, 0, 0, 0, true_box_index] = true_box

            # since instance_count already separates images, true_box_index could
            # simply reset to 0 per image; here it accumulates across the whole
            # batch, wrapping at max_box_per_image; whether that is intentional
            # is unclear
            true_box_index += 1
            true_box_index = true_box_index % self.max_box_per_image

        # assign input image to x_batch
        if self.norm is not None:
            x_batch[instance_count] = self.norm(img)
        else:
            # plot image and bounding boxes for sanity check
            for obj in all_objs:
                cv2.rectangle(img, (obj['xmin'], obj['ymin']), (obj['xmax'], obj['ymax']), (255, 0, 0), 3)
                cv2.putText(img, obj['name'], (obj['xmin'] + 2, obj['ymin'] + 12),
                            0, 1.2e-3 * img.shape[0], (0, 255, 0), 2)
            x_batch[instance_count] = img

        # increase instance counter in the current batch
        instance_count += 1

    return [x_batch, t_batch, yolo_1, yolo_2, yolo_3], \
           [dummy_yolo_1, dummy_yolo_2, dummy_yolo_3]
def write_results_half(prediction, confidence, num_classes, nms=True, nms_conf=0.4):
    # zero out predictions below the objectness confidence threshold
    conf_mask = (prediction[:, :, 4] > confidence).half().unsqueeze(2)
    prediction = prediction * conf_mask

    try:
        ind_nz = torch.nonzero(prediction[:, :, 4]).transpose(0, 1).contiguous()
    except Exception:
        return 0

    # convert (center_x, center_y, w, h) to (x1, y1, x2, y2)
    box_a = prediction.new(prediction.shape)
    box_a[:, :, 0] = (prediction[:, :, 0] - prediction[:, :, 2] / 2)
    box_a[:, :, 1] = (prediction[:, :, 1] - prediction[:, :, 3] / 2)
    box_a[:, :, 2] = (prediction[:, :, 0] + prediction[:, :, 2] / 2)
    box_a[:, :, 3] = (prediction[:, :, 1] + prediction[:, :, 3] / 2)
    prediction[:, :, :4] = box_a[:, :, :4]

    batch_size = prediction.size(0)
    output = prediction.new(1, prediction.size(2) + 1)
    write = False

    for ind in range(batch_size):
        # select the image from the batch
        image_pred = prediction[ind]

        # replace the num_classes softmax scores with the maximum class score
        # and the index of that class (torch.max returns values then indices)
        max_conf, max_conf_idx = torch.max(image_pred[:, 5:5 + num_classes], 1)
        max_conf = max_conf.half().unsqueeze(1)
        max_conf_idx = max_conf_idx.half().unsqueeze(1)
        seq = (image_pred[:, :5], max_conf, max_conf_idx)
        image_pred = torch.cat(seq, 1)

        # get rid of the zero entries
        non_zero_ind = (torch.nonzero(image_pred[:, 4]))
        try:
            image_pred_ = image_pred[non_zero_ind.squeeze(), :]
        except Exception:
            continue

        # the classes detected in the image
        img_classes = unique(image_pred_[:, -1].long()).half()

        # perform NMS classwise
        for cls in img_classes:
            # get the detections for one particular class
            cls_mask = image_pred_ * (image_pred_[:, -1] == cls).half().unsqueeze(1)
            class_mask_ind = torch.nonzero(cls_mask[:, -2]).squeeze()
            image_pred_class = image_pred_[class_mask_ind]

            # sort so the entry with the maximum objectness confidence is at the top
            conf_sort_index = torch.sort(image_pred_class[:, 4], descending=True)[1]
            image_pred_class = image_pred_class[conf_sort_index]
            idx = image_pred_class.size(0)

            if nms:
                for i in range(idx):
                    # IoUs of all boxes that come after the current one
                    try:
                        ious = bbox_iou(image_pred_class[i].unsqueeze(0),
                                        image_pred_class[i + 1:])
                    except ValueError:
                        break
                    except IndexError:
                        break

                    # zero out all the detections with IoU > threshold
                    iou_mask = (ious < nms_conf).half().unsqueeze(1)
                    image_pred_class[i + 1:] *= iou_mask

                    # remove the zeroed entries
                    non_zero_ind = torch.nonzero(image_pred_class[:, 4]).squeeze()
                    image_pred_class = image_pred_class[non_zero_ind]

            # prepend the batch index so each detection can be traced back to its
            # image; all detections of the batch are held in one flat structure
            batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind)
            seq = batch_ind, image_pred_class

            if not write:
                output = torch.cat(seq, 1)
                write = True
            else:
                out = torch.cat(seq, 1)
                output = torch.cat((output, out))

    return output
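# Hypothetical driver (model, img, and the 80-class count are assumptions):
# prediction = model(img.half())  # [B, num_boxes, 5 + num_classes], xywh layout
# dets = write_results_half(prediction, confidence=0.5, num_classes=80, nms_conf=0.4)
# # each output row: (batch_idx, x1, y1, x2, y2, objectness, max class score, class index)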
def update(self, pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels,
           gt_difficults=None):

    def as_numpy(a):
        """Convert a (list of) mx.NDArray into numpy.ndarray"""
        if isinstance(a, (list, tuple)):
            out = [
                x.asnumpy() if isinstance(x, mx.nd.NDArray) else x for x in a
            ]
            try:
                out = np.concatenate(out, axis=0)
            except ValueError:
                out = np.array(out)
            return out
        elif isinstance(a, mx.nd.NDArray):
            a = a.asnumpy()
        return a

    if gt_difficults is None:
        gt_difficults = [None for _ in as_numpy(gt_labels)]
    '''
    if isinstance(gt_labels, list):
        if len(gt_difficults) != len(gt_labels) * gt_labels[0].shape[0]:
            gt_difficults = [None] * len(gt_labels) * gt_labels[0].shape[0]
    '''
    # to estimate the false-detection rate one could assume every predicted
    # label is correct, e.g.:
    # obj_nums = gt_bboxes[0].shape[1]
    # if obj_nums < 101:
    #     gt_labels = [X[:, 0:obj_nums, :] for X in pred_labels]

    for pred_bbox, pred_label, pred_score, gt_bbox, gt_label, gt_difficult in zip(
            *[
                as_numpy(x) for x in [
                    pred_bboxes, pred_labels, pred_scores, gt_bboxes,
                    gt_labels, gt_difficults
                ]
            ]):
        # strip padding -1 for pred and gt
        valid_pred = np.where(pred_label.flat >= 0)[0]
        pred_bbox = pred_bbox[valid_pred, :]
        pred_label = pred_label.flat[valid_pred].astype(int)
        pred_score = pred_score.flat[valid_pred]
        valid_gt = np.where(gt_label.flat >= 0)[0]
        gt_bbox = gt_bbox[valid_gt, :]
        gt_label = gt_label.flat[valid_gt].astype(int)
        if gt_difficult is None:
            gt_difficult = np.zeros(gt_bbox.shape[0])
        else:
            gt_difficult = gt_difficult.flat[valid_gt]

        for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)):
            pred_mask_l = pred_label == l
            pred_bbox_l = pred_bbox[pred_mask_l]
            pred_score_l = pred_score[pred_mask_l]
            # sort by score
            order = pred_score_l.argsort()[::-1]
            pred_bbox_l = pred_bbox_l[order]
            pred_score_l = pred_score_l[order]

            # keep only predictions whose score reaches the score threshold
            pred_score_l_idx = np.where(pred_score_l >= self.score_thresh)[0]
            pred_bbox_l = pred_bbox_l[pred_score_l_idx]
            pred_score_l = pred_score_l[pred_score_l_idx]

            gt_mask_l = gt_label == l
            gt_bbox_l = gt_bbox[gt_mask_l]
            gt_difficult_l = gt_difficult[gt_mask_l]

            self._n_pos[l] += np.logical_not(gt_difficult_l).sum()
            self._score[l].extend(pred_score_l)

            if len(pred_bbox_l) == 0:
                continue

            # VOC evaluation follows integer typed bounding boxes.
            pred_bbox_l = pred_bbox_l.copy()
            pred_bbox_l[:, 2:] += 1
            gt_bbox_l = gt_bbox_l.copy()
            gt_bbox_l[:, 2:] += 1
            iou = bbox_iou(pred_bbox_l, gt_bbox_l)

            # match against every overlap threshold tracked by this metric
            for iou_thresh in self.ovp_thresh:
                iou_thresh_key = str(iou_thresh)
                if len(gt_bbox_l) == 0:
                    self._match[iou_thresh_key][l].extend((0,) * pred_bbox_l.shape[0])
                    continue

                gt_index = iou.argmax(axis=1)
                # set -1 if there is no matching ground truth
                gt_index[iou.max(axis=1) < iou_thresh] = -1

                selec = np.zeros(gt_bbox_l.shape[0], dtype=bool)
                for gt_idx in gt_index:
                    if gt_idx >= 0:
                        if gt_difficult_l[gt_idx]:
                            self._match[iou_thresh_key][l].append(-1)
                        else:
                            if not selec[gt_idx]:
                                self._match[iou_thresh_key][l].append(1)
                            else:
                                self._match[iou_thresh_key][l].append(0)
                        selec[gt_idx] = True
                    else:
                        self._match[iou_thresh_key][l].append(0)
def __getitem__(self, idx):
    # get image input size; it changes every 10 batches
    net_h, net_w = self._get_net_size(idx)
    base_grid_h, base_grid_w = net_h // self.downsample, net_w // self.downsample

    # determine the first and the last indices of the batch
    l_bound = idx * self.batch_size
    r_bound = (idx + 1) * self.batch_size
    if r_bound > len(self.instances):
        r_bound = len(self.instances)
        l_bound = r_bound - self.batch_size

    x_batch = np.zeros((r_bound - l_bound, net_h, net_w, 3))  # input images
    t_batch = np.zeros((r_bound - l_bound, 1, 1, 1, self.max_box_per_image, 4))  # list of groundtruth boxes

    # initialize the outputs, one tensor per scale
    yolo_1 = np.zeros((r_bound - l_bound, 1 * base_grid_h, 1 * base_grid_w,
                       len(self.anchors) // 3, 4 + 1 + len(self.labels)))  # desired network output 1
    yolo_2 = np.zeros((r_bound - l_bound, 2 * base_grid_h, 2 * base_grid_w,
                       len(self.anchors) // 3, 4 + 1 + len(self.labels)))  # desired network output 2
    yolo_3 = np.zeros((r_bound - l_bound, 4 * base_grid_h, 4 * base_grid_w,
                       len(self.anchors) // 3, 4 + 1 + len(self.labels)))  # desired network output 3
    yolos = [yolo_3, yolo_2, yolo_1]

    dummy_yolo_1 = np.zeros((r_bound - l_bound, 1))
    dummy_yolo_2 = np.zeros((r_bound - l_bound, 1))
    dummy_yolo_3 = np.zeros((r_bound - l_bound, 1))

    instance_count = 0
    true_box_index = 0

    # fill in the inputs and the outputs
    for train_instance in self.instances[l_bound:r_bound]:
        # augment input image and fix object's position and size
        img, all_objs = self._aug_image(train_instance, net_h, net_w)

        for obj in all_objs:
            # find the best anchor box for this object
            max_anchor = None
            max_index = -1
            max_iou = -1
            shifted_box = BoundBox(0, 0, obj['xmax'] - obj['xmin'], obj['ymax'] - obj['ymin'])
            for i in range(len(self.anchors)):
                anchor = self.anchors[i]
                iou = bbox_iou(shifted_box, anchor)
                if max_iou < iou:
                    max_anchor = anchor
                    max_index = i
                    max_iou = iou

            # determine the yolo output responsible for this bounding box
            yolo = yolos[max_index // 3]
            grid_h, grid_w = yolo.shape[1:3]

            # determine the position of the bounding box on the grid
            center_x = .5 * (obj['xmin'] + obj['xmax'])
            center_x = center_x / float(net_w) * grid_w  # sigma(t_x) + c_x
            center_y = .5 * (obj['ymin'] + obj['ymax'])
            center_y = center_y / float(net_h) * grid_h  # sigma(t_y) + c_y

            # determine the sizes of the bounding box
            w = np.log((obj['xmax'] - obj['xmin']) / float(max_anchor.xmax))  # t_w
            h = np.log((obj['ymax'] - obj['ymin']) / float(max_anchor.ymax))  # t_h
            box = [center_x, center_y, w, h]

            # determine the index of the label
            obj_indx = self.labels.index(obj['name'])

            # determine the location of the cell responsible for this object
            grid_x = int(np.floor(center_x))
            grid_y = int(np.floor(center_y))

            # assign ground truth x, y, w, h, confidence and class probs to y_batch
            yolo[instance_count, grid_y, grid_x, max_index % 3] = 0
            yolo[instance_count, grid_y, grid_x, max_index % 3, 0:4] = box
            yolo[instance_count, grid_y, grid_x, max_index % 3, 4] = 1.
            yolo[instance_count, grid_y, grid_x, max_index % 3, 5 + obj_indx] = 1

            # assign the true box to t_batch
            true_box = [center_x, center_y, obj['xmax'] - obj['xmin'], obj['ymax'] - obj['ymin']]
            t_batch[instance_count, 0, 0, 0, true_box_index] = true_box

            true_box_index += 1
            true_box_index = true_box_index % self.max_box_per_image

        # assign input image to x_batch
        if self.norm is not None:
            x_batch[instance_count] = self.norm(img)
        else:
            # plot image and bounding boxes for sanity check
            for obj in all_objs:
                cv2.rectangle(img, (obj['xmin'], obj['ymin']), (obj['xmax'], obj['ymax']), (255, 0, 0), 3)
                cv2.putText(img, obj['name'], (obj['xmin'] + 2, obj['ymin'] + 12),
                            0, 1.2e-3 * img.shape[0], (0, 255, 0), 2)
            x_batch[instance_count] = img

        # increase instance counter in the current batch
        instance_count += 1

    return [x_batch, t_batch, yolo_1, yolo_2, yolo_3], \
           [dummy_yolo_1, dummy_yolo_2, dummy_yolo_3]