def get_targets(pred_boxes, pred_conf, pred_cls, targets, anchors, num_anchors,
                num_classes, in_h, ignore_thres, img_dim):
    # Note: the feature map is assumed square (in_h is used for both dims).
    mask = torch.zeros(targets.size(0), num_anchors, in_h, in_h)
    # conf_mask starts at 1 so unmatched anchors contribute to the no-object
    # loss; anchors whose IoU exceeds ignore_thres are zeroed out below.
    conf_mask = torch.ones(targets.size(0), num_anchors, in_h, in_h)
    tx = torch.zeros(targets.size(0), num_anchors, in_h, in_h)
    ty = torch.zeros(targets.size(0), num_anchors, in_h, in_h)
    tw = torch.zeros(targets.size(0), num_anchors, in_h, in_h)
    th = torch.zeros(targets.size(0), num_anchors, in_h, in_h)
    tconf = torch.ByteTensor(targets.size(0), num_anchors, in_h, in_h).fill_(0)
    tcls = torch.ByteTensor(targets.size(0), num_anchors, in_h, in_h,
                            num_classes).fill_(0)

    counter = 0
    correct = 0
    for batch in range(targets.size(0)):
        for t in range(targets.shape[1]):
            if targets[batch, t].sum() == 0:
                continue
            counter += 1
            # Convert normalized targets to feature-map coordinates
            gx = targets[batch, t, 1] * in_h
            gy = targets[batch, t, 2] * in_h
            gw = targets[batch, t, 3] * in_h
            gh = targets[batch, t, 4] * in_h
            gi = int(gx)
            gj = int(gy)
            # IoU between the gt shape and each anchor shape (both at the origin)
            gt_box = torch.FloatTensor(np.array([0, 0, gw, gh])).unsqueeze(0)
            anchor_shapes = torch.FloatTensor(
                np.concatenate((np.zeros((num_anchors, 2)), np.array(anchors)), 1))
            anch_ious = bbox_iou(gt_box, anchor_shapes, True)
            # Ignore anchors that overlap the gt strongly but are not the best match
            conf_mask[batch, anch_ious > ignore_thres, gj, gi] = 0
            best_n = np.argmax(anch_ious)

            gt_box = torch.FloatTensor(np.array([gx, gy, gw, gh])).unsqueeze(0)
            pred_box = pred_boxes[batch, best_n, gj, gi].unsqueeze(0)
            mask[batch, best_n, gj, gi] = 1
            conf_mask[batch, best_n, gj, gi] = 1
            # Coordinate targets relative to the responsible cell
            tx[batch, best_n, gj, gi] = gx - gi
            ty[batch, best_n, gj, gi] = gy - gj
            tw[batch, best_n, gj, gi] = math.log(gw / anchors[best_n][0] + 1e-16)
            th[batch, best_n, gj, gi] = math.log(gh / anchors[best_n][1] + 1e-16)
            target_label = int(targets[batch, t, 0])
            tcls[batch, best_n, gj, gi, target_label] = 1
            tconf[batch, best_n, gj, gi] = 1

            # gt_box and pred_box are in (cx, cy, w, h) format here
            iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False)
            pred_label = torch.argmax(pred_cls[batch, best_n, gj, gi])
            score = pred_conf[batch, best_n, gj, gi]
            if iou > 0.5 and pred_label == target_label and score > 0.5:
                correct += 1
    return counter, correct, mask, conf_mask, tx, ty, tw, th, tconf, tcls
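# The snippets in this collection all call a bbox_iou helper that is not shown.
# Below is a minimal sketch of the PyTorch variant the YOLO target builders
# assume: broadcast IoU over rows, with an x1y1x2y2 flag choosing corner vs.
# center-size format. The exact helper in each repository may differ.
import torch

def bbox_iou(box1, box2, x1y1x2y2=True):
    """IoU between two sets of boxes, broadcasting [1, 4] against [N, 4]."""
    if not x1y1x2y2:
        # (cx, cy, w, h) -> corner coordinates
        b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
        b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
        b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
        b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
    else:
        b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
        b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

    # Intersection rectangle, clamped so disjoint boxes give zero area
    inter = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(min=0) * \
            (torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1)).clamp(min=0)
    area1 = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
    area2 = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)
    return inter / (area1 + area2 - inter + 1e-16)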
def get_ground_truth(boxes):
    # grid_h, grid_w, num_box, num_classes, image_w, image_h, grid_size,
    # anchor_boxes and BoundBox are assumed to be defined at module level
    gt = np.zeros((grid_h, grid_w, num_box, 4 + 1 + num_classes), dtype=np.float32)
    for bbox in boxes:
        bx, by, bw, bh = bbox
        center_x = bx + bw / 2.
        center_x = center_x / float(image_w / grid_w)
        center_y = by + bh / 2.
        center_y = center_y / float(image_h / grid_h)
        cell_x = int(np.floor(center_x))
        cell_y = int(np.floor(center_y))
        center_w = bw / grid_size
        center_h = bh / grid_size
        box = [center_x, center_y, center_w, center_h]

        # find the anchor that best predicts this box
        best_anchor = -1
        max_iou = -1
        shifted_box = BoundBox(0, 0, center_w, center_h)
        for i in range(len(anchor_boxes)):
            anchor = anchor_boxes[i]
            iou = bbox_iou(shifted_box, anchor)
            if max_iou < iou:
                best_anchor = i
                max_iou = iou

        # assign ground truth confidence, x, y, w, h and class prob
        gt[cell_y, cell_x, best_anchor, 0] = 1.0
        gt[cell_y, cell_x, best_anchor, 1:5] = box
        gt[cell_y, cell_x, best_anchor, 5] = 1.0  # class prob (single-class case)
    return gt
def compute_loss(proc_pred, annotations_gt, targets, iou_th=0.5, giou_ratio=0.5):
    # proc_pred = process_preds(model_out[0], int(np.sqrt(out.shape[1])), 256, 56)
    box_loss = torch.tensor([0]).float()
    cls_loss = torch.tensor([0]).float()
    obj_loss = torch.tensor([0]).float()
    for j in range(len(proc_pred)):
        for i, gt in enumerate(annotations_gt[j]):
            # IoUs between this gt box and every prediction
            ious = bbox_iou(gt.float(), xywh2xyxy(proc_pred[j, :, :4]).float())
            # keep only the relevant (sufficiently overlapping) predictions
            pertinent = torch.where(ious > iou_th)[0]
            if len(pertinent):
                best_id = torch.max(ious[pertinent], 0)[1]
                best_bb = proc_pred[j, best_id, :]
                cls_loss += pred_criterion(best_bb[5:].unsqueeze(0),
                                           torch.tensor(targets[i]))
                box_loss += (1 - ious[pertinent]).mean()
            # soft objectness target mixed with the (detached) IoU
            trgt_objectness = (1 - giou_ratio) + giou_ratio * ious.detach().clamp(0)
            obj_loss += obj_criterion(proc_pred[j, ..., 4], trgt_objectness)
    loss = 2 * box_loss + cls_loss + 2 * obj_loss
    loss_print = dict(box=box_loss.detach(), pred=cls_loss.detach(),
                      obj=obj_loss.detach())
    return loss, loss_print
def validate(model):
    anchor = generate_anchor(8, [8, ], [0.33, 0.5, 1, 2, 3], 17)
    prec1 = 0
    model = model.eval()
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    # sample 20 x 30 = 600 frame pairs from the VOT2018 sequences
    lines = []
    for k in range(20):
        all_sample = [
            args.vot2018[i]
            for i in sorted(random.sample(range(len(args.vot2018)), 30))
        ]
        nSamples = len(all_sample)
        for i in range(nSamples):
            sequence = all_sample[i]
            ran_id = random.randint(0, len(sequence) - 1)
            # re-sample until the chosen object has at least two frames
            while len(sequence[ran_id]) < 2:
                sequence = all_sample[random.randint(0, nSamples - 1)]
                ran_id = random.randint(0, len(sequence) - 1)
            track_obj = sequence[ran_id]
            ran_f1 = random.randint(0, len(track_obj) - 1)
            ran_f2 = random.randint(0, len(track_obj) - 1)
            lines.append([track_obj[ran_f1], track_obj[ran_f2]])
    random.shuffle(lines)

    for line in lines:
        z, x, gt_box, regression_target, conf_target = load_data(line, 0)
        inpz = transform(z)
        inpx = transform(x)
        score, delta = model(inpz.unsqueeze(0).cuda(), inpx.unsqueeze(0).cuda())
        delta = delta.permute(1, 2, 3, 0).contiguous().view(4, -1).data.cpu().numpy()
        score = F.softmax(score.permute(1, 2, 3, 0).contiguous().view(2, -1),
                          dim=0).data[1, :].cpu().numpy()
        # decode the regression deltas against the anchors
        delta[0, :] = delta[0, :] * anchor[:, 2] + anchor[:, 0]
        delta[1, :] = delta[1, :] * anchor[:, 3] + anchor[:, 1]
        delta[2, :] = np.exp(delta[2, :]) * anchor[:, 2]
        delta[3, :] = np.exp(delta[3, :]) * anchor[:, 3]
        best_pscore_id = np.argmax(score)
        target = delta[:, best_pscore_id]
        prec1 += bbox_iou(target, gt_box, False)
    prec1 = prec1 / len(lines)  # 600 sampled pairs
    return prec1
def compute_batch_info(outputs, labels, iou_thres):
    """Compute true positives, predicted scores and predicted labels per batch.

    `outputs` is the post-NMS detection list, one entry per image, so
    len(outputs) == batch size and entries may be None.
    Each entry has shape [num_detections, 7]:
    (x1, y1, x2, y2, conf_score, cls_score, cls_pred).
    """
    batch_metrics = []
    for idx in range(len(outputs)):  # idx is the image index
        if outputs[idx] is None:
            continue
        output = outputs[idx]
        pred_boxes = output[:, :4]
        pred_conf = output[:, 4]
        pred_cls = output[:, -1]

        # true positives
        tp = np.zeros(pred_boxes.shape[0])
        # select the annotations of the idx-th image
        annotations = labels[labels[:, 0] == idx][:, 1:]
        img_labels = annotations[:, 0] if len(annotations) else []
        if len(annotations):
            detected_boxes = []
            img_boxes = annotations[:, 1:]
            for pred_idx, (pred_box, pred_label) in enumerate(zip(pred_boxes, pred_cls)):
                # stop once every target has been found
                if len(detected_boxes) == len(annotations):
                    break
                # skip predictions whose label does not occur in this image
                if pred_label not in img_labels:
                    continue
                iou, box_idx = bbox_iou(pred_box.unsqueeze(0), img_boxes).max(0)
                if iou > iou_thres and box_idx not in detected_boxes:
                    tp[pred_idx] = 1
                    detected_boxes += [box_idx]
        if labels.is_cuda:
            pred_conf = pred_conf.detach().cpu()
            pred_cls = pred_cls.detach().cpu()
        batch_metrics.append([tp, pred_conf, pred_cls])
    return batch_metrics
def _calc_ious(self, anchor, bbox, inside_index):
    # ious between the anchors and the gt boxes, shape [n_anchor, n_bbox]
    ious = bbox_iou(anchor, bbox)
    # for each anchor: index and value of the best-overlapping gt box
    argmax_ious = ious.argmax(axis=1)
    max_ious = ious[np.arange(len(inside_index)), argmax_ious]
    # for each gt box: every anchor that attains its maximal overlap
    gt_argmax_ious = ious.argmax(axis=0)
    gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])]
    gt_argmax_ious = np.where(ious == gt_max_ious)[0]
    return argmax_ious, max_ious, gt_argmax_ious
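# The RPN-style snippets (this one, the commented variant below, and the
# proposal-target creator at the end) call a NumPy bbox_iou that returns the
# full pairwise [n_anchor, n_bbox] IoU matrix for corner-format boxes. A
# minimal sketch of what such a helper might look like; the coordinate order
# only needs to agree between the two arrays.
import numpy as np

def bbox_iou(bbox_a, bbox_b):
    """Pairwise IoU between two sets of corner-format boxes (a sketch)."""
    # top-left and bottom-right corners of the pairwise intersections
    tl = np.maximum(bbox_a[:, None, :2], bbox_b[None, :, :2])
    br = np.minimum(bbox_a[:, None, 2:], bbox_b[None, :, 2:])

    # intersection area is zero unless tl < br in both coordinates
    area_i = np.prod(br - tl, axis=2) * (tl < br).all(axis=2)
    area_a = np.prod(bbox_a[:, 2:] - bbox_a[:, :2], axis=1)
    area_b = np.prod(bbox_b[:, 2:] - bbox_b[:, :2], axis=1)
    return area_i / (area_a[:, None] + area_b[None, :] - area_i)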
def compute_metrics(gt, img_shape, noise_size=5, noise_position=5,
                    create_bbox_proba=0.5, destroy_bbox_proba=0.5, k=10):
    """
    1. Add noise to the ground truth bounding boxes.
    2. Compute F-score, IoU and mAP between the two lists of bounding boxes.
    :param gt: list of GT bounding boxes
    :param img_shape: original image shape
    :param noise_size: bbox size noise factor
    :param noise_position: bbox position noise factor
    :param create_bbox_proba: probability of creating bboxes
    :param destroy_bbox_proba: probability of destroying bboxes
    :param k: mAP at k
    :return: noisy bboxes, F-score, IoU, mAP
    """
    # Add noise to GT depending on the noise parameters
    bboxes = u.add_noise_to_bboxes(gt, img_shape,
                                   noise_size=True,
                                   noise_size_factor=noise_size,
                                   noise_position=True,
                                   noise_position_factor=noise_position)
    # Randomly create and destroy bounding boxes depending
    # on the probability parameters
    bboxes = u.create_bboxes(bboxes, img_shape, prob=create_bbox_proba)
    bboxes = u.destroy_bboxes(bboxes, prob=destroy_bbox_proba)

    bboxTP, bboxFN, bboxFP = evalf.performance_accumulation_window(bboxes, gt)

    # Compute F-score of GT against the modified bboxes per frame
    # ToDo: add dependency on frame number
    fscore = u.fscore(bboxTP, bboxFN, bboxFP)

    # Compute IoU of GT against the modified bboxes per frame
    iou = list()
    for b, box in enumerate(gt):
        iou.append(u.bbox_iou(bboxes[b], gt[b]))

    # Compute mAP of GT against the modified bboxes per frame
    # (renamed from `map` to avoid shadowing the builtin)
    map_k = u.mapk(bboxes, gt, k)

    return (bboxes, fscore, iou, map_k)
def test(model, test_loader, config):
    """Test the model during training."""
    def truths_length(truth):
        for k in range(50):
            if truth[k][1] == 0:
                return k

    model.eval()
    num_classes = config['num_classes']
    anchors = config['anchors']
    num_anchors = len(anchors) // 2
    conf_thresh = config['conf_thresh']
    nms_thresh = config['nms_thresh']
    iou_thresh = config['iou_thresh']
    eps = 1e-5
    total = 0.
    proposals = 0.
    correct = 0.

    for batch_idx, (data, target) in enumerate(test_loader):
        data = data.cuda()
        # Variable(..., volatile=True) is deprecated; torch.no_grad() replaces it
        with torch.no_grad():
            output = model(data).data
        all_boxes = get_region_boxes(output, conf_thresh, num_classes,
                                     anchors, num_anchors)
        for i in range(output.size(0)):
            boxes = all_boxes[i]
            boxes = nms(boxes, nms_thresh)
            truths = target[i].view(-1, 5)
            num_gts = truths_length(truths)
            total += num_gts
            for l in range(len(boxes)):
                if boxes[l][4] > conf_thresh:
                    proposals += 1
            for l in range(num_gts):
                box_gt = [truths[l][1], truths[l][2], truths[l][3],
                          truths[l][4], 1., 1., truths[l][0]]
                best_iou = 0
                best_j = -1
                for j in range(len(boxes)):
                    iou = bbox_iou(box_gt, boxes[j], x1y1x2y2=False)
                    if iou > best_iou:
                        best_j = j
                        best_iou = iou
                if best_iou > iou_thresh and boxes[best_j][6] == box_gt[6]:
                    correct += 1

    precision = 1. * correct / (proposals + eps)
    recall = 1. * correct / (total + eps)
    fscore = 2. * precision * recall / (precision + recall + eps)
    print('precision: {}, recall: {}, fscore: {}'.format(precision, recall, fscore))
def get_target(self, target, anchors, in_w, in_h, ignore_threshold):
    bs = target.size(0)
    mask = torch.zeros(bs, self.num_anchors, in_h, in_w, requires_grad=False)
    noobj_mask = torch.ones(bs, self.num_anchors, in_h, in_w, requires_grad=False)
    tx = torch.zeros(bs, self.num_anchors, in_h, in_w, requires_grad=False)
    ty = torch.zeros(bs, self.num_anchors, in_h, in_w, requires_grad=False)
    tw = torch.zeros(bs, self.num_anchors, in_h, in_w, requires_grad=False)
    th = torch.zeros(bs, self.num_anchors, in_h, in_w, requires_grad=False)
    tconf = torch.zeros(bs, self.num_anchors, in_h, in_w, requires_grad=False)
    tcls = torch.zeros(bs, self.num_anchors, in_h, in_w, self.num_classes,
                       requires_grad=False)
    for b in range(bs):
        for t in range(target.shape[1]):
            if target[b, t].sum() == 0:
                continue
            # Convert to position relative to the feature map
            gx = target[b, t, 1] * in_w
            gy = target[b, t, 2] * in_h
            gw = target[b, t, 3] * in_w
            gh = target[b, t, 4] * in_h
            # Get grid box indices
            gi = int(gx)
            gj = int(gy)
            # Get shape of gt box
            gt_box = torch.FloatTensor(np.array([0, 0, gw, gh])).unsqueeze(0)
            # Get shape of anchor box
            anchor_shapes = torch.FloatTensor(
                np.concatenate((np.zeros((self.num_anchors, 2)),
                                np.array(anchors)), 1))
            # Calculate iou between gt and anchor shapes
            anch_ious = bbox_iou(gt_box, anchor_shapes)
            # Where the overlap is larger than threshold set mask to zero (ignore)
            noobj_mask[b, anch_ious > ignore_threshold, gj, gi] = 0
            # Find the best matching anchor box
            best_n = np.argmax(anch_ious)
            # Masks
            mask[b, best_n, gj, gi] = 1
            # Coordinates
            tx[b, best_n, gj, gi] = gx - gi
            ty[b, best_n, gj, gi] = gy - gj
            # Width and height
            tw[b, best_n, gj, gi] = math.log(gw / anchors[best_n][0] + 1e-16)
            th[b, best_n, gj, gi] = math.log(gh / anchors[best_n][1] + 1e-16)
            # Objectness
            tconf[b, best_n, gj, gi] = 1
            # One-hot encoding of label
            tcls[b, best_n, gj, gi, int(target[b, t, 0])] = 1
    return mask, noobj_mask, tx, ty, tw, th, tconf, tcls
def build_targets(target, anchors, num_anchors, nH, nW):
    nB = target.size(0)
    nA = num_anchors
    # integer division: anchors is a flat list of (w, h) pairs,
    # and anchor_step is used as a list index below
    anchor_step = len(anchors) // num_anchors
    mask = torch.zeros(nB, nA, nH, nW)
    tx = torch.zeros(nB, nA, nH, nW)
    ty = torch.zeros(nB, nA, nH, nW)
    tw = torch.zeros(nB, nA, nH, nW)
    th = torch.zeros(nB, nA, nH, nW)
    tconf = torch.zeros(nB, nA, nH, nW)
    tcls = torch.zeros(nB, nA, nH, nW)
    nGT = 0
    for b in range(nB):
        for t in range(50):
            if target[b][t * 5 + 1] == 0:
                break
            nGT = nGT + 1
            best_iou = 0.0
            best_n = -1
            i = int(target[b][t * 5 + 1] * nW)
            j = int(target[b][t * 5 + 2] * nH)
            w = target[b][t * 5 + 3] * nW
            h = target[b][t * 5 + 4] * nH
            gt_box = [0, 0, w, h]
            for n in range(nA):
                anchor_box = [0, 0, anchors[anchor_step * n],
                              anchors[anchor_step * n + 1]]
                iou = bbox_iou(anchor_box, gt_box, x1y1x2y2=False)
                if iou > best_iou:
                    best_iou = iou
                    best_n = n
            mask[b][best_n][j][i] = 1
            tx[b][best_n][j][i] = target[b][t * 5 + 1] * nW - i
            ty[b][best_n][j][i] = target[b][t * 5 + 2] * nH - j
            tw[b][best_n][j][i] = math.log(w / anchors[anchor_step * best_n])
            th[b][best_n][j][i] = math.log(h / anchors[anchor_step * best_n + 1])
            tconf[b][best_n][j][i] = best_iou
            tcls[b][best_n][j][i] = target[b][t * 5]
    return nGT, mask, tx, ty, tw, th, tconf, tcls
def encode(self, bboxes, labels, input_size):
    '''Encode target bounding boxes and class labels.

    We obey the Faster RCNN box coder:
        tx = (x - anchor_x) / anchor_w
        ty = (y - anchor_y) / anchor_h
        tw = log(w / anchor_w)
        th = log(h / anchor_h)

    Args:
      bboxes: (tensor) bounding boxes of (xmin,ymin,xmax,ymax), sized [#obj, 4].
      labels: (tensor) object class labels, sized [#obj,].
      input_size: (int/tuple) model input size of (w,h).

    Returns:
      reg_targets: (tensor) encoded bounding boxes, sized [#anchors, 4].
      cls_targets: (tensor) encoded class labels, sized [#anchors,].
    '''
    input_size = torch.Tensor(input_size)
    anchor_bboxes = self._get_anchor_boxes(input_size)
    # (xc, yc, w, h) -> (x1, y1, x2, y2)
    a = anchor_bboxes[:, :2]
    b = anchor_bboxes[:, 2:]
    anchor_bboxes_wh = torch.cat([a - b / 2, a + b / 2], 1)  # [#anchors, 4]

    ious = bbox_iou(anchor_bboxes_wh, bboxes)  # [#anchors, #obj] iou for each anchor and bbox
    max_ious, max_ids = ious.max(1)  # max (and index) for each row (anchor)
    bboxes = bboxes[max_ids]

    # (x1, y1, x2, y2) -> (xc, yc, w, h)
    a = bboxes[:, :2]
    b = bboxes[:, 2:]
    bboxes = torch.cat([(a + b) / 2, b - a + 1], 1)  # [#anchors, 4]

    loc_xy = (bboxes[:, :2] - anchor_bboxes[:, :2]) / anchor_bboxes[:, 2:]
    loc_wh = torch.log(bboxes[:, 2:] / anchor_bboxes[:, 2:])
    reg_targets = torch.cat([loc_xy, loc_wh], 1)
    cls_targets = labels[max_ids]

    cls_targets[max_ious < self.config.max_iou] = 0
    # for now just mark ignored (IoU in the min/max band) as -1
    cls_targets[(max_ious > self.config.min_iou)
                & (max_ious < self.config.max_iou)] = -1
    # every gt box keeps its single best anchor as a positive
    _, best_anchor_ids = ious.max(0)
    cls_targets[best_anchor_ids] = labels
    return reg_targets, cls_targets
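# The inverse transform is not part of the snippet above. A minimal decode
# sketch under the same Faster R-CNN box coder (name and layout are
# assumptions, not the original repository's API) applies the four equations
# in reverse:
import torch

def decode(reg_preds, anchor_bboxes):
    """Invert the box coder used in encode() above (a sketch).

    reg_preds holds (tx, ty, tw, th) per anchor; anchor_bboxes is
    (xc, yc, w, h). Returns corner-format (x1, y1, x2, y2) boxes.
    """
    xy = reg_preds[:, :2] * anchor_bboxes[:, 2:] + anchor_bboxes[:, :2]
    wh = torch.exp(reg_preds[:, 2:]) * anchor_bboxes[:, 2:]
    return torch.cat([xy - wh / 2, xy + wh / 2], 1)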
def get_item(self, img):
    # get image object
    image, image_size = self.get_img(img['file_name'])

    # construct output from object's x, y, w, h
    true_box_index = 0
    # annotation
    y_anntn = np.zeros(shape=(self.grid_width, self.grid_height,
                              self.max_grid_box, 4 + 1 + self.num_categories))
    true_box = np.zeros(shape=(1, 1, 1, self.max_image_box, 4))
    annIds = self.dataset.getAnnIds(imgIds=img['id'])
    annotations = self.dataset.loadAnns(annIds)
    for annotation in annotations:
        box = self.get_box(annotation, image_size=image_size)
        x, y, w, h = box
        grid_x = int(np.floor(x))
        grid_y = int(np.floor(y))

        # find the anchor that best predicts this box
        best_anchor = -1
        max_iou = -1
        for i in range(len(self.anchors)):
            anchor = self.anchors[i]
            iou = bbox_iou([0, 0, w, h], anchor)
            if max_iou < iou:
                best_anchor = i
                max_iou = iou

        cat_id = self.cat_ids[annotation['category_id']]
        y_anntn[grid_x, grid_y, best_anchor, 0:4] = box
        y_anntn[grid_x, grid_y, best_anchor, 4] = 1.0
        y_anntn[grid_x, grid_y, best_anchor, 5 + cat_id] = 1.0

        # assign the true box to b_batch
        true_box[0, 0, 0, true_box_index] = box
        true_box_index += 1
        true_box_index = true_box_index % self.max_image_box
    return image, y_anntn, true_box
def _calc_ious(self, anchor, bbox):
    # ious between the anchors and the gt boxes
    # (anchors extending past the image border have already been removed)
    ious = utils.bbox_iou(anchor, bbox)  # [n_anchor, n_bbox]; "closest" below means highest IoU
    # for each anchor: index of its closest bbox, [n_anchor,]
    argmax_ious = ious.argmax(axis=1)
    # for each anchor: IoU with its closest bbox, [n_anchor,]
    max_ious = ious[np.arange(ious.shape[0]), argmax_ious]
    # for each bbox: index of its closest anchor, [n_bbox,]
    gt_argmax_ious = ious.argmax(axis=0)
    # for each bbox: IoU with its closest anchor, [n_bbox,]
    gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])]
    # np.argmax only returns the first maximum it encounters, so this extra
    # step recovers every anchor index that attains a per-bbox maximum IoU
    # (we don't care which bbox each of those anchors is closest to)
    gt_argmax_ious = np.where(ious == gt_max_ious)[0]
    return argmax_ious, max_ious, gt_argmax_ious
def best_anchor_box(box):
    # find the anchor that best predicts this box
    best_anchor = -1
    max_iou = -1
    shifted_box = BoundingBox(0, 0, box[2], box[3])
    anchors = [
        BoundingBox(0, 0, config.ANCHORS[2 * i], config.ANCHORS[2 * i + 1])
        for i in range(len(config.ANCHORS) // 2)
    ]
    for i in range(len(anchors)):
        anchor = anchors[i]
        iou = bbox_iou(shifted_box, anchor)
        if max_iou < iou:
            best_anchor = i
            max_iou = iou
    return best_anchor
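# Many of the keras-yolo style generators in this collection assume a
# BoundBox/BoundingBox value type holding corner coordinates and a bbox_iou
# over two such boxes. A minimal sketch of that pair; the real class in those
# repositories also carries objectness and class scores, omitted here.
class BoundBox:
    def __init__(self, xmin, ymin, xmax, ymax):
        self.xmin, self.ymin = xmin, ymin
        self.xmax, self.ymax = xmax, ymax

def bbox_iou(box1, box2):
    """IoU between two BoundBox instances (a sketch)."""
    inter_w = max(0., min(box1.xmax, box2.xmax) - max(box1.xmin, box2.xmin))
    inter_h = max(0., min(box1.ymax, box2.ymax) - max(box1.ymin, box2.ymin))
    intersect = inter_w * inter_h
    area1 = (box1.xmax - box1.xmin) * (box1.ymax - box1.ymin)
    area2 = (box2.xmax - box2.xmin) * (box2.ymax - box2.ymin)
    union = area1 + area2 - intersect
    return intersect / union if union > 0 else 0.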
def get_ground_truth(coco, imgId):
    gt = np.zeros((grid_h, grid_w, num_box, 4 + 1 + num_classes), dtype=np.float32)
    annIds = coco.getAnnIds(imgIds=[imgId])
    annos = coco.loadAnns(ids=annIds)
    for anno in annos:
        category_id = anno['category_id']
        bx, by, bw, bh = anno['bbox']
        # the bbox values are scaled by the image size here
        # (the annotations are presumably stored normalized)
        bx = 1.0 * bx * image_w
        by = 1.0 * by * image_h
        bw = 1.0 * bw * image_w
        bh = 1.0 * bh * image_h
        center_x = bx + bw / 2.
        center_x = center_x / grid_size
        center_y = by + bh / 2.
        center_y = center_y / grid_size
        cell_x = int(np.clip(np.floor(center_x), 0.0, (grid_w - 1)))
        cell_y = int(np.clip(np.floor(center_y), 0.0, (grid_h - 1)))
        center_w = bw / grid_size
        center_h = bh / grid_size
        box = [center_x, center_y, center_w, center_h]

        # find the anchor that best predicts this box
        best_anchor = -1
        max_iou = -1
        shifted_box = BoundBox(0, 0, center_w, center_h)
        for i in range(len(anchor_boxes)):
            anchor = anchor_boxes[i]
            iou = bbox_iou(shifted_box, anchor)
            if max_iou < iou:
                best_anchor = i
                max_iou = iou

        # assign ground truth x, y, w, h, confidence and class probs
        gt[cell_y, cell_x, best_anchor, 0] = 1.0
        gt[cell_y, cell_x, best_anchor, 1:5] = box
        gt[cell_y, cell_x, best_anchor, 5 + catId2idx[category_id]] = 1.0
    return gt
def __init__(self, num_parts=24):
    self.default = -3
    self.num_parts = num_parts
    self.fs = sorted(os.listdir("data/breatheless"))
    self.data = {}

    # Sort by filename
    pkl_data = pkl.load(open("data/breatheless.pkl", "rb"))
    pkl_data = sorted(pkl_data, key=lambda d: d["file_name"])

    # Choose the object with the largest bounding box in the first frame
    best = {"i": 0, "area": 0}
    for i, bbox in enumerate(pkl_data[0]["pred_boxes_XYXY"]):
        x0, y0, x1, y1 = [e.item() for e in bbox]
        area = bbox_area(x0, y0, x1, y1)
        if area > best["area"]:
            best = {"i": i, "area": area}

    # Track it across frames by greedy IoU matching against the previous box
    prev_bbox = pkl_data[0]["pred_boxes_XYXY"][best["i"]]
    for d in pkl_data:
        best = {"i": 0, "iou": 0}
        for i, bbox in enumerate(d["pred_boxes_XYXY"]):
            iou = bbox_iou(prev_bbox, bbox)
            if iou > best["iou"]:
                best = {"i": i, "iou": iou}
        fname = d["file_name"].split("/")[-1]
        self.data[fname] = {
            "pred": d["pred_densepose"][best["i"]],
            "bbox": d["pred_boxes_XYXY"][best["i"]]
        }
        prev_bbox = d["pred_boxes_XYXY"][best["i"]]
def __call__(self, images, annotations, shapes, aug=True):
    # get image input size, change every 10 batches
    if aug:
        self.idx += 1
        net_h, net_w = self._get_net_size()
    else:
        net_h, net_w = (self.config['model']['input_size'],
                        self.config['model']['input_size'])
    base_grid_h, base_grid_w = net_h // self.down_sample, net_w // self.down_sample

    x_batch = np.zeros((self.batch_size, net_h, net_w, 3), dtype=np.float32)
    t_batch = np.zeros((self.batch_size, 1, 1, 1, self.max_box_per_image, 4),
                       dtype=np.float32)

    # initialize the inputs and the outputs
    yolo_1 = np.zeros((self.batch_size, 1 * base_grid_h, 1 * base_grid_w,
                       len(self.anchors) // 3, 4 + 1 + len(self.labels)),
                      dtype=np.float32)
    yolo_2 = np.zeros((self.batch_size, 2 * base_grid_h, 2 * base_grid_w,
                       len(self.anchors) // 3, 4 + 1 + len(self.labels)),
                      dtype=np.float32)
    yolo_3 = np.zeros((self.batch_size, 4 * base_grid_h, 4 * base_grid_w,
                       len(self.anchors) // 3, 4 + 1 + len(self.labels)),
                      dtype=np.float32)
    yolos = [yolo_3, yolo_2, yolo_1]

    instance_count = 0
    true_box_index = 0

    # do the logic to fill in the inputs and the output
    for img, ann, shape in zip(images, annotations, shapes):
        ann = json.loads(ann)
        img = cv2.resize(img, (shape[1], shape[0]))
        # augment input image and fix object's position and size
        if aug:
            img, all_objs = self._aug_image(img, ann, net_h, net_w)
        else:
            img, all_objs = self._raw_image(img, ann, net_h, net_w)

        for obj in all_objs:
            # find the best anchor box for this object
            max_anchor = None
            max_index = -1
            max_iou = -1
            # not only the max-iou anchor but every anchor above the
            # threshold counts as positive
            positive_anchors = []
            positive_threshold = 0.3
            shifted_box = BoundBox(0, 0,
                                   obj['xmax'] - obj['xmin'],
                                   obj['ymax'] - obj['ymin'])
            for i in range(len(self.anchors)):
                anchor = self.anchors[i]
                iou = bbox_iou(shifted_box, anchor)
                if max_iou < iou:
                    max_anchor = anchor
                    max_index = i
                    max_iou = iou
                if iou > positive_threshold:
                    positive_anchors.append([i, anchor])
            if not positive_anchors:
                positive_anchors.append([max_index, max_anchor])

            for max_index, max_anchor in positive_anchors:
                # determine the yolo to be responsible for this bounding box
                yolo = yolos[max_index // 3]
                grid_h, grid_w = yolo.shape[1:3]

                # determine the position of the bounding box on the grid
                center_x = .5 * (obj['xmin'] + obj['xmax'])
                center_x = center_x / float(net_w) * grid_w  # sigma(t_x) + c_x
                center_y = .5 * (obj['ymin'] + obj['ymax'])
                center_y = center_y / float(net_h) * grid_h  # sigma(t_y) + c_y

                # determine the sizes of the bounding box
                w = np.log((obj['xmax'] - obj['xmin']) / float(max_anchor.xmax))  # t_w
                h = np.log((obj['ymax'] - obj['ymin']) / float(max_anchor.ymax))  # t_h
                box = [center_x, center_y, w, h]

                # determine the index of the label
                obj_indx = self.labels.index(obj['name'])

                # determine the location of the cell responsible for this object
                grid_x = int(np.floor(center_x))
                grid_y = int(np.floor(center_y))

                # assign ground truth x, y, w, h, confidence and class probs to y_batch
                yolo[instance_count, grid_y, grid_x, max_index % 3] = 0
                yolo[instance_count, grid_y, grid_x, max_index % 3, 0:4] = box
                yolo[instance_count, grid_y, grid_x, max_index % 3, 4] = 1.
                yolo[instance_count, grid_y, grid_x, max_index % 3, 5 + obj_indx] = 1

                # assign the true box to t_batch
                true_box = [center_x, center_y,
                            obj['xmax'] - obj['xmin'],
                            obj['ymax'] - obj['ymin']]
                t_batch[instance_count, 0, 0, 0, true_box_index] = true_box
                true_box_index += 1
                true_box_index = true_box_index % self.max_box_per_image

        # assign input image to x_batch
        if aug and self.norm is not None:
            x_batch[instance_count] = self.norm(img)
        elif not aug:
            x_batch[instance_count] = img

        # increase instance counter in the current batch
        instance_count += 1

    output = [x_batch, t_batch, yolo_1, yolo_2, yolo_3]
    if not aug:
        output += [images, annotations, shapes]
    return output
def build_targets(self, pred_boxes, target, anchors, nA, nH, nW):
    nB = target.size(0)
    anchor_step = anchors.size(1)  # anchors[nA][anchor_step]
    noobj_mask = torch.ones(nB, nA, nH, nW)
    obj_mask = torch.zeros(nB, nA, nH, nW)
    coord_mask = torch.zeros(nB, nA, nH, nW)
    tcoord = torch.zeros(4, nB, nA, nH, nW)
    tconf = torch.zeros(nB, nA, nH, nW)
    tcls = torch.zeros(nB, nA, nH, nW, self.num_classes)

    nAnchors = nA * nH * nW
    nPixels = nH * nW
    nGT = 0
    nRecall = 0
    nRecall75 = 0

    # it works faster on CPU than on GPU.
    anchors = anchors.to("cpu")

    for b in range(nB):
        cur_pred_boxes = pred_boxes[b * nAnchors:(b + 1) * nAnchors].t()
        cur_ious = torch.zeros(nAnchors)
        tbox = target[b].view(-1, 5).to("cpu")
        for t in range(50):
            if tbox[t][1] == 0:
                break
            gx, gy = tbox[t][1] * nW, tbox[t][2] * nH
            gw, gh = tbox[t][3] * self.net_width, tbox[t][4] * self.net_height
            cur_gt_boxes = torch.FloatTensor([gx, gy, gw, gh]).repeat(nAnchors, 1).t()
            cur_ious = torch.max(
                cur_ious,
                multi_bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False))
        ignore_ix = (cur_ious > self.ignore_thresh).view(nA, nH, nW)
        noobj_mask[b][ignore_ix] = 0

        for t in range(50):
            if tbox[t][1] == 0:
                break
            nGT += 1
            gx, gy = tbox[t][1] * nW, tbox[t][2] * nH
            gw, gh = tbox[t][3] * self.net_width, tbox[t][4] * self.net_height
            gw, gh = gw.float(), gh.float()
            gi, gj = int(gx), int(gy)

            tmp_gt_boxes = torch.FloatTensor([0, 0, gw, gh]).repeat(nA, 1).t()
            anchor_boxes = torch.cat((torch.zeros(nA, anchor_step), anchors), 1).t()
            _, best_n = torch.max(
                multi_bbox_ious(anchor_boxes, tmp_gt_boxes, x1y1x2y2=False), 0)

            gt_box = torch.FloatTensor([gx, gy, gw, gh])
            pred_box = pred_boxes[b * nAnchors + best_n * nPixels + gj * nW + gi]
            iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False)

            obj_mask[b][best_n][gj][gi] = 1
            noobj_mask[b][best_n][gj][gi] = 0
            coord_mask[b][best_n][gj][gi] = 2. - tbox[t][3] * tbox[t][4]
            tcoord[0][b][best_n][gj][gi] = gx - gi
            tcoord[1][b][best_n][gj][gi] = gy - gj
            tcoord[2][b][best_n][gj][gi] = math.log(gw / anchors[best_n][0])
            tcoord[3][b][best_n][gj][gi] = math.log(gh / anchors[best_n][1])
            tcls[b][best_n][gj][gi][int(tbox[t][0])] = 1
            tconf[b][best_n][gj][gi] = iou if self.rescore else 1.
            if iou > 0.5:
                nRecall += 1
                if iou > 0.75:
                    nRecall75 += 1
    return nGT, nRecall, nRecall75, obj_mask, noobj_mask, coord_mask, tcoord, tconf, tcls
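# This variant and the one further down also rely on a multi_bbox_ious helper
# that compares two aligned sets of boxes laid out as [4, N] (note the .t()
# calls above) and returns one IoU per column. A minimal sketch of what it
# might compute, under those layout assumptions:
import torch

def multi_bbox_ious(boxes1, boxes2, x1y1x2y2=True):
    """Column-wise IoU between two [4, N] box sets (a sketch)."""
    if not x1y1x2y2:
        # rows are (cx, cy, w, h); convert to corner rows
        b1 = torch.stack([boxes1[0] - boxes1[2] / 2, boxes1[1] - boxes1[3] / 2,
                          boxes1[0] + boxes1[2] / 2, boxes1[1] + boxes1[3] / 2])
        b2 = torch.stack([boxes2[0] - boxes2[2] / 2, boxes2[1] - boxes2[3] / 2,
                          boxes2[0] + boxes2[2] / 2, boxes2[1] + boxes2[3] / 2])
    else:
        b1, b2 = boxes1, boxes2
    inter_w = (torch.min(b1[2], b2[2]) - torch.max(b1[0], b2[0])).clamp(min=0)
    inter_h = (torch.min(b1[3], b2[3]) - torch.max(b1[1], b2[1])).clamp(min=0)
    inter = inter_w * inter_h
    area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
    area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
    return inter / (area1 + area2 - inter + 1e-16)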
def __getitem__(self, idx):
    l_bound = idx * self.config['BATCH_SIZE']
    r_bound = (idx + 1) * self.config['BATCH_SIZE']
    if r_bound > len(self.images):
        r_bound = len(self.images)
        l_bound = r_bound - self.config['BATCH_SIZE']

    instance_count = 0
    x_batch = np.zeros((r_bound - l_bound, self.config['IMAGE_H'],
                        self.config['IMAGE_W'], 3))
    b_batch = np.zeros((r_bound - l_bound, 1, 1, 1,
                        self.config['TRUE_BOX_BUFFER'], 4))
    y_batch = np.zeros((r_bound - l_bound, self.config['GRID_H'],
                        self.config['GRID_W'], self.config['BOX'],
                        4 + 1 + len(self.config['LABELS'])))

    for train_instance in self.images[l_bound:r_bound]:
        img, all_objs = self.aug_image(train_instance, jitter=self.jitter)

        true_box_index = 0
        for obj in all_objs:
            if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin'] \
                    and obj['name'] in self.config['LABELS']:
                center_x = .5 * (obj['xmin'] + obj['xmax'])
                center_x = center_x / (float(self.config['IMAGE_W']) /
                                       self.config['GRID_W'])
                center_y = .5 * (obj['ymin'] + obj['ymax'])
                center_y = center_y / (float(self.config['IMAGE_H']) /
                                       self.config['GRID_H'])
                grid_x = int(np.floor(center_x))
                grid_y = int(np.floor(center_y))

                if grid_x < self.config['GRID_W'] and grid_y < self.config['GRID_H']:
                    obj_indx = self.config['LABELS'].index(obj['name'])
                    center_w = (obj['xmax'] - obj['xmin']) / (
                        float(self.config['IMAGE_W']) / self.config['GRID_W'])
                    center_h = (obj['ymax'] - obj['ymin']) / (
                        float(self.config['IMAGE_H']) / self.config['GRID_H'])
                    box = [center_x, center_y, center_w, center_h]

                    # find the anchor that best predicts this box
                    best_anchor = -1
                    max_iou = -1
                    shifted_box = BoundBox(0, 0, center_w, center_h)
                    for i in range(len(self.anchors)):
                        anchor = self.anchors[i]
                        iou = bbox_iou(shifted_box, anchor)
                        if max_iou < iou:
                            best_anchor = i
                            max_iou = iou

                    y_batch[instance_count, grid_y, grid_x, best_anchor, 0:4] = box
                    y_batch[instance_count, grid_y, grid_x, best_anchor, 4] = 1.
                    y_batch[instance_count, grid_y, grid_x, best_anchor, 5 + obj_indx] = 1

                    b_batch[instance_count, 0, 0, 0, true_box_index] = box
                    true_box_index += 1
                    true_box_index = true_box_index % self.config['TRUE_BOX_BUFFER']

        if self.norm is not None:
            x_batch[instance_count] = self.norm(img)
        else:
            # plot image and bounding boxes for sanity check
            for obj in all_objs:
                if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin']:
                    cv2.rectangle(img[:, :, ::-1],
                                  (obj['xmin'], obj['ymin']),
                                  (obj['xmax'], obj['ymax']), (255, 0, 0), 3)
                    cv2.putText(img[:, :, ::-1], obj['name'],
                                (obj['xmin'] + 2, obj['ymin'] + 12), 0,
                                1.2e-3 * img.shape[0], (0, 255, 0), 2)
            x_batch[instance_count] = img

        instance_count += 1

    return [x_batch, b_batch], y_batch
def get_generator(self):
    self.randomized_imgs = randomize_imgs(self.images)
    num_img = len(self.randomized_imgs)
    total_count = 0
    batch_count = 0

    x_batch = np.zeros((self.config['BATCH_SIZE'], self.config['IMAGE_H'],
                        self.config['IMAGE_W'], 3))  # input images
    b_batch = np.zeros((self.config['BATCH_SIZE'], 1, 1, 1,
                        self.config['TRUE_BOX_BUFFER'], 4))  # list of GT boxes
    y_batch = np.zeros((self.config['BATCH_SIZE'], self.config['GRID_H'],
                        self.config['GRID_W'], self.config['BOX'],
                        4 + 1 + 1))  # desired network output

    while True:
        if total_count < num_img:
            train_instance = self.randomized_imgs[total_count]
            # augment input image and fix object's position and size
            img, all_objs = self.aug_image(train_instance, jitter=self.jitter)

            # construct output from object's x, y, w, h
            true_box_index = 0
            for obj in all_objs:
                if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin'] \
                        and obj['name'] in self.config['LABELS']:
                    center_x = .5 * (obj['xmin'] + obj['xmax'])
                    center_x = center_x / (float(self.config['IMAGE_W']) /
                                           self.config['GRID_W'])
                    center_y = .5 * (obj['ymin'] + obj['ymax'])
                    center_y = center_y / (float(self.config['IMAGE_H']) /
                                           self.config['GRID_H'])
                    grid_x = int(np.floor(center_x))
                    grid_y = int(np.floor(center_y))

                    if grid_x < self.config['GRID_W'] and grid_y < self.config['GRID_H']:
                        obj_indx = self.config['LABELS'].index(obj['name'])
                        center_w = (obj['xmax'] - obj['xmin']) / (
                            float(self.config['IMAGE_W']) /
                            self.config['GRID_W'])  # unit: grid cell
                        center_h = (obj['ymax'] - obj['ymin']) / (
                            float(self.config['IMAGE_H']) /
                            self.config['GRID_H'])  # unit: grid cell
                        box = [center_x, center_y, center_w, center_h]

                        # find the anchor that best predicts this box
                        best_anchor = -1
                        max_iou = -1
                        shifted_box = BoundBox(0, 0, center_w, center_h)
                        for i in range(len(self.anchors)):
                            anchor = self.anchors[i]
                            iou = bbox_iou(shifted_box, anchor)
                            if max_iou < iou:
                                best_anchor = i
                                max_iou = iou

                        # assign ground truth x, y, w, h, confidence and class to y_batch
                        y_batch[batch_count, grid_y, grid_x, best_anchor, 0:4] = box
                        y_batch[batch_count, grid_y, grid_x, best_anchor, 4] = 1.
                        y_batch[batch_count, grid_y, grid_x, best_anchor, 5] = obj_indx

                        # assign the true box to b_batch
                        b_batch[batch_count, 0, 0, 0, true_box_index] = box
                        true_box_index += 1
                        true_box_index = true_box_index % self.config['TRUE_BOX_BUFFER']

            # assign input image to x_batch
            if self.norm:
                x_batch[batch_count] = normalize(img)
            else:
                x_batch[batch_count] = img
                # plot image and bounding boxes for sanity check
                for obj in all_objs:
                    if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin']:
                        cv2.rectangle(img[:, :, ::-1],
                                      (obj['xmin'], obj['ymin']),
                                      (obj['xmax'], obj['ymax']), (255, 0, 0), 3)
                        cv2.putText(img[:, :, ::-1], obj['name'],
                                    (obj['xmin'] + 2, obj['ymin'] + 12), 0,
                                    1.2e-3 * img.shape[0], (0, 255, 0), 2)
                plt.figure(figsize=(10, 10))
                plt.imshow(img)
                plt.show()

            # increase instance counter in current batch
            batch_count += 1
        total_count += 1
        if total_count >= num_img:
            total_count = 0
            if self.shuffle:
                self.randomized_imgs = randomize_imgs(self.images)

        if batch_count >= self.config['BATCH_SIZE']:
            yield [x_batch, b_batch], y_batch
            # reset all batch buffers with the same shapes as above
            x_batch = np.zeros((self.config['BATCH_SIZE'], self.config['IMAGE_H'],
                                self.config['IMAGE_W'], 3))
            b_batch = np.zeros((self.config['BATCH_SIZE'], 1, 1, 1,
                                self.config['TRUE_BOX_BUFFER'], 4))
            y_batch = np.zeros((self.config['BATCH_SIZE'], self.config['GRID_H'],
                                self.config['GRID_W'], self.config['BOX'],
                                4 + 1 + 1))
            batch_count = 0
            if self.shuffle:
                self.randomized_imgs = randomize_imgs(self.images)
def build_targets(self, pred_boxes, target, nH, nW):
    nB = target.size(0)
    nA = self.num_anchors
    noobj_mask = torch.ones(nB, nA, nH, nW)
    obj_mask = torch.zeros(nB, nA, nH, nW)
    coord_mask = torch.zeros(nB, nA, nH, nW)
    tcoord = torch.zeros(4, nB, nA, nH, nW)
    tconf = torch.zeros(nB, nA, nH, nW)
    tcls = torch.zeros(nB, nA, nH, nW)

    nAnchors = nA * nH * nW
    nPixels = nH * nW
    nGT = 0  # number of ground truth
    nRecall = 0
    # it works faster on CPU than on GPU.
    anchors = self.anchors.to("cpu")

    if self.seen < 12800:
        tcoord[0].fill_(0.5)
        tcoord[1].fill_(0.5)
        coord_mask.fill_(0.01)
        # initial w, h == 0 means log(1) == 0, i.e. anchor equals ground truth

    for b in range(nB):
        cur_pred_boxes = pred_boxes[b * nAnchors:(b + 1) * nAnchors].t()
        cur_ious = torch.zeros(nAnchors)
        tbox = target[b].view(-1, 5).to("cpu")
        for t in range(50):
            if tbox[t][1] == 0:
                break
            gx, gw = [i * nW for i in (tbox[t][1], tbox[t][3])]
            gy, gh = [i * nH for i in (tbox[t][2], tbox[t][4])]
            cur_gt_boxes = torch.FloatTensor([gx, gy, gw, gh]).repeat(nAnchors, 1).t()
            cur_ious = torch.max(
                cur_ious,
                multi_bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False))
        ignore_ix = (cur_ious > self.thresh).view(nA, nH, nW)
        noobj_mask[b][ignore_ix] = 0

        for t in range(50):
            if tbox[t][1] == 0:
                break
            nGT += 1
            gx, gw = [i * nW for i in (tbox[t][1], tbox[t][3])]
            gy, gh = [i * nH for i in (tbox[t][2], tbox[t][4])]
            gw, gh = gw.float(), gh.float()
            gi, gj = int(gx), int(gy)

            tmp_gt_boxes = torch.FloatTensor([0, 0, gw, gh]).repeat(nA, 1).t()
            anchor_boxes = torch.cat((torch.zeros(nA, 2), anchors), 1).t()
            tmp_ious = multi_bbox_ious(anchor_boxes, tmp_gt_boxes, x1y1x2y2=False)
            best_iou, best_n = torch.max(tmp_ious, 0)
            if self.anchor_step == 4:  # this part is not tested.
                tmp_ious_mask = (tmp_ious == best_iou)
                if tmp_ious_mask.sum() > 0:
                    gt_pos = torch.FloatTensor([gi, gj, gx, gy]).repeat(nA, 1).t()
                    an_pos = anchor_boxes[4:6]  # anchor_boxes consist of [0 0 aw ah ax ay]
                    dist = pow(((gt_pos[0] + an_pos[0]) - gt_pos[2]), 2) + \
                           pow(((gt_pos[1] + an_pos[1]) - gt_pos[3]), 2)
                    # set a large distance for the small ious (~ replaces the
                    # deprecated `1 - mask` idiom on bool tensors)
                    dist[~tmp_ious_mask] = 10000
                    _, best_n = torch.min(dist, 0)

            gt_box = torch.FloatTensor([gx, gy, gw, gh])
            pred_box = pred_boxes[b * nAnchors + best_n * nPixels + gj * nW + gi]
            iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False)

            obj_mask[b][best_n][gj][gi] = 1
            noobj_mask[b][best_n][gj][gi] = 0
            coord_mask[b][best_n][gj][gi] = 2. - tbox[t][3] * tbox[t][4]
            tcoord[0][b][best_n][gj][gi] = gx - gi
            tcoord[1][b][best_n][gj][gi] = gy - gj
            tcoord[2][b][best_n][gj][gi] = math.log(gw / anchors[best_n][0])
            tcoord[3][b][best_n][gj][gi] = math.log(gh / anchors[best_n][1])
            tcls[b][best_n][gj][gi] = tbox[t][0]
            tconf[b][best_n][gj][gi] = iou if self.rescore else 1.
            if iou > 0.5:
                nRecall += 1

    return nGT, nRecall, obj_mask, noobj_mask, coord_mask, tcoord, tconf, tcls
def __getitem__(self, idx):
    le = LabelEncoder()
    le.fit_transform(self.labels)

    x_batch = np.zeros((BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, self.n_channels))
    b_batch = np.zeros((BATCH_SIZE, 1, 1, 1, self.max_obj, 4))
    y_batch = np.zeros((BATCH_SIZE, GRID_H, GRID_W, self.nb_anchors,
                        4 + 1 + self.num_classes()))  # desired network output

    # current_batch = self.dataset[l_bound:r_bound]
    current_batch = self.dictionaries[idx * BATCH_SIZE:(idx + 1) * BATCH_SIZE]
    instance_num = 0
    for instance in current_batch:
        img, object_annotations = self.aug_image(instance, jitter=self.jitter)
        obj_num = 0
        # the bounding-box center is divided by the image width/height and the
        # grid width/height to get coordinates relative to one grid element
        for obj in object_annotations:
            if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin'] \
                    and obj['name'] in self.labels:
                # box center, scaled to grid units
                center_x = .5 * (obj['xmin'] + obj['xmax'])
                center_x = center_x / (float(IMAGE_SIZE) / GRID_W)  # in [0, GRID_W)
                center_y = .5 * (obj['ymin'] + obj['ymax'])
                center_y = center_y / (float(IMAGE_SIZE) / GRID_H)  # in [0, GRID_H)
                # grid element responsible for (center_x, center_y)
                grid_x = int(np.floor(center_x))
                grid_y = int(np.floor(center_y))

                if grid_x < GRID_W and grid_y < GRID_H:
                    center_w = (obj['xmax'] - obj['xmin']) / (float(IMAGE_SIZE) / GRID_W)
                    center_h = (obj['ymax'] - obj['ymin']) / (float(IMAGE_SIZE) / GRID_H)
                    box = [center_x, center_y, center_w, center_h]

                    # find the anchor that best predicts this box
                    best_anchor = -1
                    max_iou = -1
                    shifted_box = [0, 0, center_w, center_h]
                    for i in range(len(self.anchors)):
                        anchor = self.anchors[i]
                        iou = bbox_iou(shifted_box, anchor)
                        if max_iou < iou:
                            best_anchor = i
                            max_iou = iou

                    b_batch[instance_num, 0, 0, 0, obj_num] = box
                    y_batch[instance_num, grid_y, grid_x, best_anchor, 0:4] = box
                    y_batch[instance_num, grid_y, grid_x, best_anchor, 4] = 1.
                    y_batch[instance_num, grid_y, grid_x, best_anchor, 5] = 1
                    obj_num += 1
                    obj_num %= self.max_obj

        # normalize once per image, not once per object
        x_batch[instance_num] = self.normalize(img)
        instance_num += 1

    return [x_batch, b_batch], y_batch
def val(self):
    APs = []
    for i, (images, targets) in enumerate(self.val_data_loader):
        if i == 10:
            break
        images = self.to_var(images)
        targets = self.to_var(targets)
        with torch.no_grad():
            output = self.net(images)
            output = utils.non_max_suppression(output, 80,
                                               conf_thres=self.conf_thres,
                                               nms_thres=self.nms_thres)

        # Compute average precision for each sample
        for sample_i in range(targets.size(0)):
            correct = []
            # Get labels for sample where width is not zero (dummies)
            annotations = targets[sample_i, targets[sample_i, :, 3] != 0]
            # Extract detections
            detections = output[sample_i]
            if detections is None:
                # If there are no detections but there are annotations mark as zero AP
                if annotations.size(0) != 0:
                    APs.append(0)
                continue

            # Get detections sorted by decreasing confidence scores
            detections = detections[np.argsort(-detections[:, 4])]
            # If no annotations add number of detections as incorrect
            if annotations.size(0) == 0:
                correct.extend([0 for _ in range(len(detections))])
            else:
                # Extract target boxes as (x1, y1, x2, y2)
                target_boxes = torch.FloatTensor(annotations[:, 1:].shape)
                target_boxes[:, 0] = (annotations[:, 1] - annotations[:, 3] / 2)
                target_boxes[:, 1] = (annotations[:, 2] - annotations[:, 4] / 2)
                target_boxes[:, 2] = (annotations[:, 1] + annotations[:, 3] / 2)
                target_boxes[:, 3] = (annotations[:, 2] + annotations[:, 4] / 2)
                target_boxes *= self.image_size

                detected = []
                for *pred_bbox, conf, obj_conf, obj_pred in detections:
                    pred_bbox = torch.FloatTensor(pred_bbox).view(1, -1)
                    # Compute iou with target boxes
                    iou = utils.bbox_iou(pred_bbox, target_boxes)
                    # Extract index of largest overlap
                    best_i = np.argmax(iou)
                    # If overlap exceeds threshold and classification is
                    # correct mark as correct
                    if iou[best_i] > self.iou_thres \
                            and obj_pred == annotations[best_i, 0] \
                            and best_i not in detected:
                        correct.append(1)
                        detected.append(best_i)
                    else:
                        correct.append(0)

            # Extract true and false positives
            true_positives = np.array(correct)
            false_positives = 1 - true_positives
            # Compute cumulative false positives and true positives
            false_positives = np.cumsum(false_positives)
            true_positives = np.cumsum(true_positives)
            # Compute recall and precision at all ranks
            recall = true_positives / annotations.size(0) \
                if annotations.size(0) else true_positives
            precision = true_positives / np.maximum(
                true_positives + false_positives, np.finfo(np.float64).eps)
            # Compute average precision
            AP = utils.compute_ap(recall, precision)
            APs.append(AP)
    return np.mean(APs)
def __getitem__(self, idx):
    l_bound = idx * self.config['BATCH_SIZE']
    r_bound = (idx + 1) * self.config['BATCH_SIZE']
    if r_bound > len(self.images):
        r_bound = len(self.images)
        l_bound = r_bound - self.config['BATCH_SIZE']
    instance_count = 0

    if self.config['IMAGE_C'] == 3:
        x_batch = np.zeros((r_bound - l_bound, self.config['IMAGE_H'],
                            self.config['IMAGE_W'], 3))  # input images
    else:
        x_batch = np.zeros((r_bound - l_bound, self.config['IMAGE_H'],
                            self.config['IMAGE_W'], 1))
    b_batch = np.zeros((r_bound - l_bound, 1, 1, 1,
                        self.config['TRUE_BOX_BUFFER'], 4))  # list of GT boxes
    # y_batch = (batch_size, 13, 13, 5, 4+1+80): 13 x 13 is the grid, 5 is the
    # number of anchor boxes, 4 is x, y, w, h, plus confidence score + 80 classes
    y_batch = np.zeros((r_bound - l_bound, self.config['GRID_H'],
                        self.config['GRID_W'], self.config['BOX'],
                        4 + 1 + len(self.config['LABELS'])))  # desired network output

    for train_instance in self.images[l_bound:r_bound]:
        # augment input image and fix object's position and size
        img, all_objs = self.aug_image(train_instance, jitter=self.jitter)

        # construct output from object's x, y, w, h
        true_box_index = 0
        for obj in all_objs:
            if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin'] \
                    and obj['name'] in self.config['LABELS']:
                # center with respect to the original image
                center_x = .5 * (obj['xmin'] + obj['xmax'])
                # center_x, center_y are with respect to the 13 x 13 grid
                center_x = center_x / (float(self.config['IMAGE_W']) /
                                       self.config['GRID_W'])
                center_y = .5 * (obj['ymin'] + obj['ymax'])
                center_y = center_y / (float(self.config['IMAGE_H']) /
                                       self.config['GRID_H'])

                # find out which grid cell the centre of the true bounding box belongs to
                grid_x = int(np.floor(center_x))
                grid_y = int(np.floor(center_y))

                if grid_x < self.config['GRID_W'] and grid_y < self.config['GRID_H']:
                    obj_indx = self.config['LABELS'].index(obj['name'])
                    # image_w / grid_w, image_h / grid_h == size of each grid cell,
                    # e.g. in a 416 x 416 image the grid cell is 416/13 = 32 x 32,
                    # so center_w and center_h are fractions of a grid cell
                    center_w = (obj['xmax'] - obj['xmin']) / (
                        float(self.config['IMAGE_W']) / self.config['GRID_W'])  # unit: grid cell
                    center_h = (obj['ymax'] - obj['ymin']) / (
                        float(self.config['IMAGE_H']) / self.config['GRID_H'])  # unit: grid cell
                    # center of the true box on the 13 x 13 grid; width and
                    # height of the true box in grid-cell units
                    box = [center_x, center_y, center_w, center_h]

                    # find the anchor that best predicts this box
                    best_anchor = -1
                    max_iou = -1
                    shifted_box = BoundBox(0, 0, center_w, center_h)
                    for i in range(len(self.anchors)):
                        anchor = self.anchors[i]
                        iou = bbox_iou(shifted_box, anchor)
                        if max_iou < iou:
                            best_anchor = i
                            max_iou = iou

                    # assign ground truth x, y, w, h, confidence and class probs to y_batch
                    y_batch[instance_count, grid_y, grid_x, best_anchor, 0:4] = box
                    y_batch[instance_count, grid_y, grid_x, best_anchor, 4] = 1.
                    y_batch[instance_count, grid_y, grid_x, best_anchor, 5 + obj_indx] = 1

                    # assign the true box to b_batch
                    b_batch[instance_count, 0, 0, 0, true_box_index] = box
                    true_box_index += 1
                    true_box_index = true_box_index % self.config['TRUE_BOX_BUFFER']

        # assign input image to x_batch
        if self.norm is not None:
            x_batch[instance_count] = self.norm(img)
        else:
            # plot image and bounding boxes for sanity check
            for obj in all_objs:
                if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin']:
                    cv2.rectangle(img[:, :, ::-1], (obj['xmin'], obj['ymin']),
                                  (obj['xmax'], obj['ymax']), (255, 0, 0), 3)
                    cv2.putText(img[:, :, ::-1], obj['name'],
                                (obj['xmin'] + 2, obj['ymin'] + 12), 0,
                                1.2e-3 * img.shape[0], (0, 255, 0), 2)
            x_batch[instance_count] = img

        # increase instance counter in current batch
        instance_count += 1

    return [x_batch, b_batch], y_batch
def __getitem__(self, idx):
    train_instance = self.instances[idx]
    # augment input image and fix object's position and size
    augmented = self._aug_image(train_instance)
    net_h, net_w = augmented['image'].shape[0:2]
    x_batch = np.expand_dims(augmented['image'], 0)
    base_grid_h, base_grid_w = net_h // self.downsample, net_w // self.downsample

    gt_batch = np.zeros((1, 1, 1, 1, self.max_box_per_image, 4))  # list of groundtruth boxes

    # initialize the inputs and the outputs
    yolo_1 = np.zeros((1, 1 * base_grid_h, 1 * base_grid_w,
                       len(self.anchors) // 3,
                       4 + 1 + len(self.labels)))  # desired network output 1
    yolo_2 = np.zeros((1, 2 * base_grid_h, 2 * base_grid_w,
                       len(self.anchors) // 3,
                       4 + 1 + len(self.labels)))  # desired network output 2
    yolo_3 = np.zeros((1, 4 * base_grid_h, 4 * base_grid_w,
                       len(self.anchors) // 3,
                       4 + 1 + len(self.labels)))  # desired network output 3
    yolos = [yolo_3, yolo_2, yolo_1]

    dummy_yolo_1 = np.zeros((1, 1))
    dummy_yolo_2 = np.zeros((1, 1))
    dummy_yolo_3 = np.zeros((1, 1))

    true_box_index = 0
    for idx2, (xmin, ymin, xmax, ymax) in enumerate(augmented['bboxes']):
        xmin = int(xmin)
        xmax = int(xmax)
        ymin = int(ymin)
        ymax = int(ymax)

        # find the best anchor box for this object
        max_anchor = None
        max_index = -1
        max_iou = -1
        shifted_box = BoundBox(0, 0, xmax - xmin, ymax - ymin)
        for i in range(len(self.anchors)):
            anchor = self.anchors[i]
            iou = bbox_iou(shifted_box, anchor)
            if max_iou < iou:
                max_anchor = anchor
                max_index = i
                max_iou = iou

        # determine the yolo to be responsible for this bounding box
        yolo = yolos[max_index // 3]
        grid_h, grid_w = yolo.shape[1:3]

        # determine the position of the bounding box on the grid
        center_x = .5 * (xmin + xmax)
        center_x = center_x / float(net_w) * grid_w  # sigma(t_x) + c_x
        center_y = .5 * (ymin + ymax)
        center_y = center_y / float(net_h) * grid_h  # sigma(t_y) + c_y

        # determine the sizes of the bounding box
        w = np.log((xmax - xmin) / float(max_anchor.xmax))  # t_w
        h = np.log((ymax - ymin) / float(max_anchor.ymax))  # t_h
        box = [center_x, center_y, w, h]

        # determine the index of the label
        obj_indx = augmented['category_id'][idx2]

        # determine the location of the cell responsible for this object
        grid_x = int(np.floor(center_x))
        grid_y = int(np.floor(center_y))

        # assign ground truth x, y, w, h, confidence and class probs to y_batch
        yolo[0, grid_y, grid_x, max_index % 3] = 0
        yolo[0, grid_y, grid_x, max_index % 3, 0:4] = box
        yolo[0, grid_y, grid_x, max_index % 3, 4] = 1.
        yolo[0, grid_y, grid_x, max_index % 3, 5 + obj_indx] = 1

        # assign the true box to t_batch
        true_box = [center_x, center_y, xmax - xmin, ymax - ymin]
        gt_batch[0, 0, 0, 0, true_box_index] = true_box
        true_box_index += 1
        true_box_index = true_box_index % self.max_box_per_image

    # assign input image to x_batch
    x_batch = self.norm(x_batch)

    return [x_batch, gt_batch, yolo_1, yolo_2, yolo_3], \
           [dummy_yolo_1, dummy_yolo_2, dummy_yolo_3]
def __getitem__(self, idx):
    train_instance = self.images[idx]
    # augment input image and fix object's position and size
    augmented = self.aug_image(train_instance)
    x_batch = np.expand_dims(augmented['image'], 0)
    net_h, net_w = augmented['image'].shape[0:2]

    b_batch = np.zeros((1, 1, 1, 1, self.config['TRUE_BOX_BUFFER'], 4))  # list of GT boxes
    y_batch = np.zeros((1, net_h // 32, net_w // 32, self.config['BOX'],
                        4 + 1 + len(self.config['LABELS'])))  # desired network output

    # construct output from object's x, y, w, h
    true_box_index = 0
    for idx2, (xmin, ymin, xmax, ymax) in enumerate(augmented['bboxes']):
        if xmax > xmin and ymax > ymin \
                and augmented['category_id'][idx2] in self.config['LABELS']:
            center_x = .5 * (xmin + xmax)
            center_x = center_x / (float(self.config['IMAGE_W']) /
                                   self.config['GRID_W'])
            center_y = .5 * (ymin + ymax)
            center_y = center_y / (float(self.config['IMAGE_H']) /
                                   self.config['GRID_H'])
            grid_x = int(np.floor(center_x))
            grid_y = int(np.floor(center_y))

            if grid_x < self.config['GRID_W'] and grid_y < self.config['GRID_H']:
                obj_indx = augmented['category_id'][idx2]
                center_w = (xmax - xmin) / (float(self.config['IMAGE_W']) /
                                            self.config['GRID_W'])  # unit: grid cell
                center_h = (ymax - ymin) / (float(self.config['IMAGE_H']) /
                                            self.config['GRID_H'])  # unit: grid cell
                box = [center_x, center_y, center_w, center_h]

                # find the anchor that best predicts this box
                best_anchor = -1
                max_iou = -1
                shifted_box = BoundBox(0, 0, center_w, center_h)
                for i in range(len(self.anchors)):
                    anchor = self.anchors[i]
                    iou = bbox_iou(shifted_box, anchor)
                    if max_iou < iou:
                        best_anchor = i
                        max_iou = iou

                # assign ground truth x, y, w, h, confidence and class probs to y_batch
                y_batch[0, grid_y, grid_x, best_anchor, 0:4] = box
                y_batch[0, grid_y, grid_x, best_anchor, 4] = 1.
                y_batch[0, grid_y, grid_x, best_anchor, 5 + obj_indx] = 1

                # assign the true box to b_batch
                b_batch[0, 0, 0, 0, true_box_index] = box
                true_box_index += 1
                true_box_index = true_box_index % self.config['TRUE_BOX_BUFFER']

    # assign input image to x_batch
    x_batch = self.norm(x_batch)
    return [x_batch, b_batch], y_batch
def __getitem__(self, idx):
    # get image input size, change every 10 batches
    net_h, net_w = self._get_net_size(idx)
    base_grid_h, base_grid_w = net_h // self.downsample, net_w // self.downsample

    # determine the first and the last indices of the batch
    l_bound = idx * self.batch_size
    r_bound = (idx + 1) * self.batch_size
    if r_bound > len(self.instances):
        r_bound = len(self.instances)
        l_bound = r_bound - self.batch_size

    x_batch = np.zeros((r_bound - l_bound, net_h, net_w, 3))  # input images
    gt_batch = np.zeros((r_bound - l_bound, 1, 1, 1,
                         self.max_box_per_image, 4))  # list of groundtruth boxes

    # initialize the inputs and the outputs
    yolo_1 = np.zeros((r_bound - l_bound, 1 * base_grid_h, 1 * base_grid_w,
                       len(self.anchors) // 3,
                       4 + 1 + len(self.labels)))  # desired network output 1
    yolo_2 = np.zeros((r_bound - l_bound, 2 * base_grid_h, 2 * base_grid_w,
                       len(self.anchors) // 3,
                       4 + 1 + len(self.labels)))  # desired network output 2
    yolo_3 = np.zeros((r_bound - l_bound, 4 * base_grid_h, 4 * base_grid_w,
                       len(self.anchors) // 3,
                       4 + 1 + len(self.labels)))  # desired network output 3
    yolos = [yolo_3, yolo_2, yolo_1]

    dummy_yolo_1 = np.zeros((r_bound - l_bound, 1))
    dummy_yolo_2 = np.zeros((r_bound - l_bound, 1))
    dummy_yolo_3 = np.zeros((r_bound - l_bound, 1))

    instance_count = 0
    true_box_index = 0

    # do the logic to fill in the inputs and the output
    for train_instance in self.instances[l_bound:r_bound]:
        # augment input image and fix object's position and size
        img, all_objs = self._aug_image(train_instance, net_h, net_w)

        for obj in all_objs:
            # find the best anchor box for this object
            max_anchor = None
            max_index = -1
            max_iou = -1
            shifted_box = BoundBox(0, 0,
                                   obj['xmax'] - obj['xmin'],
                                   obj['ymax'] - obj['ymin'])
            for i in range(len(self.anchors)):
                anchor = self.anchors[i]
                iou = bbox_iou(shifted_box, anchor)
                if max_iou < iou:
                    max_anchor = anchor
                    max_index = i
                    max_iou = iou

            # determine the yolo to be responsible for this bounding box
            yolo = yolos[max_index // 3]
            grid_h, grid_w = yolo.shape[1:3]

            # determine the position of the bounding box on the grid
            center_x = .5 * (obj['xmin'] + obj['xmax'])
            center_x = center_x / float(net_w) * grid_w  # sigma(t_x) + c_x
            center_y = .5 * (obj['ymin'] + obj['ymax'])
            center_y = center_y / float(net_h) * grid_h  # sigma(t_y) + c_y

            # determine the sizes of the bounding box
            w = np.log((obj['xmax'] - obj['xmin']) / float(max_anchor.xmax))  # t_w
            h = np.log((obj['ymax'] - obj['ymin']) / float(max_anchor.ymax))  # t_h
            box = [center_x, center_y, w, h]

            # determine the index of the label
            obj_indx = self.labels.index(obj['name'])

            # determine the location of the cell responsible for this object
            grid_x = int(np.floor(center_x))
            grid_y = int(np.floor(center_y))

            # assign ground truth x, y, w, h, confidence and class probs to y_batch
            yolo[instance_count, grid_y, grid_x, max_index % 3] = 0
            yolo[instance_count, grid_y, grid_x, max_index % 3, 0:4] = box
            yolo[instance_count, grid_y, grid_x, max_index % 3, 4] = 1.
            yolo[instance_count, grid_y, grid_x, max_index % 3, 5 + obj_indx] = 1

            # assign the true box to t_batch
            true_box = [center_x, center_y,
                        obj['xmax'] - obj['xmin'],
                        obj['ymax'] - obj['ymin']]
            gt_batch[instance_count, 0, 0, 0, true_box_index] = true_box
            true_box_index += 1
            true_box_index = true_box_index % self.max_box_per_image

        # assign input image to x_batch
        if self.norm is not None:
            x_batch[instance_count] = self.norm(img)
        else:
            # plot image and bounding boxes for sanity check
            for obj in all_objs:
                cv2.rectangle(img, (obj['xmin'], obj['ymin']),
                              (obj['xmax'], obj['ymax']), (255, 0, 0), 3)
                cv2.putText(img, obj['name'],
                            (obj['xmin'] + 2, obj['ymin'] + 12), 0,
                            1.2e-3 * img.shape[0], (0, 255, 0), 2)
            x_batch[instance_count] = img

        # increase instance counter in the current batch
        instance_count += 1

    return [x_batch, gt_batch, yolo_1, yolo_2, yolo_3], \
           [dummy_yolo_1, dummy_yolo_2, dummy_yolo_3]
def build_target(self, pred_boxes, pred_conf, pred_cls, target, anchors,
                 num_anchors, num_classes, grid_size, ignore_thres, img_dim):
    """
    Args:
        pred_boxes: Tensor, size: (batchsize, num_anchors, grid_size, grid_size, 4)
        pred_conf: Tensor, size: (batchsize, num_anchors, grid_size, grid_size)
        pred_cls: Tensor, size: (batchsize, num_anchors, grid_size, grid_size, classes)
        target: Tensor, size: (batchsize, max_obj, 5), ground truth.
        anchors: list of (w, h) anchor shapes.
        num_anchors: int, number of anchors.
        num_classes: int, number of classes.
        grid_size: width/height of the feature map the prediction layer applies to.
        ignore_thres: anchor-IoU threshold above which background anchors are ignored.
        img_dim: int, input image's width/height.
    Return:
    """
    nB = target.size(0)  # batchsize
    nA = num_anchors
    nC = num_classes
    nG = grid_size
    mask = torch.zeros(nB, nA, nG, nG)
    conf_mask = torch.ones(nB, nA, nG, nG)
    tx = torch.zeros(nB, nA, nG, nG)
    ty = torch.zeros(nB, nA, nG, nG)
    tw = torch.zeros(nB, nA, nG, nG)
    th = torch.zeros(nB, nA, nG, nG)
    tconf = torch.ByteTensor(nB, nA, nG, nG).fill_(0)
    tcls = torch.ByteTensor(nB, nA, nG, nG, nC).fill_(0)

    nGT = 0
    nCorrect = 0
    for b in range(nB):
        for t in range(target.shape[1]):
            if target[b, t].sum() == 0:  # pad
                continue
            nGT += 1
            # Convert to position relative to the feature map
            gx = target[b, t, 1] * nG
            gy = target[b, t, 2] * nG
            gw = target[b, t, 3] * nG
            gh = target[b, t, 4] * nG
            # Get grid box indices
            gi = int(gx)
            gj = int(gy)
            # Get shape of gt box
            gt_box = torch.FloatTensor(np.array([0, 0, gw, gh])).unsqueeze(0)
            # Get shape of anchor box
            anchor_shapes = torch.FloatTensor(
                np.concatenate((np.zeros((len(anchors), 2)), np.array(anchors)), 1))
            # Calculate iou between gt and anchor shapes (1 gt against all anchors)
            anch_ious = bbox_iou(gt_box, anchor_shapes)
            # Where the overlap is larger than threshold set mask to zero (ignore)
            conf_mask[b, anch_ious > ignore_thres, gj, gi] = 0
            # Find the best matching anchor box
            best_n = np.argmax(anch_ious)
            # Get ground truth box
            gt_box = torch.FloatTensor(np.array([gx, gy, gw, gh])).unsqueeze(0)
            # Get the best prediction
            pred_box = pred_boxes[b, best_n, gj, gi].unsqueeze(0)
            # Masks
            mask[b, best_n, gj, gi] = 1
            conf_mask[b, best_n, gj, gi] = 1
            # Coordinates
            tx[b, best_n, gj, gi] = gx - gi
            ty[b, best_n, gj, gi] = gy - gj
            # Width and height
            tw[b, best_n, gj, gi] = math.log(gw / anchors[best_n][0] + 1e-16)
            th[b, best_n, gj, gi] = math.log(gh / anchors[best_n][1] + 1e-16)
            # One-hot encoding of label
            target_label = int(target[b, t, 0])
            tcls[b, best_n, gj, gi, target_label] = 1
            tconf[b, best_n, gj, gi] = 1

            # Calculate iou between ground truth and best matching prediction
            iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False)
            pred_label = torch.argmax(pred_cls[b, best_n, gj, gi])
            score = pred_conf[b, best_n, gj, gi]
            if iou > 0.5 and pred_label == target_label and score > 0.5:
                nCorrect += 1
    return nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls
def __getitem__(self, idx):
    l_bound = idx * self.config['BATCH_SIZE']
    r_bound = (idx + 1) * self.config['BATCH_SIZE']
    if r_bound > len(self.images):
        r_bound = len(self.images)
        l_bound = r_bound - self.config['BATCH_SIZE']
    instance_count = 0

    x_batch = np.zeros((r_bound - l_bound, self.config['IMAGE_H'],
                        self.config['IMAGE_W'], 3))  # input images
    b_batch = np.zeros((r_bound - l_bound, 1, 1, 1,
                        self.config['TRUE_BOX_BUFFER'], 4))  # list of GT boxes
    y_batch = np.zeros((r_bound - l_bound, self.config['GRID_H'],
                        self.config['GRID_W'], self.config['BOX'],
                        4 + 1 + len(self.config['LABELS'])))  # desired network output

    for train_instance in self.images[l_bound:r_bound]:
        # augment input image and fix object's position and size
        img, all_objs = self.aug_image(train_instance, jitter=self.jitter)

        # construct output from object's x, y, w, h
        true_box_index = 0
        for obj in all_objs:
            if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin'] \
                    and obj['name'] in self.config['LABELS']:
                center_x = .5 * (obj['xmin'] + obj['xmax'])
                center_x = center_x / (float(self.config['IMAGE_W']) /
                                       self.config['GRID_W'])
                center_y = .5 * (obj['ymin'] + obj['ymax'])
                center_y = center_y / (float(self.config['IMAGE_H']) /
                                       self.config['GRID_H'])
                grid_x = int(np.floor(center_x))
                grid_y = int(np.floor(center_y))

                if grid_x < self.config['GRID_W'] and grid_y < self.config['GRID_H']:
                    obj_indx = self.config['LABELS'].index(obj['name'])
                    center_w = (obj['xmax'] - obj['xmin']) / (
                        float(self.config['IMAGE_W']) / self.config['GRID_W'])  # unit: grid cell
                    center_h = (obj['ymax'] - obj['ymin']) / (
                        float(self.config['IMAGE_H']) / self.config['GRID_H'])  # unit: grid cell
                    box = [center_x, center_y, center_w, center_h]

                    # find the anchor that best predicts this box
                    best_anchor = -1
                    max_iou = -1
                    shifted_box = BoundBox(0, 0, center_w, center_h)
                    for i in range(len(self.anchors)):
                        anchor = self.anchors[i]
                        iou = bbox_iou(shifted_box, anchor)
                        if max_iou < iou:
                            best_anchor = i
                            max_iou = iou

                    # assign ground truth x, y, w, h, confidence and class probs to y_batch
                    y_batch[instance_count, grid_y, grid_x, best_anchor, 0:4] = box
                    y_batch[instance_count, grid_y, grid_x, best_anchor, 4] = 1.
                    y_batch[instance_count, grid_y, grid_x, best_anchor, 5 + obj_indx] = 1

                    # assign the true box to b_batch
                    b_batch[instance_count, 0, 0, 0, true_box_index] = box
                    true_box_index += 1
                    true_box_index = true_box_index % self.config['TRUE_BOX_BUFFER']

        # assign input image to x_batch
        if self.norm is not None:
            x_batch[instance_count] = self.norm(img)
        else:
            # plot image and bounding boxes for sanity check
            for obj in all_objs:
                if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin']:
                    cv2.rectangle(img[:, :, ::-1], (obj['xmin'], obj['ymin']),
                                  (obj['xmax'], obj['ymax']), (255, 0, 0), 3)
                    cv2.putText(img[:, :, ::-1], obj['name'],
                                (obj['xmin'] + 2, obj['ymin'] + 12), 0,
                                1.2e-3 * img.shape[0], (0, 255, 0), 2)
            x_batch[instance_count] = img

        # increase instance counter in current batch
        instance_count += 1

    return [x_batch, b_batch], y_batch
def __getitem__(self, idx):
    l_bound = idx * self.config['BATCH_SIZE']
    r_bound = (idx + 1) * self.config['BATCH_SIZE']

    if r_bound > len(self.images):
        r_bound = len(self.images)
        l_bound = r_bound - self.config['BATCH_SIZE']

    instance_count = 0

    # input images
    x_batch = np.zeros((r_bound - l_bound, self.config['IMAGE_H'],
                        self.config['IMAGE_W'], 3))
    # list of self.config['TRUE_BOX_BUFFER'] GT boxes
    b_batch = np.zeros((r_bound - l_bound, 1, 1, 1,
                        self.config['TRUE_BOX_BUFFER'], 4))
    # desired network output: box (4) + conf (1) + pose (3) + class probs
    y_batch = np.zeros((r_bound - l_bound, self.config['GRID_H'],
                        self.config['GRID_W'], self.config['BOX'],
                        4 + 1 + 3 + self.config['CLASS']))

    for train_instance in self.images[l_bound:r_bound]:
        # augment input image and fix object's position and size
        img, all_objs = self.aug_image(train_instance, jitter=self.jitter)

        # construct output from object's x, y, w, h
        true_box_index = 0

        for obj in all_objs:
            if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin'] \
                    and obj['name'] in self.config['LABELS']:
                center_x = .5 * (obj['xmin'] + obj['xmax'])
                center_x = center_x / (float(self.config['IMAGE_W']) / self.config['GRID_W'])
                center_y = .5 * (obj['ymin'] + obj['ymax'])
                center_y = center_y / (float(self.config['IMAGE_H']) / self.config['GRID_H'])

                grid_x = int(np.floor(center_x))
                grid_y = int(np.floor(center_y))

                if grid_x < self.config['GRID_W'] and grid_y < self.config['GRID_H']:
                    obj_indx = self.config['LABELS'].index(obj['name'])

                    center_w = (obj['xmax'] - obj['xmin']) / (float(self.config['IMAGE_W']) / self.config['GRID_W'])  # unit: grid cell
                    center_h = (obj['ymax'] - obj['ymin']) / (float(self.config['IMAGE_H']) / self.config['GRID_H'])  # unit: grid cell

                    box = [center_x, center_y, center_w, center_h]

                    # find the anchor that best predicts this box
                    best_anchor = -1
                    max_iou = -1

                    shifted_box = BoundBox(0, 0, center_w, center_h)
                    for i in range(len(self.anchors)):
                        anchor = self.anchors[i]
                        iou = bbox_iou(shifted_box, anchor)
                        if max_iou < iou:
                            best_anchor = i
                            max_iou = iou

                    # assign ground truth x, y, w, h, confidence and class probs to y_batch
                    y_batch[instance_count, grid_y, grid_x, best_anchor, 0:4] = box
                    y_batch[instance_count, grid_y, grid_x, best_anchor, 4] = 1.
                    y_batch[instance_count, grid_y, grid_x, best_anchor, 5 + obj_indx] = 1
                    # pose targets; note the 6: slice holds exactly three
                    # values only when self.config['CLASS'] == 1
                    y_batch[instance_count, grid_y, grid_x, best_anchor, 6:] = [
                        obj['pose_x'], obj['pose_y'], obj['pose_z']]

                    # assign the true box to b_batch
                    b_batch[instance_count, 0, 0, 0, true_box_index] = box

                    true_box_index += 1
                    true_box_index = true_box_index % self.config['TRUE_BOX_BUFFER']

        # assign input image to x_batch
        if self.norm is not None:
            x_batch[instance_count] = self.norm(img)
        else:
            # plot image and bounding boxes for sanity check
            for obj in all_objs:
                if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin']:
                    cv2.rectangle(img[:, :, ::-1], (obj['xmin'], obj['ymin']),
                                  (obj['xmax'], obj['ymax']), (255, 0, 0), 3)
                    cv2.putText(img[:, :, ::-1], obj['name'],
                                (obj['xmin'] + 2, obj['ymin'] + 12),
                                0, 1.2e-3 * img.shape[0], (0, 255, 0), 2)
            x_batch[instance_count] = img

        # increase instance counter in current batch
        instance_count += 1

    return [x_batch, b_batch], y_batch
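# How a generator like the ones above is typically consumed during training.
# This is a hypothetical sketch: BatchGenerator, normalize, train_images,
# valid_images, config, and model are assumed names, and the generator is
# assumed to implement __len__ (e.g. as a keras.utils.Sequence).
train_generator = BatchGenerator(train_images, config, norm=normalize)
valid_generator = BatchGenerator(valid_images, config, norm=normalize, jitter=False)

model.fit_generator(
    generator=train_generator,            # yields ([x_batch, b_batch], y_batch)
    steps_per_epoch=len(train_generator),
    epochs=50,
    validation_data=valid_generator,
    validation_steps=len(valid_generator),
)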
def __call__(self, roi, bbox, label):  # loc_normalize_mean, loc_normalize_std (currently unused)
    """Assigns ground truth to sampled proposals.

    This function samples a total of :obj:`self.n_sample` RoIs from the
    combination of :obj:`roi` and :obj:`bbox`. The RoIs are assigned the
    ground truth class labels as well as bounding box offsets and scales to
    match the ground truth bounding boxes. As many as
    :obj:`pos_ratio * self.n_sample` RoIs are sampled as foregrounds.

    Offsets and scales of bounding boxes are calculated using
    :func:`model.utils.bbox_tools.bbox2loc`. The types of the input arrays
    and the output arrays are the same.

    Here are notations.

    * :math:`S` is the total number of sampled RoIs, which equals
      :obj:`self.n_sample`.
    * :math:`L` is the number of object classes, possibly including the
      background.

    Args:
        roi (array): Region of Interests (RoIs) from which we sample.
            Its shape is :math:`(R, 4)`.
        bbox (array): The coordinates of ground truth bounding boxes.
            Its shape is :math:`(R', 4)`.
        label (array): Ground truth bounding box labels. Its shape
            is :math:`(R',)`. Its range is :math:`[0, L - 1]`, where
            :math:`L` is the number of foreground classes.
        loc_normalize_mean (tuple of four floats): Mean values to normalize
            coordinates of bounding boxes.
        loc_normalize_std (tuple of four floats): Standard deviation of
            the coordinates of bounding boxes.

    Returns:
        (array, array, array):

        * **sample_roi**: Regions of interests that are sampled.
          Its shape is :math:`(S, 4)`.
        * **gt_roi_loc**: Offsets and scales to match the sampled RoIs to
          the ground truth bounding boxes. Its shape is :math:`(S, 4)`.
        * **gt_roi_label**: Labels assigned to sampled RoIs. Its shape is
          :math:`(S,)`. Its range is :math:`[0, L]`. The label with
          value 0 is the background.
    """
    n_bbox, _ = bbox.shape

    # roi: candidate regions produced by the RPN; bbox: ground truth.
    # Note that the ground-truth boxes can themselves serve as training
    # samples, which is why roi and bbox may be concatenated here.
    # roi = np.concatenate((roi, bbox), axis=0)

    # number of positive samples to draw
    pos_max_num = int(np.round(self.n_sample * self.pos_ratio))

    # IoU between every roi and every bbox
    iou = utils.bbox_iou(roi, bbox)
    # index of the best-overlapping bbox for each roi
    gt_assignment = iou.argmax(axis=1)
    # the corresponding maximum IoU value for each roi
    max_iou = iou.max(axis=1)
    # class label for each roi; 0 is reserved for background,
    # so all foreground labels are shifted by +1
    gt_roi_label = label[gt_assignment] + 1
    # At this point every roi (including any appended bboxes) has a max-IoU
    # value and a class label. Labels are assigned before any IoU
    # thresholding; whether a roi actually overlaps its bbox well enough is
    # decided below.

    # Select foreground RoIs as those with >= pos_iou_thresh IoU.
    # (Why is the positive ratio set so low? It is only 0.25.)
    pos_index = np.where(max_iou >= self.pos_iou_thresh)[0]
    if len(pos_index) > pos_max_num:
        pos_index = np.random.choice(pos_index, size=pos_max_num, replace=False)
    pos_num = len(pos_index)

    # Select background RoIs as those within
    # [neg_iou_thresh_lo, neg_iou_thresh_hi); here that interval is
    # [0, 0.5), and 0.5 is a fairly high upper bound.
    neg_index = np.where((max_iou < self.neg_iou_thresh_hi)
                         & (max_iou >= self.neg_iou_thresh_lo))[0]
    neg_max_num = self.n_sample - pos_num
    if len(neg_index) > neg_max_num:
        neg_index = np.random.choice(neg_index, size=neg_max_num, replace=False)

    # The indices that we're selecting (both positive and negative);
    # positives keep their class label, negatives are set to 0.
    keep_index = np.append(pos_index, neg_index)
    gt_roi_label = gt_roi_label[keep_index]
    gt_roi_label[len(pos_index):] = 0  # negative labels --> 0
    sample_roi = roi[keep_index]

    # Compute the four offsets/scales used as the localization regression
    # ground truth, matching the sampled RoIs to their assigned GT boxes.
    gt_roi_loc = utils.bbox2loc(sample_roi, bbox[gt_assignment[keep_index]])
    # gt_roi_loc = (gt_roi_loc - loc_normalize_mean) / loc_normalize_std

    # Note: this does not guarantee exactly n_sample RoIs are returned; in
    # extreme cases many RoIs qualify as neither positive nor negative.
    return sample_roi, gt_roi_loc, gt_roi_label
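# For reference, the offset computation delegated to utils.bbox2loc follows
# the standard Faster R-CNN parameterization
# t = ((gy - py) / ph, (gx - px) / pw, log(gh / ph), log(gw / pw)).
# A minimal NumPy sketch, assuming the (ymin, xmin, ymax, xmax) box layout
# common in simple-faster-rcnn-style code; treat it as illustrative, not the
# exact implementation behind utils.bbox2loc.
import numpy as np

def bbox2loc(src_bbox, dst_bbox):
    # Work in float so the eps guard and log below are well defined.
    src_bbox = src_bbox.astype(np.float32, copy=False)
    dst_bbox = dst_bbox.astype(np.float32, copy=False)

    # Centers and sizes of the source (sampled RoI) boxes.
    height = src_bbox[:, 2] - src_bbox[:, 0]
    width = src_bbox[:, 3] - src_bbox[:, 1]
    ctr_y = src_bbox[:, 0] + 0.5 * height
    ctr_x = src_bbox[:, 1] + 0.5 * width

    # Centers and sizes of the destination (ground-truth) boxes.
    base_height = dst_bbox[:, 2] - dst_bbox[:, 0]
    base_width = dst_bbox[:, 3] - dst_bbox[:, 1]
    base_ctr_y = dst_bbox[:, 0] + 0.5 * base_height
    base_ctr_x = dst_bbox[:, 1] + 0.5 * base_width

    # Guard against zero-sized proposals before dividing / taking logs.
    eps = np.finfo(height.dtype).eps
    height = np.maximum(height, eps)
    width = np.maximum(width, eps)

    dy = (base_ctr_y - ctr_y) / height
    dx = (base_ctr_x - ctr_x) / width
    dh = np.log(base_height / height)
    dw = np.log(base_width / width)

    return np.stack((dy, dx, dh, dw), axis=1)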