def forward(self, x, boxes, transformer):
    """
    Arguments:
        x (Tensor): per-box features fed to the recognition decoder
        boxes (list[BoxList]): bounding boxes that are used as
            reference, one for each image
    Returns:
        results (list[BoxList]): one BoxList for each image, containing
            the extra field `word_probs`
    """
    # split the flat per-box features back into per-image groups
    boxes_per_image = [len(box) for box in boxes]
    word_probs = x.split(boxes_per_image, dim=0)

    results = []
    for x_feature, box in zip(word_probs, boxes):
        bbox = RBoxList(box.bbox, box.size, mode="xywha")
        # autoregressively decode the word for every box in this image
        predict_prob = greedy_decode(transformer, x_feature, self.src_mask,
                                     self.max_step)
        for field in box.fields():
            bbox.add_field(field, box.get_field(field))
        bbox.add_field("word_probs", predict_prob)
        results.append(bbox)
    return results
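# For reference, a minimal sketch of what an autoregressive greedy decode
# like the `greedy_decode` call above typically does. The `model.decode` /
# `model.generator` interface and the start symbol are assumptions for
# illustration, not the repository's actual API.
def greedy_decode_sketch(model, memory, src_mask, max_len, start_symbol=1):
    import torch
    # start every sequence with the (assumed) start symbol
    ys = torch.full((memory.size(0), 1), start_symbol, dtype=torch.long,
                    device=memory.device)
    probs = []
    for _ in range(max_len - 1):
        out = model.decode(memory, src_mask, ys, None)  # assumed signature
        prob = model.generator(out[:, -1])              # assumed signature
        next_word = prob.argmax(dim=-1, keepdim=True)
        ys = torch.cat([ys, next_word], dim=1)
        probs.append(prob)
    # per-step class probabilities, shaped (B, max_len - 1, vocab)
    return torch.stack(probs, dim=1)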
def __getitem__(self, index):
    if _DEBUG:
        index = 0
    # round-robin over the databases, then index within the chosen one
    anno = self.annobase[index % self.database_num][
        (index // self.database_num) % len(self.annobase[index % self.database_num])]
    im_path = anno['image']
    img = Image.open(im_path).convert("RGB")

    # encode the ground-truth words, then split the flat label buffer
    # back into one label sequence per word
    text, text_len = self.wk_converter.encode(anno['gt_words'])
    text_label_split = []
    off_cnt = 0
    mx_len = np.max(text_len)
    word_num = len(text_len)
    for i in range(len(text_len)):
        text_label_split.append(text[off_cnt:off_cnt + text_len[i]])
        off_cnt += text_len[i]

    # pad every word sequence to the maximum length in this image
    padding_words = np.zeros((word_num, mx_len))
    for i in range(word_num):
        padding_words[i][:text_len[i]] = text_label_split[i]

    if anno["boxes"].shape[0] > 0:
        target = RBoxList(torch.from_numpy(anno["boxes"]),
                          (anno['width'], anno['height']), mode="xywha")
        target.add_field("labels", torch.from_numpy(anno["gt_classes"]))
        target.add_field(
            "difficult",
            torch.tensor([0 for i in range(len(anno["gt_classes"]))]))
        target.add_field("words", torch.from_numpy(padding_words))
        target.add_field("word_length", torch.tensor(text_len))
        target = target.clip_to_image(remove_empty=True)
    else:
        target = torch.from_numpy(padding_words)

    if self.transforms is not None:
        img, target = self.transforms(img, target)
    if _DEBUG:
        self.show_boxes(img, target)
    return img, target, index
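# A toy illustration (plain NumPy, made-up label ids) of the conversion above
# from one flat label buffer plus per-word lengths to a padded matrix.
def _demo_pad_words():
    import numpy as np
    text = np.array([3, 7, 2, 9, 9, 1, 4, 5])  # two words, concatenated
    text_len = np.array([3, 5])
    padded = np.zeros((len(text_len), text_len.max()))
    off = 0
    for i, n in enumerate(text_len):
        padded[i, :n] = text[off:off + n]
        off += n
    # padded == [[3., 7., 2., 0., 0.],
    #            [9., 9., 1., 4., 5.]]
    return padded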
def forward(self, x, boxes):
    """
    Arguments:
        x (Tensor): the word logits, shaped [T, B, C]
        boxes (list[BoxList]): bounding boxes that are used as
            reference, one for each image
    Returns:
        results (list[BoxList]): one BoxList for each image, containing
            the extra field `word_probs`
    """
    # [T, B, C] -> [B, T, C], then softmax over the character classes
    word_probs = x.permute(1, 0, 2).softmax(2)

    # select the word probabilities corresponding to the predicted boxes
    num_words = word_probs.shape[0]
    labels = [bbox.get_field("labels") for bbox in boxes]
    labels = torch.cat(labels)
    index = torch.arange(num_words, device=labels.device)
    word_probs = word_probs[index][:, None]

    # split back into per-image groups
    boxes_per_image = [len(box) for box in boxes]
    word_probs = word_probs.split(boxes_per_image, dim=0)

    results = []
    for prob, box in zip(word_probs, boxes):
        bbox = RBoxList(box.bbox, box.size, mode="xywha")
        for field in box.fields():
            bbox.add_field(field, box.get_field(field))
        bbox.add_field("word_probs", prob)
        results.append(bbox)
    return results
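# A minimal, self-contained example (hypothetical shapes) of how
# `Tensor.split` regroups the flat per-box predictions into per-image chunks.
def _demo_split_per_image():
    import torch
    word_probs = torch.randn(7, 1, 32, 97)  # 7 boxes across the whole batch
    boxes_per_image = [3, 4]                # image 0 has 3 boxes, image 1 has 4
    per_image = word_probs.split(boxes_per_image, dim=0)
    assert per_image[0].shape[0] == 3 and per_image[1].shape[0] == 4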
def __getitem__(self, index):
    anno = self.annobase[index]
    im_path = anno['image']
    img = Image.open(im_path).convert("RGB")

    target = RBoxList(torch.from_numpy(anno["boxes"]),
                      (anno['width'], anno['height']), mode="xywha")
    target.add_field("labels", torch.from_numpy(anno["gt_classes"]))
    target.add_field(
        "difficult",
        torch.tensor([0 for i in range(len(anno["gt_classes"]))]))

    # SegmentationMask expects one flat coordinate list per polygon
    masks = [
        np.array(mask).reshape(1, -1).tolist() for mask in anno["polys"]
    ]
    masks = SegmentationMask(masks, img.size)
    target.add_field("masks", masks)
    target = target.clip_to_image(remove_empty=True)

    if self.transforms is not None:
        img, target = self.transforms(img, target)
    if _DEBUG:
        if target is not None:
            self.show_boxes(img, target)
    return img, target, index
def __getitem__(self, idx):
    img_name = self.id_to_img_map[idx]
    img = utils.pil_load_img(os.path.join(self.root, img_name))
    anno = utils.read_anno(self.annotations, img_name)

    # filter out illegible instances
    anno = [obj for obj in anno if not obj['illegibility']]

    # bounding boxes; the reshape guards against images with no boxes
    boxes = [
        utils.generate_rbox(obj["points"], np.array(img).shape[:2])
        for obj in anno
    ]
    boxes = torch.as_tensor(boxes).reshape(-1, 5)
    target = RBoxList(boxes, img.size, mode="xywha")

    # classes: every kept instance is text (class 1)
    classes = torch.tensor([1] * len(anno))
    target.add_field("labels", classes)
    target.add_field("difficult",
                     torch.tensor([0 for i in range(len(classes))]))

    # masks
    masks = [obj["points"].reshape((1, -1)).tolist() for obj in anno]
    masks = SegmentationMask(masks, img.size)
    target.add_field("masks", masks)

    if self.transforms is not None:
        img, target = self.transforms(img, target)
    assert target is not None, "{} target is None.".format(img_name)
    return img, target, idx
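# `utils.generate_rbox` is project code; as an assumption about what it does,
# one plausible implementation fits a minimum-area rotated rectangle to the
# polygon with OpenCV (cv2.minAreaRect returns ((cx, cy), (w, h), angle)).
def generate_rbox_sketch(points, im_shape):
    import cv2
    import numpy as np
    # im_shape is accepted only to mirror the original call signature
    pts = np.asarray(points, dtype=np.float32).reshape(-1, 2)
    (cx, cy), (w, h), angle = cv2.minAreaRect(pts)
    return [cx, cy, w, h, angle]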
def forward_for_single_feature_map(self, anchors, objectness, box_regression):
    """
    Arguments:
        anchors: list[BoxList]
        objectness: tensor of size N, A, H, W
        box_regression: tensor of size N, A * 5, H, W
    """
    device = objectness.device
    N, A, H, W = objectness.shape

    # put objectness and regression in the same layout as the anchors
    objectness = objectness.permute(0, 2, 3, 1).reshape(N, -1)
    objectness = objectness.sigmoid()
    box_regression = box_regression.view(N, -1, 5, H, W).permute(0, 3, 4, 1, 2)
    box_regression = box_regression.reshape(N, -1, 5)

    # keep only the top-scoring anchors before NMS
    num_anchors = A * H * W
    pre_nms_top_n = min(self.pre_nms_top_n, num_anchors)
    objectness, topk_idx = objectness.topk(pre_nms_top_n, dim=1, sorted=True)

    batch_idx = torch.arange(N, device=device)[:, None]
    box_regression = box_regression[batch_idx, topk_idx]

    image_shapes = [box.size for box in anchors]
    concat_anchors = torch.cat([a.bbox for a in anchors], dim=0)
    concat_anchors = concat_anchors.reshape(N, -1, 5)[batch_idx, topk_idx]

    # decode the (dx, dy, dw, dh, da) regressions against the anchors
    proposals = self.box_coder.decode(box_regression.view(-1, 5),
                                      concat_anchors.view(-1, 5))
    proposals = proposals.view(N, -1, 5)

    result = []
    for proposal, score, im_shape in zip(proposals, objectness, image_shapes):
        boxlist = RBoxList(proposal, im_shape, mode="xywha")
        boxlist.add_field("objectness", score)
        boxlist = remove_small_boxes(boxlist, self.min_size)
        boxlist = boxlist_nms(
            boxlist,
            self.nms_thresh,
            max_proposals=self.post_nms_top_n,
            score_field="objectness",
        )
        result.append(boxlist)
    return result
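# The `[batch_idx, topk_idx]` indexing above is a standard batched gather;
# a minimal demonstration with toy sizes:
def _demo_batched_topk_gather():
    import torch
    N, K = 2, 3
    scores = torch.rand(N, 10)
    boxes = torch.rand(N, 10, 5)
    top_scores, topk_idx = scores.topk(K, dim=1, sorted=True)
    batch_idx = torch.arange(N)[:, None]    # (N, 1), broadcasts against (N, K)
    top_boxes = boxes[batch_idx, topk_idx]  # -> (N, K, 5)
    assert top_boxes.shape == (N, K, 5)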
def rotate_boxes(self, target, angle):
    gt_boxes = target.bbox
    if isinstance(target.bbox, torch.Tensor):
        gt_boxes = target.bbox.data.cpu().numpy()
    gt_labels = target.get_field("labels")
    gt_masks = [
        gt_polygon.polygons[0].numpy().reshape(-1, 2)
        for gt_polygon in target.get_field("masks")
    ]

    rotated_gt_boxes = np.empty((len(gt_boxes), 5), dtype=np.float32)
    iminfo = target.size
    im_height = iminfo[1]
    im_width = iminfo[0]
    origin_gt_boxes = gt_boxes

    # degrees to radians; the matrix below rotates clockwise
    cos_cita = np.cos(np.pi / 180 * angle)
    sin_cita = np.sin(np.pi / 180 * angle)
    rotation_matrix = np.array([[cos_cita, sin_cita], [-sin_cita, cos_cita]])

    # rotate the rbox centers around the image center
    pts_ctr = origin_gt_boxes[:, 0:2]
    pts_ctr = pts_ctr - np.tile((im_width / 2, im_height / 2),
                                (gt_boxes.shape[0], 1))
    pts_ctr = np.array(np.dot(pts_ctr, rotation_matrix), dtype=np.int16)
    pts_ctr = pts_ctr + np.tile((im_width / 2, im_height / 2),
                                (gt_boxes.shape[0], 1))

    # rotate the mask polygons the same way
    rotated_gt_masks = []
    for polygon in gt_masks:
        polygon = polygon - np.tile((im_width / 2, im_height / 2),
                                    (polygon.shape[0], 1))
        polygon = np.array(np.dot(polygon, rotation_matrix), dtype=np.int16)
        polygon = polygon + np.tile((im_width / 2, im_height / 2),
                                    (polygon.shape[0], 1))
        rotated_gt_masks.append(polygon.astype(np.int32))

    origin_gt_boxes[:, 0:2] = pts_ctr
    len_of_gt = len(origin_gt_boxes)

    # normalize every box angle into (-45, 135]
    for idx in range(len_of_gt):
        ori_angle = origin_gt_boxes[idx, 4]
        height = origin_gt_boxes[idx, 3]
        width = origin_gt_boxes[idx, 2]

        # step 1: make width the long side, angle in (-45, 135)
        if width < height:
            ori_angle += 90
            width, height = height, width

        # step 2: apply the image rotation, angle in (-45, 495)
        rotated_angle = ori_angle + angle

        # step 3: normalize back into (-45, 135]
        while rotated_angle > 135:
            rotated_angle = rotated_angle - 180

        rotated_gt_boxes[idx, 0] = origin_gt_boxes[idx, 0]
        rotated_gt_boxes[idx, 1] = origin_gt_boxes[idx, 1]
        rotated_gt_boxes[idx, 3] = height
        rotated_gt_boxes[idx, 2] = width
        rotated_gt_boxes[idx, 4] = rotated_angle

    # keep only boxes whose centers remain inside the image
    x_inbound = np.logical_and(rotated_gt_boxes[:, 0] >= 0,
                               rotated_gt_boxes[:, 0] < im_width)
    y_inbound = np.logical_and(rotated_gt_boxes[:, 1] >= 0,
                               rotated_gt_boxes[:, 1] < im_height)
    inbound = np.logical_and(x_inbound, y_inbound)
    inbound_th = torch.tensor(np.where(inbound)).long().view(-1)

    rotated_gt_boxes_th = torch.tensor(rotated_gt_boxes[inbound]).to(
        target.bbox.device)
    gt_labels = gt_labels[inbound_th]
    difficulty = target.get_field("difficult")[inbound_th]

    target_cpy = RBoxList(rotated_gt_boxes_th, iminfo, mode='xywha')
    target_cpy.add_field('difficult', difficulty)
    target_cpy.add_field('labels', gt_labels)

    # add the mask field
    masks = [
        polygon.reshape((1, -1)).tolist() for polygon in rotated_gt_masks
    ]
    masks = SegmentationMask(masks, iminfo)
    target_cpy.add_field("masks", masks)

    if target.has_field("words"):
        words = target.get_field("words")[inbound_th]
        target_cpy.add_field('words', words)
    if target.has_field("word_length"):
        word_length = target.get_field("word_length")[inbound_th]
        target_cpy.add_field('word_length', word_length)

    if target_cpy.bbox.size()[0] <= 0:
        return None
    return target_cpy
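# A small worked example (pure NumPy) of the two geometric pieces above:
# rotating a point about the image center with the clockwise matrix, and
# normalizing an angle back into (-45, 135].
def _demo_rotate_and_normalize():
    import numpy as np
    angle = 30.0
    im_w, im_h = 200, 100
    cos_cita = np.cos(np.pi / 180 * angle)
    sin_cita = np.sin(np.pi / 180 * angle)
    rotation_matrix = np.array([[cos_cita, sin_cita], [-sin_cita, cos_cita]])
    # rotate the point (150, 50) about the image center (100, 50)
    pt = np.array([[150.0, 50.0]]) - (im_w / 2, im_h / 2)
    rotated = pt.dot(rotation_matrix) + (im_w / 2, im_h / 2)
    # angle normalization: 170 degrees folds back into (-45, 135]
    rotated_angle = 170.0
    while rotated_angle > 135:
        rotated_angle -= 180
    assert rotated_angle == -10.0
    return rotated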
def forward_for_single_feature_map(self, anchors, objectness_,
                                   box_regression_, scale):
    """
    Arguments:
        anchors: list[BoxList]
        objectness_: tensor of size N, A, H, W
        box_regression_: tensor of size N, A * 5, H, W
    """
    device = objectness_.device
    N, A, H, W = objectness_.shape

    # put objectness in the same format as the anchors
    objectness = objectness_.permute(0, 2, 3, 1)
    objectness = objectness.reshape(N, -1)

    # take the first 5 regression channels
    box_regression = box_regression_[:, :5].view(N, -1, 5, H,
                                                 W).permute(0, 3, 4, 1, 2)
    box_regression = box_regression.reshape(N, -1, 5)
    all_proposals = eastbox2rbox(box_regression, self.base_size, (H, W),
                                 scale)

    # keep only the top-scoring locations before NMS
    num_anchors = A * H * W
    pre_nms_top_n = min(self.pre_nms_top_n, num_anchors)
    objectness, topk_idx = objectness.topk(pre_nms_top_n, dim=1, sorted=True)

    batch_idx = torch.arange(N, device=device)[:, None]
    proposals = all_proposals.view(N, -1, 5)[batch_idx, topk_idx]

    image_shapes = [box.size for box in anchors]

    result = []
    for proposal, score, im_shape in zip(proposals, objectness,
                                         image_shapes):
        if not self.training:
            # at test time, drop proposals below the score threshold
            keep = score > self.score_thresh
            proposal = proposal[keep]
            score = score[keep]
        boxlist = RBoxList(proposal, im_shape, mode="xywha")
        boxlist.add_field("objectness", score)
        boxlist = boxlist.clip_to_image(remove_empty=False)
        boxlist = remove_small_boxes(boxlist, self.min_size)
        boxlist = self.nms_fn(
            boxlist,
            self.nms_thresh,
            max_proposals=self.post_nms_top_n,
            score_field="objectness",
        )
        result.append(boxlist)
    return result
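# `eastbox2rbox` is repository-specific and its geometry is not shown here.
# Purely as an illustration, this is one common way an EAST-style head could
# decode per-location regressions into rotated boxes; every detail below
# (channel layout, exp() size encoding, degree-valued angles) is an assumption.
def eastbox2rbox_sketch(box_regression, base_size, feat_hw, scale):
    import torch
    H, W = feat_hw
    N = box_regression.size(0)
    # image-plane coordinates of each feature-map location
    ys, xs = torch.meshgrid(torch.arange(H), torch.arange(W), indexing="ij")
    centers = torch.stack([xs, ys], dim=-1).reshape(1, -1, 2).float() * scale
    reg = box_regression.view(N, -1, 5)
    xy = centers + reg[..., 0:2] * base_size   # assumed: offsets vs. base_size
    wh = reg[..., 2:4].exp() * base_size       # assumed: log-size encoding
    angle = reg[..., 4:5]                      # assumed: angle in degrees
    return torch.cat([xy, wh, angle], dim=-1)  # (N, H*W, 5) in xywha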
def gt_crop(self, target, crop_portion, x_factor, y_factor):
    gt_boxes = target.bbox
    if isinstance(target.bbox, torch.Tensor):
        gt_boxes = target.bbox.data.cpu().numpy()
    gt_classes = target.get_field("labels")

    ow, oh = target.size
    dh = int(oh * crop_portion)
    dw = int(ow * crop_portion)
    th = int(oh * (1 - crop_portion))
    tw = int(ow * (1 - crop_portion))
    y0 = int((dh - 1) * y_factor)
    x0 = int((dw - 1) * x_factor)

    # shift box centers into the crop's coordinate frame
    gt_boxes[:, 0] -= x0
    gt_boxes[:, 1] -= y0

    # a box is kept when all four corners lie inside the crop, allowing an
    # outer margin proportional to the box width and height
    outer_bound = 0.2
    polys = rbox2poly(gt_boxes).reshape(-1, 4, 2)
    x_poly = polys[..., 0]  # (b, 4)
    y_poly = polys[..., 1]
    outer_bound_x = np.tile(outer_bound * gt_boxes[:, 2:3],
                            (1, x_poly.shape[-1]))
    outer_bound_y = np.tile(outer_bound * gt_boxes[:, 3:4],
                            (1, x_poly.shape[-1]))
    x_check = np.logical_and(x_poly >= 0 - outer_bound_x,
                             x_poly < tw + outer_bound_x)  # (b, 4)
    y_check = np.logical_and(y_poly >= 0 - outer_bound_y,
                             y_poly < th + outer_bound_y)
    x_sum = np.sum(x_check.astype(np.int32), axis=-1)
    y_sum = np.sum(y_check.astype(np.int32), axis=-1)
    # all 8 corner checks (4 in x, 4 in y) must pass
    inbound = (x_sum + y_sum) > 7

    iminfo = (tw, th)
    inbound_th = torch.tensor(np.where(inbound)).long().view(-1)
    crop_gt_boxes_th = torch.tensor(gt_boxes[inbound]).to(target.bbox.device)
    gt_labels = gt_classes[inbound_th].to(target.bbox.device)
    difficulty = target.get_field("difficult")[inbound_th].to(
        target.bbox.device)

    target_cpy = RBoxList(crop_gt_boxes_th, iminfo, mode='xywha')
    target_cpy.add_field('difficult', difficulty)
    target_cpy.add_field('labels', gt_labels)
    if target.has_field("words"):
        words = target.get_field("words")[inbound_th]
        target_cpy.add_field('words', words)
    if target.has_field("word_length"):
        word_length = target.get_field("word_length")[inbound_th]
        target_cpy.add_field('word_length', word_length)
    if target.has_field("masks"):
        seg_masks = target.get_field("masks")[inbound_th]
        target_cpy.add_field('masks', seg_masks.shift(-x0, -y0, iminfo))

    if target_cpy.bbox.size()[0] <= 0:
        return None
    return target_cpy
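# `rbox2poly` is also project code; a standard (cx, cy, w, h, angle) to
# four-corner conversion is sketched below as an assumed equivalent. The
# sign conventions depend on the repository's angle definition.
def rbox2poly_sketch(rboxes):
    import numpy as np
    cx, cy, w, h, a = np.split(np.asarray(rboxes, dtype=np.float32), 5, axis=1)
    theta = np.pi * a / 180
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    # half-extent vectors along the box's width and height directions
    dx = np.concatenate([w / 2 * cos_t, -w / 2 * sin_t], axis=1)
    dy = np.concatenate([h / 2 * sin_t, h / 2 * cos_t], axis=1)
    center = np.concatenate([cx, cy], axis=1)
    corners = np.stack([center - dx - dy, center + dx - dy,
                        center + dx + dy, center - dx + dy], axis=1)
    return corners.reshape(len(corners), -1)  # (b, 8): x1, y1, ..., x4, y4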