def output_ranklist(img_results, img_infos, out_file):
    """Output the worst results for debugging.

    Args:
        img_results (list[dict]): Image result list.
        img_infos (list[dict]): Image information list.
        out_file (str): The output file path.

    Returns:
        sorted_results (list[dict]): Image results sorted by hmean.
    """
    assert utils.is_type_list(img_results, dict)
    assert utils.is_type_list(img_infos, dict)
    assert isinstance(out_file, str)
    assert out_file.endswith('json')

    sorted_results = []
    for idx, result in enumerate(img_results):
        name = img_infos[idx]['file_name']
        img_result = result
        img_result['file_name'] = name
        sorted_results.append(img_result)

    sorted_results = sorted(
        sorted_results, key=itemgetter('hmean'), reverse=False)

    mmcv.dump(sorted_results, file=out_file)

    return sorted_results

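# A minimal usage sketch for output_ranklist with hypothetical data,
# assuming mmcv is available. Results are sorted by ascending hmean, so
# the worst images come first in the dumped JSON file.
img_results = [{'hmean': 0.9}, {'hmean': 0.3}]
img_infos = [{'file_name': 'a.jpg'}, {'file_name': 'b.jpg'}]
worst_first = output_ranklist(img_results, img_infos, 'rank.json')
# worst_first[0]['file_name'] == 'b.jpg'
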
def sort_vertex(points_x, points_y):
    """Sort box vertices in clockwise order from left-top first.

    Args:
        points_x (list[float]): x of four vertices.
        points_y (list[float]): y of four vertices.

    Returns:
        sorted_points_x (list[float]): x of sorted four vertices.
        sorted_points_y (list[float]): y of sorted four vertices.
    """
    assert utils.is_type_list(points_x, float) or utils.is_type_list(
        points_x, int)
    assert utils.is_type_list(points_y, float) or utils.is_type_list(
        points_y, int)
    assert len(points_x) == 4
    assert len(points_y) == 4

    x = np.array(points_x)
    y = np.array(points_y)
    center_x = np.sum(x) * 0.25
    center_y = np.sum(y) * 0.25

    x_arr = np.array(x - center_x)
    y_arr = np.array(y - center_y)

    angle = np.arctan2(y_arr, x_arr) * 180.0 / np.pi
    sort_idx = np.argsort(angle)

    sorted_points_x, sorted_points_y = [], []
    for i in range(4):
        sorted_points_x.append(points_x[sort_idx[i]])
        sorted_points_y.append(points_y[sort_idx[i]])

    return convert_canonical(sorted_points_x, sorted_points_y)

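# A minimal usage sketch for sort_vertex with hypothetical values; it
# assumes numpy and the convert_canonical helper are importable from this
# module. Vertices given in arbitrary order come back clockwise (in image
# coordinates, y pointing down), starting from the left-top corner.
points_x = [100.0, 10.0, 100.0, 10.0]
points_y = [10.0, 50.0, 50.0, 10.0]
sorted_x, sorted_y = sort_vertex(points_x, points_y)
# sorted_x == [10.0, 100.0, 100.0, 10.0]
# sorted_y == [10.0, 10.0, 50.0, 50.0]
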
def show_feature(features, names, to_uint8, out_file=None):
    """Visualize a list of feature maps.

    Args:
        features (list(ndarray)): The feature map list.
        names (list(str)): The visualized title list.
        to_uint8 (list(1|0)): The list indicating whether to convert
            feature maps to uint8.
        out_file (str): The output file name. If set to None, the output
            image will be shown without saving.
    """
    assert utils.is_ndarray_list(features)
    assert utils.is_type_list(names, str)
    assert utils.is_type_list(to_uint8, int)
    assert utils.is_none_or_type(out_file, str)
    assert utils.equal_len(features, names, to_uint8)

    num = len(features)
    row = col = math.ceil(math.sqrt(num))

    for i, (f, n) in enumerate(zip(features, names)):
        plt.subplot(row, col, i + 1)
        plt.title(n)
        if to_uint8[i]:
            f = f.astype(np.uint8)
        plt.imshow(f)
    if out_file is None:
        plt.show()
    else:
        plt.savefig(out_file)

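# A minimal usage sketch for show_feature with random data (hypothetical
# inputs, assuming numpy is imported as np). Two maps are tiled into one
# figure; the first is cast to uint8 before display, the second is not.
feat_a = np.random.rand(32, 32) * 255
feat_b = np.random.rand(32, 32)
show_feature([feat_a, feat_b], ['raw', 'normalized'], [1, 0],
             out_file='features.png')
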
def __init__(self, indexes=[1], scores=[0.9]):
    assert utils.is_type_list(indexes, int)
    assert utils.is_type_list(scores, float)
    assert utils.equal_len(indexes, scores)

    self.indexes = indexes
    self.scores = scores

def __init__(self,
             in_channels,
             stem_channels,
             block_cfgs,
             arch_layers,
             arch_channels,
             strides,
             out_indices=None,
             plugins=None,
             init_cfg=[
                 dict(type='Xavier', layer='Conv2d'),
                 dict(type='Constant', val=1, layer='BatchNorm2d'),
             ]):
    super().__init__(init_cfg=init_cfg)
    assert isinstance(in_channels, int)
    assert isinstance(stem_channels, int) or utils.is_type_list(
        stem_channels, int)
    assert utils.is_type_list(arch_layers, int)
    assert utils.is_type_list(arch_channels, int)
    assert utils.is_type_list(strides, tuple) or utils.is_type_list(
        strides, int)
    assert len(arch_layers) == len(arch_channels) == len(strides)
    assert out_indices is None or isinstance(out_indices, (list, tuple))

    self.out_indices = out_indices
    self._make_stem_layer(in_channels, stem_channels)
    self.num_stages = len(arch_layers)
    self.use_plugins = False
    self.arch_channels = arch_channels
    self.res_layers = []
    if plugins is not None:
        self.plugin_ahead_names = []
        self.plugin_after_names = []
        self.use_plugins = True

    for i, num_blocks in enumerate(arch_layers):
        stride = strides[i]
        channel = arch_channels[i]

        if self.use_plugins:
            self._make_stage_plugins(plugins, stage_idx=i)

        res_layer = self._make_layer(
            block_cfgs=block_cfgs,
            inplanes=self.inplanes,
            planes=channel,
            blocks=num_blocks,
            stride=stride,
        )
        self.inplanes = channel
        layer_name = f'layer{i + 1}'
        self.add_module(layer_name, res_layer)
        self.res_layers.append(layer_name)

def forward_train(self, feat, out_enc, targets_dict, img_metas):
    if img_metas is not None:
        assert utils.is_type_list(img_metas, dict)
        assert len(img_metas) == feat.size(0)

    valid_ratios = None
    if img_metas is not None:
        valid_ratios = [
            img_meta.get('valid_ratio', 1.0) for img_meta in img_metas
        ] if self.mask else None

    targets = targets_dict['padded_targets'].to(feat.device)
    tgt_embedding = self.embedding(targets)
    # bsz * seq_len * emb_dim

    out_enc = out_enc.unsqueeze(1)
    # bsz * 1 * emb_dim
    in_dec = torch.cat((out_enc, tgt_embedding), dim=1)
    # bsz * (seq_len + 1) * C
    out_dec = self._2d_attention(
        in_dec, feat, out_enc, valid_ratios=valid_ratios)
    # bsz * (seq_len + 1) * num_classes

    return out_dec[:, 1:, :]  # bsz * seq_len * num_classes

def get_gt_masks(ann_infos):
    """Get ground truth masks and ignored masks.

    Args:
        ann_infos (list[dict]): Each dict contains annotation infos of one
            image, containing following keys: masks, masks_ignore.

    Returns:
        gt_masks (list[list[list[int]]]): Ground truth masks.
        gt_masks_ignore (list[list[list[int]]]): Ignored masks.
    """
    assert utils.is_type_list(ann_infos, dict)

    gt_masks = []
    gt_masks_ignore = []
    for ann_info in ann_infos:
        masks = ann_info['masks']
        mask_gt = []
        for mask in masks:
            assert len(mask[0]) >= 8 and len(mask[0]) % 2 == 0
            mask_gt.append(mask[0])
        gt_masks.append(mask_gt)

        masks_ignore = ann_info['masks_ignore']
        mask_gt_ignore = []
        for mask_ignore in masks_ignore:
            assert len(mask_ignore[0]) >= 8 and len(mask_ignore[0]) % 2 == 0
            mask_gt_ignore.append(mask_ignore[0])
        gt_masks_ignore.append(mask_gt_ignore)

    return gt_masks, gt_masks_ignore

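# A minimal usage sketch for get_gt_masks with hypothetical annotations.
# Each mask is a list holding one flat polygon [x1, y1, ..., xn, yn] with
# at least four points (>= 8 coordinates).
ann_infos = [{
    'masks': [[[0, 0, 10, 0, 10, 10, 0, 10]]],
    'masks_ignore': [],
}]
gt_masks, gt_masks_ignore = get_gt_masks(ann_infos)
# gt_masks == [[[0, 0, 10, 0, 10, 10, 0, 10]]]; gt_masks_ignore == [[]]
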
def str2tensor(self, strings):
    """Convert text-string to ctc-loss input tensor.

    Args:
        strings (list[str]): ['hello', 'world'].

    Returns:
        dict (str: tensor | list[tensor]):
            tensors (list[tensor]): [torch.Tensor([1,2,3,3,4]),
                torch.Tensor([5,4,6,3,7])].
            flatten_targets (tensor): torch.Tensor([1,2,3,3,4,5,4,6,3,7]).
            target_lengths (tensor): torch.IntTensor([5,5]).
    """
    assert utils.is_type_list(strings, str)

    tensors = []
    indexes = self.str2idx(strings)
    for index in indexes:
        tensor = torch.IntTensor(index)
        tensors.append(tensor)
    target_lengths = torch.IntTensor([len(t) for t in tensors])
    flatten_target = torch.cat(tensors)

    return {
        'targets': tensors,
        'flatten_targets': flatten_target,
        'target_lengths': target_lengths
    }

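# A minimal usage sketch (hypothetical `converter` instance; assumes its
# str2idx maps characters to integer indexes). The returned dict matches
# what torch.nn.CTCLoss expects: flattened targets plus per-sample
# lengths.
targets_dict = converter.str2tensor(['hello', 'world'])
print(targets_dict['target_lengths'])  # tensor([5, 5], dtype=torch.int32)
ctc_loss = torch.nn.CTCLoss()
# loss = ctc_loss(log_probs, targets_dict['flatten_targets'],
#                 input_lengths, targets_dict['target_lengths'])
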
def tensor2idx(self, output, img_metas, topk=1, return_topk=False):
    """Convert model output tensor to index-list.

    Args:
        output (tensor): The model outputs with size: N * T * C.
        img_metas (list[dict]): Each dict contains one image info.
        topk (int): The highest k classes to be returned.
        return_topk (bool): Whether to return topk or just top1.

    Returns:
        indexes (list[list[int]]): [[1,2,3,3,4], [5,4,6,3,7]].
        scores (list[list[float]]): [[0.9,0.8,0.95,0.97,0.94],
            [0.9,0.9,0.98,0.97,0.96]].

        If ``return_topk`` is True, returns instead:

        indexes_topk (list[list[list[int]]]): Top-k indexes; each
            innermost list has length ``topk``.
        scores_topk (list[list[list[float]]]): Top-k scores; each
            innermost list has length ``topk``.
    """
    assert utils.is_type_list(img_metas, dict)
    assert len(img_metas) == output.size(0)
    assert isinstance(topk, int)
    assert topk >= 1

    valid_ratios = [
        img_meta.get('valid_ratio', 1.0) for img_meta in img_metas
    ]

    batch_size = output.size(0)
    output = F.softmax(output, dim=2)
    output = output.cpu().detach()
    batch_topk_value, batch_topk_idx = output.topk(topk, dim=2)
    batch_max_idx = batch_topk_idx[:, :, 0]
    scores_topk, indexes_topk = [], []
    scores, indexes = [], []
    feat_len = output.size(1)
    for b in range(batch_size):
        valid_ratio = valid_ratios[b]
        decode_len = min(feat_len, math.ceil(feat_len * valid_ratio))
        pred = batch_max_idx[b, :]
        select_idx = []
        prev_idx = self.blank_idx
        for t in range(decode_len):
            tmp_value = pred[t].item()
            if tmp_value not in (prev_idx, self.blank_idx):
                select_idx.append(t)
            prev_idx = tmp_value
        select_idx = torch.LongTensor(select_idx)
        topk_value = torch.index_select(batch_topk_value[b, :, :], 0,
                                        select_idx)  # valid_seqlen * topk
        topk_idx = torch.index_select(batch_topk_idx[b, :, :], 0,
                                      select_idx)
        topk_idx_list = topk_idx.numpy().tolist()
        topk_value_list = topk_value.numpy().tolist()
        indexes_topk.append(topk_idx_list)
        scores_topk.append(topk_value_list)
        indexes.append([x[0] for x in topk_idx_list])
        scores.append([x[0] for x in topk_value_list])

    if return_topk:
        return indexes_topk, scores_topk

    return indexes, scores

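# A standalone sketch of the CTC collapse rule the loop above implements
# (blank_idx assumed 0): blanks are dropped and consecutive repeats are
# merged, so a per-frame argmax of [0, 1, 1, 0, 2, 2] decodes to [1, 2].
pred = [0, 1, 1, 0, 2, 2]
blank_idx = 0
decoded, prev = [], blank_idx
for v in pred:
    if v not in (prev, blank_idx):
        decoded.append(v)
    prev = v
print(decoded)  # [1, 2]
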
def __init__(self,
             datasets,
             separate_eval=True,
             pipeline=None,
             force_apply=False,
             **kwargs):
    new_datasets = []
    if pipeline is not None:
        assert isinstance(
            pipeline,
            list), 'pipeline must be list[dict] or list[list[dict]].'
        if is_type_list(pipeline, dict):
            self._apply_pipeline(datasets, pipeline, force_apply)
            new_datasets = datasets
        elif is_2dlist(pipeline):
            assert is_2dlist(datasets)
            assert len(datasets) == len(pipeline)
            for sub_datasets, tmp_pipeline in zip(datasets, pipeline):
                self._apply_pipeline(sub_datasets, tmp_pipeline,
                                     force_apply)
                new_datasets.extend(sub_datasets)
    else:
        if is_2dlist(datasets):
            for sub_datasets in datasets:
                new_datasets.extend(sub_datasets)
        else:
            new_datasets = datasets
    datasets = [build_dataset(c, kwargs) for c in new_datasets]
    super().__init__(datasets, separate_eval)

def tesseract_recog_inference(self, imgs, **kwargs):
    """Inference image(s) with the tesseract recognizer.

    Args:
        imgs (ndarray or list[ndarray]): Image(s) to run inference on.

    Returns:
        result (dict | list[dict]): Predicted results, one dict per image.
    """
    is_batch = True
    if isinstance(imgs, np.ndarray):
        is_batch = False
        imgs = [imgs]
    assert is_type_list(imgs, np.ndarray)

    api = self.get_tesserocr_api()
    results = []
    for img in imgs:
        image = Image.fromarray(img)
        api.SetImage(image)
        api.SetRectangle(0, 0, img.shape[1], img.shape[0])
        # Remove beginning and trailing spaces from Tesseract
        text = api.GetUTF8Text().strip()
        conf = api.MeanTextConf() / 100
        results.append({'text': text, 'score': conf})

    # close tesserocr api
    api.End()

    if not is_batch:
        return results[0]
    else:
        return results

def str2tensor(self, strings):
    """Convert text-string into tensor.

    Args:
        strings (list[str]): ['hello', 'world']

    Returns:
        dict (str: Tensor | list[tensor]):
            tensors (list[Tensor]): [torch.Tensor([1,2,3,3,4]),
                torch.Tensor([5,4,6,3,7])]
            padded_targets (Tensor(bsz * max_seq_len))
    """
    assert utils.is_type_list(strings, str)

    tensors, padded_targets = [], []
    indexes = self.str2idx(strings)
    for index in indexes:
        tensor = torch.LongTensor(index)
        tensors.append(tensor)
        # target tensor for loss
        src_target = torch.LongTensor(tensor.size(0) + 2).fill_(0)
        src_target[-1] = self.end_idx
        src_target[0] = self.start_idx
        src_target[1:-1] = tensor
        padded_target = (torch.ones(self.max_seq_len) *
                         self.padding_idx).long()
        char_num = src_target.size(0)
        if char_num > self.max_seq_len:
            padded_target = src_target[:self.max_seq_len]
        else:
            padded_target[:char_num] = src_target
        padded_targets.append(padded_target)
    padded_targets = torch.stack(padded_targets, 0).long()

    return {'targets': tensors, 'padded_targets': padded_targets}

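# A minimal usage sketch for the attention-based str2tensor (hypothetical
# `converter` with start_idx/end_idx/padding_idx and max_seq_len=10). If
# 'hello' maps to, say, [1, 2, 3, 3, 4], the padded target wraps it as
# [start, 1, 2, 3, 3, 4, end, pad, pad, pad] of length max_seq_len.
out = converter.str2tensor(['hello', 'world'])
print(out['padded_targets'].shape)  # torch.Size([2, 10])
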
def forward(self, feat, img_metas=None):
    if img_metas is not None:
        assert utils.is_type_list(img_metas, dict)
        assert len(img_metas) == feat.size(0)

    valid_ratios = None
    if img_metas is not None:
        valid_ratios = [
            img_meta.get('valid_ratio', 1.0) for img_meta in img_metas
        ] if self.mask else None

    h_feat = feat.size(2)
    feat_v = F.max_pool2d(
        feat, kernel_size=(h_feat, 1), stride=1, padding=0)
    feat_v = feat_v.squeeze(2)  # bsz * C * W
    feat_v = feat_v.permute(0, 2, 1).contiguous()  # bsz * W * C

    holistic_feat = self.rnn_encoder(feat_v)[0]  # bsz * T * C

    if valid_ratios is not None:
        valid_hf = []
        T = holistic_feat.size(1)
        for i, valid_ratio in enumerate(valid_ratios):
            valid_step = min(T, math.ceil(T * valid_ratio)) - 1
            valid_hf.append(holistic_feat[i, valid_step, :])
        valid_hf = torch.stack(valid_hf, dim=0)
    else:
        valid_hf = holistic_feat[:, -1, :]  # bsz * C

    holistic_feat = self.linear(valid_hf)  # bsz * C

    return holistic_feat

def show_img_boundary(img, boundary):
    """Show image and instance boundaries.

    Args:
        img (ndarray): The input image.
        boundary (list[float or int]): The input boundary.
    """
    assert isinstance(img, np.ndarray)
    assert utils.is_type_list(boundary, int) or utils.is_type_list(
        boundary, float)

    cv2.polylines(
        img, [np.array(boundary).astype(np.int32).reshape(-1, 1, 2)],
        True,
        color=(0, 255, 0),
        thickness=1)
    plt.imshow(img)
    plt.show()

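# A minimal usage sketch for show_img_boundary (hypothetical inputs,
# assuming numpy as np). The flat [x1, y1, ..., xn, yn] boundary is drawn
# as a closed green polyline on the image before display.
img = np.zeros((100, 200, 3), dtype=np.uint8)
show_img_boundary(img, [50, 20, 150, 20, 150, 60, 50, 60])
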
def _parse_anno_info(self, annotations):
    """Parse char boxes annotations.

    Args:
        annotations (list[dict]): Annotations of one image, where each
            dict is for one character.

    Returns:
        dict: A dict containing the following keys:

            - chars (list[str]): List of character strings.
            - char_rects (list[list[float]]): List of char box, with each
              in style of rectangle: [x_min, y_min, x_max, y_max].
            - char_quads (list[list[float]]): List of char box, with each
              in style of quadrangle:
              [x1, y1, x2, y2, x3, y3, x4, y4].
    """
    assert utils.is_type_list(annotations, dict)
    assert 'char_box' in annotations[0]
    assert 'char_text' in annotations[0]
    assert len(annotations[0]['char_box']) in [4, 8]

    chars, char_rects, char_quads = [], [], []
    for ann in annotations:
        char_box = ann['char_box']
        if len(char_box) == 4:
            char_box_type = ann.get('char_box_type', 'xyxy')
            if char_box_type == 'xyxy':
                char_rects.append(char_box)
                char_quads.append([
                    char_box[0], char_box[1], char_box[2], char_box[1],
                    char_box[2], char_box[3], char_box[0], char_box[3]
                ])
            elif char_box_type == 'xywh':
                x1, y1, w, h = char_box
                x2 = x1 + w
                y2 = y1 + h
                char_rects.append([x1, y1, x2, y2])
                char_quads.append([x1, y1, x2, y1, x2, y2, x1, y2])
            else:
                raise ValueError(f'invalid char_box_type {char_box_type}')
        elif len(char_box) == 8:
            x_list, y_list = [], []
            for i in range(4):
                x_list.append(char_box[2 * i])
                y_list.append(char_box[2 * i + 1])
            x_max, x_min = max(x_list), min(x_list)
            y_max, y_min = max(y_list), min(y_list)
            char_rects.append([x_min, y_min, x_max, y_max])
            char_quads.append(char_box)
        else:
            raise ValueError(
                f'invalid num in char box: {len(char_box)} not in (4, 8)')
        chars.append(ann['char_text'])

    ann = dict(chars=chars, char_rects=char_rects, char_quads=char_quads)

    return ann

def warp_img(src_img,
             box,
             jitter_flag=False,
             jitter_ratio_x=0.5,
             jitter_ratio_y=0.1):
    """Crop box area from image using opencv warpPerspective, with
    optional box jitter.

    Args:
        src_img (np.array): Image before cropping.
        box (list[float | int]): Coordinates of quadrangle.
        jitter_flag (bool): Whether to jitter the box vertices.
        jitter_ratio_x (float): Horizontal jitter ratio.
        jitter_ratio_y (float): Vertical jitter ratio.
    """
    assert utils.is_type_list(box, float) or utils.is_type_list(box, int)
    assert len(box) == 8

    h, w = src_img.shape[:2]
    points_x = [min(max(x, 0), w) for x in box[0:8:2]]
    points_y = [min(max(y, 0), h) for y in box[1:9:2]]

    points_x, points_y = sort_vertex(points_x, points_y)

    if jitter_flag:
        box_jitter(
            points_x,
            points_y,
            jitter_ratio_x=jitter_ratio_x,
            jitter_ratio_y=jitter_ratio_y)

    points = [Point(points_x[i], points_y[i]) for i in range(4)]
    edges = [
        LineString([points[i], points[i + 1 if i < 3 else 0]])
        for i in range(4)
    ]

    pts1 = np.float32([[points[i].x, points[i].y] for i in range(4)])
    box_width = max(edges[0].length, edges[2].length)
    box_height = max(edges[1].length, edges[3].length)

    pts2 = np.float32([[0, 0], [box_width, 0], [box_width, box_height],
                       [0, box_height]])
    M = cv2.getPerspectiveTransform(pts1, pts2)
    dst_img = cv2.warpPerspective(src_img, M,
                                  (int(box_width), int(box_height)))

    return dst_img

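# A minimal usage sketch for warp_img (hypothetical image path and
# quadrangle; assumes cv2 is importable). The tilted quadrangle is
# rectified into an axis-aligned crop via a perspective transform.
img = cv2.imread('demo.jpg')  # hypothetical path
quad = [30, 10, 200, 25, 195, 60, 25, 45]  # x1, y1, ..., x4, y4
patch = warp_img(img, quad)
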
def forward_test(self, feat, out_enc, img_metas):
    """
    Args:
        feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
        out_enc (Tensor): Encoder output of shape :math:`(N, D_m, H, W)`.
        img_metas (list[dict]): A list of dicts containing meta
            information of input images. Preferably with the key
            ``valid_ratio``.

    Returns:
        Tensor: A raw logit tensor of shape :math:`(N, T, C-1)`.
    """
    if img_metas is not None:
        assert utils.is_type_list(img_metas, dict)
        assert len(img_metas) == feat.size(0)

    valid_ratios = None
    if img_metas is not None:
        valid_ratios = [
            img_meta.get('valid_ratio', 1.0) for img_meta in img_metas
        ] if self.mask else None

    seq_len = self.max_seq_len
    bsz = feat.size(0)

    start_token = torch.full((bsz, ),
                             self.start_idx,
                             device=feat.device,
                             dtype=torch.long)
    # bsz
    start_token = self.embedding(start_token)
    # bsz * emb_dim
    start_token = start_token.unsqueeze(1).expand(-1, seq_len, -1)
    # bsz * seq_len * emb_dim
    out_enc = out_enc.unsqueeze(1)
    # bsz * 1 * emb_dim
    decoder_input = torch.cat((out_enc, start_token), dim=1)
    # bsz * (seq_len + 1) * emb_dim

    outputs = []
    for i in range(1, seq_len + 1):
        decoder_output = self._2d_attention(
            decoder_input, feat, out_enc, valid_ratios=valid_ratios)
        char_output = decoder_output[:, i, :]  # bsz * num_classes
        char_output = F.softmax(char_output, -1)
        outputs.append(char_output)
        _, max_idx = torch.max(char_output, dim=1, keepdim=False)
        char_embedding = self.embedding(max_idx)  # bsz * emb_dim
        if i < seq_len:
            decoder_input[:, i + 1, :] = char_embedding

    outputs = torch.stack(outputs, 1)  # bsz * seq_len * num_classes

    return outputs

def __init__(self, max_ratio=None, box_type=None):
    if max_ratio is None:
        max_ratio = [0.1, 0.2, 0.1, 0.2]
    else:
        assert utils.is_type_list(max_ratio, float)
        assert len(max_ratio) == 4
    assert box_type is None or box_type in ('char_rects', 'char_quads')

    self.max_ratio = max_ratio
    self.box_type = box_type

def sort_vertex(points_x, points_y):
    """Sort box vertices in clockwise order from left-top first.

    Args:
        points_x (list[float]): x of four vertices.
        points_y (list[float]): y of four vertices.

    Returns:
        sorted_points_x (list[float]): x of sorted four vertices.
        sorted_points_y (list[float]): y of sorted four vertices.
    """
    assert utils.is_type_list(points_x, (float, int))
    assert utils.is_type_list(points_y, (float, int))
    assert len(points_x) == 4
    assert len(points_y) == 4

    vertices = np.stack((points_x, points_y), axis=-1).astype(np.float32)
    vertices = _sort_vertex(vertices)
    sorted_points_x = list(vertices[:, 0])
    sorted_points_y = list(vertices[:, 1])
    return sorted_points_x, sorted_points_y

def crop_img(src_img,
             box,
             long_edge_pad_ratio=0.4,
             short_edge_pad_ratio=0.2,
             debug=False):
    """Crop text region given its bounding box.

    Args:
        src_img (np.array): The original image.
        box (list[float | int]): Points of quadrangle.
        long_edge_pad_ratio (float): Box pad ratio for long edge
            corresponding to font size.
        short_edge_pad_ratio (float): Box pad ratio for short edge
            corresponding to font size.

    Returns:
        np.array: The cropped image.
    """
    assert utils.is_type_list(box, float) or utils.is_type_list(box, int)
    assert len(box) == 8
    assert 0. <= long_edge_pad_ratio < 1.0
    assert 0. <= short_edge_pad_ratio < 1.0

    h, w = src_img.shape[:2]
    points_x = np.clip(np.array(box[0::2]), 0, w)
    points_y = np.clip(np.array(box[1::2]), 0, h)

    box_width = np.max(points_x) - np.min(points_x)
    box_height = np.max(points_y) - np.min(points_y)
    font_size = min(box_height, box_width)

    if box_height < box_width:
        horizontal_pad = long_edge_pad_ratio * font_size
        vertical_pad = short_edge_pad_ratio * font_size
    else:
        horizontal_pad = short_edge_pad_ratio * font_size
        vertical_pad = long_edge_pad_ratio * font_size

    left = np.clip(int(np.min(points_x) - horizontal_pad), 0, w)
    top = np.clip(int(np.min(points_y) - vertical_pad), 0, h)
    right = np.clip(int(np.max(points_x) + horizontal_pad), 0, w)
    bottom = np.clip(int(np.max(points_y) + vertical_pad), 0, h)

    dst_img = src_img[top:bottom, left:right]

    return dst_img

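# A minimal usage sketch for the padded crop_img (hypothetical inputs,
# assuming numpy is imported as np). The pad ratios enlarge the crop
# relative to the font size, i.e. the box's shorter edge (here 40 px).
img = np.zeros((100, 200, 3), dtype=np.uint8)
quad = [50, 20, 150, 20, 150, 60, 50, 60]
patch = crop_img(img, quad)
# about (100 + 2*16) x (40 + 2*8) = 132 x 56 pixels (w x h)
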
def forward_train(self, feat, out_enc, targets_dict, img_metas=None):
    if img_metas is not None:
        assert utils.is_type_list(img_metas, dict)
        assert len(img_metas) == feat.size(0)

    valid_ratios = None
    if img_metas is not None:
        valid_ratios = [
            img_meta.get('valid_ratio', 1.0) for img_meta in img_metas
        ] if self.mask else None

    if self.train_mode:
        targets = targets_dict['padded_targets'].to(feat.device)
        tgt_embedding = self.embedding(targets)

    outputs = []
    start_token = torch.full((feat.size(0), ),
                             self.start_idx,
                             device=feat.device,
                             dtype=torch.long)
    start_token = self.embedding(start_token)
    for i in range(-1, self.max_seq_len):
        if i == -1:
            if self.dec_gru:
                hx1 = cx1 = self.rnn_decoder_layer1(out_enc)
                hx2 = cx2 = self.rnn_decoder_layer2(hx1)
            else:
                hx1, cx1 = self.rnn_decoder_layer1(out_enc)
                hx2, cx2 = self.rnn_decoder_layer2(hx1)
            if not self.train_mode:
                y_prev = start_token
        else:
            if self.train_mode:
                y_prev = tgt_embedding[:, i, :]
            y, hx1, cx1, hx2, cx2 = self._2d_attention(
                y_prev,
                feat,
                out_enc,
                hx1,
                cx1,
                hx2,
                cx2,
                valid_ratios=valid_ratios)
            if self.train_mode:
                y = self.pred_dropout(y)
            else:
                y = F.softmax(y, -1)
                _, max_idx = torch.max(y, dim=1, keepdim=False)
                char_embedding = self.embedding(max_idx)
                y_prev = char_embedding
            outputs.append(y)

    outputs = torch.stack(outputs, 1)

    return outputs

def __init__(self,
             in_channels=3,
             stem_channels=32,
             base_channels=32,
             arch_settings=[3, 4, 6, 6, 3],
             strides=[2, 1, 2, 1, 1],
             out_indices=None,
             last_stage_pool=False,
             init_cfg=[
                 dict(type='Xavier', layer='Conv2d'),
                 dict(type='Constant', val=1, layer='BatchNorm2d')
             ]):
    super().__init__(init_cfg=init_cfg)
    assert isinstance(in_channels, int)
    assert isinstance(stem_channels, int)
    assert utils.is_type_list(arch_settings, int)
    assert utils.is_type_list(strides, int)
    assert len(arch_settings) == len(strides)
    assert out_indices is None or isinstance(out_indices, (list, tuple))
    assert isinstance(last_stage_pool, bool)

    self.out_indices = out_indices
    self.last_stage_pool = last_stage_pool
    self.block = BasicBlock
    self.inplanes = stem_channels

    self._make_stem_layer(in_channels, stem_channels)

    self.res_layers = []
    planes = base_channels
    for i, num_blocks in enumerate(arch_settings):
        stride = strides[i]
        res_layer = self._make_layer(
            block=self.block,
            inplanes=self.inplanes,
            planes=planes,
            blocks=num_blocks,
            stride=stride)
        self.inplanes = planes * self.block.expansion
        planes *= 2
        layer_name = f'layer{i + 1}'
        self.add_module(layer_name, res_layer)
        self.res_layers.append(layer_name)

def convert_canonical(points_x, points_y):
    """Make the left-top vertex the first one.

    Args:
        points_x (list[float]): x of four vertices.
        points_y (list[float]): y of four vertices.

    Returns:
        sorted_points_x (list[float]): x of sorted four vertices.
        sorted_points_y (list[float]): y of sorted four vertices.
    """
    assert utils.is_type_list(points_x, float) or utils.is_type_list(
        points_x, int)
    assert utils.is_type_list(points_y, float) or utils.is_type_list(
        points_y, int)
    assert len(points_x) == 4
    assert len(points_y) == 4

    points = [Point(points_x[i], points_y[i]) for i in range(4)]

    polygon = Polygon([(p.x, p.y) for p in points])
    min_x, min_y, _, _ = polygon.bounds
    points_to_lefttop = [
        LineString([points[i], Point(min_x, min_y)]) for i in range(4)
    ]
    distances = np.array([line.length for line in points_to_lefttop])
    sort_dist_idx = np.argsort(distances)
    lefttop_idx = sort_dist_idx[0]

    if lefttop_idx == 0:
        point_orders = [0, 1, 2, 3]
    elif lefttop_idx == 1:
        point_orders = [1, 2, 3, 0]
    elif lefttop_idx == 2:
        point_orders = [2, 3, 0, 1]
    else:
        point_orders = [3, 0, 1, 2]

    sorted_points_x = [points_x[i] for i in point_orders]
    sorted_points_y = [points_y[j] for j in point_orders]

    return sorted_points_x, sorted_points_y

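# A minimal sketch of convert_canonical on already-clockwise vertices
# (hypothetical values, assuming shapely's Point/LineString/Polygon are
# imported). The order is rotated so the vertex nearest the bounding-box
# left-top corner comes first.
xs, ys = convert_canonical([100, 10, 10, 100], [10, 10, 50, 50])
# xs == [10, 10, 100, 100], ys == [10, 50, 50, 10]
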
def crop_img(src_img, box):
    """Crop box area to rectangle.

    Args:
        src_img (np.array): Image before cropping.
        box (list[float | int]): Points of quadrangle.
    """
    assert utils.is_type_list(box, float) or utils.is_type_list(box, int)
    assert len(box) == 8

    h, w = src_img.shape[:2]
    points_x = [min(max(x, 0), w) for x in box[0:8:2]]
    points_y = [min(max(y, 0), h) for y in box[1:9:2]]

    left = int(min(points_x))
    top = int(min(points_y))
    right = int(max(points_x))
    bottom = int(max(points_y))

    dst_img = src_img[top:bottom, left:right]

    return dst_img

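# A minimal usage sketch (hypothetical inputs, assuming numpy as np):
# unlike the padded variant above, this crop is the tight axis-aligned
# bounding rectangle of the quadrangle.
img = np.zeros((100, 200, 3), dtype=np.uint8)
patch = crop_img(img, [50, 20, 150, 20, 150, 60, 50, 60])
print(patch.shape)  # (40, 100, 3)
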
def evaluate(self,
             results,
             metric='hmean-iou',
             logger=None,
             score_thr=None,
             min_score_thr=0.3,
             max_score_thr=0.9,
             step=0.1,
             rank_list=None,
             **kwargs):
    """Evaluate the hmean metric.

    Args:
        results (list[dict]): Testing results of the dataset.
        metric (str | list[str]): Metrics to be evaluated.
        logger (logging.Logger | str | None): Logger used for printing
            related information during evaluation. Default: None.
        score_thr (float): Deprecated. Please use min_score_thr instead.
        min_score_thr (float): Minimum score threshold of prediction map.
        max_score_thr (float): Maximum score threshold of prediction map.
        step (float): The spacing between score thresholds.
        rank_list (str): json file used to save eval result of each image
            after ranking.

    Returns:
        dict[dict[str: float]]: The evaluation results.
    """
    assert utils.is_type_list(results, dict)

    metrics = metric if isinstance(metric, list) else [metric]
    allowed_metrics = ['hmean-iou', 'hmean-ic13']
    metrics = set(metrics) & set(allowed_metrics)

    img_infos = []
    ann_infos = []
    for i in range(len(self)):
        img_info = {'filename': self.data_infos[i]['file_name']}
        img_infos.append(img_info)
        ann_infos.append(self.get_ann_info(i))

    eval_results = eval_hmean(
        results,
        img_infos,
        ann_infos,
        metrics=metrics,
        score_thr=score_thr,
        min_score_thr=min_score_thr,
        max_score_thr=max_score_thr,
        step=step,
        logger=logger,
        rank_list=rank_list)

    return eval_results

def __init__(self,
             box_keys=['x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4'],
             jitter_prob=0.5,
             max_jitter_ratio_x=0.05,
             max_jitter_ratio_y=0.02):
    assert utils.is_type_list(box_keys, str)
    assert 0 <= jitter_prob <= 1
    assert 0 <= max_jitter_ratio_x <= 1
    assert 0 <= max_jitter_ratio_y <= 1

    self.box_keys = box_keys
    self.jitter_prob = jitter_prob
    self.max_jitter_ratio_x = max_jitter_ratio_x
    self.max_jitter_ratio_y = max_jitter_ratio_y

def _parse_anno_info(self, annotations):
    """Parse annotations of boxes, texts and labels for one image.

    Args:
        annotations (list[dict]): Annotations of one image, where each
            dict is for one text box.

    Returns:
        dict: A dict containing the following keys:

            - bboxes (np.ndarray): Bbox in one image with shape:
              box_num * 4.
            - relations (np.ndarray): Relations between bbox with shape:
              box_num * box_num * D.
            - texts (np.ndarray): Text index with shape:
              box_num * text_max_len.
            - labels (np.ndarray): Box labels with shape:
              box_num * (box_num + 1).
    """
    assert utils.is_type_list(annotations, dict)
    assert 'box' in annotations[0]
    assert 'text' in annotations[0]
    assert 'label' in annotations[0]

    boxes, texts, text_inds, labels, edges = [], [], [], [], []
    for ann in annotations:
        box = ann['box']
        x_list, y_list = box[0:8:2], box[1:9:2]
        sorted_x_list, sorted_y_list = sort_vertex(x_list, y_list)
        sorted_box = []
        for x, y in zip(sorted_x_list, sorted_y_list):
            sorted_box.append(x)
            sorted_box.append(y)
        boxes.append(sorted_box)
        text = ann['text']
        texts.append(text)
        text_ind = [self.dict[c] for c in text if c in self.dict]
        text_inds.append(text_ind)
        labels.append(ann['label'])
        edges.append(ann.get('edge', 0))

    ann_infos = dict(
        boxes=boxes,
        texts=texts,
        text_inds=text_inds,
        edges=edges,
        labels=labels)

    return self.list_to_numpy(ann_infos)

def forward_test(self, feat, out_enc, img_metas):
    """
    Args:
        feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
        out_enc (Tensor): Encoder output of shape :math:`(N, D_m, H, W)`.
        img_metas (list[dict]): A list of dicts containing meta
            information of input images. Preferably with the key
            ``valid_ratio``.

    Returns:
        Tensor: A raw logit tensor of shape :math:`(N, T, C-1)`.
    """
    if img_metas is not None:
        assert utils.is_type_list(img_metas, dict)
        assert len(img_metas) == feat.size(0)

    return self.forward_train(feat, out_enc, None, img_metas)

def __init__(self,
             datasets,
             separate_eval=True,
             show_mean_scores='auto',
             pipeline=None,
             force_apply=False,
             **kwargs):
    new_datasets = []
    if pipeline is not None:
        assert isinstance(
            pipeline,
            list), 'pipeline must be list[dict] or list[list[dict]].'
        if is_type_list(pipeline, dict):
            self._apply_pipeline(datasets, pipeline, force_apply)
            new_datasets = datasets
        elif is_2dlist(pipeline):
            assert is_2dlist(datasets)
            assert len(datasets) == len(pipeline)
            for sub_datasets, tmp_pipeline in zip(datasets, pipeline):
                self._apply_pipeline(sub_datasets, tmp_pipeline,
                                     force_apply)
                new_datasets.extend(sub_datasets)
    else:
        if is_2dlist(datasets):
            for sub_datasets in datasets:
                new_datasets.extend(sub_datasets)
        else:
            new_datasets = datasets
    datasets = [build_dataset(c, kwargs) for c in new_datasets]
    super().__init__(datasets, separate_eval)

    if not separate_eval:
        raise NotImplementedError(
            'Evaluating datasets as a whole is not'
            ' supported yet. Please use "separate_eval=True"')

    assert isinstance(show_mean_scores, bool) or show_mean_scores == 'auto'
    if show_mean_scores == 'auto':
        show_mean_scores = len(self.datasets) > 1
    self.show_mean_scores = show_mean_scores
    # 'auto' has been resolved to a bool above, so a plain truthiness
    # check suffices here.
    if show_mean_scores:
        if len(set([type(ds) for ds in self.datasets])) != 1:
            raise NotImplementedError(
                'To compute mean evaluation scores, all datasets'
                ' must have the same type')

def forward_test(self, feat, out_enc, img_metas):
    if img_metas is not None:
        assert utils.is_type_list(img_metas, dict)
        assert len(img_metas) == feat.size(0)

    valid_ratios = None
    if img_metas is not None:
        valid_ratios = [
            img_meta.get('valid_ratio', 1.0) for img_meta in img_metas
        ] if self.mask else None

    seq_len = self.max_seq_len
    bsz = feat.size(0)

    start_token = torch.full((bsz, ),
                             self.start_idx,
                             device=feat.device,
                             dtype=torch.long)
    # bsz
    start_token = self.embedding(start_token)
    # bsz * emb_dim
    start_token = start_token.unsqueeze(1).expand(-1, seq_len, -1)
    # bsz * seq_len * emb_dim
    out_enc = out_enc.unsqueeze(1)
    # bsz * 1 * emb_dim
    decoder_input = torch.cat((out_enc, start_token), dim=1)
    # bsz * (seq_len + 1) * emb_dim

    outputs = []
    for i in range(1, seq_len + 1):
        decoder_output = self._2d_attention(
            decoder_input, feat, out_enc, valid_ratios=valid_ratios)
        char_output = decoder_output[:, i, :]  # bsz * num_classes
        char_output = F.softmax(char_output, -1)
        outputs.append(char_output)
        _, max_idx = torch.max(char_output, dim=1, keepdim=False)
        char_embedding = self.embedding(max_idx)  # bsz * emb_dim
        if i < seq_len:
            decoder_input[:, i + 1, :] = char_embedding

    outputs = torch.stack(outputs, 1)  # bsz * seq_len * num_classes

    return outputs