def crop_words(img, boxes, height, width=None, grayscale=True):
    """Crops word images from the given image (variant that also reports skipped boxes).

    # Note
        Make sure that the vertices of all boxes are inside the image.
    """
    words = []
    vac = []  # added by Soyoung: indices of boxes for which polygon_to_rbox did not return a valid rbox
    for j in range(len(boxes)):
        h, w = img.shape[:2]
        # polygon case
        box = np.reshape(boxes[j], (-1, 2))
        rbox = polygon_to_rbox(box)
        if len(rbox) == 1:
            vac.append(j)
            continue
        word_w, word_h = rbox[2] * w, rbox[3] * h
        word_ar = word_w / word_h
        word_h = int(height)
        word_w = int(round(height * word_ar))
        src = np.asarray(box * [w, h], np.float32)
        dst = np.array([[0, 0], [word_w, 0], [word_w, word_h], [0, word_h]],
                       dtype=np.float32)
        M = cv2.getPerspectiveTransform(src, dst)
        word = cv2.warpPerspective(img, M, (word_w, word_h), flags=cv2.INTER_CUBIC)
        if grayscale:
            word = cv2.cvtColor(word, cv2.COLOR_BGR2GRAY)
            word = cv2.normalize(word, word, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX)
            word = word[:, :, None]
        word = word.astype(np.float32)
        if width is not None:
            tmp_word = word[:, :width, :]
            word = np.ones([height, width, tmp_word.shape[2]])
            word[:, slice(0, tmp_word.shape[1]), :] = tmp_word
        words.append(word)
    return words, vac
def crop_words(img, boxes, height, width=None, grayscale=True):
    """Crops word images from the given image.

    # Note
        Make sure that the vertices of all boxes are inside the image.
    """
    #plt.figure(figsize=[12]*2)
    #plt.imshow(img[:, :, (2,1,0)]/255)
    #self.plot_gt(i, show_labels=False)
    #plt.show()
    words = []
    for j in range(len(boxes)):
        h, w = img.shape[:2]
        if boxes.shape[1] == 4:
            # box case
            box = np.round(boxes[j] * [w, h, w, h]).astype(np.int32)
            xmin, ymin, xmax, ymax = box
            word_w, word_h = xmax - xmin, ymax - ymin
            word_ar = word_w / word_h
            word_h = int(height)
            word_w = int(round(height * word_ar))
            word = img[ymin:ymax, xmin:xmax, :]
            word = cv2.resize(word, (word_w, word_h), interpolation=cv2.INTER_CUBIC)
        else:
            # polygon case
            box = np.reshape(boxes[j], (-1, 2))
            rbox = polygon_to_rbox(box)
            word_w, word_h = rbox[2] * w, rbox[3] * h
            word_ar = word_w / word_h
            word_h = int(height)
            word_w = int(round(height * word_ar))
            src = np.asarray(box * [w, h], np.float32)
            dst = np.array([[0, 0], [word_w, 0], [word_w, word_h], [0, word_h]],
                           dtype=np.float32)
            M = cv2.getPerspectiveTransform(src, dst)
            word = cv2.warpPerspective(img, M, (word_w, word_h), flags=cv2.INTER_CUBIC)
        if grayscale:
            word = cv2.cvtColor(word, cv2.COLOR_BGR2GRAY)
            word = cv2.normalize(word, word, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX)
            word = word[:, :, None]
        word = word.astype(np.float32)
        if width is not None:
            tmp_word = word[:, :width, :]
            word = np.zeros([height, width, tmp_word.shape[2]])
            word[:, slice(0, tmp_word.shape[1]), :] = tmp_word
        words.append(word)
    return words
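Below is a minimal usage sketch, not part of the original module, showing how the second crop_words variant might be called on a synthetic image; the image, box coordinates, and printed shape are illustrative assumptions, and polygon_to_rbox, cv2, and numpy are expected to be importable from the surrounding module.

# Hedged usage sketch: crop one word region from a synthetic image.
# Assumes this module provides crop_words and polygon_to_rbox as defined above.
if __name__ == '__main__':
    import numpy as np
    import cv2

    img = np.full((100, 200, 3), 255, dtype=np.uint8)  # dummy white BGR image
    # one box as a normalized polygon (x1,y1, x2,y2, x3,y3, x4,y4), clockwise from top-left
    boxes = np.array([[0.10, 0.20, 0.60, 0.20, 0.60, 0.40, 0.10, 0.40]])
    words = crop_words(img, boxes, height=32, width=256, grayscale=True)
    print(len(words), words[0].shape)  # e.g. 1 (32, 256, 1)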
def encode(self, gt_data, debug=False):
    """Encode ground truth polygons to segments and links for local
    classification and regression.

    # Arguments
        gt_data: shape (boxes, 4 xy points + classes)

    # Return
        shape (priors, 2 segment_labels + 5 segment_offsets
               + 2*8 inter_layer_links_labels + 2*4 cross_layer_links_labels)
    """
    rboxes = []
    polygons = []
    for word in gt_data:
        xy = np.reshape(word[:8], (-1, 2))
        xy = np.copy(xy) * (self.image_w, self.image_h)
        polygons.append(xy)
        rbox = polygon_to_rbox(xy)
        rboxes.append(rbox)
    rboxes = self.gt_rboxes = np.array(rboxes)
    polygons = self.gt_polygons = np.array(polygons)

    # compute segments
    for i in range(len(self.prior_maps)):
        m = self.prior_maps[i]

        # compute priors
        #m.compute_priors()
        num_priors = len(m.priors)

        # assign gt to priors
        a_l = m.minmax_size[0]
        match_indices = np.full(num_priors, -1, dtype=np.int32)
        min_lhs_eq_11 = np.full(num_priors, 1e6, dtype=np.float32)
        for j in range(len(rboxes)):  # ~12.9 ms
            cx, cy, w, h, theta = rboxes[j]
            c = rboxes[j, :2]
            # constraint on ratio between box size and word height, equation (11)
            lhs_eq_11 = max(a_l / h, h / a_l)
            if lhs_eq_11 <= 1.5:
                R = rot_matrix(theta)
                for k in range(num_priors):  # hurts
                    # is the center of the prior inside the gt rbox?
                    d = np.abs(np.dot(m.priors_xy[k] - c, R.T))
                    if d[0] < w / 2. and d[1] < h / 2.:
                        # is the lhs of equation (11) minimal for this prior?
                        if lhs_eq_11 < min_lhs_eq_11[k]:
                            min_lhs_eq_11[k] = lhs_eq_11
                            match_indices[k] = j
        m.match_indices = match_indices
        segment_mask = match_indices != -1

        # segment labels
        m.segment_labels = np.empty((num_priors, 2), dtype=np.int8)
        m.segment_labels[:, 0] = np.logical_not(segment_mask)
        m.segment_labels[:, 1] = segment_mask

        # compute offsets only for assigned boxes
        m.segment_offsets = np.zeros((num_priors, 5))
        pos_segment_idxs = np.nonzero(segment_mask)[0]
        for j in pos_segment_idxs:  # box_idx, ~4 ms
            gt_idx = match_indices[j]
            rbox = rboxes[gt_idx]
            polygon = polygons[gt_idx]
            cx, cy, w, h, theta = rbox
            R = rot_matrix(theta)
            prior_x, prior_y = m.priors_xy[j]
            prior_w, prior_h = m.priors_wh[j]

            # step 2 in figure 5, rotate word anticlockwise around the center of the prior
            d = rbox[:2] - m.priors_xy[j]
            #poly_loc = rbox_to_polygon([*d, w, h, theta])
            poly_loc = rbox_to_polygon(list(d) + [w, h, theta])
            poly_loc_easy = polygon - m.priors_xy[j]
            poly_loc_rot = np.dot(poly_loc, R.T)

            # step 3 in figure 5, crop word to the left and right of the prior
            poly_loc_croped = np.copy(poly_loc_rot)
            poly_loc_croped[:, 0] = np.clip(poly_loc_croped[:, 0], -prior_w / 2., prior_w / 2.)

            # step 4 in figure 5, rotate the cropped word box back clockwise
            poly_loc_rot_back = np.dot(poly_loc_croped, R)
            rbox_loc_rot_back = polygon_to_rbox(poly_loc_rot_back)

            # encode, solve (3) to (7) to get the local offsets
            #offset = np.array([*(rbox_loc_rot_back[:2]/a_l),
            #                   *(np.log(rbox_loc_rot_back[2:4]/a_l)),
            #                   rbox_loc_rot_back[4]])
            offset = np.array(list(rbox_loc_rot_back[:2] / a_l) +
                              list(np.log(rbox_loc_rot_back[2:4] / a_l)) +
                              [rbox_loc_rot_back[4]])
            offset[:4] /= m.priors[j, -4:]  # variances
            m.segment_offsets[j] = offset

            # for debugging local geometry
            if debug:
                prior_poly_loc = np.array([[-prior_w, +prior_h],
                                           [+prior_w, +prior_h],
                                           [+prior_w, -prior_h],
                                           [-prior_w, -prior_h]]) / 2.
                plt.figure(figsize=[10] * 2)
                ax = plt.gca()
                ax.add_patch(plt.Polygon(prior_poly_loc, fill=False, edgecolor='r', linewidth=1))
                ax.add_patch(plt.Polygon(poly_loc, fill=False, edgecolor='b', linewidth=1))
                ax.add_patch(plt.Polygon(np.dot(poly_loc, R.T), fill=False, edgecolor='k', linewidth=1))
                #ax.add_patch(plt.Polygon(poly_loc_easy, fill=False, edgecolor='r', linewidth=1))
                #ax.add_patch(plt.Polygon(np.dot(poly_loc_easy, R.T), fill=False, edgecolor='y', linewidth=1))
                ax.add_patch(plt.Polygon(poly_loc_croped, fill=False, edgecolor='c', linewidth=1))
                ax.add_patch(plt.Polygon(poly_loc_rot_back, fill=False, edgecolor='y', linewidth=1))
                lim = 50
                plt.xlim(-lim, lim)
                plt.ylim(-lim, lim)
                plt.grid()
                plt.show()
                break

        # compute link labels
        m.inter_layer_links_labels = np.zeros((num_priors, 16), dtype=np.int8)
        m.cross_layer_links_labels = np.zeros((num_priors, 8), dtype=np.int8)
        if i > 0:
            previous_map = self.prior_maps[i - 1]
        # we only have to check neighbors if we are positive
        for idx in pos_segment_idxs:
            neighbor_idxs = m.inter_layer_neighbors_idxs[idx]
            for n, neighbor_idx in enumerate(neighbor_idxs):
                # valid neighbors
                if m.inter_layer_neighbors_valid[idx, n]:
                    # neighbor matched to the same word
                    if match_indices[idx] == match_indices[neighbor_idx]:
                        # since we are positive and matched to the same word,
                        # the neighbor has to be positive
                        m.inter_layer_links_labels[idx, n * 2 + 1] = 1
            # would be nice, but this would refer to invalid neighbors
            #label = m.inter_layer_neighbors_valid[idx] & (match_indices[neighbor_idxs] == match_indices[idx])
            #m.inter_layer_links_labels[idx, 1::2] = label
            if i > 0:
                neighbor_idxs = m.cross_layer_neighbors_idxs[idx]
                for n, neighbor_idx in enumerate(neighbor_idxs):
                    # cross layer neighbors are always valid
                    if match_indices[idx] == previous_map.match_indices[neighbor_idx]:
                        m.cross_layer_links_labels[idx, n * 2 + 1] = 1
        m.inter_layer_links_labels[:, ::2] = np.logical_not(m.inter_layer_links_labels[:, 1::2])
        m.cross_layer_links_labels[:, ::2] = np.logical_not(m.cross_layer_links_labels[:, 1::2])

    # collect encoded ground truth
    maps = self.prior_maps
    segment_labels = np.concatenate([m.segment_labels for m in maps])
    segment_offsets = np.concatenate([m.segment_offsets for m in maps])
    inter_layer_links_labels = np.concatenate([m.inter_layer_links_labels for m in maps])
    cross_layer_links_labels = np.concatenate([m.cross_layer_links_labels for m in maps])

    return np.concatenate([segment_labels, segment_offsets,
                           inter_layer_links_labels, cross_layer_links_labels], axis=1)
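To relate the encoder output to the docstring above, here is a hedged sketch of driving encode; prior_util stands for an already-constructed instance of the class that owns this method (its constructor is not shown in this file), and the ground-truth word is made up for illustration.

# Hedged usage sketch: encode one ground-truth word.
# Assumption: `prior_util` is an instance of the encoder class above,
# built for a SegLink-style model (construction not shown here).
import numpy as np

gt_data = np.array([[0.2, 0.3,    # x1, y1
                     0.6, 0.3,    # x2, y2
                     0.6, 0.4,    # x3, y3
                     0.2, 0.4,    # x4, y4
                     1.0]])       # class label

y_true = prior_util.encode(gt_data)
# per prior: 2 segment labels + 5 offsets + 16 inter-layer + 8 cross-layer link labels
assert y_true.shape[1] == 2 + 5 + 16 + 8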