def create_roidb_from_box_list(self, box_list, gt_roidb):
    """Build a roidb (one dict per image) from a list of per-image box arrays.

    For every image the boxes are compared against that image's ground-truth
    boxes (when available); each box's best positive overlap is stored in the
    column of the matching ground-truth class.

    :param box_list: sequence with one box array per image; its length must
        equal ``self.nrof_images``.
    :param gt_roidb: ground-truth roidb indexed by image (entries provide
        ``'boxes'`` and ``'gt_classes'``), or ``None`` to skip overlap
        computation and leave all overlaps at zero.
    :return: list of dicts with keys ``'boxes'``, ``'gt_classes'``,
        ``'gt_overlaps'`` (scipy CSR matrix), ``'flipped'`` and ``'seg_areas'``.
    """
    assert len(box_list) == self.nrof_images, \
        'Number of boxes must match number of ground-truth images'

    roidb = []
    for i in range(self.nrof_images):
        boxes = box_list[i]
        num_boxes = boxes.shape[0]
        overlaps = np.zeros((num_boxes, self.nrof_classes), dtype=np.float32)

        if gt_roidb is not None and gt_roidb[i]['boxes'].size > 0:
            gt_boxes = gt_roidb[i]['boxes']
            gt_classes = gt_roidb[i]['gt_classes']
            # np.float64 replaces the removed ``np.float`` alias (same dtype).
            gt_overlaps = bbox_overlaps(boxes.astype(np.float64),
                                        gt_boxes.astype(np.float64))
            argmaxes = gt_overlaps.argmax(axis=1)
            maxes = gt_overlaps.max(axis=1)
            # Only boxes that overlap some ground-truth box get an entry.
            matched = np.where(maxes > 0)[0]
            overlaps[matched, gt_classes[argmaxes[matched]]] = maxes[matched]

        roidb.append({
            'boxes': boxes,
            'gt_classes': np.zeros((num_boxes, ), dtype=np.int32),
            'gt_overlaps': scipy.sparse.csr_matrix(overlaps),
            'flipped': False,
            'seg_areas': np.zeros((num_boxes, ), dtype=np.float32),
        })
    return roidb
def _compute_targets(self, rois, overlaps, labels): """ Compute bounding-box regression targets for an image. for each roi find the corresponding gt_box, then compute the distance. """ # Indices of ground-truth ROIs gt_inds = np.where(overlaps == 1)[0] if len(gt_inds) == 0: # Bail if the image has no ground-truth ROIs return np.zeros((rois.shape[0], 5), dtype=np.float32) # Indices of examples for which we try to make predictions ex_inds = np.where(overlaps >= self.config.TRAIN.BBOX_THRESH)[0] # Get IoU overlap between each ex ROI and gt ROI ex_gt_overlaps = bbox_overlaps( np.ascontiguousarray(rois[ex_inds, :], dtype=np.float), np.ascontiguousarray(rois[gt_inds, :], dtype=np.float)) # Find which gt ROI each ex ROI has max overlap with: # this will be the ex ROI's gt target gt_assignment = ex_gt_overlaps.argmax(axis=1) gt_rois = rois[gt_inds[gt_assignment], :] ex_rois = rois[ex_inds, :] targets = np.zeros((rois.shape[0], 5), dtype=np.float32) targets[ex_inds, 0] = labels[ex_inds] targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois) return targets
def _sample_rois(self, all_rois, gt_boxes, fg_rois_per_image, rois_per_image, num_classes):
    """Generate a random sample of RoIs comprising foreground and background
    examples.

    :param all_rois: candidate RoIs; columns 1:5 are used as box coordinates.
    :param gt_boxes: ground-truth boxes; columns 0:4 are used as coordinates
        and column 4 as the class label.
    :param fg_rois_per_image: maximum number of foreground RoIs to sample.
    :param rois_per_image: total number of RoIs to sample (fg + bg).
    :param num_classes: forwarded to ``self._get_bbox_regression_labels``.
    :return: ``(labels, rois, bbox_targets, bbox_inside_weights)`` for the
        sampled RoIs, foreground first.
    """
    # overlaps: (rois x gt_boxes).
    # np.float64 replaces the removed ``np.float`` alias (same dtype).
    overlaps = bbox_overlaps(
        np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float64),
        np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float64))
    gt_assignment = overlaps.argmax(axis=1)
    max_overlaps = overlaps.max(axis=1)
    labels = gt_boxes[gt_assignment, 4]

    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(max_overlaps >= self.FG_THRESH)[0]
    # Guard against the case when an image has fewer than fg_rois_per_image
    # foreground RoIs
    fg_rois_per_this_image = min(fg_rois_per_image, fg_inds.size)
    # Sample foreground regions without replacement
    if fg_inds.size > 0:
        fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, replace=False)

    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((max_overlaps < self.BG_THRESH_HI) &
                       (max_overlaps >= self.BG_THRESH_LO))[0]
    # Compute number of background RoIs to take from this image (guarding
    # against there being fewer than desired)
    bg_rois_per_this_image = min(rois_per_image - fg_rois_per_this_image,
                                 bg_inds.size)
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, replace=False)

    # The indices that we're selecting (both fg and bg)
    keep_inds = np.append(fg_inds, bg_inds)
    # Fancy indexing copies, so clamping below does not touch gt_boxes
    labels = labels[keep_inds]
    # Clamp labels for the background RoIs to 0
    labels[fg_rois_per_this_image:] = 0
    rois = all_rois[keep_inds]

    bbox_target_data = self._compute_targets(
        rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)

    bbox_targets, bbox_inside_weights = \
        self._get_bbox_regression_labels(bbox_target_data, num_classes)

    return labels, rois, bbox_targets, bbox_inside_weights
def _sample_rois(
        self, all_rois, gt_boxes, fg_rois_per_image, rois_per_image, num_classes):
    """Generate a random sample of RoIs comprising foreground and background
    examples.

    :param all_rois: candidate RoIs; columns 1:5 are used as box coordinates.
    :param gt_boxes: ground-truth boxes; columns 0:4 are used as coordinates
        and column 4 as the class label.
    :param fg_rois_per_image: maximum number of foreground RoIs to sample.
    :param rois_per_image: total number of RoIs to sample (fg + bg).
    :param num_classes: forwarded to ``self._get_bbox_regression_labels``.
    :return: ``(labels, rois, bbox_targets, bbox_inside_weights)`` for the
        sampled RoIs, foreground first.
    """
    # overlaps: (rois x gt_boxes).
    # np.float64 replaces the removed ``np.float`` alias (same dtype).
    overlaps = bbox_overlaps(
        np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float64),
        np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float64))
    gt_assignment = overlaps.argmax(axis=1)
    max_overlaps = overlaps.max(axis=1)
    labels = gt_boxes[gt_assignment, 4]

    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(max_overlaps >= self.FG_THRESH)[0]
    # Guard against the case when an image has fewer than fg_rois_per_image
    # foreground RoIs
    fg_rois_per_this_image = min(fg_rois_per_image, fg_inds.size)
    # Sample foreground regions without replacement
    if fg_inds.size > 0:
        fg_inds = npr.choice(
            fg_inds, size=fg_rois_per_this_image, replace=False)

    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((max_overlaps < self.BG_THRESH_HI) &
                       (max_overlaps >= self.BG_THRESH_LO))[0]
    # Compute number of background RoIs to take from this image (guarding
    # against there being fewer than desired)
    bg_rois_per_this_image = min(rois_per_image - fg_rois_per_this_image,
                                 bg_inds.size)
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = npr.choice(
            bg_inds, size=bg_rois_per_this_image, replace=False)

    # The indices that we're selecting (both fg and bg)
    keep_inds = np.append(fg_inds, bg_inds)
    # Fancy indexing copies, so clamping below does not touch gt_boxes
    labels = labels[keep_inds]
    # Clamp labels for the background RoIs to 0
    labels[fg_rois_per_this_image:] = 0
    rois = all_rois[keep_inds]

    bbox_target_data = self._compute_targets(
        rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)

    bbox_targets, bbox_inside_weights = \
        self._get_bbox_regression_labels(bbox_target_data, num_classes)

    return labels, rois, bbox_targets, bbox_inside_weights
def page_eval(page_image, pred_boxes, gt_boxes, use_pixel_level=True, output_all=False):
    """Evaluate predicted boxes against ground-truth boxes for one page.

    Each ground-truth box is paired with a prediction through
    ``get_coverage_mapping``; unmatched ground-truth boxes are skipped.

    :param page_image: page image; binarised with ``_inverse_binary`` and
        copied for the debug rendering.
    :param pred_boxes: array of predicted boxes, one per row.
    :param gt_boxes: array of ground-truth boxes, one per row.
    :param use_pixel_level: if True the coverage score comes from
        ``pixel_iou`` on the inverse binary image, otherwise from the
        box-overlap matrix.
    :param output_all: if True, every predicted box is additionally drawn
        (title ``'-'``) and recorded under ``'box_<i>'``.
    :return: ``(page_stats, preds_image)``; ``page_stats`` maps
        ``'word_<i>'`` to ``{'gt', 'pred', 'cover'}`` for matched
        ground-truth boxes, plus the ``'predictions'`` / ``'gt_boxes'`` counts.
    """
    page_stats = {}
    overlaps = bbox_overlaps(gt_boxes.astype(np.float32), pred_boxes.astype(np.float32))
    gt_to_pred_map = get_coverage_mapping(overlaps.T)
    inv_page_binary = _inverse_binary(page_image, thresh=0.99)

    output_titles = []
    output_boxes = []
    # Check each gt_box
    for ind in range(gt_boxes.shape[0]):
        pred_ind = gt_to_pred_map.get(ind, None)
        if pred_ind is None:
            continue

        gt_box = gt_boxes[ind, :]
        pred_box = pred_boxes[pred_ind, :]
        if use_pixel_level:
            o2o = pixel_iou(gt_box=gt_box, box=pred_box, binary_image=inv_page_binary)
        else:
            o2o = overlaps[ind, pred_ind]

        output_boxes.append(pred_box)
        output_titles.append('%4.3f' % o2o)
        page_stats['word_%d' % ind] = {
            'gt': gt_box.tolist(),
            'pred': pred_box.tolist(),
            'cover': o2o,
        }

    if output_all:
        for ind in range(pred_boxes.shape[0]):
            pred_box = pred_boxes[ind, :]
            output_boxes.append(pred_box)
            output_titles.append('-')
            # A fresh dict per prediction: reusing the dict left over from the
            # ground-truth loop would alias every 'box_<i>' entry to the same
            # object and overwrite the last 'word_<i>' entry's prediction.
            page_stats['box_%d' % ind] = {'pred': pred_box.tolist()}

    page_stats['predictions'] = pred_boxes.shape[0]
    page_stats['gt_boxes'] = gt_boxes.shape[0]
    preds_image = debugShowBoxes(page_image.copy(), boxes=output_boxes, gt_boxes=gt_boxes,
                                 titles=output_titles, dont_show=True)
    return page_stats, preds_image
def calc_overlaps(self, anchors, gt_boxes, inds_inside):
    """Compute anchor/ground-truth overlap statistics.

    :param anchors: anchor boxes; the row count is expected to equal
        ``len(inds_inside)`` (presumably the anchors were already filtered to
        those indices -- confirm against the caller).
    :param gt_boxes: ground-truth boxes.
    :param inds_inside: only its length is used, to index the anchor rows.
    :return: ``(argmax_overlaps, max_overlaps, gt_max_overlaps,
        gt_argmax_overlaps)`` where the first two are per anchor (best gt
        index / value), ``gt_max_overlaps`` is the best overlap per gt box and
        ``gt_argmax_overlaps`` lists every anchor row that attains one of
        those per-gt maxima (ties included).
    """
    # overlaps between the anchors and the gt boxes: (ex, gt).
    # np.float64 replaces the removed ``np.float`` alias (same dtype).
    overlaps = bbox_overlaps(
        np.ascontiguousarray(anchors, dtype=np.float64),
        np.ascontiguousarray(gt_boxes, dtype=np.float64))

    # Best ground-truth box for every anchor
    argmax_overlaps = overlaps.argmax(axis=1)
    max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]

    # Best anchor for every ground-truth box
    gt_argmax_overlaps = overlaps.argmax(axis=0)
    gt_max_overlaps = overlaps[gt_argmax_overlaps,
                               np.arange(overlaps.shape[1])]
    # Re-derive the anchor indices so that ties are all included
    gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]

    return argmax_overlaps, max_overlaps, gt_max_overlaps, \
        gt_argmax_overlaps
def _compute_labels(A, anchors, gt_boxes, dontcare_areas, gt_ishard):
    """Assign RPN training labels to anchors.

    Labels: 1 is positive, 0 is negative, -1 is "don't care". Thresholds and
    batch sizes are read from ``cfg.TRAIN``.

    :param A: number of anchors (length of the returned label vector).
    :param anchors: anchor boxes, one row per anchor.
    :param gt_boxes: ground-truth boxes, one row per box.
    :param dontcare_areas: boxes marking regions to ignore, or ``None``.
    :param gt_ishard: per-gt flags (1 = hard sample), or ``None``.
    :return: ``(labels, max_overlaps, argmax_overlaps)`` -- the float32 label
        vector, each anchor's best overlap value, and the index of the gt box
        achieving it.
    """
    labels = np.empty((A, ), dtype=np.float32)
    labels.fill(-1)  # initialize every label to "don't care"

    # Overlaps between the anchors and the gt boxes, shape A x G.
    # np.float64 replaces the removed ``np.float`` alias (same dtype).
    overlaps = bbox_overlaps(
        np.ascontiguousarray(anchors, dtype=np.float64),
        np.ascontiguousarray(gt_boxes, dtype=np.float64))

    # (A) for each anchor, the index of the gt box with the largest overlap
    argmax_overlaps = overlaps.argmax(axis=1)
    # (A) for each anchor, the value of that largest overlap
    max_overlaps = overlaps[np.arange(A), argmax_overlaps]
    # (G) for each gt box, the index of the anchor with the largest overlap
    gt_argmax_overlaps = overlaps.argmax(axis=0)
    # (G) for each gt box, the value of that largest overlap
    gt_max_overlaps = overlaps[gt_argmax_overlaps,
                               np.arange(overlaps.shape[1])]
    # Anchor indices of *all* anchors that attain a per-gt maximum (ties kept)
    gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]

    if not cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # assign bg labels first so that positive labels can clobber them
        labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

    # fg label: for each gt, anchor with highest overlap
    labels[gt_argmax_overlaps] = 1

    # fg label: above threshold IoU
    labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1

    if cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # assign bg labels last so that negative labels can clobber positives:
        # anchors whose best overlap is below the negative threshold become bg
        labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

    # preclude dontcare areas
    if dontcare_areas is not None and dontcare_areas.shape[0] > 0:
        # intersecs shape is D x A
        intersecs = bbox_intersections(
            np.ascontiguousarray(dontcare_areas, dtype=np.float64),
            np.ascontiguousarray(anchors, dtype=np.float64))
        intersecs_ = intersecs.sum(axis=0)  # (A)
        labels[intersecs_ > cfg.TRAIN.DONTCARE_AREA_INTERSECTION_HI] = -1

    # preclude hard samples that are highly occluded, truncated or difficult
    # to see
    if (cfg.TRAIN.PRECLUDE_HARD_SAMPLES and gt_ishard is not None
            and gt_ishard.shape[0] > 0):
        assert gt_ishard.shape[0] == gt_boxes.shape[0]
        gt_ishard = gt_ishard.astype(int)
        gt_hardboxes = gt_boxes[gt_ishard == 1, :]
        if gt_hardboxes.shape[0] > 0:
            # H x A
            hard_overlaps = bbox_overlaps(
                np.ascontiguousarray(gt_hardboxes, dtype=np.float64),
                np.ascontiguousarray(anchors, dtype=np.float64))
            # (A) best overlap of each anchor with any hard box
            hard_max_overlaps = hard_overlaps.max(axis=0)
            labels[hard_max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = -1
            # (H) anchor index with the largest overlap for each hard box
            max_intersec_label_inds = hard_overlaps.argmax(axis=1)
            labels[max_intersec_label_inds] = -1

    # subsample positive labels if we have too many
    num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
    fg_inds = np.where(labels == 1)[0]
    if len(fg_inds) > num_fg:
        # randomly drop the surplus positives
        disable_inds = np.random.choice(
            fg_inds, size=(len(fg_inds) - num_fg), replace=False)
        labels[disable_inds] = -1

    # subsample negative labels if we have too many; negatives fill whatever
    # part of the batch the positives did not use
    num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
    bg_inds = np.where(labels == 0)[0]
    if len(bg_inds) > num_bg:
        disable_inds = np.random.choice(
            bg_inds, size=(len(bg_inds) - num_bg), replace=False)
        labels[disable_inds] = -1

    return labels, max_overlaps, argmax_overlaps
def random_rois(batch_gt_boxes, batch_embeddings=None, image_size=(900, 1200), num_classes=2,
                num_boxes_per_class=20, random_boxes_per_word=10, std_range=30,
                lower_bound=None, tf_format_in=False, tf_format_out=False):
    r"""Generate labelled random RoIs around (and away from) ground-truth boxes.

    For every batch index, boxes are jittered around each ground-truth box at
    several pixel standard deviations, mixed with fully random boxes, clamped
    to the image, scored by their best overlap with a ground-truth box, binned
    into classes and sub-sampled per class.

    :param batch_gt_boxes: (num_boxes, 5) => [batch_idx, x1, y1, x2, y2]
    :param batch_embeddings: (num_boxes, embedding_size+1) => e.g.
        [batch_id, v^T \in R^540]; when given, the embedding of the best
        matching ground-truth box is returned for every kept RoI.
    :param image_size: pair of bounds; even box columns are clamped to
        ``image_size[0]`` and odd box columns to ``image_size[1]``.
    :param num_classes: (int > 1) number of overlap classes.
    :param num_boxes_per_class: forwarded to ``_label_filter_picker_helper``.
    :param random_boxes_per_word: random boxes generated per ground-truth box
        (per jitter level, and again for the fully random boxes).
    :param std_range: jitter levels are ``i + 0.5`` for
        ``i in range(0, std_range, 2)``.
    :param lower_bound: (float) => lower bound of first IoU class. Defaults to
        0.35 when ``num_classes == 5`` and 0.2 otherwise; for any other number
        of classes pass a value that makes sense.
    :param tf_format_in: convert ``batch_gt_boxes`` with ``tf_format_to_abs``
        before use.
    :param tf_format_out: swap the box columns to order [1, 0, 3, 2] and
        divide by the image size before returning.
    :return: ``batch_rois`` of rows [batch_idx, box (4 values), label], or
        ``(batch_rois, assigned_embeddings)`` when ``batch_embeddings`` is
        given.
    """
    # Check ndim first so 1-D input fails with the message, not an IndexError.
    assert batch_gt_boxes.ndim == 2 and batch_gt_boxes.shape[1] == 5, \
        'Pass gt_boxes in [batch, x,y,x,y] format'
    assert isinstance(num_classes, int) and num_classes > 1, 'Must have at least 2 classes'

    if tf_format_in:
        # Switch to abs boxes coordinates
        batch_gt_boxes = tf_format_to_abs(batch_gt_boxes, image_size)

    # Loop-invariant class binning.
    # NOTICE: assumes classes can be 5 or 2 by default.
    if lower_bound is None:
        lower_bound = 0.35 if num_classes == 5 else 0.2
    class_bins = np.linspace(lower_bound, 1., num_classes + 1)[1:]

    batch_size = batch_gt_boxes.astype(np.int32)[:, 0].max() + 1
    batch_rois = []
    assigned_embeddings = []
    for n in range(batch_size):
        bidx = np.where(batch_gt_boxes[:, 0] == n)[0]
        gt_boxes = batch_gt_boxes[bidx, 1:]

        gt_randoms = np.vstack([
            _generate_random_boxes_around_gt(gt_boxes, random_boxes_per_word, pixel_std=i + 0.5)
            for i in range(0, std_range, 2)
        ])
        rand_randoms = _generate_random_relative_boxes(
            random_boxes_per_word * gt_boxes.shape[0]) * np.array(image_size * 2)
        rois = np.vstack((gt_randoms, rand_randoms)).astype(np.float32)

        # Clamp to image
        rois[:, ::2] = np.minimum(np.maximum(rois[:, ::2], 0), image_size[0])
        rois[:, 1::2] = np.minimum(np.maximum(rois[:, 1::2], 0), image_size[1])

        ovlps = bbox_overlaps(rois.astype(np.float32), gt_boxes.astype(np.float32))
        scores = ovlps.max(1).flatten()

        # Build a real list: on Python 3 ``np.array(map(...))`` would wrap the
        # lazy map object in a 0-d object array instead of the label values.
        labels = np.array([_box_scoring_helper(score, bins=class_bins) for score in scores])
        keep = _label_filter_picker_helper(
            num_classes=num_classes, labels=labels, num_boxes_per_class=num_boxes_per_class)

        rois = rois[keep, :]
        labels = labels[keep]
        rois = np.hstack(
            (np.ones(rois.shape[0], np.float32)[:, np.newaxis] * n,
             rois,
             labels[:, np.newaxis])).astype(np.float32)
        batch_rois.append(rois)

        if batch_embeddings is not None:
            assigned_words = ovlps.argmax(1).flatten()
            embeddings = batch_embeddings[bidx, 1:]
            embeddings = embeddings[assigned_words, :]
            # Align embeddings with the randomly selected idx
            embeddings = embeddings[keep, :]
            assigned_embeddings.append(embeddings)

    batch_rois = np.vstack(batch_rois)
    if tf_format_out:
        new_rois = batch_rois[:, 1:-1][:, [1, 0, 3, 2]] / np.array(
            list(image_size) * 2)[::-1]
        batch_rois[:, 1:-1] = new_rois

    if batch_embeddings is not None:
        assigned_embeddings = np.vstack(assigned_embeddings)
        return batch_rois, assigned_embeddings
    return batch_rois
def phoc_eval_page(page_image, pred_boxes, pred_phocs, gt_boxes, gt_words):
    """Collect per-word box/PHOC statistics for one page.

    Every ground-truth word gets a ``'word_<i>'`` entry (with prediction
    details when ``get_coverage_mapping`` assigned one). Every prediction not
    assigned to a ground-truth word gets a ``'word_red_<j>'`` entry, described
    relative to the ground-truth box it overlaps most.

    :param page_image: page image, copied for the debug rendering.
    :param pred_boxes: predicted boxes, one per row.
    :param pred_phocs: predicted PHOC vectors, row-aligned with ``pred_boxes``.
    :param gt_boxes: ground-truth boxes, one per row.
    :param gt_words: (list) of str containing the gt-words, aligned with
        ``gt_boxes``.
    :return: ``(page_stats, preds_image)``.
    """
    page_stats = {}
    overlaps = bbox_overlaps(gt_boxes.astype(np.float32), pred_boxes.astype(np.float32))
    gt_to_pred_map = get_coverage_mapping(overlaps.T)
    pred_phocs = np.atleast_2d(pred_phocs)
    pred_boxes = np.atleast_2d(pred_boxes)

    output_titles = []
    output_boxes = []
    # Check each gt_box
    for ind in range(gt_boxes.shape[0]):
        gt_box = gt_boxes[ind, :]
        gt_word = gt_words[ind]
        pred_ind = gt_to_pred_map.get(ind, None)

        phocs, _ = phoc_letters_and_digits([gt_word])
        word_stats = {
            'gt': gt_box.tolist(),
            'gt_phoc': phocs[0, :].tolist(),
            'text': gt_word,
        }
        if pred_ind is not None:
            pred_box = pred_boxes[pred_ind, :]
            pred_phoc = pred_phocs[pred_ind, :]
            o2o = overlaps[ind, pred_ind]
            output_boxes.append(pred_box)
            output_titles.append('%s[%d]' % (gt_word, o2o * 100))
            word_stats['pred'] = pred_box.tolist()
            word_stats['pre_phoc'] = pred_phoc.tolist()
            word_stats['cover'] = o2o
        page_stats['word_%d' % ind] = word_stats

    # Do stats for all un-assigned predictions
    for idx in set(range(pred_boxes.shape[0])) - set(gt_to_pred_map.values()):
        best_gt_id = np.argmax(overlaps[:, idx])
        gt_box = gt_boxes[best_gt_id, :]
        gt_word = gt_words[best_gt_id]
        pred_box = pred_boxes[idx, :]
        pred_phoc = pred_phocs[idx, :]
        # Overlap with the best-matching gt box, not with whatever index the
        # ground-truth loop above happened to end on.
        o2o = overlaps[best_gt_id, idx]

        phocs, _ = phoc_letters_and_digits([gt_word])
        output_boxes.append(pred_box)
        output_titles.append('%s[%d]' % (gt_word, o2o * 100))
        page_stats['word_red_%d' % idx] = {
            'gt': gt_box.tolist(),
            'gt_phoc': phocs[0, :].tolist(),
            'text': gt_word,
            'pred': pred_box.tolist(),
            'pre_phoc': pred_phoc.tolist(),
            'cover': o2o,
        }

    page_stats['predictions'] = pred_boxes.shape[0]
    page_stats['gt_boxes'] = gt_boxes.shape[0]
    preds_image = debugShowBoxes(page_image.copy(), boxes=output_boxes, gt_boxes=gt_boxes,
                                 titles=output_titles, dont_show=True)
    return page_stats, preds_image