def predict_image_track_with_precomputed_ref_features(img, ref_features, model_func):
    orig_shape = img.shape[:2]
    resizer = CustomResize(cfg.PREPROC.TEST_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE)
    resized_img = resizer.augment(img)
    scale = np.sqrt(resized_img.shape[0] * 1.0 / img.shape[0] *
                    resized_img.shape[1] / img.shape[1])
    boxes, probs, labels, *masks = model_func(resized_img, ref_features)
    boxes = boxes / scale
    # boxes are already clipped inside the graph, but after the floating-point
    # scaling, this may not be true any more.
    boxes = clip_boxes(boxes, orig_shape)

    if masks:
        # has mask
        full_masks = [_paste_mask(box, mask, orig_shape)
                      for box, mask in zip(boxes, masks[0])]
        masks = full_masks
    else:
        # fill with None
        masks = [None] * len(boxes)

    results = [DetectionResult(*args) for args in zip(boxes, probs, labels, masks)]
    return results
def detect_one_image(img, model_func):
    """
    Run detection on one image, using the TF callable.
    This function should handle the preprocessing internally.

    Args:
        img: an image
        model_func: a callable from the TF model; takes an image and
            returns (boxes, probs, labels, fv), where fv are per-box feature vectors

    Returns:
        [DetectionResult]
    """
    orig_shape = img.shape[:2]
    resizer = CustomResize(cfg.PREPROC.SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE)
    resized_img = resizer.augment(img)
    scale = np.sqrt(resized_img.shape[0] * 1.0 / img.shape[0] *
                    resized_img.shape[1] / img.shape[1])
    boxes, probs, labels, fv = model_func(resized_img)
    boxes = boxes / scale
    # boxes are already clipped inside the graph, but after the floating-point
    # scaling, this may not be true any more.
    boxes = clip_boxes(boxes, orig_shape)
    results = [DetectionResult(*args) for args in zip(boxes, probs, labels, fv)]
    return results
def detect_one_image(img, model_func):
    """
    Run detection on one image, using the TF callable.
    This function should handle the preprocessing internally.

    Args:
        img: an image
        model_func: a callable from the TF model,
            takes image and returns (boxes, probs, labels, [masks])

    Returns:
        [DetectionResult]
    """
    orig_shape = img.shape[:2]
    resizer = CustomResize(config.SHORT_EDGE_SIZE, config.MAX_SIZE)
    resized_img = resizer.augment(img)
    # average of the height and width ratios (other variants in this file
    # use the geometric mean instead)
    scale = (resized_img.shape[0] * 1.0 / img.shape[0] +
             resized_img.shape[1] * 1.0 / img.shape[1]) / 2
    boxes, probs, labels, *masks = model_func(resized_img)
    boxes = boxes / scale
    # boxes are already clipped inside the graph, but after the floating-point
    # scaling, this may not be true any more.
    boxes = clip_boxes(boxes, orig_shape)

    if masks:
        # has mask
        full_masks = [fill_full_mask(box, mask, orig_shape)
                      for box, mask in zip(boxes, masks[0])]
        masks = full_masks
    else:
        # fill with None
        masks = [None] * len(boxes)

    results = [DetectionResult(*args) for args in zip(boxes, probs, labels, masks)]
    return results
def resize_images(inputs):
    resizer = CustomResize(cfg.PREPROC.TEST_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE)
    resized_imgs = [resizer.augment(inp[0]) for inp in inputs]
    org_shapes = [inp[0].shape for inp in inputs]
    scales = [np.sqrt(rimg.shape[0] * 1.0 / org_shape[0] *
                      rimg.shape[1] / org_shape[1])
              for rimg, org_shape in zip(resized_imgs, org_shapes)]
    return [[resized_imgs[i], inp[1], scales[i], org_shapes[i][:2]]
            for i, inp in enumerate(inputs)]
def predict_image(img, model_func):
    """
    Run detection on one image, using the TF callable.
    This function should handle the preprocessing internally.

    Args:
        img: an image
        model_func: a callable from the TF model.
            It takes image and returns (boxes, probs, labels, [masks])

    Returns:
        [DetectionResult]
    """
    orig_shape = img.shape[:2]
    resizer = CustomResize(cfg.PREPROC.TEST_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE)
    resized_img = resizer.augment(img)
    scale = np.sqrt(resized_img.shape[0] * 1.0 / img.shape[0] *
                    resized_img.shape[1] / img.shape[1])
    boxes, probs, labels, *masks = model_func(resized_img)

    # Some slow numpy postprocessing:
    boxes = boxes / scale
    # boxes are already clipped inside the graph, but after the floating-point
    # scaling, this may not be true any more.
    boxes = clip_boxes(boxes, orig_shape)
    if masks:
        full_masks = [_paste_mask(box, mask, orig_shape)
                      for box, mask in zip(boxes, masks[0])]
        masks = full_masks
    else:
        # fill with None
        masks = [None] * len(boxes)

    results = [DetectionResult(*args)
               for args in zip(boxes, probs, labels.tolist(), masks)]
    return results
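# A minimal usage sketch for predict_image, assuming a tensorpack
# OfflinePredictor built from a PredictConfig as done elsewhere in this
# corpus. The model class, checkpoint path, and tensor names below are
# illustrative placeholders, not a confirmed configuration.
import cv2
from tensorpack.predict import PredictConfig, OfflinePredictor
from tensorpack.tfutils import get_model_loader

pred_config = PredictConfig(
    model=ResNetFPNModel(),                      # assumed model definition
    session_init=get_model_loader('/path/to/checkpoint'),  # placeholder path
    input_names=['image'],
    output_names=['output/boxes', 'output/scores', 'output/labels'])
predictor = OfflinePredictor(pred_config)

img = cv2.imread('example.jpg', cv2.IMREAD_COLOR)
for det in predict_image(img, predictor):
    print(det.box, det.score, det.class_id)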
def detect_one_image_scale(img, model_func):
    """
    Run detection on one image, using the TF callable.
    This function should handle the preprocessing internally.

    Args:
        img: an image
        model_func: a callable from the TF model,
            takes image and returns (boxes, probs, labels, [masks])

    Returns:
        [DetectionResult]
    """
    scores_ts = []
    boxes_ts = []
    labels_ts = []
    masks_ts = []

    def add_preds_t(scores_t, boxes_t, labels_t, masks_t):
        scores_ts.append(scores_t)
        boxes_ts.append(boxes_t)
        labels_ts.append(labels_t)
        masks_ts.append(masks_t)

    orig_shape = img.shape[:2]
    for bbox_aug_scale in cfg.TEST.BBOX_AUG_SCALES:
        resizer = CustomResize(bbox_aug_scale, cfg.TEST.BBOX_AUG_MAX_SIZE)
        resized_img = resizer.augment(img)
        scale = np.sqrt(resized_img.shape[0] * 1.0 / img.shape[0] *
                        resized_img.shape[1] / img.shape[1])
        boxes, probs, labels, *masks = model_func(resized_img)
        boxes = boxes / scale
        # boxes are already clipped inside the graph, but after the
        # floating-point scaling, this may not be true any more.
        boxes = clip_boxes(boxes, orig_shape)
        add_preds_t(probs, boxes, labels, masks)

    if cfg.TEST.BBOX_AUG_COORD_HEUR == 'UNION':
        boxes_c = np.vstack(boxes_ts)
        # scores and labels are 1-D per scale, so concatenate instead of
        # vstack (vstack fails when the per-scale counts differ)
        scores_c = np.concatenate(scores_ts)
        labels_c = np.concatenate(labels_ts)
        masks_c = np.vstack(masks_ts)

    # Apply NMS
    logger.info("detect_one_image_scale...")
    logger.info(boxes_c)
    logger.info(scores_c)

    if masks:
        # has mask
        full_masks = [fill_full_mask(box, mask, orig_shape)
                      for box, mask in zip(boxes_c, masks_c[0])]
        masks = full_masks
    else:
        # fill with None
        masks = [None] * len(boxes_c)

    results = [DetectionResult(*args)
               for args in zip(boxes_c, scores_c, labels_c, masks)]
    return results
def pre_processing_inference(img):
    orig_shape = img.shape[:2]
    resizer = CustomResize(cfg.PREPROC.TEST_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE)
    resized_img = resizer.augment(img)
    scale = np.sqrt(resized_img.shape[0] * 1.0 / img.shape[0] *
                    resized_img.shape[1] / img.shape[1])
    return resized_img, scale, orig_shape
def run_resize_image(img):
    orig_shape = img.shape[:2]
    resizer = CustomResize(cfg.PREPROC.TEST_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE)
    resized_img = resizer.augment(img)
    scale = np.sqrt(resized_img.shape[0] * 1.0 / img.shape[0] *
                    resized_img.shape[1] / img.shape[1])
    return resized_img, orig_shape, scale
def predict_image(img, model_func):
    """
    Run detection on one image, using the TF callable.
    This function should handle the preprocessing internally.

    Args:
        img: an image
        model_func: a callable from the TF model.
            It takes image and returns (boxes, probs, labels, [masks])

    Returns:
        [DetectionResult]
    """
    global total_time
    global cnt
    print("predict_image")
    # print("model_func")
    # print(model_func)

    orig_shape = img.shape[:2]
    resizer = CustomResize(cfg.PREPROC.TEST_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE)
    resized_img = resizer.augment(img)
    scale = np.sqrt(resized_img.shape[0] * 1.0 / img.shape[0] *
                    resized_img.shape[1] / img.shape[1])

    start_time = time.time()
    boxes, probs, labels, *masks = model_func(resized_img)
    end_time = time.time()
    cnt += 1
    total_time += end_time - start_time
    # running average over all calls so far
    print(f"--------- Inference time : {total_time / cnt} seconds -----------------")
    # print(f"boxes : {boxes}")
    # print(f"probs : {probs}")
    # print(f"labels : {labels}")
    # print(f"masks : {masks}")
    # print(len(masks))        # 1
    # print(masks[0].shape)    # (11, 28, 28)

    # Some slow numpy postprocessing:
    boxes = boxes / scale
    # boxes are already clipped inside the graph, but after the floating-point
    # scaling, this may not be true any more.
    boxes = clip_boxes(boxes, orig_shape)
    if masks:
        full_masks = [_paste_mask(box, mask, orig_shape)
                      for box, mask in zip(boxes, masks[0])]
        masks = full_masks
    else:
        # fill with None
        masks = [None] * len(boxes)

    results = [DetectionResult(*args)
               for args in zip(boxes, probs, labels.tolist(), masks)]
    return results
def __init__(self, name, need_network=True, need_img=True, model="best"):
    super().__init__(name=name, is_deterministic=True)
    self._resizer = CustomResize(cfg.PREPROC.TEST_SHORT_EDGE_SIZE,
                                 cfg.PREPROC.MAX_SIZE)
    self._prev_box = None
    self._ff_gt_feats = None
    self._need_network = need_network
    self._need_img = need_img
    self._rotated_bbox = None

    if need_network:
        logger.set_logger_dir(
            "/tmp/test_log_/" + str(random.randint(0, 10000)), 'd')
        if model == "best":
            load = "train_log/hard_mining3/model-1360500"
        elif model == "nohardexamples":
            load = "train_log/condrcnn_all_2gpu_lrreduce2/model-1200500"
        elif model == "newrpn":
            load = "train_log/newrpn1/model"
        elif model == "resnet50_nohardexamples":
            load = "train_log/condrcnn_all_resnet50/model-1200500"
            cfg.BACKBONE.RESNET_NUM_BLOCKS = [3, 4, 6, 3]
        elif model == "resnet50":
            load = "train_log/hard_mining3_resnet50/model-1360500"
            cfg.BACKBONE.RESNET_NUM_BLOCKS = [3, 4, 6, 3]
        elif model == "gotonly":
            load = "train_log/hard_mining3_onlygot/model-1361000"
        elif model.startswith("checkpoint:"):
            load = model.replace("checkpoint:", "")
        else:
            assert False, ("unknown model", model)
        from dataset import DetectionDataset
        # init tensorpack model
        # cfg.freeze(False)
        DetectionDataset()  # initialize the config with information from our dataset
        cfg.EXTRACT_GT_FEATURES = True
        cfg.MODE_TRACK = False
        extract_model = ResNetFPNModel()
        extract_ff_feats_cfg = PredictConfig(
            model=extract_model,
            session_init=get_model_loader(load),
            input_names=['image', 'roi_boxes'],
            output_names=['rpn/feature'])
        finalize_configs(is_training=False)
        self._extract_func = OfflinePredictor(extract_ff_feats_cfg)
        cfg.EXTRACT_GT_FEATURES = False
        cfg.MODE_TRACK = True
        cfg.USE_PRECOMPUTED_REF_FEATURES = True
        self._pred_func = self._make_pred_func(load)
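# A hedged sketch connecting this initializer to
# predict_image_track_with_precomputed_ref_features above: the first-frame
# reference features are presumably extracted once with _extract_func
# (input_names ['image', 'roi_boxes'], output 'rpn/feature') and reused on
# later frames. Whether roi_boxes must be pre-scaled to the resized image is
# an assumption here; `tracker`, `first_img`, `first_box`, `next_img` are
# illustrative names.
resized_first = tracker._resizer.augment(first_img)
scale = np.sqrt(resized_first.shape[0] * 1.0 / first_img.shape[0] *
                resized_first.shape[1] / first_img.shape[1])
roi_boxes = np.array([first_box], dtype=np.float32) * scale
ref_feats = tracker._extract_func(resized_first, roi_boxes)[0]

# Every later frame reuses the precomputed reference features:
results = predict_image_track_with_precomputed_ref_features(
    next_img, ref_feats, tracker._pred_func)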
def detect_one_image_cls(img, model_func):
    """
    Run detection on one image, using the TF callable.
    This function should handle the preprocessing internally.

    Args:
        img: an image
        model_func: a callable from the TF model, takes image and returns
            (boxes, probs, labels, ious, img_level_label,
             img_level_label_score, [masks])

    Returns:
        ([DetectionResult], img_level_label)
    """
    orig_shape = img.shape[:2]
    resizer = CustomResize(cfg.PREPROC.TEST_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE)
    resized_img = resizer.augment(img)
    scale = np.sqrt(resized_img.shape[0] * 1.0 / img.shape[0] *
                    resized_img.shape[1] / img.shape[1])
    boxes, probs, labels, ious, img_level_label, img_level_label_score, *masks = \
        model_func(resized_img)
    boxes = boxes / scale
    # boxes are already clipped inside the graph, but after the floating-point
    # scaling, this may not be true any more.
    boxes = clip_boxes(boxes, orig_shape)

    # box, prob, keep = soft_nms_py(boxes, probs, overlap_thresh=0.3,
    #                               score_thresh=0.001, method='gaussian')
    # top_det, top_score = box_voting(box, prob, boxes, probs, thresh=0.7,
    #                                 scoring_method='ID')
    # labels = labels[keep]
    # ious = ious[keep]
    # boxes = top_det
    # probs = top_score

    if masks:
        # has mask
        full_masks = [fill_full_mask(box, mask, orig_shape)
                      for box, mask in zip(boxes, masks[0])]
        masks = full_masks
    else:
        # fill with None
        masks = [None] * len(boxes)

    results = [DetectionResult(*args)
               for args in zip(boxes, probs, labels, ious, masks)]
    return results, img_level_label[0]
def get_train_aseval_dataflow():
    """
    Return a dataflow over the training images for evaluation
    (no filtering, no shuffling).
    """
    prw = PRWDataset(cfg.DATA.BASEDIR)
    imgs = prw.load()
    # no filter for training
    # test if it can repeat keys
    ds = DataFromList(imgs, shuffle=False)
    aug = imgaug.AugmentorList(
        [CustomResize(cfg.PREPROC.SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE)])

    def preprocess(img):
        fname = img['file_name']
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        assert im is not None, fname  # check the read before touching im.shape
        orig_shape = im.shape[:2]
        im = im.astype('float32')
        # augmentation:
        im, params = aug.augment_return_params(im)
        ret = [fname, im, orig_shape]
        return ret

    ds = MapData(ds, preprocess)
    return ds
def __init__(self, cfg):
    self.cfg = cfg
    self.aug = imgaug.AugmentorList([
        CustomResize(cfg.PREPROC.TRAIN_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE),
        imgaug.Flip(horiz=True)
    ])
def read_and_augment_images(ds):
    def mapf(dp):
        fname = dp[0]
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        assert im is not None, dp[0]  # check the read before converting dtype
        dp[0] = im.astype('float32')

        # assume floatbox as input
        assert dp[1].dtype == np.float32
        dp[1] = box_to_point8(dp[1])

        dp.append(fname)
        return dp

    ds = MapData(ds, mapf)

    augs = [
        CustomResize(config.SHORT_EDGE_SIZE, config.MAX_SIZE),
        imgaug.Flip(horiz=True)
    ]
    ds = AugmentImageComponents(ds, augs, index=(0,), coords_index=(1,))

    def unmapf(points):
        boxes = point8_to_box(points)
        return boxes

    ds = MapDataComponent(ds, unmapf, 1)
    return ds
def __init__(self, cfg):
    self.cfg = cfg
    self.aug = imgaug.AugmentorList([
        imgaug.RandomApplyAug(SquareAspectRatioResize(), 0.075),
        # imgaug.RandomApplyAug(imgaug.RandomCropRandomShape(
        #     wmin=int(0.75 * cfg.PREPROC.TRAIN_SHORT_EDGE_SIZE[0]),
        #     hmin=int(0.75 * cfg.PREPROC.TRAIN_SHORT_EDGE_SIZE[0])), 0.25),
        CustomResize(cfg.PREPROC.TRAIN_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE),
        imgaug.RandomApplyAug(imgaug.Flip(horiz=True), 0.5),
    ])
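# These preprocessor variants all build a tensorpack imgaug.AugmentorList.
# A minimal sketch of how such a list is typically applied to an image
# together with its box coordinates, mirroring the pattern used in the
# dataflow functions below; `preproc` and the box values are illustrative.
import numpy as np

im = np.random.rand(480, 640, 3).astype('float32')          # stand-in image
boxes = np.array([[10., 20., 200., 240.]], dtype=np.float32)

im, params = preproc.aug.augment_return_params(im)   # augment image, keep params
points = box_to_point8(boxes)                        # boxes -> 4 corner points
points = preproc.aug.augment_coords(points, params)  # same transform on coords
boxes = point8_to_box(points)                        # back to (x1, y1, x2, y2)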
def detect_one_image(img, model_func):
    """
    Run detection on one image, using the TF callable.
    This function should handle the preprocessing internally.

    Args:
        img: an image
        model_func: a callable from the TF model,
            takes [image] and returns (probs, boxes)

    Returns:
        [DetectionResult]
    """
    resizer = CustomResize(config.SHORT_EDGE_SIZE, config.MAX_SIZE)
    resized_img = resizer.augment(img)
    scale = (resized_img.shape[0] * 1.0 / img.shape[0] +
             resized_img.shape[1] * 1.0 / img.shape[1]) / 2
    fg_probs, fg_boxes = model_func(resized_img)
    fg_boxes = fg_boxes / scale
    fg_boxes = clip_boxes(fg_boxes, img.shape[:2])
    return nms_fastrcnn_results(fg_boxes, fg_probs)
def detect_one_image(img, model_func):
    """
    Run detection on one image, using the TF callable.
    This function should handle the preprocessing internally.

    Args:
        img: an image
        model_func: a callable from the TF model,
            takes [image] and returns (boxes, probs, labels)

    Returns:
        [DetectionResult]
    """
    resizer = CustomResize(config.SHORT_EDGE_SIZE, config.MAX_SIZE)
    resized_img = resizer.augment(img)
    scale = (resized_img.shape[0] * 1.0 / img.shape[0] +
             resized_img.shape[1] * 1.0 / img.shape[1]) / 2
    boxes, probs, labels = model_func(resized_img)
    boxes = boxes / scale
    results = [DetectionResult(*args) for args in zip(labels, boxes, probs)]
    return results
def detect_one_image_TTA2(img, model_func):
    orig_shape = img.shape[:2]
    SCALES = [1800, 2000]
    all_scale_results = []
    augs = [0, 4]
    mask_whole = np.zeros((img.shape[0], img.shape[1]))
    for s in SCALES:
        mask_whole_d = np.zeros((img.shape[0], img.shape[1]))
        for d in augs:
            img = do_flip_transpose(img, d)
            resizer = CustomResize(s, config.MAX_SIZE)
            resized_img = resizer.augment(img.copy())
            scale = (resized_img.shape[0] * 1.0 / img.shape[0] +
                     resized_img.shape[1] * 1.0 / img.shape[1]) / 2
            boxes, probs, labels, *masks = model_func(resized_img)
            boxes = boxes / scale
            if masks:
                # has mask
                full_masks = [fill_full_mask_TTA(box, mask, orig_shape)
                              for box, mask in zip(boxes, masks[0])]
                masks = full_masks
            else:
                # fill with None
                masks = [None] * len(boxes)
            results = [DetectionResult(*args)
                       for args in zip(boxes, probs, labels, masks)]
            for re in results:
                mask_whole_d += undo_flip_transpose(re.mask, d)
        mask_whole_d = mask_whole_d / float(len(augs))
        mask_whole += mask_whole_d
    mask_whole = mask_whole / float(len(SCALES))
    mask_whole = mask_whole > 0.5
    return mask_whole.astype('uint8')
def __init__(self, cfg):
    self.cfg = cfg
    self.aug = imgaug.AugmentorList([
        # imgaug.RandomApplyAug(imgaug.RandomResize(
        #     xrange=(0.8, 1.5),
        #     minimum=(cfg.PREPROC.TRAIN_SHORT_EDGE_SIZE[0],
        #              cfg.PREPROC.TRAIN_SHORT_EDGE_SIZE[0]),
        #     aspect_ratio_thres=0.0), prob=0.5),
        imgaug.Flip(horiz=True, prob=0.5),
        imgaug.Flip(vert=True, prob=0.5),
        imgaug.RandomApplyAug(
            imgaug.Rotation(max_deg=180.0, step_deg=30.0,
                            center_range=(0.5, 0.5)), prob=0.5),
        imgaug.RandomApplyAug(imgaug.Grayscale(keepshape=True), prob=0.5),
        CustomResize(cfg.PREPROC.TRAIN_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE),
    ])
def __init__(self, cfg, is_aws, is_gcs):
    self.cfg = cfg
    self.aug = imgaug.AugmentorList([
        CustomResize(cfg.PREPROC.TRAIN_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE),
        imgaug.Flip(horiz=True),
    ])
    self.is_aws = is_aws
    self.is_gcs = is_gcs
    if self.is_aws:
        self.s3 = boto3.resource("s3")
    elif self.is_gcs:
        self.storage_client = storage.Client.create_anonymous_client()
        self.bucket = self.storage_client.get_bucket("determined-ai-coco-dataset")
def get_query_dataflow():
    """
    Return a dataflow over the query images (no filtering, no shuffling).
    """
    prw = PRWDataset(cfg.DATA.BASEDIR)
    imgs = prw.load_query()
    # no filter for training
    # test if it can repeat keys
    ds = DataFromList(imgs, shuffle=False)
    aug = imgaug.AugmentorList(
        [CustomResize(cfg.PREPROC.SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE)])

    def preprocess(img):
        fname, boxes, re_id_class = (img['file_name'], img['boxes'],
                                     img['re_id_class'])
        boxes = np.copy(boxes)
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        assert im is not None, fname
        im = im.astype('float32')
        # assume floatbox as input
        assert boxes.dtype == np.float32, "Loader has to return floating point boxes!"

        # augmentation:
        im, params = aug.augment_return_params(im)
        points = box_to_point8(boxes)
        points = aug.augment_coords(points, params)
        boxes = point8_to_box(points)
        assert np.min(np_area(boxes)) > 0, "Some boxes have zero area!"

        ret = [im, boxes, re_id_class]
        return ret

    ds = MapData(ds, preprocess)
    return ds
def get_train_dataflow():
    """
    Return a training dataflow. Each datapoint consists of the following:

    An image: (h, w, 3),

    1 or more pairs of (anchor_labels, anchor_boxes):
        anchor_labels: (h', w', NA)
        anchor_boxes: (h', w', NA, 4)

    gt_boxes: (N, 4)
    gt_labels: (N,)

    If MODE_MASK, gt_masks: (N, h, w)
    """
    roidbs = DetectionDataset().load_training_roidbs(cfg.DATA.TRAIN)
    print_class_histogram(roidbs)

    # Valid training images should have at least one fg box.
    # But this filter shall not be applied for testing.
    num = len(roidbs)
    roidbs = list(filter(lambda img: len(img['boxes'][img['is_crowd'] == 0]) > 0, roidbs))
    logger.info(
        "Filtered {} images which contain no non-crowd groundtruth boxes. "
        "Total #images for training: {}".format(num - len(roidbs), len(roidbs)))

    ds = DataFromList(roidbs, shuffle=True)

    aug = imgaug.AugmentorList([
        CustomResize(cfg.PREPROC.TRAIN_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE),
        imgaug.Flip(horiz=True)
    ])

    def preprocess(roidb):
        fname, boxes, klass, is_crowd = (roidb['file_name'], roidb['boxes'],
                                         roidb['class'], roidb['is_crowd'])
        boxes = np.copy(boxes)
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        assert im is not None, fname
        im = im.astype('float32')
        height, width = im.shape[:2]
        # assume floatbox as input
        assert boxes.dtype == np.float32, "Loader has to return floating point boxes!"

        if not cfg.DATA.ABSOLUTE_COORD:
            boxes[:, 0::2] *= width
            boxes[:, 1::2] *= height

        # augmentation:
        im, params = aug.augment_return_params(im)
        points = box_to_point8(boxes)
        points = aug.augment_coords(points, params)
        boxes = point8_to_box(points)
        assert np.min(np_area(boxes)) > 0, "Some boxes have zero area!"

        ret = {'image': im}
        # rpn anchor:
        try:
            if cfg.MODE_FPN:
                multilevel_anchor_inputs = get_multilevel_rpn_anchor_input(im, boxes, is_crowd)
                for i, (anchor_labels, anchor_boxes) in enumerate(multilevel_anchor_inputs):
                    ret['anchor_labels_lvl{}'.format(i + 2)] = anchor_labels
                    ret['anchor_boxes_lvl{}'.format(i + 2)] = anchor_boxes
            else:
                # anchor_labels, anchor_boxes
                ret['anchor_labels'], ret['anchor_boxes'] = \
                    get_rpn_anchor_input(im, boxes, is_crowd)

            boxes = boxes[is_crowd == 0]  # skip crowd boxes in training target
            klass = klass[is_crowd == 0]
            ret['gt_boxes'] = boxes
            ret['gt_labels'] = klass
            if not len(boxes):
                raise MalformedData("No valid gt_boxes!")
        except MalformedData as e:
            log_once("Input {} is filtered for training: {}".format(fname, str(e)), 'warn')
            return None

        if cfg.MODE_MASK:
            # augmentation will modify the polys in-place
            segmentation = copy.deepcopy(roidb['segmentation'])
            segmentation = [segmentation[k] for k in range(len(segmentation))
                            if not is_crowd[k]]
            assert len(segmentation) == len(boxes)

            # Apply augmentation on polygon coordinates.
            # And produce one image-sized binary mask per box.
            masks = []
            width_height = np.asarray([width, height], dtype=np.float32)
            for polys in segmentation:
                if not cfg.DATA.ABSOLUTE_COORD:
                    polys = [p * width_height for p in polys]
                polys = [aug.augment_coords(p, params) for p in polys]
                masks.append(segmentation_to_mask(polys, im.shape[0], im.shape[1]))
            masks = np.asarray(masks, dtype='uint8')  # values in {0, 1}
            ret['gt_masks'] = masks

            # from viz import draw_annotation, draw_mask
            # viz = draw_annotation(im, boxes, klass)
            # for mask in masks:
            #     viz = draw_mask(viz, mask)
            # tpviz.interactive_imshow(viz)
        return ret

    if cfg.TRAINER == 'horovod':
        ds = MultiThreadMapData(ds, 5, preprocess)  # MPI does not like fork()
    else:
        ds = MultiProcessMapDataZMQ(ds, 10, preprocess)
    return ds
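# A brief sketch of consuming such a tensorpack dataflow outside the trainer,
# e.g. to sanity-check a few datapoints. reset_state() is required before
# iterating a tensorpack DataFlow manually; the field names match the dict
# built in preprocess() above.
df = get_train_dataflow()
df.reset_state()
for i, dp in enumerate(df):
    print(dp['image'].shape, dp['gt_boxes'].shape, dp['gt_labels'])
    if i >= 2:
        break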
def get_train_dataflow_coco(add_mask=False):
    """
    Return a training dataflow. Each datapoint is:
    image, fm_labels, fm_boxes, gt_boxes, gt_class [, masks]
    """
    imgs = COCODetection.load_many(config.BASEDIR, config.TRAIN_DATASET,
                                   add_gt=True, add_mask=add_mask)
    # Valid training images should have at least one fg box.
    # But this filter shall not be applied for testing.
    imgs = list(filter(lambda img: len(img['boxes']) > 0, imgs))  # log invalid training

    ds = DataFromList(imgs, shuffle=True)

    aug = imgaug.AugmentorList([
        CustomResize(config.SHORT_EDGE_SIZE, config.MAX_SIZE),
        imgaug.Flip(horiz=True)
    ])

    def preprocess(img):
        print("start preproc coco")
        start = time.time()

        if config.USE_SECOND_HEAD:
            fname, boxes, klass, second_klass, is_crowd = (
                img['file_name'], img['boxes'], img['class'],
                img['second_class'], img['is_crowd'])
        else:
            fname, boxes, klass, is_crowd = (
                img['file_name'], img['boxes'], img['class'], img['is_crowd'])
            second_klass = None

        res = preproc_img(fname, boxes, klass, second_klass, is_crowd, aug)
        if res is None:
            print("coco: preproc_img returned None on", fname)
            return None
        ret, params = res
        im = ret[0]
        boxes = ret[3]

        # masks
        if add_mask:
            # augmentation will modify the polys in-place
            segmentation = copy.deepcopy(img.get('segmentation', None))
            segmentation = [segmentation[k] for k in range(len(segmentation))
                            if not is_crowd[k]]
            assert len(segmentation) == len(boxes), (len(segmentation), len(boxes))

            # one image-sized binary mask per box
            masks = []
            for polys in segmentation:
                polys = [aug.augment_coords(p, params) for p in polys]
                masks.append(segmentation_to_mask(polys, im.shape[0], im.shape[1]))
            masks = np.asarray(masks, dtype='uint8')  # values in {0, 1}
            ret.append(masks)

            # from viz import draw_annotation, draw_mask
            # viz = draw_annotation(im, boxes, klass)
            # for mask in masks:
            #     viz = draw_mask(viz, mask)
            # tpviz.interactive_imshow(viz)

        end = time.time()
        elapsed = end - start
        print("coco example done, elapsed:", elapsed)
        return ret

    # ds = MapData(ds, preprocess)
    ds = MultiProcessMapData(ds, nr_proc=4, map_func=preprocess, buffer_size=20)
    return ds
def get_train_dataflow_mapillary(add_mask=False, map_to_coco=False):
    train_img_path = config.MAPILLARY_PATH + "training/images/"
    train_label_path = config.MAPILLARY_PATH + "training/instances/"
    imgs = glob.glob(train_img_path + "*.jpg")

    ds = DataFromList(imgs, shuffle=True)

    aug = imgaug.AugmentorList([
        CustomResize(config.SHORT_EDGE_SIZE, config.MAX_SIZE),
        imgaug.Flip(horiz=True)
    ])

    def preprocess(fname):
        print("start preproc mapillary")
        start = time.time()

        label_fname = fname.replace(train_img_path, train_label_path).replace(".jpg", ".png")
        pil_label = Image.open(label_fname)
        label = np.array(pil_label)
        instances = np.unique(label)
        instance_classes = [x // 256 for x in instances]

        # filter by categories we use
        instances_valid = [cls in config.MAPILLARY_CAT_IDS_TO_USE
                           for cls in instance_classes]
        instances = [inst for inst, valid in zip(instances, instances_valid) if valid]
        instance_classes = [cls for cls, valid in zip(instance_classes, instances_valid)
                            if valid]

        if len(instances) == 0:
            print("no instances")
            pil_label.close()
            return None

        if map_to_coco:
            instance_classes = [config.MAPILLARY_TO_COCO_MAP[cls]
                                for cls in instance_classes]
            instance_classes = [config.VOID_LABEL if cls == config.VOID_LABEL
                                else COCOMeta.category_id_to_class_id[cls]
                                for cls in instance_classes]
        else:
            # remap to contiguous numbers starting with 1
            instance_classes = [config.MAPILLARY_CAT_IDS_TO_USE.index(cls) + 1
                                for cls in instance_classes]

        masks = np.array([label == inst for inst in instances], dtype=np.uint8)

        # import cProfile
        # start1 = time.time()
        boxes1 = np.array([get_bbox_from_segmentation_mask(mask) for mask in masks],
                          dtype=np.float32)
        # boxes1_time = time.time() - start1
        # pr = cProfile.Profile()
        # pr.enable()
        # start1 = time.time()
        # boxes2 = get_bboxes_from_segmentation_masks(masks)
        # print("boxes1", boxes1_time, "boxes2", time.time() - start1)
        # pr.disable()
        # pr.print_stats(sort="cumulative")
        # assert (boxes1 == boxes2).all(), (boxes1, boxes2)
        boxes = boxes1

        second_klass = np.array(instance_classes, dtype=np.int)
        klass = np.ones_like(second_klass)
        is_crowd = np.zeros_like(second_klass)

        res = preproc_img(fname, boxes, klass, second_klass, is_crowd, aug)
        if res is None:
            print("mapillary: preproc_img returned None on", fname)
            pil_label.close()
            return None
        ret, params = res

        if add_mask:
            do_flip, h, w = params[1]
            assert do_flip in (True, False), do_flip
            # augment label
            label = np.array(pil_label.resize((w, h), Image.NEAREST))
            if do_flip:
                label = label[:, ::-1]
            # create augmented masks
            masks = np.array([label == inst for inst in instances], dtype=np.uint8)
            ret.append(masks)

        end = time.time()
        elapsed = end - start
        print("mapillary example done, elapsed:", elapsed)

        VISUALIZE = False
        if VISUALIZE:
            from viz import draw_annotation, draw_mask
            config.CLASS_NAMES = [str(idx) for idx in range(81)]
            im = ret[0]
            boxes = ret[3]
            draw_klass = ret[-2]
            viz = draw_annotation(im, boxes, draw_klass)
            for mask in masks:
                viz = draw_mask(viz, mask)
            tpviz.interactive_imshow(viz)

        pil_label.close()
        return ret

    # ds = MapData(ds, preprocess)
    ds = MultiProcessMapData(ds, nr_proc=8, map_func=preprocess, buffer_size=35)
    return ds
def get_train_dataflow_davis(add_mask=False):
    # Earlier data locations, kept for reference:
    # train_img_path = config.DAVIS_PATH + "train/"
    # train_label_path = config.DAVIS_PATH + "train-gt/"
    # imgs = glob.glob(train_img_path + "*/*.jpg")
    # train_img_path = "/home/luiten/vision/PReMVOS/data/first/bike-trial/lucid_data_dreaming/"
    # train_label_path = "/home/luiten/vision/PReMVOS/data/first/bike-trial/lucid_data_dreaming/"
    # train_img_path = "/home/luiten/vision/PReMVOS/data/" + config.DAVIS_NAME + "/lucid_data_dreaming/"
    # train_label_path = "/home/luiten/vision/PReMVOS/data/" + config.DAVIS_NAME + "/lucid_data_dreaming/"
    # train_img_path = "/home/luiten/vision/youtubevos/ytvos_data/together/generated/augment_images/"
    # train_label_path = "/home/luiten/vision/youtubevos/ytvos_data/together/generated/augment_gt/"
    train_img_path = "/home/luiten/vision/youtubevos/DAVIS/davis_together/augment_images/"
    train_label_path = "/home/luiten/vision/youtubevos/DAVIS/davis_together/augment_gt/"
    imgs = sorted(glob.glob(train_img_path + "*/*.jpg"))

    ds = DataFromList(imgs, shuffle=True)

    aug = imgaug.AugmentorList([
        CustomResize(config.SHORT_EDGE_SIZE, config.MAX_SIZE),
        imgaug.Flip(horiz=True)
    ])

    def preprocess(fname):
        start = time.time()
        label_fname = fname.replace(train_img_path, train_label_path).replace(".jpg", ".png")
        pil_label = Image.open(label_fname)
        label = np.array(pil_label)
        instances = np.unique(label)
        instance_classes = [x // 256 for x in instances]
        if len(instances) == 0:
            print("no instances")
            pil_label.close()
            return None

        masks = np.array([label == inst for inst in instances], dtype=np.uint8)
        boxes1 = np.array([get_bbox_from_segmentation_mask(mask) for mask in masks],
                          dtype=np.float32)
        boxes = boxes1

        # second_klass = np.array(instance_classes, dtype=np.int)
        second_klass = np.zeros_like(instance_classes, dtype=np.int)
        klass = np.ones_like(second_klass)
        is_crowd = np.zeros_like(second_klass)

        res = preproc_img(fname, boxes, klass, second_klass, is_crowd, aug)
        if res is None:
            print("davis: preproc_img returned None on", fname)
            pil_label.close()
            return None
        ret, params = res

        if add_mask:
            do_flip, h, w = params[1]
            assert do_flip in (True, False), do_flip
            # augment label
            label = np.array(pil_label.resize((w, h), Image.NEAREST))
            if do_flip:
                label = label[:, ::-1]
            # create augmented masks
            masks = np.array([label == inst for inst in instances], dtype=np.uint8)
            ret.append(masks)

        end = time.time()
        elapsed = end - start
        # print("davis example done, elapsed:", elapsed)

        VISUALIZE = False
        if VISUALIZE:
            from viz import draw_annotation, draw_mask
            config.CLASS_NAMES = [str(idx) for idx in range(81)]
            im = ret[0]
            boxes = ret[3]
            draw_klass = ret[-2]
            viz = draw_annotation(im, boxes, draw_klass)
            for mask in masks:
                viz = draw_mask(viz, mask)
            tpviz.interactive_imshow(viz)

        pil_label.close()
        return ret

    ds = MapData(ds, preprocess)
    # ds = MultiProcessMapData(ds, nr_proc=8, map_func=preprocess, buffer_size=35)
    # ds = MultiProcessMapData(ds, nr_proc=8, map_func=preprocess)
    return ds
class MaskRCNNDocCrop():
    def __init__(self,
                 model_path='weights/MaskRCNN-R50C41x-COCO_finetune-docrop_and_rotate_24500.pb',
                 canvas_size=512,
                 debug=False):
        if not tf.test.is_gpu_available():
            from tensorflow.python.framework import test_util
            assert get_tf_version_tuple() >= (1, 7) and test_util.IsMklEnabled(), \
                "Inference requires either GPU support or MKL support!"
        self.canvas_size = canvas_size
        self.debug = debug
        self.id_to_class_name = {
            1: 'page',
            2: 'profile_image',
            3: 'van_tay',
            4: 'passport_code'
        }
        self.resizer = CustomResize(self.canvas_size, self.canvas_size)

        print('Loading model at', model_path)
        self.graph = load_graph(model_path)
        self.input_tensor = self.graph.get_tensor_by_name('import/image:0')
        self.output_node_name = [
            'output/boxes', 'output/scores', 'output/labels', 'output/masks'
        ]
        self.outputs_tensor = [
            self.graph.get_tensor_by_name('import/{}:0'.format(each_node))
            for each_node in self.output_node_name
        ]
        self.config = tf.compat.v1.ConfigProto()
        # self.config.gpu_options.allow_growth = True
        self.config.gpu_options.per_process_gpu_memory_fraction = 0.1
        self.sess = tf.compat.v1.Session(config=self.config, graph=self.graph)
        # warm up the session once so later calls are fast
        self.predict_crop(np.zeros((200, 200, 3), dtype=np.uint8))
        print('Loaded model!')

    def _scale_box(self, box, scale):
        w_half = (box[2] - box[0]) * 0.5
        h_half = (box[3] - box[1]) * 0.5
        x_c = (box[2] + box[0]) * 0.5
        y_c = (box[3] + box[1]) * 0.5
        w_half *= scale
        h_half *= scale
        scaled_box = np.zeros_like(box)
        scaled_box[0] = x_c - w_half
        scaled_box[2] = x_c + w_half
        scaled_box[1] = y_c - h_half
        scaled_box[3] = y_c + h_half
        return scaled_box

    def _paste_mask(self, box, mask, shape, accurate_paste=True):
        """
        Args:
            box: 4 floats
            mask: MxM floats
            shape: h, w

        Returns:
            A uint8 binary image of hxw.
        """
        assert mask.shape[0] == mask.shape[1], mask.shape
        if accurate_paste:
            # This method is accurate but much slower.
            mask = np.pad(mask, [(1, 1), (1, 1)], mode='constant')
            box = self._scale_box(box, float(mask.shape[0]) / (mask.shape[0] - 2))
            mask_pixels = np.arange(0.0, mask.shape[0]) + 0.5
            mask_continuous = interpolate.interp2d(mask_pixels, mask_pixels,
                                                   mask, fill_value=0.0)
            h, w = shape
            ys = np.arange(0.0, h) + 0.5
            xs = np.arange(0.0, w) + 0.5
            ys = (ys - box[1]) / (box[3] - box[1]) * mask.shape[0]
            xs = (xs - box[0]) / (box[2] - box[0]) * mask.shape[1]
            # Wastes a lot of compute since most indices are out-of-border
            res = mask_continuous(xs, ys)
            return (res >= 0.5).astype('uint8')
        else:
            # This method (inspired by Detectron) is less accurate but fast.
            # int() is floor
            # box fpcoor=0.0 -> intcoor=0.0
            x0, y0 = list(map(int, box[:2] + 0.5))
            # box fpcoor=h -> intcoor=h-1, inclusive
            x1, y1 = list(map(int, box[2:] - 0.5))  # inclusive
            x1 = max(x0, x1)  # require at least 1x1
            y1 = max(y0, y1)
            w = x1 + 1 - x0
            h = y1 + 1 - y0
            # Rounding errors could happen here, because masks were not
            # originally computed for this shape. But it's hard to do better,
            # because the network does not know the "original" scale.
            mask = (cv2.resize(mask, (w, h)) > 0.5).astype('uint8')
            ret = np.zeros(shape, dtype='uint8')
            ret[y0:y1 + 1, x0:x1 + 1] = mask
            return ret

    def predict_crop(self, img, debug_id=None):
        start_time = time.time()
        orig_shape = img.shape[:2]
        resized_img = self.resizer.augment(img)
        scale = np.sqrt(resized_img.shape[0] * 1.0 / img.shape[0] *
                        resized_img.shape[1] / img.shape[1])
        boxes, probs, labels, *masks = self.sess.run(
            self.outputs_tensor, feed_dict={self.input_tensor: resized_img})

        # Some slow numpy postprocessing:
        boxes = boxes / scale
        # boxes are already clipped inside the graph, but after the
        # floating-point scaling, this may not be true any more.
        boxes = clip_boxes(boxes, orig_shape)
        if masks:
            full_masks = [self._paste_mask(box, mask, orig_shape)
                          for box, mask in zip(boxes, masks[0])]
            masks = full_masks
        else:
            # fill with None
            masks = [None] * len(boxes)

        polygons = []
        # Estimate a polygon based on each mask right here
        for mask in masks:
            temp_mask = np.expand_dims(mask, axis=-1) * 255
            cnts = cv2.findContours(temp_mask, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
            cnts = imutils.grab_contours(cnts)
            cnt = max(cnts, key=cv2.contourArea)
            peri = cv2.arcLength(cnt, True)
            estimated_polygon = cv2.approxPolyDP(cnt, 0.02 * peri, True)
            polygons.append(estimated_polygon)
            # temp_mask = cv2.cvtColor(temp_mask, cv2.COLOR_GRAY2BGR)
            # viz_img = cv2.polylines(temp_mask, [estimated_polygon], isClosed=True,
            #                         color=(255, 0, 255), thickness=10)
            # cv2.imwrite('mask.png', viz_img)

        results = [DetectionResult(*args)
                   for args in zip(boxes, probs, labels.tolist(), masks, polygons)]

        if self.debug:
            print('Crop took {} secs.'.format(time.time() - start_time))
            debug_id = str(uuid.uuid4()) if debug_id is None else debug_id
            debug_path = os.path.join('./debugs/', debug_id)
            os.makedirs(debug_path, exist_ok=True)
            final = draw_final_outputs_blackwhite(img, results)
            cv2.imwrite(os.path.join(debug_path, 'prediction.png'), final)
            return results, debug_path
        return results

    def create_shapely_polygon(self, each_object):
        try:
            obj_polygon = Polygon([(each[0][0], each[0][1])
                                   for each in each_object.polygon])
            if not obj_polygon.is_valid:
                obj_polygon = obj_polygon.buffer(0)
        except ValueError:
            # use the bounding box instead
            x1, y1, x2, y2 = [int(each) for each in each_object.box]
            org_bb = [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]
            obj_polygon = Polygon(org_bb)
            if not obj_polygon.is_valid:
                obj_polygon = obj_polygon.buffer(0)
        return obj_polygon

    def rotate_anno(self, all_object, angle, raw_img_shape, before_shape=None):
        new_object = []
        # Create the full-page mask
        old_shape = before_shape if before_shape is not None else raw_img_shape[::-1]
        full_page = [(0, 0), (old_shape[1], 0), (old_shape[1], old_shape[0]),
                     (0, old_shape[0])]
        rotated_full_page = rotate_polygon(full_page, angle, raw_img_shape)
        # Calculate the offset introduced by the rotation
        top_left_x = min([each[0] for each in rotated_full_page])
        top_left_y = min([each[1] for each in rotated_full_page])
        for obj in all_object:
            rotated_obj = obj
            # For the bounding box
            x1, y1, x2, y2 = [int(each) for each in obj.box]
            org_bb = [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]
            rotated_bb = rotate_polygon(org_bb, angle, raw_img_shape,
                                        top_left_x, top_left_y)
            rotated_bb = [item for sublist in [rotated_bb[0], rotated_bb[2]]
                          for item in sublist]
            # For the polygon
            org_polygon = [(each[0][0], each[0][1]) for each in obj.polygon]
            rotated_polygon = rotate_polygon(org_polygon, angle, raw_img_shape,
                                             top_left_x, top_left_y)
            rotated_polygon = np.expand_dims(
                np.array(rotated_polygon, dtype=np.int32), axis=1)
            rotated_obj = rotated_obj._replace(polygon=rotated_polygon, box=rotated_bb)
            new_object.append(rotated_obj)
        return new_object

    def get_overlap_object(self, page, page_index, cropped_result_raw):
        cropped_result = copy.deepcopy(cropped_result_raw)
        page_polygon = self.create_shapely_polygon(page)
        if not page_polygon.is_valid:
            page_polygon = page_polygon.buffer(0)
        del cropped_result[page_index]
        overlaped_object = []
        for each_object in cropped_result:
            obj_polygon = self.create_shapely_polygon(each_object)
            intersec_percentage = obj_polygon.intersection(page_polygon).area / obj_polygon.area
            if intersec_percentage >= 0.85:
                overlaped_object.append(each_object)
        return overlaped_object

    def keep_only_biggest(self, all_object):
        group_dict = {k.class_id: [] for k in all_object}
        filtered_object = []
        for each in all_object:
            group_dict[each.class_id].append(each)
        for each_group, all_members in group_dict.items():
            area_list = []
            for each_member in all_members:
                each_polygon = self.create_shapely_polygon(each_member)
                area_list.append(each_polygon.area)
            filtered_object.append(all_members[np.argmax(area_list)])
        return filtered_object

    def refine_object_location(self, page_bbox, other_objects):
        x, y = page_bbox[0], page_bbox[1]
        new_objects = []
        for index, each in enumerate(other_objects):
            refined_object = each
            old_bb = refined_object.box
            new_bb = np.array([old_bb[0] - x, old_bb[1] - y,
                               old_bb[2] - x, old_bb[3] - y], dtype=np.int32)
            old_polygon = np.squeeze(refined_object.polygon)
            new_polygon = np.array([[e[0] - x, e[1] - y] for e in old_polygon],
                                   dtype=np.int32)
            new_polygon = np.expand_dims(new_polygon, axis=1)
            refined_object = refined_object._replace(box=new_bb, polygon=new_polygon)
            new_objects.append(refined_object)
        return new_objects

    def find_object_by_name(self, all_object, field_name):
        for each_object in all_object:
            if self.id_to_class_name[each_object.class_id] == field_name:
                return each_object
        return None

    def big_rotate_without_anchor(self, cropped_page, page, all_object):
        h, w, _ = cropped_page.shape
        if h > w:
            angle = 90
            before_rotate_shape = cropped_page.shape[:-1]
            cropped_page = imutils.rotate_bound(cropped_page, angle,
                                                cval=(255, 255, 255))
            after_rotate_shape = cropped_page.shape[:-1]
            page = self.rotate_anno([page], angle, after_rotate_shape,
                                    before_rotate_shape)[0]
            all_object = self.rotate_anno(all_object, angle, after_rotate_shape,
                                          before_rotate_shape)
        return cropped_page, page, all_object

    def big_rotate_with_anchor(self, cropped_page, page, all_object, anchor_field):
        anchor_object = self.find_object_by_name(all_object, anchor_field)
        anchor_polygon = self.create_shapely_polygon(anchor_object)
        if not anchor_polygon.is_valid:
            anchor_polygon = anchor_polygon.buffer(0)
        anchor_points = anchor_polygon.centroid.coords[0]
        page_height, page_width, _ = cropped_page.shape
        up_side_down = False
        # NOTE: the original condition `anchor_field == 'profile_image' or
        # 'van_tay'` was always true; the membership test below is what was
        # intended.
        if anchor_field in ('profile_image', 'van_tay'):
            if anchor_points[0] >= 0.5 * page_width:
                up_side_down = True
        elif anchor_field == 'passport_code':
            if anchor_points[1] <= 0.5 * page_height:
                up_side_down = True
        if up_side_down:
            for i in range(2):  # two 90-degree turns = 180 degrees
                angle = 90
                before_rotate_shape = cropped_page.shape[:-1]
                cropped_page = imutils.rotate_bound(cropped_page, angle,
                                                    cval=(255, 255, 255))
                after_rotate_shape = cropped_page.shape[:-1]
                page = self.rotate_anno([page], angle, after_rotate_shape,
                                        before_rotate_shape)[0]
                all_object = self.rotate_anno(all_object, angle, after_rotate_shape,
                                              before_rotate_shape)
        return cropped_page, page, all_object

    def crop_and_rotate(self, image, debug_id=None):
        start_time = time.time()
        cropped_results = self.predict_crop(image, debug_id=debug_id)
        if self.debug:
            cropped_result, debug_path = cropped_results
        else:
            cropped_result = cropped_results
        all_pages_result = [(index, each) for index, each in enumerate(cropped_result)
                            if each.class_id == 1]
        results = []
        for page_index, each_page in all_pages_result:
            raw_page_polygon = each_page.polygon  # kept for the final return value
            # Find which other objects are associated with this page
            other_objects = self.get_overlap_object(each_page, page_index,
                                                    cropped_result)

            # Crop the page out of the raw image
            page_bbox = [int(each) for each in each_page.box]
            cropped_page = image[page_bbox[1]:page_bbox[3], page_bbox[0]:page_bbox[2]]
            # And then refine the page polygon
            each_page = self.refine_object_location(page_bbox, [each_page])[0]
            if other_objects:
                # Clear duplicates by keeping only the biggest of each class
                other_objects = self.keep_only_biggest(other_objects)
                # And refine the associated locations as well
                other_objects = self.refine_object_location(page_bbox, other_objects)

            # Fine rotation for the whole group first:
            # estimate the rotation angle of the page
            angle = cv2.minAreaRect(each_page.polygon)[-1]
            if angle < -45:
                angle = -(90 + angle)
            else:
                angle = -angle
            # Rotate the whole crop, including the page and the annotations
            # associated with that page
            before_rotate_shape = cropped_page.shape[:-1]
            cropped_page = imutils.rotate_bound(cropped_page, angle=angle,
                                                cval=(255, 255, 255))
            after_rotate_shape = cropped_page.shape[:-1]
            # After this we have to crop the page and refine all polygons again
            each_page = self.rotate_anno([each_page], angle, after_rotate_shape,
                                         before_rotate_shape)[0]
            page_polygon = [(each[0][0], each[0][1]) for each in each_page.polygon]
            all_X = [each[0] for each in page_polygon]
            all_Y = [each[1] for each in page_polygon]
            # Practice shows the crop should be extended a little to avoid
            # bad masks around the border
            current_height, current_width, _ = cropped_page.shape
            page_bbox = [
                max(0, min(all_X) - int(0.15 * current_width)),
                max(0, min(all_Y) - int(0.15 * current_height)),
                min(current_width, max(all_X) + int(0.1 * current_width)),
                min(current_height, max(all_Y) + int(0.1 * current_height))
            ]
            cropped_page = cropped_page[page_bbox[1]:page_bbox[3],
                                        page_bbox[0]:page_bbox[2]]
            each_page = self.refine_object_location(page_bbox, [each_page])[0]
            if other_objects:
                other_objects = self.rotate_anno(other_objects, angle,
                                                 after_rotate_shape,
                                                 before_rotate_shape)
                other_objects = self.refine_object_location(page_bbox, other_objects)
            if self.debug:
                viz_img = cv2.polylines(cropped_page.copy(),
                                        [x.polygon for x in other_objects],
                                        isClosed=True, color=(0, 255, 255),
                                        thickness=2)
                cv2.imwrite(os.path.join(
                    debug_path, 'rotate_step_1_page_{}.png'.format(page_index)),
                    viz_img)

            # Now do the big rotations (90 or 180 degrees)
            cropped_page, each_page, other_objects = self.big_rotate_without_anchor(
                cropped_page, each_page, other_objects)
            if self.debug:
                viz_img = cv2.polylines(cropped_page.copy(),
                                        [x.polygon for x in other_objects],
                                        isClosed=True, color=(0, 255, 255),
                                        thickness=2)
                cv2.imwrite(os.path.join(
                    debug_path, 'rotate_step_2_page_{}.png'.format(page_index)),
                    viz_img)

            other_object_name = []
            if other_objects:
                # Use an anchor object to correct upside-down pages
                other_object_name = [self.id_to_class_name[each.class_id]
                                     for each in other_objects]
                # If fingerprint, face and MRZ all appear, use the one with the
                # highest confidence, but prefer the profile image and the
                # fingerprint
                most_conf = max(other_objects, key=lambda x: x.score)
                anchor_field = self.id_to_class_name[most_conf.class_id]
                do_it = False
                if 'profile_image' in other_object_name and anchor_field != 'profile_image':
                    profile_image_object = other_objects[
                        other_object_name.index('profile_image')]
                    if abs(profile_image_object.score - most_conf.score) <= 0.05:
                        anchor_field = 'profile_image'
                        do_it = True
                if not do_it and 'van_tay' in other_object_name and anchor_field != 'van_tay':
                    profile_image_object = other_objects[
                        other_object_name.index('van_tay')]
                    if abs(profile_image_object.score - most_conf.score) <= 0.05:
                        anchor_field = 'van_tay'
                cropped_page, each_page, other_objects = self.big_rotate_with_anchor(
                    cropped_page, each_page, other_objects, anchor_field)
            if self.debug:
                viz_img = cv2.polylines(cropped_page.copy(),
                                        [x.polygon for x in other_objects],
                                        isClosed=True, color=(0, 255, 255),
                                        thickness=2)
                cv2.imwrite(os.path.join(
                    debug_path, 'rotated_page_{}.png'.format(page_index)),
                    viz_img)

            # Minor formatting of the returned fields
            return_res = []
            for field in ['profile_image', 'passport_code']:
                if field in other_object_name:
                    obj = other_objects[other_object_name.index(field)]
                    temp_res = {
                        'polys': [(each[0][0], each[0][1]) for each in obj.polygon],
                        'conf': obj.score
                    }
                else:
                    temp_res = None
                return_res.append(temp_res)
            face_res, mrz_res = return_res

            results.append({
                'crop_rotated_page': {
                    'image': cropped_page,
                    'polys': [(each[0][0], each[0][1]) for each in raw_page_polygon],
                    'conf': each_page.score,
                },
                'face': face_res,
                'mrz': mrz_res
            })
        if self.debug:
            print('Crop and rotate took {} secs'.format(time.time() - start_time))
            # with open(os.path.join(debug_path, 'crop_and_rotate.json'), 'w',
            #           encoding='utf-8') as f:
            #     json.dump(results, f, ensure_ascii=False, indent=4)
        return results
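# A short usage sketch for this class, assuming the frozen graph at the
# default model_path and the project helpers (load_graph, clip_boxes,
# draw_final_outputs_blackwhite) are importable; file names are illustrative.
import cv2

cropper = MaskRCNNDocCrop(canvas_size=512, debug=False)
scan = cv2.imread('passport_scan.jpg', cv2.IMREAD_COLOR)
for page in cropper.crop_and_rotate(scan):
    rotated = page['crop_rotated_page']['image']
    print('page conf:', page['crop_rotated_page']['conf'],
          '| face found:', page['face'] is not None,
          '| mrz found:', page['mrz'] is not None)
    cv2.imwrite('page_out.png', rotated)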
def detect_one_image(img, model_func, *args):
    """
    Run detection on one image, using the TF callable.
    This function should handle the preprocessing internally.

    Args:
        img: an image
        model_func: a callable from the TF model,
            takes image and returns (boxes, probs, labels, [masks])

    Returns:
        [DetectionResult]
    """
    orig_shape = img.shape[:2]
    resizer = CustomResize(config.SHORT_EDGE_SIZE, config.MAX_SIZE)
    resized_img = resizer.augment(img)
    scale = (resized_img.shape[0] * 1.0 / img.shape[0] +
             resized_img.shape[1] * 1.0 / img.shape[1]) / 2
    if config.USE_SECOND_HEAD:
        if config.EXTRACT_FEATURES:
            boxes, probs, labels, posteriors, second_labels, second_posteriors, \
                masks, features = model_func(resized_img)
            masks = [masks]
        else:
            boxes, probs, labels, posteriors, second_labels, second_posteriors, \
                *masks = model_func(resized_img)
            features = [None for _ in range(labels.size)]
    else:
        if config.EXTRACT_FEATURES:
            boxes, probs, labels, posteriors, masks, features = model_func(
                resized_img, *args)
            masks = [masks]
        else:
            boxes, probs, labels, posteriors, *masks = model_func(resized_img)
            features = [None for _ in range(labels.size)]
    boxes = boxes / scale
    boxes = clip_boxes(boxes, orig_shape)
    if masks:
        # has mask
        full_masks = [fill_full_mask(box, mask, orig_shape)
                      for box, mask in zip(boxes, masks[0])]
        masks = full_masks
    else:
        # fill with None
        masks = [None] * len(boxes)

    if config.USE_SECOND_HEAD:
        results = [SecondDetectionResult(*args) for args in
                   zip(boxes, probs, labels, posteriors, masks,
                       second_labels, second_posteriors, features)]
    else:
        results = [DetectionResult(*args) for args in
                   zip(boxes, probs, labels, posteriors, masks, features)]
    return results
def get_train_dataflow():
    """
    Return a training dataflow. Each datapoint consists of the following:

    An image: (h, w, 3),

    1 or more pairs of (anchor_labels, anchor_boxes):
        anchor_labels: (h', w', NA)
        anchor_boxes: (h', w', NA, 4)

    gt_boxes: (N, 4)
    gt_labels: (N,)

    If MODE_MASK, gt_masks: (N, h, w)
    """
    imgs = COCODetection.load_many(cfg.DATA.BASEDIR, cfg.DATA.TRAIN,
                                   add_gt=True, add_mask=cfg.MODE_MASK)
    """
    To train on your own data, change this to your loader.
    Produce "imgs" as a list of dict, in the dict the following keys are
    needed for training:
    height, width: integer
    file_name: str, full path to the image
    boxes: numpy array of kx4 floats
    class: numpy array of k integers
    is_crowd: k booleans. Use k False if you don't know what it means.
    segmentation: k lists of numpy arrays (one for each box).
        Each list of numpy arrays corresponds to the mask for one instance.
        Each numpy array in the list is a polygon of shape Nx2,
        because one mask can be represented by N polygons.

        If your segmentation annotations are originally masks rather than
        polygons, either convert it, or the augmentation code below will
        need to be changed or skipped accordingly.
    """

    # Valid training images should have at least one fg box.
    # But this filter shall not be applied for testing.
    num = len(imgs)
    imgs = list(filter(lambda img: len(img['boxes'][img['is_crowd'] == 0]) > 0, imgs))
    logger.info(
        "Filtered {} images which contain no non-crowd groundtruth boxes. "
        "Total #images for training: {}".format(num - len(imgs), len(imgs)))

    ds = DataFromList(imgs, shuffle=True)

    aug = imgaug.AugmentorList([
        CustomResize(cfg.PREPROC.SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE),
        imgaug.Flip(horiz=True)
    ])

    def preprocess(img):
        fname, boxes, klass, is_crowd = (img['file_name'], img['boxes'],
                                         img['class'], img['is_crowd'])
        boxes = np.copy(boxes)
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        assert im is not None, fname
        im = im.astype('float32')
        # assume floatbox as input
        assert boxes.dtype == np.float32, "Loader has to return floating point boxes!"

        # augmentation:
        im, params = aug.augment_return_params(im)
        points = box_to_point8(boxes)
        points = aug.augment_coords(points, params)
        boxes = point8_to_box(points)
        assert np.min(np_area(boxes)) > 0, "Some boxes have zero area!"

        # rpn anchor:
        try:
            if cfg.MODE_FPN:
                multilevel_anchor_inputs = get_multilevel_rpn_anchor_input(
                    im, boxes, is_crowd)
                anchor_inputs = itertools.chain.from_iterable(multilevel_anchor_inputs)
            else:
                # anchor_labels, anchor_boxes
                anchor_inputs = get_rpn_anchor_input(im, boxes, is_crowd)
                assert len(anchor_inputs) == 2

            boxes = boxes[is_crowd == 0]  # skip crowd boxes in training target
            klass = klass[is_crowd == 0]
            if not len(boxes):
                raise MalformedData("No valid gt_boxes!")
        except MalformedData as e:
            log_once("Input {} is filtered for training: {}".format(fname, str(e)), 'warn')
            return None

        ret = [im] + list(anchor_inputs) + [boxes, klass]

        if cfg.MODE_MASK:
            # augmentation will modify the polys in-place
            segmentation = copy.deepcopy(img['segmentation'])
            segmentation = [segmentation[k] for k in range(len(segmentation))
                            if not is_crowd[k]]
            assert len(segmentation) == len(boxes)

            # Apply augmentation on polygon coordinates.
            # And produce one image-sized binary mask per box.
            masks = []
            for polys in segmentation:
                polys = [aug.augment_coords(p, params) for p in polys]
                masks.append(segmentation_to_mask(polys, im.shape[0], im.shape[1]))
            masks = np.asarray(masks, dtype='uint8')  # values in {0, 1}
            ret.append(masks)

            # from viz import draw_annotation, draw_mask
            # viz = draw_annotation(im, boxes, klass)
            # for mask in masks:
            #     viz = draw_mask(viz, mask)
            # tpviz.interactive_imshow(viz)
        return ret

    if cfg.TRAINER == 'horovod':
        ds = MultiThreadMapData(ds, 5, preprocess)  # MPI does not like fork()
    else:
        ds = MultiProcessMapDataZMQ(ds, 10, preprocess)
    return ds
def get_train_dataflow(src):
    """
    Return a training dataflow. Each datapoint consists of the following:

    An image: (h, w, 3),

    1 or more pairs of (anchor_labels, anchor_boxes):
        anchor_labels: (h', w', NA)
        anchor_boxes: (h', w', NA, 4)

    gt_boxes: (N, 4)
    gt_labels: (N,)

    If MODE_MASK, gt_masks: (N, h, w)
    """
    # imgs = COCODetection.load_many(cfg.DATA.BASEDIR, cfg.DATA.TRAIN,
    #                                add_gt=True, add_mask=cfg.MODE_MASK)
    classes = (
        'BG',  # always index 0
        'bathtub', 'bed', 'bookshelf', 'box', 'chair', 'counter', 'desk',
        'door', 'dresser', 'garbage_bin', 'lamp', 'monitor', 'night_stand',
        'pillow', 'sink', 'sofa', 'table', 'toilet', 'tv')
    class_to_ind = dict(list(zip(classes, list(range(len(classes))))))
    # src = '/media/ayan/Drive/IMI-Research/Datasets/Datasets_OP_Train/'
    textfile_index = natsorted(
        [src + f for f in np.sort(os.listdir(src)) if f.endswith('.txt')])
    imgs = []
    count = 0
    for fn in textfile_index:
        each_file = {}
        count = count + 1
        print(str(count) + ':::', fn)
        with open(fn, 'r') as F:
            file_F = F.read()
        file_F = file_F.split('\n')
        each_file['file_name'] = file_F[0]
        im = cv2.imread(each_file['file_name'])
        assert im is not None, each_file['file_name']
        each_file['height'] = im.shape[0]
        each_file['width'] = im.shape[1]
        objects = file_F[2:len(file_F) - 1]
        boxes = []
        class_ = []
        for obj in objects:
            objs_line = obj.split(' ')
            x1 = float(objs_line[1]) - 1.0
            y1 = float(objs_line[2]) - 1.0
            x2 = float(objs_line[3]) - 1.0
            y2 = float(objs_line[4]) - 1.0
            if x1 >= x2:
                x2 = x1 + 1
            boxes.append([x1, y1, x2, y2])
            cls = class_to_ind[objs_line[0]]
            class_.append(cls)
        each_file['boxes'] = np.array(boxes).astype(np.float32)
        each_file['class'] = np.array(class_).astype(np.int32)
        each_file['is_crowd'] = np.zeros_like(each_file['class']).astype(np.int8)
        imgs.append(each_file)
    """
    To train on your own data, change this to your loader.
    Produce "imgs" as a list of dict, in the dict the following keys are
    needed for training:
    height, width: integer
    file_name: str, full path to the image
    boxes: numpy array of kx4 floats
    class: numpy array of k integers
    is_crowd: k booleans. Use k False if you don't know what it means.
    segmentation: k lists of numpy arrays (one for each box).
        Each list of numpy arrays corresponds to the mask for one instance.
        Each numpy array in the list is a polygon of shape Nx2,
        because one mask can be represented by N polygons.

        If your segmentation annotations are originally masks rather than
        polygons, either convert it, or the augmentation code below will
        need to be changed or skipped accordingly.
    """

    # Valid training images should have at least one fg box.
    # But this filter shall not be applied for testing.
    num = len(imgs)
    imgs = list(filter(lambda img: len(img['boxes'][img['is_crowd'] == 0]) > 0, imgs))
    logger.info(
        "Filtered {} images which contain no non-crowd groundtruth boxes. "
        "Total #images for training: {}".format(num - len(imgs), len(imgs)))

    ds = DataFromList(imgs, shuffle=False)

    aug = imgaug.AugmentorList([
        CustomResize(cfg.PREPROC.SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE),
        imgaug.Flip(horiz=True)
    ])

    def preprocess(img):
        fname, boxes, klass, is_crowd = (img['file_name'], img['boxes'],
                                         img['class'], img['is_crowd'])
        boxes = np.copy(boxes)
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        assert im is not None, fname
        im = im.astype('float32')
        # assume floatbox as input
        assert boxes.dtype == np.float32, "Loader has to return floating point boxes!"

        # augmentation:
        im, params = aug.augment_return_params(im)
        points = box_to_point8(boxes)
        points = aug.augment_coords(points, params)
        boxes = point8_to_box(points)
        assert np.min(np_area(boxes)) > 0, "Some boxes have zero area!"

        # rpn anchor:
        try:
            if cfg.MODE_FPN:
                multilevel_anchor_inputs = get_multilevel_rpn_anchor_input(
                    im, boxes, is_crowd)
                anchor_inputs = itertools.chain.from_iterable(multilevel_anchor_inputs)
            else:
                # anchor_labels, anchor_boxes
                anchor_inputs = get_rpn_anchor_input(im, boxes, is_crowd)
                assert len(anchor_inputs) == 2

            boxes = boxes[is_crowd == 0]  # skip crowd boxes in training target
            klass = klass[is_crowd == 0]
            if not len(boxes):
                raise MalformedData("No valid gt_boxes!")
        except MalformedData as e:
            log_once("Input {} is filtered for training: {}".format(fname, str(e)), 'warn')
            return None

        ret = [im] + list(anchor_inputs) + [boxes, klass]

        if cfg.MODE_MASK:
            # augmentation will modify the polys in-place
            segmentation = copy.deepcopy(img['segmentation'])
            segmentation = [segmentation[k] for k in range(len(segmentation))
                            if not is_crowd[k]]
            assert len(segmentation) == len(boxes)

            # Apply augmentation on polygon coordinates.
            # And produce one image-sized binary mask per box.
            masks = []
            for polys in segmentation:
                polys = [aug.augment_coords(p, params) for p in polys]
                masks.append(segmentation_to_mask(polys, im.shape[0], im.shape[1]))
            masks = np.asarray(masks, dtype='uint8')  # values in {0, 1}
            ret.append(masks)

            # from viz import draw_annotation, draw_mask
            # viz = draw_annotation(im, boxes, klass)
            # for mask in masks:
            #     viz = draw_mask(viz, mask)
            # tpviz.interactive_imshow(viz)
        return ret

    if cfg.TRAINER == 'horovod':
        ds = MultiThreadMapData(ds, 5, preprocess)  # MPI does not like fork()
    else:
        ds = MultiProcessMapDataZMQ(ds, 10, preprocess)
    return ds
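# For reference, a hedged sketch of the annotation .txt layout this loader
# assumes, reconstructed from the parsing code above: first line is the image
# path, the second line is skipped (perhaps an object count), and each
# following line is "class x1 y1 x2 y2" in 1-based coordinates (the loader
# subtracts 1.0). The file name and values below are a hypothetical example.
#
# 000001.txt
#   /data/sunrgbd/images/000001.jpg
#   <ignored line, e.g. an object count>
#   chair 120 45 300 260
#   table 50 200 400 470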