def get_rec_field_and_stride_after_concat_nets(receptive_field_netA, stride_netA,
                                               receptive_field_netB, stride_netB):
    """We are composing the two networks net(x) = netB(netA(x)), both with strides and receptive fields.
    This function computes the stride and receptive field of the combination.
    """
    if isinstance(receptive_field_netA, FeatureMapSize):
        assert isinstance(stride_netA, FeatureMapSize) and isinstance(receptive_field_netB, FeatureMapSize) \
            and isinstance(stride_netB, FeatureMapSize), "All inputs should be either of type FeatureMapSize or int"
        rec_field_w, stride_w = Os2dHeadCreator.get_rec_field_and_stride_after_concat_nets(
            receptive_field_netA.w, stride_netA.w, receptive_field_netB.w, stride_netB.w)
        rec_field_h, stride_h = Os2dHeadCreator.get_rec_field_and_stride_after_concat_nets(
            receptive_field_netA.h, stride_netA.h, receptive_field_netB.h, stride_netB.h)
        return FeatureMapSize(w=rec_field_w, h=rec_field_h), FeatureMapSize(w=stride_w, h=stride_h)

    rec_field = stride_netA * (receptive_field_netB - 1) + receptive_field_netA
    stride = stride_netA * stride_netB
    return rec_field, stride
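# A worked example of the scalar branch above (illustrative numbers): compose netA with
# receptive field 3 and stride 2 with netB with receptive field 3 and stride 1 applied on top.
rf_A, s_A = 3, 2
rf_B, s_B = 3, 1
rec_field = s_A * (rf_B - 1) + rf_A   # 2 * (3 - 1) + 3 = 7: each output cell sees a 7x7 input patch
stride = s_A * s_B                    # 2 * 1 = 2: neighbouring output cells are 2 input pixels apart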
def __init__(self, do_simple_affine, is_cuda, use_inverse_geom_model):
    super(Os2dAlignment, self).__init__()

    self.model_type = "affine" if not do_simple_affine else "simple_affine"  # "affine" or "simple_affine"
    self.use_inverse_geom_model = use_inverse_geom_model

    # create the parameter regression network
    if self.model_type == "affine":
        transform_net_output_dim = 6
    elif self.model_type == "simple_affine":
        transform_net_output_dim = 4
    else:
        raise RuntimeError("Unknown transformation model \"{0}\"".format(self.model_type))

    # All these numbers are semantically different, but are set to 15 due to the details of the model architecture.
    # They have to be compatible with the network regressing the transformation parameters;
    # following the weakalign code, we use 15 here.
    # All the sizes are in (H, W) format.
    # NOTE: technically the code should work with non-square grids, but this was never tested, so expect bugs.
    self.out_grid_size = FeatureMapSize(w=15, h=15)
    self.reference_feature_map_size = FeatureMapSize(w=15, h=15)
    self.network_stride = FeatureMapSize(w=1, h=1)
    self.network_receptive_field = FeatureMapSize(w=15, h=15)

    self.input_feature_dim = self.reference_feature_map_size.w * self.reference_feature_map_size.h
    self.parameter_regressor = TransformationNet(output_dim=transform_net_output_dim,
                                                 use_cuda=is_cuda,
                                                 normalization='batchnorm',  # if not self.use_group_norm else 'groupnorm',
                                                 kernel_sizes=[7, 5],
                                                 channels=[128, 64],
                                                 input_feature_dim=self.input_feature_dim)
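# Sketch (assumption, not the OS2D code path): the 6 numbers regressed by the "affine" model can be
# viewed as a 2x3 matrix and turned into a sampling grid with torch.nn.functional.affine_grid;
# the 4-parameter "simple_affine" variant would keep only per-axis scale and translation.
# How the aligner actually consumes the parameters is defined elsewhere in the repo.
import torch
import torch.nn.functional as F

params = torch.tensor([1.0, 0.0, 0.1,    # a11, a12, tx
                       0.0, 1.0, -0.2])  # a21, a22, ty
theta = params.view(1, 2, 3)             # batch of one 2x3 affine matrix
grid = F.affine_grid(theta, size=(1, 1, 15, 15), align_corners=True)  # 1 x 15 x 15 x 2, matches out_grid_size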
def resnet101_c4(use_group_norm=False):
    """
    Constructs the ResNet101 C4 feature extractor

    Args:
        use_group_norm (bool) - if True use torch.nn.GroupNorm with GROUPNORM_NUMGROUPS groups as normalization layers,
                                otherwise use torch.nn.BatchNorm2d
    """
    return _resnet_fe(resnet101, 4, use_group_norm,
                      feature_map_stride=FeatureMapSize(h=16, w=16),
                      feature_map_receptive_field=FeatureMapSize(h=16, w=16))
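# Sketch (assumption about what _resnet_fe(..., 4, ...) builds): a "C4" extractor is a torchvision
# ResNet truncated after layer3, whose output stride is 16, matching feature_map_stride above.
# The helper below is hypothetical and only illustrates the truncation.
import torch
import torchvision

def resnet101_c4_sketch():
    backbone = torchvision.models.resnet101(weights=None)
    # keep conv1 ... layer3, drop layer4 / avgpool / fc
    return torch.nn.Sequential(*list(backbone.children())[:-3])

features = resnet101_c4_sketch()(torch.zeros(1, 3, 224, 224))  # -> 1 x 1024 x 14 x 14 (stride 16)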
def get_class_images_and_sizes(self, class_ids, do_augmentation=False):
    if self.mine_extra_class_images and do_augmentation:
        # select a random label image if several were mined
        class_images = []
        for class_id in class_ids:
            if class_id in self.label_image_collection:
                num_mined = len(self.label_image_collection[class_id])
                random_int = torch.randint(num_mined + 1, (1,), dtype=torch.long)
                if random_int == 0:
                    # use the original image
                    class_image = self.dataset.gt_images_per_classid[class_id]
                else:
                    # use the selected mined image
                    class_image = self.label_image_collection[class_id][random_int - 1]
            else:
                # nothing was mined for this class
                class_image = self.dataset.gt_images_per_classid[class_id]
            class_images.append(class_image)
    else:
        class_images = [self.dataset.gt_images_per_classid[class_id] for class_id in class_ids]

    class_image_sizes = [FeatureMapSize(img=img) for img in class_images]
    return class_images, class_image_sizes
def detect(self, target, source) -> Tuple[BoxList, torch.Tensor]:
    target = self._preprocess(target, cfg.model.class_image_size)
    source = self._preprocess(source, self.source_img_size)

    with torch.no_grad():
        loc_prediction_batch, class_prediction_batch, _, fm_size, transform_corners_batch = \
            self.net(images=source, class_images=target)

    image_loc_scores_pyramid = [loc_prediction_batch[0]]
    image_class_scores_pyramid = [class_prediction_batch[0]]
    img_size_pyramid = [FeatureMapSize(img=source)]
    transform_corners_pyramid = [transform_corners_batch[0]]
    class_ids = [0]

    boxes = self.box_coder.decode_pyramid(image_loc_scores_pyramid, image_class_scores_pyramid,
                                          img_size_pyramid, class_ids,
                                          nms_iou_threshold=cfg.eval.nms_iou_threshold,
                                          nms_score_threshold=cfg.eval.nms_score_threshold,
                                          transform_corners_pyramid=transform_corners_pyramid)
    boxes.remove_field("default_boxes")

    scores = boxes.get_field("scores")
    good_ids = torch.nonzero(scores.float() > self.score_threshold).view(-1)
    if good_ids.numel() > 0:
        # keep at most self.max_detections boxes with the highest scores
        _, ids = scores[good_ids].sort(descending=False)
        good_ids = good_ids[ids[-self.max_detections:]]
        boxes = boxes[good_ids].cpu()
        scores = scores[good_ids].cpu()
        boxes = boxes.bbox_xyxy
        # normalize box coordinates to [0, 1] by the source image width and height
        boxes[:, [0, 2]] /= source.shape[3]
        boxes[:, [1, 3]] /= source.shape[2]
        return boxes, scores
    else:
        return None, None
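# Usage note (sketch): detect() returns boxes in normalized xyxy coordinates relative to the
# preprocessed source image, together with their scores. To draw them on an image, scale back by
# its width and height; the tensor below is made up for illustration.
import torch

boxes_rel = torch.tensor([[0.10, 0.20, 0.45, 0.60]])  # x1, y1, x2, y2 in [0, 1]
img_w, img_h = 640, 480
boxes_px = boxes_rel.clone()
boxes_px[:, [0, 2]] *= img_w
boxes_px[:, [1, 3]] *= img_h   # -> [[64., 96., 288., 288.]]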
def get_feature_map_size_for_network(img_size, net, is_cuda=False):
    """get_feature_map_size_for_network computes the size of the feature map when the network is applied to an image of a specific size.
    The function creates a dummy image of the required size and simply runs the network on it.
    This approach is very robust, but can be quite slow, so these calls should be cached.

    Args:
        img_size (FeatureMapSize) - size of the input image
        net - the net to run
        is_cuda (bool) - flag showing whether to put the dummy image on a GPU
    Output:
        feature_map_size (FeatureMapSize) - the size of the feature map
    """
    dummy_image = torch.zeros(1, 3, img_size.h, img_size.w)  # batch_size, num_channels, height, width
    if is_cuda:
        dummy_image = dummy_image.cuda()

    with torch.no_grad():
        dummy_feature_maps = net(dummy_image)
        feature_map_size = FeatureMapSize(img=dummy_feature_maps)

    if is_cuda:
        torch.cuda.empty_cache()

    return feature_map_size
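# Usage sketch: probing a toy CNN for its output resolution. FeatureMapSize(img=tensor) is assumed
# to read height/width from the last two dimensions, as it does elsewhere in this code.
import torch

toy_net = torch.nn.Sequential(
    torch.nn.Conv2d(3, 8, kernel_size=3, stride=2, padding=1),
    torch.nn.Conv2d(8, 8, kernel_size=3, stride=2, padding=1),
)
fm_size = get_feature_map_size_for_network(FeatureMapSize(w=64, h=48), toy_net, is_cuda=False)
# two stride-2 convolutions -> fm_size.w == 16, fm_size.h == 12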
def get_the_boxes(image_filename):
    file_with_boxes = os.path.join(image_path, get_box_file_for_image_file(image_filename))
    # get the image size - needed to recompute the boxes
    boxes = read_boxes_from(file_with_boxes)
    img = read_image(os.path.join(image_path, image_filename))
    imsize = FeatureMapSize(img=img)

    # choose the correct box if there are two of them
    # From the INSTRE documentation:
    # Specially, for each tuple-class in INSTRE-M, there are two corresponding object classes in INSTRE-S1.
    # In each annotation file for an INSTRE-M image, the first line records the object labeled as [a] in INSTRE-S1
    # and the second line records the object labeled as [b] in INSTRE-S1.
    #
    # CAUTION! the matlab file has boxes in x1, y1, x2, y2, but the .txt files are in x, y, w, h
    query_path_split = query_image_path_original.split("/")
    image_filename_split = image_filename.split("/")
    if query_path_split[0].lower() == "instre-s1" and image_filename_split[0].lower() == "instre-m":
        assert len(boxes) == 2, f"INSTRE-M images should have exactly two boxes, but have {boxes}"
        assert query_path_split[1][2] in ["a", "b"]
        i_box = 0 if query_path_split[1][2] == "a" else 1
        boxes = [convert_the_box_from_xywh(boxes[i_box], imsize)]
    elif query_path_split[0].lower() == "instre-s1" and image_filename_split[0].lower() == "instre-s1" or \
         query_path_split[0].lower() == "instre-s2" and image_filename_split[0].lower() == "instre-s2":
        boxes = [convert_the_box_from_xywh(box, imsize) for box in boxes]
    else:
        raise RuntimeError(f"Should not be happening, query {query_image_path_original}, image {image_filename}, boxes {boxes}")

    return boxes
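# Sketch (assumption): convert_the_box_from_xywh is defined elsewhere in the repo; a minimal
# version consistent with the comment above (the .txt files are in x, y, w, h) would convert to
# the (x1, y1, x2, y2) corners used by the rest of the code, optionally clipping to the image.
def convert_the_box_from_xywh_sketch(box, imsize):
    x, y, w, h = (float(v) for v in box)
    x1, y1 = x, y
    x2, y2 = min(x + w, imsize.w), min(y + h, imsize.h)  # clip to the image extent
    return [x1, y1, x2, y2]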
def read_annotation(xml_file: str):
    tree = ElementTree.parse(xml_file)
    root = tree.getroot()

    filename = root.find('filename').text

    im_size = root.find("size")
    width = int(im_size.find("width").text)
    height = int(im_size.find("height").text)
    im_size = FeatureMapSize(h=height, w=width)

    bboxes = []
    class_ids = []
    difficult_flags = []
    for boxes in root.iter("object"):
        ymin, xmin, ymax, xmax = None, None, None, None
        difficult_flag = int(boxes.find("difficult").text)
        class_id = boxes.find("name").text
        for box in boxes.findall("bndbox"):
            assert ymin is None
            ymin = int(box.find("ymin").text)
            xmin = int(box.find("xmin").text)
            ymax = int(box.find("ymax").text)
            xmax = int(box.find("xmax").text)

        cur_box = [xmin, ymin, xmax, ymax]
        bboxes.append(cur_box)
        difficult_flags.append(difficult_flag)
        class_ids.append(class_id)

    return filename, bboxes, class_ids, difficult_flags, im_size
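# Usage sketch: read_annotation expects Pascal-VOC-style XML. The snippet below writes a minimal,
# made-up annotation to a temporary file and parses it back.
import tempfile

voc_xml = """<annotation>
  <filename>img_0001.jpg</filename>
  <size><width>640</width><height>480</height></size>
  <object>
    <name>dog</name>
    <difficult>0</difficult>
    <bndbox><xmin>48</xmin><ymin>240</ymin><xmax>195</xmax><ymax>371</ymax></bndbox>
  </object>
</annotation>"""

with tempfile.NamedTemporaryFile("w", suffix=".xml", delete=False) as f:
    f.write(voc_xml)
filename, bboxes, class_ids, difficult_flags, im_size = read_annotation(f.name)
# filename == "img_0001.jpg", bboxes == [[48, 240, 195, 371]], class_ids == ["dog"]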
def decode_scores_show_detections(dataloader, images, class_ids, class_scores, loc_scores, corners):
    num_images = images.size(0)
    for i_image in range(num_images):
        # show elements with the largest losses
        img_size_pyramid = [FeatureMapSize(img=images[i_image])]
        image_loc_scores_pyramid = [loc_scores[i_image]]
        image_cls_scores_pyramid = [class_scores[i_image]]
        corners_pyramid = [corners[i_image]]

        # decode image predictions
        boxes_one_image = \
            dataloader.box_coder.decode_pyramid(image_loc_scores_pyramid, image_cls_scores_pyramid,
                                                img_size_pyramid, class_ids,
                                                nms_iou_threshold=cfg.eval.nms_iou_threshold,
                                                nms_score_threshold=cfg.eval.nms_score_threshold,
                                                transform_corners_pyramid=corners_pyramid)

        show_annotated_image(img=dataloader.unnorm_image(images[i_image]),
                             boxes=boxes_one_image,
                             labels=boxes_one_image.get_field("labels"),
                             scores=boxes_one_image.get_field("scores"),
                             default_boxes=boxes_one_image.get_field("default_boxes"),
                             transform_corners=boxes_one_image.get_field("transform_corners"),
                             class_ids=class_ids,
                             score_threshold=cfg.visualization.train.score_threshold,
                             max_dets=cfg.visualization.train.max_detections,
                             showfig=True)
def _transform_image_gt(self, img, do_augmentation=True, hflip=False, vflip=False, do_resize=True):
    do_augmentation = do_augmentation and self.data_augmentation is not None

    # batch-level data augmentation
    img, _ = transforms_boxes.transpose(img, hflip=hflip, vflip=vflip, boxes=None, transform_list=None)

    if do_augmentation:
        # color distortion
        img = self.data_augmentation.random_distort(img)
        # random crop
        img = self.data_augmentation.random_crop_label_image(img)

    # resize the image
    if do_resize:
        random_interpolation = self.data_augmentation.random_interpolation if do_augmentation else False

        # get the new size - while preserving the aspect ratio
        size_old = FeatureMapSize(img=img)
        h, w = get_image_size_after_resize_preserving_aspect_ratio(h=size_old.h, w=size_old.w,
                                                                   target_size=self.gt_image_size)
        size_new = FeatureMapSize(w=w, h=h)
        img, _ = transforms_boxes.resize(img, target_size=size_new, random_interpolation=random_interpolation)

    transforms_th = [transforms.ToTensor()]
    if self.img_normalization is not None:
        transforms_th += [transforms.Normalize(self.img_normalization["mean"], self.img_normalization["std"])]
    img = transforms.Compose(transforms_th)(img)

    return img
def get_image_sizes(dataset):
    print("Reading images from {}".format(dataset.image_path))
    image_sizes_by_id = OrderedDict()
    images_in_dataset = dataset.gtboxframe.groupby(["imageid", "imagefilename"]).size().reset_index()
    for _, datum in tqdm(images_in_dataset.iterrows()):
        img = dataset._get_dataset_image_by_id(datum["imageid"])
        im_size = FeatureMapSize(img=img)
        image_sizes_by_id[datum["imageid"]] = im_size
    print("Found {} images".format(len(image_sizes_by_id)))
    return image_sizes_by_id
def convert_boxlist_maskrcnn_to_os2d(boxlist_maskrcnn):
    image_size = FeatureMapSize(w=boxlist_maskrcnn.size[0], h=boxlist_maskrcnn.size[1])
    boxlist = BoxList_os2d(boxlist_maskrcnn.convert("xyxy").bbox, image_size, mode="xyxy")
    # copy the extra fields
    for f in boxlist_maskrcnn.fields():
        boxlist.add_field(f, boxlist_maskrcnn.get_field(f))
    return boxlist
def _read_dataset_images(self):
    # create caches
    self.image_path_per_image_id = OrderedDict()
    self.image_size_per_image_id = OrderedDict()
    self.image_per_image_id = OrderedDict()
    for image_id, image_file in zip(self.image_ids, self.image_file_names):
        if image_id not in self.image_path_per_image_id:
            # store the image path
            img_path = os.path.join(self.image_path, image_file)
            self.image_path_per_image_id[image_id] = img_path

            # get the image size (needed for bucketing)
            img = self._get_dataset_image_by_id(image_id)
            self.image_size_per_image_id[image_id] = FeatureMapSize(img=img)

    self.logger.info("{1} {0} data images".format(len(self.image_path_per_image_id),
                                                  "Read" if self.cache_images else "Found"))
def _get_dataset_image_by_id(self, image_id):
    assert image_id in self.image_path_per_image_id, "Can work only with checked images"

    if image_id not in self.image_per_image_id:
        img_path = self.image_path_per_image_id[image_id]
        img = read_image(img_path)
        img_size = FeatureMapSize(img=img)
        if max(img_size.w, img_size.h) != self.image_size:
            # resize the image in case it was not of the correct size on disk
            h, w = get_image_size_after_resize_preserving_aspect_ratio(img_size.h, img_size.w, self.image_size)
            img = img.resize((w, h), resample=Image.ANTIALIAS)
        if self.cache_images:
            self.image_per_image_id[image_id] = img
    else:
        img = self.image_per_image_id[image_id]

    return img
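# Sketch (assumption): get_image_size_after_resize_preserving_aspect_ratio is defined elsewhere in
# the repo; the behaviour relied on above is "scale so that the longer side equals target_size
# while keeping the aspect ratio". A minimal version consistent with that use:
def resize_preserving_aspect_ratio_sketch(h, w, target_size):
    scale = float(target_size) / max(h, w)
    return int(round(h * scale)), int(round(w * scale))  # returned as (h, w), as in the calls above

# resize_preserving_aspect_ratio_sketch(480, 640, 1280) -> (960, 1280)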
def forward(self, images=None, class_images=None,
            feature_maps=None, class_head=None,
            train_mode=False, fine_tune_features=True):
    """ Forward pass of the OS2D model. Can function in several different regimes:
        [training mode] Extract features from input and class images, and apply the model to get
            classification/localization scores of all classes on all images
            Args:
                images (tensor) - batch of input images
                class_images (list of tensors) - list of class images (possibly of different sizes)
                train_mode (bool) - should be True
                fine_tune_features (bool) - flag showing whether to enable gradients over features
        [evaluation mode]
                feature_maps (tensor) - pre-extracted feature maps, sized batch_size x feature_dim x height x width
                class_head (Os2dHead) - head created to detect some classes,
                    inside has class_feature_maps, sized class_batch_size x feature_dim x class_height x class_width
                train_mode (bool) - should be False
        Outputs:
            loc_scores (tensor) - localization prediction, sized batch_size x num_classes x 4 x num_anchors (bbox parameterization)
            class_scores (tensor) - classification prediction, sized batch_size x num_classes x num_anchors
            class_scores_transform_detached (tensor) - same, but with transforms detached from the computational graph,
                used not to tune the transformation on the negative examples
            fm_sizes (FeatureMapSize) - size of the output score map, num_anchors == fm_sizes.w * fm_sizes.h
            transform_corners (tensor) - points defining parallelograms showing transformations, sized batch_size x num_classes x 8 x num_anchors
    """
    with torch.set_grad_enabled(train_mode and fine_tune_features):
        # extract features
        if feature_maps is None:
            assert images is not None, "If feature_maps is None then images cannot be None"
            feature_maps = self.net_feature_maps(images)

        # get features for labels
        if class_head is None:
            assert class_images is not None, "If class_head is None then class_images cannot be None"
            class_feature_maps = self.net_label_features(class_images)
            class_head = self.os2d_head_creator.create_os2d_head(class_feature_maps)

    # process feature maps of different pyramid levels
    loc_scores, class_scores, class_scores_transform_detached, transform_corners = \
        self.apply_class_heads_to_feature_maps(feature_maps, class_head)

    fm_size = FeatureMapSize(img=feature_maps)
    return loc_scores, class_scores, class_scores_transform_detached, fm_size, transform_corners
def save_cropped_boxes(dataset, tgt_image_path, extension=".jpg", num_random_crops_per_image=0): # crop all the boxes db = {"cids":[], "cluster":[], "gtbboxid":[], "classid":[], "imageid":[], "difficult":[], "type":[], "size":[], "bbox":[]} for image_id in tqdm(dataset.image_ids): img = dataset._get_dataset_image_by_id(image_id) boxes = dataset.get_image_annotation_for_imageid(image_id) assert boxes.has_field("labels"), "GT boxes need a field 'labels'" # remove all fields except "labels" and "difficult" for f in boxes.fields(): if f not in ["labels", "difficult"]: boxes.remove_field(f) if not boxes.has_field("difficult"): boxes.add_field("difficult", torch.zeros(len(boxes), dtype=torch.bool)) num_gt_boxes = len(boxes) im_size = FeatureMapSize(img=img) assert im_size == boxes.image_size eval_scale = dataset.get_eval_scale() # sample random boxes if needed if num_random_crops_per_image > 0: boxes_random = torch.rand(num_random_crops_per_image, 4) x1 = torch.min(boxes_random[:, 0], boxes_random[:, 2]) * im_size.w x2 = torch.max(boxes_random[:, 0], boxes_random[:, 2]) * im_size.w y1 = torch.min(boxes_random[:, 1], boxes_random[:, 3]) * im_size.h y2 = torch.max(boxes_random[:, 1], boxes_random[:, 3]) * im_size.h boxes_random = torch.stack([x1, y1, x2, y2], 1).floor() # crop boxes that are too small min_size = 10.0 / eval_scale * max(im_size.w, im_size.h) mask_bad_boxes = (boxes_random[:,0] + min_size > boxes_random[:,2]) | (boxes_random[:,1] + min_size > boxes_random[:,3]) good_boxes = torch.nonzero(~mask_bad_boxes).view(-1) boxes_random = boxes_random[good_boxes] boxes_random = BoxList(boxes_random, im_size, mode="xyxy") boxes_random.add_field("labels", torch.full([len(boxes_random)], -1, dtype=torch.long)) boxes_random.add_field("difficult", torch.zeros(len(boxes_random), dtype=torch.bool)) boxes = cat_boxlist([boxes, boxes_random]) if boxes is not None: for i_box in range(len(boxes)): # box format: left, top, right, bottom box = boxes[i_box].bbox_xyxy.view(-1) box = [b.item() for b in box] cropped_img = img.crop(box) if i_box < num_gt_boxes: lbl = boxes[i_box].get_field("labels").item() dif_flag = boxes[i_box].get_field("difficult").item() box_id = i_box box_type = "GT" else: lbl = -1 dif_flag = 0 box_id = i_box box_type = "RN" # create the file name to be used with cirtorch.datasets.datahelpers.cid2filename and their dataloader cid = "box{box_id:05d}_lbl{label:05d}_dif{dif:01d}_im{image_id:05d}{box_type}".format(box_id=box_id, image_id = image_id, label = lbl, dif = dif_flag, box_type=box_type) file_name = cid2filename(cid, prefix=tgt_image_path) # save the image image_path, _ = os.path.split(file_name) mkdir(image_path) if extension: cropped_img.save("{}{}".format(file_name, extension)) else: # cirtorch uses files with empty extension for training for some reason, need to support that cropped_img.save("{}".format(file_name), format="jpeg") # add to the db structure db["cids"].append(cid) db["cluster"].append(lbl) # use labels as clusters not to sample negatives from the same object db["classid"].append(lbl) db["gtbboxid"].append(box_id) db["imageid"].append(image_id) db["difficult"].append(dif_flag) if i_box < num_gt_boxes: db["type"].append("gtproposal") else: db["type"].append("randomcrop") db["size"].append(cropped_img.size) db["bbox"].append(box) # format (x1,y1,x2,y2) return db
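# Standalone illustration of the random-crop sampling above: draw N random boxes in relative
# coordinates, convert to pixels, and drop boxes whose width or height falls below a minimum
# size. The numbers are made up for the example.
import torch

def sample_random_boxes(num_boxes, im_w, im_h, min_size):
    b = torch.rand(num_boxes, 4)
    x1 = torch.min(b[:, 0], b[:, 2]) * im_w
    x2 = torch.max(b[:, 0], b[:, 2]) * im_w
    y1 = torch.min(b[:, 1], b[:, 3]) * im_h
    y2 = torch.max(b[:, 1], b[:, 3]) * im_h
    boxes = torch.stack([x1, y1, x2, y2], 1).floor()
    bad = (boxes[:, 0] + min_size > boxes[:, 2]) | (boxes[:, 1] + min_size > boxes[:, 3])
    return boxes[~bad]

random_crops = sample_random_boxes(num_boxes=10, im_w=640, im_h=480, min_size=32.0)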
def build_imagenet_test_episodes(subset_name, data_path, logger): episode_id = int(subset_name.split('-')[-1]) epi_data_name = "epi_inloc_in_domain_1_5_10_500" image_size = 1000 dataset_path = os.path.join(data_path, "ImageNet-RepMet") roidb_path = os.path.join(dataset_path, "RepMet_CVPR2019_data", "data", "Imagenet_LOC", "voc_inloc_roidb.pkl") with open(roidb_path, 'rb') as fid: roidb = pickle.load(fid, encoding='latin1') episodes_path = os.path.join(dataset_path, "RepMet_CVPR2019_data", "data", "Imagenet_LOC", "episodes", f"{epi_data_name}.pkl") with open(episodes_path, 'rb') as fid: episode_data = pickle.load(fid, encoding='latin1') logger.info(f"Extracting episode {episode_id} out of {len(episode_data)}") episode = episode_data[episode_id] dataset_image_path = os.path.join(data_path, "ImageNet-RepMet", "ILSVRC") SWAP_IMG_PATH_SRC = "/dccstor/leonidka1/data/imagenet/ILSVRC/" def _get_image_path(image_path): image_path = image_path.replace(SWAP_IMG_PATH_SRC, "") return image_path # episode["epi_cats"] - list of class ids # episode["query_images"] - list of path to the episode images # episode["epi_cats_names"] - list of names of the episode classes # episode["train_boxes"] - list of box data about class boxes num_classes = len(episode["epi_cats"]) gt_path = os.path.join(dataset_path, epi_data_name) gt_path = os.path.join(gt_path, f"classes_episode_{episode_id}") gt_image_path = os.path.join(gt_path, "images") mkdir(gt_image_path) classdatafile = os.path.join( gt_path, f"classes_{epi_data_name}_episode_{episode_id}.csv") if not os.path.isfile(classdatafile): logger.info( f"Did not find data file {classdatafile}, creating it from the RepMet source data" ) # create the annotation file from the raw dataset gtboxframe = [] # will be creating dataframe from a list of dicts gt_filename_by_id = {} for i_class in range(len(episode["train_boxes"])): train_boxes_data = episode["train_boxes"][i_class] class_id = train_boxes_data[0] assert class_id in episode[ "epi_cats"], f"class_id={class_id} should be listed in episode['epi_cats']={episode['epi_cats']}" query_image_path_original = _get_image_path(train_boxes_data[2]) query_bbox = train_boxes_data[3] query_bbox = query_bbox.flatten() classfilename = f"{class_id:05d}_{'_'.join(query_image_path_original.split('/'))}" if class_id not in gt_filename_by_id: logger.info( f"Adding query #{len(gt_filename_by_id)} - {class_id}: {query_image_path_original}" ) if not os.path.isfile(classfilename) or True: query_img = read_image( os.path.join(dataset_image_path, query_image_path_original)) query_img_cropped_box = query_img.crop(query_bbox) query_img_cropped_box.save( os.path.join(gt_image_path, classfilename)) gt_filename_by_id[class_id] = classfilename else: logger.info( f"WARNING: class {class_id} has multiple entries in GT image {query_image_path_original}, using the first box as GT" ) for class_id in episode["epi_cats"]: if class_id not in gt_filename_by_id: logger.info( f"WARNING: ground truth for class {class_id} not found in episode {episode_id}" ) def convert_the_box_to_relative(box, imsize): lx = float(box[0]) / imsize.w ty = float(box[1]) / imsize.h rx = float(box[2]) / imsize.w by = float(box[3]) / imsize.h return lx, ty, rx, by def find_image_path_in_roidb(image_file_name, roidb): for i_image, im_data in enumerate(roidb["roidb"]): if im_data["flipped"]: raise RuntimeError( f"Image {i_image} data {im_data} has flipped flag on") if im_data["image"] == image_file_name: return i_image return None for image_file_name in episode["query_images"]: # add 
one bbox to the annotation # required_columns = ["imageid", "imagefilename", "classid", "classfilename", "gtbboxid", "difficult", "lx", "ty", "rx", "by"] image_id = find_image_path_in_roidb(image_file_name, roidb) im_data = roidb["roidb"][image_id] image_file_name = _get_image_path(image_file_name) imsize = FeatureMapSize(w=int(im_data["width"]), h=int(im_data["height"])) boxes_xyxy = im_data["boxes"] classes = im_data["gt_classes"] for box, class_id in zip(boxes_xyxy, classes): if class_id in gt_filename_by_id: item = OrderedDict() item["imageid"] = int(image_id) item["imagefilename"] = image_file_name item["classid"] = int(class_id) item["classfilename"] = gt_filename_by_id[class_id] item["gtbboxid"] = len(gtboxframe) item["difficult"] = 0 item["lx"], item["ty"], item["rx"], item[ "by"] = convert_the_box_to_relative(box, imsize) gtboxframe.append(item) gtboxframe = pd.DataFrame(gtboxframe) gtboxframe.to_csv(classdatafile) gtboxframe = pd.read_csv(classdatafile) return gtboxframe, gt_image_path, dataset_image_path, image_size
def _transform_image_to_pyramid(self, image_id, boxes=None, do_augmentation=True, hflip=False, vflip=False, pyramid_scales=(1, ), mined_data=None): img = self._get_dataset_image_by_id(image_id) img_size = FeatureMapSize(img=img) do_augmentation = do_augmentation and self.data_augmentation is not None num_pyramid_levels = len(pyramid_scales) use_mined_crop = mined_data is not None if use_mined_crop: crop_position = mined_data["crop_position_xyxy"] if boxes is None: boxes = BoxList.create_empty(img_size) mask_cutoff_boxes = torch.zeros(len(boxes), dtype=torch.bool) mask_difficult_boxes = torch.zeros(len(boxes), dtype=torch.bool) box_inverse_transform = TransformList() # batch level data augmentation img, boxes = transforms_boxes.transpose( img, hflip=hflip, vflip=vflip, boxes=boxes, transform_list=box_inverse_transform) if use_mined_crop: # update crop_position_xyxy with the symmetries if hflip or vflip: _, crop_position = transforms_boxes.transpose( img, hflip=hflip, vflip=vflip, boxes=crop_position) if do_augmentation: if self.data_augmentation.do_random_crop: if not use_mined_crop: img, boxes, mask_cutoff_boxes, mask_difficult_boxes = \ self.data_augmentation.random_crop(img, boxes=boxes, transform_list=box_inverse_transform) else: img, boxes, mask_cutoff_boxes, mask_difficult_boxes = \ self.data_augmentation.crop_image(img, crop_position, boxes=boxes, transform_list=box_inverse_transform) img, boxes = transforms_boxes.resize( img, target_size=self.data_augmentation.random_crop_size, random_interpolation=self.data_augmentation. random_interpolation, boxes=boxes, transform_list=box_inverse_transform) # color distortion img = self.data_augmentation.random_distort(img) random_interpolation = self.data_augmentation.random_interpolation if do_augmentation else False img_size = FeatureMapSize(img=img) pyramid_sizes = [ FeatureMapSize(w=int(img_size.w * s), h=int(img_size.h * s)) for s in pyramid_scales ] img_pyramid = [] boxes_pyramid = [] pyramid_box_inverse_transform = [] for p_size in pyramid_sizes: box_inverse_transform_this_scale = copy.deepcopy( box_inverse_transform) p_img, p_boxes = transforms_boxes.resize( img, target_size=p_size, random_interpolation=random_interpolation, boxes=boxes, transform_list=box_inverse_transform_this_scale) pyramid_box_inverse_transform.append( box_inverse_transform_this_scale) img_pyramid.append(p_img) boxes_pyramid.append(p_boxes) transforms_th = [transforms.ToTensor()] if self.img_normalization is not None: transforms_th += [ transforms.Normalize(self.img_normalization["mean"], self.img_normalization["std"]) ] for i_p in range(num_pyramid_levels): img_pyramid[i_p] = transforms.Compose(transforms_th)( img_pyramid[i_p]) return img_pyramid, boxes_pyramid, mask_cutoff_boxes, mask_difficult_boxes, pyramid_box_inverse_transform
def make_iterator_extract_scores_from_images_batched(dataloader, maskrcnn_model, maskrcnn_config, logger, image_batch_size=None, is_cuda=False): logger.info("Starting iterations over images") # get images of all classes class_images, class_aspect_ratios, class_ids = dataloader.get_all_class_images( ) num_classes = len(class_images) assert len(class_aspect_ratios) == num_classes assert len(class_ids) == num_classes query_img_sizes = [FeatureMapSize(img=img) for img in class_images] # loop over all images iterator_batches = dataloader.make_iterator_for_all_images( image_batch_size) for batch_ids, pyramids_batch, box_transforms_batch, initial_img_size_batch in iterator_batches: t_start_batch = time.time() # extract features at all pyramid levels batch_images_pyramid = [] bboxes_xyxy = [] labels = [] scores = [] num_pyramid_levels = len(pyramids_batch) for batch_images in pyramids_batch: if is_cuda: batch_images = batch_images.cuda() # print("Image size:", images_b.size()) batch_images = [ dataloader.unnorm_image(img) for img in batch_images ] batch_images = torch.stack(batch_images, 0) bboxes_xyxy_, labels_, scores_ = run_maskrcnn_on_images( maskrcnn_model, maskrcnn_config, batch_images) bboxes_xyxy.append(bboxes_xyxy_) labels.append(labels_) scores.append(scores_) batch_images_pyramid.append(batch_images) for i_image_in_batch, image_id in enumerate(batch_ids): # get data from all pyramid levels bboxes_xyxy_p = [] labels_p = [] scores_p = [] for i_p in range(num_pyramid_levels): bboxes_xyxy_p.append(bboxes_xyxy[i_p][i_image_in_batch]) labels_p.append(labels[i_p][i_image_in_batch]) scores_p.append(scores[i_p][i_image_in_batch]) # get a pyramid of one image[i_p] one_image_pyramid = [ p[i_image_in_batch] for p in batch_images_pyramid ] # extract the box transformations box_reverse_transforms = box_transforms_batch[i_image_in_batch] # get the boxes in the correct format bboxes_xyxy_p = [ BoxList(bbox, FeatureMapSize(img=img), mode="xyxy") for bbox, img in zip(bboxes_xyxy_p, one_image_pyramid) ] bboxes_xyxy_p = [ t(bb) for t, bb in zip(box_reverse_transforms, bboxes_xyxy_p) ] # add labels and scores into the box structure for bb, l, s in zip(bboxes_xyxy_p, labels_p, scores_p): bb.add_field("labels", l) bb.add_field("scores", s) # get the size of the initial image initial_img_size = initial_img_size_batch[i_image_in_batch] yield image_id, bboxes_xyxy_p, one_image_pyramid, query_img_sizes, class_ids, initial_img_size
def evaluate(dataloader, detector, cfg_maskrcnn, retrievalnet, opt, cfg_eval, cfg_visualization, is_cuda=False, logger_prefix="detector-retrieval"): logger = logging.getLogger(f"{logger_prefix}.evaluate") dataset_name = dataloader.get_name() dataset_scale = dataloader.get_eval_scale() logger.info("Starting to eval on {0}, scale {1}".format( dataset_name, dataset_scale)) t_start_eval = time.time() detector.eval() retrievalnet.eval() ## setup retrievalnet # setting up the multi-scale parameters ms = [1] msp = 1 if opt.retrieval_multiscale: ms = [1, 1. / math.sqrt(2), 1. / 2] if retrievalnet.meta[ "pooling"] == "gem" and retrievalnet.whiten is None: msp = retrievalnet.pool.p.data.tolist()[0] #setup whitening if opt.retrieval_whitening_path is not None: logger.info("Whitening is precomputed, loading it from {0}".format( opt.retrieval_whitening_path)) whitening_data = torch.load(opt.retrieval_whitening_path) if ( (opt.retrieval_multiscale and "ms" in whitening_data) or \ (not opt.retrieval_multiscale and "ss" in whitening_data ) ): if opt.retrieval_multiscale: Lw = copy.deepcopy(whitening_data["ms"]) else: Lw = copy.deepcopy(whitening_data["ss"]) else: raise RuntimeError( "Whitening should be precomputed with the network") # convert whitening data to torch tensors Lw["m"], Lw["P"] = torch.from_numpy(Lw["m"]), torch.from_numpy(Lw["P"]) if is_cuda: Lw["m"], Lw["P"] = Lw["m"].cuda(), Lw["P"].cuda() else: Lw = None with torch.no_grad( ): # do evaluation in forward mode only (for speed and memory) # extract features from query images query_images, _, _ = dataloader.get_all_class_images(do_resize=False) if is_cuda: query_images = [img.cuda() for img in query_images] query_images = [img[0] for img in query_images ] # get rid of the batch dimension query_images = [ resize_image_tensor(img, opt.retrieval_image_size) for img in query_images ] query_images = [dataloader.unnorm_image(img) for img in query_images] query_images_with_aug = [] for im in query_images: query_images_with_aug.append(im) if not cfg_eval.class_image_augmentation: num_class_views = 1 elif cfg_eval.class_image_augmentation == "rotation90": im90 = im.rot90(1, [1, 2]) im180 = im90.rot90(1, [1, 2]) im270 = im180.rot90(1, [1, 2]) query_images_with_aug.append(im90) query_images_with_aug.append(im180) query_images_with_aug.append(im270) num_class_views = 4 elif cfg_eval.class_image_augmentation == "horflip": im_flipped = im.flip(2) query_images_with_aug.append(im_flipped) num_class_views = 2 else: raise RuntimeError( f"Unknown value of class_image_augmentation: {cfg_eval.class_image_augmentation}" ) query_images = query_images_with_aug query_vectors = extract_vectors_from_images(retrievalnet, query_images, ms=ms, msp=msp) # apply whitening if defined if Lw is not None: query_vectors = whitenapply(query_vectors, Lw["m"], Lw["P"]) query_vectors = torch.transpose(query_vectors, 0, 1) # prepare looping over all iamges iterator = make_iterator_extract_scores_from_images_batched( dataloader, detector, cfg_maskrcnn, logger, image_batch_size=cfg_eval.batch_size, is_cuda=is_cuda) boxes, labels, scores = [], [], [] gt_boxes = [] image_ids = [] losses = OrderedDict() # loop over all dataset images num_evaluted_images = 0 for data in iterator: image_id, boxes_one_image, image_pyramid, query_img_sizes, class_ids, initial_img_size = data image_ids.append(image_id) logger.info(f"Image {num_evaluted_images}: id {image_id}") num_evaluted_images += 1 img_size_pyramid = [ FeatureMapSize(img=img) for img in image_pyramid ] gt_boxes_one_image = 
dataloader.get_image_annotation_for_imageid( image_id) gt_boxes.append(gt_boxes_one_image) # vizualize GT for debug if cfg_visualization.show_gt_boxes: visualizer.show_gt_boxes(image_id, gt_boxes_one_image, class_ids, dataloader) # decode image predictions # merge boxes_one_image, labels_one_image, scores_one_image from different pyramid layers boxes_one_image = cat_boxlist(boxes_one_image) # do NMS good_indices = nms( boxes_one_image, opt.nms_iou_threshold_detector_score, nms_score_threshold=opt.nms_score_threshold_detector_score) boxes_one_image = boxes_one_image[good_indices] # extract feature vectors from the predictions image_original = dataloader._transform_image(image_id, do_augmentation=True, hflip=False, vflip=False)[0] if is_cuda: image_original = image_original.cuda() image_patches = crop_resize_image_patches( image_original, boxes_one_image, opt.retrieval_image_size, logger, unnorm_image=dataloader.unnorm_image, is_cuda=is_cuda) # filter out cases when failed to crop a box: outside of the image good_indices = [ i for i, p in enumerate(image_patches) if p is not None ] if good_indices: # non empty image_patches = [p for p in image_patches if p is not None] boxes_one_image = boxes_one_image[good_indices] image_vectors = extract_vectors_from_images(retrievalnet, image_patches, ms=ms, msp=msp) # compute class scores from image_vectors and query_vectors (already transposed) if Lw is not None: # apply whitening if defined image_vectors = whitenapply(image_vectors, Lw["m"], Lw["P"]) scores_retrieval = torch.mm(query_vectors, image_vectors) num_queries = scores_retrieval.size(0) num_detections = scores_retrieval.size(1) list_of_active_label = torch.LongTensor(class_ids) if cfg_eval.class_image_augmentation: list_of_active_label = torch.stack( [list_of_active_label] * num_class_views, 1).view(-1) # take all labels for all boxes - will sort them by scores at eval scores_one_image = scores_retrieval.view(-1) boxes_one_image = cat_boxlist([boxes_one_image] * num_queries) labels_one_image = torch.stack([list_of_active_label] * num_detections, 1).contiguous().view(-1) # add scores and labels: overwrite if existed boxes_one_image.add_field("labels", labels_one_image) boxes_one_image.add_field("scores", scores_one_image) # NMS using the retrieval scores good_indices = nms( boxes_one_image, cfg_eval.nms_iou_threshold, nms_score_threshold=cfg_eval.nms_score_threshold, do_separate_per_label=not cfg_eval.nms_across_classes) boxes_one_image = boxes_one_image[good_indices] else: boxes_one_image.add_field( "labels", torch.zeros(0, dtype=torch.long, device=boxes_one_image.bbox_xyxy.device)) boxes_one_image.add_field( "scores", torch.zeros(0, dtype=torch.float, device=boxes_one_image.bbox_xyxy.device)) boxes.append(boxes_one_image.cpu()) if cfg_visualization.show_detections: # do not pass class_ids - this is already taken care of visualizer.show_detections(boxes_one_image, image_id, dataloader, cfg_visualization, class_ids=None) # normalize by number of steps for k in losses: losses[k] /= num_evaluted_images # Save detection if requested if cfg_visualization.path_to_save_detections: data = { "image_ids": image_ids, "boxes_xyxy": [bb.bbox_xyxy for bb in boxes], "labels": [bb.get_field("labels") for bb in boxes], "scores": [bb.get_field("scores") for bb in boxes], "gt_boxes_xyxy": [bb.bbox_xyxy for bb in gt_boxes], "gt_labels": [bb.get_field("labels") for bb in gt_boxes], "gt_difficults": [bb.get_field("difficult") for bb in gt_boxes] } dataset_name = dataloader.get_name() 
os.makedirs(cfg_visualization.path_to_save_detections, exist_ok=True) save_path = os.path.join(cfg_visualization.path_to_save_detections, dataset_name + "_detections.pth") torch.save(data, save_path) # compute mAP for mAP_iou_threshold in cfg_eval.mAP_iou_thresholds: logger.info("Evaluating at IoU th {:0.2f}".format(mAP_iou_threshold)) ap_data = do_voc_evaluation(boxes, gt_boxes, iou_thresh=mAP_iou_threshold, use_07_metric=False) losses["mAP@{:0.2f}".format(mAP_iou_threshold)] = ap_data["map"] losses["mAPw@{:0.2f}".format( mAP_iou_threshold)] = ap_data["map_weighted"] losses["recall@{:0.2f}".format(mAP_iou_threshold)] = ap_data["recall"] losses["AP_joint_classes@{:0.2f}".format( mAP_iou_threshold)] = ap_data["ap_joint_classes"] # per class AP information for i_class, (ap, recall, n_pos) in enumerate( zip(ap_data["ap_per_class"], ap_data["recall_per_class"], ap_data["n_pos"])): if not np.isnan(ap): assert i_class in class_ids, "Could not find class_id in the list of ids" logger.info( "Class {0} (local {3}), AP {1:0.4f}, #obj {2}, recall {4:0.4f}" .format(i_class, ap, n_pos, class_ids.index(i_class), recall)) # save timing losses["eval_time"] = (time.time() - t_start_eval) logger.info("Evaluated on {0}, scale {1}".format(dataset_name, dataset_scale)) print_meters(losses, logger) return losses
def resample_of_correlation_map_simple(corr_maps, resampling_grids_grid_coord, class_pool_mask): """This function resamples the correlation tensor according to the grids of points representing the transformations produces by the transformation network. This function is left hear for understanding, use resample_of_correlation_map_fast, which is faster. Args: corr_maps (Tensor[float], size=batch_size x class_batch_size x (h^T*w^T) x h^A x w^A): This tensor contains correlations between of features of the input and class feature maps. This function resamples this tensor. CAUTION: this tensor shows be viewed to batch_size x class_batch_size x w^T x h^T x h^A x w^A (note the switch of w^T and h^T dimensions) This happens to be able to load models of the weakalign repo resampling_grids_grid_coord (Tensor[float], size=batch_size x class_batch_size x h^A x w^A x h^T x w^T x 2): This tensor contains non-integer coordinates of the points that show where we need to resample class_pool_mask (Tensor[float]): size=class_batch_size x 1 x h^T x w^T This tensor contains the mask, by which the resampled correlations are multiplied before final average pooling. It masks out the border features of the class feature maps. Returns: matches_pooled (Tensor[float]): size=batch_size x class_batch_size x x 1 x h^A x w^A Time comparison resample_of_correlation_map_simple vs resample_of_correlation_map_fast: for 2 images, 11 labels, train_patch_width 400, train_patch_height 600 (fm width = 25, fm height = 38) CPU time simple: 0.14s CPU time fast: 0.11s GPU=Geforce GTX 1080Ti GPU time simple: 0.010s GPU time fast: 0.006s """ batch_size = corr_maps.size(0) class_batch_size = corr_maps.size(1) template_fm_size = FeatureMapSize( h=resampling_grids_grid_coord.size(-3), w=resampling_grids_grid_coord.size(-2)) image_fm_size = FeatureMapSize(img=corr_maps) assert template_fm_size.w * template_fm_size.h == corr_maps.size( 2 ), 'the number of channels in the correlation map = {0} should match the size of the resampling grid = {1}'.format( corr_maps.size(2), template_fm_size) # use a single batch dimension corr_maps = corr_maps.view(batch_size * class_batch_size, corr_maps.size(2), image_fm_size.h, image_fm_size.w) resampling_grids_grid_coord = resampling_grids_grid_coord.view( batch_size * class_batch_size, image_fm_size.h, image_fm_size.w, template_fm_size.h, template_fm_size.w, 2) # extract matches from all channels one by one in a loop, and then combine them (using the average pooling w.r.t. 
the mask of active points defined by class_pool_mask) matches_all_channels = [] # the order of the loops matters for template_x in range(template_fm_size.w): for template_y in range(template_fm_size.h): # note the weird order of coordinates - related to the transposed coordinates in the weakalign network channel_id = template_x * template_fm_size.h + template_y channel = corr_maps[:, channel_id:channel_id + 1, :, :] points = resampling_grids_grid_coord[:, :, :, template_y, template_x, :] matches_one_channel = F.grid_sample(channel, points, mode="bilinear", padding_mode='border', align_corners=True) matches_all_channels.append(matches_one_channel) matches_all_channels = torch.stack(matches_all_channels, -1) # start pooling: fix all dimensions explicitly mostly to be safe matches_all_channels = matches_all_channels.view( batch_size, class_batch_size, image_fm_size.h, image_fm_size.w, template_fm_size.h * template_fm_size.w) mask = class_pool_mask.view(1, class_batch_size, 1, 1, template_fm_size.h * template_fm_size.w) matches_all_channels = matches_all_channels * mask matches_pooled = matches_all_channels.sum(4) matches_pooled = matches_pooled.view(batch_size, class_batch_size, 1, image_fm_size.h, image_fm_size.w) return matches_pooled
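# Toy illustration of the per-channel resampling loop above: F.grid_sample reads one correlation
# channel at non-integer (x, y) locations given in normalized [-1, 1] coordinates
# (grid[..., 0] is x, grid[..., 1] is y). With align_corners=True an identity grid reproduces
# the input; shapes are kept tiny so the call is easy to follow.
import torch
import torch.nn.functional as F

channel = torch.arange(12, dtype=torch.float32).view(1, 1, 3, 4)  # 1 x 1 x h^A x w^A
points = torch.zeros(1, 3, 4, 2)                                  # one (x, y) query per output cell
points[..., 0] = torch.linspace(-1, 1, 4).view(1, 1, 4)           # x varies along the width
points[..., 1] = torch.linspace(-1, 1, 3).view(1, 3, 1)           # y varies along the height
resampled = F.grid_sample(channel, points, mode="bilinear",
                          padding_mode="border", align_corners=True)
assert torch.allclose(resampled, channel, atol=1e-5)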
def make_iterator_extract_scores_from_images_batched(dataloader, net, logger, image_batch_size, is_cuda, num_random_pyramid_scales=0, num_random_negative_labels=-1, class_image_augmentation=""): """ Generator to loop over dataset and apply the model to all elements. The iterator will loop over images one by one. Used in evaluate and .train.mine_hard_patches Args: dataloader - the dataloader to get data net - the network to use logger - the created logger image_batch_size (int) - the number of images to put in one batch is_cuda (bool) - use GPUs or not num_random_pyramid_scales (int) - numnber of random pyramid scales to try, default (0) means the standard scales from config passed to dataloader.make_iterator_for_all_images num_random_negative_labels (int) - number of random negative labels to try, default (-1) means to add all possible labels class_image_augmentation (str) - type of class image augmentation to do, default - no augmentation, support "rotation90" and "horflip" Returns: Creates an iterator over tuples of data: image_id (int) image_loc_scores_p (list of tensors) - localization scores to get bounding boxes when decoding len(image_loc_scores_p) = num pyramid levels, tensor size: num_labels x 4 x num_anchors image_class_scores_p (list of tensors) - clasification scores to recognize classes when decoding len(image_class_scores_p) = num pyramid levels, tensor size: num_labels x num_anchors one_image_pyramid (list of tensors) - input images at all pyramid levels batch_query_img_sizes (list of FeatureMapSize) - sizes of used query images (used in mine_hard_patches) len(batch_query_img_sizes) = num query images batch_class_ids (list of int) - class ids of used query images; len(batch_class_ids) = num query images, box_reverse_transforms (list of os2d.structures.transforms.TransformList) - reverse transforms to convert boxes from the coordinates of each resized image to the original global coordinates len(box_reverse_transforms) = num pyramid levels image_fm_sizes_p (list of FeatureMapSize) - sizes of the feature maps of the current pyramid len(image_fm_sizes_p) = num pyramid levels transform_corners_p (list of tensors) - corners of the parallelogram after the transformation mapping (used for visualization) len(transform_corners_p) = num pyramid levels, tensor size: num_labels x 8 x num_anchors """ logger.info("Extracting scores from all images") # get images of all classes class_images, class_aspect_ratios, class_ids = dataloader.get_all_class_images() num_classes = len(class_images) assert len(class_aspect_ratios) == num_classes assert len(class_ids) == num_classes query_img_sizes = [FeatureMapSize(img=img) for img in class_images] # the current code works only with class batch == 1, this in inefficient in some place, but good in others # is there a better way? 
class_batch_size = 1 # extract all class convolutions from batched class images class_conv_layer_batched = [] logger.info("Extracting weights from {0} classes{1}".format(num_classes, f" with {class_image_augmentation} augmentation" if class_image_augmentation else "")) for i in range(0, num_classes, class_batch_size): batch_class_ids = class_ids[i : i + class_batch_size] batch_class_images = [] for i_label in range(len(batch_class_ids)): im = class_images[i + i_label].squeeze(0) if is_cuda: im = im.cuda() batch_class_images.append(im) if not class_image_augmentation: num_class_views = 1 elif class_image_augmentation == "rotation90": im90 = im.rot90(1, [1, 2]) im180 = im90.rot90(1, [1, 2]) im270 = im180.rot90(1, [1, 2]) batch_class_images.append(im90) batch_class_images.append(im180) batch_class_images.append(im270) num_class_views = 4 elif class_image_augmentation == "horflip": im_flipped = im.flip(2) batch_class_images.append(im_flipped) num_class_views = 2 else: raise RuntimeError(f"Unknown value of class_image_augmentation: {class_image_augmentation}") for b_im in batch_class_images: class_feature_maps = net.net_label_features([b_im]) class_conv_layer = net.os2d_head_creator.create_os2d_head(class_feature_maps) class_conv_layer_batched.append(class_conv_layer) # loop over all images iterator_batches = dataloader.make_iterator_for_all_images(image_batch_size, num_random_pyramid_scales=num_random_pyramid_scales) for batch_ids, pyramids_batch, box_transforms_batch, initial_img_size_batch in iterator_batches: t_start_batch = time.time() # select labels to use for search at this batch if num_random_negative_labels >= 0 : # randomly shuffle labels neg_labels = torch.randperm(len(class_conv_layer_batched)) neg_labels = neg_labels[:num_random_negative_labels] # add positive labels pos_labels = dataloader.get_class_ids_for_image_ids(batch_ids) pos_labels = dataloader.convert_label_ids_global_to_local(pos_labels, class_ids) batch_labels_local = torch.cat([neg_labels, pos_labels], 0).unique() else: # take all the labels - needed for evaluation batch_labels_local = torch.arange(len(class_conv_layer_batched)) batch_class_ids = [class_ids[l // num_class_views] for l in batch_labels_local] batch_query_img_sizes = [query_img_sizes[l // num_class_views] for l in batch_labels_local] # extract features at all pyramid levels batch_images_pyramid = [] loc_scores = [] class_scores = [] fm_sizes = [] transform_corners = [] num_pyramid_levels = len(pyramids_batch) t_cum_features = 0.0 t_cum_labels = 0.0 for batch_images in pyramids_batch: if is_cuda: batch_images = batch_images.cuda() t_start_features = time.time() feature_maps = net.net_feature_maps(batch_images) torch.cuda.synchronize() t_cum_features += time.time() - t_start_features # batch class images loc_scores.append([]) class_scores.append([]) fm_sizes.append([]) transform_corners.append([]) t_start_labels = time.time() assert class_batch_size == 1, "the iterator on images works only with labels batches of size 1" for i_class_batch in batch_labels_local: # apply net at this pyramid level loc_s_p, class_s_p, _, fm_sizes_p, transform_corners_p = \ net(class_head=class_conv_layer_batched[i_class_batch], feature_maps=feature_maps) loc_scores[-1].append(loc_s_p) class_scores[-1].append(class_s_p) fm_sizes[-1].append(fm_sizes_p) transform_corners[-1].append(transform_corners_p) torch.cuda.synchronize() t_cum_labels += time.time() - t_start_labels if not feature_maps.requires_grad: # explicitly remove a possibly large chunk of GPU memory del feature_maps 
batch_images_pyramid.append(batch_images) timing_str = "Feature time: {0}, Label time: {1}, ".format(time_for_printing(t_cum_features, mode="s"), time_for_printing(t_cum_labels, mode="s")) # loc_scores, class_scores: pyramid_level x class_batch x image_in_batch x for i_image_in_batch, image_id in enumerate(batch_ids): # get scores from all pyramid levels image_loc_scores_p, image_class_scores_p, image_fm_sizes_p = [], [], [] transform_corners_p = [] for i_p in range(num_pyramid_levels): if loc_scores is not None and loc_scores[0] is not None and loc_scores[0][0] is not None: image_loc_scores_p.append(torch.cat([s[i_image_in_batch] for s in loc_scores[i_p]], 0)) else: image_loc_scores_p.append(None) image_class_scores_p.append(torch.cat([s[i_image_in_batch] for s in class_scores[i_p]], 0)) if transform_corners is not None and transform_corners[0] is not None and transform_corners[0][0] is not None: transform_corners_p.append(torch.cat([s[i_image_in_batch] for s in transform_corners[i_p]], 0)) else: transform_corners_p.append(None) image_fm_sizes_p.append(fm_sizes[i_p][0]) # get a pyramid of one image[i_p] one_image_pyramid = [p[i_image_in_batch] for p in batch_images_pyramid] # extract the box transformations box_reverse_transforms = box_transforms_batch[i_image_in_batch] logger.info(timing_str + "Net time: {0}".format(time_since(t_start_batch))) yield image_id, image_loc_scores_p, image_class_scores_p, one_image_pyramid,\ batch_query_img_sizes, batch_class_ids, box_reverse_transforms, image_fm_sizes_p, transform_corners_p
def resample_of_correlation_map_fast(corr_maps, resampling_grids_grid_coord, class_pool_mask): """This function resamples the correlation tensor according to the grids of points representing the transformations produces by the transformation network. This is a more efficient version of resample_of_correlation_map_simple Args: corr_maps (Tensor[float], size=batch_size x class_batch_size x (h^T*w^T) x h^A x w^A): This tensor contains correlations between of features of the input and class feature maps. This function resamples this tensor. CAUTION: this tensor shows be viewed to batch_size x class_batch_size x w^T x h^T x h^A x w^A (note the switch of w^T and h^T dimensions) This happens to be able to load models of the weakalign repo resampling_grids_grid_coord (Tensor[float], size=batch_size x class_batch_size x h^A x w^A x h^T x w^T x 2): This tensor contains non-integer coordinates of the points that show where we need to resample class_pool_mask (Tensor[float]): size=class_batch_size x 1 x h^T x w^T This tensor contains the mask, by which the resampled correlations are multiplied before final average pooling. It masks out the border features of the class feature maps. Returns: matches_pooled (Tensor[float]): size=batch_size x class_batch_size x x 1 x h^A x w^A Time comparison resample_of_correlation_map_simple vs resample_of_correlation_map_fast: for 2 images, 11 labels, train_patch_width 400, train_patch_height 600 (fm width = 25, fm height = 38) CPU time simple: 0.14s CPU time fast: 0.11s GPU=Geforce GTX 1080Ti GPU time simple: 0.010s GPU time fast: 0.006s """ batch_size = corr_maps.size(0) class_batch_size = corr_maps.size(1) template_fm_size = FeatureMapSize( h=resampling_grids_grid_coord.size(-3), w=resampling_grids_grid_coord.size(-2)) image_fm_size = FeatureMapSize(img=corr_maps) assert template_fm_size.w * template_fm_size.h == corr_maps.size( 2 ), 'the number of channels in the correlation map = {0} should match the size of the resampling grid = {1}'.format( corr_maps.size(2), template_fm_size) # memory efficient computation will be done by merging the Y coordinate # and the index of the channel in corr_map into one single float # merge the two dimensions together corr_map_merged_y_and_id_in_corr_map = corr_maps.contiguous().view( batch_size * class_batch_size, 1, -1, image_fm_size.w) # note the weird order of coordinates - related to the transposed coordinates in the Ignacio's network y_grid, x_grid = torch.meshgrid(torch.arange(template_fm_size.h), torch.arange(template_fm_size.w)) index_in_corr_map = y_grid + x_grid * template_fm_size.h # clamp to strict [-1, 1] # convert to torch.double to get more accuracy resampling_grids_grid_coord_ = resampling_grids_grid_coord.clamp( -1, 1).to(dtype=torch.double) resampling_grids_grid_coord_x_ = resampling_grids_grid_coord_.narrow( -1, 0, 1) resampling_grids_grid_coord_y_ = resampling_grids_grid_coord_.narrow( -1, 1, 1) # adjust the y coordinate to take into account the index in the corr_map: # convert from [-1, 1] to [0, image_fm_size[0]] resampling_grids_grid_coord_y_ = (resampling_grids_grid_coord_y_ + 1) / 2 * (image_fm_size.h - 1) # merge with the index in corr map [0] resampling_grids_grid_coord_y_ = resampling_grids_grid_coord_y_.view( [-1] + list(index_in_corr_map.size())) index_in_corr_map = index_in_corr_map.unsqueeze(0) index_in_corr_map = index_in_corr_map.to( device=resampling_grids_grid_coord_.device, dtype=resampling_grids_grid_coord_.dtype) resampling_grids_grid_coord_y_ = resampling_grids_grid_coord_y_ + 
index_in_corr_map * image_fm_size.h # convert back to [-1, -1] resampling_grids_grid_coord_y_ = resampling_grids_grid_coord_y_ / ( image_fm_size.h * template_fm_size.h * template_fm_size.w - 1) * 2 - 1 resampling_grids_grid_coord_y_ = resampling_grids_grid_coord_y_.view_as( resampling_grids_grid_coord_x_) resampling_grids_grid_coord_merged_y_and_id_in_corr_map = torch.cat( [resampling_grids_grid_coord_x_, resampling_grids_grid_coord_y_], dim=-1) # flatten the resampling grid resampling_grids_grid_coord_merged_y_and_id_in_corr_map_1d = \ resampling_grids_grid_coord_merged_y_and_id_in_corr_map.view(batch_size * class_batch_size, -1, 1, 2) # extract the required points matches_all_channels = F.grid_sample( corr_map_merged_y_and_id_in_corr_map.to(dtype=torch.double), resampling_grids_grid_coord_merged_y_and_id_in_corr_map_1d, mode="bilinear", padding_mode='border', align_corners=True) matches_all_channels = matches_all_channels.view( batch_size, class_batch_size, 1, image_fm_size.h * image_fm_size.w, template_fm_size.h * template_fm_size.w) matches_all_channels = matches_all_channels.to(dtype=torch.float) # combine extracted matches using the average pooling w.r.t. the mask of active points defined by class_pool_mask) mask = class_pool_mask.view(1, class_batch_size, 1, 1, template_fm_size.h * template_fm_size.w) matches_all_channels = matches_all_channels * mask matches_pooled = matches_all_channels.sum(4) matches_pooled = matches_pooled.view(batch_size, class_batch_size, 1, image_fm_size.h, image_fm_size.w) return matches_pooled
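# Toy check of the "fold the channel index into the y axis" trick used above: a (1, C, H, W)
# correlation tensor is viewed as (1, 1, C*H, W) and the channel index is merged into the y
# coordinate, so a single grid_sample call resamples every channel at its own query point
# (align_corners=True, as in the function). Values are arbitrary.
import torch
import torch.nn.functional as F

C, H, W = 3, 4, 5
corr = torch.randn(1, C, H, W, dtype=torch.double)
merged = corr.view(1, 1, C * H, W)

x_norm = torch.tensor([-0.3, 0.5], dtype=torch.double)  # query x in [-1, 1]
y_norm = torch.tensor([0.2, -0.6], dtype=torch.double)  # query y in [-1, 1]
channels = [1, 2]                                        # which channel each query reads

# pixel row inside the channel, offset by channel * H, then renormalized to [-1, 1]
y_pix = (y_norm + 1) / 2 * (H - 1) + torch.tensor(channels, dtype=torch.double) * H
y_merged = y_pix / (C * H - 1) * 2 - 1
grid = torch.stack([x_norm, y_merged], dim=-1).view(1, -1, 1, 2)
fast = F.grid_sample(merged, grid, mode="bilinear", padding_mode="border", align_corners=True)

# reference: sample each channel separately with the original y coordinate
ref = []
for i, c in enumerate(channels):
    g = torch.tensor([[[[x_norm[i].item(), y_norm[i].item()]]]], dtype=torch.double)
    v = F.grid_sample(corr[:, c:c + 1], g, mode="bilinear", padding_mode="border", align_corners=True)
    ref.append(v.item())
assert torch.allclose(fast.flatten(), torch.tensor(ref, dtype=torch.double), atol=1e-6)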
def evaluate(dataloader, net, cfg, criterion=None, print_per_class_results=False): """ Evaluation of the provided model at one dataset Args: dataloader - the dataloader to get data net - the network to use cfg - config with all the parameters criterion - criterion (usually the same one as used for training), can be None, will just not compute related metrics print_per_class_results - flag showing whether to printout extra data (per class AP) - usually used at the final evaluation Returns: losses (OrderedDict) - all computed metrics, e.g., losses["[email protected]"] - mAP at IoU threshold 0.5 """ logger = logging.getLogger("OS2D.evaluate") dataset_name = dataloader.get_name() dataset_scale = dataloader.get_eval_scale() logger.info("Starting to eval on {0}, scale {1}".format(dataset_name, dataset_scale)) t_start_eval = time.time() net.eval() iterator = make_iterator_extract_scores_from_images_batched(dataloader, net, logger, image_batch_size=cfg.eval.batch_size, is_cuda=cfg.is_cuda, class_image_augmentation=cfg.eval.class_image_augmentation) boxes = [] gt_boxes = [] losses = OrderedDict() image_ids = [] # loop over all dataset images num_evaluted_images = 0 for data in iterator: image_id, image_loc_scores_pyramid, image_class_scores_pyramid,\ image_pyramid, query_img_sizes, class_ids,\ box_reverse_transform, image_fm_sizes_p, transform_corners_pyramid\ = data image_ids.append(image_id) num_evaluted_images += 1 img_size_pyramid = [FeatureMapSize(img=img) for img in image_pyramid] num_labels = len(class_ids) gt_boxes_one_image = dataloader.get_image_annotation_for_imageid(image_id) gt_boxes.append(gt_boxes_one_image) # compute losses if len(gt_boxes_one_image) > 0: # there is some annotation for this image gt_labels_one_image = gt_boxes_one_image.get_field("labels") dataloader.update_box_labels_to_local(gt_boxes_one_image, class_ids) loc_targets_pyramid, class_targets_pyramid = \ dataloader.box_coder.encode_pyramid(gt_boxes_one_image, img_size_pyramid, num_labels, default_box_transform_pyramid=box_reverse_transform) # return the original labels back gt_boxes_one_image.add_field("labels", gt_labels_one_image) # vizualize GT for debug if cfg.visualization.eval.show_gt_boxes: visualizer.show_gt_boxes(image_id, gt_boxes_one_image, class_ids, dataloader) if cfg.is_cuda: loc_targets_pyramid = [loc_targets.cuda() for loc_targets in loc_targets_pyramid] class_targets_pyramid = [class_targets.cuda() for class_targets in class_targets_pyramid] transform_corners_pyramid = [transform_corners.cuda() for transform_corners in transform_corners_pyramid] add_batch_dim = lambda list_of_tensors: [t.unsqueeze(0) for t in list_of_tensors] if criterion is not None: # if criterion is provided, use it to compute all metrics it can losses_iter = criterion(add_batch_dim(image_loc_scores_pyramid) if image_loc_scores_pyramid[0] is not None else None, add_batch_dim(loc_targets_pyramid), add_batch_dim(image_class_scores_pyramid), add_batch_dim(class_targets_pyramid) ) # convert to floats for l in losses_iter: losses_iter[l] = losses_iter[l].mean().item() # printing print_meters(losses_iter, logger) # update logs add_to_meters_in_dict(losses_iter, losses) # decode image predictions boxes_one_image = \ dataloader.box_coder.decode_pyramid(image_loc_scores_pyramid, image_class_scores_pyramid, img_size_pyramid, class_ids, nms_iou_threshold=cfg.eval.nms_iou_threshold, nms_score_threshold=cfg.eval.nms_score_threshold, inverse_box_transforms=box_reverse_transform, transform_corners_pyramid=transform_corners_pyramid) 
boxes.append(boxes_one_image.cpu()) if cfg.visualization.eval.show_detections: visualizer.show_detection_from_dataloader(boxes_one_image, image_id, dataloader, cfg.visualization.eval, class_ids=None) if cfg.visualization.eval.show_class_heatmaps: visualizer.show_class_heatmaps(image_id, class_ids, image_fm_sizes_p, class_targets_pyramid, image_class_scores_pyramid, cfg_local=cfg.visualization.eval, class_image_augmentation=cfg.eval.class_image_augmentation) if cfg.is_cuda: torch.cuda.empty_cache() # normalize by number of steps for k in losses: losses[k] /= num_evaluted_images # Save detection if requested path_to_save_detections = cfg.visualization.eval.path_to_save_detections if path_to_save_detections: data = {"image_ids" : image_ids, "boxes_xyxy" : [bb.bbox_xyxy for bb in boxes], "labels" : [bb.get_field("labels") for bb in boxes], "scores" : [bb.get_field("scores") for bb in boxes], "gt_boxes_xyxy" : [bb.bbox_xyxy for bb in gt_boxes], "gt_labels" : [bb.get_field("labels") for bb in gt_boxes], "gt_difficults" : [bb.get_field("difficult") for bb in gt_boxes] } dataset_name = dataloader.get_name() os.makedirs(path_to_save_detections, exist_ok=True) save_path = os.path.join(path_to_save_detections, dataset_name + "_detections.pth") torch.save(data, save_path) # compute mAP for mAP_iou_threshold in cfg.eval.mAP_iou_thresholds: logger.info("Evaluating at IoU th {:0.2f}".format(mAP_iou_threshold)) ap_data = do_voc_evaluation(boxes, gt_boxes, iou_thresh=mAP_iou_threshold, use_07_metric=False) losses["mAP@{:0.2f}".format(mAP_iou_threshold)] = ap_data["map"] losses["mAPw@{:0.2f}".format(mAP_iou_threshold)] = ap_data["map_weighted"] losses["recall@{:0.2f}".format(mAP_iou_threshold)] = ap_data["recall"] if print_per_class_results: # per class AP information for i_class, (ap, recall, n_pos) in enumerate(zip(ap_data["ap_per_class"], ap_data["recall_per_class"], ap_data["n_pos"])): if not np.isnan(ap): assert i_class in class_ids, "Could not find class_id in the list of ids" logger.info("Class {0} (local {3}), AP {1:0.4f}, #obj {2}, recall {4:0.4f}".format(i_class, ap, n_pos, class_ids.index(i_class), recall)) # save timing losses["eval_time"] = (time.time() - t_start_eval) logger.info("Evaluated on {0}, scale {1}".format(dataset_name, dataset_scale)) print_meters(losses, logger) return losses
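# Hedged usage sketch (the dataloader/network construction happens elsewhere in the project and
# is assumed here): evaluate() returns an OrderedDict of metrics; the mAP/recall keys below
# exist when cfg.eval.mAP_iou_thresholds contains 0.5.
#
#   losses = evaluate(dataloader_eval, net, cfg, criterion=None, print_per_class_results=True)
#   print(losses["mAP@0.50"], losses["recall@0.50"], losses["eval_time"])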
def forward(self, feature_maps): """ Args: feature_maps (Tensor[float], size b^A x d x h^A x w^A) - contains the feature map of the input image b^A - batch size d - feature dimensionality h^A - height of the feature map w^A - width of the feature map Returns: # here b^C is the class batch size, i.e., the number of class images, given by self.class_batch_size passed when creating this object output_localization (Tensor[float], size b^A x b^C x 4 x h^A x w^A) - the localization output w.r.t. the standard box encoding - computed by DetectionBoxCoder.build_loc_targets output_recognition (Tensor[float], size b^A x b^C x 1 x h^A x w^A) - the recognition output for each of the classes - the correlation, linearly converted to the [0, 1] segment, the higher, the better the match to the class output_recognition_transform_detached (Tensor[float], size b^A x b^C x 1 x h^A x w^A) - same as output_recognition, but with the computational graph detached from the transformation (for a backward pass that does not update the transformation - intended for the negatives) corner_coordinates (Tensor[float], size b^A x b^C x 8 x h^A x w^A) - the corners of the default boxes after the transformation, detached from the computational graph, for visualization only """ # get dims batch_size = feature_maps.size(0) feature_dim = feature_maps.size(1) image_fm_size = FeatureMapSize(img=feature_maps) class_fm_size = FeatureMapSize(img=self.class_feature_maps) feature_dim_for_regression = class_fm_size.h * class_fm_size.w class_feature_dim = self.class_feature_maps.size(1) assert feature_dim == class_feature_dim, "Feature dimensionality of input={0} and class={1} feature maps have to be equal".format( feature_dim, class_feature_dim) # L2-normalize the feature map feature_maps = normalize_feature_map_L2(feature_maps, 1e-5) # get correlations all to all corr_maps = torch.einsum("bfhw,afxy->abwhxy", self.class_feature_maps, feature_maps) # need to try to optimize this with opt_einsum: https://optimized-einsum.readthedocs.io/en/latest/ # CAUTION: note the switch of dimensions hw to wh. This is done for compatibility with the FeatureCorrelation class by Ignacio Rocco https://github.com/ignacio-rocco/ncnet/blob/master/lib/model.py (to be able to load their models) # reshape to have the correlation map of dimensions similar to the standard tensor for image feature maps corr_maps = corr_maps.contiguous().view( batch_size * self.class_batch_size, feature_dim_for_regression, image_fm_size.h, image_fm_size.w) # compute the grids to resample corr maps resampling_grids_local_coord = self.aligner(corr_maps) # build classification outputs cor_maps_for_recognition = corr_maps.contiguous().view( batch_size, self.class_batch_size, feature_dim_for_regression, image_fm_size.h, image_fm_size.w) resampling_grids_local_coord = resampling_grids_local_coord.contiguous( ).view(batch_size, self.class_batch_size, image_fm_size.h, image_fm_size.w, self.aligner.out_grid_size.h, self.aligner.out_grid_size.w, 2) # need to recompute resampling_grids to [-1, 1] coordinates w.r.t. 
the feature maps to sample points with F.grid_sample # first get the list of boxes that corresponds to the receptive fields of the parameter regression network: box sizes are the receptive field sizes, stride is the network stride default_boxes_xyxy_wrt_fm = self.box_grid_generator_feature_map_level.create_strided_boxes_columnfirst( fm_size=image_fm_size) default_boxes_xyxy_wrt_fm = default_boxes_xyxy_wrt_fm.view( 1, 1, image_fm_size.h, image_fm_size.w, 4) # 1 (to broadcast to batch_size) x 1 (to broadcast to class batch_size) x box_grid_height x box_grid_width x 4 default_boxes_xyxy_wrt_fm = default_boxes_xyxy_wrt_fm.to( resampling_grids_local_coord.device) resampling_grids_fm_coord = convert_box_coordinates_local_to_global( resampling_grids_local_coord, default_boxes_xyxy_wrt_fm) # convert to coordinates normalized to [-1, 1] (to be compatible with torch.nn.functional.grid_sample) resampling_grids_fm_coord_x = resampling_grids_fm_coord.narrow( -1, 0, 1) resampling_grids_fm_coord_y = resampling_grids_fm_coord.narrow( -1, 1, 1) resampling_grids_fm_coord_unit = torch.cat([ resampling_grids_fm_coord_x / (image_fm_size.w - 1) * 2 - 1, resampling_grids_fm_coord_y / (image_fm_size.h - 1) * 2 - 1 ], dim=-1) # clamp to fit the image plane resampling_grids_fm_coord_unit = resampling_grids_fm_coord_unit.clamp( -1, 1) # extract and pool matches # # slower code: # matches_summed = self.resample_of_correlation_map_simple(cor_maps_for_recognition, # resampling_grids_fm_coord_unit, # self.class_pool_mask) # we use a faster, but somewhat more obscure, version matches_summed = self.resample_of_correlation_map_fast( cor_maps_for_recognition, resampling_grids_fm_coord_unit, self.class_pool_mask) if matches_summed.requires_grad: matches_summed_transform_detached = self.resample_of_correlation_map_fast( cor_maps_for_recognition, resampling_grids_fm_coord_unit.detach(), self.class_pool_mask) else: # Optimization to make eval faster matches_summed_transform_detached = matches_summed # build localization targets default_boxes_xyxy_wrt_image = self.box_grid_generator_image_level.create_strided_boxes_columnfirst( fm_size=image_fm_size) default_boxes_xyxy_wrt_image = default_boxes_xyxy_wrt_image.view( 1, 1, image_fm_size.h, image_fm_size.w, 4) # 1 (to broadcast to batch_size) x 1 (to broadcast to class batch_size) x box_grid_height x box_grid_width x 4 default_boxes_xyxy_wrt_image = default_boxes_xyxy_wrt_image.to( resampling_grids_local_coord.device) resampling_grids_image_coord = convert_box_coordinates_local_to_global( resampling_grids_local_coord, default_boxes_xyxy_wrt_image) num_pooled_points = self.aligner.out_grid_size.w * self.aligner.out_grid_size.h resampling_grids_x = resampling_grids_image_coord.narrow( -1, 0, 1).contiguous().view(-1, num_pooled_points) resampling_grids_y = resampling_grids_image_coord.narrow( -1, 1, 1).contiguous().view(-1, num_pooled_points) class_boxes_xyxy = torch.stack([ resampling_grids_x.min(dim=1)[0], resampling_grids_y.min(dim=1)[0], resampling_grids_x.max(dim=1)[0], resampling_grids_y.max(dim=1)[0] ], 1) # extract rectangle borders to draw complete boxes corner_coordinates = resampling_grids_image_coord[:, :, :, :, [ 0, -1 ]][:, :, :, :, :, [0, -1]] # only the corners corner_coordinates = corner_coordinates.detach_() corner_coordinates = corner_coordinates.view( batch_size, self.class_batch_size, image_fm_size.h, image_fm_size.w, 8) # batch_size x label_batch_size x fm_height x fm_width x 8 corner_coordinates = corner_coordinates.transpose(3, 4).transpose( 2, 3) # batch_size x 
label_batch_size x 8 x fm_height x fm_width class_boxes = BoxList(class_boxes_xyxy.view(-1, 4), image_fm_size, mode="xyxy") default_boxes_wrt_image = BoxList(default_boxes_xyxy_wrt_image.view( -1, 4), image_fm_size, mode="xyxy") default_boxes_with_image_batches = cat_boxlist( [default_boxes_wrt_image] * batch_size * self.class_batch_size) output_localization = Os2dBoxCoder.build_loc_targets( class_boxes, default_boxes_with_image_batches) # num_boxes x 4 output_localization = output_localization.view( batch_size, self.class_batch_size, image_fm_size.h, image_fm_size.w, 4) # batch_size x label_batch_size x fm_height x fm_width x 4 output_localization = output_localization.transpose(3, 4).transpose( 2, 3) # batch_size x label_batch_size x 4 x fm_height x fm_width output_recognition = (matches_summed - 1.0) / 2.0 output_recognition_transform_detached = ( matches_summed_transform_detached - 1.0) / 2.0 return output_localization, output_recognition, output_recognition_transform_detached, corner_coordinates
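# Standalone illustration (toy tensor, for exposition only) of the corner extraction used in
# forward() above: indexing with [0, -1] along the two grid dimensions keeps only the first/last
# rows and columns of the out_grid_size x out_grid_size resampling grid, i.e. 4 corner points
# x 2 coordinates = 8 numbers per feature-map location.
import torch

toy_grid = torch.arange(5 * 5 * 2, dtype=torch.float32).view(1, 1, 1, 1, 5, 5, 2)
toy_corners = toy_grid[:, :, :, :, [0, -1]][:, :, :, :, :, [0, -1]]  # shape (1, 1, 1, 1, 2, 2, 2)
print(toy_corners.reshape(1, 1, 1, 1, 8).shape)  # torch.Size([1, 1, 1, 1, 8])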
def mine_hard_patches(dataloader, net, cfg, criterion): """Mine patches that are hard: classification false positives and negatives, and localization errors At each level of the sampled image pyramid, we need to cut out a patch of a size appropriate for training (levels are defined by cfg.train.mining.num_random_pyramid_scales, cfg.train.mining.num_random_negative_classes) Args: dataloader - dataloader to use (often the same as the one for training) net - the network to use cfg - config with all the parameters criterion - criterion (usually the same one as used for training) Returns: hardnegdata_per_imageid (OrderedDict) - mined data, keys are the image ids; further used in dataloader.set_hard_negative_data(hardnegdata_per_imageid) when preparing batches """ logger = logging.getLogger("OS2D.mining_hard_patches") logger.info("Starting to mine hard patches") t_start_mining = time.time() net.eval() num_batches = len(dataloader) hardnegdata_per_imageid = OrderedDict() iterator = make_iterator_extract_scores_from_images_batched(dataloader, net, logger, image_batch_size=cfg.eval.batch_size, is_cuda=cfg.is_cuda, num_random_pyramid_scales=cfg.train.mining.num_random_pyramid_scales, num_random_negative_labels=cfg.train.mining.num_random_negative_classes) boxes = [] gt_boxes = [] losses = OrderedDict() # loop over all dataset images for data in iterator: t_item_start = time.time() image_id, image_loc_scores_pyramid, image_class_scores_pyramid, \ image_pyramid, query_img_sizes, \ batch_class_ids, box_reverse_transform_pyramid, image_fm_sizes_p, transform_corners_pyramid \ = data img_size_pyramid = [FeatureMapSize(img=image) for image in image_pyramid] gt_boxes_one_image = dataloader.get_image_annotation_for_imageid(image_id) gt_boxes.append(gt_boxes_one_image) # compute losses # change labels to the ones local to the current image dataloader.update_box_labels_to_local(gt_boxes_one_image, batch_class_ids) num_labels = len(batch_class_ids) loc_targets_pyramid, class_targets_pyramid = \ dataloader.box_coder.encode_pyramid(gt_boxes_one_image, img_size_pyramid, num_labels, default_box_transform_pyramid=box_reverse_transform_pyramid) # visualize GT for debug if cfg.visualization.mining.show_gt_boxes: visualizer.show_gt_boxes(image_id, gt_boxes_one_image, batch_class_ids, dataloader) # compute losses if cfg.is_cuda: loc_targets_pyramid = [loc_targets.cuda() for loc_targets in loc_targets_pyramid] class_targets_pyramid = [class_targets.cuda() for class_targets in class_targets_pyramid] add_batch_dim = lambda list_of_tensors: [t.unsqueeze(0) for t in list_of_tensors] loc_scores_pyramid = add_batch_dim(image_loc_scores_pyramid) cls_targets_remapped_pyramid = [] for loc_scores, img_size, box_reverse_transform in zip(loc_scores_pyramid, img_size_pyramid, box_reverse_transform_pyramid): # loop over the pyramid levels cls_targets_remapped, ious_anchor, ious_anchor_corrected = \ dataloader.box_coder.remap_anchor_targets(loc_scores, [img_size], query_img_sizes, [gt_boxes_one_image], box_reverse_transform=[box_reverse_transform]) cls_targets_remapped_pyramid.append(cls_targets_remapped) losses_iter, losses_per_anchor = criterion(loc_scores_pyramid, add_batch_dim(loc_targets_pyramid), add_batch_dim(image_class_scores_pyramid), add_batch_dim(class_targets_pyramid), cls_targets_remapped=cls_targets_remapped_pyramid, patch_mining_mode=True) if cfg.visualization.mining.show_class_heatmaps: visualizer.show_class_heatmaps(image_id, batch_class_ids, image_fm_sizes_p, class_targets_pyramid, image_class_scores_pyramid, 
cfg_local=cfg.visualization.mining) assert dataloader.data_augmentation is not None, "Can mine hard patches only through data augmentation" crop_size = dataloader.data_augmentation.random_crop_size # convert to floats for l in losses_iter: losses_iter[l] = losses_iter[l].mean().item() # printing print_meters(losses_iter, logger) # update logs add_to_meters_in_dict(losses_iter, losses) # construct crop boxes for all the anchors and NMS them - NMS pos and neg anchors separately query_fm_sizes = [dataloader.box_coder._get_feature_map_size_per_image_size(sz) for sz in query_img_sizes] crops = [] achors = [] labels_of_anchors = [] pyramid_level_of_anchors = [] losses_of_anchors = [] corners_of_anchors = [] losses_loc_of_anchors = [] pos_mask_of_anchors = [] pos_loc_mask_of_anchors = [] neg_mask_of_anchors = [] anchor_indices = [] i_image_in_batch = 0 # only one image comes here for i_p, img_size in enumerate(img_size_pyramid): for i_label, query_fm_size in enumerate(query_fm_sizes): crop_position, anchor_position, anchor_index = \ dataloader.box_coder.output_box_grid_generator.get_box_to_cut_anchor(img_size, crop_size, image_fm_sizes_p[i_p], box_reverse_transform_pyramid[i_p]) cur_corners = transform_corners_pyramid[i_p][i_label].transpose(0,1) cur_corners = dataloader.box_coder.apply_transform_to_corners(cur_corners, box_reverse_transform_pyramid[i_p], img_size) if cfg.is_cuda: crop_position, anchor_position = crop_position.cuda(), anchor_position.cuda() crops.append(crop_position) achors.append(anchor_position) device = crop_position.bbox_xyxy.device losses_of_anchors.append(losses_per_anchor["cls_loss"][i_p][i_image_in_batch, i_label].to(crop_position.bbox_xyxy)) pos_mask_of_anchors.append(losses_per_anchor["pos_mask"][i_p][i_image_in_batch, i_label].to(device=device)) neg_mask_of_anchors.append(losses_per_anchor["neg_mask"][i_p][i_image_in_batch, i_label].to(device=device)) losses_loc_of_anchors.append(losses_per_anchor["loc_loss"][i_p][i_image_in_batch, i_label].to(crop_position.bbox_xyxy)) pos_loc_mask_of_anchors.append(losses_per_anchor["pos_for_regression"][i_p][i_image_in_batch, i_label].to(device=device)) corners_of_anchors.append(cur_corners.to(crop_position.bbox_xyxy)) num_anchors = len(crop_position) labels_of_anchors.append(torch.full([num_anchors], i_label, dtype=torch.long)) pyramid_level_of_anchors.append(torch.full([num_anchors], i_p, dtype=torch.long)) anchor_indices.append(anchor_index) # stack all crops = cat_boxlist(crops) achors = cat_boxlist(achors) labels_of_anchors = torch.cat(labels_of_anchors, 0) pyramid_level_of_anchors = torch.cat(pyramid_level_of_anchors, 0) losses_of_anchors = torch.cat(losses_of_anchors, 0) losses_loc_of_anchors = torch.cat(losses_loc_of_anchors, 0) pos_mask_of_anchors = torch.cat(pos_mask_of_anchors, 0) pos_loc_mask_of_anchors = torch.cat(pos_loc_mask_of_anchors, 0) neg_mask_of_anchors = torch.cat(neg_mask_of_anchors, 0) anchor_indices = torch.cat(anchor_indices, 0) corners_of_anchors = torch.cat(corners_of_anchors, 0) def nms_masked_and_collect_data(mask, crops_xyxy, scores, nms_iou_threshold_in_mining, max_etries=None): mask_ids = torch.nonzero(mask).squeeze(1) boxes_selected = copy.deepcopy(crops_xyxy[mask]) boxes_selected.add_field("scores", scores[mask]) remaining_boxes = nms(boxes_selected, nms_iou_threshold_in_mining) remaining_boxes = mask_ids[remaining_boxes] # sort and take the top k, because NMS does not sort by default ids = torch.argsort(scores[remaining_boxes], descending=True) if max_etries is not None: ids = ids[:max_etries] 
remaining_boxes = remaining_boxes[ids] return remaining_boxes nms_iou_threshold_in_mining = cfg.train.mining.nms_iou_threshold_in_mining num_hard_patches_per_image = cfg.train.mining.num_hard_patches_per_image # hard negatives hard_negs = nms_masked_and_collect_data(neg_mask_of_anchors, crops, losses_of_anchors, nms_iou_threshold_in_mining, num_hard_patches_per_image) # hard positives for classification hard_pos = nms_masked_and_collect_data(pos_mask_of_anchors, crops, losses_of_anchors, nms_iou_threshold_in_mining, num_hard_patches_per_image) # hard positives for localization hard_pos_loc = nms_masked_and_collect_data(pos_loc_mask_of_anchors, crops, losses_loc_of_anchors, nms_iou_threshold_in_mining, num_hard_patches_per_image) # merge all together def standardize(v): return v.item() if type(v) == torch.Tensor else v def add_item(data, role, pyramid_level, label_local, anchor_index, crop_position_xyxy, anchor_position_xyxy, transform_corners): new_item = OrderedDict() new_item["pyramid_level"] = standardize(pyramid_level) new_item["label_local"] = standardize(label_local) new_item["anchor_index"] = standardize(anchor_index) new_item["role"] = role new_item["crop_position_xyxy"] = crop_position_xyxy new_item["anchor_position_xyxy"] = anchor_position_xyxy new_item["transform_corners"] = transform_corners data.append(new_item) hardnegdata = [] for i in hard_negs: add_item(hardnegdata, "neg", pyramid_level_of_anchors[i], labels_of_anchors[i], anchor_indices[i], crops[i].cpu(), achors[i].cpu(), corners_of_anchors[i].cpu()) for i in hard_pos: add_item(hardnegdata, "pos", pyramid_level_of_anchors[i], labels_of_anchors[i], anchor_indices[i], crops[i].cpu(), achors[i].cpu(), corners_of_anchors[i].cpu()) for i in hard_pos_loc: add_item(hardnegdata, "pos_loc", pyramid_level_of_anchors[i], labels_of_anchors[i], anchor_indices[i], crops[i].cpu(), achors[i].cpu(), corners_of_anchors[i].cpu()) # extract loss values and compute the box positions to crop for a in hardnegdata: a["label_global"] = standardize(batch_class_ids[ a["label_local"] ]) a["loss"] = standardize(losses_per_anchor["cls_loss"][a["pyramid_level"]][i_image_in_batch, a["label_local"], a["anchor_index"]]) a["loss_loc"] = standardize(losses_per_anchor["loc_loss"][a["pyramid_level"]][i_image_in_batch, a["label_local"], a["anchor_index"]]) a["score"] = standardize(image_class_scores_pyramid[a["pyramid_level"]][a["label_local"], a["anchor_index"]]) a["image_id"] = standardize(image_id) hardnegdata_per_imageid[image_id] = hardnegdata if cfg.visualization.mining.show_mined_patches: visualizer.show_mined_patches(image_id, batch_class_ids, dataloader, hardnegdata) logger.info("Item time: {0}, since mining start: {1}".format(time_since(t_item_start), time_since(t_start_mining))) logger.info("Hard negative mining finished in {0}".format(time_since(t_start_mining))) return hardnegdata_per_imageid
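# Hedged usage sketch: the mined data is intended to be fed back into the training dataloader,
# as stated in the docstring above (the surrounding training-loop wiring is an assumption).
#
#   hardnegdata_per_imageid = mine_hard_patches(dataloader_train, net, cfg, criterion)
#   dataloader_train.set_hard_negative_data(hardnegdata_per_imageid)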
def _prepare_batch(self, image_ids, use_all_labels=False): batch_images = [] batch_class_images = [] batch_loc_targets = [] batch_class_targets = [] # flag to use hard neg mining use_mined_data = self.hardnegdata_per_imageid is not None # select which mined boxes to use if use_mined_data: # for half of the images select hard positives, for half - hard negatives # the order of images in a batch is random, so no need to randomize here batch_size = len(image_ids) num_neg_patches = batch_size // 2 role_to_select = ["neg"] * num_neg_patches + ["pos"] * ( batch_size - num_neg_patches) mined_data = {} for image_id, role in zip(image_ids, role_to_select): mined_data_for_image = self.hardnegdata_per_imageid[image_id] # filter for the correct role mined_data_for_image = [ d for d in mined_data_for_image if d["role"][:len(role)] == role ] if len(mined_data_for_image) == 0: mined_data_for_image = self.hardnegdata_per_imageid[ image_id] assert len( mined_data_for_image ) > 0, "Could not find mined {0} for image {1}".format( role, image_id) # select random element i_rand = torch.randint(len(mined_data_for_image), (1, ), dtype=torch.long).item() mined_data[image_id] = mined_data_for_image[i_rand] # self.logger.info("Image {0}, mined data: {1}".format(image_id, mined_data[image_id])) # collect labels for this batch batch_data = self.dataset.get_dataframe_for_image_ids(image_ids) if not use_all_labels: class_ids = batch_data["classid"].unique() # select labels for mined hardnegs if use_mined_data: # select labels that are compatible with mining mined_labels = [ mined_data[image_id]["label_global"] for image_id in mined_data ] else: mined_labels = [] # randomly prune label images if too many max_batch_labels = self.max_batch_labels if self.max_batch_labels is not None else class_ids.size + len( mined_labels) + 1 class_ids = np.unique(class_ids) np.random.shuffle(class_ids) class_ids = class_ids[:max_batch_labels - len(mined_labels)] class_ids = np.unique( np.concatenate((class_ids, np.array(mined_labels).astype( class_ids.dtype)), axis=0)) else: class_ids = self.dataset.get_class_ids() class_ids = sorted(list(class_ids)) # decide on batch level data augmentation if self.data_augmentation is not None: batch_vflip = random.random( ) < 0.5 if self.data_augmentation.batch_random_vflip else False batch_hflip = random.random( ) < 0.5 if self.data_augmentation.batch_random_hflip else False else: batch_vflip = False batch_hflip = False # prepare class images num_classes = len(class_ids) class_images, class_image_sizes = self.get_class_images_and_sizes( class_ids, do_augmentation=True) batch_class_images = [ self._transform_image_gt(img, hflip=batch_hflip, vflip=batch_vflip) for img in class_images ] # get the image sizes after resize in self._transform_image_gt, format - width, height class_image_sizes = [ FeatureMapSize(img=img) for img in batch_class_images ] # prepare images and boxes img_size = None batch_box_inverse_transform = [] batch_boxes = [] batch_img_size = [] for image_id in image_ids: # get annotation boxes = self.get_image_annotation_for_imageid(image_id) # convert global indices to local # if use_global_labels==False then local indices will be w.r.t. labels in this batch # if use_global_labels==True then local indices will be w.r.t. 
labels in the whole dataset (not class_ids) self.update_box_labels_to_local(boxes, class_ids) # prepare image and boxes: convert the image to a tensor and apply data augmentation; some boxes might be cut off the image image_mined_data = None if not use_mined_data else mined_data[ image_id] img, boxes, mask_cutoff_boxes, mask_difficult_boxes, box_inverse_transform = \ self._transform_image(image_id, boxes, hflip=batch_hflip, vflip=batch_vflip, mined_data=image_mined_data) # mask_difficult_boxes is set to True for boxes that are largely chopped off; those are not used for training if boxes.has_field("difficult"): old_difficult = boxes.get_field("difficult") boxes.add_field("difficult", old_difficult | mask_difficult_boxes) boxes.get_field("labels")[mask_cutoff_boxes] = -2 # visualize ground-truth images and boxes - to debug data augmentation if self.show_gt_boxes and self.data_augmentation is not None: visualizer.show_gt_boxes(image_id, boxes, class_ids, self, image_to_show=img) # check image size in this batch if img_size is None: img_size = FeatureMapSize(img=img) else: assert img_size == FeatureMapSize( img=img), "Images in a batch should be of the same size" loc_targets, class_targets = self.box_coder.encode( boxes, img_size, num_classes) batch_loc_targets.append(loc_targets) batch_class_targets.append(class_targets) batch_images.append(img) batch_box_inverse_transform.append([box_inverse_transform]) batch_boxes.append(boxes) batch_img_size.append(img_size) # stack data batch_images = torch.stack(batch_images, 0) batch_loc_targets = torch.stack(batch_loc_targets, 0) batch_class_targets = torch.stack(batch_class_targets, 0) return batch_images, batch_class_images, batch_loc_targets, batch_class_targets, class_ids, class_image_sizes, \ batch_box_inverse_transform, batch_boxes, batch_img_size
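# Standalone sketch (an assumption that mirrors the label-pruning step in _prepare_batch above;
# it is not called anywhere): batch class ids are shuffled, truncated so that together with the
# mined labels they stay within max_batch_labels, and then merged with the mined labels.
import numpy as np

def prune_class_ids(class_ids, mined_labels, max_batch_labels):
    class_ids = np.unique(np.asarray(class_ids))
    np.random.shuffle(class_ids)
    class_ids = class_ids[:max_batch_labels - len(mined_labels)]
    mined_labels = np.asarray(mined_labels, dtype=class_ids.dtype)
    return np.unique(np.concatenate((class_ids, mined_labels), axis=0))

# e.g. prune_class_ids([3, 7, 11, 42], mined_labels=[7], max_batch_labels=3) keeps at most two
# random batch ids plus the mined label 7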
def build_train_dataloader_from_config(cfg, box_coder, img_normalization, dataset_train=None, data_path="", logger_prefix="OS2D.train"): """Construct dataloaders to use for training. Args: cfg - config object, training is done on cfg.train.dataset_name dataset box_coder (Os2dBoxCoder) img_normalization (dict) - normalization to use, keys "mean" and "std" have lists of 3 floats each dataset_train (DatasetOneShotDetection) - one needs to provide either a dataset object or a data_path to create one from the config data_path (str) - root path to search for datasets logger_prefix (str) - prefix to add to the logger outputs Returns: dataloader_train (DataloaderOneShotDetection) - the dataloader for training datasets_train_subset_for_eval (list of DatasetOneShotDetection) - subsets of the training set to pass to evaluation dataloaders """ if dataset_train is None: assert data_path, "If explicit dataset_train is not provided one needs to provide a data_path to create one" dataset_train = build_dataset_by_name( data_path, cfg.train.dataset_name, eval_scale=cfg.train.dataset_scale, cache_images=cfg.train.cache_images, no_image_reading=not cfg.train.do_training) logger = logging.getLogger(logger_prefix + ".dataloader") # create training dataloader random_crop_size = FeatureMapSize(w=cfg.train.augment.train_patch_width, h=cfg.train.augment.train_patch_height) evaluation_scale = dataset_train.eval_scale / dataset_train.image_size pyramid_scales_eval = cfg.eval.scales_of_image_pyramid pyramid_scales_eval = [p * evaluation_scale for p in pyramid_scales_eval] dataloader_train = DataloaderOneShotDetection( dataset=dataset_train, box_coder=box_coder, batch_size=cfg.train.batch_size, class_batch_size=cfg.train.class_batch_size, img_normalization=img_normalization, random_flip_batches=cfg.train.augment.random_flip_batches, random_crop_size=random_crop_size, random_crop_scale=evaluation_scale, jitter_aspect_ratio=cfg.train.augment.jitter_aspect_ratio, scale_jitter=cfg.train.augment.scale_jitter, min_box_coverage=cfg.train.augment.min_box_coverage, random_color_distortion=cfg.train.augment.random_color_distortion, random_crop_class_images=cfg.train.augment.random_crop_class_images, gt_image_size=cfg.model.class_image_size, pyramid_scales_eval=pyramid_scales_eval, do_augmentation=True, mine_extra_class_images=cfg.train.augment.mine_extra_class_images, show_gt_boxes=cfg.visualization.train.show_gt_boxes_dataloader, logger_prefix=logger_prefix) if cfg.eval.train_subset_for_eval_size > 0: logger.info( "Creating sub-training set of size {0} for evaluation".format( cfg.eval.train_subset_for_eval_size)) datasets_train_subset_for_eval = [ dataset_train.copy_subset(cfg.eval.train_subset_for_eval_size) ] else: datasets_train_subset_for_eval = [] return dataloader_train, datasets_train_subset_for_eval
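# Hedged usage sketch (the box_coder construction and the normalization statistics below are
# assumptions for illustration, not values defined in this file):
#
#   img_normalization = {"mean": [0.485, 0.456, 0.406], "std": [0.229, 0.224, 0.225]}
#   dataloader_train, datasets_train_subset_for_eval = \
#       build_train_dataloader_from_config(cfg, box_coder, img_normalization,
#                                          data_path="path/to/datasets")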
def evaluate_detections(self, all_boxes, output_dir, mAP_iou_threshold=0.5): predictions = [] gt_boxes = [] roidb = self.roidb for i_image, roi in enumerate(roidb): image_size = FeatureMapSize(w=roi["width"], h=roi["height"]) if roi["boxes"].size > 0: roi_gt_boxes = BoxList(roi["boxes"], image_size, mode="xyxy") else: roi_gt_boxes = BoxList.create_empty(image_size) roi_gt_boxes.add_field( "labels", torch.as_tensor(roi["gt_classes"], dtype=torch.int32)) roi_gt_boxes.add_field( "difficult", torch.as_tensor(roi["gt_ishard"], dtype=torch.int32)) gt_boxes.append(roi_gt_boxes) roi_detections = [] for i_class, class_boxes in enumerate(all_boxes): assert len(class_boxes) == len(roidb), \ "Number of detections for class {0}, image {1} ({2}) is inconsistent with the length of roidb ({3})".format(i_class, i_image, len(class_boxes), len(roidb)) boxes = class_boxes[i_image] if len(boxes) > 0: assert boxes.shape[ 1] == 5, "Detections should be of shape (:,5), but are {0} for class {1}, image {2}".format( boxes.shape, i_class, i_image) bbox = BoxList(boxes[:, :4], image_size, mode="xyxy") scores = boxes[:, -1] bbox.add_field( "scores", torch.as_tensor(scores, dtype=torch.float32)) bbox.add_field( "labels", torch.full(scores.shape, i_class, dtype=torch.int32)) roi_detections.append(bbox) if roi_detections: roi_detections = cat_boxlist(roi_detections) else: roi_detections = BoxList.create_empty(image_size) roi_detections.add_field( "scores", torch.zeros((0, ), dtype=torch.float32)) roi_detections.add_field("labels", torch.zeros((0, ), dtype=torch.int32)) predictions.append(roi_detections) if False: self.visualize_detections(i_image, gt=roi_gt_boxes, dets=roi_detections) ap_data = do_voc_evaluation(predictions, gt_boxes, iou_thresh=mAP_iou_threshold, use_07_metric=False) print("mAP@{:0.2f}: {:0.4f}".format(mAP_iou_threshold, ap_data["map"])) print("mAPw@{:0.2f}: {:0.4f}".format(mAP_iou_threshold, ap_data["map_weighted"])) print("recall@{:0.2f}: {:0.4f}".format(mAP_iou_threshold, ap_data["recall"])) return ap_data['map']
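# Hedged illustration of the all_boxes layout that evaluate_detections() expects: for each class
# and image, an (N, 5) array with rows [x1, y1, x2, y2, score] (the concrete numbers below are
# made up for exposition).
import numpy as np

num_classes, num_images = 3, 2
all_boxes = [[np.zeros((0, 5), dtype=np.float32) for _ in range(num_images)]
             for _ in range(num_classes)]
all_boxes[1][0] = np.array([[10.0, 20.0, 50.0, 80.0, 0.9]], dtype=np.float32)
# mean_ap = dataset.evaluate_detections(all_boxes, output_dir=None)  # `dataset` is assumed to be
# an instance of the class that defines evaluate_detections above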