def get_rec_field_and_stride_after_concat_nets(receptive_field_netA,
                                                   stride_netA,
                                                   receptive_field_netB,
                                                   stride_netB):
        """Compute the receptive field and stride of the stacked network net(x) = netB(netA(x)).

        Args:
            receptive_field_netA, stride_netA - receptive field and stride of the inner network netA
            receptive_field_netB, stride_netB - receptive field and stride of the outer network netB
            All four inputs are either plain numbers or FeatureMapSize objects;
            in the latter case the width and height are processed independently.
        Output:
            (rec_field, stride) of the combined network, of the same kind as the inputs
        """
        if not isinstance(receptive_field_netA, FeatureMapSize):
            # scalar case: one step of netB covers stride_netA steps of the input of netA
            widening = stride_netA * (receptive_field_netB - 1)
            combined_rec_field = widening + receptive_field_netA
            combined_stride = stride_netA * stride_netB
            return combined_rec_field, combined_stride

        # FeatureMapSize case: the remaining three inputs have to be FeatureMapSize as well
        assert all(
            isinstance(value, FeatureMapSize)
            for value in (stride_netA, receptive_field_netB, stride_netB)
        ), "All inputs should be either of type FeatureMapSize or int"

        # run the scalar computation separately along the width and along the height
        compose = Os2dHeadCreator.get_rec_field_and_stride_after_concat_nets
        rec_field_w, stride_w = compose(receptive_field_netA.w, stride_netA.w,
                                        receptive_field_netB.w, stride_netB.w)
        rec_field_h, stride_h = compose(receptive_field_netA.h, stride_netA.h,
                                        receptive_field_netB.h, stride_netB.h)

        combined_rec_field = FeatureMapSize(w=rec_field_w, h=rec_field_h)
        combined_stride = FeatureMapSize(w=stride_w, h=stride_h)
        return combined_rec_field, combined_stride
    def __init__(self, do_simple_affine, is_cuda, use_inverse_geom_model):
        """Create the alignment module together with its transformation-parameter regressor.

        Args:
            do_simple_affine - if truthy, the "simple_affine" model (4 regressed parameters) is used,
                otherwise the "affine" model (6 regressed parameters)
            is_cuda - passed to TransformationNet as use_cuda
            use_inverse_geom_model - stored unchanged in self.use_inverse_geom_model
        """
        super(Os2dAlignment, self).__init__()

        # the model type is either "affine" or "simple_affine"
        self.model_type = "simple_affine" if do_simple_affine else "affine"
        self.use_inverse_geom_model = use_inverse_geom_model

        # the regression network outputs one number per transformation parameter
        if self.model_type == "simple_affine":
            transform_net_output_dim = 4
        elif self.model_type == "affine":
            transform_net_output_dim = 6
        else:
            raise RuntimeError(
                "Unknown transformation model \"{0}\"".format(self.model_type))

        # The four sizes below mean different things, but the architecture of the regression
        # network ties them together; following the weakalign code they are built around 15.
        # NOTE: non-square grids should work in principle, but were never tested, so expect bugs
        self.out_grid_size = FeatureMapSize(w=15, h=15)
        self.reference_feature_map_size = FeatureMapSize(w=15, h=15)
        self.network_stride = FeatureMapSize(w=1, h=1)
        self.network_receptive_field = FeatureMapSize(w=15, h=15)

        # one input channel of the regressor per position of the reference feature map
        reference_size = self.reference_feature_map_size
        self.input_feature_dim = reference_size.w * reference_size.h

        # only batch normalization is used for the regressor here
        self.parameter_regressor = TransformationNet(
            output_dim=transform_net_output_dim,
            use_cuda=is_cuda,
            normalization='batchnorm',
            kernel_sizes=[7, 5],
            channels=[128, 64],
            input_feature_dim=self.input_feature_dim)
Example #3
0
def resnet101_c4(use_group_norm=False):
    """
    Build the ResNet101 C4 feature extractor.
    Args:
        use_group_norm (bool) - if True use torch.nn.GroupNorm with GROUPNORM_NUMGROUPS groups as normalization layers,
            otherwise use torch.nn.BatchNorm2d
    Output:
        whatever _resnet_fe returns for resnet101 truncated at level 4
    """
    # both the stride and the receptive field are declared as 16 x 16 for this extractor
    c4_stride = FeatureMapSize(h=16, w=16)
    c4_receptive_field = FeatureMapSize(h=16, w=16)
    return _resnet_fe(resnet101,
                      4,
                      use_group_norm,
                      feature_map_stride=c4_stride,
                      feature_map_receptive_field=c4_receptive_field)
Example #4
0
 def get_class_images_and_sizes(self, class_ids, do_augmentation=False):
     """Collect one image per requested class together with the image sizes.

     Args:
         class_ids - iterable of class ids to fetch images for
         do_augmentation - if truthy and self.mine_extra_class_images is set, the image of a class
             is drawn uniformly at random from the original image and the mined images of that class
     Output:
         class_images (list) - one image per class id, in the order of class_ids
         class_image_sizes (list of FeatureMapSize) - sizes of these images
     """
     sample_from_mined = self.mine_extra_class_images and do_augmentation
     if not sample_from_mined:
         # plain mode: always the original ground-truth image of the class
         class_images = [
             self.dataset.gt_images_per_classid[class_id]
             for class_id in class_ids
         ]
     else:
         class_images = []
         for class_id in class_ids:
             if class_id not in self.label_image_collection:
                 # nothing was mined for this class - fall back to the original image
                 class_images.append(
                     self.dataset.gt_images_per_classid[class_id])
                 continue

             mined_images = self.label_image_collection[class_id]
             # draw from {0, ..., num_mined}: 0 selects the original image, k > 0 the (k-1)-th mined one
             draw = torch.randint(len(mined_images) + 1, (1, ),
                                  dtype=torch.long)
             if draw == 0:
                 chosen_image = self.dataset.gt_images_per_classid[class_id]
             else:
                 chosen_image = mined_images[draw - 1]
             class_images.append(chosen_image)

     class_image_sizes = [
         FeatureMapSize(img=class_image) for class_image in class_images
     ]
     return class_images, class_image_sizes
Example #5
0
    def detect(self, target, source) -> Tuple[BoxList, torch.Tensor]:
        """Detect the object shown in target inside the source image.

        Both inputs are passed through self._preprocess, the network is run without gradients,
        and the decoded detections are filtered by self.score_threshold and truncated
        to the self.max_detections best-scoring ones.

        Output:
            boxes - kept boxes in xyxy format, divided by the width/height of the preprocessed source
            scores - scores of the kept boxes, sorted in ascending order
            Both outputs are None when no detection is above the score threshold.
        """
        target = self._preprocess(target, cfg.model.class_image_size)
        source = self._preprocess(source, self.source_img_size)

        with torch.no_grad():
            loc_batch, cls_batch, _, _, corners_batch = \
                self.net(images=source, class_images=target)

        # the decoder works on pyramids: wrap the first batch element into one-level pyramids
        detections = self.box_coder.decode_pyramid(
            [loc_batch[0]], [cls_batch[0]],
            [FeatureMapSize(img=source)], [0],
            nms_iou_threshold=cfg.eval.nms_iou_threshold,
            nms_score_threshold=cfg.eval.nms_score_threshold,
            transform_corners_pyramid=[corners_batch[0]])
        detections.remove_field("default_boxes")

        all_scores = detections.get_field("scores")
        keep = torch.nonzero(all_scores.float() > self.score_threshold).view(-1)
        if keep.numel() == 0:
            return None, None

        # ascending sort: the tail of the ordering holds the highest scores
        _, ascending_order = all_scores[keep].sort(descending=False)
        keep = keep[ascending_order[-self.max_detections:]]

        boxes_xyxy = detections[keep].cpu().bbox_xyxy
        kept_scores = all_scores[keep].cpu()

        # rescale the coordinates in place by the size of the preprocessed source
        boxes_xyxy[:, [0, 2]] /= source.shape[3]
        boxes_xyxy[:, [1, 3]] /= source.shape[2]
        return boxes_xyxy, kept_scores
def get_feature_map_size_for_network(img_size, net, is_cuda=False):
    """Find the size of the feature map a network produces for an image of the given size.

    The size is measured rather than derived: a dummy all-zero image is created and the network
    is simply run on it. This is robust, but can be slow, so callers should cache the result.
    Args:
        img_size (FeatureMapSize) - size of the input image
        net - the net to run
        is_cuda (bool) - if True the dummy image is moved to a GPU before running the net
    Output:
        feature_map_size (FeatureMapSize) - the size of the feature map
    """
    # batch_size, num_channels, height, width
    probe_shape = (1, 3, img_size.h, img_size.w)
    probe_image = torch.zeros(*probe_shape)
    if is_cuda:
        probe_image = probe_image.cuda()

    # only the output size matters, so gradients are not needed
    with torch.no_grad():
        probe_feature_maps = net(probe_image)
        measured_size = FeatureMapSize(img=probe_feature_maps)

    if is_cuda:
        torch.cuda.empty_cache()

    return measured_size
Example #7
0
 def get_the_boxes(image_filename):
     """Read the annotated boxes of an image and keep the ones matching the current query.

     Uses names from the enclosing scope that are not defined in this function:
     image_path, query_image_path_original and the box/image reading helpers.

     From the INSTRE documentation: for each tuple-class in INSTRE-M there are two corresponding
     object classes in INSTRE-S1. In each annotation file of an INSTRE-M image the first line
     records the object labeled [a] in INSTRE-S1 and the second line the object labeled [b].

     CAUTION! the matlab file has boxes in x1, y1, x2, y2, but the .txt files in x, y, w, h

     Args:
         image_filename - path of the image; its first "/"-separated component names the subset
     Output:
         list of boxes converted with convert_the_box_from_xywh
     """
     annotation_file = os.path.join(
         image_path, get_box_file_for_image_file(image_filename))
     boxes = read_boxes_from(annotation_file)

     # the image is loaded only for its size, which the box conversion needs
     img = read_image(os.path.join(image_path, image_filename))
     imsize = FeatureMapSize(img=img)

     query_parts = query_image_path_original.split("/")
     query_subset = query_parts[0].lower()
     image_subset = image_filename.split("/")[0].lower()

     if query_subset == "instre-s1" and image_subset == "instre-m":
         # S1 query against an M image: pick the line of the matching object ([a] or [b])
         assert len(
             boxes
         ) == 2, f"INSTRE-M images should have exactly two boxes, but have {boxes}"
         object_letter = query_parts[1][2]
         assert object_letter in ["a", "b"]
         i_box = 0 if object_letter == "a" else 1
         return [convert_the_box_from_xywh(boxes[i_box], imsize)]

     if query_subset == image_subset and query_subset in ("instre-s1", "instre-s2"):
         # query and image come from the same single-object subset: keep every box
         return [convert_the_box_from_xywh(box, imsize) for box in boxes]

     raise RuntimeError(
         f"Should not be happening, query {query_image_path_original}, image {image_filename}, boxes {boxes}"
     )
Example #8
0
    def read_annotation(xml_file: str):
        """Parse one XML annotation file with "object" entries.

        Args:
            xml_file (str) - path to the XML file
        Output:
            filename - text of the "filename" tag
            bboxes (list) - one [xmin, ymin, xmax, ymax] list per object
                (all four are None for an object without a "bndbox" tag)
            class_ids (list) - text of the "name" tag of each object
            difficult_flags (list of int) - value of the "difficult" tag of each object
            im_size (FeatureMapSize) - image size read from the "size" tag
        """
        root = ElementTree.parse(xml_file).getroot()

        filename = root.find('filename').text
        size_node = root.find("size")
        width = int(size_node.find("width").text)
        height = int(size_node.find("height").text)
        im_size = FeatureMapSize(h=height, w=width)

        bboxes, class_ids, difficult_flags = [], [], []
        for object_node in root.iter("object"):
            difficult_flag = int(object_node.find("difficult").text)
            class_id = object_node.find("name").text

            xmin = ymin = xmax = ymax = None
            for box_node in object_node.findall("bndbox"):
                # each object is expected to carry at most one bounding box
                assert ymin is None
                ymin = int(box_node.find("ymin").text)
                xmin = int(box_node.find("xmin").text)
                ymax = int(box_node.find("ymax").text)
                xmax = int(box_node.find("xmax").text)

            bboxes.append([xmin, ymin, xmax, ymax])
            difficult_flags.append(difficult_flag)
            class_ids.append(class_id)

        return filename, bboxes, class_ids, difficult_flags, im_size
Example #9
0
def decode_scores_show_detections(dataloader, images, class_ids, class_scores,
                                  loc_scores, corners):
    """Decode the network outputs of every image in a batch and display the detections.

    Args:
        dataloader - provides box_coder (for decoding) and unnorm_image (for display)
        images - batch of images, indexed along the first dimension
        class_ids - class ids passed to the decoder and to the visualization
        class_scores, loc_scores, corners - per-image network outputs, indexed like images
    """
    for i_image in range(images.size(0)):
        image = images[i_image]

        # the decoder expects pyramids: build one-level pyramids for this image
        detections = dataloader.box_coder.decode_pyramid(
            [loc_scores[i_image]],
            [class_scores[i_image]],
            [FeatureMapSize(img=image)],
            class_ids,
            nms_iou_threshold=cfg.eval.nms_iou_threshold,
            nms_score_threshold=cfg.eval.nms_score_threshold,
            transform_corners_pyramid=[corners[i_image]])

        show_annotated_image(
            img=dataloader.unnorm_image(image),
            boxes=detections,
            labels=detections.get_field("labels"),
            scores=detections.get_field("scores"),
            default_boxes=detections.get_field("default_boxes"),
            transform_corners=detections.get_field("transform_corners"),
            class_ids=class_ids,
            score_threshold=cfg.visualization.train.score_threshold,
            max_dets=cfg.visualization.train.max_detections,
            showfig=True)
Example #10
0
    def _transform_image_gt(self,
                            img,
                            do_augmentation=True,
                            hflip=False,
                            vflip=False,
                            do_resize=True):
        """Prepare a class (label) image: flips, optional augmentation, resize, conversion to a tensor.

        Args:
            img - image to transform
            do_augmentation - enables color distortion and random crop; has no effect
                when self.data_augmentation is None
            hflip, vflip - flags forwarded to transforms_boxes.transpose
            do_resize - if truthy, the image is resized according to self.gt_image_size
                while preserving the aspect ratio
        Output:
            the result of ToTensor (followed by Normalize if self.img_normalization is set)
        """
        augment = do_augmentation and self.data_augmentation is not None

        # batch level data augmentation; there are no boxes to track for label images
        img, _ = transforms_boxes.transpose(img,
                                            hflip=hflip,
                                            vflip=vflip,
                                            boxes=None,
                                            transform_list=None)

        if augment:
            # color distortion first, random crop second
            img = self.data_augmentation.random_distort(img)
            img = self.data_augmentation.random_crop_label_image(img)

        if do_resize:
            if augment:
                random_interpolation = self.data_augmentation.random_interpolation
            else:
                random_interpolation = False

            # the target size preserves the aspect ratio of the current image
            current_size = FeatureMapSize(img=img)
            new_h, new_w = get_image_size_after_resize_preserving_aspect_ratio(
                h=current_size.h, w=current_size.w, target_size=self.gt_image_size)

            img, _ = transforms_boxes.resize(
                img,
                target_size=FeatureMapSize(w=new_w, h=new_h),
                random_interpolation=random_interpolation)

        # conversion to a tensor, optionally followed by normalization
        to_tensor_steps = [transforms.ToTensor()]
        if self.img_normalization is not None:
            to_tensor_steps.append(
                transforms.Normalize(self.img_normalization["mean"],
                                     self.img_normalization["std"]))
        return transforms.Compose(to_tensor_steps)(img)
Example #11
0
def get_image_sizes(dataset):
    """Load every image referenced by dataset.gtboxframe and record its size.

    Args:
        dataset - object with image_path, gtboxframe (with columns "imageid" and "imagefilename")
            and the method _get_dataset_image_by_id
    Output:
        image_sizes_by_id (OrderedDict) - maps image id to the FeatureMapSize of the image
    """
    print("Reading images from {}".format(dataset.image_path))

    # one row per distinct (imageid, imagefilename) pair
    unique_images = dataset.gtboxframe.groupby(
        ["imageid", "imagefilename"]).size().reset_index()

    image_sizes_by_id = OrderedDict()
    for _, row in tqdm(unique_images.iterrows()):
        image_id = row["imageid"]
        image = dataset._get_dataset_image_by_id(image_id)
        image_sizes_by_id[image_id] = FeatureMapSize(img=image)

    print("Found {} images".format(len(image_sizes_by_id)))
    return image_sizes_by_id
Example #12
0
def convert_boxlist_maskrcnn_to_os2d(boxlist_maskrcnn):
    """Convert a maskrcnn box list into a BoxList_os2d, carrying over all extra fields.

    Args:
        boxlist_maskrcnn - source box list; its size is read as (width, height)
    Output:
        boxlist (BoxList_os2d) - boxes in "xyxy" mode with the same fields as the source
    """
    source_size = FeatureMapSize(w=boxlist_maskrcnn.size[0],
                                 h=boxlist_maskrcnn.size[1])
    xyxy_boxes = boxlist_maskrcnn.convert("xyxy").bbox
    boxlist = BoxList_os2d(xyxy_boxes, source_size, mode="xyxy")

    # copy the extra fields one by one
    for field_name in boxlist_maskrcnn.fields():
        field_value = boxlist_maskrcnn.get_field(field_name)
        boxlist.add_field(field_name, field_value)
    return boxlist
Example #13
0
    def _read_dataset_images(self):
        """Reset the per-image caches and register the path and size of every distinct image id.

        Each image is loaded once through self._get_dataset_image_by_id to measure its size
        (that method decides whether the loaded image stays in self.image_per_image_id).
        """
        # create caches
        self.image_path_per_image_id = OrderedDict()
        self.image_size_per_image_id = OrderedDict()
        self.image_per_image_id = OrderedDict()

        for image_id, image_file in zip(self.image_ids, self.image_file_names):
            if image_id in self.image_path_per_image_id:
                # this image was already registered
                continue

            # the path has to be stored first: the image getter looks it up
            self.image_path_per_image_id[image_id] = os.path.join(
                self.image_path, image_file)

            # get image size (needed for bucketing)
            image = self._get_dataset_image_by_id(image_id)
            self.image_size_per_image_id[image_id] = FeatureMapSize(img=image)

        action = "Read" if self.cache_images else "Found"
        num_images = len(self.image_path_per_image_id)
        self.logger.info("{1} {0} data images".format(num_images, action))
Example #14
0
    def _get_dataset_image_by_id(self, image_id):
        """Return the image with the given id, loading and resizing it when it is not cached.

        A freshly loaded image whose larger side differs from self.image_size is resized
        (preserving the aspect ratio). The image is stored in self.image_per_image_id
        only when self.cache_images is set.

        Args:
            image_id - id of the image; has to be registered in self.image_path_per_image_id
        Output:
            img - the (possibly resized) image
        """
        assert image_id in self.image_path_per_image_id, "Can work only with checked images"

        if image_id in self.image_per_image_id:
            # cache hit: the stored image was already resized when it was first loaded
            return self.image_per_image_id[image_id]

        img = read_image(self.image_path_per_image_id[image_id])
        img_size = FeatureMapSize(img=img)
        if max(img_size.w, img_size.h) != self.image_size:
            # resize images in case they were not of the correct size on disk
            h, w = get_image_size_after_resize_preserving_aspect_ratio(img_size.h, img_size.w, self.image_size)
            # Image.ANTIALIAS was an alias of Image.LANCZOS and was removed in Pillow 10;
            # Image.LANCZOS selects the same filter and exists in old and new Pillow versions
            img = img.resize((w, h), resample=Image.LANCZOS)

        if self.cache_images:
            self.image_per_image_id[image_id] = img
        return img
    def forward(self,
                images=None,
                class_images=None,
                feature_maps=None,
                class_head=None,
                train_mode=False,
                fine_tune_features=True):
        """ Forward pass of the OS2D model. Can function in several different regimes:
            [training mode] Extracts features from input and class images, and applies the model to get
                classification/localization scores of all classes on all images
                Args:
                    images (tensor) - batch of input images
                    class_images (list of tensors) - list of class images (possibly of different sizes)
                    train_mode (bool) - should be True
                    fine_tune_features (bool) - flag showing whether to enable gradients over features
            [evaluation mode] Works on pre-computed inputs
                Args:
                    feature_maps (tensor) - pre-extracted feature maps, sized batch_size x feature_dim x height x width
                    class_head (Os2dHead) - head created to detect some classes,
                        inside has class_feature_maps, sized class_batch_size x feature_dim x class_height x class_width
                    train_mode (bool) - should be False
            Whatever is not provided (feature_maps or class_head) is computed from images / class_images,
            which then must not be None.
        Outputs:
            loc_scores (tensor) - localization prediction, sized batch_size x num_classes x 4 x num_anchors (bbox parameterization)
            class_scores (tensor) - classification prediction, sized batch_size x num_classes x num_anchors
            class_scores_transform_detached (tensor) - same, but with transforms detached from the computational graph
                used not to tune transformation on the negative examples
            fm_sizes (FeatureMapSize) - size of the output score map, num_anchors == fm_sizes.w * fm_sizes.h
            transform_corners (tensor) - points defining parallelograms showing transformations, sized batch_size x num_classes x 8 x num_anchors
        """
        # gradients through the feature extractors are needed only when fine-tuning them in training
        with torch.set_grad_enabled(train_mode and fine_tune_features):
            if feature_maps is None:
                # extract features of the input images
                assert images is not None, "If feature_maps is None than images cannot be None"
                feature_maps = self.net_feature_maps(images)

            if class_head is None:
                # extract features of the class images and build the head out of them
                assert class_images is not None, "If class_conv_layer is None than class_images cannot be None"
                label_feature_maps = self.net_label_features(class_images)
                class_head = self.os2d_head_creator.create_os2d_head(
                    label_feature_maps)

        # apply the class head to the feature maps
        head_outputs = self.apply_class_heads_to_feature_maps(feature_maps, class_head)
        loc_scores, class_scores, class_scores_transform_detached, transform_corners = head_outputs

        fm_size = FeatureMapSize(img=feature_maps)
        return loc_scores, class_scores, class_scores_transform_detached, fm_size, transform_corners
def save_cropped_boxes(dataset, tgt_image_path, extension=".jpg", num_random_crops_per_image=0):
    """Crop all annotated boxes (plus optional random boxes) out of the dataset images and save them.

    Args:
        dataset - provides image_ids, _get_dataset_image_by_id, get_image_annotation_for_imageid
            and get_eval_scale
        tgt_image_path - prefix passed to cid2filename to build the names of the saved crops
        extension - appended to the file names; if empty the crops are saved as jpeg without extension
        num_random_crops_per_image - number of random boxes sampled per image (too small ones are dropped)
    Output:
        db (dict of lists) - one entry per saved crop in each of the lists
            "cids", "cluster", "gtbboxid", "classid", "imageid", "difficult", "type", "size", "bbox"
    """
    db_keys = ("cids", "cluster", "gtbboxid", "classid", "imageid", "difficult", "type", "size", "bbox")
    db = {key: [] for key in db_keys}

    for image_id in tqdm(dataset.image_ids):
        img = dataset._get_dataset_image_by_id(image_id)
        boxes = dataset.get_image_annotation_for_imageid(image_id)

        assert boxes.has_field("labels"), "GT boxes need a field 'labels'"
        # only the fields "labels" and "difficult" are kept
        for field_name in boxes.fields():
            if field_name not in ["labels", "difficult"]:
                boxes.remove_field(field_name)
        if not boxes.has_field("difficult"):
            boxes.add_field("difficult", torch.zeros(len(boxes), dtype=torch.bool))

        num_gt_boxes = len(boxes)
        im_size = FeatureMapSize(img=img)
        assert im_size == boxes.image_size

        eval_scale = dataset.get_eval_scale()

        if num_random_crops_per_image > 0:
            # sample random corner pairs in relative coordinates and order them into valid boxes
            corners = torch.rand(num_random_crops_per_image, 4)
            x1 = torch.min(corners[:, 0], corners[:, 2]) * im_size.w
            x2 = torch.max(corners[:, 0], corners[:, 2]) * im_size.w
            y1 = torch.min(corners[:, 1], corners[:, 3]) * im_size.h
            y2 = torch.max(corners[:, 1], corners[:, 3]) * im_size.h
            random_xyxy = torch.stack([x1, y1, x2, y2], 1).floor()

            # drop the boxes that are too small along either of the two dimensions
            min_size = 10.0 / eval_scale * max(im_size.w, im_size.h)
            wide_enough = random_xyxy[:, 0] + min_size <= random_xyxy[:, 2]
            tall_enough = random_xyxy[:, 1] + min_size <= random_xyxy[:, 3]
            keep_ids = torch.nonzero(wide_enough & tall_enough).view(-1)
            random_xyxy = random_xyxy[keep_ids]

            boxes_random = BoxList(random_xyxy, im_size, mode="xyxy")
            boxes_random.add_field("labels", torch.full([len(boxes_random)], -1, dtype=torch.long))
            boxes_random.add_field("difficult", torch.zeros(len(boxes_random), dtype=torch.bool))
            # the random boxes go after the GT boxes, so the box index tells the two kinds apart
            boxes = cat_boxlist([boxes, boxes_random])

        if boxes is None:
            continue

        for i_box in range(len(boxes)):
            one_box = boxes[i_box]
            # box format: left, top, right, bottom
            box = [coord.item() for coord in one_box.bbox_xyxy.view(-1)]
            cropped_img = img.crop(box)

            if i_box < num_gt_boxes:
                lbl = one_box.get_field("labels").item()
                dif_flag = one_box.get_field("difficult").item()
                box_type, db_type = "GT", "gtproposal"
            else:
                lbl, dif_flag = -1, 0
                box_type, db_type = "RN", "randomcrop"
            box_id = i_box

            # create the file name to be used with cirtorch.datasets.datahelpers.cid2filename and their dataloader
            cid = "box{box_id:05d}_lbl{label:05d}_dif{dif:01d}_im{image_id:05d}{box_type}".format(
                box_id=box_id, image_id=image_id, label=lbl, dif=dif_flag, box_type=box_type)
            file_name = cid2filename(cid, prefix=tgt_image_path)

            # save the image, creating the folder for it first
            image_path, _ = os.path.split(file_name)
            mkdir(image_path)
            if extension:
                cropped_img.save("{}{}".format(file_name, extension))
            else:
                # cirtorch uses files with empty extension for training for some reason, need to support that
                cropped_img.save("{}".format(file_name), format="jpeg")

            # add to the db structure
            db["cids"].append(cid)
            db["cluster"].append(lbl)  # use labels as clusters not to sample negatives from the same object
            db["classid"].append(lbl)
            db["gtbboxid"].append(box_id)
            db["imageid"].append(image_id)
            db["difficult"].append(dif_flag)
            db["type"].append(db_type)
            db["size"].append(cropped_img.size)
            db["bbox"].append(box)  # format (x1,y1,x2,y2)

    return db
Example #17
0
def build_imagenet_test_episodes(subset_name, data_path, logger):
    """Build (or load from the cached CSV) the annotation table of one ImageNet-RepMet test episode.

    The RepMet roidb and episode pickles are always loaded. If the CSV file of the episode
    does not exist yet, the query (class) images are cropped and saved, the annotation table
    is assembled from the roidb and written to the CSV. The table is then read back from the CSV.

    Args:
        subset_name (str) - name whose last "-"-separated component is the integer episode id
        data_path (str) - folder containing the "ImageNet-RepMet" dataset folder
        logger - logger, its info method is used for progress and warning messages
    Output:
        gtboxframe (pandas.DataFrame) - the annotation table as read from the CSV file
        gt_image_path (str) - folder with the cropped class images
        dataset_image_path (str) - folder with the dataset images
        image_size (int) - image size associated with this dataset (1000)
    Raises:
        RuntimeError - if a query image of the episode is not present in the roidb,
            or if a roidb entry visited while searching for it is marked as flipped
    """
    episode_id = int(subset_name.split('-')[-1])
    epi_data_name = "epi_inloc_in_domain_1_5_10_500"
    image_size = 1000

    dataset_path = os.path.join(data_path, "ImageNet-RepMet")
    repmet_data_path = os.path.join(dataset_path, "RepMet_CVPR2019_data",
                                    "data", "Imagenet_LOC")

    # NOTE(security): pickle.load can execute arbitrary code stored in the file,
    # so these files must come from a trusted copy of the RepMet data
    roidb_path = os.path.join(repmet_data_path, "voc_inloc_roidb.pkl")
    with open(roidb_path, 'rb') as fid:
        roidb = pickle.load(fid, encoding='latin1')
    episodes_path = os.path.join(repmet_data_path, "episodes",
                                 f"{epi_data_name}.pkl")
    with open(episodes_path, 'rb') as fid:
        episode_data = pickle.load(fid, encoding='latin1')

    logger.info(f"Extracting episode {episode_id} out of {len(episode_data)}")
    episode = episode_data[episode_id]

    dataset_image_path = os.path.join(dataset_path, "ILSVRC")

    # the pickles store absolute paths of the machine they were created on
    SWAP_IMG_PATH_SRC = "/dccstor/leonidka1/data/imagenet/ILSVRC/"

    def _get_image_path(image_path):
        # strip the machine-specific prefix to get a path relative to dataset_image_path
        return image_path.replace(SWAP_IMG_PATH_SRC, "")

    # episode["epi_cats"] - list of class ids
    # episode["query_images"] - list of path to the episode images
    # episode["epi_cats_names"] - list of names of the episode classes
    # episode["train_boxes"] - list of box data about class boxes

    gt_path = os.path.join(dataset_path, epi_data_name,
                           f"classes_episode_{episode_id}")
    gt_image_path = os.path.join(gt_path, "images")
    mkdir(gt_image_path)
    classdatafile = os.path.join(
        gt_path, f"classes_{epi_data_name}_episode_{episode_id}.csv")

    if not os.path.isfile(classdatafile):
        logger.info(
            f"Did not find data file {classdatafile}, creating it from the RepMet source data"
        )
        # create the annotation file from the raw dataset
        gtboxframe = []  # will be creating dataframe from a list of dicts

        # step 1: crop and save one query image per class (the first box listed for the class)
        gt_filename_by_id = {}
        for train_boxes_data in episode["train_boxes"]:
            class_id = train_boxes_data[0]
            assert class_id in episode[
                "epi_cats"], f"class_id={class_id} should be listed in episode['epi_cats']={episode['epi_cats']}"

            query_image_path_original = _get_image_path(train_boxes_data[2])
            query_bbox = train_boxes_data[3].flatten()

            classfilename = f"{class_id:05d}_{'_'.join(query_image_path_original.split('/'))}"

            if class_id in gt_filename_by_id:
                logger.info(
                    f"WARNING: class {class_id} has multiple entries in GT image {query_image_path_original}, using the first box as GT"
                )
                continue

            logger.info(
                f"Adding query #{len(gt_filename_by_id)} - {class_id}: {query_image_path_original}"
            )
            # the crop is always (re)created, overwriting a file left by an interrupted run
            query_img = read_image(
                os.path.join(dataset_image_path, query_image_path_original))
            query_img_cropped_box = query_img.crop(query_bbox)
            query_img_cropped_box.save(
                os.path.join(gt_image_path, classfilename))

            gt_filename_by_id[class_id] = classfilename

        for class_id in episode["epi_cats"]:
            if class_id not in gt_filename_by_id:
                logger.info(
                    f"WARNING: ground truth for class {class_id} not found in episode {episode_id}"
                )

        def convert_the_box_to_relative(box, imsize):
            # absolute xyxy coordinates -> coordinates relative to the image size
            lx = float(box[0]) / imsize.w
            ty = float(box[1]) / imsize.h
            rx = float(box[2]) / imsize.w
            by = float(box[3]) / imsize.h
            return lx, ty, rx, by

        def find_image_path_in_roidb(image_file_name, roidb):
            # linear search for the roidb entry of the image; returns its index or None
            for i_image, im_data in enumerate(roidb["roidb"]):
                if im_data["flipped"]:
                    raise RuntimeError(
                        f"Image {i_image} data {im_data} has flipped flag on")
                if im_data["image"] == image_file_name:
                    return i_image
            return None

        # step 2: one table row per box of an episode class in each query image
        for image_file_name in episode["query_images"]:
            # required_columns = ["imageid", "imagefilename", "classid", "classfilename", "gtbboxid", "difficult", "lx", "ty", "rx", "by"]
            image_id = find_image_path_in_roidb(image_file_name, roidb)
            if image_id is None:
                # fail with a clear message instead of a TypeError from indexing with None
                raise RuntimeError(
                    f"Query image {image_file_name} of episode {episode_id} was not found in the roidb {roidb_path}"
                )
            im_data = roidb["roidb"][image_id]
            image_file_name = _get_image_path(image_file_name)

            imsize = FeatureMapSize(w=int(im_data["width"]),
                                    h=int(im_data["height"]))

            boxes_xyxy = im_data["boxes"]
            classes = im_data["gt_classes"]

            for box, class_id in zip(boxes_xyxy, classes):
                if class_id not in gt_filename_by_id:
                    # the box belongs to a class without a query image in this episode
                    continue
                item = OrderedDict()
                item["imageid"] = int(image_id)
                item["imagefilename"] = image_file_name
                item["classid"] = int(class_id)
                item["classfilename"] = gt_filename_by_id[class_id]
                item["gtbboxid"] = len(gtboxframe)
                item["difficult"] = 0
                item["lx"], item["ty"], item["rx"], item[
                    "by"] = convert_the_box_to_relative(box, imsize)
                gtboxframe.append(item)

        gtboxframe = pd.DataFrame(gtboxframe)
        # NOTE: the index is written as well, so the table read back below has an extra unnamed column
        gtboxframe.to_csv(classdatafile)

    # always return what is stored on disk, so fresh and cached runs give the same table
    gtboxframe = pd.read_csv(classdatafile)

    return gtboxframe, gt_image_path, dataset_image_path, image_size
Example #18
0
    def _transform_image_to_pyramid(self,
                                    image_id,
                                    boxes=None,
                                    do_augmentation=True,
                                    hflip=False,
                                    vflip=False,
                                    pyramid_scales=(1, ),
                                    mined_data=None):
        """Load a dataset image, augment it together with its boxes and build an image pyramid.

        Args:
            image_id - id of the image, loaded with self._get_dataset_image_by_id
            boxes - boxes annotated on the image; an empty BoxList is created if None
            do_augmentation - enables random crop and color distortion; has no effect
                when self.data_augmentation is None
            hflip, vflip - flags forwarded to transforms_boxes.transpose
            pyramid_scales - scaling factors, one pyramid level is created per factor
            mined_data - if not None, its entry "crop_position_xyxy" defines the crop
                that is used instead of a random one
        Output:
            img_pyramid (list) - one image per pyramid level, after ToTensor (and Normalize
                if self.img_normalization is set)
            boxes_pyramid (list) - the boxes resized to each pyramid level
            mask_cutoff_boxes, mask_difficult_boxes - masks returned by the crop operation;
                all-False tensors with one entry per input box if no crop was done
            pyramid_box_inverse_transform (list) - one transform list per pyramid level
                (presumably used to map boxes back to the original image - TODO confirm)
        """
        img = self._get_dataset_image_by_id(image_id)
        img_size = FeatureMapSize(img=img)

        do_augmentation = do_augmentation and self.data_augmentation is not None
        num_pyramid_levels = len(pyramid_scales)

        use_mined_crop = mined_data is not None
        if use_mined_crop:
            crop_position = mined_data["crop_position_xyxy"]

        if boxes is None:
            boxes = BoxList.create_empty(img_size)
        # default masks, overwritten below if a crop is performed
        mask_cutoff_boxes = torch.zeros(len(boxes), dtype=torch.bool)
        mask_difficult_boxes = torch.zeros(len(boxes), dtype=torch.bool)

        # every box-changing operation below receives this list as transform_list
        box_inverse_transform = TransformList()
        # batch level data augmentation
        img, boxes = transforms_boxes.transpose(
            img,
            hflip=hflip,
            vflip=vflip,
            boxes=boxes,
            transform_list=box_inverse_transform)

        if use_mined_crop:
            # update crop_position_xyxy with the symmetries
            if hflip or vflip:
                # NOTE(review): img was already flipped by the call above; only the returned
                # boxes are used here - confirm the image passed in does not affect them
                _, crop_position = transforms_boxes.transpose(
                    img, hflip=hflip, vflip=vflip, boxes=crop_position)

        if do_augmentation:
            if self.data_augmentation.do_random_crop:
                # crop either at a random position or at the mined position
                if not use_mined_crop:
                    img, boxes, mask_cutoff_boxes, mask_difficult_boxes = \
                        self.data_augmentation.random_crop(img,
                                                           boxes=boxes,
                                                           transform_list=box_inverse_transform)
                else:
                    img, boxes, mask_cutoff_boxes, mask_difficult_boxes = \
                        self.data_augmentation.crop_image(img, crop_position,
                                                          boxes=boxes,
                                                          transform_list=box_inverse_transform)

                # bring the crop to the fixed size self.data_augmentation.random_crop_size
                img, boxes = transforms_boxes.resize(
                    img,
                    target_size=self.data_augmentation.random_crop_size,
                    random_interpolation=self.data_augmentation.
                    random_interpolation,
                    boxes=boxes,
                    transform_list=box_inverse_transform)

            # color distortion
            img = self.data_augmentation.random_distort(img)

        random_interpolation = self.data_augmentation.random_interpolation if do_augmentation else False
        # size of the image after all augmentations - the base for the pyramid levels
        img_size = FeatureMapSize(img=img)
        pyramid_sizes = [
            FeatureMapSize(w=int(img_size.w * s), h=int(img_size.h * s))
            for s in pyramid_scales
        ]
        img_pyramid = []
        boxes_pyramid = []
        pyramid_box_inverse_transform = []
        for p_size in pyramid_sizes:
            # each level gets its own copy of the transform list, so that the resize
            # of one level is not recorded in the lists of the other levels
            box_inverse_transform_this_scale = copy.deepcopy(
                box_inverse_transform)
            p_img, p_boxes = transforms_boxes.resize(
                img,
                target_size=p_size,
                random_interpolation=random_interpolation,
                boxes=boxes,
                transform_list=box_inverse_transform_this_scale)

            pyramid_box_inverse_transform.append(
                box_inverse_transform_this_scale)
            img_pyramid.append(p_img)
            boxes_pyramid.append(p_boxes)

        # conversion to tensors, optionally followed by normalization
        transforms_th = [transforms.ToTensor()]
        if self.img_normalization is not None:
            transforms_th += [
                transforms.Normalize(self.img_normalization["mean"],
                                     self.img_normalization["std"])
            ]

        for i_p in range(num_pyramid_levels):
            img_pyramid[i_p] = transforms.Compose(transforms_th)(
                img_pyramid[i_p])

        return img_pyramid, boxes_pyramid, mask_cutoff_boxes, mask_difficult_boxes, pyramid_box_inverse_transform
def make_iterator_extract_scores_from_images_batched(dataloader,
                                                     maskrcnn_model,
                                                     maskrcnn_config,
                                                     logger,
                                                     image_batch_size=None,
                                                     is_cuda=False):
    """
    Generator that runs the detector on all dataset images and yields the results image by image.

    Images are requested from the dataloader in batches. Every pyramid level of a batch is passed
    through dataloader.unnorm_image and then through run_maskrcnn_on_images; the per-level outputs
    are afterwards regrouped per image.

    Args:
        dataloader - the dataloader to get the class images and the image batches from
        maskrcnn_model - the detector, forwarded to run_maskrcnn_on_images
        maskrcnn_config - the detector config, forwarded to run_maskrcnn_on_images
        logger - the created logger
        image_batch_size - forwarded to dataloader.make_iterator_for_all_images
        is_cuda (bool) - move the image batches to GPU before running the detector

    Returns:
        Creates an iterator over tuples of data:
        image_id - id of the image taken from the batch ids
        bboxes_xyxy_p (list) - one box structure per pyramid level, passed through the reverse box
            transform of that level, with the fields "labels" and "scores" attached
        one_image_pyramid (list) - the image (after unnorm_image) at all pyramid levels
        query_img_sizes (list of FeatureMapSize) - sizes of all class images
        class_ids - class ids as returned by dataloader.get_all_class_images
        initial_img_size - the entry of the batch of initial image sizes that belongs to this image
    """
    logger.info("Starting iterations over images")

    # the class images are only needed here for their ids and sizes
    class_images, class_aspect_ratios, class_ids = dataloader.get_all_class_images()
    num_classes = len(class_images)
    assert len(class_aspect_ratios) == num_classes
    assert len(class_ids) == num_classes
    query_img_sizes = [FeatureMapSize(img=class_img) for class_img in class_images]

    # go through the dataset batch by batch
    for batch in dataloader.make_iterator_for_all_images(image_batch_size):
        batch_ids, pyramids_batch, box_transforms_batch, initial_img_size_batch = batch
        # timestamp of the batch start (not used further in this function)
        t_start_batch = time.time()

        # run the detector on every pyramid level of the batch
        level_images = []
        level_boxes = []
        level_labels = []
        level_scores = []
        num_pyramid_levels = len(pyramids_batch)
        for level_batch in pyramids_batch:
            if is_cuda:
                level_batch = level_batch.cuda()

            unnormed = [dataloader.unnorm_image(one_img) for one_img in level_batch]
            level_batch = torch.stack(unnormed, 0)

            det_boxes, det_labels, det_scores = run_maskrcnn_on_images(
                maskrcnn_model, maskrcnn_config, level_batch)

            level_boxes.append(det_boxes)
            level_labels.append(det_labels)
            level_scores.append(det_scores)
            level_images.append(level_batch)

        # regroup the outputs: from "per pyramid level" to "per image"
        for pos_in_batch, image_id in enumerate(batch_ids):
            raw_boxes_p = []
            labels_p = []
            scores_p = []
            for level in range(num_pyramid_levels):
                raw_boxes_p.append(level_boxes[level][pos_in_batch])
                labels_p.append(level_labels[level][pos_in_batch])
                scores_p.append(level_scores[level][pos_in_batch])

            # the pyramid of this single image
            one_image_pyramid = [imgs[pos_in_batch] for imgs in level_images]

            # one reverse box transform per pyramid level
            box_reverse_transforms = box_transforms_batch[pos_in_batch]

            # wrap the raw boxes into box structures tied to the size of their pyramid level
            boxlists = []
            for raw_boxes, level_img in zip(raw_boxes_p, one_image_pyramid):
                boxlists.append(
                    BoxList(raw_boxes, FeatureMapSize(img=level_img), mode="xyxy"))

            # apply the reverse transforms of the corresponding levels
            bboxes_xyxy_p = []
            for reverse_transform, boxlist in zip(box_reverse_transforms, boxlists):
                bboxes_xyxy_p.append(reverse_transform(boxlist))

            # attach the detector labels and scores to the boxes
            for boxlist, level_lbl, level_scr in zip(bboxes_xyxy_p, labels_p, scores_p):
                boxlist.add_field("labels", level_lbl)
                boxlist.add_field("scores", level_scr)

            initial_img_size = initial_img_size_batch[pos_in_batch]

            yield image_id, bboxes_xyxy_p, one_image_pyramid, query_img_sizes, class_ids, initial_img_size
def evaluate(dataloader,
             detector,
             cfg_maskrcnn,
             retrievalnet,
             opt,
             cfg_eval,
             cfg_visualization,
             is_cuda=False,
             logger_prefix="detector-retrieval"):
    """Evaluate the "detector + retrieval" pipeline on one dataset.

    The detector proposes boxes on every image (at all pyramid levels); the boxes are filtered
    with NMS on the detector scores, the corresponding patches are cropped from the image and
    described with the retrieval network. The class scores are the products between the
    descriptors of the class (query) images and the descriptors of the patches. After a second
    NMS on the retrieval scores the detections are evaluated with do_voc_evaluation.

    Args:
        dataloader - the dataloader providing class images, dataset images and annotations
        detector - the detection model; put in eval mode and passed to
            make_iterator_extract_scores_from_images_batched
        cfg_maskrcnn - the detector config, passed to the same iterator
        retrievalnet - the retrieval network used to extract descriptors; put in eval mode
        opt - retrieval options; this function reads retrieval_multiscale, retrieval_whitening_path,
            retrieval_image_size, nms_iou_threshold_detector_score, nms_score_threshold_detector_score
        cfg_eval - evaluation config; this function reads class_image_augmentation, batch_size,
            nms_iou_threshold, nms_score_threshold, nms_across_classes, mAP_iou_thresholds
        cfg_visualization - visualization config; this function reads show_gt_boxes, show_detections,
            path_to_save_detections
        is_cuda (bool) - use GPUs or not
        logger_prefix (str) - prefix of the logger name, the logger is f"{logger_prefix}.evaluate"

    Returns:
        losses (OrderedDict) - for each IoU threshold in cfg_eval.mAP_iou_thresholds the entries
            "mAP@th", "mAPw@th", "recall@th", "AP_joint_classes@th", plus "eval_time" in seconds

    Raises:
        RuntimeError - if the whitening file does not contain the entry for the selected
            (single-scale/multi-scale) mode, or if cfg_eval.class_image_augmentation has an unknown value
    """
    logger = logging.getLogger(f"{logger_prefix}.evaluate")
    dataset_name = dataloader.get_name()
    dataset_scale = dataloader.get_eval_scale()
    logger.info("Starting to eval on {0}, scale {1}".format(
        dataset_name, dataset_scale))

    t_start_eval = time.time()
    detector.eval()
    retrievalnet.eval()

    ## setup retrievalnet
    # setting up the multi-scale parameters
    ms = [1]
    msp = 1
    if opt.retrieval_multiscale:
        ms = [1, 1. / math.sqrt(2), 1. / 2]
        if retrievalnet.meta[
                "pooling"] == "gem" and retrievalnet.whiten is None:
            msp = retrievalnet.pool.p.data.tolist()[0]
    # setup whitening
    if opt.retrieval_whitening_path is not None:
        logger.info("Whitening is precomputed, loading it from {0}".format(
            opt.retrieval_whitening_path))
        whitening_data = torch.load(opt.retrieval_whitening_path)

        if ( (opt.retrieval_multiscale and "ms" in whitening_data) or \
             (not opt.retrieval_multiscale and "ss" in whitening_data ) ):

            if opt.retrieval_multiscale:
                Lw = copy.deepcopy(whitening_data["ms"])
            else:
                Lw = copy.deepcopy(whitening_data["ss"])
        else:
            raise RuntimeError(
                "Whitening should be precomputed with the network")

        # convert whitening data to torch tensors
        Lw["m"], Lw["P"] = torch.from_numpy(Lw["m"]), torch.from_numpy(Lw["P"])
        if is_cuda:
            Lw["m"], Lw["P"] = Lw["m"].cuda(), Lw["P"].cuda()
    else:
        Lw = None

    # do evaluation in forward mode only (for speed and memory)
    with torch.no_grad():
        # extract features from query images
        query_images, _, _ = dataloader.get_all_class_images(do_resize=False)
        if is_cuda:
            query_images = [img.cuda() for img in query_images]
        # get rid of the batch dimension
        query_images = [img[0] for img in query_images]
        query_images = [
            resize_image_tensor(img, opt.retrieval_image_size)
            for img in query_images
        ]
        query_images = [dataloader.unnorm_image(img) for img in query_images]

        # add the augmented views of each class image right after the image itself,
        # so that all views of one class are stored consecutively
        query_images_with_aug = []
        for im in query_images:
            query_images_with_aug.append(im)
            if not cfg_eval.class_image_augmentation:
                num_class_views = 1
            elif cfg_eval.class_image_augmentation == "rotation90":
                im90 = im.rot90(1, [1, 2])
                im180 = im90.rot90(1, [1, 2])
                im270 = im180.rot90(1, [1, 2])
                query_images_with_aug.append(im90)
                query_images_with_aug.append(im180)
                query_images_with_aug.append(im270)
                num_class_views = 4
            elif cfg_eval.class_image_augmentation == "horflip":
                im_flipped = im.flip(2)
                query_images_with_aug.append(im_flipped)
                num_class_views = 2
            else:
                raise RuntimeError(
                    f"Unknown value of class_image_augmentation: {cfg_eval.class_image_augmentation}"
                )
        query_images = query_images_with_aug

        query_vectors = extract_vectors_from_images(retrievalnet,
                                                    query_images,
                                                    ms=ms,
                                                    msp=msp)
        # apply whitening if defined
        if Lw is not None:
            query_vectors = whitenapply(query_vectors, Lw["m"], Lw["P"])
        # transpose so that torch.mm(query_vectors, image_vectors) gives queries x detections
        query_vectors = torch.transpose(query_vectors, 0, 1)

        # prepare looping over all images
        iterator = make_iterator_extract_scores_from_images_batched(
            dataloader,
            detector,
            cfg_maskrcnn,
            logger,
            image_batch_size=cfg_eval.batch_size,
            is_cuda=is_cuda)

        boxes, labels, scores = [], [], []
        gt_boxes = []
        image_ids = []
        losses = OrderedDict()

        # loop over all dataset images
        num_evaluted_images = 0
        for data in iterator:
            image_id, boxes_one_image, image_pyramid, query_img_sizes, class_ids, initial_img_size = data
            image_ids.append(image_id)
            logger.info(f"Image {num_evaluted_images}: id {image_id}")

            num_evaluted_images += 1
            img_size_pyramid = [
                FeatureMapSize(img=img) for img in image_pyramid
            ]

            gt_boxes_one_image = dataloader.get_image_annotation_for_imageid(
                image_id)
            gt_boxes.append(gt_boxes_one_image)

            # visualize GT for debug
            if cfg_visualization.show_gt_boxes:
                visualizer.show_gt_boxes(image_id, gt_boxes_one_image,
                                         class_ids, dataloader)

            # decode image predictions
            # merge boxes_one_image, labels_one_image, scores_one_image from different pyramid layers
            boxes_one_image = cat_boxlist(boxes_one_image)
            # do NMS
            good_indices = nms(
                boxes_one_image,
                opt.nms_iou_threshold_detector_score,
                nms_score_threshold=opt.nms_score_threshold_detector_score)
            boxes_one_image = boxes_one_image[good_indices]

            # extract feature vectors from the predictions
            # NOTE: the image has to be loaded WITHOUT data augmentation: with augmentation
            # _transform_image applies a random crop/resize and color distortion, so the cropped
            # patches would not correspond to the detected boxes and the evaluation would be random
            image_original = dataloader._transform_image(image_id,
                                                         do_augmentation=False,
                                                         hflip=False,
                                                         vflip=False)[0]
            if is_cuda:
                image_original = image_original.cuda()
            image_patches = crop_resize_image_patches(
                image_original,
                boxes_one_image,
                opt.retrieval_image_size,
                logger,
                unnorm_image=dataloader.unnorm_image,
                is_cuda=is_cuda)
            # filter out cases when failed to crop a box: outside of the image
            good_indices = [
                i for i, p in enumerate(image_patches) if p is not None
            ]
            if good_indices:
                # non empty
                image_patches = [p for p in image_patches if p is not None]
                boxes_one_image = boxes_one_image[good_indices]

                image_vectors = extract_vectors_from_images(retrievalnet,
                                                            image_patches,
                                                            ms=ms,
                                                            msp=msp)

                # compute class scores from image_vectors and query_vectors (already transposed)
                if Lw is not None:
                    # apply whitening if defined
                    image_vectors = whitenapply(image_vectors, Lw["m"],
                                                Lw["P"])
                scores_retrieval = torch.mm(query_vectors, image_vectors)

                num_queries = scores_retrieval.size(0)
                num_detections = scores_retrieval.size(1)
                list_of_active_label = torch.LongTensor(class_ids)
                if cfg_eval.class_image_augmentation:
                    # repeat each label for all views of its class image (views are stored consecutively)
                    list_of_active_label = torch.stack(
                        [list_of_active_label] * num_class_views, 1).view(-1)

                # take all labels for all boxes - will sort them by scores at eval
                # scores, boxes and labels are all flattened in the same order: query first, then detection
                scores_one_image = scores_retrieval.view(-1)
                boxes_one_image = cat_boxlist([boxes_one_image] * num_queries)
                labels_one_image = torch.stack([list_of_active_label] *
                                               num_detections,
                                               1).contiguous().view(-1)
                # add scores and labels: overwrite if existed
                boxes_one_image.add_field("labels", labels_one_image)
                boxes_one_image.add_field("scores", scores_one_image)

                # NMS using the retrieval scores
                good_indices = nms(
                    boxes_one_image,
                    cfg_eval.nms_iou_threshold,
                    nms_score_threshold=cfg_eval.nms_score_threshold,
                    do_separate_per_label=not cfg_eval.nms_across_classes)
                boxes_one_image = boxes_one_image[good_indices]
            else:
                # no patch could be cropped: store empty labels and scores
                boxes_one_image.add_field(
                    "labels",
                    torch.zeros(0,
                                dtype=torch.long,
                                device=boxes_one_image.bbox_xyxy.device))
                boxes_one_image.add_field(
                    "scores",
                    torch.zeros(0,
                                dtype=torch.float,
                                device=boxes_one_image.bbox_xyxy.device))

            boxes.append(boxes_one_image.cpu())

            if cfg_visualization.show_detections:
                # do not pass class_ids - this is already taken care of
                visualizer.show_detections(boxes_one_image,
                                           image_id,
                                           dataloader,
                                           cfg_visualization,
                                           class_ids=None)

    # normalize by number of steps
    for k in losses:
        losses[k] /= num_evaluted_images

    # Save detection if requested
    if cfg_visualization.path_to_save_detections:
        data = {
            "image_ids": image_ids,
            "boxes_xyxy": [bb.bbox_xyxy for bb in boxes],
            "labels": [bb.get_field("labels") for bb in boxes],
            "scores": [bb.get_field("scores") for bb in boxes],
            "gt_boxes_xyxy": [bb.bbox_xyxy for bb in gt_boxes],
            "gt_labels": [bb.get_field("labels") for bb in gt_boxes],
            "gt_difficults": [bb.get_field("difficult") for bb in gt_boxes]
        }

        dataset_name = dataloader.get_name()
        os.makedirs(cfg_visualization.path_to_save_detections, exist_ok=True)
        save_path = os.path.join(cfg_visualization.path_to_save_detections,
                                 dataset_name + "_detections.pth")
        torch.save(data, save_path)

    # compute mAP
    for mAP_iou_threshold in cfg_eval.mAP_iou_thresholds:
        logger.info("Evaluating at IoU th {:0.2f}".format(mAP_iou_threshold))
        ap_data = do_voc_evaluation(boxes,
                                    gt_boxes,
                                    iou_thresh=mAP_iou_threshold,
                                    use_07_metric=False)
        losses["mAP@{:0.2f}".format(mAP_iou_threshold)] = ap_data["map"]
        losses["mAPw@{:0.2f}".format(
            mAP_iou_threshold)] = ap_data["map_weighted"]
        losses["recall@{:0.2f}".format(mAP_iou_threshold)] = ap_data["recall"]
        losses["AP_joint_classes@{:0.2f}".format(
            mAP_iou_threshold)] = ap_data["ap_joint_classes"]

        # per class AP information
        for i_class, (ap, recall, n_pos) in enumerate(
                zip(ap_data["ap_per_class"], ap_data["recall_per_class"],
                    ap_data["n_pos"])):
            if not np.isnan(ap):
                assert i_class in class_ids, "Could not find class_id in the list of ids"
                logger.info(
                    "Class {0} (local {3}), AP {1:0.4f}, #obj {2}, recall {4:0.4f}"
                    .format(i_class, ap, n_pos, class_ids.index(i_class),
                            recall))

    # save timing
    losses["eval_time"] = (time.time() - t_start_eval)
    logger.info("Evaluated on {0}, scale {1}".format(dataset_name,
                                                     dataset_scale))
    print_meters(losses, logger)
    return losses
    def resample_of_correlation_map_simple(corr_maps,
                                           resampling_grids_grid_coord,
                                           class_pool_mask):
        """This function resamples the correlation tensor according to the grids of points representing the transformations produced by the transformation network.
        This function is left here for understanding, use resample_of_correlation_map_fast, which is faster.
        Args:
            corr_maps (Tensor[float], size=batch_size x class_batch_size x (h^T*w^T) x h^A x w^A):
                This tensor contains correlations between the features of the input and class feature maps.
                This function resamples this tensor.
                CAUTION: this tensor should be viewed as batch_size x class_batch_size x w^T x h^T x h^A x w^A (note the switch of w^T and h^T dimensions)
                This happens to be able to load models of the weakalign repo
            resampling_grids_grid_coord (Tensor[float], size=batch_size x class_batch_size x h^A x w^A x h^T x w^T x 2):
                This tensor contains non-integer coordinates of the points that show where we need to resample
            class_pool_mask (Tensor[float]): size=class_batch_size x 1 x h^T x w^T
                This tensor contains the mask, by which the resampled correlations are multiplied before the final pooling (a sum over the template positions).
                It masks out the border features of the class feature maps.

        Returns:
            matches_pooled (Tensor[float]): size=batch_size x class_batch_size x 1 x h^A x w^A

        Time comparison resample_of_correlation_map_simple vs resample_of_correlation_map_fast:
            for 2 images, 11 labels, train_patch_width 400, train_patch_height 600 (fm width = 25, fm height = 38)
                CPU time simple: 0.14s
                CPU time fast: 0.11s
                GPU=Geforce GTX 1080Ti
                GPU time simple: 0.010s
                GPU time fast: 0.006s
        """

        batch_size = corr_maps.size(0)
        class_batch_size = corr_maps.size(1)
        template_fm_size = FeatureMapSize(
            h=resampling_grids_grid_coord.size(-3),
            w=resampling_grids_grid_coord.size(-2))
        image_fm_size = FeatureMapSize(img=corr_maps)
        assert template_fm_size.w * template_fm_size.h == corr_maps.size(
            2
        ), 'the number of channels in the correlation map = {0} should match the size of the resampling grid = {1}'.format(
            corr_maps.size(2), template_fm_size)

        # use a single batch dimension
        corr_maps = corr_maps.view(batch_size * class_batch_size,
                                   corr_maps.size(2), image_fm_size.h,
                                   image_fm_size.w)
        resampling_grids_grid_coord = resampling_grids_grid_coord.view(
            batch_size * class_batch_size, image_fm_size.h, image_fm_size.w,
            template_fm_size.h, template_fm_size.w, 2)

        # extract matches from all channels one by one in a loop, and then combine them (using the pooling w.r.t. the mask of active points defined by class_pool_mask)
        matches_all_channels = []
        # the order of the loops matters: the matches are stacked in the order of iteration and are
        # later multiplied by class_pool_mask flattened from h^T x w^T, i.e., with y changing slowest
        # and x changing fastest, so y has to be the outer loop and x the inner one
        for template_y in range(template_fm_size.h):
            for template_x in range(template_fm_size.w):
                # note the weird order of coordinates - related to the transposed coordinates in the weakalign network
                channel_id = template_x * template_fm_size.h + template_y

                channel = corr_maps[:, channel_id:channel_id + 1, :, :]
                points = resampling_grids_grid_coord[:, :, :, template_y,
                                                     template_x, :]

                matches_one_channel = F.grid_sample(channel,
                                                    points,
                                                    mode="bilinear",
                                                    padding_mode='border',
                                                    align_corners=True)
                matches_all_channels.append(matches_one_channel)
        matches_all_channels = torch.stack(matches_all_channels, -1)

        # start pooling: fix all dimensions explicitly mostly to be safe
        matches_all_channels = matches_all_channels.view(
            batch_size, class_batch_size, image_fm_size.h, image_fm_size.w,
            template_fm_size.h * template_fm_size.w)
        mask = class_pool_mask.view(1, class_batch_size, 1, 1,
                                    template_fm_size.h * template_fm_size.w)
        matches_all_channels = matches_all_channels * mask

        # pool over all template positions
        matches_pooled = matches_all_channels.sum(4)
        matches_pooled = matches_pooled.view(batch_size, class_batch_size, 1,
                                             image_fm_size.h, image_fm_size.w)
        return matches_pooled
def make_iterator_extract_scores_from_images_batched(dataloader, net, logger, image_batch_size, is_cuda,
                                                     num_random_pyramid_scales=0, num_random_negative_labels=-1,
                                                     class_image_augmentation=""):
    """
    Generator to loop over dataset and apply the model to all elements.
    The iterator will loop over images one by one.
    Used in evaluate and .train.mine_hard_patches

    Args:
        dataloader - the dataloader to get data
        net - the network to use
        logger - the created logger
        image_batch_size (int) - the number of images to put in one batch
        is_cuda (bool) - use GPUs or not
        num_random_pyramid_scales (int) - number of random pyramid scales to try, default (0) means the standard scales from config
            passed to dataloader.make_iterator_for_all_images
        num_random_negative_labels (int) - number of random negative labels to try, default (-1) means to add all possible labels
        class_image_augmentation (str) - type of class image augmentation to do, default - no augmentation, support "rotation90" and "horflip"

    Returns:
        Creates an iterator over tuples of data:
        image_id (int)
        image_loc_scores_p (list of tensors) - localization scores to get bounding boxes when decoding
            len(image_loc_scores_p) = num pyramid levels, tensor size: num_labels x 4 x num_anchors
        image_class_scores_p (list of tensors) - classification scores to recognize classes when decoding
            len(image_class_scores_p) = num pyramid levels, tensor size: num_labels x num_anchors
        one_image_pyramid (list of tensors) - input images at all pyramid levels
        batch_query_img_sizes (list of FeatureMapSize) - sizes of used query images (used in mine_hard_patches)
            len(batch_query_img_sizes) = num query images
        batch_class_ids (list of int) - class ids of used query images; len(batch_class_ids) = num query images,
        box_reverse_transforms (list of os2d.structures.transforms.TransformList) - reverse transforms to convert boxes
            from the coordinates of each resized image to the original global coordinates
            len(box_reverse_transforms) = num pyramid levels
        image_fm_sizes_p (list of FeatureMapSize) - sizes of the feature maps of the current pyramid
            len(image_fm_sizes_p) = num pyramid levels
        transform_corners_p (list of tensors) - corners of the parallelogram after the transformation mapping (used for visualization)
            len(transform_corners_p) = num pyramid levels, tensor size: num_labels x 8 x num_anchors

    Raises:
        RuntimeError - if class_image_augmentation has an unknown value
    """

    logger.info("Extracting scores from all images")
    # get images of all classes
    class_images, class_aspect_ratios, class_ids = dataloader.get_all_class_images()
    num_classes = len(class_images)
    assert len(class_aspect_ratios) == num_classes
    assert len(class_ids) == num_classes
    query_img_sizes = [FeatureMapSize(img=img) for img in class_images]
    
    # the current code works only with class batch == 1, this is inefficient in some places, but good in others
    # is there a better way?
    class_batch_size = 1

    # extract all class convolutions from batched class images
    class_conv_layer_batched = []
    logger.info("Extracting weights from {0} classes{1}".format(num_classes,
        f" with {class_image_augmentation} augmentation" if class_image_augmentation else ""))
    for i in range(0, num_classes, class_batch_size):
        batch_class_ids = class_ids[i : i + class_batch_size]

        # all views of one class image are stored consecutively: the local label l
        # corresponds to the class with index l // num_class_views
        batch_class_images = []
        for i_label in range(len(batch_class_ids)):
            im = class_images[i + i_label].squeeze(0)
            if is_cuda:
                im = im.cuda()
            batch_class_images.append(im)
            if not class_image_augmentation:
                num_class_views = 1
            elif class_image_augmentation == "rotation90":
                im90 = im.rot90(1, [1, 2])
                im180 = im90.rot90(1, [1, 2])
                im270 = im180.rot90(1, [1, 2])
                batch_class_images.append(im90)
                batch_class_images.append(im180)
                batch_class_images.append(im270)
                num_class_views = 4
            elif class_image_augmentation == "horflip":
                im_flipped = im.flip(2)
                batch_class_images.append(im_flipped)
                num_class_views = 2
            else:
                raise RuntimeError(f"Unknown value of class_image_augmentation: {class_image_augmentation}")

        for b_im in batch_class_images:
            class_feature_maps = net.net_label_features([b_im])
            class_conv_layer = net.os2d_head_creator.create_os2d_head(class_feature_maps)
            class_conv_layer_batched.append(class_conv_layer)
    
    # loop over all images
    iterator_batches = dataloader.make_iterator_for_all_images(image_batch_size, num_random_pyramid_scales=num_random_pyramid_scales)
    for batch_ids, pyramids_batch, box_transforms_batch, initial_img_size_batch in iterator_batches:
        t_start_batch = time.time()
        # select labels to use for search at this batch
        if num_random_negative_labels >= 0 :
            # randomly shuffle labels
            neg_labels = torch.randperm(len(class_conv_layer_batched))
            neg_labels = neg_labels[:num_random_negative_labels]
            # add positive labels
            pos_labels = dataloader.get_class_ids_for_image_ids(batch_ids)
            pos_labels = dataloader.convert_label_ids_global_to_local(pos_labels, class_ids)
            batch_labels_local = torch.cat([neg_labels, pos_labels], 0).unique()
        else:
            # take all the labels - needed for evaluation
            batch_labels_local = torch.arange(len(class_conv_layer_batched))
        
        batch_class_ids = [class_ids[l // num_class_views] for l in batch_labels_local]
        batch_query_img_sizes = [query_img_sizes[l // num_class_views] for l in batch_labels_local]

        # extract features at all pyramid levels
        batch_images_pyramid = []
        loc_scores = []
        class_scores = []
        fm_sizes = []
        transform_corners = []
        num_pyramid_levels = len(pyramids_batch)
        
        t_cum_features = 0.0
        t_cum_labels = 0.0
        for batch_images in pyramids_batch:
            if is_cuda:
                batch_images = batch_images.cuda()
            
            t_start_features = time.time()
            feature_maps = net.net_feature_maps(batch_images)
            if is_cuda:
                # wait for the GPU to finish to get correct timing;
                # calling this without CUDA available raises an error, so it is done only in the GPU mode
                torch.cuda.synchronize()
            t_cum_features += time.time() - t_start_features

            # batch class images
            loc_scores.append([])
            class_scores.append([])
            fm_sizes.append([])
            transform_corners.append([])
            t_start_labels = time.time()
            assert class_batch_size == 1, "the iterator on images works only with labels batches of size 1"

            for i_class_batch in batch_labels_local:
                # apply net at this pyramid level
                loc_s_p, class_s_p, _, fm_sizes_p, transform_corners_p = \
                     net(class_head=class_conv_layer_batched[i_class_batch],
                         feature_maps=feature_maps)
                loc_scores[-1].append(loc_s_p)
                class_scores[-1].append(class_s_p)
                fm_sizes[-1].append(fm_sizes_p)
                transform_corners[-1].append(transform_corners_p)
            if is_cuda:
                # same as above: synchronize only when the computation runs on GPU
                torch.cuda.synchronize()
            t_cum_labels += time.time() - t_start_labels

            if not feature_maps.requires_grad:
                # explicitly remove a possibly large chunk of GPU memory
                del feature_maps

            batch_images_pyramid.append(batch_images)

        timing_str = "Feature time: {0}, Label time: {1}, ".format(time_for_printing(t_cum_features, mode="s"),
                                                          time_for_printing(t_cum_labels, mode="s"))

        # loc_scores, class_scores: pyramid_level x class_batch x image_in_batch x
        for i_image_in_batch, image_id in enumerate(batch_ids):
            # get scores from all pyramid levels
            image_loc_scores_p, image_class_scores_p, image_fm_sizes_p = [], [], []
            transform_corners_p = []
            for i_p in range(num_pyramid_levels):
                if loc_scores is not None and loc_scores[0] is not None and loc_scores[0][0] is not None:
                    image_loc_scores_p.append(torch.cat([s[i_image_in_batch] for s in loc_scores[i_p]], 0))
                else:
                    image_loc_scores_p.append(None)
                image_class_scores_p.append(torch.cat([s[i_image_in_batch] for s in class_scores[i_p]], 0))

                if transform_corners is not None and transform_corners[0] is not None and transform_corners[0][0] is not None:
                    transform_corners_p.append(torch.cat([s[i_image_in_batch] for s in transform_corners[i_p]], 0))
                else:
                    transform_corners_p.append(None)

                # the feature map sizes reported for the first label are used for this pyramid level
                image_fm_sizes_p.append(fm_sizes[i_p][0])

            # get a pyramid of one image[i_p]
            one_image_pyramid = [p[i_image_in_batch] for p in batch_images_pyramid]

            # extract the box transformations
            box_reverse_transforms = box_transforms_batch[i_image_in_batch]

            logger.info(timing_str + "Net time: {0}".format(time_since(t_start_batch)))
            yield image_id, image_loc_scores_p, image_class_scores_p, one_image_pyramid,\
                  batch_query_img_sizes, batch_class_ids, box_reverse_transforms, image_fm_sizes_p, transform_corners_p
    def resample_of_correlation_map_fast(corr_maps,
                                         resampling_grids_grid_coord,
                                         class_pool_mask):
        """Resample the correlation tensor at the points of the transformation grids and pool the result.

        This is a more efficient version of resample_of_correlation_map_simple: the index of the
        correlation channel is folded into the Y coordinate, so that a single call of F.grid_sample
        extracts the points from all the channels at once.

        Args:
            corr_maps (Tensor[float], size=batch_size x class_batch_size x (h^T*w^T) x h^A x w^A):
                Correlations between the features of the input and class feature maps.
                CAUTION: this tensor should be viewed as batch_size x class_batch_size x w^T x h^T x h^A x w^A
                (note the switch of w^T and h^T dimensions).
                This is needed to be able to load models of the weakalign repo.
            resampling_grids_grid_coord (Tensor[float], size=batch_size x class_batch_size x h^A x w^A x h^T x w^T x 2):
                Non-integer (x, y) coordinates of the points to sample. Values are clamped to [-1, 1] before use.
            class_pool_mask (Tensor[float], size=class_batch_size x 1 x h^T x w^T):
                Mask by which the resampled correlations are multiplied before they are summed over the
                template positions. It masks out the border features of the class feature maps.

        Returns:
            matches_pooled (Tensor[float]): size=batch_size x class_batch_size x 1 x h^A x w^A

        Time comparison resample_of_correlation_map_simple vs resample_of_correlation_map_fast:
            for 2 images, 11 labels, train_patch_width 400, train_patch_height 600 (fm width = 25, fm height = 38)
                CPU time simple: 0.14s
                CPU time fast: 0.11s
                GPU=Geforce GTX 1080Ti
                GPU time simple: 0.010s
                GPU time fast: 0.006s
        """
        num_images = corr_maps.size(0)
        num_classes = corr_maps.size(1)
        tmpl_size = FeatureMapSize(h=resampling_grids_grid_coord.size(-3),
                                   w=resampling_grids_grid_coord.size(-2))
        img_size = FeatureMapSize(img=corr_maps)
        assert tmpl_size.w * tmpl_size.h == corr_maps.size(2), \
            'the number of channels in the correlation map = {0} should match the size of the resampling grid = {1}'.format(
                corr_maps.size(2), tmpl_size)

        # Stack all the correlation channels along the vertical axis: the result is one tall
        # single-channel "image" per (image, class) pair.
        stacked_corr = corr_maps.contiguous().view(num_images * num_classes, 1,
                                                   -1, img_size.w)

        # Channel index of every template position. The x index is the slow one here because of
        # the transposed (w^T, h^T) channel layout mentioned in the docstring.
        row_ids, col_ids = torch.meshgrid(torch.arange(tmpl_size.h),
                                          torch.arange(tmpl_size.w))
        channel_ids = row_ids + col_ids * tmpl_size.h

        # Clamp to strict [-1, 1] and switch to double: the merged Y coordinate spans a much
        # larger range, so extra precision is needed.
        grid = resampling_grids_grid_coord.clamp(-1, 1).to(dtype=torch.double)
        grid_x = grid.narrow(-1, 0, 1)
        grid_y = grid.narrow(-1, 1, 1)

        # Y: from [-1, 1] to the row position inside one channel, [0, image_fm_size.h - 1]
        grid_y = (grid_y + 1) / 2 * (img_size.h - 1)
        # shift every point by the first row of the channel it has to be read from
        grid_y = grid_y.view([-1] + list(channel_ids.size()))
        channel_ids = channel_ids.unsqueeze(0).to(device=grid.device,
                                                  dtype=grid.dtype)
        grid_y = grid_y + channel_ids * img_size.h
        # Y: back to [-1, 1], now w.r.t. the height of the stacked map
        num_stacked_rows = img_size.h * tmpl_size.h * tmpl_size.w
        grid_y = grid_y / (num_stacked_rows - 1) * 2 - 1
        grid_y = grid_y.view_as(grid_x)

        # one flat list of (x, merged y) points per (image, class) pair
        flat_grid = torch.cat([grid_x, grid_y], dim=-1).view(
            num_images * num_classes, -1, 1, 2)

        # extract the required points
        sampled = F.grid_sample(stacked_corr.to(dtype=torch.double),
                                flat_grid,
                                mode="bilinear",
                                padding_mode='border',
                                align_corners=True)

        num_tmpl_points = tmpl_size.h * tmpl_size.w
        sampled = sampled.view(num_images, num_classes, 1,
                               img_size.h * img_size.w,
                               num_tmpl_points).to(dtype=torch.float)

        # pool the extracted matches over the template positions, weighted by class_pool_mask
        pool_mask = class_pool_mask.view(1, num_classes, 1, 1, num_tmpl_points)
        matches_pooled = (sampled * pool_mask).sum(4)
        return matches_pooled.view(num_images, num_classes, 1, img_size.h,
                                   img_size.w)
def evaluate(dataloader, net, cfg, criterion=None, print_per_class_results=False):
    """
    Evaluation of the provided model at one dataset

    The network is switched to eval mode, all images of the dataloader are processed one by one,
    the detections are decoded and collected, and finally the VOC-style metrics are computed
    for every IoU threshold of cfg.eval.mAP_iou_thresholds.

    Args:
        dataloader - the dataloader to get data
        net - the network to use
        cfg - config with all the parameters
        criterion - criterion (usually the same one as used for training), can be None, will just not compute related metrics
        print_per_class_results - flag showing whether to printout extra data (per class AP) - usually used at the final evaluation

    Returns:
        losses (OrderedDict) - all computed metrics, e.g., losses["mAP@0.50"] - mAP at IoU threshold 0.5;
            also contains "mAPw@...", "recall@..." for each threshold, "eval_time" and,
            if criterion is provided, the criterion outputs divided by the number of evaluated images
    """
    logger = logging.getLogger("OS2D.evaluate")
    dataset_name = dataloader.get_name()
    dataset_scale = dataloader.get_eval_scale()
    logger.info("Starting to eval on {0}, scale {1}".format(dataset_name, dataset_scale))
    t_start_eval = time.time()
    net.eval()

    iterator = make_iterator_extract_scores_from_images_batched(dataloader, net, logger,
                                                                image_batch_size=cfg.eval.batch_size,
                                                                is_cuda=cfg.is_cuda,
                                                                class_image_augmentation=cfg.eval.class_image_augmentation)

    boxes = []
    gt_boxes = []
    losses = OrderedDict()
    image_ids = []

    def add_batch_dim(list_of_tensors):
        # the criterion is called on one image at a time, so a leading dimension of size 1 is added
        return [t.unsqueeze(0) for t in list_of_tensors]

    # loop over all dataset images
    num_evaluted_images = 0
    for data in iterator:
        image_id, image_loc_scores_pyramid, image_class_scores_pyramid,\
                    image_pyramid, query_img_sizes, class_ids,\
                    box_reverse_transform, image_fm_sizes_p, transform_corners_pyramid\
                    = data
        image_ids.append(image_id)

        num_evaluted_images += 1
        img_size_pyramid = [FeatureMapSize(img=img) for img in image_pyramid]

        num_labels = len(class_ids)
        gt_boxes_one_image = dataloader.get_image_annotation_for_imageid(image_id)
        gt_boxes.append(gt_boxes_one_image)

        # Targets exist only for annotated images. Reset them for every image so that the heatmap
        # visualization below never reads targets of a previous image (or an unbound name).
        class_targets_pyramid = None

        # compute losses
        if len(gt_boxes_one_image) > 0:
            # there is some annotation for this image
            gt_labels_one_image = gt_boxes_one_image.get_field("labels")
            dataloader.update_box_labels_to_local(gt_boxes_one_image, class_ids)

            loc_targets_pyramid, class_targets_pyramid = \
                    dataloader.box_coder.encode_pyramid(gt_boxes_one_image,
                                                        img_size_pyramid, num_labels,
                                                        default_box_transform_pyramid=box_reverse_transform)

            # return the original labels back
            gt_boxes_one_image.add_field("labels", gt_labels_one_image)

            # vizualize GT for debug
            if cfg.visualization.eval.show_gt_boxes:
                visualizer.show_gt_boxes(image_id, gt_boxes_one_image, class_ids, dataloader)

            if cfg.is_cuda:
                loc_targets_pyramid = [loc_targets.cuda() for loc_targets in loc_targets_pyramid]
                class_targets_pyramid = [class_targets.cuda() for class_targets in class_targets_pyramid]
                transform_corners_pyramid = [transform_corners.cuda() for transform_corners in transform_corners_pyramid]

            if criterion is not None:
                # if criterion is provided, use it to compute all metrics it can
                # (localization scores are passed as None when the first pyramid level has none)
                losses_iter = criterion(add_batch_dim(image_loc_scores_pyramid) if image_loc_scores_pyramid[0] is not None else None,
                                        add_batch_dim(loc_targets_pyramid),
                                        add_batch_dim(image_class_scores_pyramid),
                                        add_batch_dim(class_targets_pyramid)
                                        )

                # convert to floats
                for l in losses_iter:
                    losses_iter[l] = losses_iter[l].mean().item()
                # printing
                print_meters(losses_iter, logger)
                # update logs
                add_to_meters_in_dict(losses_iter, losses)

        # decode image predictions
        boxes_one_image = \
            dataloader.box_coder.decode_pyramid(image_loc_scores_pyramid, image_class_scores_pyramid,
                                                img_size_pyramid, class_ids,
                                                nms_iou_threshold=cfg.eval.nms_iou_threshold,
                                                nms_score_threshold=cfg.eval.nms_score_threshold,
                                                inverse_box_transforms=box_reverse_transform,
                                                transform_corners_pyramid=transform_corners_pyramid)

        boxes.append(boxes_one_image.cpu())

        if cfg.visualization.eval.show_detections:
            visualizer.show_detection_from_dataloader(boxes_one_image, image_id, dataloader, cfg.visualization.eval, class_ids=None)

        if cfg.visualization.eval.show_class_heatmaps:
            if class_targets_pyramid is None:
                # no annotation => no class targets were built for this image
                logger.info("Skipping class heatmaps for image {0}: no annotation to build class targets".format(image_id))
            else:
                visualizer.show_class_heatmaps(image_id, class_ids, image_fm_sizes_p, class_targets_pyramid, image_class_scores_pyramid,
                                                cfg_local=cfg.visualization.eval,
                                                class_image_augmentation=cfg.eval.class_image_augmentation)

        if cfg.is_cuda:
            torch.cuda.empty_cache()

    # normalize by number of steps
    # NOTE: the denominator counts all images, including the ones without annotation,
    # for which the criterion was not computed
    for k in losses:
        losses[k] /= num_evaluted_images

    # Save detection if requested
    path_to_save_detections = cfg.visualization.eval.path_to_save_detections
    if path_to_save_detections:
        data = {"image_ids" : image_ids,
                "boxes_xyxy" : [bb.bbox_xyxy for bb in boxes],
                "labels" : [bb.get_field("labels") for bb in boxes],
                "scores" : [bb.get_field("scores") for bb in boxes],
                "gt_boxes_xyxy" : [bb.bbox_xyxy for bb in gt_boxes],
                "gt_labels" : [bb.get_field("labels") for bb in gt_boxes],
                "gt_difficults" : [bb.get_field("difficult") for bb in gt_boxes]
        }
        os.makedirs(path_to_save_detections, exist_ok=True)
        save_path = os.path.join(path_to_save_detections, dataset_name + "_detections.pth")
        torch.save(data, save_path)

    # compute mAP
    for mAP_iou_threshold in cfg.eval.mAP_iou_thresholds:
        logger.info("Evaluating at IoU th {:0.2f}".format(mAP_iou_threshold))
        ap_data = do_voc_evaluation(boxes, gt_boxes, iou_thresh=mAP_iou_threshold, use_07_metric=False)
        losses["mAP@{:0.2f}".format(mAP_iou_threshold)] = ap_data["map"]
        losses["mAPw@{:0.2f}".format(mAP_iou_threshold)] = ap_data["map_weighted"]
        losses["recall@{:0.2f}".format(mAP_iou_threshold)] = ap_data["recall"]

        if print_per_class_results:
            # per class AP information
            # NOTE: class_ids here are the ones that came with the last image of the loop above
            for i_class, (ap, recall, n_pos) in enumerate(zip(ap_data["ap_per_class"], ap_data["recall_per_class"], ap_data["n_pos"])):
                if not np.isnan(ap):
                    assert i_class in class_ids, "Could not find class_id in the list of ids"
                    logger.info("Class {0} (local {3}), AP {1:0.4f}, #obj {2}, recall {4:0.4f}".format(i_class,
                                                                                                       ap,
                                                                                                       n_pos,
                                                                                                       class_ids.index(i_class),
                                                                                                       recall))
    # save timing
    losses["eval_time"] = (time.time() - t_start_eval)
    logger.info("Evaluated on {0}, scale {1}".format(dataset_name, dataset_scale))
    print_meters(losses, logger)
    return losses
    def forward(self, feature_maps):
        """
        Correlate the input feature maps with the stored class feature maps, regress the alignment
        transformations and build the localization and recognition outputs.

        Args:
            feature_maps (Tensor[float], size b^A x d x h^A x w^A) - contains the feature map of the input image
            b^A - batch size
            d - feature dimensionality
            h^A - height of the feature map
            w^A - width of the feature map

        Returns:
                # here b^C is the class batch size, i.e., the number of class images contained in self.class_batch_size passed when creating this object
            output_localization (Tensor[float], size b^A x b^C x 4 x h^A x w^A) - the localization output w.r.t. the standard box encoding - computed by Os2dBoxCoder.build_loc_targets
            output_recognition (Tensor[float], size b^A x b^C x 1 x h^A x w^A) - the recognition output for each of the classes - the pooled correlation mapped linearly as (x - 1) / 2, the higher the better match to the class
            output_recognition_transform_detached (Tensor[float], size b^A x b^C x 1 x h^A x w^A) - same as output_recognition, but with the computational graph detached from the transformation (for backward that does not update the transformation - intended for the negatives)
            corner_coordinates (Tensor[float], size b^A x b^C x 8 x h^A x w^A) - the corners of the default boxes after the transformation, detached from the computational graph, for visualisation only
        """
        # get dims
        num_images = feature_maps.size(0)
        input_feature_dim = feature_maps.size(1)
        image_fm_size = FeatureMapSize(img=feature_maps)
        class_fm_size = FeatureMapSize(img=self.class_feature_maps)
        num_corr_channels = class_fm_size.h * class_fm_size.w

        class_feature_dim = self.class_feature_maps.size(1)
        assert input_feature_dim == class_feature_dim, \
            "Feature dimensionality of input={0} and class={1} feature maps has to equal".format(
                input_feature_dim, class_feature_dim)

        # L2-normalize the feature map
        feature_maps = normalize_feature_map_L2(feature_maps, 1e-5)

        # All-to-all correlations between the class and input features.
        # CAUTION: note the switch of dimensions hw to wh. This is done for compatibility with the FeatureCorrelation class by Ignacio Rocco https://github.com/ignacio-rocco/ncnet/blob/master/lib/model.py (to be able to load their models)
        # need to try to optimize this with opt_einsum: https://optimized-einsum.readthedocs.io/en/latest/
        corr_maps = torch.einsum("bfhw,afxy->abwhxy", self.class_feature_maps,
                                 feature_maps)

        # lay the correlations out like a regular feature map: (image, class) pairs in the batch
        # dimension, template positions in the channel dimension
        corr_maps = corr_maps.contiguous().view(
            num_images * self.class_batch_size, num_corr_channels,
            image_fm_size.h, image_fm_size.w)

        # the aligner produces the grids used to resample corr maps
        grids_local = self.aligner(corr_maps)

        # build classifications outputs
        corr_maps_per_class = corr_maps.contiguous().view(
            num_images, self.class_batch_size, num_corr_channels,
            image_fm_size.h, image_fm_size.w)
        grid_size = self.aligner.out_grid_size
        grids_local = grids_local.contiguous().view(
            num_images, self.class_batch_size, image_fm_size.h,
            image_fm_size.w, grid_size.h, grid_size.w, 2)

        # The grids have to be recomputed to [-1, 1] coordinates w.r.t. the feature maps to sample points with F.grid_sample.
        # First get the boxes that correspond to the receptive fields of the parameter regression network:
        # box sizes are the receptive field sizes, stride is the network stride.
        boxes_wrt_fm = self.box_grid_generator_feature_map_level.create_strided_boxes_columnfirst(
            fm_size=image_fm_size)
        # 1 (to broadcast to batch_size) x 1 (to broadcast to class batch_size) x box_grid_height x box_grid_width x 4
        boxes_wrt_fm = boxes_wrt_fm.view(1, 1, image_fm_size.h,
                                         image_fm_size.w,
                                         4).to(grids_local.device)
        grids_fm = convert_box_coordinates_local_to_global(
            grids_local, boxes_wrt_fm)

        # convert to coordinates normalized to [-1, 1] (to be compatible with torch.nn.functional.grid_sample)
        grids_fm_x = grids_fm.narrow(-1, 0, 1)
        grids_fm_y = grids_fm.narrow(-1, 1, 1)
        unit_x = grids_fm_x / (image_fm_size.w - 1) * 2 - 1
        unit_y = grids_fm_y / (image_fm_size.h - 1) * 2 - 1
        # clamp to fit the image plane
        grids_unit = torch.cat([unit_x, unit_y], dim=-1).clamp(-1, 1)

        # Extract and pool matches with the faster, but somewhat more obscure routine;
        # self.resample_of_correlation_map_simple(corr maps, grids, self.class_pool_mask) is the slower alternative.
        matches_summed = self.resample_of_correlation_map_fast(
            corr_maps_per_class, grids_unit, self.class_pool_mask)
        if not matches_summed.requires_grad:
            # Optimization to make eval faster: nothing to detach
            matches_summed_transform_detached = matches_summed
        else:
            matches_summed_transform_detached = self.resample_of_correlation_map_fast(
                corr_maps_per_class, grids_unit.detach(),
                self.class_pool_mask)

        # build localization targets
        boxes_wrt_image = self.box_grid_generator_image_level.create_strided_boxes_columnfirst(
            fm_size=image_fm_size)
        # 1 (to broadcast to batch_size) x 1 (to broadcast to class batch_size) x box_grid_height x box_grid_width x 4
        boxes_wrt_image = boxes_wrt_image.view(1, 1, image_fm_size.h,
                                               image_fm_size.w,
                                               4).to(grids_local.device)
        grids_image = convert_box_coordinates_local_to_global(
            grids_local, boxes_wrt_image)

        # axis-aligned bounding box of all the grid points of each transformed box
        num_pooled_points = grid_size.w * grid_size.h
        points_x = grids_image.narrow(-1, 0, 1).contiguous().view(
            -1, num_pooled_points)
        points_y = grids_image.narrow(-1, 1, 1).contiguous().view(
            -1, num_pooled_points)
        class_boxes_xyxy = torch.stack([
            points_x.min(dim=1)[0],
            points_y.min(dim=1)[0],
            points_x.max(dim=1)[0],
            points_y.max(dim=1)[0],
        ], 1)

        # extract rectangle borders to draw complete boxes: only the corners of each grid
        corner_coordinates = grids_image[:, :, :, :, [0, -1]]
        corner_coordinates = corner_coordinates[:, :, :, :, :, [0, -1]]
        corner_coordinates = corner_coordinates.detach_()
        # batch_size x label_batch_size x fm_height x fm_width x 8
        corner_coordinates = corner_coordinates.view(num_images,
                                                     self.class_batch_size,
                                                     image_fm_size.h,
                                                     image_fm_size.w, 8)
        # batch_size x label_batch_size x 8 x fm_height x fm_width
        corner_coordinates = corner_coordinates.permute(0, 1, 4, 2, 3)

        class_boxes = BoxList(class_boxes_xyxy.view(-1, 4),
                              image_fm_size,
                              mode="xyxy")
        default_boxes_wrt_image = BoxList(boxes_wrt_image.view(-1, 4),
                                          image_fm_size,
                                          mode="xyxy")
        # the same default boxes are repeated for every (image, class) pair
        default_boxes_with_image_batches = cat_boxlist(
            [default_boxes_wrt_image] * num_images * self.class_batch_size)

        # num_boxes x 4
        output_localization = Os2dBoxCoder.build_loc_targets(
            class_boxes, default_boxes_with_image_batches)
        # batch_size x label_batch_size x fm_height x fm_width x 4
        output_localization = output_localization.view(num_images,
                                                       self.class_batch_size,
                                                       image_fm_size.h,
                                                       image_fm_size.w, 4)
        # batch_size x label_batch_size x 4 x fm_height x fm_width
        output_localization = output_localization.permute(0, 1, 4, 2, 3)

        output_recognition = (matches_summed - 1.0) / 2.0
        output_recognition_transform_detached = (
            matches_summed_transform_detached - 1.0) / 2.0
        return (output_localization, output_recognition,
                output_recognition_transform_detached, corner_coordinates)
def mine_hard_patches(dataloader, net, cfg, criterion):
    """Mine patches that are hard: classification false positives and negative, localization errors
    At each level of sampled image pyramid, we need to cut out a piece of size appropriate for training
    (levels are defined by cfg.train.mining.num_random_pyramid_scales, cfg.train.mining.num_random_negative_classes)

    Args:
        dataloader - dataloader to use (often the same as the one for training)
        net - the network to use
        cfg - config with all the parameters
        criterion - criterion (usually the same one as used for training)

    Returns:
        hardnegdata_per_imageid (OrderedDict) - mined data, keys are the image ids;
            further used in dataloader.set_hard_negative_data(hardnegdata_per_imageid) when preparing batches
    """
    logger = logging.getLogger("OS2D.mining_hard_patches")
    logger.info("Starting to mine hard patches")
    t_start_mining = time.time()
    net.eval()
    num_batches = len(dataloader)
    hardnegdata_per_imageid = OrderedDict()

    iterator = make_iterator_extract_scores_from_images_batched(dataloader, net, logger,
                                                                image_batch_size=cfg.eval.batch_size,
                                                                is_cuda=cfg.is_cuda,
                                                                num_random_pyramid_scales=cfg.train.mining.num_random_pyramid_scales,
                                                                num_random_negative_labels=cfg.train.mining.num_random_negative_classes)

    boxes = []
    gt_boxes = []
    losses = OrderedDict()

    # loop over all dataset images
    for data in iterator:
        t_item_start = time.time()

        image_id, image_loc_scores_pyramid, image_class_scores_pyramid, \
                    image_pyramid, query_img_sizes, \
                    batch_class_ids, box_reverse_transform_pyramid, image_fm_sizes_p, transform_corners_pyramid \
                = data

        img_size_pyramid = [FeatureMapSize(img=image) for image in image_pyramid]

        gt_boxes_one_image = dataloader.get_image_annotation_for_imageid(image_id)
        gt_boxes.append(gt_boxes_one_image)

        # compute losses
        # change labels to the ones local to the current image
        dataloader.update_box_labels_to_local(gt_boxes_one_image, batch_class_ids)
        num_labels = len(batch_class_ids)

        loc_targets_pyramid, class_targets_pyramid = \
                dataloader.box_coder.encode_pyramid(gt_boxes_one_image, img_size_pyramid, num_labels,
                                                    default_box_transform_pyramid=box_reverse_transform_pyramid)

        # vizualize GT for debug
        if cfg.visualization.mining.show_gt_boxes:
            visualizer.show_gt_boxes(image_id, gt_boxes_one_image, batch_class_ids, dataloader)

        # compute losses
        if cfg.is_cuda:
            loc_targets_pyramid = [loc_targets.cuda() for loc_targets in loc_targets_pyramid]
            class_targets_pyramid = [class_targets.cuda() for class_targets in class_targets_pyramid]

        add_batch_dim = lambda list_of_tensors: [t.unsqueeze(0) for t in list_of_tensors]
        loc_scores_pyramid = add_batch_dim(image_loc_scores_pyramid)
        
        cls_targets_remapped_pyramid = []
        for loc_scores, img_size, box_reverse_transform in zip(loc_scores_pyramid, img_size_pyramid, box_reverse_transform_pyramid):
            # loop over the pyramid levels
            cls_targets_remapped, ious_anchor, ious_anchor_corrected = \
                dataloader.box_coder.remap_anchor_targets(loc_scores, [img_size], query_img_sizes, [gt_boxes_one_image],
                                                          box_reverse_transform=[box_reverse_transform])
            cls_targets_remapped_pyramid.append(cls_targets_remapped)

        losses_iter, losses_per_anchor = criterion(loc_scores_pyramid,
                                                    add_batch_dim(loc_targets_pyramid),
                                                    add_batch_dim(image_class_scores_pyramid),
                                                    add_batch_dim(class_targets_pyramid),
                                                    cls_targets_remapped=cls_targets_remapped_pyramid,
                                                    patch_mining_mode=True)

        if cfg.visualization.mining.show_class_heatmaps:
            visualizer.show_class_heatmaps(image_id, batch_class_ids, image_fm_sizes_p, class_targets_pyramid, image_class_scores_pyramid,
                                            cfg_local=cfg.visualization.mining)

        assert dataloader.data_augmentation is not None, "Can mine hard patches only through data augmentation"
        crop_size = dataloader.data_augmentation.random_crop_size

        # convert to floats
        for l in losses_iter:
            losses_iter[l] = losses_iter[l].mean().item()
        # printing
        print_meters(losses_iter, logger)
        # update logs
        add_to_meters_in_dict(losses_iter, losses)

        # construct crop boxes for all the anchors and NMS them - NMS pos ang neg anchors separately
        query_fm_sizes = [dataloader.box_coder._get_feature_map_size_per_image_size(sz) for sz in query_img_sizes]
        
        crops = []
        achors = []
        labels_of_anchors = []
        pyramid_level_of_anchors = []
        losses_of_anchors = []
        corners_of_anchors = []
        losses_loc_of_anchors = []
        pos_mask_of_anchors = []
        pos_loc_mask_of_anchors = []
        neg_mask_of_anchors = []
        anchor_indices = []
        i_image_in_batch = 0 # only one image comes here
        for i_p, img_size in enumerate(img_size_pyramid):
            for i_label, query_fm_size in enumerate(query_fm_sizes):
                crop_position, anchor_position, anchor_index = \
                    dataloader.box_coder.output_box_grid_generator.get_box_to_cut_anchor(img_size,
                                                                                         crop_size,
                                                                                         image_fm_sizes_p[i_p],
                                                                                         box_reverse_transform_pyramid[i_p])
                cur_corners = transform_corners_pyramid[i_p][i_label].transpose(0,1)
                cur_corners = dataloader.box_coder.apply_transform_to_corners(cur_corners, box_reverse_transform_pyramid[i_p], img_size)
                if cfg.is_cuda:
                    crop_position, anchor_position = crop_position.cuda(), anchor_position.cuda()
                crops.append(crop_position)
                achors.append(anchor_position)
                device = crop_position.bbox_xyxy.device
                losses_of_anchors.append(losses_per_anchor["cls_loss"][i_p][i_image_in_batch, i_label].to(crop_position.bbox_xyxy))
                pos_mask_of_anchors.append(losses_per_anchor["pos_mask"][i_p][i_image_in_batch, i_label].to(device=device))
                neg_mask_of_anchors.append(losses_per_anchor["neg_mask"][i_p][i_image_in_batch, i_label].to(device=device))
                losses_loc_of_anchors.append(losses_per_anchor["loc_loss"][i_p][i_image_in_batch, i_label].to(crop_position.bbox_xyxy))
                pos_loc_mask_of_anchors.append(losses_per_anchor["pos_for_regression"][i_p][i_image_in_batch, i_label].to(device=device))
                corners_of_anchors.append(cur_corners.to(crop_position.bbox_xyxy))

                num_anchors = len(crop_position)
                labels_of_anchors.append(torch.full([num_anchors], i_label, dtype=torch.long))
                pyramid_level_of_anchors.append(torch.full([num_anchors], i_p, dtype=torch.long))
                anchor_indices.append(anchor_index)

        # stack all
        crops = cat_boxlist(crops)
        achors = cat_boxlist(achors)
        labels_of_anchors  = torch.cat(labels_of_anchors, 0)
        pyramid_level_of_anchors = torch.cat(pyramid_level_of_anchors, 0)
        losses_of_anchors = torch.cat(losses_of_anchors, 0)
        losses_loc_of_anchors = torch.cat(losses_loc_of_anchors, 0)
        pos_mask_of_anchors = torch.cat(pos_mask_of_anchors, 0)
        pos_loc_mask_of_anchors = torch.cat(pos_loc_mask_of_anchors, 0)
        neg_mask_of_anchors = torch.cat(neg_mask_of_anchors, 0)
        anchor_indices = torch.cat(anchor_indices, 0)
        corners_of_anchors = torch.cat(corners_of_anchors, 0)

        def nms_masked_and_collect_data(mask, crops_xyxy, scores, nms_iou_threshold_in_mining, max_etries=None):
            mask_ids = torch.nonzero(mask).squeeze(1)
            boxes_selected = copy.deepcopy(crops_xyxy[mask])
            boxes_selected.add_field("scores", scores[mask])
            remaining_boxes = nms(boxes_selected, nms_iou_threshold_in_mining)
            remaining_boxes = mask_ids[remaining_boxes]

            # sort and take the topk, because NMS is not sorting by default
            ids = torch.argsort(scores[remaining_boxes], descending=True)
            if max_etries is not None:
                ids = ids[:max_etries]
            remaining_boxes = remaining_boxes[ids]

            return remaining_boxes

        nms_iou_threshold_in_mining = cfg.train.mining.nms_iou_threshold_in_mining
        num_hard_patches_per_image = cfg.train.mining.num_hard_patches_per_image

        # hard negatives
        hard_negs = nms_masked_and_collect_data(neg_mask_of_anchors, crops, losses_of_anchors,
                                                nms_iou_threshold_in_mining,
                                                num_hard_patches_per_image)

        # hard positives for classification
        hard_pos  = nms_masked_and_collect_data(pos_mask_of_anchors, crops, losses_of_anchors,
                                                nms_iou_threshold_in_mining,
                                                num_hard_patches_per_image)

        # hard positives for localization
        hard_pos_loc  = nms_masked_and_collect_data(pos_loc_mask_of_anchors, crops, losses_loc_of_anchors,
                                                    nms_iou_threshold_in_mining,
                                                    num_hard_patches_per_image)

        # merge all together
        def standardize(v):
            return v.item() if type(v) == torch.Tensor else v
        def add_item(data, role, pyramid_level, label_local, anchor_index, crop_position_xyxy, anchor_position_xyxy, transform_corners):
            new_item = OrderedDict()
            new_item["pyramid_level"] = standardize(pyramid_level)
            new_item["label_local"] = standardize(label_local)
            new_item["anchor_index"] = standardize(anchor_index)
            new_item["role"] = role
            new_item["crop_position_xyxy"] = crop_position_xyxy
            new_item["anchor_position_xyxy"] = anchor_position_xyxy
            new_item["transform_corners"] = transform_corners
            data.append(new_item)

        hardnegdata = []
        for i in hard_negs:
            add_item(hardnegdata, "neg", pyramid_level_of_anchors[i],
                        labels_of_anchors[i], anchor_indices[i],
                        crops[i].cpu(), achors[i].cpu(), corners_of_anchors[i].cpu())
        for i in hard_pos:
            add_item(hardnegdata, "pos", pyramid_level_of_anchors[i],
                        labels_of_anchors[i], anchor_indices[i],
                        crops[i].cpu(), achors[i].cpu(), corners_of_anchors[i].cpu())
        for i in hard_pos_loc:
            add_item(hardnegdata, "pos_loc", pyramid_level_of_anchors[i],
                        labels_of_anchors[i], anchor_indices[i],
                        crops[i].cpu(), achors[i].cpu(), corners_of_anchors[i].cpu())

        # extract loss values and compute the box positions to crop
        for a in hardnegdata:
            a["label_global"] = standardize(batch_class_ids[ a["label_local"] ])
            a["loss"] = standardize(losses_per_anchor["cls_loss"][a["pyramid_level"]][i_image_in_batch, a["label_local"], a["anchor_index"]])
            a["loss_loc"] = standardize(losses_per_anchor["loc_loss"][a["pyramid_level"]][i_image_in_batch, a["label_local"], a["anchor_index"]])
            a["score"] = standardize(image_class_scores_pyramid[a["pyramid_level"]][a["label_local"], a["anchor_index"]])
            a["image_id"] = standardize(image_id)

        hardnegdata_per_imageid[image_id] = hardnegdata

        if cfg.visualization.mining.show_mined_patches:
            visualizer.show_mined_patches(image_id, batch_class_ids, dataloader, hardnegdata)

        logger.info("Item time: {0}, since mining start: {1}".format(time_since(t_item_start), time_since(t_start_mining)))
    logger.info("Hard negative mining finished in {0}".format(time_since(t_start_mining)))
    return hardnegdata_per_imageid
Example #27
0
    def _prepare_batch(self, image_ids, use_all_labels=False):
        """Assemble one training batch (images, class images and targets) for the given image ids.

        If self.hardnegdata_per_imageid is not None, one mined patch is sampled per image
        (hard negatives for the first half of the batch, hard positives for the rest) and passed to
        self._transform_image; the global labels of the sampled patches are always kept among the batch labels.

        Args:
            image_ids - ids of the images to put into the batch
            use_all_labels (bool) - if True, use all classes from self.dataset.get_class_ids(),
                otherwise use the classes annotated in the batch images (randomly pruned to
                self.max_batch_labels if it is set) plus the labels of the selected mined patches
        Output:
            batch_images - images of the batch stacked along dimension 0
            batch_class_images (list) - class images after self._transform_image_gt
            batch_loc_targets - outputs of self.box_coder.encode stacked along dimension 0
            batch_class_targets - outputs of self.box_coder.encode stacked along dimension 0
            class_ids (list) - sorted ids of the classes used in this batch
            class_image_sizes (list of FeatureMapSize) - sizes of the transformed class images
            batch_box_inverse_transform (list) - for each image, a one-element list with its box inverse transform
            batch_boxes (list) - boxes of each image after the transformation, labels are local to class_ids
            batch_img_size (list of FeatureMapSize) - size of each transformed image
        """
        batch_images = []
        batch_loc_targets = []
        batch_class_targets = []

        # flag to use hard neg mining
        use_mined_data = self.hardnegdata_per_imageid is not None
        # select which mined boxes to use
        if use_mined_data:
            # for half of the images select hard positives, for half - hard negatives
            # the order of images in a batch is random, so no need to randomize here
            batch_size = len(image_ids)
            num_neg_patches = batch_size // 2
            role_to_select = ["neg"] * num_neg_patches + ["pos"] * (
                batch_size - num_neg_patches)
            mined_data = {}
            for image_id, role in zip(image_ids, role_to_select):
                all_mined_for_image = self.hardnegdata_per_imageid[image_id]
                # filter for the correct role; prefix match, so role "pos" accepts both "pos" and "pos_loc"
                mined_data_for_image = [
                    d for d in all_mined_for_image
                    if d["role"].startswith(role)
                ]
                if not mined_data_for_image:
                    # nothing mined for this role - fall back to any mined patch of this image
                    mined_data_for_image = all_mined_for_image
                assert len(
                    mined_data_for_image
                ) > 0, "Could not find mined {0} for image {1}".format(
                    role, image_id)
                # select random element
                i_rand = torch.randint(len(mined_data_for_image), (1, ),
                                       dtype=torch.long).item()
                mined_data[image_id] = mined_data_for_image[i_rand]
                # self.logger.info("Image {0}, mined data: {1}".format(image_id,  mined_data[image_id]))

        # collect labels for this batch
        batch_data = self.dataset.get_dataframe_for_image_ids(image_ids)

        if not use_all_labels:
            class_ids = batch_data["classid"].unique()
            # select labels for mined hardnegs
            if use_mined_data:
                # select labels that are compatible with mining
                mined_labels = [
                    mined_data[image_id]["label_global"]
                    for image_id in mined_data
                ]
            else:
                mined_labels = []

            # randomly prune label images if too many
            max_batch_labels = self.max_batch_labels if self.max_batch_labels is not None else class_ids.size + len(
                mined_labels) + 1

            class_ids = np.unique(class_ids)
            np.random.shuffle(class_ids)
            # Mined labels are always kept, so they consume a part of the label budget.
            # Clamp at zero: a negative stop index would not give an empty selection,
            # it would only drop the last few shuffled ids and keep all the others.
            num_random_labels = max(max_batch_labels - len(mined_labels), 0)
            class_ids = class_ids[:num_random_labels]

            class_ids = np.unique(
                np.concatenate((class_ids, np.array(mined_labels).astype(
                    class_ids.dtype)),
                               axis=0))
        else:
            class_ids = self.dataset.get_class_ids()
        class_ids = sorted(list(class_ids))

        # decide on batch level data augmentation
        if self.data_augmentation is not None:
            batch_vflip = random.random(
            ) < 0.5 if self.data_augmentation.batch_random_vflip else False
            batch_hflip = random.random(
            ) < 0.5 if self.data_augmentation.batch_random_hflip else False
        else:
            batch_vflip = False
            batch_hflip = False

        # prepare class images
        num_classes = len(class_ids)
        class_images, class_image_sizes = self.get_class_images_and_sizes(
            class_ids, do_augmentation=True)
        batch_class_images = [
            self._transform_image_gt(img, hflip=batch_hflip, vflip=batch_vflip)
            for img in class_images
        ]
        # get the image sizes after resize in self._transform_image_gt, format - width, height
        class_image_sizes = [
            FeatureMapSize(img=img) for img in batch_class_images
        ]

        # prepare images and boxes
        img_size = None
        batch_box_inverse_transform = []
        batch_boxes = []
        batch_img_size = []
        for image_id in image_ids:
            # get annotation
            boxes = self.get_image_annotation_for_imageid(image_id)

            # convert global indices to local
            # if use_global_labels==False then local indices will be w.r.t. labels in this batch
            # if use_global_labels==True then local indices will be w.r.t. labels in the whole dataset (not class_ids)
            self.update_box_labels_to_local(boxes, class_ids)

            # prepare image and boxes: convert image to tensor, data augmentation: some boxes might be cut off the image
            image_mined_data = None if not use_mined_data else mined_data[
                image_id]
            img, boxes, mask_cutoff_boxes, mask_difficult_boxes, box_inverse_transform = \
                     self._transform_image(image_id, boxes, hflip=batch_hflip, vflip=batch_vflip, mined_data=image_mined_data)

            # mask_difficult_boxes is set True for boxes that are largely chopped off, those are not used for training
            if boxes.has_field("difficult"):
                old_difficult = boxes.get_field("difficult")
                boxes.add_field("difficult",
                                old_difficult | mask_difficult_boxes)
            # boxes cut off by the augmentation get the label -2
            boxes.get_field("labels")[mask_cutoff_boxes] = -2

            # vizualize groundtruth images and boxes - to debug data augmentation
            if self.show_gt_boxes and self.data_augmentation is not None:
                visualizer.show_gt_boxes(image_id,
                                         boxes,
                                         class_ids,
                                         self,
                                         image_to_show=img)

            # check image size in this batch
            if img_size is None:
                img_size = FeatureMapSize(img=img)
            else:
                assert img_size == FeatureMapSize(
                    img=img), "Images in a batch should be of the same size"

            loc_targets, class_targets = self.box_coder.encode(
                boxes, img_size, num_classes)
            batch_loc_targets.append(loc_targets)
            batch_class_targets.append(class_targets)
            batch_images.append(img)
            batch_box_inverse_transform.append([box_inverse_transform])
            batch_boxes.append(boxes)
            batch_img_size.append(img_size)

        # stack data
        batch_images = torch.stack(batch_images, 0)
        batch_loc_targets = torch.stack(batch_loc_targets, 0)
        batch_class_targets = torch.stack(batch_class_targets, 0)

        return batch_images, batch_class_images, batch_loc_targets, batch_class_targets, class_ids, class_image_sizes, \
               batch_box_inverse_transform, batch_boxes, batch_img_size
Example #28
0
def build_train_dataloader_from_config(cfg,
                                       box_coder,
                                       img_normalization,
                                       dataset_train=None,
                                       data_path="",
                                       logger_prefix="OS2D.train"):
    """Create the training dataloader (and optional evaluation subsets of the training set) from the config.
    Args:
        cfg - config object, the training dataset is cfg.train.dataset_name
        box_coder (Os2dBoxCoder)
        img_normalization (dict) - normalization to use, keys "mean" and "std" have lists of 3 floats each
        dataset_train (DatasetOneShotDetection) - ready dataset object; if None, the dataset is created from the config, which requires data_path
        data_path (str) - root path to search for datasets, used only when dataset_train is None
        logger_prefix (str) - prefix to add to the logger outputs
    Output:
        dataloader_train (DataloaderOneShotDetection) - the dataloader for training
        datasets_train_subset_for_eval (list of DatasetOneShotDetection) - subsets of the training set to pass to evaluation dataloaders,
            empty if cfg.eval.train_subset_for_eval_size is not positive
    """
    # build the dataset from the config unless the caller supplied one
    if dataset_train is None:
        assert data_path, "If explicit dataset_train is not provided one needs to provide a data_path to create one"
        dataset_train = build_dataset_by_name(data_path,
                                              cfg.train.dataset_name,
                                              eval_scale=cfg.train.dataset_scale,
                                              cache_images=cfg.train.cache_images,
                                              no_image_reading=not cfg.train.do_training)

    logger = logging.getLogger(logger_prefix + ".dataloader")

    # geometry of the training crops and of the evaluation pyramid
    crop_size = FeatureMapSize(w=cfg.train.augment.train_patch_width,
                               h=cfg.train.augment.train_patch_height)
    scale_factor = dataset_train.eval_scale / dataset_train.image_size
    scaled_pyramid = [
        level * scale_factor for level in cfg.eval.scales_of_image_pyramid
    ]

    # create training dataloader
    dataloader_kwargs = dict(
        dataset=dataset_train,
        box_coder=box_coder,
        batch_size=cfg.train.batch_size,
        class_batch_size=cfg.train.class_batch_size,
        img_normalization=img_normalization,
        random_flip_batches=cfg.train.augment.random_flip_batches,
        random_crop_size=crop_size,
        random_crop_scale=scale_factor,
        jitter_aspect_ratio=cfg.train.augment.jitter_aspect_ratio,
        scale_jitter=cfg.train.augment.scale_jitter,
        min_box_coverage=cfg.train.augment.min_box_coverage,
        random_color_distortion=cfg.train.augment.random_color_distortion,
        random_crop_class_images=cfg.train.augment.random_crop_class_images,
        gt_image_size=cfg.model.class_image_size,
        pyramid_scales_eval=scaled_pyramid,
        do_augmentation=True,
        mine_extra_class_images=cfg.train.augment.mine_extra_class_images,
        show_gt_boxes=cfg.visualization.train.show_gt_boxes_dataloader,
        logger_prefix=logger_prefix,
    )
    dataloader_train = DataloaderOneShotDetection(**dataloader_kwargs)

    # optionally carve out a subset of the training set to monitor training-set performance
    datasets_train_subset_for_eval = []
    subset_size = cfg.eval.train_subset_for_eval_size
    if subset_size > 0:
        logger.info(
            "Creating sub-training set of size {0} for evaluation".format(
                subset_size))
        datasets_train_subset_for_eval.append(
            dataset_train.copy_subset(subset_size))

    return dataloader_train, datasets_train_subset_for_eval
Example #29
0
    def evaluate_detections(self,
                            all_boxes,
                            output_dir,
                            mAP_iou_threshold=0.5):
        """Convert groundtruth (self.roidb) and detections to BoxList objects, run do_voc_evaluation and print the metrics.
        Args:
            all_boxes - detections indexed as all_boxes[i_class][i_image]; each non-empty entry has shape (:, 5):
                the first 4 columns are box coordinates in the xyxy format, the last column is the score
            output_dir - not used by this method
            mAP_iou_threshold (float) - IoU threshold passed to do_voc_evaluation
        Output:
            the "map" entry of the dict returned by do_voc_evaluation
        """
        roidb = self.roidb
        gt_boxes = []
        predictions = []

        for i_image, roi in enumerate(roidb):
            image_size = FeatureMapSize(w=roi["width"], h=roi["height"])

            # groundtruth boxes of this image with their labels and difficult flags
            has_gt = roi["boxes"].size > 0
            roi_gt_boxes = BoxList(roi["boxes"], image_size, mode="xyxy") if has_gt \
                else BoxList.create_empty(image_size)
            gt_labels = torch.as_tensor(roi["gt_classes"], dtype=torch.int32)
            roi_gt_boxes.add_field("labels", gt_labels)
            gt_difficult = torch.as_tensor(roi["gt_ishard"], dtype=torch.int32)
            roi_gt_boxes.add_field("difficult", gt_difficult)
            gt_boxes.append(roi_gt_boxes)

            # detections of this image, collected over all classes
            per_class_detections = []
            for i_class, class_boxes in enumerate(all_boxes):
                assert len(class_boxes) == len(roidb), \
                    "Number of detection for class {0} image{1} ({2}) inconsistent with the length of roidb ({3})".format(i_class, i_image, len(class_boxes), len(roidb))
                boxes = class_boxes[i_image]
                if not len(boxes) > 0:
                    # nothing detected for this class in this image
                    continue
                assert boxes.shape[
                    1] == 5, "Detections should be of shape (:,5), but are {0} for class {1}, image {2}".format(
                        boxes.shape, i_class, i_image)
                scores = boxes[:, -1]
                bbox = BoxList(boxes[:, :4], image_size, mode="xyxy")
                bbox.add_field("scores",
                               torch.as_tensor(scores, dtype=torch.float32))
                bbox.add_field("labels",
                               torch.full(scores.shape, i_class, dtype=torch.int32))
                per_class_detections.append(bbox)

            if per_class_detections:
                roi_detections = cat_boxlist(per_class_detections)
            else:
                # no detections at all: empty BoxList that still carries the expected fields
                roi_detections = BoxList.create_empty(image_size)
                roi_detections.add_field(
                    "scores", torch.zeros((0, ), dtype=torch.float32))
                roi_detections.add_field(
                    "labels", torch.zeros((0, ), dtype=torch.int32))
            predictions.append(roi_detections)

            # debugging hook, disabled by the hard-coded condition
            if False:
                self.visualize_detections(i_image,
                                          gt=roi_gt_boxes,
                                          dets=roi_detections)

        ap_data = do_voc_evaluation(predictions,
                                    gt_boxes,
                                    iou_thresh=mAP_iou_threshold,
                                    use_07_metric=False)

        # report the metrics in a fixed order
        report = (
            ("mAP@{:0.2f}: {:0.4f}", "map"),
            ("mAPw@{:0.2f}: {:0.4f}", "map_weighted"),
            ("recall@{:0.2f}: {:0.4f}", "recall"),
        )
        for template, key in report:
            print(template.format(mAP_iou_threshold, ap_data[key]))

        return ap_data["map"]