    def test_call_pytorch_with_coco_detection_annotations(self):
        # prepare image and target
        image = Image.open(
            "./tests/fixtures/tests_samples/COCO/000000039769.png")
        with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt",
                  "r") as f:
            target = json.loads(f.read())

        target = {"image_id": 39769, "annotations": target}

        # encode them
        feature_extractor = DetrFeatureExtractor.from_pretrained(
            "facebook/detr-resnet-50")
        encoding = feature_extractor(images=image,
                                     annotations=target,
                                     return_tensors="pt")

        # verify pixel values
        expected_shape = torch.Size([1, 3, 800, 1066])
        self.assertEqual(encoding["pixel_values"].shape, expected_shape)

        expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
        assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3],
                              expected_slice,
                              atol=1e-4)

        # verify area
        expected_area = torch.tensor([
            5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156,
            165732.3438
        ])
        assert torch.allclose(encoding["labels"][0]["area"], expected_area)
        # verify boxes
        expected_boxes_shape = torch.Size([6, 4])
        self.assertEqual(encoding["labels"][0]["boxes"].shape,
                         expected_boxes_shape)
        expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215])
        assert torch.allclose(encoding["labels"][0]["boxes"][0],
                              expected_boxes_slice,
                              atol=1e-3)
        # verify image_id
        expected_image_id = torch.tensor([39769])
        assert torch.allclose(encoding["labels"][0]["image_id"],
                              expected_image_id)
        # verify is_crowd
        expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
        assert torch.allclose(encoding["labels"][0]["iscrowd"],
                              expected_is_crowd)
        # verify class_labels
        expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17])
        assert torch.allclose(encoding["labels"][0]["class_labels"],
                              expected_class_labels)
        # verify orig_size
        expected_orig_size = torch.tensor([480, 640])
        assert torch.allclose(encoding["labels"][0]["orig_size"],
                              expected_orig_size)
        # verify size
        expected_size = torch.tensor([800, 1066])
        assert torch.allclose(encoding["labels"][0]["size"], expected_size)

    def test_call_pytorch_with_coco_panoptic_annotations(self):
        # prepare image, target and masks_path
        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
        with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
            target = json.loads(f.read())

        target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}

        masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")

        # encode them
        # TODO replace by .from_pretrained facebook/detr-resnet-50-panoptic
        feature_extractor = DetrFeatureExtractor(format="coco_panoptic")
        encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt")

        # verify pixel values
        expected_shape = torch.Size([1, 3, 800, 1066])
        self.assertEqual(encoding["pixel_values"].shape, expected_shape)

        expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
        assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)

        # verify area
        expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147])
        assert torch.allclose(encoding["labels"][0]["area"], expected_area)
        # verify boxes
        expected_boxes_shape = torch.Size([6, 4])
        self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
        expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625])
        assert torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)
        # verify image_id
        expected_image_id = torch.tensor([39769])
        assert torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)
        # verify is_crowd
        expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
        assert torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)
        # verify class_labels
        expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
        assert torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)
        # verify masks
        expected_masks_sum = 822338
        self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum)
        # verify orig_size
        expected_orig_size = torch.tensor([480, 640])
        assert torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)
        # verify size
        expected_size = torch.tensor([800, 1066])
        assert torch.allclose(encoding["labels"][0]["size"], expected_size)
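
The box assertions in both tests depend on the target format: DetrFeatureExtractor converts COCO's absolute [x_min, y_min, width, height] boxes into [center_x, center_y, width, height] coordinates normalized to [0, 1]. A minimal sketch of that conversion (the helper name and the sample box are hypothetical, chosen to land near expected_boxes_slice for the 640x480 test image):

import torch

def coco_box_to_normalized_cxcywh(box, img_w, img_h):
    # COCO stores [x_min, y_min, width, height] in absolute pixels; DETR
    # targets use [center_x, center_y, width, height] scaled to [0, 1]
    x, y, w, h = box
    return torch.tensor([(x + w / 2) / img_w, (y + h / 2) / img_h, w / img_w, h / img_h])

print(coco_box_to_normalized_cxcywh([333.0, 79.6, 38.7, 106.3], 640, 480))
# tensor([0.5505, 0.2766, 0.0605, 0.2215])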
def convert_detr_checkpoint(model_name, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our DETR structure.
    """

    # load default config
    config = DetrConfig()
    # set backbone and dilation attributes
    if "resnet101" in model_name:
        config.backbone = "resnet101"
    if "dc5" in model_name:
        config.dilation = True
    is_panoptic = "panoptic" in model_name
    if is_panoptic:
        config.num_labels = 250
    else:
        config.num_labels = 91
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}

    # load feature extractor
    format = "coco_panoptic" if is_panoptic else "coco_detection"
    feature_extractor = DetrFeatureExtractor(format=format)

    # prepare image
    img = prepare_img()
    encoding = feature_extractor(images=img, return_tensors="pt")
    pixel_values = encoding["pixel_values"]

    logger.info(f"Converting model {model_name}...")

    # load original model from torch hub
    detr = torch.hub.load("facebookresearch/detr", model_name, pretrained=True).eval()
    state_dict = detr.state_dict()
    # rename keys
    for src, dest in rename_keys:
        if is_panoptic:
            src = "detr." + src
        rename_key(state_dict, src, dest)
    state_dict = rename_backbone_keys(state_dict)
    # query, key and value matrices need special treatment
    read_in_q_k_v(state_dict, is_panoptic=is_panoptic)
    # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them
    prefix = "detr.model." if is_panoptic else "model."
    for key in state_dict.copy().keys():
        if is_panoptic:
            if (
                key.startswith("detr")
                and not key.startswith("class_labels_classifier")
                and not key.startswith("bbox_predictor")
            ):
                val = state_dict.pop(key)
                state_dict["detr.model" + key[4:]] = val
            elif "class_labels_classifier" in key or "bbox_predictor" in key:
                val = state_dict.pop(key)
                state_dict["detr." + key] = val
            elif key.startswith("bbox_attention") or key.startswith("mask_head"):
                continue
            else:
                val = state_dict.pop(key)
                state_dict[prefix + key] = val
        else:
            if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"):
                val = state_dict.pop(key)
                state_dict[prefix + key] = val
    # finally, create HuggingFace model and load state dict
    model = DetrForSegmentation(config) if is_panoptic else DetrForObjectDetection(config)
    model.load_state_dict(state_dict)
    model.eval()
    # verify our conversion
    original_outputs = detr(pixel_values)
    outputs = model(pixel_values)
    assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4)
    assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4)
    if is_panoptic:
        assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4)

    # Save model and feature extractor
    logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...")
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
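
In the full conversion script, convert_detr_checkpoint is typically driven from the command line. A minimal argparse wrapper along these lines would do (argument names mirror the function's parameters; the default model name is an assumption):

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", default="detr_resnet50", type=str,
                        help="torch.hub identifier of the DETR model to convert.")
    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str,
                        help="Folder to write the converted model and feature extractor to.")
    args = parser.parse_args()
    convert_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path)
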
    def default_feature_extractor(self):
        return DetrFeatureExtractor.from_pretrained(
            "facebook/detr-resnet-50") if is_vision_available() else None
Example #5
def plot_save(pil_img, prob, boxes, image_name):
    # crop and save every high-confidence 'person' detection;
    # `model` and `save_path` are assumed to be defined at module level
    for i, (p, (xmin, ymin, xmax, ymax)) in enumerate(zip(prob, boxes.tolist())):
        cl = p.argmax()
        if model.config.id2label[cl.item()] == 'person':
            if float(p[cl]) > 0.95:
                im = pil_img.crop((xmin, ymin, xmax, ymax))
                im.save(save_path + str(i) + '_' + image_name + '.jpg')


for image_path in image_list:
    image_name = image_path.split('/')[-1]
    image = Image.open(image_path)
    #url = "http://images.cocodataset.org/train2014/COCO_train2014_000000384029.jpg"
    #image = Image. open( requests. get( url, stream = True). raw)
    #
    # note: the feature extractor and model are reloaded on every image here;
    # creating them once before the loop would be faster
    feature_extractor = DetrFeatureExtractor.from_pretrained(
        'facebook/detr-resnet-50')
    model = DetrForObjectDetection.from_pretrained('facebook/detr-resnet-50')

    inputs = feature_extractor(images=image, return_tensors="pt")
    outputs = model(**inputs)

    # draw the class names and bounding boxes

    # colors for visualization
    COLORS = [[0.000, 0.447, 0.741], [0.850, 0.325, 0.098],
              [0.929, 0.694, 0.125], [0.494, 0.184, 0.556],
              [0.466, 0.674, 0.188], [0.301, 0.745, 0.933]]

    # keep only predictions of queries with 0.9+ confidence (excluding no-object class)
    probas = outputs.logits.softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > 0.9
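
    # A possible continuation (a sketch; torch is assumed to be imported):
    # undo DETR's box normalization, turning (cx, cy, w, h) in [0, 1] into
    # absolute (xmin, ymin, xmax, ymax) pixel corners, then crop and save
    # the detections with plot_save defined above
    img_w, img_h = image.size
    cx, cy, w, h = outputs.pred_boxes[0, keep].unbind(-1)
    boxes = torch.stack([cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h], dim=-1)
    boxes = boxes * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
    plot_save(image, probas[keep], boxes, image_name)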