def test_inference_image_segmentation_city(self): # only resize + normalize feature_extractor = SegformerFeatureExtractor(image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False) model = SegformerForSemanticSegmentation.from_pretrained( "nvidia/segformer-b1-finetuned-cityscapes-1024-1024").to( torch_device) image = prepare_img() encoded_inputs = feature_extractor(images=image, return_tensors="pt") pixel_values = encoded_inputs.pixel_values.to(torch_device) with torch.no_grad(): outputs = model(pixel_values) expected_shape = torch.Size((1, model.config.num_labels, 128, 128)) self.assertEqual(outputs.logits.shape, expected_shape) expected_slice = torch.tensor([ [[-13.5748, -13.9111, -12.6500], [-14.3500, -15.3683, -14.2328], [-14.7532, -16.0424, -15.6087]], [[-17.1651, -15.8725, -12.9653], [-17.2580, -17.3718, -14.8223], [-16.6058, -16.8783, -16.7452]], [[-3.6456, -3.0209, -1.4203], [-3.0797, -3.1959, -2.0000], [-1.8757, -1.9217, -1.6997]], ]).to(torch_device) self.assertTrue( torch.allclose(outputs.logits[0, :3, :3, :3], expected_slice, atol=1e-1))
def test_inference_image_segmentation_city(self): # only resize + normalize feature_extractor = SegformerFeatureExtractor(image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False) model = SegformerForSemanticSegmentation.from_pretrained( "nvidia/segformer-b1-finetuned-cityscapes-1024-1024").to( torch_device) image = prepare_img() encoded_inputs = feature_extractor(images=image, return_tensors="pt") pixel_values = encoded_inputs.pixel_values.to(torch_device) with torch.no_grad(): outputs = model(pixel_values) expected_shape = torch.Size((1, model.config.num_labels, 512, 512)) self.assertEqual(outputs.logits.shape, expected_shape) expected_slice = torch.tensor([ [[-13.5729, -13.5729, -13.6149], [-13.5729, -13.5729, -13.6149], [-13.6697, -13.6697, -13.7224]], [[-17.1638, -17.1638, -17.0022], [-17.1638, -17.1638, -17.0022], [-17.1754, -17.1754, -17.0358]], [[-3.6452, -3.6452, -3.5670], [-3.6452, -3.6452, -3.5670], [-3.5744, -3.5744, -3.5079]], ]).to(torch_device) self.assertTrue( torch.allclose(outputs.logits[0, :3, :3, :3], expected_slice, atol=1e-1))
def test_inference_image_segmentation_ade(self): # only resize + normalize feature_extractor = SegformerFeatureExtractor(image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False) model = SegformerForSemanticSegmentation.from_pretrained( "nvidia/segformer-b0-finetuned-ade-512-512").to(torch_device) image = prepare_img() encoded_inputs = feature_extractor(images=image, return_tensors="pt") pixel_values = encoded_inputs.pixel_values.to(torch_device) with torch.no_grad(): outputs = model(pixel_values) expected_shape = torch.Size((1, model.config.num_labels, 128, 128)) self.assertEqual(outputs.logits.shape, expected_shape) expected_slice = torch.tensor([ [[-4.6310, -5.5232, -6.2356], [-5.1921, -6.1444, -6.5996], [-5.4424, -6.2790, -6.7574]], [[-12.1391, -13.3122, -13.9554], [-12.8732, -13.9352, -14.3563], [-12.9438, -13.8226, -14.2513]], [[-12.5134, -13.4686, -14.4915], [-12.8669, -14.4343, -14.7758], [-13.2523, -14.5819, -15.0694]], ]).to(torch_device) self.assertTrue( torch.allclose(outputs.logits[0, :3, :3, :3], expected_slice, atol=1e-4))
def test_pad(self): # Initialize feature_extractor (note that padding should only be applied when random cropping) feature_extractor = SegformerFeatureExtractor( align=False, do_random_crop=True, crop_size=self.feature_extract_tester.crop_size, do_pad=True ) # create random PyTorch tensors image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) for image in image_inputs: self.assertIsInstance(image, torch.Tensor) # Test not batched input encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values self.assertEqual( encoded_images.shape, ( 1, self.feature_extract_tester.num_channels, *self.feature_extract_tester.crop_size[::-1], ), ) # Test batched encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values self.assertEqual( encoded_images.shape, ( self.feature_extract_tester.batch_size, self.feature_extract_tester.num_channels, *self.feature_extract_tester.crop_size[::-1], ), )
def test_inference_image_segmentation_ade(self): # only resize + normalize feature_extractor = SegformerFeatureExtractor(image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False) model = SegformerForSemanticSegmentation.from_pretrained( "nvidia/segformer-b0-finetuned-ade-512-512").to(torch_device) image = prepare_img() encoded_inputs = feature_extractor(images=image, return_tensors="pt") pixel_values = encoded_inputs.pixel_values.to(torch_device) with torch.no_grad(): outputs = model(pixel_values) expected_shape = torch.Size((1, model.config.num_labels, 512, 512)) self.assertEqual(outputs.logits.shape, expected_shape) expected_slice = torch.tensor([ [[-4.6309, -4.6309, -4.7425], [-4.6309, -4.6309, -4.7425], [-4.7011, -4.7011, -4.8136]], [[-12.1391, -12.1391, -12.2858], [-12.1391, -12.1391, -12.2858], [-12.2309, -12.2309, -12.3758]], [[-12.5134, -12.5134, -12.6328], [-12.5134, -12.5134, -12.6328], [-12.5576, -12.5576, -12.6865]], ]).to(torch_device) self.assertTrue( torch.allclose(outputs.logits[0, :3, :3, :3], expected_slice, atol=1e-4))
def test_aligned_resize(self): # Initialize feature_extractor: version 1 feature_extractor = SegformerFeatureExtractor(do_random_crop=False, do_pad=False) # Create random PyTorch tensor image = torch.randn((3, 256, 304)) # Verify shape encoded_images = feature_extractor(image, return_tensors="pt").pixel_values expected_shape = (1, 3, 512, 608) self.assertEqual(encoded_images.shape, expected_shape) # Initialize feature_extractor: version 2 feature_extractor = SegformerFeatureExtractor(image_scale=(1024, 2048), do_random_crop=False, do_pad=False) # create random PyTorch tensor image = torch.randn((3, 1024, 2048)) # Verify shape encoded_images = feature_extractor(image, return_tensors="pt").pixel_values expected_shape = (1, 3, 1024, 2048) self.assertEqual(encoded_images.shape, expected_shape)
def test_resize(self): # Initialize feature_extractor: version 1 (no align, keep_ratio=True) feature_extractor = SegformerFeatureExtractor( image_scale=(1333, 800), align=False, do_random_crop=False, do_pad=False ) # Create random PyTorch tensor image = torch.randn((3, 288, 512)) # Verify shape encoded_images = feature_extractor(image, return_tensors="pt").pixel_values expected_shape = (1, 3, 750, 1333) self.assertEqual(encoded_images.shape, expected_shape) # Initialize feature_extractor: version 2 (keep_ratio=False) feature_extractor = SegformerFeatureExtractor( image_scale=(1280, 800), align=False, keep_ratio=False, do_random_crop=False, do_pad=False ) # Verify shape encoded_images = feature_extractor(image, return_tensors="pt").pixel_values expected_shape = (1, 3, 800, 1280) self.assertEqual(encoded_images.shape, expected_shape)
def test_random_crop(self): from datasets import load_dataset ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test") image = Image.open(ds[0]["file"]) segmentation_map = Image.open(ds[1]["file"]) w, h = image.size # Initialize feature_extractor feature_extractor = SegformerFeatureExtractor(crop_size=[w - 20, h - 20], do_pad=False) # Encode image + segmentation map encoded_images = feature_extractor(images=image, segmentation_maps=segmentation_map, return_tensors="pt") # Verify shape of pixel_values self.assertEqual(encoded_images.pixel_values.shape[-2:], (h - 20, w - 20)) # Verify shape of labels self.assertEqual(encoded_images.labels.shape[-2:], (h - 20, w - 20))
def __init__( self, device: torch.device, model_name: str = "nvidia/segformer-b3-finetuned-cityscapes-1024-1024", ): """ INIT :param torch.device device: - The device to use. :param str model_name: - The name of the model to use (https://huggingface.co/models) """ print(f"Loading feature extractor for {model_name}") self.feature_extractor = SegformerFeatureExtractor.from_pretrained( model_name) print(f"Loading segmentation model for {model_name}") self.model = SegformerForSemanticSegmentation.from_pretrained( model_name) self.device = device self.model = self.model.to(device=self.device) self.image_transforms = transforms.Compose( [ToTensor(), MergeImages(), SequenceResize()])
def convert_segformer_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path): """ Copy/paste/tweak model's weights to our SegFormer structure. """ # load default SegFormer configuration config = SegformerConfig() encoder_only = False # set attributes based on model_name repo_id = "datasets/huggingface/label-files" if "segformer" in model_name: size = model_name[len("segformer.") : len("segformer.") + 2] if "ade" in model_name: config.num_labels = 150 filename = "ade20k-id2label.json" expected_shape = (1, 150, 128, 128) elif "city" in model_name: config.num_labels = 19 filename = "cityscapes-id2label.json" expected_shape = (1, 19, 128, 128) else: raise ValueError(f"Model {model_name} not supported") elif "mit" in model_name: encoder_only = True size = model_name[4:6] config.num_labels = 1000 filename = "imagenet-1k-id2label.json" expected_shape = (1, 1000) else: raise ValueError(f"Model {model_name} not supported") # set config attributes id2label = json.load(open(hf_hub_download(repo_id, filename), "r")) id2label = {int(k): v for k, v in id2label.items()} config.id2label = id2label config.label2id = {v: k for k, v in id2label.items()} if size == "b0": pass elif size == "b1": config.hidden_sizes = [64, 128, 320, 512] config.decoder_hidden_size = 256 elif size == "b2": config.hidden_sizes = [64, 128, 320, 512] config.decoder_hidden_size = 768 config.depths = [3, 4, 6, 3] elif size == "b3": config.hidden_sizes = [64, 128, 320, 512] config.decoder_hidden_size = 768 config.depths = [3, 4, 18, 3] elif size == "b4": config.hidden_sizes = [64, 128, 320, 512] config.decoder_hidden_size = 768 config.depths = [3, 8, 27, 3] elif size == "b5": config.hidden_sizes = [64, 128, 320, 512] config.decoder_hidden_size = 768 config.depths = [3, 6, 40, 3] else: raise ValueError(f"Size {size} not supported") # load feature extractor (only resize + normalize) feature_extractor = SegformerFeatureExtractor( image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False ) # prepare image image = prepare_img() pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values logger.info(f"Converting model {model_name}...") # load original state dict if encoder_only: state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu")) else: state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"))["state_dict"] # rename keys state_dict = rename_keys(state_dict, encoder_only=encoder_only) if not encoder_only: del state_dict["decode_head.conv_seg.weight"] del state_dict["decode_head.conv_seg.bias"] # key and value matrices need special treatment read_in_k_v(state_dict, config) # create HuggingFace model and load state dict if encoder_only: config.reshape_last_stage = False model = SegformerForImageClassification(config) else: model = SegformerForSemanticSegmentation(config) model.load_state_dict(state_dict) model.eval() # forward pass outputs = model(pixel_values) logits = outputs.logits # set expected_slice based on model name # ADE20k checkpoints if model_name == "segformer.b0.512x512.ade.160k": expected_slice = torch.tensor( [ [[-4.6310, -5.5232, -6.2356], [-5.1921, -6.1444, -6.5996], [-5.4424, -6.2790, -6.7574]], [[-12.1391, -13.3122, -13.9554], [-12.8732, -13.9352, -14.3563], [-12.9438, -13.8226, -14.2513]], [[-12.5134, -13.4686, -14.4915], [-12.8669, -14.4343, -14.7758], [-13.2523, -14.5819, -15.0694]], ] ) elif model_name == "segformer.b1.512x512.ade.160k": expected_slice = torch.tensor( [ [[-7.5820, -8.7231, -8.3215], [-8.0600, -10.3529, -10.0304], [-7.5208, -9.4103, -9.6239]], [[-12.6918, -13.8994, -13.7137], [-13.3196, -15.7523, -15.4789], [-12.9343, -14.8757, -14.9689]], [[-11.1911, -11.9421, -11.3243], [-11.3342, -13.6839, -13.3581], [-10.3909, -12.1832, -12.4858]], ] ) elif model_name == "segformer.b2.512x512.ade.160k": expected_slice = torch.tensor( [ [[-11.8173, -14.3850, -16.3128], [-14.5648, -16.5804, -18.6568], [-14.7223, -15.7387, -18.4218]], [[-15.7290, -17.9171, -19.4423], [-18.3105, -19.9448, -21.4661], [-17.9296, -18.6497, -20.7910]], [[-15.0783, -17.0336, -18.2789], [-16.8771, -18.6870, -20.1612], [-16.2454, -17.1426, -19.5055]], ] ) elif model_name == "segformer.b3.512x512.ade.160k": expected_slice = torch.tensor( [ [[-9.0878, -10.2081, -10.1891], [-9.3144, -10.7941, -10.9843], [-9.2294, -10.3855, -10.5704]], [[-12.2316, -13.9068, -13.6102], [-12.9161, -14.3702, -14.3235], [-12.5233, -13.7174, -13.7932]], [[-14.6275, -15.2490, -14.9727], [-14.3400, -15.9687, -16.2827], [-14.1484, -15.4033, -15.8937]], ] ) elif model_name == "segformer.b4.512x512.ade.160k": expected_slice = torch.tensor( [ [[-12.3144, -13.2447, -14.0802], [-13.3614, -14.5816, -15.6117], [-13.3340, -14.4433, -16.2219]], [[-19.2781, -20.4128, -20.7506], [-20.6153, -21.6566, -22.0998], [-19.9800, -21.0430, -22.1494]], [[-18.8739, -19.7804, -21.1834], [-20.1233, -21.6765, -23.2944], [-20.0315, -21.2641, -23.6944]], ] ) elif model_name == "segformer.b5.640x640.ade.160k": expected_slice = torch.tensor( [ [[-9.5524, -12.0835, -11.7348], [-10.5229, -13.6446, -14.5662], [-9.5842, -12.8851, -13.9414]], [[-15.3432, -17.5323, -17.0818], [-16.3330, -18.9255, -19.2101], [-15.1340, -17.7848, -18.3971]], [[-12.6072, -14.9486, -14.6631], [-13.7629, -17.0907, -17.7745], [-12.7899, -16.1695, -17.1671]], ] ) # Cityscapes checkpoints elif model_name == "segformer.b0.1024x1024.city.160k": expected_slice = torch.tensor( [ [[-11.9295, -13.4057, -14.8106], [-13.3431, -14.8179, -15.3781], [-14.2836, -15.5942, -16.1588]], [[-11.4906, -12.8067, -13.6564], [-13.1189, -14.0500, -14.1543], [-13.8748, -14.5136, -14.8789]], [[0.5374, 0.1067, -0.4742], [0.1141, -0.2255, -0.7099], [-0.3000, -0.5924, -1.3105]], ] ) elif model_name == "segformer.b0.512x1024.city.160k": expected_slice = torch.tensor( [ [[-7.8217, -9.8767, -10.1717], [-9.4438, -10.9058, -11.4047], [-9.7939, -12.3495, -12.1079]], [[-7.1514, -9.5336, -10.0860], [-9.7776, -11.6822, -11.8439], [-10.1411, -12.7655, -12.8972]], [[0.3021, 0.0805, -0.2310], [-0.0328, -0.1605, -0.2714], [-0.1408, -0.5477, -0.6976]], ] ) elif model_name == "segformer.b0.640x1280.city.160k": expected_slice = torch.tensor( [ [ [-1.1372e01, -1.2787e01, -1.3477e01], [-1.2536e01, -1.4194e01, -1.4409e01], [-1.3217e01, -1.4888e01, -1.5327e01], ], [ [-1.4791e01, -1.7122e01, -1.8277e01], [-1.7163e01, -1.9192e01, -1.9533e01], [-1.7897e01, -1.9991e01, -2.0315e01], ], [ [7.6723e-01, 4.1921e-01, -7.7878e-02], [4.7772e-01, 9.5557e-03, -2.8082e-01], [3.6032e-01, -2.4826e-01, -5.1168e-01], ], ] ) elif model_name == "segformer.b0.768x768.city.160k": expected_slice = torch.tensor( [ [[-9.4959, -11.3087, -11.7479], [-11.0025, -12.6540, -12.3319], [-11.4064, -13.0487, -12.9905]], [[-9.8905, -11.3084, -12.0854], [-11.1726, -12.7698, -12.9583], [-11.5985, -13.3278, -14.1774]], [[0.2213, 0.0192, -0.2466], [-0.1731, -0.4213, -0.4874], [-0.3126, -0.6541, -1.1389]], ] ) elif model_name == "segformer.b1.1024x1024.city.160k": expected_slice = torch.tensor( [ [[-13.5748, -13.9111, -12.6500], [-14.3500, -15.3683, -14.2328], [-14.7532, -16.0424, -15.6087]], [[-17.1651, -15.8725, -12.9653], [-17.2580, -17.3718, -14.8223], [-16.6058, -16.8783, -16.7452]], [[-3.6456, -3.0209, -1.4203], [-3.0797, -3.1959, -2.0000], [-1.8757, -1.9217, -1.6997]], ] ) elif model_name == "segformer.b2.1024x1024.city.160k": expected_slice = torch.tensor( [ [[-16.0976, -16.4856, -17.3962], [-16.6234, -19.0342, -19.7685], [-16.0900, -18.0661, -19.1180]], [[-18.4750, -18.8488, -19.5074], [-19.4030, -22.1570, -22.5977], [-19.1191, -20.8486, -22.3783]], [[-4.5178, -5.5037, -6.5109], [-5.0884, -7.2174, -8.0334], [-4.4156, -5.8117, -7.2970]], ] ) elif model_name == "segformer.b3.1024x1024.city.160k": expected_slice = torch.tensor( [ [[-14.2081, -14.4732, -14.1977], [-14.5867, -16.4423, -16.6356], [-13.4441, -14.9685, -16.8696]], [[-14.4576, -14.7073, -15.0451], [-15.0816, -17.6237, -17.9873], [-14.4213, -16.0199, -18.5992]], [[-4.7349, -4.9588, -5.0966], [-4.3210, -6.9325, -7.2591], [-3.4312, -4.7484, -7.1917]], ] ) elif model_name == "segformer.b4.1024x1024.city.160k": expected_slice = torch.tensor( [ [[-11.7737, -11.9526, -11.3273], [-13.6692, -14.4574, -13.8878], [-13.8937, -14.6924, -15.9345]], [[-14.6706, -14.5330, -14.1306], [-16.1502, -16.8180, -16.4269], [-16.8338, -17.8939, -20.1746]], [[1.0491, 0.8289, 1.0310], [1.1044, 0.5219, 0.8055], [1.0899, 0.6926, 0.5590]], ] ) elif model_name == "segformer.b5.1024x1024.city.160k": expected_slice = torch.tensor( [ [[-12.5641, -13.4777, -13.0684], [-13.9587, -15.8983, -16.6557], [-13.3109, -15.7350, -16.3141]], [[-14.7074, -15.4352, -14.5944], [-16.6353, -18.1663, -18.6120], [-15.1702, -18.0329, -18.1547]], [[-1.7990, -2.0951, -1.7784], [-2.6397, -3.8245, -3.9686], [-1.5264, -2.8126, -2.9316]], ] ) else: predicted_class_idx = logits.argmax(-1).item() print("Predicted class:", model.config.id2label[predicted_class_idx]) # verify logits if not encoder_only: assert logits.shape == expected_shape assert torch.allclose(logits[0, :3, :3, :3], expected_slice, atol=1e-2) # finally, save model and feature extractor logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...") Path(pytorch_dump_folder_path).mkdir(exist_ok=True) model.save_pretrained(pytorch_dump_folder_path) feature_extractor.save_pretrained(pytorch_dump_folder_path)