def create_and_check_for_image_classification(self, config, pixel_values, labels):
    """Instantiate an image-classification model from `config` and verify its logits shape.

    The number of labels on `config` is overwritten with `self.type_sequence_label_size`
    before the model is built. The model is moved to `torch_device`, switched to eval mode
    and run once on `pixel_values` with `labels`.
    """
    num_classes = self.type_sequence_label_size
    config.num_labels = num_classes

    classifier = ViTForImageClassification(config)
    classifier.to(torch_device)
    classifier.eval()

    output = classifier(pixel_values, labels=labels)

    # One row of logits per sample, one column per class.
    expected_shape = (self.batch_size, num_classes)
    self.parent.assertEqual(output.logits.shape, expected_shape)
def test_inference_image_classification_head(self):
    """Run the pretrained `google/vit-base-patch16-224` classifier on a test image and check its logits.

    Verifies both the logits shape, `(1, 1000)`, and the first three logit values against
    reference numbers (absolute tolerance 1e-4).
    """
    model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224").to(torch_device)

    feature_extractor = self.default_feature_extractor
    image = prepare_img()
    inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)

    # forward pass
    # Inference only: disable autograd so no graph is recorded for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)

    # verify the logits
    expected_shape = torch.Size((1, 1000))
    self.assertEqual(outputs.logits.shape, expected_shape)

    expected_slice = torch.tensor([-0.2744, 0.8215, -0.0836]).to(torch_device)

    self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
def test_inference_image_classification_head(self):
    """Run the pretrained `google/vit-base-patch16-224` classifier on a test image and check its logits.

    Verifies both the logits shape, `(1, 1000)`, and the first three logit values against
    reference numbers (absolute tolerance 1e-4).
    """
    model = ViTForImageClassification.from_pretrained(
        "google/vit-base-patch16-224").to(torch_device)

    feature_extractor = self.default_feature_extractor
    image = prepare_img()
    inputs = feature_extractor(images=image,
                               return_tensors="pt").to(torch_device)

    # forward pass
    # NOTE: calling `model(**inputs)` is currently failing, see
    # https://discuss.pytorch.org/t/runtimeerror-expected-object-of-scalar-type-double-but-got-scalar-type-float-for-argument-2-weight/38961/2
    # so only the pixel values are passed, positionally.
    # Inference only: disable autograd so no graph is recorded for the forward pass.
    with torch.no_grad():
        outputs = model(inputs["pixel_values"])

    # verify the logits
    expected_shape = torch.Size((1, 1000))
    self.assertEqual(outputs.logits.shape, expected_shape)

    expected_slice = torch.tensor([-0.2744, 0.8215,
                                   -0.0836]).to(torch_device)

    self.assertTrue(
        torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our ViT structure.

    Loads the timm model called `vit_name`, derives a matching `ViTConfig` from that name, converts the
    state dict, checks that the converted model reproduces the timm outputs on a test image, and finally
    saves the model together with its feature extractor.

    Args:
        vit_name: Name of the timm checkpoint. Patch size, image size, architecture size and whether the
            checkpoint is ImageNet-21k-only (name ends in "in21k") are all parsed from this string.
        pytorch_dump_folder_path: Directory the converted model and feature extractor are written to.
            Missing parent directories are created.

    Raises:
        AssertionError: If the converted model's outputs do not match the timm model's outputs.
    """

    # define default ViT configuration
    config = ViTConfig()

    # dataset (ImageNet-21k only or also fine-tuned on ImageNet 2012), patch_size and image_size
    # ImageNet-21k-only checkpoints are converted as a base model (classification head removed).
    base_model = vit_name.endswith("in21k")
    if base_model:
        config.patch_size = int(vit_name[-12:-10])
        config.image_size = int(vit_name[-9:-6])
    else:
        config.num_labels = 1000
        repo_id = "datasets/huggingface/label-files"
        filename = "imagenet-1k-id2label.json"
        # Use a context manager so the label file handle is closed deterministically.
        with open(cached_download(hf_hub_url(repo_id, filename)), "r", encoding="utf-8") as label_file:
            id2label = json.load(label_file)
        # JSON object keys are strings; the config expects integer class ids.
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
        config.patch_size = int(vit_name[-6:-4])
        config.image_size = int(vit_name[-3:])

    # size of the architecture
    if "deit" in vit_name:
        if vit_name[9:].startswith("tiny"):
            config.hidden_size = 192
            config.intermediate_size = 768
            config.num_hidden_layers = 12
            config.num_attention_heads = 3
        elif vit_name[9:].startswith("small"):
            config.hidden_size = 384
            config.intermediate_size = 1536
            config.num_hidden_layers = 12
            config.num_attention_heads = 6
        # any other DeiT variant keeps the default configuration
    else:
        if vit_name[4:].startswith("small"):
            config.hidden_size = 768
            config.intermediate_size = 2304
            config.num_hidden_layers = 8
            config.num_attention_heads = 8
        elif vit_name[4:].startswith("base"):
            pass  # the default configuration already describes the base architecture
        elif vit_name[4:].startswith("large"):
            config.hidden_size = 1024
            config.intermediate_size = 4096
            config.num_hidden_layers = 24
            config.num_attention_heads = 16
        elif vit_name[4:].startswith("huge"):
            config.hidden_size = 1280
            config.intermediate_size = 5120
            config.num_hidden_layers = 32
            config.num_attention_heads = 16

    # load original model from timm
    timm_model = timm.create_model(vit_name, pretrained=True)
    timm_model.eval()

    # load state_dict of original model, remove and rename some keys
    state_dict = timm_model.state_dict()
    if base_model:
        remove_classification_head_(state_dict)
    rename_keys = create_rename_keys(config, base_model)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    read_in_q_k_v(state_dict, config, base_model)

    # load HuggingFace model
    if base_model:
        model = ViTModel(config).eval()
    else:
        model = ViTForImageClassification(config).eval()
    model.load_state_dict(state_dict)

    # Check outputs on an image, prepared by ViTFeatureExtractor/DeiTFeatureExtractor
    if "deit" in vit_name:
        feature_extractor = DeiTFeatureExtractor(size=config.image_size)
    else:
        feature_extractor = ViTFeatureExtractor(size=config.image_size)
    encoding = feature_extractor(images=prepare_img(), return_tensors="pt")
    pixel_values = encoding["pixel_values"]

    # The comparison only needs forward values, so skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(pixel_values)
        if base_model:
            expected = timm_model.forward_features(pixel_values)
            actual = outputs.pooler_output
        else:
            expected = timm_model(pixel_values)
            actual = outputs.logits

    assert expected.shape == actual.shape
    assert torch.allclose(expected, actual, atol=1e-3)

    # parents=True so a nested output path does not fail on a missing parent directory.
    Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model {vit_name} to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
import os
import sys
import torch
from transformers import ViTFeatureExtractor, ViTForImageClassification
from PIL import Image
import requests

# Directory that contains this script (symlinks resolved).
BASE_PATH = os.path.dirname(os.path.realpath(__file__))
# The project-local `lpdutils` package is imported from the parent directory,
# so that directory has to be on sys.path before the import below runs.
sys.path.insert(0, os.path.join(BASE_PATH, '..'))
from lpdutils.lpimagedataset import LPImageDataSet

# to choose a different model by image size, patch size, and parameters number, see README
_checkpoint = 'google/vit-large-patch16-224'
# Download cache location; overridable through the "cache_dir" environment variable.
_pretrained_kwargs = {"cache_dir": os.getenv("cache_dir", "../../models")}

feature_extractor = ViTFeatureExtractor.from_pretrained(_checkpoint, **_pretrained_kwargs)
model = ViTForImageClassification.from_pretrained(_checkpoint, **_pretrained_kwargs)

# load local dataset
batch_size = 2
num_workers = 2

_script_dir = os.path.dirname(os.path.abspath(__file__))
_dataset_root = os.path.join(_script_dir, '..', 'data', 'imagenet')
my_dataset = LPImageDataSet(_dataset_root, transform=LPImageDataSet.transform)

_loader_options = dict(
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
    collate_fn=LPImageDataSet.collate_fn,
)
imageloader = torch.utils.data.DataLoader(my_dataset, **_loader_options)
def create_and_check_for_image_classification(self, config, pixel_values, labels):
    """Check the logits shape of the image-classification model for the given and for greyscale inputs.

    First a model built from `config` (with `num_labels` overwritten by
    `self.type_sequence_label_size`) is run on `pixel_values` with `labels`. Then
    `config.num_channels` is set to 1, a fresh model is built, and it is run on a
    random single-channel batch. Both runs must produce logits of shape
    `(batch_size, type_sequence_label_size)`.
    """
    expected_logits_shape = (self.batch_size, self.type_sequence_label_size)

    # inputs as provided by the caller
    config.num_labels = self.type_sequence_label_size
    classifier = ViTForImageClassification(config)
    classifier.to(torch_device)
    classifier.eval()
    output = classifier(pixel_values, labels=labels)
    self.parent.assertEqual(output.logits.shape, expected_logits_shape)

    # test greyscale images
    config.num_channels = 1
    grey_classifier = ViTForImageClassification(config)
    grey_classifier.to(torch_device)
    grey_classifier.eval()

    grey_shape = [self.batch_size, 1, self.image_size, self.image_size]
    grey_pixel_values = floats_tensor(grey_shape)
    grey_output = grey_classifier(grey_pixel_values)
    self.parent.assertEqual(grey_output.logits.shape, expected_logits_shape)
def convert_vit_checkpoint(model_name, pytorch_dump_folder_path, base_model=True):
    """
    Copy/paste/tweak model's weights to our ViT structure.

    Loads the DINO model called `model_name` from torch hub, derives a matching `ViTConfig`, converts the
    state dict, checks that the converted model reproduces the original outputs on a test image, and
    finally saves the model together with its feature extractor.

    Args:
        model_name: Name of the torch-hub entry in `facebookresearch/dino:main`. A name ending in "8"
            selects patch size 8; "dino_vits8" and "dino_vits16" select the small architecture.
        pytorch_dump_folder_path: Directory the converted model and feature extractor are written to.
            Missing parent directories are created.
        base_model: If True, convert to a headless `ViTModel` (no pooling layer); otherwise convert to
            `ViTForImageClassification` with the 1000 ImageNet-1k labels.

    Raises:
        AssertionError: If the converted model's outputs do not match the original model's outputs.
    """

    # define default ViT configuration
    config = ViTConfig()
    # patch_size
    if model_name.endswith("8"):
        config.patch_size = 8
    # set labels if required
    if not base_model:
        config.num_labels = 1000
        repo_id = "datasets/huggingface/label-files"
        filename = "imagenet-1k-id2label.json"
        # Use a context manager so the label file handle is closed deterministically.
        with open(hf_hub_download(repo_id, filename), "r", encoding="utf-8") as label_file:
            id2label = json.load(label_file)
        # JSON object keys are strings; the config expects integer class ids.
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
    # size of the architecture
    if model_name in ["dino_vits8", "dino_vits16"]:
        config.hidden_size = 384
        config.intermediate_size = 1536
        config.num_hidden_layers = 12
        config.num_attention_heads = 6

    # load original model from torch hub
    original_model = torch.hub.load("facebookresearch/dino:main", model_name)
    original_model.eval()

    # load state_dict of original model, remove and rename some keys
    state_dict = original_model.state_dict()
    if base_model:
        remove_classification_head_(state_dict)
    rename_keys = create_rename_keys(config, base_model=base_model)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    read_in_q_k_v(state_dict, config, base_model)

    # load HuggingFace model
    if base_model:
        model = ViTModel(config, add_pooling_layer=False).eval()
    else:
        model = ViTForImageClassification(config).eval()
    model.load_state_dict(state_dict)

    # Check outputs on an image, prepared by ViTFeatureExtractor
    feature_extractor = ViTFeatureExtractor()
    encoding = feature_extractor(images=prepare_img(), return_tensors="pt")
    pixel_values = encoding["pixel_values"]

    # The comparison only needs forward values, so skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(pixel_values)
        original_output = original_model(pixel_values)

    if base_model:
        # The original output is compared against the hidden state at sequence position 0.
        final_hidden_state_cls_token = original_output
        assert torch.allclose(final_hidden_state_cls_token, outputs.last_hidden_state[:, 0, :], atol=1e-1)
    else:
        logits = original_output
        assert logits.shape == outputs.logits.shape
        assert torch.allclose(logits, outputs.logits, atol=1e-3)

    # parents=True so a nested output path does not fail on a missing parent directory.
    Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
# NOTE(review): the next two entries close a list/call opened before this span —
# presumably the training-time `transforms.Compose([...])`; confirm against the preceding lines.
transforms.RandomHorizontalFlip(),
transforms.ToTensor()
])

# Transform without augmentation: only resizes to `input_image_size` and converts to a tensor.
noop_transform = transforms.Compose(
    [transforms.Resize(input_image_size),
     transforms.ToTensor()])

# ### Data loaders
#
# First we specify the pre-trained ViT model we are going to use. The model [`"google/vit-base-patch16-224"`](https://huggingface.co/google/vit-base-patch16-224) is pre-trained on ImageNet-21k (14 million images, 21,843 classes) at resolution 224x224, and fine-tuned on ImageNet 2012 (1 million images, 1,000 classes) at resolution 224x224.
#
# We'll use a pre-trained ViT feature extractor that matches the ViT model to preprocess the input images.

VITMODEL = 'google/vit-base-patch16-224'

# The model is loaded with 43 labels instead of the checkpoint's 1,000 classes;
# `ignore_mismatched_sizes=True` is passed so the differently sized weights do not abort loading
# (presumably only the classification head is affected — verify against the transformers docs).
model = ViTForImageClassification.from_pretrained(
    VITMODEL, num_labels=43, ignore_mismatched_sizes=True).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-5)
feature_extractor = ViTFeatureExtractor.from_pretrained(VITMODEL)

# Then we define a "collator" function. This is just a function passed to the `DataLoader` which will pre-process each batch of data. In our case we will pass the images through the `ViTFeatureExtractor` which will process the images into the correct format for ViT.

class ImageClassificationCollator:
    """Callable that turns a batch of samples into feature-extractor output plus a `labels` tensor.

    Each element of the batch is indexed as `sample[0]` (passed to the feature extractor)
    and `sample[1]` (collected into an int64 tensor stored under the key 'labels').
    """

    def __init__(self, feature_extractor):
        # Feature extractor applied to every batch in __call__.
        self.feature_extractor = feature_extractor

    def __call__(self, batch):
        # The comprehension variable `x` is scoped to the comprehension, so it does not
        # interfere with the outer `x` that receives the feature extractor's result.
        x = self.feature_extractor([x[0] for x in batch], return_tensors='pt')
        x['labels'] = torch.tensor([x[1] for x in batch], dtype=torch.int64)
        return x