    def create_and_check_for_image_classification(self, config, pixel_values, labels):
        config.num_labels = self.type_sequence_label_size
        model = ViTForImageClassification(config)
        model.to(torch_device)
        model.eval()
        result = model(pixel_values, labels=labels)
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))

    def test_inference_image_classification_head(self):
        model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224").to(torch_device)

        feature_extractor = self.default_feature_extractor
        image = prepare_img()
        inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)

        # forward pass
        outputs = model(**inputs)

        # verify the logits
        expected_shape = torch.Size((1, 1000))
        self.assertEqual(outputs.logits.shape, expected_shape)

        expected_slice = torch.tensor([-0.2744, 0.8215, -0.0836]).to(torch_device)

        self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
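The integration test above relies on helpers that are not shown in this snippet: prepare_img() and self.default_feature_extractor. A minimal sketch of what they typically look like in the transformers test suite (the fixture path and the test class name here are assumptions):

import unittest
from PIL import Image
from transformers import ViTFeatureExtractor

def prepare_img():
    # a fixed test image keeps the expected logits reproducible
    return Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")

class ViTModelIntegrationTest(unittest.TestCase):
    @property
    def default_feature_extractor(self):
        return ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")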
Example #3
    def test_inference_image_classification_head(self):
        model = ViTForImageClassification.from_pretrained(
            "google/vit-base-patch16-224").to(torch_device)

        feature_extractor = self.default_feature_extractor
        image = prepare_img()
        inputs = feature_extractor(images=image,
                                   return_tensors="pt").to(torch_device)

        # forward pass
        # currently failing
        # see https://discuss.pytorch.org/t/runtimeerror-expected-object-of-scalar-type-double-but-got-scalar-type-float-for-argument-2-weight/38961/2
        outputs = model(inputs["pixel_values"])
        # outputs = model(**inputs)

        # verify the logits
        expected_shape = torch.Size((1, 1000))
        self.assertEqual(outputs.logits.shape, expected_shape)

        expected_slice = torch.tensor([-0.2744, 0.8215,
                                       -0.0836]).to(torch_device)

        self.assertTrue(
            torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
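The thread linked above attributes the RuntimeError to a float64/float32 dtype mismatch. If the unpacked call model(**inputs) fails in your environment, a common workaround (not part of the original test) is to cast the pixel values to float32 explicitly before the forward pass:

pixel_values = inputs["pixel_values"].to(torch_device, dtype=torch.float32)
with torch.no_grad():
    outputs = model(pixel_values=pixel_values)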
def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our ViT structure.
    """

    # define default ViT configuration
    config = ViTConfig()
    base_model = False
    # dataset (ImageNet-21k only or also fine-tuned on ImageNet 2012), patch_size and image_size
    if vit_name[-5:] == "in21k":
        base_model = True
        config.patch_size = int(vit_name[-12:-10])
        config.image_size = int(vit_name[-9:-6])
    else:
        config.num_labels = 1000
        repo_id = "datasets/huggingface/label-files"
        filename = "imagenet-1k-id2label.json"
        id2label = json.load(
            open(cached_download(hf_hub_url(repo_id, filename)), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
        config.patch_size = int(vit_name[-6:-4])
        config.image_size = int(vit_name[-3:])
    # size of the architecture
    if "deit" in vit_name:
        if vit_name[9:].startswith("tiny"):
            config.hidden_size = 192
            config.intermediate_size = 768
            config.num_hidden_layers = 12
            config.num_attention_heads = 3
        elif vit_name[9:].startswith("small"):
            config.hidden_size = 384
            config.intermediate_size = 1536
            config.num_hidden_layers = 12
            config.num_attention_heads = 6
        else:
            pass
    else:
        if vit_name[4:].startswith("small"):
            config.hidden_size = 768
            config.intermediate_size = 2304
            config.num_hidden_layers = 8
            config.num_attention_heads = 8
        elif vit_name[4:].startswith("base"):
            pass
        elif vit_name[4:].startswith("large"):
            config.hidden_size = 1024
            config.intermediate_size = 4096
            config.num_hidden_layers = 24
            config.num_attention_heads = 16
        elif vit_name[4:].startswith("huge"):
            config.hidden_size = 1280
            config.intermediate_size = 5120
            config.num_hidden_layers = 32
            config.num_attention_heads = 16

    # load original model from timm
    timm_model = timm.create_model(vit_name, pretrained=True)
    timm_model.eval()

    # load state_dict of original model, remove and rename some keys
    state_dict = timm_model.state_dict()
    if base_model:
        remove_classification_head_(state_dict)
    rename_keys = create_rename_keys(config, base_model)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    read_in_q_k_v(state_dict, config, base_model)

    # load HuggingFace model
    if vit_name[-5:] == "in21k":
        model = ViTModel(config).eval()
    else:
        model = ViTForImageClassification(config).eval()
    model.load_state_dict(state_dict)

    # Check outputs on an image, prepared by ViTFeatureExtractor/DeiTFeatureExtractor
    if "deit" in vit_name:
        feature_extractor = DeiTFeatureExtractor(size=config.image_size)
    else:
        feature_extractor = ViTFeatureExtractor(size=config.image_size)
    encoding = feature_extractor(images=prepare_img(), return_tensors="pt")
    pixel_values = encoding["pixel_values"]
    outputs = model(pixel_values)

    if base_model:
        timm_pooled_output = timm_model.forward_features(pixel_values)
        assert timm_pooled_output.shape == outputs.pooler_output.shape
        assert torch.allclose(timm_pooled_output,
                              outputs.pooler_output,
                              atol=1e-3)
    else:
        timm_logits = timm_model(pixel_values)
        assert timm_logits.shape == outputs.logits.shape
        assert torch.allclose(timm_logits, outputs.logits, atol=1e-3)

    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    print(f"Saving model {vit_name} to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
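In practice this conversion function is driven from the command line; a sketch of how it is typically wired up (the flag names mirror the function's parameters but may differ from the real script):

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--vit_name", default="vit_base_patch16_224", type=str,
                        help="Name of the timm ViT model to convert.")
    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str,
                        help="Path to the output PyTorch model directory.")
    args = parser.parse_args()
    convert_vit_checkpoint(args.vit_name, args.pytorch_dump_folder_path)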
Example #5
import os, sys
import torch
from transformers import ViTFeatureExtractor, ViTForImageClassification
from PIL import Image
import requests

BASE_PATH = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(BASE_PATH, '..'))
from lpdutils.lpimagedataset import LPImageDataSet

# to choose a different model by image size, patch size, and number of parameters, see the README
feature_extractor = ViTFeatureExtractor.from_pretrained(
    'google/vit-large-patch16-224',
    cache_dir=os.getenv("cache_dir", "../../models"))
model = ViTForImageClassification.from_pretrained(
    'google/vit-large-patch16-224',
    cache_dir=os.getenv("cache_dir", "../../models"))

# load local dataset
batch_size = 2
num_workers = 2
my_dataset = LPImageDataSet(os.path.join(
    os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'imagenet'),
                            transform=LPImageDataSet.transform)
imageloader = torch.utils.data.DataLoader(my_dataset,
                                          batch_size=batch_size,
                                          shuffle=True,
                                          num_workers=num_workers,
                                          drop_last=True,
                                          collate_fn=LPImageDataSet.collate_fn)
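A minimal inference loop over this loader might look as follows. This is a sketch only: it assumes LPImageDataSet.collate_fn yields (images, labels) batches of PIL images and integer labels, which may not match the real class.

model.eval()
with torch.no_grad():
    for images, labels in imageloader:
        # preprocess the PIL images into pixel_values tensors
        inputs = feature_extractor(images=list(images), return_tensors="pt")
        logits = model(**inputs).logits
        preds = logits.argmax(dim=-1)
        print(preds, labels)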
Example #6
    def create_and_check_for_image_classification(self, config, pixel_values,
                                                  labels):
        config.num_labels = self.type_sequence_label_size
        model = ViTForImageClassification(config)
        model.to(torch_device)
        model.eval()
        result = model(pixel_values, labels=labels)
        self.parent.assertEqual(
            result.logits.shape,
            (self.batch_size, self.type_sequence_label_size))

        # test greyscale images
        config.num_channels = 1
        model = ViTForImageClassification(config)
        model.to(torch_device)
        model.eval()

        pixel_values = floats_tensor(
            [self.batch_size, 1, self.image_size, self.image_size])
        result = model(pixel_values)
        self.parent.assertEqual(
            result.logits.shape,
            (self.batch_size, self.type_sequence_label_size))
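floats_tensor comes from the transformers test utilities and is not defined in this snippet; a minimal stand-in with the same intent (the exact signature is an assumption) could be:

def floats_tensor(shape):
    # random float32 tensor on the test device, values in [0, 1)
    return torch.rand(shape, device=torch_device)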
Example #7
def convert_vit_checkpoint(model_name,
                           pytorch_dump_folder_path,
                           base_model=True):
    """
    Copy/paste/tweak model's weights to our ViT structure.
    """

    # define default ViT configuration
    config = ViTConfig()
    # patch_size
    if model_name[-1] == "8":
        config.patch_size = 8
    # set labels if required
    if not base_model:
        config.num_labels = 1000
        repo_id = "datasets/huggingface/label-files"
        filename = "imagenet-1k-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
    # size of the architecture
    if model_name in ["dino_vits8", "dino_vits16"]:
        config.hidden_size = 384
        config.intermediate_size = 1536
        config.num_hidden_layers = 12
        config.num_attention_heads = 6

    # load original model from torch hub
    original_model = torch.hub.load("facebookresearch/dino:main", model_name)
    original_model.eval()

    # load state_dict of original model, remove and rename some keys
    state_dict = original_model.state_dict()
    if base_model:
        remove_classification_head_(state_dict)
    rename_keys = create_rename_keys(config, base_model=base_model)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    read_in_q_k_v(state_dict, config, base_model)

    # load HuggingFace model
    if base_model:
        model = ViTModel(config, add_pooling_layer=False).eval()
    else:
        model = ViTForImageClassification(config).eval()
    model.load_state_dict(state_dict)

    # Check outputs on an image, prepared by ViTFeatureExtractor
    feature_extractor = ViTFeatureExtractor()
    encoding = feature_extractor(images=prepare_img(), return_tensors="pt")
    pixel_values = encoding["pixel_values"]
    outputs = model(pixel_values)

    if base_model:
        final_hidden_state_cls_token = original_model(pixel_values)
        assert torch.allclose(final_hidden_state_cls_token,
                              outputs.last_hidden_state[:, 0, :],
                              atol=1e-1)
    else:
        logits = original_model(pixel_values)
        assert logits.shape == outputs.logits.shape
        assert torch.allclose(logits, outputs.logits, atol=1e-3)

    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
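As with the earlier timm conversion, this function is typically invoked from the command line; a sketch (the flag names follow the function's parameters but may differ from the real script):

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", default="dino_vitb16", type=str,
                        help="Name of the DINO model on torch hub to convert.")
    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str,
                        help="Path to the output PyTorch model directory.")
    parser.add_argument("--base_model", action="store_true",
                        help="Export only the backbone, without a classification head.")
    args = parser.parse_args()
    convert_vit_checkpoint(args.model_name, args.pytorch_dump_folder_path,
                           args.base_model)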
# the opening of this transform was cut off; plausible reconstruction (the train_transform name and the Resize step are assumed)
train_transform = transforms.Compose([
    transforms.Resize(input_image_size),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor()
])

noop_transform = transforms.Compose(
    [transforms.Resize(input_image_size),
     transforms.ToTensor()])

# ### Data loaders
#
# First we specify the pre-trained ViT model we are going to use. The model [`"google/vit-base-patch16-224"`](https://huggingface.co/google/vit-base-patch16-224) is pre-trained on ImageNet-21k (14 million images, 21,843 classes) at resolution 224x224, and fine-tuned on ImageNet 2012 (1 million images, 1,000 classes) at resolution 224x224.
#
# We'll use a pre-trained ViT feature extractor that matches the ViT model to preprocess the input images.

VITMODEL = 'google/vit-base-patch16-224'
model = ViTForImageClassification.from_pretrained(
    VITMODEL, num_labels=43, ignore_mismatched_sizes=True).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-5)

feature_extractor = ViTFeatureExtractor.from_pretrained(VITMODEL)

# Then we define a "collator" function. This is just a function passed to the `DataLoader` which will pre-process each batch of data. In our case we will pass the images through the `ViTFeatureExtractor`, which will process the images into the correct format for ViT.


class ImageClassificationCollator:
    def __init__(self, feature_extractor):
        self.feature_extractor = feature_extractor

    def __call__(self, batch):
        # batch is a list of (image, label) pairs from the dataset
        encodings = self.feature_extractor([item[0] for item in batch],
                                           return_tensors='pt')
        encodings['labels'] = torch.tensor([item[1] for item in batch],
                                           dtype=torch.int64)
        return encodings
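
# To tie things together, the collator is instantiated with the feature extractor
# and handed to the `DataLoader`. A brief sketch (the `train_dataset` name is a placeholder):

collator = ImageClassificationCollator(feature_extractor)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32,
                                           shuffle=True, collate_fn=collator)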