Ejemplo n.º 1
0
    def prepare_config_and_inputs(self):
        pixel_values = floats_tensor([
            self.batch_size, self.num_channels, self.image_size,
            self.image_size
        ])

        labels = None
        if self.use_labels:
            labels = ids_tensor([self.batch_size],
                                self.type_sequence_label_size)

        config = DeiTConfig(
            image_size=self.image_size,
            patch_size=self.patch_size,
            num_channels=self.num_channels,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            is_decoder=False,
            initializer_range=self.initializer_range,
        )

        return config, pixel_values, labels
Ejemplo n.º 2
0
 def get_config(self):
     return DeiTConfig(
         image_size=self.image_size,
         patch_size=self.patch_size,
         num_channels=self.num_channels,
         hidden_size=self.hidden_size,
         num_hidden_layers=self.num_hidden_layers,
         num_attention_heads=self.num_attention_heads,
         intermediate_size=self.intermediate_size,
         hidden_act=self.hidden_act,
         hidden_dropout_prob=self.hidden_dropout_prob,
         attention_probs_dropout_prob=self.attention_probs_dropout_prob,
         is_decoder=False,
         initializer_range=self.initializer_range,
     )
Ejemplo n.º 3
0
def convert_deit_checkpoint(deit_name, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our DeiT structure.
    """

    # define default DeiT configuration
    config = DeiTConfig()
    # all deit models have fine-tuned heads
    base_model = False
    # dataset (fine-tuned on ImageNet 2012), patch_size and image_size
    config.num_labels = 1000
    repo_id = "datasets/huggingface/label-files"
    filename = "imagenet-1k-id2label.json"
    id2label = json.load(
        open(cached_download(hf_hub_url(repo_id, filename)), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}
    config.patch_size = int(deit_name[-6:-4])
    config.image_size = int(deit_name[-3:])
    # size of the architecture
    if deit_name[9:].startswith("tiny"):
        config.hidden_size = 192
        config.intermediate_size = 768
        config.num_hidden_layers = 12
        config.num_attention_heads = 3
    elif deit_name[9:].startswith("small"):
        config.hidden_size = 384
        config.intermediate_size = 1536
        config.num_hidden_layers = 12
        config.num_attention_heads = 6
    if deit_name[9:].startswith("base"):
        pass
    elif deit_name[4:].startswith("large"):
        config.hidden_size = 1024
        config.intermediate_size = 4096
        config.num_hidden_layers = 24
        config.num_attention_heads = 16

    # load original model from timm
    timm_model = timm.create_model(deit_name, pretrained=True)
    timm_model.eval()

    # load state_dict of original model, remove and rename some keys
    state_dict = timm_model.state_dict()
    rename_keys = create_rename_keys(config, base_model)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    read_in_q_k_v(state_dict, config, base_model)

    # load HuggingFace model
    model = DeiTForImageClassificationWithTeacher(config).eval()
    model.load_state_dict(state_dict)

    # Check outputs on an image, prepared by DeiTFeatureExtractor
    size = int(
        (256 / 224) * config.image_size
    )  # to maintain same ratio w.r.t. 224 images, see https://github.com/facebookresearch/deit/blob/ab5715372db8c6cad5740714b2216d55aeae052e/datasets.py#L103
    feature_extractor = DeiTFeatureExtractor(size=size,
                                             crop_size=config.image_size)
    encoding = feature_extractor(images=prepare_img(), return_tensors="pt")
    pixel_values = encoding["pixel_values"]
    outputs = model(pixel_values)

    timm_logits = timm_model(pixel_values)
    assert timm_logits.shape == outputs.logits.shape
    assert torch.allclose(timm_logits, outputs.logits, atol=1e-3)

    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    print(f"Saving model {deit_name} to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
    feature_extractor.save_pretrained(pytorch_dump_folder_path)