import torch

from clip import load  # checkpoint loader from the original OpenAI `clip` package
from transformers import CLIPConfig, CLIPModel


def convert_clip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    if config_path is not None:
        config = CLIPConfig.from_pretrained(config_path)
    else:
        config = CLIPConfig(projection_dim=512, text_config={}, vision_config={})

    hf_model = CLIPModel(config).eval()

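    # Load the original OpenAI CLIP checkpoint on CPU (`jit=False` gives the plain nn.Module;
    # the second return value is the image preprocessing transform, which is not needed here).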
    pt_model, _ = load(checkpoint_path, device="cpu", jit=False)
    pt_model = pt_model.eval()

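    # Port the text tower, vision tower and both projection heads using helper functions
    # assumed to be defined elsewhere in this script, then carry over the logit scale.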
    copy_text_model_and_projection(hf_model, pt_model)
    copy_vison_model_and_projection(hf_model, pt_model)
    hf_model.logit_scale = pt_model.logit_scale

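    # Dummy inputs (77 text token ids and one 3x224x224 image) to sanity-check the conversion.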
    input_ids = torch.arange(0, 77).unsqueeze(0)
    pixel_values = torch.randn(1, 3, 224, 224)

    # Access the logits by name: positional slicing of the output is fragile because the
    # optional `loss` entry shifts the indices when it is present.
    hf_outputs = hf_model(input_ids=input_ids, pixel_values=pixel_values, return_dict=True)
    hf_logits_per_image = hf_outputs.logits_per_image
    hf_logits_per_text = hf_outputs.logits_per_text
    pt_logits_per_image, pt_logits_per_text = pt_model(pixel_values, input_ids)

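    # The converted model must reproduce the original logits within a small tolerance.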
    assert torch.allclose(hf_logits_per_image, pt_logits_per_image, atol=1e-3)
    assert torch.allclose(hf_logits_per_text, pt_logits_per_text, atol=1e-3)

    hf_model.save_pretrained(pytorch_dump_folder_path)
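

if __name__ == "__main__":
    # Minimal CLI wrapper sketch for the conversion above; the flag names and help strings
    # are illustrative assumptions, not taken from this excerpt.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint_path", type=str, required=True, help="Path to the original CLIP checkpoint.")
    parser.add_argument("--pytorch_dump_folder_path", type=str, required=True, help="Where to save the converted model.")
    parser.add_argument("--config_path", type=str, default=None, help="Optional CLIPConfig to use instead of the default.")
    args = parser.parse_args()

    convert_clip_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path)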


    def prepare_config_and_inputs(self):
        text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
        vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()

        config = CLIPConfig.from_text_vision_configs(text_config,
                                                     vision_config,
                                                     projection_dim=64)

        return config, input_ids, attention_mask, pixel_values

    def get_config(self):
        return CLIPConfig.from_text_vision_configs(
            self.text_model_tester.get_config(),
            self.vision_model_tester.get_config(),
            projection_dim=64,
        )
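
    def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
        # Illustrative sketch only: this method name and the tester attributes it relies on
        # (self.parent, the per-tester batch_size) are assumptions, not taken from this excerpt.
        model = CLIPModel(config).eval()
        with torch.no_grad():
            result = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask)
        # logits_per_image has shape (number of images, number of texts).
        self.parent.assertEqual(
            result.logits_per_image.shape,
            (self.vision_model_tester.batch_size, self.text_model_tester.batch_size),
        )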