def test_inference(self):
    model_name = "openai/clip-vit-base-patch32"
    model = TFCLIPModel.from_pretrained(model_name)
    processor = CLIPProcessor.from_pretrained(model_name)

    # Encode one image against two candidate captions.
    image = prepare_img()
    inputs = processor(
        text=["a photo of a cat", "a photo of a dog"],
        images=image,
        padding=True,
        return_tensors="tf",
    )

    outputs = model(**inputs, training=False)

    # verify the logits: logits_per_image is (num_images, num_texts),
    # and logits_per_text is its transpose, (num_texts, num_images)
    self.assertEqual(
        outputs.logits_per_image.shape,
        tf.TensorShape((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
    )
    self.assertEqual(
        outputs.logits_per_text.shape,
        tf.TensorShape((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
    )

    expected_logits = tf.constant([[24.5701, 19.3049]])

    tf.debugging.assert_near(outputs.logits_per_image, expected_logits, atol=1e-3)
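
The snippet above depends on names defined elsewhere in the test file. A minimal, self-contained sketch of those pieces, assuming the two-cats COCO validation image that the transformers test suite conventionally uses for prepare_img() (the URL and image choice are assumptions, not part of the snippet):

import requests
import tensorflow as tf
from PIL import Image

from transformers import CLIPProcessor, TFCLIPModel


def prepare_img():
    # Assumed helper: download a sample image and return it as a PIL image.
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    return Image.open(requests.get(url, stream=True).raw)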
Example #2
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
    model = TFCLIPModel(config)
    result = model(input_ids, pixel_values, attention_mask, training=False)
    # logits_per_image is (image_batch_size, text_batch_size);
    # logits_per_text is its transpose.
    self.parent.assertEqual(
        result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size)
    )
    self.parent.assertEqual(
        result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size)
    )

def test_model_from_pretrained(self):
    for model_name in TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        model = TFCLIPModel.from_pretrained(model_name)
        self.assertIsNotNone(model)
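
As a usage note for readers adapting these tests to inference code: the verified logits are typically converted to per-image caption probabilities with a softmax over the text axis. A minimal sketch, reusing the expected logits from Example #1 (the softmax step is standard CLIP usage, not part of the tests above):

import tensorflow as tf

# logits_per_image has shape (num_images, num_texts); softmax over the
# text axis gives each image a probability per candidate caption.
logits_per_image = tf.constant([[24.5701, 19.3049]])  # expected logits from Example #1
probs = tf.nn.softmax(logits_per_image, axis=-1)
# probs is roughly [[0.995, 0.005]]: the image matches "a photo of a cat".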