def test_inference(self): model_name = "openai/clip-vit-base-patch32" model = TFCLIPModel.from_pretrained(model_name) processor = CLIPProcessor.from_pretrained(model_name) image = prepare_img() inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="tf") outputs = model(**inputs, training=False) # verify the logits self.assertEqual( outputs.logits_per_image.shape, tf.TensorShape( (inputs.pixel_values.shape[0], inputs.input_ids.shape[0])), ) self.assertEqual( outputs.logits_per_text.shape, tf.TensorShape( (inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), ) expected_logits = tf.constant([[24.5701, 19.3049]]) tf.debugging.assert_near(outputs.logits_per_image, expected_logits, atol=1e-3)
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): model = TFCLIPModel(config) result = model(input_ids, pixel_values, attention_mask, training=False) self.parent.assertEqual( result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) ) self.parent.assertEqual( result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) )
def test_model_from_pretrained(self): for model_name in TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = TFCLIPModel.from_pretrained(model_name) self.assertIsNotNone(model)