def test_inference(self):
    """Check the logits of the ``clip-italian/clip-italian`` checkpoint.

    Loads the model and its processor, encodes one fixture image together
    with two Italian captions, and verifies both the shapes of the
    image/text logits and the values of the image logits against
    reference numbers.
    """
    model = VisionTextDualEncoderModel.from_pretrained(
        "clip-italian/clip-italian", logit_scale_init_value=1)
    processor = VisionTextDualEncoderProcessor.from_pretrained(
        "clip-italian/clip-italian")

    # Open the fixture through a context manager so the underlying file
    # handle is released once the processor has produced its tensors.
    # NOTE(review): assumes `Image` is PIL's Image module — confirm.
    with Image.open(
            "./tests/fixtures/tests_samples/COCO/000000039769.png") as image:
        inputs = processor(
            text=["una foto di un gatto", "una foto di un cane"],
            images=image,
            padding=True,
            return_tensors="pt",
        )

    # Inference only: no autograd graph is needed for the comparisons
    # below.
    with torch.no_grad():
        outputs = model(**inputs)

    # verify the logits
    num_images = inputs.pixel_values.shape[0]
    num_texts = inputs.input_ids.shape[0]
    self.assertEqual(outputs.logits_per_image.shape, (num_images, num_texts))
    self.assertEqual(outputs.logits_per_text.shape, (num_texts, num_images))

    expected_logits = torch.tensor([[1.2284727, 0.3104122]])
    self.assertTrue(
        torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3))
def check_save_load(self, text_config, input_ids, attention_mask,
                    vision_config, pixel_values=None, **kwargs):
    """Verify that a save/load round trip leaves the model output unchanged.

    Builds a dual encoder from the vision/text models returned by
    ``self.get_vision_text_model``, records its first output, saves the
    model to a temporary directory, reloads it, and asserts that the
    first output of the reloaded model differs from the recorded one by
    at most ``1e-5`` in absolute value.
    """
    vision_model, text_model = self.get_vision_text_model(
        vision_config, text_config)
    model = VisionTextDualEncoderModel(
        vision_model=vision_model, text_model=text_model)
    model.to(torch_device)
    model.eval()

    # The same arguments feed both forward passes.
    forward_kwargs = {
        "input_ids": input_ids,
        "pixel_values": pixel_values,
        "attention_mask": attention_mask,
    }

    with torch.no_grad():
        before = model(**forward_kwargs)[0].cpu().numpy()

        with tempfile.TemporaryDirectory() as tmpdirname:
            model.save_pretrained(tmpdirname)
            reloaded = VisionTextDualEncoderModel.from_pretrained(tmpdirname)
            reloaded.eval()
            reloaded.to(torch_device)

            after = reloaded(**forward_kwargs)[0].cpu().numpy()

    largest_gap = np.abs(after - before).max()
    self.assertLessEqual(largest_gap, 1e-5)
def check_pt_flax_equivalence(self, pt_model, fx_model, input_ids,
                              attention_mask, pixel_values, **kwargs):
    """Check that a PyTorch model and a Flax model agree on their outputs.

    Three comparisons are made, each over the first four entries of the
    output tuples:

    1. ``pt_model`` against ``fx_model`` as given;
    2. ``pt_model`` against a Flax model loaded from ``pt_model``'s saved
       checkpoint (PT -> Flax);
    3. ``fx_model`` against a PyTorch model loaded from ``fx_model``'s
       saved checkpoint (Flax -> PT).

    Args:
        pt_model: PyTorch model; it is moved to ``torch_device`` and put
            in eval mode.
        fx_model: Flax model called with NumPy copies of the inputs.
        input_ids, attention_mask, pixel_values: input tensors
            (presumably ``torch.Tensor`` — they are converted with
            ``.cpu().numpy()``).
        **kwargs: accepted and ignored.
    """

    def assert_outputs_match(fx_outs, pt_outs):
        # Compare tuple lengths, then the first four entries one by one.
        self.assertEqual(len(fx_outs), len(pt_outs),
                         "Output lengths differ between Flax and PyTorch")
        for fx_out, pt_out in zip(fx_outs[:4], pt_outs[:4]):
            # `.cpu()` first: `Tensor.numpy()` only works on CPU tensors,
            # and the model may live on an accelerator.
            # NOTE(review): 4e-2 is presumably the tolerance expected by
            # `assert_almost_equals` — confirm against its definition.
            self.assert_almost_equals(fx_out, pt_out.cpu().numpy(), 4e-2)

    pt_model.to(torch_device)
    pt_model.eval()

    # prepare inputs
    inputs_dict = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "pixel_values": pixel_values,
    }
    # PyTorch inputs must sit on the same device as the model; the Flax
    # model receives host-side NumPy arrays.
    pt_inputs = {k: v.to(torch_device) for k, v in inputs_dict.items()}
    flax_inputs = {k: v.cpu().numpy() for k, v in inputs_dict.items()}

    with torch.no_grad():
        pt_outputs = pt_model(**pt_inputs).to_tuple()

    fx_outputs = fx_model(**flax_inputs).to_tuple()
    assert_outputs_match(fx_outputs, pt_outputs)

    # PT -> Flax
    with tempfile.TemporaryDirectory() as tmpdirname:
        pt_model.save_pretrained(tmpdirname)
        fx_model_loaded = FlaxVisionTextDualEncoderModel.from_pretrained(
            tmpdirname, from_pt=True)

    fx_outputs_loaded = fx_model_loaded(**flax_inputs).to_tuple()
    assert_outputs_match(fx_outputs_loaded, pt_outputs)

    # Flax -> PT
    with tempfile.TemporaryDirectory() as tmpdirname:
        fx_model.save_pretrained(tmpdirname)
        pt_model_loaded = VisionTextDualEncoderModel.from_pretrained(
            tmpdirname, from_flax=True)

    pt_model_loaded.to(torch_device)
    pt_model_loaded.eval()

    with torch.no_grad():
        pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()

    assert_outputs_match(fx_outputs, pt_outputs_loaded)
def test_real_model_save_load_from_pretrained(self):
    """Round-trip a pretrained model through ``save_pretrained``.

    Obtains a model and its inputs from
    ``self.get_pretrained_model_and_inputs``, records the first output,
    saves the model to a temporary directory, reloads it, and asserts
    that the first output of the reloaded model stays within ``1e-5``
    (absolute) of the recorded one.
    """
    source_model, inputs = self.get_pretrained_model_and_inputs()
    source_model.to(torch_device)

    with torch.no_grad():
        reference = source_model(**inputs)[0].cpu().numpy()

        with tempfile.TemporaryDirectory() as tmp_dirname:
            source_model.save_pretrained(tmp_dirname)
            restored_model = VisionTextDualEncoderModel.from_pretrained(
                tmp_dirname)
            restored_model.to(torch_device)

            restored = restored_model(**inputs)[0].cpu().numpy()

    largest_gap = np.abs(restored - reference).max()
    self.assertLessEqual(largest_gap, 1e-5)