def test_save_load_pretrained_default(self): tokenizer_slow = self.get_tokenizer() tokenizer_fast = self.get_rust_tokenizer() feature_extractor = self.get_feature_extractor() processor_slow = OwlViTProcessor(tokenizer=tokenizer_slow, feature_extractor=feature_extractor) processor_slow.save_pretrained(self.tmpdirname) processor_slow = OwlViTProcessor.from_pretrained(self.tmpdirname, use_fast=False) processor_fast = OwlViTProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor) processor_fast.save_pretrained(self.tmpdirname) processor_fast = OwlViTProcessor.from_pretrained(self.tmpdirname) self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab()) self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer) self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast) self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string()) self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string()) self.assertIsInstance(processor_slow.feature_extractor, OwlViTFeatureExtractor) self.assertIsInstance(processor_fast.feature_extractor, OwlViTFeatureExtractor)
def test_tokenizer_decode(self): feature_extractor = self.get_feature_extractor() tokenizer = self.get_tokenizer() processor = OwlViTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] decoded_processor = processor.batch_decode(predicted_ids) decoded_tok = tokenizer.batch_decode(predicted_ids) self.assertListEqual(decoded_tok, decoded_processor)
def test_inference_object_detection(self): model_name = "google/owlvit-base-patch32" model = OwlViTForObjectDetection.from_pretrained(model_name).to( torch_device) processor = OwlViTProcessor.from_pretrained(model_name) image = prepare_img() inputs = processor( text=[["a photo of a cat", "a photo of a dog"]], images=image, max_length=16, padding="max_length", return_tensors="pt", ).to(torch_device) with torch.no_grad(): outputs = model(**inputs) num_queries = int((model.config.vision_config.image_size / model.config.vision_config.patch_size)**2) self.assertEqual(outputs.pred_boxes.shape, torch.Size((1, num_queries, 4))) expected_slice_boxes = torch.tensor([[0.0948, 0.0471, 0.1915], [0.3194, 0.0583, 0.6498], [0.1441, 0.0452, 0.2197]]).to(torch_device) self.assertTrue( torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
def test_inference(self): model_name = "google/owlvit-base-patch32" model = OwlViTModel.from_pretrained(model_name).to(torch_device) processor = OwlViTProcessor.from_pretrained(model_name) image = prepare_img() inputs = processor( text=[["a photo of a cat", "a photo of a dog"]], images=image, max_length=16, padding="max_length", return_tensors="pt", ).to(torch_device) # forward pass with torch.no_grad(): outputs = model(**inputs) # verify the logits self.assertEqual( outputs.logits_per_image.shape, torch.Size( (inputs.pixel_values.shape[0], inputs.input_ids.shape[0])), ) self.assertEqual( outputs.logits_per_text.shape, torch.Size( (inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), ) expected_logits = torch.tensor([[4.4420, 0.6181]], device=torch_device) self.assertTrue( torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3))
def convert_owlvit_checkpoint(pt_backbone, flax_params, attn_params, pytorch_dump_folder_path, config_path=None): """ Copy/paste/tweak model's weights to transformers design. """ repo = Repository(pytorch_dump_folder_path, clone_from=f"google/{pytorch_dump_folder_path}") repo.git_pull() if config_path is not None: config = OwlViTConfig.from_pretrained(config_path) else: config = OwlViTConfig() hf_backbone = OwlViTModel(config).eval() hf_model = OwlViTForObjectDetection(config).eval() copy_text_model_and_projection(hf_backbone, pt_backbone) copy_vision_model_and_projection(hf_backbone, pt_backbone) hf_backbone.logit_scale = pt_backbone.logit_scale copy_flax_attn_params(hf_backbone, attn_params) hf_model.owlvit = hf_backbone copy_class_merge_token(hf_model, flax_params) copy_class_box_heads(hf_model, flax_params) # Save HF model hf_model.save_pretrained(repo.local_dir) # Initialize feature extractor feature_extractor = OwlViTFeatureExtractor( size=config.vision_config.image_size, crop_size=config.vision_config.image_size ) # Initialize tokenizer tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32", pad_token="!", model_max_length=16) # Initialize processor processor = OwlViTProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer) feature_extractor.save_pretrained(repo.local_dir) processor.save_pretrained(repo.local_dir) repo.git_add() repo.git_commit("Upload model and processor") repo.git_push()
def test_processor_with_text_list(self): model_name = "google/owlvit-base-patch32" processor = OwlViTProcessor.from_pretrained(model_name) input_text = ["cat", "nasa badge"] inputs = processor(text=input_text) seq_length = 16 self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask"]) self.assertEqual(inputs["input_ids"].shape, (2, seq_length)) # test if it raises when no input is passed with pytest.raises(ValueError): processor()
def test_save_load_pretrained_additional_features(self): processor = OwlViTProcessor( tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()) processor.save_pretrained(self.tmpdirname) tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") feature_extractor_add_kwargs = self.get_feature_extractor( do_normalize=False) processor = OwlViTProcessor.from_pretrained(self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast) self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) self.assertIsInstance(processor.feature_extractor, OwlViTFeatureExtractor)
def test_tokenizer(self): feature_extractor = self.get_feature_extractor() tokenizer = self.get_tokenizer() processor = OwlViTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) input_str = "lower newer" encoded_processor = processor(text=input_str, return_tensors="np") encoded_tok = tokenizer(input_str, return_tensors="np") for key in encoded_tok.keys(): self.assertListEqual(encoded_tok[key][0].tolist(), encoded_processor[key][0].tolist())
def test_feature_extractor(self): feature_extractor = self.get_feature_extractor() tokenizer = self.get_tokenizer() processor = OwlViTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) image_input = self.prepare_image_inputs() input_feat_extract = feature_extractor(image_input, return_tensors="np") input_processor = processor(images=image_input, return_tensors="np") for key in input_feat_extract.keys(): self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
def test_processor(self): feature_extractor = self.get_feature_extractor() tokenizer = self.get_tokenizer() processor = OwlViTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) input_str = "lower newer" image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input) self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values"]) # test if it raises when no input is passed with pytest.raises(ValueError): processor()
def test_processor_case(self): model_name = "google/owlvit-base-patch32" processor = OwlViTProcessor.from_pretrained(model_name) input_texts = ["cat", "nasa badge"] inputs = processor(text=input_texts) seq_length = 16 input_ids = inputs["input_ids"] predicted_ids = [ [49406, 2368, 49407, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [49406, 6841, 11301, 49407, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ] self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask"]) self.assertEqual(inputs["input_ids"].shape, (2, seq_length)) self.assertListEqual(list(input_ids[0]), predicted_ids[0]) self.assertListEqual(list(input_ids[1]), predicted_ids[1])
def test_processor_with_nested_text_list(self): model_name = "google/owlvit-base-patch32" processor = OwlViTProcessor.from_pretrained(model_name) input_texts = [["cat", "nasa badge"], ["person"]] inputs = processor(text=input_texts) seq_length = 16 batch_size = len(input_texts) num_max_text_queries = max([len(texts) for texts in input_texts]) self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask"]) self.assertEqual(inputs["input_ids"].shape, (batch_size * num_max_text_queries, seq_length)) # test if it raises when no input is passed with pytest.raises(ValueError): processor()