def get_pretrained_model_and_inputs(self):
    model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
        "hf-internal-testing/tiny-random-deit", "hf-internal-testing/tiny-random-roberta"
    )
    batch_size = 13
    pixel_values = floats_tensor(
        [
            batch_size,
            model.encoder.config.num_channels,
            model.encoder.config.image_size,
            model.encoder.config.image_size,
        ]
    )
    # for DeiT, the sequence length is equal to the number of patches + 2 (for the [CLS] and distillation tokens)
    decoder_input_ids = ids_tensor([batch_size, 4], model.decoder.config.vocab_size)
    decoder_attention_mask = random_attention_mask([batch_size, 4])
    inputs = {
        "pixel_values": pixel_values,
        "decoder_input_ids": decoder_input_ids,
        "decoder_attention_mask": decoder_attention_mask,
    }

    return model, inputs
def check_equivalence_tf_to_pt(self, config, decoder_config, inputs_dict):
    encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)

    # Using `_tf_model`, the test will fail, because the weights of `_tf_model` get extended before saving
    # the encoder/decoder models.
    # There was a (very) ugly potential fix, which wasn't integrated to `transformers`: see
    # https://github.com/huggingface/transformers/pull/13222/commits/dbb3c9de76eee235791d2064094654637c99f36d#r697304245
    # (the change in `src/transformers/modeling_tf_utils.py`)
    _tf_model = TFVisionEncoderDecoderModel(encoder_decoder_config)
    # Make sure model is built
    _tf_model(**inputs_dict)

    # Using `tf_model` to pass the test.
    encoder = _tf_model.encoder.__class__(encoder_decoder_config.encoder)
    decoder = _tf_model.decoder.__class__(encoder_decoder_config.decoder)
    # Make sure models are built
    encoder(encoder.dummy_inputs)
    decoder(decoder.dummy_inputs)
    tf_model = TFVisionEncoderDecoderModel(encoder=encoder, decoder=decoder)

    with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname:
        tf_model.encoder.save_pretrained(encoder_tmp_dirname)
        tf_model.decoder.save_pretrained(decoder_tmp_dirname)
        pt_model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
            encoder_tmp_dirname, decoder_tmp_dirname, encoder_from_tf=True, decoder_from_tf=True
        )
        # This is only for copying some specific attributes of this particular model.
        pt_model.config = tf_model.config

    self.check_pt_tf_equivalence(pt_model, tf_model, inputs_dict)
def test_inference_handwritten(self):
    model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").to(torch_device)

    dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test")
    image = Image.open(dataset[0]["file"]).convert("RGB")

    processor = self.default_processor
    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device)

    # forward pass
    decoder_input_ids = torch.tensor([[model.config.decoder.decoder_start_token_id]]).to(torch_device)
    outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids)
    logits = outputs.logits

    # verify the logits
    expected_shape = torch.Size((1, 1, model.decoder.config.vocab_size))
    self.assertEqual(outputs.logits.shape, expected_shape)

    expected_slice = torch.tensor(
        [-1.4502, -4.6683, -0.5347, -2.9291, 9.1435, -3.0571, 8.9764, 1.7560, 8.7358, -1.5311]
    ).to(torch_device)

    self.assertTrue(torch.allclose(logits[0, 0, :10], expected_slice, atol=1e-4))
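# The test above only checks a single forward pass. Below is a minimal, self-contained sketch
# (not part of the test suite) of end-to-end handwritten text recognition with the same public
# checkpoint, assuming the standard TrOCRProcessor + generate() workflow; the helper name
# `transcribe_handwritten_image` is made up for illustration.
def transcribe_handwritten_image(image):
    from transformers import TrOCRProcessor, VisionEncoderDecoderModel

    processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
    model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
    # preprocess the PIL image, autoregressively decode, and detokenize the prediction
    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]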
def check_encoder_decoder_model_from_pretrained_configs(
    self, config, decoder_config, decoder_input_ids, decoder_attention_mask, pixel_values=None, **kwargs
):
    encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)
    self.assertTrue(encoder_decoder_config.decoder.is_decoder)

    enc_dec_model = VisionEncoderDecoderModel(encoder_decoder_config)
    enc_dec_model.to(torch_device)
    enc_dec_model.eval()

    self.assertTrue(enc_dec_model.config.is_encoder_decoder)

    outputs_encoder_decoder = enc_dec_model(
        pixel_values=pixel_values,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
    )

    self.assertEqual(
        outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
    )
def test_encoder_decoder_save_load_from_encoder_decoder_from_pt(self):
    config = self.get_encoder_decoder_config_small()

    # create two random ViT/GPT2 models for vit-gpt2 & initialize weights (+cross_attention weights)
    encoder_pt = ViTModel(config.encoder).to(torch_device).eval()
    decoder_pt = GPT2LMHeadModel(config.decoder).to(torch_device).eval()

    encoder_decoder_pt = VisionEncoderDecoderModel(encoder=encoder_pt, decoder=decoder_pt).to(torch_device).eval()

    pixel_values = floats_tensor(
        [
            13,
            encoder_pt.config.num_channels,
            encoder_pt.config.image_size,
            encoder_pt.config.image_size,
        ]
    )
    decoder_input_ids = ids_tensor([13, 1], decoder_pt.config.vocab_size)

    pt_pixel_values = torch.tensor(pixel_values.numpy(), device=torch_device, dtype=torch.float)
    pt_decoder_input_ids = torch.tensor(decoder_input_ids.numpy(), device=torch_device, dtype=torch.long)

    logits_pt = encoder_decoder_pt(pixel_values=pt_pixel_values, decoder_input_ids=pt_decoder_input_ids).logits

    # PyTorch => TensorFlow
    with tempfile.TemporaryDirectory() as tmp_dirname_1, tempfile.TemporaryDirectory() as tmp_dirname_2:
        encoder_decoder_pt.encoder.save_pretrained(tmp_dirname_1)
        encoder_decoder_pt.decoder.save_pretrained(tmp_dirname_2)
        encoder_decoder_tf = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
            tmp_dirname_1, tmp_dirname_2, encoder_from_pt=True, decoder_from_pt=True
        )

    logits_tf = encoder_decoder_tf(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits

    max_diff = np.max(np.abs(logits_pt.detach().cpu().numpy() - logits_tf.numpy()))
    self.assertAlmostEqual(max_diff, 0.0, places=3)

    # Make sure `from_pretrained` following `save_pretrained` works and gives the same result
    # (see https://github.com/huggingface/transformers/pull/14016)
    with tempfile.TemporaryDirectory() as tmp_dirname:
        encoder_decoder_tf.save_pretrained(tmp_dirname)
        encoder_decoder_tf = TFVisionEncoderDecoderModel.from_pretrained(tmp_dirname)

        logits_tf_2 = encoder_decoder_tf(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits

        max_diff = np.max(np.abs(logits_tf_2.numpy() - logits_tf.numpy()))
        self.assertAlmostEqual(max_diff, 0.0, places=3)
def test_inference_coco_en(self):
    loc = "ydshieh/vit-gpt2-coco-en"
    feature_extractor = ViTFeatureExtractor.from_pretrained(loc)
    tokenizer = AutoTokenizer.from_pretrained(loc)
    model = VisionEncoderDecoderModel.from_pretrained(loc)
    model.to(torch_device)
    model.eval()

    # We will verify our results on an image of cute cats
    img = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    pixel_values = feature_extractor(images=img, return_tensors="pt").pixel_values.to(torch_device)

    decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]]).to(torch_device)

    with torch.no_grad():
        logits = model(pixel_values, decoder_input_ids)[0].detach().cpu().numpy()

    # verify the logits
    expected_shape = (1, 1, model.config.decoder.vocab_size)
    self.assertEqual(logits.shape, expected_shape)

    EXPECTED_LOGIT_SLICE = np.array(
        [
            -38.705807,
            -30.639929,
            -31.41903,
            -39.012012,
            -38.38696,
            -34.887207,
            -33.290855,
            -35.68447,
            -38.508484,
            -36.124645,
        ]
    )
    max_diff = np.amax(np.abs(logits[0, 0, :10] - EXPECTED_LOGIT_SLICE))
    self.assertLessEqual(max_diff, 1e-4)

    def generate_step(pixel_values):
        outputs = model.generate(
            pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True, output_scores=True
        )
        output_ids = outputs.sequences
        preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        preds = [pred.strip() for pred in preds]

        return preds, outputs.sequences_scores.detach().cpu().numpy()

    preds, scores = generate_step(pixel_values)

    EXPECTED_SCORES = np.array([-0.59562886])
    max_diff = np.amax(np.abs(scores - EXPECTED_SCORES))
    self.assertLessEqual(max_diff, 1e-4)

    # should produce
    # ["a cat laying on top of a couch next to another cat"]
    self.assertEqual(preds, ["a cat laying on top of a couch next to another cat"])
def check_save_and_load(
    self, config, decoder_config, decoder_input_ids, decoder_attention_mask, pixel_values=None, **kwargs
):
    encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
    enc_dec_model = VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
    enc_dec_model.to(torch_device)
    enc_dec_model.eval()
    with torch.no_grad():
        outputs = enc_dec_model(
            pixel_values=pixel_values,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
        )
        out_2 = outputs[0].cpu().numpy()
        out_2[np.isnan(out_2)] = 0

        with tempfile.TemporaryDirectory() as tmpdirname:
            enc_dec_model.save_pretrained(tmpdirname)
            enc_dec_model = VisionEncoderDecoderModel.from_pretrained(tmpdirname)
            enc_dec_model.to(torch_device)

            after_outputs = enc_dec_model(
                pixel_values=pixel_values,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
            )
            out_1 = after_outputs[0].cpu().numpy()
            out_1[np.isnan(out_1)] = 0
            max_diff = np.amax(np.abs(out_1 - out_2))
            self.assertLessEqual(max_diff, 1e-5)
def test_inference_cordv2(self):
    processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
    model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2").to(
        torch_device
    )

    dataset = load_dataset("hf-internal-testing/example-documents", split="test")
    image = dataset[2]["image"]

    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device)
    decoder_input_ids = processor.tokenizer(
        "<s_cord-v2>", add_special_tokens=False, return_tensors="pt"
    ).input_ids.to(torch_device)

    # step 1: single forward pass
    with torch.no_grad():
        outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids)
        logits = outputs.logits

    # verify the logits
    expected_shape = torch.Size((1, 1, model.decoder.config.vocab_size))
    self.assertEqual(outputs.logits.shape, expected_shape)

    expected_slice = torch.tensor([-27.4344, -3.2686, -19.3524], device=torch_device)
    self.assertTrue(torch.allclose(logits[0, 0, :3], expected_slice, atol=1e-4))

    # step 2: generation
    task_prompt = "<s_cord-v2>"
    decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
    decoder_input_ids = decoder_input_ids.to(torch_device)

    outputs = model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        max_length=model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        output_scores=True,
        return_dict_in_generate=True,
    )
    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token

    # verify generated sequence
    # fmt: off
    expected_sequence = "<s_menu><s_nm> CINNAMON SUGAR</s_nm><s_unitprice> 17,000</s_unitprice><s_cnt> 1 x</s_cnt><s_price> 17,000</s_price></s_menu><s_sub_total><s_subtotal_price> 17,000</s_subtotal_price></s_sub_total><s_total><s_total_price> 17,000</s_total_price><s_cashprice> 20,000</s_cashprice><s_changeprice> 3,000</s_changeprice></s_total>"  # noqa: E231
    # fmt: on
    self.assertEqual(sequence, expected_sequence)

    # verify scores
    self.assertEqual(len(outputs.scores), 43)
    self.assertTrue(
        torch.allclose(
            outputs.scores[0][0, :3], torch.tensor([-27.4344, -3.2686, -19.3524], device=torch_device), atol=1e-4
        )
    )
def test_inference_docvqa(self):
    processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
    model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa").to(
        torch_device
    )

    dataset = load_dataset("hf-internal-testing/example-documents", split="test")
    image = dataset[0]["image"]

    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device)
    decoder_input_ids = processor.tokenizer(
        "<s_docvqa>", add_special_tokens=False, return_tensors="pt"
    ).input_ids.to(torch_device)

    # step 1: single forward pass
    with torch.no_grad():
        outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids)
        logits = outputs.logits

    # verify the logits
    expected_shape = torch.Size([1, 1, 57532])
    self.assertEqual(outputs.logits.shape, expected_shape)

    expected_slice = torch.tensor([24.2731, -6.4522, 32.4130]).to(torch_device)
    self.assertTrue(torch.allclose(logits[0, 0, :3], expected_slice, atol=1e-4))

    # step 2: generation
    task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
    question = "When is the coffee break?"
    prompt = task_prompt.replace("{user_input}", question)
    decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
    decoder_input_ids = decoder_input_ids.to(torch_device)

    outputs = model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        max_length=model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        output_scores=True,
        return_dict_in_generate=True,
    )
    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token

    # verify generated sequence
    self.assertEqual(
        sequence, "<s_question> When is the coffee break?</s_question><s_answer> 11-14 to 11:39 a.m.</s_answer>"
    )

    # verify scores
    self.assertEqual(len(outputs.scores), 11)
    self.assertTrue(
        torch.allclose(
            outputs.scores[0][0, :3], torch.tensor([5.3153, -3.5276, 13.4781], device=torch_device), atol=1e-4
        )
    )
import os
import pickle
from glob import glob

# `Craft` is assumed to come from the craft-text-detector package, which provides this interface
from craft_text_detector import Craft
from PIL import Image
from tqdm import tqdm
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

output_dir = "outputs/"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

# CRAFT text detector, used to localize and crop text regions before recognition
craft = Craft(output_dir=output_dir, crop_type="poly", cuda=False)

# TrOCR processor and model for handwritten text recognition
trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")


def predictor(x, batch_size=1):
    print(f"{len(x)} input Images received.")
    results = []
    for _ in x:
        try:
            # detect text regions and collect the crops written by CRAFT, sorted by crop index
            craft.detect_text(_)
            crops = sorted(
                glob(f"outputs/{os.path.splitext(os.path.basename(_))[0]}_crops/*png"),
                key=lambda x: int(x.split("crop_")[1].split(".png")[0]),
            )
            regions = [
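# The snippet above is cut off mid-expression (`regions = [`). Below is a minimal sketch of the
# recognition step that would typically follow -- running TrOCR over the crops that CRAFT writes
# to disk. The helper name `ocr_crops` and the per-crop loop are assumptions, not part of the
# original script.
def ocr_crops(crop_paths):
    texts = []
    for path in crop_paths:
        image = Image.open(path).convert("RGB")
        pixel_values = trocr_processor(images=image, return_tensors="pt").pixel_values
        generated_ids = trocr_model.generate(pixel_values)
        texts.append(trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
    return texts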
def convert_tr_ocr_checkpoint(checkpoint_url, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our VisionEncoderDecoderModel structure.
    """
    # define encoder and decoder configs based on checkpoint_url
    encoder_config = ViTConfig(image_size=384, qkv_bias=False)
    decoder_config = TrOCRConfig()

    # size of the architecture
    if "base" in checkpoint_url:
        decoder_config.encoder_hidden_size = 768
    elif "large" in checkpoint_url:
        # use ViT-large encoder
        encoder_config.hidden_size = 1024
        encoder_config.intermediate_size = 4096
        encoder_config.num_hidden_layers = 24
        encoder_config.num_attention_heads = 16
        decoder_config.encoder_hidden_size = 1024
    else:
        raise ValueError("Should either find 'base' or 'large' in checkpoint URL")

    # the large-printed + stage1 checkpoints use sinusoidal position embeddings, no layernorm afterwards
    if "large-printed" in checkpoint_url or "stage1" in checkpoint_url:
        decoder_config.tie_word_embeddings = False
        decoder_config.activation_function = "relu"
        decoder_config.max_position_embeddings = 1024
        decoder_config.scale_embedding = True
        decoder_config.use_learned_position_embeddings = False
        decoder_config.layernorm_embedding = False

    # load HuggingFace model
    encoder = ViTModel(encoder_config, add_pooling_layer=False)
    decoder = TrOCRForCausalLM(decoder_config)
    model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)
    model.eval()

    # load state_dict of original model, rename some keys
    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", check_hash=True)["model"]

    rename_keys = create_rename_keys(encoder_config, decoder_config)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    read_in_q_k_v(state_dict, encoder_config)

    # remove parameters we don't need
    del state_dict["encoder.deit.head.weight"]
    del state_dict["encoder.deit.head.bias"]
    del state_dict["decoder.version"]

    # add prefix to decoder keys
    for key, val in state_dict.copy().items():
        val = state_dict.pop(key)
        if key.startswith("decoder") and "output_projection" not in key:
            state_dict["decoder.model." + key] = val
        else:
            state_dict[key] = val

    # load state dict
    model.load_state_dict(state_dict)

    # Check outputs on an image
    feature_extractor = ViTFeatureExtractor(size=encoder_config.image_size)
    tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
    processor = TrOCRProcessor(feature_extractor, tokenizer)

    pixel_values = processor(images=prepare_img(checkpoint_url), return_tensors="pt").pixel_values

    # verify logits
    decoder_input_ids = torch.tensor([[model.config.decoder.decoder_start_token_id]])
    outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids)
    logits = outputs.logits

    expected_shape = torch.Size([1, 1, 50265])
    if "trocr-base-handwritten" in checkpoint_url:
        expected_slice = torch.tensor(
            [-1.4502, -4.6683, -0.5347, -2.9291, 9.1435, -3.0571, 8.9764, 1.7560, 8.7358, -1.5311]
        )
    elif "trocr-large-handwritten" in checkpoint_url:
        expected_slice = torch.tensor(
            [-2.6437, -1.3129, -2.2596, -5.3455, 6.3539, 1.7604, 5.4991, 1.4702, 5.6113, 2.0170]
        )
    elif "trocr-base-printed" in checkpoint_url:
        expected_slice = torch.tensor(
            [-5.6816, -5.8388, 1.1398, -6.9034, 6.8505, -2.4393, 1.2284, -1.0232, -1.9661, -3.9210]
        )
    elif "trocr-large-printed" in checkpoint_url:
        expected_slice = torch.tensor(
            [-6.0162, -7.0959, 4.4155, -5.1063, 7.0468, -3.1631, 2.6466, -0.3081, -0.8106, -1.7535]
        )

    if "stage1" not in checkpoint_url:
        assert logits.shape == expected_shape, "Shape of logits not as expected"
        assert torch.allclose(logits[0, 0, :10], expected_slice, atol=1e-3), "First elements of logits not as expected"

    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    print(f"Saving model to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Saving processor to {pytorch_dump_folder_path}")
    processor.save_pretrained(pytorch_dump_folder_path)
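# A minimal command-line entry point for the conversion function above. This is a sketch rather
# than the original script's argparse block; both arguments are made required here so that no
# default checkpoint URL has to be assumed.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--checkpoint_url", required=True, type=str, help="URL of the original fairseq TrOCR checkpoint to convert."
    )
    parser.add_argument(
        "--pytorch_dump_folder_path", required=True, type=str, help="Path to the output PyTorch model directory."
    )
    args = parser.parse_args()

    convert_tr_ocr_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path)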
def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False):
    # load original model
    original_model = DonutModel.from_pretrained(model_name).eval()

    # load HuggingFace model
    encoder_config, decoder_config = get_configs(original_model)
    encoder = DonutSwinModel(encoder_config)
    decoder = MBartForCausalLM(decoder_config)
    model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)
    model.eval()

    state_dict = original_model.state_dict()
    new_state_dict = convert_state_dict(state_dict, model)
    model.load_state_dict(new_state_dict)

    # verify results on scanned document
    dataset = load_dataset("hf-internal-testing/example-documents")
    image = dataset["test"][0]["image"].convert("RGB")

    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name, from_slow=True)
    feature_extractor = DonutFeatureExtractor(
        do_align_long_axis=original_model.config.align_long_axis, size=original_model.config.input_size[::-1]
    )
    processor = DonutProcessor(feature_extractor, tokenizer)
    pixel_values = processor(image, return_tensors="pt").pixel_values

    if model_name == "naver-clova-ix/donut-base-finetuned-docvqa":
        task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
        question = "When is the coffee break?"
        task_prompt = task_prompt.replace("{user_input}", question)
    elif model_name == "naver-clova-ix/donut-base-finetuned-rvlcdip":
        task_prompt = "<s_rvlcdip>"
    elif model_name in [
        "naver-clova-ix/donut-base-finetuned-cord-v1",
        "naver-clova-ix/donut-base-finetuned-cord-v1-2560",
    ]:
        task_prompt = "<s_cord>"
    elif model_name == "naver-clova-ix/donut-base-finetuned-cord-v2":
        task_prompt = "<s_cord-v2>"
    elif model_name == "naver-clova-ix/donut-base-finetuned-zhtrainticket":
        task_prompt = "<s_zhtrainticket>"
    elif model_name in ["naver-clova-ix/donut-proto", "naver-clova-ix/donut-base"]:
        # use a random prompt
        task_prompt = "hello world"
    else:
        raise ValueError("Model name not supported")
    prompt_tensors = original_model.decoder.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt")[
        "input_ids"
    ]

    # verify patch embeddings
    original_patch_embed = original_model.encoder.model.patch_embed(pixel_values)
    patch_embeddings, _ = model.encoder.embeddings(pixel_values)
    assert torch.allclose(original_patch_embed, patch_embeddings, atol=1e-3)

    # verify encoder hidden states
    original_last_hidden_state = original_model.encoder(pixel_values)
    last_hidden_state = model.encoder(pixel_values).last_hidden_state
    assert torch.allclose(original_last_hidden_state, last_hidden_state, atol=1e-2)

    # verify decoder hidden states
    original_logits = original_model(pixel_values, prompt_tensors, None).logits
    logits = model(pixel_values, decoder_input_ids=prompt_tensors).logits
    assert torch.allclose(original_logits, logits, atol=1e-3)
    print("Looks ok!")

    if pytorch_dump_folder_path is not None:
        print(f"Saving model and processor to {pytorch_dump_folder_path}")
        model.save_pretrained(pytorch_dump_folder_path)
        processor.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        model.push_to_hub("nielsr/" + model_name.split("/")[-1], commit_message="Update model")
        processor.push_to_hub("nielsr/" + model_name.split("/")[-1], commit_message="Update model")
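# A minimal command-line entry point for the Donut conversion function above. This is a sketch
# rather than the original script's argparse block; the default model name is simply one of the
# checkpoints handled explicitly in the function.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        default="naver-clova-ix/donut-base-finetuned-docvqa",
        type=str,
        help="Name of the original Donut model to convert.",
    )
    parser.add_argument(
        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
    )
    parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the converted model to the hub.")
    args = parser.parse_args()

    convert_donut_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)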