Code Example #1
    def __init__(self, model_name="ydshieh/vit-gpt2-coco-en", device=None):
        """
        ```
        ImageCaptioner constructor

        Args:
          model_name(str): name of the image captioning model
          device(str): device to use (e.g., 'cuda', 'cpu')
        ```
        """
        if not I.PIL_INSTALLED:
            raise Exception(
                "PIL is not installed. Please install with: pip install pillow>=9.0.1"
            )

        super().__init__(device=device,
                         quantize=False,
                         min_transformers_version="4.12.3")
        self.model_name = model_name
        from transformers import (
            AutoTokenizer,
            VisionEncoderDecoderModel,
            ViTFeatureExtractor,
        )

        self.model = VisionEncoderDecoderModel.from_pretrained(
            self.model_name).to(self.torch_device)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.extractor = ViTFeatureExtractor.from_pretrained(self.model_name)
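The constructor above only loads the model, tokenizer, and feature extractor. A minimal standalone sketch of how those three pieces are typically combined to produce a caption follows; the image path and generation parameters are illustrative and not part of the original class.

from PIL import Image
from transformers import AutoTokenizer, VisionEncoderDecoderModel, ViTFeatureExtractor

model_name = "ydshieh/vit-gpt2-coco-en"
model = VisionEncoderDecoderModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
extractor = ViTFeatureExtractor.from_pretrained(model_name)

# Illustrative input image; any RGB image works.
image = Image.open("example.jpg").convert("RGB")
pixel_values = extractor(images=image, return_tensors="pt").pixel_values

# Beam-search generation; max_length / num_beams mirror the test below.
output_ids = model.generate(pixel_values, max_length=16, num_beams=4)
caption = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(caption)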
Code Example #2
    def test_inference_coco_en(self):

        loc = "ydshieh/vit-gpt2-coco-en"

        feature_extractor = ViTFeatureExtractor.from_pretrained(loc)
        tokenizer = AutoTokenizer.from_pretrained(loc)
        model = VisionEncoderDecoderModel.from_pretrained(loc)
        model.to(torch_device)
        model.eval()

        # We will verify our results on an image of cute cats
        img = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
        pixel_values = feature_extractor(images=img, return_tensors="pt").pixel_values.to(torch_device)

        decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]]).to(torch_device)

        with torch.no_grad():
            logits = model(pixel_values, decoder_input_ids)[0].detach().cpu().numpy()

        # verify the logits
        expected_shape = (1, 1, model.config.decoder.vocab_size)
        self.assertEqual(logits.shape, expected_shape)

        EXPECTED_LOGIT_SLICE = np.array(
            [
                -38.705807,
                -30.639929,
                -31.41903,
                -39.012012,
                -38.38696,
                -34.887207,
                -33.290855,
                -35.68447,
                -38.508484,
                -36.124645,
            ]
        )
        max_diff = np.amax(np.abs(logits[0, 0, :10] - EXPECTED_LOGIT_SLICE))
        self.assertLessEqual(max_diff, 1e-4)

        def generate_step(pixel_values):

            outputs = model.generate(
                pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True, output_scores=True
            )
            output_ids = outputs.sequences
            preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
            preds = [pred.strip() for pred in preds]

            return preds, outputs.sequences_scores.detach().cpu().numpy()

        preds, scores = generate_step(pixel_values)

        EXPECTED_SCORES = np.array([-0.59562886])
        max_diff = np.amax(np.abs(scores - EXPECTED_SCORES))
        self.assertLessEqual(max_diff, 1e-4)

        # should produce
        # ["a cat laying on top of a couch next to another cat"]
        self.assertEqual(preds, ["a cat laying on top of a couch next to another cat"])
Code Example #3
    def test_inference_handwritten(self):
        model = VisionEncoderDecoderModel.from_pretrained(
            "microsoft/trocr-base-handwritten").to(torch_device)

        ds = load_dataset("hf-internal-testing/fixtures_ocr", split="test")
        image = Image.open(ds[0]["file"]).convert("RGB")

        processor = self.default_processor
        pixel_values = processor(
            images=image, return_tensors="pt").pixel_values.to(torch_device)

        # forward pass
        decoder_input_ids = torch.tensor(
            [[model.config.decoder.decoder_start_token_id]]).to(torch_device)
        outputs = model(pixel_values=pixel_values,
                        decoder_input_ids=decoder_input_ids)
        logits = outputs.logits

        # verify the logits
        expected_shape = torch.Size((1, 1, model.decoder.config.vocab_size))
        self.assertEqual(outputs.logits.shape, expected_shape)

        expected_slice = torch.tensor([
            -1.4502, -4.6683, -0.5347, -2.9291, 9.1435, -3.0571, 8.9764,
            1.7560, 8.7358, -1.5311
        ]).to(torch_device)

        self.assertTrue(
            torch.allclose(logits[0, 0, :10], expected_slice, atol=1e-4))
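The test above only checks a single forward step. For reference, a hedged end-to-end sketch of TrOCR inference with the matching processor is shown here; the image path is illustrative.

from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# Illustrative input: a single line of handwritten text as an RGB image.
image = Image.open("handwritten_line.png").convert("RGB")
pixel_values = processor(images=image, return_tensors="pt").pixel_values

generated_ids = model.generate(pixel_values)
text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(text)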
Code Example #4
    def test_inference_printed(self):
        model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed").to(torch_device)

        dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test")
        image = Image.open(dataset[1]["file"]).convert("RGB")

        processor = self.default_processor
        pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device)

        # forward pass
        decoder_input_ids = torch.tensor([[model.config.decoder.decoder_start_token_id]]).to(torch_device)
        outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids)
        logits = outputs.logits

        # verify the logits
        expected_shape = torch.Size((1, 1, model.decoder.config.vocab_size))
        self.assertEqual(outputs.logits.shape, expected_shape)

        is_pillow_less_than_9 = version.parse(PIL.__version__) < version.parse("9.0.0")

        if is_pillow_less_than_9:
            expected_slice = torch.tensor(
                [-5.6816, -5.8388, 1.1398, -6.9034, 6.8505, -2.4393, 1.2284, -1.0232, -1.9661, -3.9210],
                device=torch_device,
            )
        else:
            expected_slice = torch.tensor(
                [-5.6844, -5.8372, 1.1518, -6.8984, 6.8587, -2.4453, 1.2347, -1.0241, -1.9649, -3.9109],
                device=torch_device,
            )

        self.assertTrue(torch.allclose(logits[0, 0, :10], expected_slice, atol=1e-4))
Code Example #5
    def check_save_and_load(
        self, config, decoder_config, decoder_input_ids, decoder_attention_mask, pixel_values=None, **kwargs
    ):
        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
        enc_dec_model = VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
        enc_dec_model.to(torch_device)
        enc_dec_model.eval()
        with torch.no_grad():
            outputs = enc_dec_model(
                pixel_values=pixel_values,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
            )
            out_2 = outputs[0].cpu().numpy()
            out_2[np.isnan(out_2)] = 0

            with tempfile.TemporaryDirectory() as tmpdirname:
                enc_dec_model.save_pretrained(tmpdirname)
                enc_dec_model = VisionEncoderDecoderModel.from_pretrained(tmpdirname)
                enc_dec_model.to(torch_device)

                after_outputs = enc_dec_model(
                    pixel_values=pixel_values,
                    decoder_input_ids=decoder_input_ids,
                    decoder_attention_mask=decoder_attention_mask,
                )
                out_1 = after_outputs[0].cpu().numpy()
                out_1[np.isnan(out_1)] = 0
                max_diff = np.amax(np.abs(out_1 - out_2))
                self.assertLessEqual(max_diff, 1e-5)
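Outside the test harness, the same save/load round trip applies to any VisionEncoderDecoderModel. A minimal sketch, assuming an encoder-decoder composed with from_encoder_decoder_pretrained (the checkpoint names are illustrative):

import tempfile
from transformers import VisionEncoderDecoderModel

# Compose an encoder-decoder from two independently pretrained checkpoints.
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    "google/vit-base-patch16-224-in21k", "gpt2"
)

# Round-trip through save_pretrained / from_pretrained, as the test above does.
with tempfile.TemporaryDirectory() as tmpdir:
    model.save_pretrained(tmpdir)
    reloaded = VisionEncoderDecoderModel.from_pretrained(tmpdir)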
Code Example #6
    def test_inference_rvlcdip(self):
        processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip")
        model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip").to(
            torch_device
        )

        dataset = load_dataset("hf-internal-testing/example-documents", split="test")
        image = dataset[1]["image"]

        pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device)

        # step 1: single forward pass
        decoder_input_ids = processor.tokenizer(
            "<s_rvlcdip>", add_special_tokens=False, return_tensors="pt"
        ).input_ids.to(torch_device)
        with torch.no_grad():
            outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids)
            logits = outputs.logits

        # verify the logits
        expected_shape = torch.Size((1, 1, model.decoder.config.vocab_size))
        self.assertEqual(outputs.logits.shape, expected_shape)

        expected_slice = torch.tensor([-17.6490, -4.8381, -15.7577], device=torch_device)
        self.assertTrue(torch.allclose(logits[0, 0, :3], expected_slice, atol=1e-4))

        # step 2: generation
        task_prompt = "<s_rvlcdip>"
        decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
        decoder_input_ids = decoder_input_ids.to(torch_device)

        outputs = model.generate(
            pixel_values,
            decoder_input_ids=decoder_input_ids,
            max_length=model.decoder.config.max_position_embeddings,
            early_stopping=True,
            pad_token_id=processor.tokenizer.pad_token_id,
            eos_token_id=processor.tokenizer.eos_token_id,
            use_cache=True,
            num_beams=1,
            bad_words_ids=[[processor.tokenizer.unk_token_id]],
            output_scores=True,
            return_dict_in_generate=True,
        )

        sequence = processor.batch_decode(outputs.sequences)[0]
        sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
        sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token

        # verify generated sequence
        self.assertEqual(sequence, "<s_class><advertisement/></s_class>")

        # verify scores
        self.assertEqual(len(outputs.scores), 4)
        self.assertTrue(
            torch.allclose(
                outputs.scores[0][0, :3], torch.tensor([-17.6490, -4.8381, -15.7577], device=torch_device), atol=1e-4
            )
        )
Code Example #7
    def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict):

        pt_model.to(torch_device)
        pt_model.eval()

        # prepare inputs
        flax_inputs = inputs_dict
        pt_inputs = {
            k: torch.tensor(v.tolist())
            for k, v in flax_inputs.items()
        }

        with torch.no_grad():
            pt_outputs = pt_model(**pt_inputs).to_tuple()

        fx_outputs = fx_model(**inputs_dict).to_tuple()
        self.assertEqual(len(fx_outputs), len(pt_outputs),
                         "Output lengths differ between Flax and PyTorch")
        for fx_output, pt_output in zip(fx_outputs, pt_outputs):
            self.assert_almost_equals(fx_output, pt_output.numpy(), 1e-5)

        # PT -> Flax
        with tempfile.TemporaryDirectory() as tmpdirname:
            pt_model.save_pretrained(tmpdirname)
            fx_model_loaded = FlaxVisionEncoderDecoderModel.from_pretrained(
                tmpdirname, from_pt=True)

        fx_outputs_loaded = fx_model_loaded(**inputs_dict).to_tuple()
        self.assertEqual(len(fx_outputs_loaded), len(pt_outputs),
                         "Output lengths differ between Flax and PyTorch")
        for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs):
            self.assert_almost_equals(fx_output_loaded, pt_output.numpy(),
                                      1e-5)

        # Flax -> PT
        with tempfile.TemporaryDirectory() as tmpdirname:
            fx_model.save_pretrained(tmpdirname)
            pt_model_loaded = VisionEncoderDecoderModel.from_pretrained(
                tmpdirname, from_flax=True)

        pt_model_loaded.to(torch_device)
        pt_model_loaded.eval()

        with torch.no_grad():
            pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()

        self.assertEqual(len(fx_outputs), len(pt_outputs_loaded),
                         "Output lengths differ between Flax and PyTorch")
        for fx_output, pt_output_loaded in zip(fx_outputs, pt_outputs_loaded):
            self.assert_almost_equals(fx_output, pt_output_loaded.numpy(),
                                      1e-5)
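The same PyTorch/Flax conversion exercised by this equivalence check also works on published checkpoints. A brief sketch, assuming Flax is installed and reusing the checkpoint name from the captioning examples above:

from transformers import FlaxVisionEncoderDecoderModel, VisionEncoderDecoderModel

# Load Flax weights converted on the fly from a PyTorch checkpoint...
flax_model = FlaxVisionEncoderDecoderModel.from_pretrained(
    "ydshieh/vit-gpt2-coco-en", from_pt=True
)

# ...and, conversely, PyTorch weights from a Flax save.
flax_model.save_pretrained("vit-gpt2-flax")
pt_model = VisionEncoderDecoderModel.from_pretrained("vit-gpt2-flax", from_flax=True)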
Code Example #8
    def test_real_model_save_load_from_pretrained(self):
        model_2, inputs = self.get_pretrained_model_and_inputs()
        model_2.to(torch_device)

        with torch.no_grad():
            outputs = model_2(**inputs)
            out_2 = outputs[0].cpu().numpy()
            out_2[np.isnan(out_2)] = 0

            with tempfile.TemporaryDirectory() as tmp_dirname:
                model_2.save_pretrained(tmp_dirname)
                model_1 = VisionEncoderDecoderModel.from_pretrained(tmp_dirname)
                model_1.to(torch_device)

                after_outputs = model_1(**inputs)
                out_1 = after_outputs[0].cpu().numpy()
                out_1[np.isnan(out_1)] = 0
                max_diff = np.amax(np.abs(out_1 - out_2))
                self.assertLessEqual(max_diff, 1e-5)
Code Example #9
    def test_inference_cordv2(self):
        processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
        model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2").to(
            torch_device
        )

        dataset = load_dataset("hf-internal-testing/example-documents", split="test")
        image = dataset[2]["image"]

        pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device)
        decoder_input_ids = processor.tokenizer(
            "<s_cord-v2>", add_special_tokens=False, return_tensors="pt"
        ).input_ids.to(torch_device)

        # step 1: single forward pass
        with torch.no_grad():
            outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids)
            logits = outputs.logits

        # verify the logits
        expected_shape = torch.Size((1, 1, model.decoder.config.vocab_size))
        self.assertEqual(outputs.logits.shape, expected_shape)

        expected_slice = torch.tensor([-27.4344, -3.2686, -19.3524], device=torch_device)
        self.assertTrue(torch.allclose(logits[0, 0, :3], expected_slice, atol=1e-4))

        # step 2: generation
        task_prompt = "<s_cord-v2>"
        decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
        decoder_input_ids = decoder_input_ids.to(torch_device)

        outputs = model.generate(
            pixel_values,
            decoder_input_ids=decoder_input_ids,
            max_length=model.decoder.config.max_position_embeddings,
            early_stopping=True,
            pad_token_id=processor.tokenizer.pad_token_id,
            eos_token_id=processor.tokenizer.eos_token_id,
            use_cache=True,
            num_beams=1,
            bad_words_ids=[[processor.tokenizer.unk_token_id]],
            output_scores=True,
            return_dict_in_generate=True,
        )

        sequence = processor.batch_decode(outputs.sequences)[0]
        sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
        sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token

        # verify generated sequence
        # fmt: off
        expected_sequence = "<s_menu><s_nm> CINNAMON SUGAR</s_nm><s_unitprice> 17,000</s_unitprice><s_cnt> 1 x</s_cnt><s_price> 17,000</s_price></s_menu><s_sub_total><s_subtotal_price> 17,000</s_subtotal_price></s_sub_total><s_total><s_total_price> 17,000</s_total_price><s_cashprice> 20,000</s_cashprice><s_changeprice> 3,000</s_changeprice></s_total>"  # noqa: E231
        # fmt: on
        self.assertEqual(sequence, expected_sequence)

        # verify scores
        self.assertEqual(len(outputs.scores), 43)
        self.assertTrue(
            torch.allclose(
                outputs.scores[0][0, :3], torch.tensor([-27.4344, -3.2686, -19.3524], device=torch_device), atol=1e-4
            )
        )
Code Example #10
    def test_inference_docvqa(self):
        processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
        model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa").to(
            torch_device
        )

        dataset = load_dataset("hf-internal-testing/example-documents", split="test")
        image = dataset[0]["image"]

        pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device)
        decoder_input_ids = processor.tokenizer(
            "<s_docvqa>", add_special_tokens=False, return_tensors="pt"
        ).input_ids.to(torch_device)

        # step 1: single forward pass
        with torch.no_grad():
            outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids)
            logits = outputs.logits

        # verify the logits
        expected_shape = torch.Size([1, 1, 57532])
        self.assertEqual(outputs.logits.shape, expected_shape)

        expected_slice = torch.tensor([24.2731, -6.4522, 32.4130]).to(torch_device)
        self.assertTrue(torch.allclose(logits[0, 0, :3], expected_slice, atol=1e-4))

        # step 2: generation
        task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
        question = "When is the coffee break?"
        prompt = task_prompt.replace("{user_input}", question)
        decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
        decoder_input_ids = decoder_input_ids.to(torch_device)

        outputs = model.generate(
            pixel_values,
            decoder_input_ids=decoder_input_ids,
            max_length=model.decoder.config.max_position_embeddings,
            early_stopping=True,
            pad_token_id=processor.tokenizer.pad_token_id,
            eos_token_id=processor.tokenizer.eos_token_id,
            use_cache=True,
            num_beams=1,
            bad_words_ids=[[processor.tokenizer.unk_token_id]],
            output_scores=True,
            return_dict_in_generate=True,
        )
        sequence = processor.batch_decode(outputs.sequences)[0]
        sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
        sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token

        # verify generated sequence
        self.assertEqual(
            sequence, "<s_question> When is the coffee break?</s_question><s_answer> 11-14 to 11:39 a.m.</s_answer>"
        )

        # verify scores
        self.assertEqual(len(outputs.scores), 11)
        self.assertTrue(
            torch.allclose(
                outputs.scores[0][0, :3], torch.tensor([5.3153, -3.5276, 13.4781], device=torch_device), atol=1e-4
            )
        )
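Donut's raw output is a sequence of XML-like tags. DonutProcessor provides token2json to turn that cleaned sequence into a dictionary; a short hedged follow-up to the generation above, reusing the variable names from the test:

# Convert the cleaned tag sequence into a structured dict.
answer = processor.token2json(sequence)
print(answer)  # e.g. {"question": "When is the coffee break?", "answer": "11-14 to 11:39 a.m."}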
Code Example #11
File: predictor.py   Project: notAI-tech/fastDeploy
import os
import pickle
from glob import glob

from tqdm import tqdm
from PIL import Image
from craft_text_detector import Craft  # assumed source of the Craft detector used below

output_dir = "outputs/"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

craft = Craft(output_dir=output_dir, crop_type="poly", cuda=False)

from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image

trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
trocr_model = VisionEncoderDecoderModel.from_pretrained(
    "microsoft/trocr-base-handwritten"
)


def predictor(x, batch_size=1):
    print(f"{len(x)} input images received.")
    results = []
    for _ in x:  # _ is the path of a single input image
        try:
            craft.detect_text(_)
            # CRAFT writes crops to outputs/<name>_crops/crop_<i>.png;
            # sort them numerically by their crop index
            crops = sorted(
                glob(f"outputs/{os.path.splitext(os.path.basename(_))[0]}_crops/*png"),
                key=lambda c: int(c.split("crop_")[1].split(".png")[0]),
            )
            regions = [