    def check_encoder_decoder_model_from_pretrained(
        self,
        config,
        pixel_values,
        encoder_hidden_states,
        decoder_config,
        decoder_input_ids,
        decoder_attention_mask,
        return_dict,
        **kwargs
    ):
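        # build the composite model from the in-memory encoder/decoder instances and run a forward pass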
        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
        kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict}
        enc_dec_model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs)
        outputs_encoder_decoder = enc_dec_model(
            pixel_values=pixel_values,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            return_dict=True,
        )

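        # logits should cover the decoder vocabulary for every decoder position, and the encoder output
        # should keep the input batch size and the encoder hidden size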
        self.assertEqual(
            outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
        )
        self.assertEqual(outputs_encoder_decoder["encoder_last_hidden_state"].shape[0], pixel_values.shape[0])
        self.assertEqual(outputs_encoder_decoder["encoder_last_hidden_state"].shape[-1], config.hidden_size)

    def check_pt_tf_equivalence(self, pt_model, tf_model, inputs_dict):

        pt_model.to(torch_device)
        pt_model.eval()

        # prepare inputs
        tf_inputs = inputs_dict
        pt_inputs = {k: torch.tensor(v.numpy()) for k, v in tf_inputs.items()}
        if "labels" in pt_inputs:
            pt_inputs["labels"] = pt_inputs["labels"].type(torch.LongTensor)

        # send pytorch inputs to the correct device
        pt_inputs = {
            k: v.to(device=torch_device) if isinstance(v, torch.Tensor) else v
            for k, v in pt_inputs.items()
        }

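        # PyTorch forward pass without gradient tracking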
        with torch.no_grad():
            pt_outputs = pt_model(**pt_inputs).to_tuple()

        tf_outputs = tf_model(**inputs_dict)
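        # the TF loss may be per-example; average it so it is comparable to the scalar PyTorch loss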
        if "loss" in tf_outputs:
            tf_outputs.loss = tf.math.reduce_mean(tf_outputs.loss)
        tf_outputs = tf_outputs.to_tuple()
        self.assertEqual(len(tf_outputs), len(pt_outputs), "Output lengths differ between TF and PyTorch")

        for tf_output, pt_output in zip(tf_outputs, pt_outputs):
            self.assert_almost_equals(tf_output.numpy(), pt_output.detach().to("cpu").numpy(), 1e-3)

        # PT -> TF: save the PyTorch encoder/decoder and reload them as a TF composite model
        with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname:
            pt_model.encoder.save_pretrained(encoder_tmp_dirname)
            pt_model.decoder.save_pretrained(decoder_tmp_dirname)
            tf_model_loaded = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
                encoder_tmp_dirname, decoder_tmp_dirname, encoder_from_pt=True, decoder_from_pt=True
            )
            # This is only for copying some specific attributes of this particular model.
            tf_model_loaded.config = pt_model.config

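        # the TF model reloaded from the PyTorch checkpoints should reproduce the same outputs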
        tf_outputs_loaded = tf_model_loaded(**inputs_dict)
        if "loss" in tf_outputs_loaded:
            tf_outputs_loaded.loss = tf.math.reduce_mean(tf_outputs_loaded.loss)
        tf_outputs_loaded = tf_outputs_loaded.to_tuple()
        self.assertEqual(len(tf_outputs_loaded), len(pt_outputs), "Output lengths differ between TF and PyTorch")

        for tf_output_loaded, pt_output in zip(tf_outputs_loaded, pt_outputs):
            self.assert_almost_equals(tf_output_loaded.numpy(), pt_output.detach().to("cpu").numpy(), 1e-3)

    def test_encoder_decoder_save_load_from_encoder_decoder(self):
        config = self.get_encoder_decoder_config_small()

        # create two random ViT/GPT2 models for vit-gpt2 & initialize weights (+cross_attention weights)
        encoder = TFViTModel(config.encoder)
        encoder(encoder.dummy_inputs)
        decoder = TFGPT2LMHeadModel(config.decoder)
        decoder(decoder.dummy_inputs)

        encoder_decoder_orig = TFVisionEncoderDecoderModel(encoder=encoder, decoder=decoder)

        pixel_values = floats_tensor(
            [
                13,
                encoder.config.num_channels,
                encoder.config.image_size,
                encoder.config.image_size,
            ]
        )
        decoder_input_ids = ids_tensor([13, 1], decoder.config.vocab_size)

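        # reference logits from the composite model built directly from the in-memory encoder/decoder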
        logits_orig = encoder_decoder_orig(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits

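        # save the encoder and decoder separately, then rebuild the composite model from the saved checkpoints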
        with tempfile.TemporaryDirectory() as tmp_dirname:
            encoder_path = os.path.join(tmp_dirname, "encoder")
            decoder_path = os.path.join(tmp_dirname, "decoder")

            encoder.save_pretrained(encoder_path)
            decoder.save_pretrained(decoder_path)

            encoder_decoder = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(encoder_path, decoder_path)

        logits_1 = encoder_decoder(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits

        self.assertTrue(abs(logits_orig.numpy().sum() - logits_1.numpy().sum()) < 1e-3)

        max_diff = np.max(np.abs(logits_1.numpy() - logits_orig.numpy()))
        self.assertAlmostEqual(max_diff, 0.0, places=4)

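        # save/reload the composite model as a whole and check the logits once more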
        with tempfile.TemporaryDirectory() as tmp_dirname:
            encoder_decoder.save_pretrained(tmp_dirname)
            encoder_decoder = TFVisionEncoderDecoderModel.from_pretrained(tmp_dirname)

        logits_2 = encoder_decoder(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits

        max_diff = np.max(np.abs(logits_2.numpy() - logits_orig.numpy()))
        self.assertAlmostEqual(max_diff, 0.0, places=4)

    def test_encoder_decoder_save_load_from_encoder_decoder_from_pt(self):
        config = self.get_encoder_decoder_config_small()

        # create two random ViT/GPT2 models for vit-gpt2 & initialize weights (+cross_attention weights)
        encoder_pt = ViTModel(config.encoder).to(torch_device).eval()
        decoder_pt = GPT2LMHeadModel(config.decoder).to(torch_device).eval()

        encoder_decoder_pt = VisionEncoderDecoderModel(encoder=encoder_pt, decoder=decoder_pt).to(torch_device).eval()

        pixel_values = floats_tensor(
            [
                13,
                encoder_pt.config.num_channels,
                encoder_pt.config.image_size,
                encoder_pt.config.image_size,
            ]
        )
        decoder_input_ids = ids_tensor([13, 1], decoder_pt.config.vocab_size)

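        # mirror the TF inputs as PyTorch tensors on the test device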
        pt_pixel_values = torch.tensor(pixel_values.numpy(), device=torch_device, dtype=torch.float)
        pt_decoder_input_ids = torch.tensor(decoder_input_ids.numpy(), device=torch_device, dtype=torch.long)

        logits_pt = encoder_decoder_pt(pixel_values=pt_pixel_values, decoder_input_ids=pt_decoder_input_ids).logits

        # PyTorch => TensorFlow
        with tempfile.TemporaryDirectory() as tmp_dirname_1, tempfile.TemporaryDirectory() as tmp_dirname_2:
            encoder_decoder_pt.encoder.save_pretrained(tmp_dirname_1)
            encoder_decoder_pt.decoder.save_pretrained(tmp_dirname_2)
            encoder_decoder_tf = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
                tmp_dirname_1, tmp_dirname_2, encoder_from_pt=True, decoder_from_pt=True
            )

        logits_tf = encoder_decoder_tf(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits

        max_diff = np.max(np.abs(logits_pt.detach().cpu().numpy() - logits_tf.numpy()))
        self.assertAlmostEqual(max_diff, 0.0, places=3)

        # Make sure `from_pretrained` following `save_pretrained` work and give the same result
        # (See https://github.com/huggingface/transformers/pull/14016)
        with tempfile.TemporaryDirectory() as tmp_dirname:
            encoder_decoder_tf.save_pretrained(tmp_dirname)
            encoder_decoder_tf = TFVisionEncoderDecoderModel.from_pretrained(tmp_dirname)

            logits_tf_2 = encoder_decoder_tf(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits

            max_diff = np.max(np.abs(logits_tf_2.numpy() - logits_tf.numpy()))
            self.assertAlmostEqual(max_diff, 0.0, places=3)

    def check_equivalence_pt_to_tf(self, config, decoder_config, inputs_dict):

        encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)

        pt_model = VisionEncoderDecoderModel(encoder_decoder_config)

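        # export the randomly initialized PyTorch encoder/decoder and reload them as a TF composite model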
        with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname:

            pt_model.encoder.save_pretrained(encoder_tmp_dirname)
            pt_model.decoder.save_pretrained(decoder_tmp_dirname)
            tf_model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
                encoder_tmp_dirname, decoder_tmp_dirname, encoder_from_pt=True, decoder_from_pt=True
            )
            # This is only for copying some specific attributes of this particular model.
            tf_model.config = pt_model.config

        self.check_pt_tf_equivalence(pt_model, tf_model, inputs_dict)

    def check_pt_tf_equivalence(self, pt_model, tf_model, inputs_dict):

        pt_model.to(torch_device)
        pt_model.eval()

        # prepare inputs
        tf_inputs = inputs_dict
        pt_inputs = {k: torch.tensor(v.numpy()) for k, v in tf_inputs.items()}

        with torch.no_grad():
            pt_outputs = pt_model(**pt_inputs).to_tuple()

        tf_outputs = tf_model(**inputs_dict).to_tuple()
        self.assertEqual(len(tf_outputs), len(pt_outputs), "Output lengths differ between TF and PyTorch")
        for tf_output, pt_output in zip(tf_outputs, pt_outputs):
            self.assert_almost_equals(tf_output.numpy(), pt_output.numpy(), 1e-3)

        # PT -> TF: save the PyTorch encoder/decoder and reload them as a TF composite model
        with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname:
            pt_model.encoder.save_pretrained(encoder_tmp_dirname)
            pt_model.decoder.save_pretrained(decoder_tmp_dirname)
            tf_model_loaded = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
                encoder_tmp_dirname, decoder_tmp_dirname, encoder_from_pt=True, decoder_from_pt=True
            )
            # This is only for copying some specific attributes of this particular model.
            tf_model_loaded.config = pt_model.config

        tf_outputs_loaded = tf_model_loaded(**inputs_dict).to_tuple()
        self.assertEqual(len(tf_outputs_loaded), len(pt_outputs), "Output lengths differ between TF and PyTorch")
        for tf_output_loaded, pt_output in zip(tf_outputs_loaded, pt_outputs):
            self.assert_almost_equals(tf_output_loaded.numpy(), pt_output.numpy(), 1e-3)

    def test_encoder_decoder_from_pretrained(self):
        load_weight_prefix = TFVisionEncoderDecoderModel.load_weight_prefix

        config = self.get_encoder_decoder_config()
        feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
        decoder_tokenizer = AutoTokenizer.from_pretrained("../gpt2")

        img = prepare_img()
        pixel_values = feature_extractor(images=img, return_tensors="tf").pixel_values
        decoder_input_ids = decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids

        with tempfile.TemporaryDirectory() as tmp_dirname:

            # Since most of HF's models don't have pretrained cross-attention layers, they are randomly
            # initialized even if we create models using the `from_pretrained` method.
            # For the tests, the decoder needs to be a model with pretrained cross-attention layers.
            # So we create pretrained models (without `load_weight_prefix`), save them, and later
            # load them using `from_pretrained`.
            # (we don't need to do this for the encoder, but let's keep the code similar between encoder and decoder)
            encoder = TFAutoModel.from_pretrained("google/vit-base-patch16-224-in21k", name="encoder")
            # It's necessary to specify `add_cross_attention=True` here.
            decoder = TFAutoModelForCausalLM.from_pretrained(
                "gpt2", is_decoder=True, add_cross_attention=True, name="decoder"
            )
            pretrained_encoder_dir = os.path.join(tmp_dirname, "pretrained_encoder")
            pretrained_decoder_dir = os.path.join(tmp_dirname, "pretrained_decoder")
            encoder.save_pretrained(pretrained_encoder_dir)
            decoder.save_pretrained(pretrained_decoder_dir)
            del encoder
            del decoder

            enc_dec_model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
                pretrained_encoder_dir,
                pretrained_decoder_dir,
            )
            # check that the from pretrained methods work
            enc_dec_model.save_pretrained(tmp_dirname)
            enc_dec_model = TFVisionEncoderDecoderModel.from_pretrained(tmp_dirname)

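            # compute the loss of the reloaded composite model, using the decoder inputs as labels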
            output = enc_dec_model(pixel_values, decoder_input_ids=decoder_input_ids, labels=decoder_input_ids)

            loss_pretrained = output.loss
            del enc_dec_model

            # Create the model using `__init__` with loaded ``pretrained`` encoder / decoder
            encoder = TFAutoModel.from_pretrained(
                pretrained_encoder_dir, load_weight_prefix=load_weight_prefix, name="encoder"
            )
            decoder = TFAutoModelForCausalLM.from_pretrained(
                pretrained_decoder_dir, load_weight_prefix=load_weight_prefix, name="decoder"
            )
            enc_dec_model = TFVisionEncoderDecoderModel(config=config, encoder=encoder, decoder=decoder)

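        # the loss from the model assembled via `__init__` should match the one obtained via `from_pretrained`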
        output = enc_dec_model(pixel_values, decoder_input_ids=decoder_input_ids, labels=decoder_input_ids)

        loss_init = output.loss

        max_diff = np.max(np.abs(loss_pretrained - loss_init))
        expected_diff = 0.0

        self.assertAlmostEqual(max_diff, expected_diff, places=4)

    def get_from_encoderdecoder_pretrained_model(self):
        return TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
            "google/vit-base-patch16-224-in21k", "gpt2"
        )