def check_encoder_decoder_model_from_pretrained( self, config, pixel_values, encoder_hidden_states, decoder_config, decoder_input_ids, decoder_attention_mask, return_dict, **kwargs ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict} enc_dec_model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs) outputs_encoder_decoder = enc_dec_model( pixel_values=pixel_values, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, return_dict=True, ) self.assertEqual( outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) ) self.assertEqual(outputs_encoder_decoder["encoder_last_hidden_state"].shape[0], pixel_values.shape[0]) self.assertEqual(outputs_encoder_decoder["encoder_last_hidden_state"].shape[-1], config.hidden_size)
def check_pt_tf_equivalence(self, pt_model, tf_model, inputs_dict): pt_model.to(torch_device) pt_model.eval() # prepare inputs tf_inputs = inputs_dict pt_inputs = {k: torch.tensor(v.numpy()) for k, v in tf_inputs.items()} if "labels" in pt_inputs: pt_inputs["labels"] = pt_inputs["labels"].type(torch.LongTensor) # send pytorch inputs to the correct device pt_inputs = { k: v.to(device=torch_device) if isinstance(v, torch.Tensor) else v for k, v in pt_inputs.items() } with torch.no_grad(): pt_outputs = pt_model(**pt_inputs).to_tuple() tf_outputs = tf_model(**inputs_dict) if "loss" in tf_outputs: tf_outputs.loss = tf.math.reduce_mean(tf_outputs.loss) tf_outputs = tf_outputs.to_tuple() self.assertEqual(len(tf_outputs), len(pt_outputs), "Output lengths differ between TF and PyTorch") for tf_output, pt_output in zip(tf_outputs, pt_outputs): self.assert_almost_equals(tf_output.numpy(), pt_output.detach().to("cpu").numpy(), 1e-3) # PT -> TF with tempfile.TemporaryDirectory( ) as encoder_tmp_dirname, tempfile.TemporaryDirectory( ) as decoder_tmp_dirname: pt_model.encoder.save_pretrained(encoder_tmp_dirname) pt_model.decoder.save_pretrained(decoder_tmp_dirname) tf_model_loaded = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained( encoder_tmp_dirname, decoder_tmp_dirname, encoder_from_pt=True, decoder_from_pt=True) # This is only for copying some specific attributes of this particular model. tf_model_loaded.config = pt_model.config tf_outputs_loaded = tf_model_loaded(**inputs_dict) if "loss" in tf_outputs_loaded: tf_outputs_loaded.loss = tf.math.reduce_mean( tf_outputs_loaded.loss) tf_outputs_loaded = tf_outputs_loaded.to_tuple() self.assertEqual(len(tf_outputs_loaded), len(pt_outputs), "Output lengths differ between TF and PyTorch") for tf_output_loaded, pt_output in zip(tf_outputs_loaded, pt_outputs): self.assert_almost_equals(tf_output_loaded.numpy(), pt_output.detach().to("cpu").numpy(), 1e-3)
def test_encoder_decoder_save_load_from_encoder_decoder(self): config = self.get_encoder_decoder_config_small() # create two random ViT/GPT2 models for vit-gpt2 & initialize weights (+cross_attention weights) encoder = TFViTModel(config.encoder) encoder(encoder.dummy_inputs) decoder = TFGPT2LMHeadModel(config.decoder) decoder(decoder.dummy_inputs) encoder_decoder_orig = TFVisionEncoderDecoderModel(encoder=encoder, decoder=decoder) pixel_values = floats_tensor([ 13, encoder.config.num_channels, encoder.config.image_size, encoder.config.image_size, ]) decoder_input_ids = ids_tensor([13, 1], decoder.config.vocab_size) logits_orig = encoder_decoder_orig( pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits with tempfile.TemporaryDirectory() as tmp_dirname: encoder_path = os.path.join(tmp_dirname, "encoder") decoder_path = os.path.join(tmp_dirname, "decoder") encoder.save_pretrained(encoder_path) decoder.save_pretrained(decoder_path) encoder_decoder = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained( encoder_path, decoder_path) logits_1 = encoder_decoder(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits self.assertTrue( logits_orig.numpy().sum() - logits_1.numpy().sum() < 1e-3) max_diff = np.max(np.abs(logits_1.numpy() - logits_orig.numpy())) self.assertAlmostEqual(max_diff, 0.0, places=4) with tempfile.TemporaryDirectory() as tmp_dirname: encoder_decoder.save_pretrained(tmp_dirname) encoder_decoder = TFVisionEncoderDecoderModel.from_pretrained( tmp_dirname) logits_2 = encoder_decoder(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits max_diff = np.max(np.abs(logits_2.numpy() - logits_orig.numpy())) self.assertAlmostEqual(max_diff, 0.0, places=4)
def test_encoder_decoder_save_load_from_encoder_decoder_from_pt(self): config = self.get_encoder_decoder_config_small() # create two random ViT/GPT2 models for vit-gpt2 & initialize weights (+cross_attention weights) encoder_pt = ViTModel(config.encoder).to(torch_device).eval() decoder_pt = GPT2LMHeadModel(config.decoder).to(torch_device).eval() encoder_decoder_pt = VisionEncoderDecoderModel(encoder=encoder_pt, decoder=decoder_pt).to(torch_device).eval() pixel_values = floats_tensor( [ 13, encoder_pt.config.num_channels, encoder_pt.config.image_size, encoder_pt.config.image_size, ] ) decoder_input_ids = ids_tensor([13, 1], decoder_pt.config.vocab_size) pt_pixel_values = torch.tensor(pixel_values.numpy(), device=torch_device, dtype=torch.float) pt_decoder_input_ids = torch.tensor(decoder_input_ids.numpy(), device=torch_device, dtype=torch.long) logits_pt = encoder_decoder_pt(pixel_values=pt_pixel_values, decoder_input_ids=pt_decoder_input_ids).logits # PyTorch => TensorFlow with tempfile.TemporaryDirectory() as tmp_dirname_1, tempfile.TemporaryDirectory() as tmp_dirname_2: encoder_decoder_pt.encoder.save_pretrained(tmp_dirname_1) encoder_decoder_pt.decoder.save_pretrained(tmp_dirname_2) encoder_decoder_tf = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained( tmp_dirname_1, tmp_dirname_2, encoder_from_pt=True, decoder_from_pt=True ) logits_tf = encoder_decoder_tf(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits max_diff = np.max(np.abs(logits_pt.detach().cpu().numpy() - logits_tf.numpy())) self.assertAlmostEqual(max_diff, 0.0, places=3) # Make sure `from_pretrained` following `save_pretrained` work and give the same result # (See https://github.com/huggingface/transformers/pull/14016) with tempfile.TemporaryDirectory() as tmp_dirname: encoder_decoder_tf.save_pretrained(tmp_dirname) encoder_decoder_tf = TFVisionEncoderDecoderModel.from_pretrained(tmp_dirname) logits_tf_2 = encoder_decoder_tf(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits max_diff = np.max(np.abs(logits_tf_2.numpy() - logits_tf.numpy())) self.assertAlmostEqual(max_diff, 0.0, places=3)
def check_equivalence_pt_to_tf(self, config, decoder_config, inputs_dict): encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) pt_model = VisionEncoderDecoderModel(encoder_decoder_config) with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname: pt_model.encoder.save_pretrained(encoder_tmp_dirname) pt_model.decoder.save_pretrained(decoder_tmp_dirname) tf_model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained( encoder_tmp_dirname, decoder_tmp_dirname, encoder_from_pt=True, decoder_from_pt=True ) # This is only for copying some specific attributes of this particular model. tf_model.config = pt_model.config self.check_pt_tf_equivalence(pt_model, tf_model, inputs_dict)
def check_pt_tf_equivalence(self, pt_model, tf_model, inputs_dict): pt_model.to(torch_device) pt_model.eval() # prepare inputs tf_inputs = inputs_dict pt_inputs = {k: torch.tensor(v.numpy()) for k, v in tf_inputs.items()} with torch.no_grad(): pt_outputs = pt_model(**pt_inputs).to_tuple() tf_outputs = tf_model(**inputs_dict).to_tuple() self.assertEqual(len(tf_outputs), len(pt_outputs), "Output lengths differ between TF and PyTorch") for tf_output, pt_output in zip(tf_outputs, pt_outputs): self.assert_almost_equals(tf_output.numpy(), pt_output.numpy(), 1e-3) # PT -> TF with tempfile.TemporaryDirectory( ) as encoder_tmp_dirname, tempfile.TemporaryDirectory( ) as decoder_tmp_dirname: pt_model.encoder.save_pretrained(encoder_tmp_dirname) pt_model.decoder.save_pretrained(decoder_tmp_dirname) tf_model_loaded = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained( encoder_tmp_dirname, decoder_tmp_dirname, encoder_from_pt=True, decoder_from_pt=True) # This is only for copying some specific attributes of this particular model. tf_model_loaded.config = pt_model.config tf_outputs_loaded = tf_model_loaded(**inputs_dict).to_tuple() self.assertEqual(len(tf_outputs_loaded), len(pt_outputs), "Output lengths differ between TF and PyTorch") for tf_output_loaded, pt_output in zip(tf_outputs_loaded, pt_outputs): self.assert_almost_equals(tf_output_loaded.numpy(), pt_output.numpy(), 1e-3)
def test_encoder_decoder_from_pretrained(self): load_weight_prefix = TFVisionEncoderDecoderModel.load_weight_prefix config = self.get_encoder_decoder_config() feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k") decoder_tokenizer = AutoTokenizer.from_pretrained("../gpt2") img = prepare_img() pixel_values = feature_extractor(images=img, return_tensors="tf").pixel_values decoder_input_ids = decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids with tempfile.TemporaryDirectory() as tmp_dirname: # Since most of HF's models don't have pretrained cross-attention layers, they are randomly # initialized even if we create models using `from_pretrained` method. # For the tests, the decoder need to be a model with pretrained cross-attention layers. # So we create pretrained models (without `load_weight_prefix`), save them, and later, # we load them using `from_pretrained`. # (we don't need to do this for encoder, but let's make the code more similar between encoder/decoder) encoder = TFAutoModel.from_pretrained("google/vit-base-patch16-224-in21k", name="encoder") # It's necessary to specify `add_cross_attention=True` here. decoder = TFAutoModelForCausalLM.from_pretrained( "../gpt2", is_decoder=True, add_cross_attention=True, name="decoder" ) pretrained_encoder_dir = os.path.join(tmp_dirname, "pretrained_encoder") pretrained_decoder_dir = os.path.join(tmp_dirname, "pretrained_decoder") encoder.save_pretrained(pretrained_encoder_dir) decoder.save_pretrained(pretrained_decoder_dir) del encoder del decoder enc_dec_model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained( pretrained_encoder_dir, pretrained_decoder_dir, ) # check that the from pretrained methods work enc_dec_model.save_pretrained(tmp_dirname) enc_dec_model = TFVisionEncoderDecoderModel.from_pretrained(tmp_dirname) output = enc_dec_model(pixel_values, decoder_input_ids=decoder_input_ids, labels=decoder_input_ids) loss_pretrained = output.loss del enc_dec_model # Create the model using `__init__` with loaded ``pretrained`` encoder / decoder encoder = TFAutoModel.from_pretrained( pretrained_encoder_dir, load_weight_prefix=load_weight_prefix, name="encoder" ) decoder = TFAutoModelForCausalLM.from_pretrained( pretrained_decoder_dir, load_weight_prefix=load_weight_prefix, name="decoder" ) enc_dec_model = TFVisionEncoderDecoderModel(config=config, encoder=encoder, decoder=decoder) output = enc_dec_model(pixel_values, decoder_input_ids=decoder_input_ids, labels=decoder_input_ids) loss_init = output.loss max_diff = np.max(np.abs(loss_pretrained - loss_init)) expected_diff = 0.0 self.assertAlmostEqual(max_diff, expected_diff, places=4)
def get_from_encoderdecoder_pretrained_model(self): return TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained( "google/vit-base-patch16-224-in21k", "../gpt2" )