def test_real_model_save_load_from_pretrained(self): model_2 = self.get_pretrained_model() pixel_values = floats_tensor( [ 13, model_2.config.encoder.num_channels, model_2.config.encoder.image_size, model_2.config.encoder.image_size, ] ) decoder_input_ids = ids_tensor([13, 1], model_2.config.decoder.vocab_size) outputs = model_2( pixel_values=pixel_values, decoder_input_ids=decoder_input_ids, ) out_2 = np.array(outputs[0]) out_2[np.isnan(out_2)] = 0 with tempfile.TemporaryDirectory() as tmp_dirname: model_2.save_pretrained(tmp_dirname) model_1 = TFVisionEncoderDecoderModel.from_pretrained(tmp_dirname) after_outputs = model_1(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids) out_1 = np.array(after_outputs[0]) out_1[np.isnan(out_1)] = 0 max_diff = np.amax(np.abs(out_1 - out_2)) self.assertLessEqual(max_diff, 1e-5)
def check_save_and_load( self, config, pixel_values, encoder_hidden_states, decoder_config, decoder_input_ids, decoder_attention_mask, **kwargs ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = TFVisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) outputs = enc_dec_model( pixel_values=pixel_values, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, ) out_2 = np.array(outputs[0]) out_2[np.isnan(out_2)] = 0 with tempfile.TemporaryDirectory() as tmpdirname: enc_dec_model.save_pretrained(tmpdirname) enc_dec_model = TFVisionEncoderDecoderModel.from_pretrained(tmpdirname) after_outputs = enc_dec_model( pixel_values=pixel_values, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, ) out_1 = np.array(after_outputs[0]) out_1[np.isnan(out_1)] = 0 max_diff = np.amax(np.abs(out_1 - out_2)) self.assertLessEqual(max_diff, 1e-5)
def test_inference_coco_en(self): loc = "ydshieh/vit-gpt2-coco-en" feature_extractor = ViTFeatureExtractor.from_pretrained(loc) tokenizer = AutoTokenizer.from_pretrained(loc) model = TFVisionEncoderDecoderModel.from_pretrained(loc) # We will verify our results on an image of cute cats img = Image.open( "./tests/fixtures/tests_samples/COCO/000000039769.png") pixel_values = feature_extractor(images=img, return_tensors="tf").pixel_values decoder_input_ids = tf.constant([[model.config.decoder_start_token_id] ]) logits = model(pixel_values, decoder_input_ids)[0].numpy() # verify the logits expected_shape = (1, 1, model.config.decoder.vocab_size) self.assertEqual(logits.shape, expected_shape) EXPECTED_LOGIT_SLICE = np.array([ -38.705807, -30.639929, -31.41903, -39.012012, -38.38696, -34.887207, -33.290855, -35.68447, -38.508484, -36.124645, ]) max_diff = np.amax(np.abs(logits[0, 0, :10] - EXPECTED_LOGIT_SLICE)) self.assertLessEqual(max_diff, 1e-4) def generate_step(pixel_values): outputs = model.generate(pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True, output_scores=True) output_ids = outputs.sequences preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True) preds = [pred.strip() for pred in preds] return preds, outputs.scores.numpy() preds, scores = generate_step(pixel_values) # should produce # ["a cat laying on top of a couch next to another cat"] self.assertEqual( preds, ["a cat laying on top of a couch next to another cat"])
def test_encoder_decoder_save_load_from_encoder_decoder(self): config = self.get_encoder_decoder_config_small() # create two random ViT/GPT2 models for vit-gpt2 & initialize weights (+cross_attention weights) encoder = TFViTModel(config.encoder) encoder(encoder.dummy_inputs) decoder = TFGPT2LMHeadModel(config.decoder) decoder(decoder.dummy_inputs) encoder_decoder_orig = TFVisionEncoderDecoderModel(encoder=encoder, decoder=decoder) pixel_values = floats_tensor([ 13, encoder.config.num_channels, encoder.config.image_size, encoder.config.image_size, ]) decoder_input_ids = ids_tensor([13, 1], decoder.config.vocab_size) logits_orig = encoder_decoder_orig( pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits with tempfile.TemporaryDirectory() as tmp_dirname: encoder_path = os.path.join(tmp_dirname, "encoder") decoder_path = os.path.join(tmp_dirname, "decoder") encoder.save_pretrained(encoder_path) decoder.save_pretrained(decoder_path) encoder_decoder = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained( encoder_path, decoder_path) logits_1 = encoder_decoder(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits self.assertTrue( logits_orig.numpy().sum() - logits_1.numpy().sum() < 1e-3) max_diff = np.max(np.abs(logits_1.numpy() - logits_orig.numpy())) self.assertAlmostEqual(max_diff, 0.0, places=4) with tempfile.TemporaryDirectory() as tmp_dirname: encoder_decoder.save_pretrained(tmp_dirname) encoder_decoder = TFVisionEncoderDecoderModel.from_pretrained( tmp_dirname) logits_2 = encoder_decoder(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits max_diff = np.max(np.abs(logits_2.numpy() - logits_orig.numpy())) self.assertAlmostEqual(max_diff, 0.0, places=4)
def test_encoder_decoder_save_load_from_encoder_decoder_from_pt(self): config = self.get_encoder_decoder_config_small() # create two random ViT/GPT2 models for vit-gpt2 & initialize weights (+cross_attention weights) encoder_pt = ViTModel(config.encoder).to(torch_device).eval() decoder_pt = GPT2LMHeadModel(config.decoder).to(torch_device).eval() encoder_decoder_pt = VisionEncoderDecoderModel(encoder=encoder_pt, decoder=decoder_pt).to(torch_device).eval() pixel_values = floats_tensor( [ 13, encoder_pt.config.num_channels, encoder_pt.config.image_size, encoder_pt.config.image_size, ] ) decoder_input_ids = ids_tensor([13, 1], decoder_pt.config.vocab_size) pt_pixel_values = torch.tensor(pixel_values.numpy(), device=torch_device, dtype=torch.float) pt_decoder_input_ids = torch.tensor(decoder_input_ids.numpy(), device=torch_device, dtype=torch.long) logits_pt = encoder_decoder_pt(pixel_values=pt_pixel_values, decoder_input_ids=pt_decoder_input_ids).logits # PyTorch => TensorFlow with tempfile.TemporaryDirectory() as tmp_dirname_1, tempfile.TemporaryDirectory() as tmp_dirname_2: encoder_decoder_pt.encoder.save_pretrained(tmp_dirname_1) encoder_decoder_pt.decoder.save_pretrained(tmp_dirname_2) encoder_decoder_tf = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained( tmp_dirname_1, tmp_dirname_2, encoder_from_pt=True, decoder_from_pt=True ) logits_tf = encoder_decoder_tf(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits max_diff = np.max(np.abs(logits_pt.detach().cpu().numpy() - logits_tf.numpy())) self.assertAlmostEqual(max_diff, 0.0, places=3) # Make sure `from_pretrained` following `save_pretrained` work and give the same result # (See https://github.com/huggingface/transformers/pull/14016) with tempfile.TemporaryDirectory() as tmp_dirname: encoder_decoder_tf.save_pretrained(tmp_dirname) encoder_decoder_tf = TFVisionEncoderDecoderModel.from_pretrained(tmp_dirname) logits_tf_2 = encoder_decoder_tf(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits max_diff = np.max(np.abs(logits_tf_2.numpy() - logits_tf.numpy())) self.assertAlmostEqual(max_diff, 0.0, places=3)
def test_encoder_decoder_from_pretrained(self): load_weight_prefix = TFVisionEncoderDecoderModel.load_weight_prefix config = self.get_encoder_decoder_config() feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k") decoder_tokenizer = AutoTokenizer.from_pretrained("../gpt2") img = prepare_img() pixel_values = feature_extractor(images=img, return_tensors="tf").pixel_values decoder_input_ids = decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids with tempfile.TemporaryDirectory() as tmp_dirname: # Since most of HF's models don't have pretrained cross-attention layers, they are randomly # initialized even if we create models using `from_pretrained` method. # For the tests, the decoder need to be a model with pretrained cross-attention layers. # So we create pretrained models (without `load_weight_prefix`), save them, and later, # we load them using `from_pretrained`. # (we don't need to do this for encoder, but let's make the code more similar between encoder/decoder) encoder = TFAutoModel.from_pretrained("google/vit-base-patch16-224-in21k", name="encoder") # It's necessary to specify `add_cross_attention=True` here. decoder = TFAutoModelForCausalLM.from_pretrained( "../gpt2", is_decoder=True, add_cross_attention=True, name="decoder" ) pretrained_encoder_dir = os.path.join(tmp_dirname, "pretrained_encoder") pretrained_decoder_dir = os.path.join(tmp_dirname, "pretrained_decoder") encoder.save_pretrained(pretrained_encoder_dir) decoder.save_pretrained(pretrained_decoder_dir) del encoder del decoder enc_dec_model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained( pretrained_encoder_dir, pretrained_decoder_dir, ) # check that the from pretrained methods work enc_dec_model.save_pretrained(tmp_dirname) enc_dec_model = TFVisionEncoderDecoderModel.from_pretrained(tmp_dirname) output = enc_dec_model(pixel_values, decoder_input_ids=decoder_input_ids, labels=decoder_input_ids) loss_pretrained = output.loss del enc_dec_model # Create the model using `__init__` with loaded ``pretrained`` encoder / decoder encoder = TFAutoModel.from_pretrained( pretrained_encoder_dir, load_weight_prefix=load_weight_prefix, name="encoder" ) decoder = TFAutoModelForCausalLM.from_pretrained( pretrained_decoder_dir, load_weight_prefix=load_weight_prefix, name="decoder" ) enc_dec_model = TFVisionEncoderDecoderModel(config=config, encoder=encoder, decoder=decoder) output = enc_dec_model(pixel_values, decoder_input_ids=decoder_input_ids, labels=decoder_input_ids) loss_init = output.loss max_diff = np.max(np.abs(loss_pretrained - loss_init)) expected_diff = 0.0 self.assertAlmostEqual(max_diff, expected_diff, places=4)
def get_encoderdecoder_model(self): return TFVisionEncoderDecoderModel.from_pretrained("ydshieh/vit-gpt2-coco-en")