def __init__(self, model_name="ydshieh/vit-gpt2-coco-en", device=None):
    """
    ```
    ImageCaptioner constructor

    Args:
      model_name(str): name of image captioning model
      device(str): device to use (e.g., 'cuda', 'cpu')
    ```
    """
    if not I.PIL_INSTALLED:
        raise Exception(
            "PIL is not installed. Please install with: pip install pillow>=9.0.1"
        )
    super().__init__(device=device, quantize=False, min_transformers_version="4.12.3")
    self.model_name = model_name
    from transformers import (
        AutoTokenizer,
        VisionEncoderDecoderModel,
        ViTFeatureExtractor,
    )

    self.model = VisionEncoderDecoderModel.from_pretrained(self.model_name).to(self.torch_device)
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
    self.extractor = ViTFeatureExtractor.from_pretrained(self.model_name)
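# A minimal captioning sketch using the components created above. The `caption` helper
# is hypothetical (not part of the original class); it assumes a PIL image as input and
# follows the standard VisionEncoderDecoderModel generate/decode pattern.
def caption(self, img):
    pixel_values = self.extractor(images=img, return_tensors="pt").pixel_values.to(self.torch_device)
    output_ids = self.model.generate(pixel_values, max_length=16, num_beams=4)
    return [p.strip() for p in self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)]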
def test_inference_interpolate_pos_encoding(self):
    # ViT models have an `interpolate_pos_encoding` argument in their forward method,
    # which interpolates the pre-trained position embeddings so that the model can be
    # used at higher resolutions. The DINO model by Facebook AI leverages this
    # to visualize self-attention on higher resolution images.
    model = ViTModel.from_pretrained("facebook/dino-vits8").to(torch_device)

    feature_extractor = ViTFeatureExtractor.from_pretrained("facebook/dino-vits8", size=480)
    image = prepare_img()
    inputs = feature_extractor(images=image, return_tensors="pt")
    pixel_values = inputs.pixel_values.to(torch_device)

    # forward pass
    with torch.no_grad():
        outputs = model(pixel_values, interpolate_pos_encoding=True)

    # verify the logits
    expected_shape = torch.Size((1, 3601, 384))
    self.assertEqual(outputs.last_hidden_state.shape, expected_shape)

    expected_slice = torch.tensor(
        [[4.2340, 4.3906, -6.6692], [4.5463, 1.8928, -6.7257], [4.4429, 0.8496, -5.8585]]
    ).to(torch_device)

    self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4))
def test_inference_coco_en(self):
    loc = "ydshieh/vit-gpt2-coco-en"

    feature_extractor = ViTFeatureExtractor.from_pretrained(loc)
    tokenizer = AutoTokenizer.from_pretrained(loc)
    model = VisionEncoderDecoderModel.from_pretrained(loc)
    model.to(torch_device)
    model.eval()

    # We will verify our results on an image of cute cats
    img = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    pixel_values = feature_extractor(images=img, return_tensors="pt").pixel_values.to(torch_device)

    decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]]).to(torch_device)

    with torch.no_grad():
        logits = model(pixel_values, decoder_input_ids)[0].detach().cpu().numpy()

    # verify the logits
    expected_shape = (1, 1, model.config.decoder.vocab_size)
    self.assertEqual(logits.shape, expected_shape)

    EXPECTED_LOGIT_SLICE = np.array(
        [-38.705807, -30.639929, -31.41903, -39.012012, -38.38696,
         -34.887207, -33.290855, -35.68447, -38.508484, -36.124645]
    )
    max_diff = np.amax(np.abs(logits[0, 0, :10] - EXPECTED_LOGIT_SLICE))
    self.assertLessEqual(max_diff, 1e-4)

    def generate_step(pixel_values):
        outputs = model.generate(
            pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True, output_scores=True
        )
        output_ids = outputs.sequences
        preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        preds = [pred.strip() for pred in preds]

        return preds, outputs.sequences_scores.detach().cpu().numpy()

    preds, scores = generate_step(pixel_values)

    EXPECTED_SCORES = np.array([-0.59562886])
    max_diff = np.amax(np.abs(scores - EXPECTED_SCORES))
    self.assertLessEqual(max_diff, 1e-4)

    # should produce
    # ["a cat laying on top of a couch next to another cat"]
    self.assertEqual(preds, ["a cat laying on top of a couch next to another cat"])
def test_inference_coco_en(self):
    loc = "ydshieh/vit-gpt2-coco-en"

    feature_extractor = ViTFeatureExtractor.from_pretrained(loc)
    tokenizer = AutoTokenizer.from_pretrained(loc)
    model = FlaxVisionEncoderDecoderModel.from_pretrained(loc)

    img = prepare_img()
    pixel_values = feature_extractor(images=img, return_tensors="np").pixel_values

    decoder_input_ids = np.array([[model.config.decoder_start_token_id]])
    logits = model(pixel_values, decoder_input_ids)[0]
    logits = np.array(logits)

    # verify the logits
    expected_shape = (1, 1, model.config.decoder.vocab_size)
    self.assertEqual(logits.shape, expected_shape)

    EXPECTED_LOGIT_SLICE = np.array(
        [-38.705837, -30.639936, -31.41905, -39.01204, -38.38698,
         -34.887215, -33.29087, -35.684475, -38.50852, -36.124676]
    )
    max_diff = np.amax(np.abs(logits[0, 0, :10] - EXPECTED_LOGIT_SLICE))
    self.assertLessEqual(max_diff, 1e-4)

    def generate_step(pixel_values):
        outputs = model.generate(pixel_values, max_length=16, num_beams=4)
        output_ids = outputs.sequences
        preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        preds = [pred.strip() for pred in preds]

        return preds, outputs.scores

    preds, scores = generate_step(pixel_values)

    EXPECTED_SCORES = np.array([-0.59563464])
    scores = np.array(scores)
    max_diff = np.amax(np.abs(scores - EXPECTED_SCORES))
    self.assertLessEqual(max_diff, 1e-4)

    # should produce
    # ["a cat laying on top of a couch next to another cat"]
    self.assertEqual(preds, ["a cat laying on top of a couch next to another cat"])
def save_pretrained_model(config):
    try:
        feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
        model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')

        feature_extractor.save_pretrained(config.pretrained_vitfe_path)
        model.save_pretrained(config.pretrained_vit_path)
    except Exception as e:
        print(f'Error - {str(e)}')
        return 1
    return 0
def train(feature_extractor, model, decoder, dataloader):
    for image, caption, target, target_seq_len in dataloader:
        # each batch: transformed images plus tokenized caption/target tensors and their lengths
        images_list = [image[i] for i in range(config.batch_sz)]

        inputs = feature_extractor(images=images_list, return_tensors="pt")
        outputs = model(**inputs, output_attentions=False, output_hidden_states=False)
        last_hidden_states = outputs.last_hidden_state
        print(f'output shape - {last_hidden_states.shape}')
        break


if __name__ == '__main__':
    config = Config()
    text_transform = ToSequence(tokenizer=indic_tokenize.trivial_tokenize)
    image_transform = T.Compose([T.ToTensor(), T.Resize((224, 224))])
    train_dataset = HVGDataset(config.train_captions, config.word_to_index_path,
                               config.index_to_word_path, config.images_path, config.max_len,
                               text_transform=text_transform, image_transform=image_transform)
    train_dataloader = DataLoader(train_dataset, batch_size=config.batch_sz, shuffle=True)

    feature_extractor = ViTFeatureExtractor.from_pretrained(config.pretrained_vitfe_path)
    model = ViTModel.from_pretrained(config.pretrained_vit_path)

    train(feature_extractor=feature_extractor,
          model=model,
          decoder=decoder,
          dataloader=train_dataloader)
def default_feature_extractor(self):
    return ViTFeatureExtractor.from_pretrained("facebook/vit-mae-base") if is_vision_available() else None
def default_feature_extractor(self):
    return ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224") if is_vision_available() else None
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, CustomTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Initialize our dataset.
    ds = load_dataset(
        data_args.dataset_name,
        data_args.dataset_config_name,
        data_files=data_args.data_files,
        cache_dir=model_args.cache_dir,
    )

    # If we don't have a validation split, split off a percentage of train as validation.
    data_args.train_val_split = None if "validation" in ds.keys() else data_args.train_val_split
    if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0:
        split = ds["train"].train_test_split(data_args.train_val_split)
        ds["train"] = split["train"]
        ds["validation"] = split["test"]

    # Load pretrained model and feature extractor
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = ViTMAEConfig.from_pretrained(model_args.config_name, **config_kwargs)
    elif model_args.model_name_or_path:
        config = ViTMAEConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
    else:
        config = ViTMAEConfig()
        logger.warning("You are instantiating a new config instance from scratch.")
        if model_args.config_overrides is not None:
            logger.info(f"Overriding config: {model_args.config_overrides}")
            config.update_from_string(model_args.config_overrides)
            logger.info(f"New config: {config}")

    # adapt config
    config.update({
        "mask_ratio": model_args.mask_ratio,
        "norm_pix_loss": model_args.norm_pix_loss,
    })

    # create feature extractor
    if model_args.feature_extractor_name:
        feature_extractor = ViTFeatureExtractor.from_pretrained(model_args.feature_extractor_name, **config_kwargs)
    elif model_args.model_name_or_path:
        feature_extractor = ViTFeatureExtractor.from_pretrained(model_args.model_name_or_path, **config_kwargs)
    else:
        feature_extractor = ViTFeatureExtractor()

    # create model
    if model_args.model_name_or_path:
        model = ViTMAEForPreTraining.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        logger.info("Training new model from scratch")
        model = ViTMAEForPreTraining(config)

    if training_args.do_train:
        column_names = ds["train"].column_names
    else:
        column_names = ds["validation"].column_names

    if data_args.image_column_name is not None:
        image_column_name = data_args.image_column_name
    elif "image" in column_names:
        image_column_name = "image"
    elif "img" in column_names:
        image_column_name = "img"
    else:
        image_column_name = column_names[0]

    # transformations as done in original MAE paper
    # source: https://github.com/facebookresearch/mae/blob/main/main_pretrain.py
    transforms = Compose([
        Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
        RandomResizedCrop(feature_extractor.size, scale=(0.2, 1.0), interpolation=InterpolationMode.BICUBIC),
        RandomHorizontalFlip(),
        ToTensor(),
        Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
    ])

    def preprocess_images(examples):
        """Preprocess a batch of images by applying transforms."""
        examples["pixel_values"] = [transforms(image) for image in examples[image_column_name]]
        return examples

    if training_args.do_train:
        if "train" not in ds:
            raise ValueError("--do_train requires a train dataset")
        if data_args.max_train_samples is not None:
            ds["train"] = ds["train"].shuffle(seed=training_args.seed).select(range(data_args.max_train_samples))
        # Set the training transforms
        ds["train"].set_transform(preprocess_images)

    if training_args.do_eval:
        if "validation" not in ds:
            raise ValueError("--do_eval requires a validation dataset")
        if data_args.max_eval_samples is not None:
            ds["validation"] = (
                ds["validation"].shuffle(seed=training_args.seed).select(range(data_args.max_eval_samples))
            )
        # Set the validation transforms
        ds["validation"].set_transform(preprocess_images)

    # Compute absolute learning rate
    total_train_batch_size = (
        training_args.train_batch_size * training_args.gradient_accumulation_steps * training_args.world_size
    )
    if training_args.base_learning_rate is not None:
        training_args.learning_rate = training_args.base_learning_rate * total_train_batch_size / 256

    # Initialize our trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds["train"] if training_args.do_train else None,
        eval_dataset=ds["validation"] if training_args.do_eval else None,
        tokenizer=feature_extractor,
        data_collator=collate_fn,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()
        trainer.log_metrics("train", train_result.metrics)
        trainer.save_metrics("train", train_result.metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        metrics = trainer.evaluate()
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Write model card and (optionally) push to hub
    kwargs = {
        "tasks": "masked-auto-encoding",
        "dataset": data_args.dataset_name,
        "tags": ["masked-auto-encoding"],
    }
    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)
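# `collate_fn` is passed to the Trainer above but is not defined in this excerpt. A
# minimal sketch, assuming each example carries the "pixel_values" tensor produced by
# preprocess_images, simply stacks the tensors into a batch (as the reference ViTMAE
# pretraining example does).
import torch

def collate_fn(examples):
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    return {"pixel_values": pixel_values}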
# @author Loreto Parisi (loretoparisi at gmail dot com)
# Copyright (c) 2021 Loreto Parisi (loretoparisi at gmail dot com)

import os, sys
import torch
from transformers import ViTFeatureExtractor, ViTForImageClassification
from PIL import Image
import requests

BASE_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)))
sys.path.insert(0, os.path.join(BASE_PATH, '..'))
from lpdutils.lpimagedataset import LPImageDataSet

# to choose a different model by image size, patch size, and parameters number, see README
feature_extractor = ViTFeatureExtractor.from_pretrained(
    'google/vit-large-patch16-224',
    cache_dir=os.getenv("cache_dir", "../../models"))
model = ViTForImageClassification.from_pretrained(
    'google/vit-large-patch16-224',
    cache_dir=os.getenv("cache_dir", "../../models"))

# load local dataset
batch_size = 2
num_workers = 2
my_dataset = LPImageDataSet(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'imagenet'),
    transform=LPImageDataSet.transform)
imageloader = torch.utils.data.DataLoader(my_dataset,
                                          batch_size=batch_size,
                                          shuffle=True,
                                          num_workers=num_workers)
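# Hypothetical continuation (not part of the original snippet): a minimal classification
# loop over the local dataset, assuming the DataLoader yields (images, labels) batches
# where each image is in a format ViTFeatureExtractor accepts (PIL images or tensors).
for images, labels in imageloader:
    inputs = feature_extractor(images=list(images), return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    for idx in logits.argmax(-1).tolist():
        print(model.config.id2label[idx])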
def main():
    ds = load_dataset(
        data_args.dataset_name,
        data_args.dataset_config,
        data_files=data_args.data_files,
        cache_dir=model_args.cache_dir,
    )

    # If we don't have a validation split, split off a percentage of train as validation.
    data_args.train_val_split = None if "validation" in ds.keys() else data_args.train_val_split
    if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0:
        split = ds["train"].train_test_split(data_args.train_val_split)
        ds["train"] = split["train"]
        ds["validation"] = split["test"]

    # Load pretrained model and feature extractor
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kw = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_version,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = ViTMAEConfig.from_pretrained(model_args.config_name, **config_kw)
    elif model_args.model_name:
        config = ViTMAEConfig.from_pretrained(model_args.model_name, **config_kw)
    else:
        config = ViTMAEConfig()
        logger.warning("You are instantiating a new config instance from scratch.")
        if model_args.config_overrides is not None:
            logger.info(f"Overriding config: {model_args.config_overrides}")
            config.update_from_string(model_args.config_overrides)
            logger.info(f"New config: {config}")

    # adapt config
    config.update({
        "mask_ratio": model_args.mask_ratio,
        "norm_pix_loss": model_args.norm_pix_loss,
    })

    # create feature extractor
    if model_args.feature_extractor:
        feature_extractor = ViTFeatureExtractor.from_pretrained(model_args.feature_extractor, **config_kw)
    elif model_args.model_name:
        feature_extractor = ViTFeatureExtractor.from_pretrained(model_args.model_name, **config_kw)
    else:
        feature_extractor = ViTFeatureExtractor()

    # create model
    if model_args.model_name:
        model = ViTMAEForPreTraining.from_pretrained(
            model_args.model_name,
            from_tf=bool(".ckpt" in model_args.model_name),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_version,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        logger.info("Training new model")
        model = ViTMAEForPreTraining(config)

    if training_args.do_train:
        column_names = ds["train"].column_names
    else:
        column_names = ds["validation"].column_names

    if data_args.image_column_name is not None:
        image_column_name = data_args.image_column_name
    elif "image" in column_names:
        image_column_name = "image"
    elif "img" in column_names:
        image_column_name = "img"
    else:
        image_column_name = column_names[0]

    # transformations as done in original MAE paper
    # source: https://github.com/facebookresearch/mae/blob/main/main_pretrain.py
    transforms = Compose([
        Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
        RandomResizedCrop(feature_extractor.size, scale=(0.2, 1.0), interpolation=InterpolationMode.BICUBIC),
        RandomHorizontalFlip(),
        ToTensor(),
        Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
    ])

    def preprocess_images(examples):
        """Preprocess a batch of images by applying transforms."""
        examples["pixel_values"] = [transforms(image) for image in examples[image_column_name]]
        return examples

    if training_args.do_train:
        if "train" not in ds:
            raise ValueError("--do_train requires a train dataset")
        if data_args.max_train_samples is not None:
            ds["train"] = ds["train"].shuffle(seed=training_args.seed).select(range(data_args.max_train_samples))
        # Set the training transforms
        ds["train"].set_transform(preprocess_images)

    if training_args.do_eval:
        if "validation" not in ds:
            raise ValueError("--do_eval requires a validation dataset")
        if data_args.max_eval_samples is not None:
            ds["validation"] = (
                ds["validation"].shuffle(seed=training_args.seed).select(range(data_args.max_eval_samples))
            )
        # Set the validation transforms
        ds["validation"].set_transform(preprocess_images)

    # Compute absolute learning rate
    total_train_batch_size = (
        training_args.train_batch_size * training_args.grad_accumulation_steps * training_args.world_size
    )
    if training_args.base_lr is not None:
        training_args.lr = training_args.base_lr * total_train_batch_size / 256

    # Initialize our trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds["train"] if training_args.do_train else None,
        eval_dataset=ds["validation"] if training_args.do_eval else None,
        tokenizer=feature_extractor,
        data_collator=collate_fn,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()
        trainer.log_metrics("train", train_result.metrics)
        trainer.save_metrics("train", train_result.metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        metrics = trainer.evaluate()
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Write model card and (optionally) push to hub
    kw = {
        "tasks": "masked-auto-encoding",
        "dataset": data_args.dataset_name,
        "tags": ["masked-auto-encoding"],
    }
    if training_args.push_to_hub:
        trainer.push_to_hub(**kw)
    else:
        trainer.create_model_card(**kw)
def main(args):
    if use_ViT_Enc:
        print("It is using ViT encoder!!!!")
        transform = None
        feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
    else:
        feature_extractor = None
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Resize((args['image_size'], args['image_size'])),
            # The normalization parameters depend on the model we're going to use.
            # If we apply transfer learning from a model trained on ImageNet, we
            # should use the ImageNet statistics to normalize the dataset.
            # Otherwise we could just normalize the values between -1 and 1 using
            # the standard mean and standard deviation.
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    dataset = Flickr8kDataset(dataset_folder='data',
                              transform=transform,
                              reduce=True,
                              vocab_max_size=args['vocabulary_size'],
                              feature_extractor=feature_extractor)

    # Create the model
    if use_ViT_Enc:
        model = ViTImageCaptioningModel(
            embed_size=args['embedding_dimension'],
            vocab=dataset.vocab,
            caption_max_length=args['captions_max_length'],
        ).to(device)
    else:
        model = ImageCaptioningModel(
            image_features_dim=args['image_features_dimension'],
            embed_size=args['embedding_dimension'],
            vocab=dataset.vocab,
            caption_max_length=args['captions_max_length'],
        ).to(device)

    # Perform the split of the dataset
    train_split, test_split = split_subsets(dataset, all_captions=True)
    train_loader = DataLoader(train_split,
                              shuffle=True,
                              batch_size=args['batch_size'],
                              collate_fn=CapsCollate(pad_idx=dataset.vocab.word_to_index['<PAD>'], batch_first=True))
    test_loader = DataLoader(test_split,
                             shuffle=True,
                             batch_size=args['batch_size'],
                             collate_fn=CapsCollate(pad_idx=dataset.vocab.word_to_index['<PAD>'], batch_first=True))

    optimizer = optim.Adam(model.parameters(), lr=args['learning_rate'], betas=(0.9, 0.98), eps=1e-9)
    criterion = nn.CrossEntropyLoss(ignore_index=dataset.vocab.word_to_index['<PAD>'])

    train(num_epochs=args['epochs'],
          model=model,
          train_loader=train_loader,
          test_loader=test_loader,
          optimizer=optimizer,
          criterion=criterion,
          device=device,
          log_interval=args['log_interval'])
# ### Data loading
#
# First we specify the pre-trained ViT model we are going to use. The model
# ["google/vit-base-patch16-224"](https://huggingface.co/google/vit-base-patch16-224)
# is pre-trained on ImageNet-21k (14 million images, 21,843 classes) at resolution
# 224x224, and fine-tuned on ImageNet 2012 (1 million images, 1,000 classes) at
# resolution 224x224.
#
# We'll use a pre-trained ViT feature extractor that matches the ViT model to
# preprocess the input images.

VITMODEL = 'google/vit-base-patch16-224'

feature_extractor = ViTFeatureExtractor.from_pretrained(VITMODEL)

# Next we define functions to load and preprocess the images:

def _load_and_process_image(path, label):
    img = Image.open(path.numpy()).convert("RGB")
    proc_img = feature_extractor(images=img, return_tensors="np")['pixel_values']
    return np.squeeze(proc_img), label

def load_and_process_image(path, label):
    image, label = tf.py_function(_load_and_process_image, (path, label), (tf.float32, tf.int32))
    image.set_shape([None, None, None])
    return image, label
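# A hedged usage sketch (not part of the original excerpt): wiring the mapping function
# into a tf.data pipeline. `image_paths` and `labels` are hypothetical Python lists of
# file paths and integer class labels.
dataset = (tf.data.Dataset.from_tensor_slices((image_paths, labels))
           .map(load_and_process_image, num_parallel_calls=tf.data.AUTOTUNE)
           .batch(32)
           .prefetch(tf.data.AUTOTUNE))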
def get_feature_extractor(self, **kwargs):
    return ViTFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)