def test_inference_interpolate_pos_encoding(self): # ViT models have an `interpolate_pos_encoding` argument in their forward method, # allowing to interpolate the pre-trained position embeddings in order to use # the model on higher resolutions. The DINO model by Facebook AI leverages this # to visualize self-attention on higher resolution images. model = ViTModel.from_pretrained("facebook/dino-vits8").to( torch_device) feature_extractor = ViTFeatureExtractor.from_pretrained( "facebook/dino-vits8", size=480) image = prepare_img() inputs = feature_extractor(images=image, return_tensors="pt") pixel_values = inputs.pixel_values.to(torch_device) # forward pass with torch.no_grad(): outputs = model(pixel_values, interpolate_pos_encoding=True) # verify the logits expected_shape = torch.Size((1, 3601, 384)) self.assertEqual(outputs.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor([[4.2340, 4.3906, -6.6692], [4.5463, 1.8928, -6.7257], [4.4429, 0.8496, -5.8585]]).to(torch_device) self.assertTrue( torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4))
def __init__(self, model_name="ydshieh/vit-gpt2-coco-en", device=None): """ ``` ImageCaptioner constructor Args: model_name(str): name of image captioning model device(str): device to use (e.g., 'cuda', 'cpu') ``` """ if not I.PIL_INSTALLED: raise Exception( "PIL is not installed. Please install with: pip install pillow>=9.0.1" ) super().__init__(device=device, quantize=False, min_transformers_version="4.12.3") self.model_name = model_name from transformers import ( AutoTokenizer, VisionEncoderDecoderModel, ViTFeatureExtractor, ) self.model = VisionEncoderDecoderModel.from_pretrained( self.model_name).to(self.torch_device) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) self.extractor = ViTFeatureExtractor.from_pretrained(self.model_name)
def test_inference_coco_en(self): loc = "ydshieh/vit-gpt2-coco-en" feature_extractor = ViTFeatureExtractor.from_pretrained(loc) tokenizer = AutoTokenizer.from_pretrained(loc) model = VisionEncoderDecoderModel.from_pretrained(loc) model.to(torch_device) model.eval() # We will verify our results on an image of cute cats img = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") pixel_values = feature_extractor(images=img, return_tensors="pt").pixel_values.to(torch_device) decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]]).to(torch_device) with torch.no_grad(): logits = model(pixel_values, decoder_input_ids)[0].detach().cpu().numpy() # verify the logits expected_shape = (1, 1, model.config.decoder.vocab_size) self.assertEqual(logits.shape, expected_shape) EXPECTED_LOGIT_SLICE = np.array( [ -38.705807, -30.639929, -31.41903, -39.012012, -38.38696, -34.887207, -33.290855, -35.68447, -38.508484, -36.124645, ] ) max_diff = np.amax(np.abs(logits[0, 0, :10] - EXPECTED_LOGIT_SLICE)) self.assertLessEqual(max_diff, 1e-4) def generate_step(pixel_values): outputs = model.generate( pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True, output_scores=True ) output_ids = outputs.sequences preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True) preds = [pred.strip() for pred in preds] return preds, outputs.sequences_scores.detach().cpu().numpy() preds, scores = generate_step(pixel_values) EXPECTED_SCORES = np.array([-0.59562886]) max_diff = np.amax(np.abs(scores - EXPECTED_SCORES)) self.assertLessEqual(max_diff, 1e-4) # should produce # ["a cat laying on top of a couch next to another cat"] self.assertEqual(preds, ["a cat laying on top of a couch next to another cat"])
def test_inference_coco_en(self): loc = "ydshieh/vit-gpt2-coco-en" feature_extractor = ViTFeatureExtractor.from_pretrained(loc) tokenizer = AutoTokenizer.from_pretrained(loc) model = FlaxVisionEncoderDecoderModel.from_pretrained(loc) img = prepare_img() pixel_values = feature_extractor(images=img, return_tensors="np").pixel_values decoder_input_ids = np.array([[model.config.decoder_start_token_id]]) logits = model(pixel_values, decoder_input_ids)[0] logits = np.array(logits) # verify the logits expected_shape = (1, 1, model.config.decoder.vocab_size) self.assertEqual(logits.shape, expected_shape) EXPECTED_LOGIT_SLICE = np.array([ -38.705837, -30.639936, -31.41905, -39.01204, -38.38698, -34.887215, -33.29087, -35.684475, -38.50852, -36.124676, ]) max_diff = np.amax(np.abs(logits[0, 0, :10] - EXPECTED_LOGIT_SLICE)) self.assertLessEqual(max_diff, 1e-4) def generate_step(pixel_values): outputs = model.generate(pixel_values, max_length=16, num_beams=4) output_ids = outputs.sequences preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True) preds = [pred.strip() for pred in preds] return preds, outputs.scores preds, scores = generate_step(pixel_values) EXPECTED_SCORES = np.array([-0.59563464]) scores = np.array(scores) max_diff = np.amax(np.abs(scores - EXPECTED_SCORES)) self.assertLessEqual(max_diff, 1e-4) # should produce # ["a cat laying on top of a couch next to another cat"] self.assertEqual( preds, ["a cat laying on top of a couch next to another cat"])
def save_pretrained_model(config): try: feature_extractor = ViTFeatureExtractor.from_pretrained( 'google/vit-base-patch16-224-in21k') model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k') feature_extractor.save_pretrained(config.pretrained_vitfe_path) model.save_pretrained(config.pretrained_vit_path) except Exception as e: print(f'Error - {str (e)}') return 1 return 0
def visualize(image): image = image.convert("RGB") feature_extractor = ViTFeatureExtractor(do_resize=True, size=224, image_mean=[0.485, 0.456, 0.406], image_std=[0.485, 0.456, 0.406]) img = feature_extractor( images=image, return_tensors="pt" ).pixel_values # we remove the batch dimension, is added later on outputs = model(pixel_values=img, output_attentions=True) attentions = outputs['attentions'][ -1] # we are only interested in the attention maps of the last layer nh = attentions.shape[1] # number of head # we keep only the output patch attention attentions = attentions[0, :, 0, 1:].reshape(nh, -1) threshold = 0.6 w_featmap = img.shape[-2] // model.config.patch_size h_featmap = img.shape[-1] // model.config.patch_size # we keep only a certain percentage of the mass val, idx = torch.sort(attentions) val /= torch.sum(val, dim=1, keepdim=True) cumval = torch.cumsum(val, dim=1) th_attn = cumval > (1 - threshold) idx2 = torch.argsort(idx) for head in range(nh): th_attn[head] = th_attn[head][idx2[head]] th_attn = th_attn.reshape(nh, w_featmap, h_featmap).float() # interpolate th_attn = nn.functional.interpolate(th_attn.unsqueeze(0), scale_factor=model.config.patch_size, mode="nearest")[0].cpu().numpy() attentions = attentions.reshape(nh, w_featmap, h_featmap) attentions = nn.functional.interpolate( attentions.unsqueeze(0), scale_factor=model.config.patch_size, mode="nearest")[0].cpu() attentions = attentions.detach().numpy() # show and save attentions heatmaps plt.axis("off") plt.imshow(attentions[2]) return plt
def train (feature_extractor, model, decoder, dataloader): for image, caption, target, target_seq_len in train_dataloader: # print (f'image shape - {image.shape}') # print (f'caption - {caption.shape}') # print (f'target - {target.shape}') # print (f'target_seq_len shape- {target_seq_len.shape}') # print (f'target_seq_len - {target_seq_len}') # print (f'image[0].shape {image [0].shape}') # print (f'max - {image.max ()}') # print (f'min - {image.min ()}') images_list = [image [i] for i in range (config.batch_sz)] # print (type (images_list)) # print (type (images_list [0])) # print (images_list [0].shape) inputs = feature_extractor(images=images_list, return_tensors="pt") outputs = model(**inputs, output_attentions=False, output_hidden_states=False) last_hidden_states = outputs.last_hidden_state print (f'output shape - {last_hidden_states.shape}') break if __name__ == '__main__': config = Config () text_transform = ToSequence (tokenizer=indic_tokenize.trivial_tokenize) image_transform = T.Compose ([T.ToTensor(), T.Resize ((224, 224))]) train_dataset = HVGDataset (config.train_captions, config.word_to_index_path, config.index_to_word_path, config.images_path, config.max_len, text_transform=text_transform, image_transform=image_transform) train_dataloader = DataLoader (train_dataset, batch_size=config.batch_sz, shuffle=True) feature_extractor = ViTFeatureExtractor.from_pretrained(config.pretrained_vitfe_path) model = ViTModel.from_pretrained(config.pretrained_vit_path) train (feature_extractor=feature_extractor, \ model=model, \ decoder=decoder, \ dataloader=train_dataloader)
def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path): """ Copy/paste/tweak model's weights to our ViT structure. """ # define default ViT configuration config = ViTConfig() base_model = False # dataset (ImageNet-21k only or also fine-tuned on ImageNet 2012), patch_size and image_size if vit_name[-5:] == "in21k": base_model = True config.patch_size = int(vit_name[-12:-10]) config.image_size = int(vit_name[-9:-6]) else: config.num_labels = 1000 repo_id = "datasets/huggingface/label-files" filename = "imagenet-1k-id2label.json" id2label = json.load( open(cached_download(hf_hub_url(repo_id, filename)), "r")) id2label = {int(k): v for k, v in id2label.items()} config.id2label = id2label config.label2id = {v: k for k, v in id2label.items()} config.patch_size = int(vit_name[-6:-4]) config.image_size = int(vit_name[-3:]) # size of the architecture if "deit" in vit_name: if vit_name[9:].startswith("tiny"): config.hidden_size = 192 config.intermediate_size = 768 config.num_hidden_layers = 12 config.num_attention_heads = 3 elif vit_name[9:].startswith("small"): config.hidden_size = 384 config.intermediate_size = 1536 config.num_hidden_layers = 12 config.num_attention_heads = 6 else: pass else: if vit_name[4:].startswith("small"): config.hidden_size = 768 config.intermediate_size = 2304 config.num_hidden_layers = 8 config.num_attention_heads = 8 elif vit_name[4:].startswith("base"): pass elif vit_name[4:].startswith("large"): config.hidden_size = 1024 config.intermediate_size = 4096 config.num_hidden_layers = 24 config.num_attention_heads = 16 elif vit_name[4:].startswith("huge"): config.hidden_size = 1280 config.intermediate_size = 5120 config.num_hidden_layers = 32 config.num_attention_heads = 16 # load original model from timm timm_model = timm.create_model(vit_name, pretrained=True) timm_model.eval() # load state_dict of original model, remove and rename some keys state_dict = timm_model.state_dict() if base_model: remove_classification_head_(state_dict) rename_keys = create_rename_keys(config, base_model) for src, dest in rename_keys: rename_key(state_dict, src, dest) read_in_q_k_v(state_dict, config, base_model) # load HuggingFace model if vit_name[-5:] == "in21k": model = ViTModel(config).eval() else: model = ViTForImageClassification(config).eval() model.load_state_dict(state_dict) # Check outputs on an image, prepared by ViTFeatureExtractor/DeiTFeatureExtractor if "deit" in vit_name: feature_extractor = DeiTFeatureExtractor(size=config.image_size) else: feature_extractor = ViTFeatureExtractor(size=config.image_size) encoding = feature_extractor(images=prepare_img(), return_tensors="pt") pixel_values = encoding["pixel_values"] outputs = model(pixel_values) if base_model: timm_pooled_output = timm_model.forward_features(pixel_values) assert timm_pooled_output.shape == outputs.pooler_output.shape assert torch.allclose(timm_pooled_output, outputs.pooler_output, atol=1e-3) else: timm_logits = timm_model(pixel_values) assert timm_logits.shape == outputs.logits.shape assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) Path(pytorch_dump_folder_path).mkdir(exist_ok=True) print(f"Saving model {vit_name} to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) print(f"Saving feature extractor to {pytorch_dump_folder_path}") feature_extractor.save_pretrained(pytorch_dump_folder_path)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, CustomTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Initialize our dataset. ds = load_dataset( data_args.dataset_name, data_args.dataset_config_name, data_files=data_args.data_files, cache_dir=model_args.cache_dir, ) # If we don't have a validation split, split off a percentage of train as validation. data_args.train_val_split = None if "validation" in ds.keys( ) else data_args.train_val_split if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0: split = ds["train"].train_test_split(data_args.train_val_split) ds["train"] = split["train"] ds["validation"] = split["test"] # Load pretrained model and feature extractor # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config_kwargs = { "cache_dir": model_args.cache_dir, "revision": model_args.model_revision, "use_auth_token": True if model_args.use_auth_token else None, } if model_args.config_name: config = ViTMAEConfig.from_pretrained(model_args.config_name, **config_kwargs) elif model_args.model_name_or_path: config = ViTMAEConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) else: config = ViTMAEConfig() logger.warning( "You are instantiating a new config instance from scratch.") if model_args.config_overrides is not None: logger.info(f"Overriding config: {model_args.config_overrides}") config.update_from_string(model_args.config_overrides) logger.info(f"New config: {config}") # adapt config config.update({ "mask_ratio": model_args.mask_ratio, "norm_pix_loss": model_args.norm_pix_loss, }) # create feature extractor if model_args.feature_extractor_name: feature_extractor = ViTFeatureExtractor.from_pretrained( model_args.feature_extractor_name, **config_kwargs) elif model_args.model_name_or_path: feature_extractor = ViTFeatureExtractor.from_pretrained( model_args.model_name_or_path, **config_kwargs) else: feature_extractor = ViTFeatureExtractor() # create model if model_args.model_name_or_path: model = ViTMAEForPreTraining.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) else: logger.info("Training new model from scratch") model = ViTMAEForPreTraining(config) if training_args.do_train: column_names = ds["train"].column_names else: column_names = ds["validation"].column_names if data_args.image_column_name is not None: image_column_name = data_args.image_column_name elif "image" in column_names: image_column_name = "image" elif "img" in column_names: image_column_name = "img" else: image_column_name = column_names[0] # transformations as done in original MAE paper # source: https://github.com/facebookresearch/mae/blob/main/main_pretrain.py transforms = Compose([ Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img), RandomResizedCrop(feature_extractor.size, scale=(0.2, 1.0), interpolation=InterpolationMode.BICUBIC), RandomHorizontalFlip(), ToTensor(), Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std), ]) def preprocess_images(examples): """Preprocess a batch of images by applying transforms.""" examples["pixel_values"] = [ transforms(image) for image in examples[image_column_name] ] return examples if training_args.do_train: if "train" not in ds: raise ValueError("--do_train requires a train dataset") if data_args.max_train_samples is not None: ds["train"] = ds["train"].shuffle(seed=training_args.seed).select( range(data_args.max_train_samples)) # Set the training transforms ds["train"].set_transform(preprocess_images) if training_args.do_eval: if "validation" not in ds: raise ValueError("--do_eval requires a validation dataset") if data_args.max_eval_samples is not None: ds["validation"] = (ds["validation"].shuffle( seed=training_args.seed).select( range(data_args.max_eval_samples))) # Set the validation transforms ds["validation"].set_transform(preprocess_images) # Compute absolute learning rate total_train_batch_size = (training_args.train_batch_size * training_args.gradient_accumulation_steps * training_args.world_size) if training_args.base_learning_rate is not None: training_args.learning_rate = training_args.base_learning_rate * total_train_batch_size / 256 # Initialize our trainer trainer = Trainer( model=model, args=training_args, train_dataset=ds["train"] if training_args.do_train else None, eval_dataset=ds["validation"] if training_args.do_eval else None, tokenizer=feature_extractor, data_collator=collate_fn, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() trainer.log_metrics("train", train_result.metrics) trainer.save_metrics("train", train_result.metrics) trainer.save_state() # Evaluation if training_args.do_eval: metrics = trainer.evaluate() trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Write model card and (optionally) push to hub kwargs = { "tasks": "masked-auto-encoding", "dataset": data_args.dataset_name, "tags": ["masked-auto-encoding"], } if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs)
# @author Loreto Parisi (loretoparisi at gmail dot com) # Copyright (c) 2021 Loreto Parisi (loretoparisi at gmail dot com) import os, sys import torch from transformers import ViTFeatureExtractor, ViTForImageClassification from PIL import Image import requests BASE_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__))) sys.path.insert(0, os.path.join(BASE_PATH, '..')) from lpdutils.lpimagedataset import LPImageDataSet # to choose a different model by image size, patch size, and parameters number, see README feature_extractor = ViTFeatureExtractor.from_pretrained( 'google/vit-large-patch16-224', cache_dir=os.getenv("cache_dir", "../../models")) model = ViTForImageClassification.from_pretrained( 'google/vit-large-patch16-224', cache_dir=os.getenv("cache_dir", "../../models")) # load local dataset batch_size = 2 num_workers = 2 my_dataset = LPImageDataSet(os.path.join( os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'imagenet'), transform=LPImageDataSet.transform) imageloader = torch.utils.data.DataLoader(my_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers,
def main(): ds = load_dataset( data_args.dataset_name, data_args.dataset_config, data_files=data_args.data_files, cache_dir=model_args.cache_dir, ) # If we don't have a validation split, split off a percentage of train as validation. data_args.train_val_split = None if "validation" in ds.keys( ) else data_args.train_val_split if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0: split = ds["train"].train_test_split(data_args.train_val_split) ds["train"] = split["train"] ds["validation"] = split["test"] # Load pretrained model and feature extractor # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config_kw = { "cache_dir": model_args.cache_dir, "revision": model_args.model_version, "use_auth_token": True if model_args.use_auth_token else None, } if model_args.config_name: config = ViTMAEConfig.from_pretrained(model_args.config_name, **config_kw) elif model_args.model_name: config = ViTMAEConfig.from_pretrained(model_args.model_name, **config_kw) else: config = ViTMAEConfig() logger.warning( "You are instantiating a new config instance from scratch.") if model_args.config_overrides is not None: logger.info(f"Overriding config: {model_args.config_overrides}") config.update_from_string(model_args.config_overrides) logger.info(f"New config: {config}") # adapt config config.update({ "mask_ratio": model_args.mask_ratio, "norm_pix_loss": model_args.norm_pix_loss, }) # create feature extractor if model_args.feature_extractor: feature_extractor = ViTFeatureExtractor.from_pretrained( model_args.feature_extractor, **config_kw) elif model_args.model_name: feature_extractor = ViTFeatureExtractor.from_pretrained( model_args.model_name, **config_kw) else: feature_extractor = ViTFeatureExtractor() # create model if model_args.model_name: model = ViTMAEForPreTraining.from_pretrained( model_args.model_name, from_tf=bool(".ckpt" in model_args.model_name), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_version, use_auth_token=True if model_args.use_auth_token else None, ) else: logger.info("Training new model") model = ViTMAEForPreTraining(config) if training_args.do_train: column_names = ds["train"].column_names else: column_names = ds["validation"].column_names if data_args.image_column_name is not None: image_column_name = data_args.image_column_name elif "image" in column_names: image_column_name = "image" elif "img" in column_names: image_column_name = "img" else: image_column_name = column_names[0] # transformations as done in original MAE paper # source: https://github.com/facebookresearch/mae/blob/main/main_pretrain.py transforms = Compose([ Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img), RandomResizedCrop(feature_extractor.size, scale=(0.2, 1.0), interpolation=InterpolationMode.BICUBIC), RandomHorizontalFlip(), ToTensor(), Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std), ]) def preprocess_images(examples): """Preprocess a batch of images by applying transforms.""" examples["pixel_values"] = [ transforms(image) for image in examples[image_column_name] ] return examples if training_args.do_train: if "train" not in ds: raise ValueError("--do_train requires a train dataset") if data_args.max_train_samples is not None: ds["train"] = (ds["train"].shuffle(seed=training_args.seed).select( range(data_args.max_train_samples))) # Set the training transforms ds["train"].set_transform(preprocess_images) if training_args.do_eval: if "validation" not in ds: raise ValueError("--do_eval requires a validation dataset") if data_args.max_eval_samples is not None: ds["validation"] = (ds["validation"].shuffle( seed=training_args.seed).select( range(data_args.max_eval_samples))) # Set the validation transforms ds["validation"].set_transform(preprocess_images) # Compute absolute learning rate total_train_batch_size = (training_args.train_batch_size * training_args.grad_accumulation_steps * training_args.world_size) if training_args.base_lr is not None: training_args.lr = training_args.base_lr * total_train_batch_size / 256 # Initialize our trainer trainer = Trainer( model=model, args=training_args, train_dataset=ds["train"] if training_args.do_train else None, eval_dataset=ds["validation"] if training_args.do_eval else None, tokenizer=feature_extractor, data_collator=collate_fn, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() trainer.log_metrics("train", train_result.metrics) trainer.save_metrics("train", train_result.metrics) trainer.save_state() # Evaluation if training_args.do_eval: metrics = trainer.evaluate() trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Write model card and (optionally) push to hub kw = { "tasks": "masked-auto-encoding", "dataset": data_args.dataset_name, "tags": ["masked-auto-encoding"], } if training_args.push_to_hub: trainer.push_to_hub(**kw) else: trainer.create_model_card(**kw)
def main(args): if use_ViT_Enc: print("It is using ViT encoder!!!!") transform = None feature_extractor = ViTFeatureExtractor.from_pretrained( 'google/vit-base-patch16-224-in21k') else: feature_extractor = None transform = transforms.Compose([ transforms.ToTensor(), transforms.Resize((args['image_size'], args['image_size'])), # The normalize parameters depends on the model we're gonna use # If we apply transfer learning from a model that used ImageNet, then # we should use the ImageNet values to normalize the dataset. # Otherwise we could just normalize the values between -1 and 1 using the # standard mean and standard deviation transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), ]) dataset = Flickr8kDataset(dataset_folder='data', transform=transform, reduce=True, vocab_max_size=args['vocabulary_size'], feature_extractor=feature_extractor) # Create the model if use_ViT_Enc: model = ViTImageCaptioningModel( embed_size=args['embedding_dimension'], vocab=dataset.vocab, caption_max_length=args['captions_max_length'], ).to(device) else: model = ImageCaptioningModel( image_features_dim=args['image_features_dimension'], embed_size=args['embedding_dimension'], vocab=dataset.vocab, caption_max_length=args['captions_max_length'], ).to(device) # Perform the split of the dataset train_split, test_split = split_subsets(dataset, all_captions=True) train_loader = DataLoader(train_split, shuffle=True, batch_size=args['batch_size'], collate_fn=CapsCollate( pad_idx=dataset.vocab.word_to_index['<PAD>'], batch_first=True)) test_loader = DataLoader(test_split, shuffle=True, batch_size=args['batch_size'], collate_fn=CapsCollate( pad_idx=dataset.vocab.word_to_index['<PAD>'], batch_first=True)) optimizer = optim.Adam(model.parameters(), lr=args['learning_rate'], betas=(0.9, 0.98), eps=1e-9) criterion = nn.CrossEntropyLoss( ignore_index=dataset.vocab.word_to_index['<PAD>']) train(num_epochs=args['epochs'], model=model, train_loader=train_loader, test_loader=test_loader, optimizer=optimizer, criterion=criterion, device=device, log_interval=args['log_interval'])
def convert_tr_ocr_checkpoint(checkpoint_url, pytorch_dump_folder_path): """ Copy/paste/tweak model's weights to our VisionEncoderDecoderModel structure. """ # define encoder and decoder configs based on checkpoint_url encoder_config = ViTConfig(image_size=384, qkv_bias=False) decoder_config = TrOCRConfig() # size of the architecture if "base" in checkpoint_url: decoder_config.encoder_hidden_size = 768 elif "large" in checkpoint_url: # use ViT-large encoder encoder_config.hidden_size = 1024 encoder_config.intermediate_size = 4096 encoder_config.num_hidden_layers = 24 encoder_config.num_attention_heads = 16 decoder_config.encoder_hidden_size = 1024 else: raise ValueError( "Should either find 'base' or 'large' in checkpoint URL") # the large-printed + stage1 checkpoints uses sinusoidal position embeddings, no layernorm afterwards if "large-printed" in checkpoint_url or "stage1" in checkpoint_url: decoder_config.tie_word_embeddings = False decoder_config.activation_function = "relu" decoder_config.max_position_embeddings = 1024 decoder_config.scale_embedding = True decoder_config.use_learned_position_embeddings = False decoder_config.layernorm_embedding = False # load HuggingFace model encoder = ViTModel(encoder_config, add_pooling_layer=False) decoder = TrOCRForCausalLM(decoder_config) model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder) model.eval() # load state_dict of original model, rename some keys state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", check_hash=True)["model"] rename_keys = create_rename_keys(encoder_config, decoder_config) for src, dest in rename_keys: rename_key(state_dict, src, dest) read_in_q_k_v(state_dict, encoder_config) # remove parameters we don't need del state_dict["encoder.deit.head.weight"] del state_dict["encoder.deit.head.bias"] del state_dict["decoder.version"] # add prefix to decoder keys for key, val in state_dict.copy().items(): val = state_dict.pop(key) if key.startswith("decoder") and "output_projection" not in key: state_dict["decoder.model." + key] = val else: state_dict[key] = val # load state dict model.load_state_dict(state_dict) # Check outputs on an image feature_extractor = ViTFeatureExtractor(size=encoder_config.image_size) tokenizer = RobertaTokenizer.from_pretrained("roberta-large") processor = TrOCRProcessor(feature_extractor, tokenizer) pixel_values = processor(images=prepare_img(checkpoint_url), return_tensors="pt").pixel_values # verify logits decoder_input_ids = torch.tensor( [[model.config.decoder.decoder_start_token_id]]) outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids) logits = outputs.logits expected_shape = torch.Size([1, 1, 50265]) if "trocr-base-handwritten" in checkpoint_url: expected_slice = torch.tensor([ -1.4502, -4.6683, -0.5347, -2.9291, 9.1435, -3.0571, 8.9764, 1.7560, 8.7358, -1.5311 ]) elif "trocr-large-handwritten" in checkpoint_url: expected_slice = torch.tensor([ -2.6437, -1.3129, -2.2596, -5.3455, 6.3539, 1.7604, 5.4991, 1.4702, 5.6113, 2.0170 ]) elif "trocr-base-printed" in checkpoint_url: expected_slice = torch.tensor([ -5.6816, -5.8388, 1.1398, -6.9034, 6.8505, -2.4393, 1.2284, -1.0232, -1.9661, -3.9210 ]) elif "trocr-large-printed" in checkpoint_url: expected_slice = torch.tensor([ -6.0162, -7.0959, 4.4155, -5.1063, 7.0468, -3.1631, 2.6466, -0.3081, -0.8106, -1.7535 ]) if "stage1" not in checkpoint_url: assert logits.shape == expected_shape, "Shape of logits not as expected" assert torch.allclose( logits[0, 0, :10], expected_slice, atol=1e-3), "First elements of logits not as expected" Path(pytorch_dump_folder_path).mkdir(exist_ok=True) print(f"Saving model to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) print(f"Saving processor to {pytorch_dump_folder_path}") processor.save_pretrained(pytorch_dump_folder_path)
def get_feature_extractor(self, **kwargs): return ViTFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
def convert_vit_checkpoint(model_name, pytorch_dump_folder_path, base_model=True): """ Copy/paste/tweak model's weights to our ViT structure. """ # define default ViT configuration config = ViTConfig() # patch_size if model_name[-1] == "8": config.patch_size = 8 # set labels if required if not base_model: config.num_labels = 1000 repo_id = "datasets/huggingface/label-files" filename = "imagenet-1k-id2label.json" id2label = json.load(open(hf_hub_download(repo_id, filename), "r")) id2label = {int(k): v for k, v in id2label.items()} config.id2label = id2label config.label2id = {v: k for k, v in id2label.items()} # size of the architecture if model_name in ["dino_vits8", "dino_vits16"]: config.hidden_size = 384 config.intermediate_size = 1536 config.num_hidden_layers = 12 config.num_attention_heads = 6 # load original model from torch hub original_model = torch.hub.load("facebookresearch/dino:main", model_name) original_model.eval() # load state_dict of original model, remove and rename some keys state_dict = original_model.state_dict() if base_model: remove_classification_head_(state_dict) rename_keys = create_rename_keys(config, base_model=base_model) for src, dest in rename_keys: rename_key(state_dict, src, dest) read_in_q_k_v(state_dict, config, base_model) # load HuggingFace model if base_model: model = ViTModel(config, add_pooling_layer=False).eval() else: model = ViTForImageClassification(config).eval() model.load_state_dict(state_dict) # Check outputs on an image, prepared by ViTFeatureExtractor feature_extractor = ViTFeatureExtractor() encoding = feature_extractor(images=prepare_img(), return_tensors="pt") pixel_values = encoding["pixel_values"] outputs = model(pixel_values) if base_model: final_hidden_state_cls_token = original_model(pixel_values) assert torch.allclose(final_hidden_state_cls_token, outputs.last_hidden_state[:, 0, :], atol=1e-1) else: logits = original_model(pixel_values) assert logits.shape == outputs.logits.shape assert torch.allclose(logits, outputs.logits, atol=1e-3) Path(pytorch_dump_folder_path).mkdir(exist_ok=True) print(f"Saving model {model_name} to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) print(f"Saving feature extractor to {pytorch_dump_folder_path}") feature_extractor.save_pretrained(pytorch_dump_folder_path)
def default_feature_extractor(self): return ViTFeatureExtractor.from_pretrained( "google/vit-base-patch16-224") if is_vision_available() else None
def default_feature_extractor(self): return ViTFeatureExtractor.from_pretrained( "facebook/vit-mae-base") if is_vision_available() else None
# ### Data loading # # First we specify the pre-trained ViT model we are going to use. The # model ["google/vit-base-patch16-224"] # (https://huggingface.co/google/vit-base-patch16-224) is pre-trained # on ImageNet-21k (14 million images, 21,843 classes) at resolution # 224x224, and fine-tuned on ImageNet 2012 (1 million images, 1,000 # classes) at resolution 224x224. # # We'll use a pre-trained ViT feature extractor that matches the ViT # model to preprocess the input images. VITMODEL = 'google/vit-base-patch16-224' feature_extractor = ViTFeatureExtractor.from_pretrained(VITMODEL) # Next we define functions to load and preprocess the images: def _load_and_process_image(path, label): img = Image.open(path.numpy()).convert("RGB") proc_img = feature_extractor(images=img, return_tensors="np")['pixel_values'] return np.squeeze(proc_img), label def load_and_process_image(path, label): image, label = tf.py_function(_load_and_process_image, (path, label), (tf.float32, tf.int32)) image.set_shape([None, None, None])