def test_run_vit_mae_pretraining(self):
    stream_handler = logging.StreamHandler(sys.stdout)
    logger.addHandler(stream_handler)

    tmp_dir = self.get_auto_remove_tmp_dir()
    testargs = f"""
        run_mae.py
        --output_dir {tmp_dir}
        --dataset_name hf-internal-testing/cats_vs_dogs_sample
        --do_train
        --do_eval
        --learning_rate 1e-4
        --per_device_train_batch_size 2
        --per_device_eval_batch_size 1
        --remove_unused_columns False
        --overwrite_output_dir True
        --dataloader_num_workers 16
        --metric_for_best_model accuracy
        --max_steps 10
        --train_val_split 0.1
        --seed 42
        """.split()

    if is_cuda_and_apex_available():
        testargs.append("--fp16")

    with patch.object(sys, "argv", testargs):
        run_mae.main()
        model = ViTMAEForPreTraining.from_pretrained(tmp_dir)
        self.assertIsNotNone(model)
def test_inference_for_pretraining(self):
    # make random mask reproducible across the PT and TF model
    np.random.seed(2)

    model = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base").to(torch_device)

    feature_extractor = self.default_feature_extractor
    image = prepare_img()
    inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)

    # prepare a noise vector that will also be used for testing the TF model
    # (this way we can ensure that the PT and TF models operate on the same inputs)
    vit_mae_config = ViTMAEConfig()
    num_patches = int((vit_mae_config.image_size // vit_mae_config.patch_size) ** 2)
    noise = np.random.uniform(size=(1, num_patches))

    # forward pass (move the noise to the same device as the model)
    with torch.no_grad():
        outputs = model(**inputs, noise=torch.from_numpy(noise).to(device=torch_device))

    # verify the logits
    expected_shape = torch.Size((1, 196, 768))
    self.assertEqual(outputs.logits.shape, expected_shape)

    expected_slice = torch.tensor(
        [[-0.0548, -1.7023, -0.9325], [0.3721, -0.5670, -0.2233], [0.8235, -1.3878, -0.3524]]
    )

    self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice.to(torch_device), atol=1e-4))
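# A minimal sketch (not part of the tests above) of how the same noise array could be reused on the
# TF side to check PT/TF parity, which is what the comment in the test above alludes to. It assumes
# TFViTMAEForPreTraining accepts a `noise` argument in its call; the image path below is a
# hypothetical stand-in for whatever `prepare_img()` loads.
import numpy as np
import tensorflow as tf
from PIL import Image
from transformers import TFViTMAEForPreTraining, ViTFeatureExtractor, ViTMAEConfig

np.random.seed(2)  # same seed as the PyTorch test, so the same noise vector is drawn
config = ViTMAEConfig()
num_patches = int((config.image_size // config.patch_size) ** 2)
noise = np.random.uniform(size=(1, num_patches))

feature_extractor = ViTFeatureExtractor.from_pretrained("facebook/vit-mae-base")
image = Image.open("path/to/test_image.jpg")  # hypothetical path
inputs = feature_extractor(images=image, return_tensors="tf")

tf_model = TFViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base")
tf_outputs = tf_model(**inputs, noise=tf.constant(noise))
# With the same noise, tf_outputs.logits[0, :3, :3] should agree with the PyTorch
# expected_slice above up to small numerical differences.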
def test_inference_for_pretraining(self):
    # make random mask reproducible
    torch.manual_seed(2)

    model = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base").to(torch_device)

    feature_extractor = self.default_feature_extractor
    image = prepare_img()
    inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)

    # forward pass
    with torch.no_grad():
        outputs = model(**inputs)

    # verify the logits
    expected_shape = torch.Size((1, 196, 768))
    self.assertEqual(outputs.logits.shape, expected_shape)

    expected_slice = torch.tensor(
        [[0.7366, -1.3663, -0.2844], [0.7919, -1.3839, -0.3241], [0.4313, -0.7168, -0.2878]]
    ).to(torch_device)

    self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4))
def convert_vit_mae_checkpoint(checkpoint_url, pytorch_dump_folder_path):
    config = ViTMAEConfig()
    if "large" in checkpoint_url:
        config.hidden_size = 1024
        config.intermediate_size = 4096
        config.num_hidden_layers = 24
        config.num_attention_heads = 16
    elif "huge" in checkpoint_url:
        config.patch_size = 14
        config.hidden_size = 1280
        config.intermediate_size = 5120
        config.num_hidden_layers = 32
        config.num_attention_heads = 16

    model = ViTMAEForPreTraining(config)

    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"]

    new_state_dict = convert_state_dict(state_dict, config)

    model.load_state_dict(new_state_dict)
    model.eval()

    url = "https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg"
    image = Image.open(requests.get(url, stream=True).raw)

    feature_extractor = ViTMAEFeatureExtractor(size=config.image_size)
    inputs = feature_extractor(images=image, return_tensors="pt")

    # forward pass
    torch.manual_seed(2)
    outputs = model(**inputs)
    logits = outputs.logits

    if "large" in checkpoint_url:
        expected_slice = torch.tensor(
            [[-0.7309, -0.7128, -1.0169], [-1.0161, -0.9058, -1.1878], [-1.0478, -0.9411, -1.1911]]
        )
    elif "huge" in checkpoint_url:
        expected_slice = torch.tensor(
            [[-1.1599, -0.9199, -1.2221], [-1.1952, -0.9269, -1.2307], [-1.2143, -0.9337, -1.2262]]
        )
    else:
        expected_slice = torch.tensor(
            [[-0.9192, -0.8481, -1.1259], [-1.1349, -1.0034, -1.2599], [-1.1757, -1.0429, -1.2726]]
        )

    # verify logits
    assert torch.allclose(logits[0, :3, :3], expected_slice, atol=1e-4)

    print(f"Saving model to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)

    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
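# A possible command-line entry point for the conversion function above; this is a sketch assuming
# the script is meant to be run standalone, and the flag names mirror other conversion scripts but
# are an assumption here, not taken from the original.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--checkpoint_url",
        type=str,
        required=True,
        help="URL of the original MAE checkpoint to convert ('base', 'large' or 'huge' variants are recognised).",
    )
    parser.add_argument(
        "--pytorch_dump_folder_path",
        type=str,
        required=True,
        help="Path to the output PyTorch model directory.",
    )
    args = parser.parse_args()
    convert_vit_mae_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path)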
def create_and_check_for_pretraining(self, config, pixel_values, labels):
    model = ViTMAEForPreTraining(config)
    model.to(torch_device)
    model.eval()
    result = model(pixel_values)

    # expected sequence length = num_patches
    image_size = to_2tuple(self.image_size)
    patch_size = to_2tuple(self.patch_size)
    num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
    expected_seq_len = num_patches
    expected_num_channels = self.patch_size ** 2 * self.num_channels
    self.parent.assertEqual(result.logits.shape, (self.batch_size, expected_seq_len, expected_num_channels))
def test_inference_for_pretraining(self):
    # make random mask reproducible
    # note that the same seed on CPU and on GPU doesn't mean they spew the same random number sequences,
    # as they both have fairly different PRNGs (for efficiency reasons)
    # source: https://discuss.pytorch.org/t/random-seed-that-spans-across-devices/19735
    torch.manual_seed(2)

    model = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base").to(torch_device)

    feature_extractor = self.default_feature_extractor
    image = prepare_img()
    inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)

    # forward pass
    with torch.no_grad():
        outputs = model(**inputs)

    # verify the logits
    expected_shape = torch.Size((1, 196, 768))
    self.assertEqual(outputs.logits.shape, expected_shape)

    expected_slice_cpu = torch.tensor(
        [[0.7366, -1.3663, -0.2844], [0.7919, -1.3839, -0.3241], [0.4313, -0.7168, -0.2878]]
    )
    expected_slice_gpu = torch.tensor(
        [[0.8948, -1.0680, 0.0030], [0.9758, -1.1181, -0.0290], [1.0602, -1.1522, -0.0528]]
    )

    # set expected slice depending on the device
    expected_slice = expected_slice_cpu if torch_device == "cpu" else expected_slice_gpu

    self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice.to(torch_device), atol=1e-4))
def create_and_check_for_pretraining(self, config, pixel_values, labels):
    model = ViTMAEForPreTraining(config)
    model.to(torch_device)
    model.eval()
    result = model(pixel_values)
    num_patches = (self.image_size // self.patch_size) ** 2
    expected_num_channels = self.patch_size ** 2 * self.num_channels
    self.parent.assertEqual(result.logits.shape, (self.batch_size, num_patches, expected_num_channels))

    # test greyscale images
    config.num_channels = 1
    model = ViTMAEForPreTraining(config)
    model.to(torch_device)
    model.eval()

    pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
    result = model(pixel_values)
    expected_num_channels = self.patch_size ** 2
    self.parent.assertEqual(result.logits.shape, (self.batch_size, num_patches, expected_num_channels))
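# For reference: with the default ViTMAEConfig (image_size=224, patch_size=16, num_channels=3),
# the shape arithmetic used above gives num_patches = (224 // 16) ** 2 = 196 and
# expected_num_channels = 16 ** 2 * 3 = 768, which is exactly the (1, 196, 768) logits shape
# checked by the integration tests earlier in this section. A quick standalone check:
from transformers import ViTMAEConfig

config = ViTMAEConfig()
num_patches = (config.image_size // config.patch_size) ** 2
num_target_channels = config.patch_size ** 2 * config.num_channels
assert (num_patches, num_target_channels) == (196, 768)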
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, CustomTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Initialize our dataset.
    ds = load_dataset(
        data_args.dataset_name,
        data_args.dataset_config_name,
        data_files=data_args.data_files,
        cache_dir=model_args.cache_dir,
    )

    # If we don't have a validation split, split off a percentage of train as validation.
    data_args.train_val_split = None if "validation" in ds.keys() else data_args.train_val_split
    if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0:
        split = ds["train"].train_test_split(data_args.train_val_split)
        ds["train"] = split["train"]
        ds["validation"] = split["test"]

    # Load pretrained model and feature extractor
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = ViTMAEConfig.from_pretrained(model_args.config_name, **config_kwargs)
    elif model_args.model_name_or_path:
        config = ViTMAEConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
    else:
        config = ViTMAEConfig()
        logger.warning("You are instantiating a new config instance from scratch.")
        if model_args.config_overrides is not None:
            logger.info(f"Overriding config: {model_args.config_overrides}")
            config.update_from_string(model_args.config_overrides)
            logger.info(f"New config: {config}")

    # adapt config
    config.update(
        {
            "mask_ratio": model_args.mask_ratio,
            "norm_pix_loss": model_args.norm_pix_loss,
        }
    )

    # create feature extractor
    if model_args.feature_extractor_name:
        feature_extractor = ViTFeatureExtractor.from_pretrained(model_args.feature_extractor_name, **config_kwargs)
    elif model_args.model_name_or_path:
        feature_extractor = ViTFeatureExtractor.from_pretrained(model_args.model_name_or_path, **config_kwargs)
    else:
        feature_extractor = ViTFeatureExtractor()

    # create model
    if model_args.model_name_or_path:
        model = ViTMAEForPreTraining.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        logger.info("Training new model from scratch")
        model = ViTMAEForPreTraining(config)

    if training_args.do_train:
        column_names = ds["train"].column_names
    else:
        column_names = ds["validation"].column_names

    if data_args.image_column_name is not None:
        image_column_name = data_args.image_column_name
    elif "image" in column_names:
        image_column_name = "image"
    elif "img" in column_names:
        image_column_name = "img"
    else:
        image_column_name = column_names[0]

    # transformations as done in original MAE paper
    # source: https://github.com/facebookresearch/mae/blob/main/main_pretrain.py
    transforms = Compose(
        [
            Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
            RandomResizedCrop(feature_extractor.size, scale=(0.2, 1.0), interpolation=InterpolationMode.BICUBIC),
            RandomHorizontalFlip(),
            ToTensor(),
            Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
        ]
    )

    def preprocess_images(examples):
        """Preprocess a batch of images by applying transforms."""
        examples["pixel_values"] = [transforms(image) for image in examples[image_column_name]]
        return examples

    if training_args.do_train:
        if "train" not in ds:
            raise ValueError("--do_train requires a train dataset")
        if data_args.max_train_samples is not None:
            ds["train"] = ds["train"].shuffle(seed=training_args.seed).select(range(data_args.max_train_samples))
        # Set the training transforms
        ds["train"].set_transform(preprocess_images)

    if training_args.do_eval:
        if "validation" not in ds:
            raise ValueError("--do_eval requires a validation dataset")
        if data_args.max_eval_samples is not None:
            ds["validation"] = (
                ds["validation"].shuffle(seed=training_args.seed).select(range(data_args.max_eval_samples))
            )
        # Set the validation transforms
        ds["validation"].set_transform(preprocess_images)

    # Compute absolute learning rate
    total_train_batch_size = (
        training_args.train_batch_size * training_args.gradient_accumulation_steps * training_args.world_size
    )
    if training_args.base_learning_rate is not None:
        training_args.learning_rate = training_args.base_learning_rate * total_train_batch_size / 256

    # Initialize our trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds["train"] if training_args.do_train else None,
        eval_dataset=ds["validation"] if training_args.do_eval else None,
        tokenizer=feature_extractor,
        data_collator=collate_fn,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()
        trainer.log_metrics("train", train_result.metrics)
        trainer.save_metrics("train", train_result.metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        metrics = trainer.evaluate()
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Write model card and (optionally) push to hub
    kwargs = {
        "tasks": "masked-auto-encoding",
        "dataset": data_args.dataset_name,
        "tags": ["masked-auto-encoding"],
    }
    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)
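# `collate_fn` is passed to the Trainer above but not defined in this snippet. A minimal sketch,
# assuming each example carries a "pixel_values" tensor produced by the `transforms` pipeline,
# would simply stack the images into a batch:
import torch

def collate_fn(examples):
    # stack per-example (channels, height, width) tensors into one (batch, channels, height, width) tensor
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    return {"pixel_values": pixel_values}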
def main():
    ds = load_dataset(
        data_args.dataset_name,
        data_args.dataset_config,
        data_files=data_args.data_files,
        cache_dir=model_args.cache_dir,
    )

    # If we don't have a validation split, split off a percentage of train as validation.
    data_args.train_val_split = None if "validation" in ds.keys() else data_args.train_val_split
    if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0:
        split = ds["train"].train_test_split(data_args.train_val_split)
        ds["train"] = split["train"]
        ds["validation"] = split["test"]

    # Load pretrained model and feature extractor
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kw = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_version,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = ViTMAEConfig.from_pretrained(model_args.config_name, **config_kw)
    elif model_args.model_name:
        config = ViTMAEConfig.from_pretrained(model_args.model_name, **config_kw)
    else:
        config = ViTMAEConfig()
        logger.warning("You are instantiating a new config instance from scratch.")
        if model_args.config_overrides is not None:
            logger.info(f"Overriding config: {model_args.config_overrides}")
            config.update_from_string(model_args.config_overrides)
            logger.info(f"New config: {config}")

    # adapt config
    config.update(
        {
            "mask_ratio": model_args.mask_ratio,
            "norm_pix_loss": model_args.norm_pix_loss,
        }
    )

    # create feature extractor
    if model_args.feature_extractor:
        feature_extractor = ViTFeatureExtractor.from_pretrained(model_args.feature_extractor, **config_kw)
    elif model_args.model_name:
        feature_extractor = ViTFeatureExtractor.from_pretrained(model_args.model_name, **config_kw)
    else:
        feature_extractor = ViTFeatureExtractor()

    # create model
    if model_args.model_name:
        model = ViTMAEForPreTraining.from_pretrained(
            model_args.model_name,
            from_tf=bool(".ckpt" in model_args.model_name),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_version,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        logger.info("Training new model")
        model = ViTMAEForPreTraining(config)

    if training_args.do_train:
        column_names = ds["train"].column_names
    else:
        column_names = ds["validation"].column_names

    if data_args.image_column_name is not None:
        image_column_name = data_args.image_column_name
    elif "image" in column_names:
        image_column_name = "image"
    elif "img" in column_names:
        image_column_name = "img"
    else:
        image_column_name = column_names[0]

    # transformations as done in original MAE paper
    # source: https://github.com/facebookresearch/mae/blob/main/main_pretrain.py
    transforms = Compose(
        [
            Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
            RandomResizedCrop(feature_extractor.size, scale=(0.2, 1.0), interpolation=InterpolationMode.BICUBIC),
            RandomHorizontalFlip(),
            ToTensor(),
            Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
        ]
    )

    def preprocess_images(examples):
        """Preprocess a batch of images by applying transforms."""
        examples["pixel_values"] = [transforms(image) for image in examples[image_column_name]]
        return examples

    if training_args.do_train:
        if "train" not in ds:
            raise ValueError("--do_train requires a train dataset")
        if data_args.max_train_samples is not None:
            ds["train"] = ds["train"].shuffle(seed=training_args.seed).select(range(data_args.max_train_samples))
        # Set the training transforms
        ds["train"].set_transform(preprocess_images)

    if training_args.do_eval:
        if "validation" not in ds:
            raise ValueError("--do_eval requires a validation dataset")
        if data_args.max_eval_samples is not None:
            ds["validation"] = (
                ds["validation"].shuffle(seed=training_args.seed).select(range(data_args.max_eval_samples))
            )
        # Set the validation transforms
        ds["validation"].set_transform(preprocess_images)

    # Compute absolute learning rate
    total_train_batch_size = (
        training_args.train_batch_size * training_args.grad_accumulation_steps * training_args.world_size
    )
    if training_args.base_lr is not None:
        training_args.lr = training_args.base_lr * total_train_batch_size / 256

    # Initialize our trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds["train"] if training_args.do_train else None,
        eval_dataset=ds["validation"] if training_args.do_eval else None,
        tokenizer=feature_extractor,
        data_collator=collate_fn,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()
        trainer.log_metrics("train", train_result.metrics)
        trainer.save_metrics("train", train_result.metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        metrics = trainer.evaluate()
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Write model card and (optionally) push to hub
    kw = {
        "tasks": "masked-auto-encoding",
        "dataset": data_args.dataset_name,
        "tags": ["masked-auto-encoding"],
    }
    if training_args.push_to_hub:
        trainer.push_to_hub(**kw)
    else:
        trainer.create_model_card(**kw)
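# The "absolute learning rate" computed above follows the linear scaling rule from the MAE paper:
# lr = base_lr * total_train_batch_size / 256. A small worked example (the values are assumptions
# chosen purely for illustration):
base_lr = 1.5e-4
total_train_batch_size = 32 * 4 * 8  # per-device batch 32, grad accumulation 4, 8 processes = 1024
lr = base_lr * total_train_batch_size / 256
assert abs(lr - 6e-4) < 1e-12  # 1.5e-4 scaled by 1024 / 256 gives 6e-4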