def get_swinv2_config(swinv2_name):
    config = Swinv2Config()
    name_split = swinv2_name.split("_")

    model_size = name_split[1]
    if "to" in name_split[3]:
        img_size = int(name_split[3][-3:])
    else:
        img_size = int(name_split[3])
    if "to" in name_split[2]:
        window_size = int(name_split[2][-2:])
    else:
        window_size = int(name_split[2][6:])

    if model_size == "tiny":
        embed_dim = 96
        depths = (2, 2, 6, 2)
        num_heads = (3, 6, 12, 24)
    elif model_size == "small":
        embed_dim = 96
        depths = (2, 2, 18, 2)
        num_heads = (3, 6, 12, 24)
    elif model_size == "base":
        embed_dim = 128
        depths = (2, 2, 18, 2)
        num_heads = (4, 8, 16, 32)
    else:
        embed_dim = 192
        depths = (2, 2, 18, 2)
        num_heads = (6, 12, 24, 48)

    if "to" in swinv2_name:
        config.pretrained_window_sizes = (12, 12, 12, 6)

    if ("22k" in swinv2_name) and ("to" not in swinv2_name):
        num_classes = 21841
        filename = "imagenet-22k-id2label.json"
    else:
        num_classes = 1000
        filename = "imagenet-1k-id2label.json"

    repo_id = "datasets/huggingface/label-files"
    id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}

    config.image_size = img_size
    config.num_labels = num_classes
    config.embed_dim = embed_dim
    config.depths = depths
    config.num_heads = num_heads
    config.window_size = window_size

    return config
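# Usage sketch for get_swinv2_config, assuming a timm-style checkpoint name of
# the form "swinv2_<size>_window<w>_<img>" (the name below is illustrative) and
# network access to the Hub for the label file.
def _example_get_swinv2_config():
    config = get_swinv2_config("swinv2_tiny_window8_256")
    # "window8" -> window_size 8; trailing "256" -> image_size 256
    assert config.window_size == 8 and config.image_size == 256
    return config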
def from_hub(repo_id: str, **kwargs: Any):
    """Instantiate & load a pretrained model from HF hub.

    >>> from doctr.models import from_hub
    >>> model = from_hub("mindee/fasterrcnn_mobilenet_v3_large_fpn")

    Args:
        repo_id: HuggingFace model hub repo
        kwargs: kwargs of `hf_hub_download` or `snapshot_download`

    Returns:
        Model loaded with the checkpoint
    """
    # Get the config
    with open(hf_hub_download(repo_id, filename="config.json", **kwargs), "rb") as f:
        cfg = json.load(f)

    arch = cfg.pop("arch")
    task = cfg.pop("task")

    if task == "classification":
        model = models.classification.__dict__[arch](
            pretrained=False, classes=cfg["classes"], num_classes=cfg["num_classes"]
        )
    elif task == "detection":
        model = models.detection.__dict__[arch](pretrained=False)
    elif task == "recognition":
        model = models.recognition.__dict__[arch](
            pretrained=False, input_shape=cfg["input_shape"], vocab=cfg["vocab"]
        )
    elif task == "obj_detection" and is_torch_available():
        model = models.obj_detection.__dict__[arch](
            pretrained=False,
            image_mean=cfg["mean"],
            image_std=cfg["std"],
            max_size=cfg["input_shape"][-1],
            num_classes=len(cfg["classes"]),
        )

    # update model cfg
    model.cfg = cfg

    # Load checkpoint
    if is_torch_available():
        state_dict = torch.load(
            hf_hub_download(repo_id, filename="pytorch_model.bin", **kwargs), map_location="cpu"
        )
        model.load_state_dict(state_dict)
    else:  # tf
        repo_path = snapshot_download(repo_id, **kwargs)
        model.load_weights(os.path.join(repo_path, "tf_model", "weights"))

    return model
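# Minimal sketch showing how extra kwargs flow through from_hub to
# hf_hub_download / snapshot_download; `cache_dir` is a standard
# huggingface_hub parameter (repo id taken from the docstring example above).
def _example_from_hub():
    model = from_hub(
        "mindee/fasterrcnn_mobilenet_v3_large_fpn",
        cache_dir="./hub_cache",  # forwarded to the download helpers
    )
    return model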
def download_from_string(path, module, keys, package, quantized=False):
    model = path
    repo_id = f'{HUGGINGFACE_USERNAME}/{module}-{model}'
    if quantized:
        repo_id = f'{repo_id}-quantized'
        try:
            list_repo_files(repo_id)
        except Exception:
            raise ValueError(
                f'Quantized model for `{os.path.join(module, model)}` is not available, please load the normal model.'
            )
        logger.warning('Loading a quantized model will cause an accuracy drop.')
    files = {}
    for k, file in keys.items():
        if '/' in file:
            splitted = os.path.split(file)
            repo_id_ = splitted[0].replace('/', '-')
            repo_id_ = f'{HUGGINGFACE_USERNAME}/{repo_id_}'
            file = splitted[1]
        else:
            repo_id_ = repo_id
        files[k] = hf_hub_download(repo_id_, file)
    return files
def get_mobilevit_config(mobilevit_name):
    config = MobileViTConfig()

    # size of the architecture
    if "mobilevit_s" in mobilevit_name:
        config.hidden_sizes = [144, 192, 240]
        config.neck_hidden_sizes = [16, 32, 64, 96, 128, 160, 640]
    elif "mobilevit_xs" in mobilevit_name:
        config.hidden_sizes = [96, 120, 144]
        config.neck_hidden_sizes = [16, 32, 48, 64, 80, 96, 384]
    elif "mobilevit_xxs" in mobilevit_name:
        config.hidden_sizes = [64, 80, 96]
        config.neck_hidden_sizes = [16, 16, 24, 48, 64, 80, 320]
        config.hidden_dropout_prob = 0.05
        config.expand_ratio = 2.0

    if mobilevit_name.startswith("deeplabv3_"):
        config.image_size = 512
        config.output_stride = 16
        config.num_labels = 21
        filename = "pascal-voc-id2label.json"
    else:
        config.num_labels = 1000
        filename = "imagenet-1k-id2label.json"

    repo_id = "datasets/huggingface/label-files"
    id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}

    return config
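# Usage sketch for get_mobilevit_config: a "deeplabv3_" prefix selects the
# 21-class PASCAL VOC segmentation setup, a plain name keeps the 1000 ImageNet
# labels (both names below are illustrative).
def _example_get_mobilevit_config():
    seg_config = get_mobilevit_config("deeplabv3_mobilevit_s")
    assert seg_config.num_labels == 21 and seg_config.image_size == 512
    cls_config = get_mobilevit_config("mobilevit_xxs")
    assert cls_config.num_labels == 1000
    return seg_config, cls_config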
def get_yolos_config(yolos_name):
    config = YolosConfig()

    # size of the architecture
    if "yolos_ti" in yolos_name:
        config.hidden_size = 192
        config.intermediate_size = 768
        config.num_hidden_layers = 12
        config.num_attention_heads = 3
        config.image_size = [800, 1333]
        config.use_mid_position_embeddings = False
    elif yolos_name == "yolos_s_dWr":
        config.hidden_size = 330
        config.num_hidden_layers = 14
        config.num_attention_heads = 6
        config.intermediate_size = 1320
    elif "yolos_s" in yolos_name:
        config.hidden_size = 384
        config.intermediate_size = 1536
        config.num_hidden_layers = 12
        config.num_attention_heads = 6
    elif "yolos_b" in yolos_name:
        config.image_size = [800, 1344]

    config.num_labels = 91
    repo_id = "datasets/huggingface/label-files"
    filename = "coco-detection-id2label.json"
    id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}

    return config
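# Usage sketch for get_yolos_config; "yolos_s_dWr" is the distilled small
# variant handled by the exact-match branch above, and every variant gets the
# 91 COCO detection labels.
def _example_get_yolos_config():
    config = get_yolos_config("yolos_s_dWr")
    assert (config.hidden_size, config.num_hidden_layers) == (330, 14)
    assert config.num_labels == 91
    return config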
def get_videomae_config(model_name):
    config = VideoMAEConfig()

    if "large" in model_name:
        config.hidden_size = 1024
        config.intermediate_size = 4096
        config.num_hidden_layers = 24
        config.num_attention_heads = 16
        config.decoder_num_hidden_layers = 12
        config.decoder_num_attention_heads = 8
        config.decoder_hidden_size = 512
        config.decoder_intermediate_size = 2048

    if "finetuned" not in model_name:
        config.use_mean_pooling = False

    if "finetuned" in model_name:
        repo_id = "datasets/huggingface/label-files"
        if "kinetics" in model_name:
            config.num_labels = 400
            filename = "kinetics400-id2label.json"
        elif "ssv2" in model_name:
            config.num_labels = 174
            filename = "something-something-v2-id2label.json"
        else:
            raise ValueError("Model name should either contain 'kinetics' or 'ssv2' in case it's fine-tuned.")
        id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}

    return config
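# Usage sketch for get_videomae_config (model names illustrative): "finetuned"
# plus a dataset tag selects the label set, while a fine-tuned name without a
# known dataset tag raises.
def _example_get_videomae_config():
    config = get_videomae_config("videomae-base-finetuned-kinetics")
    assert config.num_labels == 400
    try:
        get_videomae_config("videomae-base-finetuned-somethingelse")
    except ValueError as err:
        print(err)  # name must contain 'kinetics' or 'ssv2'
    return config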
def convert_weights_and_push(save_directory: Path, model_name: str = None, push_to_hub: bool = True):
    filename = "imagenet-1k-id2label.json"
    num_labels = 1000
    expected_shape = (1, num_labels)

    repo_id = "datasets/huggingface/label-files"
    id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    label2id = {v: k for k, v in id2label.items()}

    ImageNetPreTrainedConfig = partial(ResNetConfig, num_labels=num_labels, id2label=id2label, label2id=label2id)

    names_to_config = {
        "resnet18": ImageNetPreTrainedConfig(
            depths=[2, 2, 2, 2], hidden_sizes=[64, 128, 256, 512], layer_type="basic"
        ),
        "resnet26": ImageNetPreTrainedConfig(
            depths=[2, 2, 2, 2], hidden_sizes=[256, 512, 1024, 2048], layer_type="bottleneck"
        ),
        "resnet34": ImageNetPreTrainedConfig(
            depths=[3, 4, 6, 3], hidden_sizes=[64, 128, 256, 512], layer_type="basic"
        ),
        "resnet50": ImageNetPreTrainedConfig(
            depths=[3, 4, 6, 3], hidden_sizes=[256, 512, 1024, 2048], layer_type="bottleneck"
        ),
        "resnet101": ImageNetPreTrainedConfig(
            depths=[3, 4, 23, 3], hidden_sizes=[256, 512, 1024, 2048], layer_type="bottleneck"
        ),
        "resnet152": ImageNetPreTrainedConfig(
            depths=[3, 8, 36, 3], hidden_sizes=[256, 512, 1024, 2048], layer_type="bottleneck"
        ),
    }

    if model_name:
        # bind `config` here so the return below is defined on the single-model path
        config = names_to_config[model_name]
        convert_weight_and_push(model_name, config, save_directory, push_to_hub)
    else:
        for model_name, config in names_to_config.items():
            convert_weight_and_push(model_name, config, save_directory, push_to_hub)
    return config, expected_shape
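# Usage sketch for convert_weights_and_push: convert a single architecture into
# a local directory without pushing to the Hub (output path hypothetical).
def _example_convert_resnet():
    config, expected_shape = convert_weights_and_push(
        Path("./converted_resnet50"), model_name="resnet50", push_to_hub=False
    )
    assert expected_shape == (1, 1000)
    return config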
def download_checkpoints(dataset=None):
    """Downloads the checkpoints from the HuggingFace Hub for the specified dataset

    Args:
        dataset (str): one of 'fastmri' or 'OASIS'. Defaults to None, which means
            that both are downloaded.
    """
    if dataset is None:
        for d in ['fastmri', 'OASIS']:
            download_checkpoints(dataset=d)
    else:
        for model_name in MODEL_NAMES:
            repo_id = REPO_ID_BASE.format(model_name=model_name, dataset=dataset)
            hf_hub_download(
                repo_id=repo_id,
                filename='model_weights.h5',
                cache_dir=Path(CHECKPOINTS_DIR) / 'checkpoints',
                force_filename=f'{model_name}-{dataset}.hdf5',
            )
def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *init_inputs, **kwargs):
    if os.path.isdir(pretrained_model_name_or_path):
        block_records_path = os.path.join(pretrained_model_name_or_path, _REALM_BLOCK_RECORDS_FILENAME)
    else:
        block_records_path = hf_hub_download(
            repo_id=pretrained_model_name_or_path, filename=_REALM_BLOCK_RECORDS_FILENAME, **kwargs
        )
    block_records = np.load(block_records_path, allow_pickle=True)

    tokenizer = RealmTokenizer.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)

    return cls(block_records, tokenizer)
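# Usage sketch for the classmethod above, assuming it is bound to a retriever
# class such as transformers' RealmRetriever; the repo id mirrors the published
# REALM checkpoints and is illustrative here.
def _example_realm_retriever():
    from transformers import RealmRetriever
    retriever = RealmRetriever.from_pretrained("google/realm-cc-news-pretrained-openqa")
    return retriever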
def test_inference_for_pretraining(self):
    model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base-short").to(torch_device)

    feature_extractor = self.default_feature_extractor
    video = prepare_video()
    inputs = feature_extractor(video, return_tensors="pt").to(torch_device)

    # add boolean mask, indicating which patches to mask
    local_path = hf_hub_download(repo_id="hf-internal-testing/bool-masked-pos", filename="bool_masked_pos.pt")
    inputs["bool_masked_pos"] = torch.load(local_path)

    # forward pass
    with torch.no_grad():
        outputs = model(**inputs)

    # verify the logits
    expected_shape = torch.Size([1, 1408, 1536])
    expected_slice = torch.tensor(
        [[0.7994, 0.9612, 0.8508], [0.7401, 0.8958, 0.8302], [0.5862, 0.7468, 0.7325]], device=torch_device
    )
    self.assertEqual(outputs.logits.shape, expected_shape)
    self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4))

    # verify the loss (`config.norm_pix_loss` = `True`)
    expected_loss = torch.tensor([0.5142], device=torch_device)
    self.assertTrue(torch.allclose(outputs.loss, expected_loss, atol=1e-4))

    # verify the loss (`config.norm_pix_loss` = `False`)
    model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base-short", norm_pix_loss=False).to(
        torch_device
    )

    with torch.no_grad():
        outputs = model(**inputs)

    expected_loss = torch.tensor([0.6469], device=torch_device)
    self.assertTrue(torch.allclose(outputs.loss, expected_loss, atol=1e-4))
def get_convnext_config(checkpoint_url):
    config = ConvNextConfig()

    # note: plain `if` checks (not `elif`) are deliberate, since "xlarge" URLs
    # also contain the substring "large"; the later assignment wins.
    if "tiny" in checkpoint_url:
        depths = [3, 3, 9, 3]
        hidden_sizes = [96, 192, 384, 768]
    if "small" in checkpoint_url:
        depths = [3, 3, 27, 3]
        hidden_sizes = [96, 192, 384, 768]
    if "base" in checkpoint_url:
        depths = [3, 3, 27, 3]
        hidden_sizes = [128, 256, 512, 1024]
    if "large" in checkpoint_url:
        depths = [3, 3, 27, 3]
        hidden_sizes = [192, 384, 768, 1536]
    if "xlarge" in checkpoint_url:
        depths = [3, 3, 27, 3]
        hidden_sizes = [256, 512, 1024, 2048]

    if "1k" in checkpoint_url:
        num_labels = 1000
        filename = "imagenet-1k-id2label.json"
        expected_shape = (1, 1000)
    else:
        num_labels = 21841
        filename = "imagenet-22k-id2label.json"
        expected_shape = (1, 21841)

    repo_id = "datasets/huggingface/label-files"
    config.num_labels = num_labels
    id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    if "1k" not in checkpoint_url:
        # this dataset contains 21843 labels but the model only has 21841
        # we delete the classes as mentioned in https://github.com/google-research/big_transfer/issues/18
        del id2label[9205]
        del id2label[15027]
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}
    config.hidden_sizes = hidden_sizes
    config.depths = depths

    return config, expected_shape
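# Usage sketch for get_convnext_config; the substrings of the checkpoint URL
# ("tiny", "1k", ...) drive both the architecture and the label set (the URL
# below is illustrative).
def _example_get_convnext_config():
    url = "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth"
    config, expected_shape = get_convnext_config(url)
    assert config.depths == [3, 3, 9, 3] and expected_shape == (1, 1000)
    return config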
def _get_hf_hub_pretrained_model_info(cls, model_name: str, refresh_cache: bool = False) -> (type, str):
    """
    Resolve the HuggingFace Hub model pretrained information given a model name.
    The model name must be of general syntax ``{source_repo}/{model_name}``.

    Note:
        The ``{source_repo}`` need not be ``nvidia``, it can be any public repository, even external to Nvidia.
        This allows public, externally contributed models to be run freely using Nvidia NeMo.

    Args:
        model_name: Str name of the model. Must be of the general syntax ``{source_repo}/{model_name}``.
        refresh_cache: Bool, determines whether cache must be refreshed (model is re-downloaded).

    Returns:
        A tuple of details describing :
        -   The resolved class of the model. Since the source is external to NeMo, always default to using
            the calling class. Depend on target class resolution by restore_from() for calling the correct class.
        -   The path to the NeMo model (.nemo file) in some cached directory (managed by HF Hub).
    """
    # Resolve the model name without origin for filename
    resolved_model_filename = model_name.split("/")[-1] + '.nemo'

    # Check if api token exists, use if it does
    is_token_available = HfFolder.get_token() is not None

    # Try to load the model from the Huggingface Hub
    path = hf_hub_download(
        repo_id=model_name,
        filename=resolved_model_filename,
        library_name="nemo",
        library_version=nemo.__version__,
        force_download=refresh_cache,
        use_auth_token=is_token_available,
    )

    # Cannot pre-resolve the specific class without double instantiation (first for config, second for model params)
    # Default to current class, and perform basic class path resolution (handled via restore_from() + target class)
    class_ = cls

    return class_, path
def download_from_dict(file, s3_file, package, quantized=False):
    home, _ = _get_home(package=package)
    if quantized:
        if 'quantized' not in file:
            f = file['model'].replace(home, '').split('/')
            raise ValueError(
                f'Quantized model for {f[1]} module is not available, please load the normal model.'
            )
        model = 'quantized'
        logger.warning('Loading a quantized model will cause an accuracy drop.')
    else:
        model = 'model'
    files = {}
    for k, file in s3_file.items():
        base_path, file = os.path.split(file)
        base_path = base_path.replace('/', '-')
        base_path = f'{HUGGINGFACE_USERNAME}/{base_path}'
        logger.info(f'downloading frozen {base_path}/{file}')
        files[k] = hf_hub_download(base_path, file)
    return files
def load_trained_model(name_or_path):
    import huggingface_hub as HFhub

    tokenizer = AutoTokenizer.from_pretrained(name_or_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(name_or_path)

    # load preprocessing_kwargs from the model repo on HF hub, or from the local model directory
    kwargs_filename = None
    if name_or_path.startswith("kleinay/") and 'preprocessing_kwargs.json' in HFhub.list_repo_files(name_or_path):
        kwargs_filename = HFhub.hf_hub_download(repo_id=name_or_path, filename="preprocessing_kwargs.json")
    elif Path(name_or_path).is_dir() and (Path(name_or_path) / "experiment_kwargs.json").exists():
        kwargs_filename = Path(name_or_path) / "experiment_kwargs.json"

    if kwargs_filename:
        preprocessing_kwargs = json.load(open(kwargs_filename))
        # integrate into model.config (for decoding args, e.g. "num_beams"), and save also as a
        # standalone object for preprocessing
        model.config.preprocessing_kwargs = Namespace(**preprocessing_kwargs)
        model.config.update(preprocessing_kwargs)
    return model, tokenizer
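# Usage sketch for load_trained_model with a Hub checkpoint under the
# "kleinay/" namespace (repo id illustrative); any preprocessing kwargs found
# in the repo end up on model.config.
def _example_load_trained_model():
    model, tokenizer = load_trained_model("kleinay/qanom-seq2seq-model-baseline")
    print(getattr(model.config, "preprocessing_kwargs", None))
    return model, tokenizer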
def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kw):
    if os.path.isdir(pretrained_model_name_or_path):
        block_records_path = os.path.join(pretrained_model_name_or_path, _REALM_BLOCK_RECORDS_FILENAME)
    else:
        block_records_path = hf_hub_download(
            repo_id=pretrained_model_name_or_path,
            filename=_REALM_BLOCK_RECORDS_FILENAME,
            **kw,
        )
    block_records = np.load(block_records_path, allow_pickle=True)

    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kw)

    return cls(block_records, tokenizer)
def __init__(self, model_id: str):
    # Reload Keras SavedModel
    self.model = from_pretrained_keras(model_id)

    # Handle binary classification with a single output unit.
    self.single_output_unit = self.model.output_shape[1] == 1
    self.num_labels = 2 if self.single_output_unit else self.model.output_shape[1]

    # Config is required to know the mapping to label.
    config_file = hf_hub_download(model_id, filename=CONFIG_FILENAME)
    with open(config_file) as config:
        config = json.load(config)

    self.id2label = config.get("id2label", {str(i): f"LABEL_{i}" for i in range(self.num_labels)})

    # Define PIL Interpolation method. Uses Image.NEAREST by default.
    self.interpolation = _PIL_INTERPOLATION_METHODS.get(config.get("interpolation", "nearest"), None)

    # Return at most the top 5 predicted classes
    self.top_k = 5
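# Usage sketch for the pipeline __init__ above (the enclosing class and the
# model id are hypothetical): instantiate from a Keras image-classification
# repo and inspect the recovered label mapping.
def _example_keras_pipeline(pipeline_cls):
    pipe = pipeline_cls("my-user/my-keras-classifier")  # hypothetical repo id
    print(pipe.num_labels, pipe.id2label.get("0"))
    return pipe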
def main():
    args = parse_args()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_semantic_segmentation_no_trainer", args)

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
    # in the environment
    accelerator = (
        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
    )
    logger.info(accelerator.state, main_process_only=False)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    # We set device_specific to True as we want different data augmentation per device.
    if args.seed is not None:
        set_seed(args.seed, device_specific=True)

    # Handle the repository creation
    if accelerator.is_main_process:
        if args.push_to_hub:
            if args.hub_model_id is None:
                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
            else:
                repo_name = args.hub_model_id
            repo = Repository(args.output_dir, clone_from=repo_name)

            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
                if "step_*" not in gitignore:
                    gitignore.write("step_*\n")
                if "epoch_*" not in gitignore:
                    gitignore.write("epoch_*\n")
        elif args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)
    accelerator.wait_for_everyone()

    # Load dataset
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    # TODO support datasets from local folders
    dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir)

    # Rename column names to standardized names (only "image" and "label" need to be present)
    if "pixel_values" in dataset["train"].column_names:
        dataset = dataset.rename_columns({"pixel_values": "image"})
    if "annotation" in dataset["train"].column_names:
        dataset = dataset.rename_columns({"annotation": "label"})

    # If we don't have a validation split, split off a percentage of train as validation.
    args.train_val_split = None if "validation" in dataset.keys() else args.train_val_split
    if isinstance(args.train_val_split, float) and args.train_val_split > 0.0:
        split = dataset["train"].train_test_split(args.train_val_split)
        dataset["train"] = split["train"]
        dataset["validation"] = split["test"]

    # Prepare label mappings.
    # We'll include these in the model's config to get human readable labels in the Inference API.
    if args.dataset_name == "scene_parse_150":
        repo_id = "datasets/huggingface/label-files"
        filename = "ade20k-id2label.json"
    else:
        repo_id = f"datasets/{args.dataset_name}"
        filename = "id2label.json"
    id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    label2id = {v: k for k, v in id2label.items()}

    # Load pretrained model and feature extractor
    config = AutoConfig.from_pretrained(args.model_name_or_path, id2label=id2label, label2id=label2id)
    feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_name_or_path)
    model = AutoModelForSemanticSegmentation.from_pretrained(args.model_name_or_path, config=config)

    # Preprocessing the datasets
    # Define torchvision transforms to be applied to each image + target.
    # Not that straightforward in torchvision: https://github.com/pytorch/vision/issues/9
    # Currently based on official torchvision references: https://github.com/pytorch/vision/blob/main/references/segmentation/transforms.py
    train_transforms = Compose(
        [
            ReduceLabels() if args.reduce_labels else Identity(),
            RandomCrop(size=feature_extractor.size),
            RandomHorizontalFlip(flip_prob=0.5),
            PILToTensor(),
            ConvertImageDtype(torch.float),
            Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
        ]
    )
    # Define torchvision transform to be applied to each image.
    # jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1)
    val_transforms = Compose(
        [
            ReduceLabels() if args.reduce_labels else Identity(),
            Resize(size=(feature_extractor.size, feature_extractor.size)),
            PILToTensor(),
            ConvertImageDtype(torch.float),
            Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
        ]
    )

    def preprocess_train(example_batch):
        pixel_values = []
        labels = []
        for image, target in zip(example_batch["image"], example_batch["label"]):
            image, target = train_transforms(image.convert("RGB"), target)
            pixel_values.append(image)
            labels.append(target)

        encoding = dict()
        encoding["pixel_values"] = torch.stack(pixel_values)
        encoding["labels"] = torch.stack(labels)
        return encoding

    def preprocess_val(example_batch):
        pixel_values = []
        labels = []
        for image, target in zip(example_batch["image"], example_batch["label"]):
            image, target = val_transforms(image.convert("RGB"), target)
            pixel_values.append(image)
            labels.append(target)

        encoding = dict()
        encoding["pixel_values"] = torch.stack(pixel_values)
        encoding["labels"] = torch.stack(labels)
        return encoding

    with accelerator.main_process_first():
        train_dataset = dataset["train"].with_transform(preprocess_train)
        eval_dataset = dataset["validation"].with_transform(preprocess_val)

    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=args.per_device_train_batch_size
    )
    eval_dataloader = DataLoader(
        eval_dataset, collate_fn=default_data_collator, batch_size=args.per_device_eval_batch_size
    )

    # Optimizer
    optimizer = torch.optim.AdamW(
        list(model.parameters()),
        lr=args.learning_rate,
        betas=[args.adam_beta1, args.adam_beta2],
        eps=args.adam_epsilon,
    )

    # Figure out how many steps we should save the Accelerator states
    if hasattr(args.checkpointing_steps, "isdigit"):
        checkpointing_steps = args.checkpointing_steps
        if args.checkpointing_steps.isdigit():
            checkpointing_steps = int(args.checkpointing_steps)
    else:
        checkpointing_steps = None

    # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
        overrode_max_train_steps = True

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if overrode_max_train_steps:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    # Afterwards we recalculate our number of training epochs
    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    # Instantiate metric
    metric = evaluate.load("mean_iou")

    # We need to initialize the trackers we use, and also store our configuration.
    # We initialize the trackers only on main process because `accelerator.log`
    # only logs on main process and we don't want empty logs/runs on other processes.
    if args.with_tracking:
        if accelerator.is_main_process:
            experiment_config = vars(args)
            # TensorBoard cannot log Enums, need the raw value
            experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
            accelerator.init_trackers("semantic_segmentation_no_trainer", experiment_config)

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f" Num examples = {len(train_dataset)}")
    logger.info(f" Num Epochs = {args.num_train_epochs}")
    logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}")
    logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f" Total optimization steps = {args.max_train_steps}")

    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
    completed_steps = 0
    starting_epoch = 0

    # Potentially load in the weights and states from a previous save
    if args.resume_from_checkpoint:
        if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "":
            accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
            accelerator.load_state(args.resume_from_checkpoint)
            path = os.path.basename(args.resume_from_checkpoint)
        else:
            # Get the most recent checkpoint
            dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
            dirs.sort(key=os.path.getctime)
            path = dirs[-1]  # Sorts folders by date modified, most recent checkpoint is the last
        # Extract `epoch_{i}` or `step_{i}`
        training_difference = os.path.splitext(path)[0]

        if "epoch" in training_difference:
            starting_epoch = int(training_difference.replace("epoch_", "")) + 1
            resume_step = None
        else:
            resume_step = int(training_difference.replace("step_", ""))
            starting_epoch = resume_step // len(train_dataloader)
            resume_step -= starting_epoch * len(train_dataloader)

    for epoch in range(starting_epoch, args.num_train_epochs):
        if args.with_tracking:
            total_loss = 0
        model.train()
        for step, batch in enumerate(train_dataloader):
            # We need to skip steps until we reach the resumed step
            if args.resume_from_checkpoint and epoch == starting_epoch:
                if resume_step is not None and step < resume_step:
                    completed_steps += 1
                    continue
            outputs = model(**batch)
            loss = outputs.loss
            # We keep track of the loss at each epoch
            if args.with_tracking:
                total_loss += loss.detach().float()
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
                if completed_steps % checkpointing_steps == 0:
                    output_dir = f"step_{completed_steps}"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
                    accelerator.save_state(output_dir)

                    if args.push_to_hub and epoch < args.num_train_epochs - 1:
                        accelerator.wait_for_everyone()
                        unwrapped_model = accelerator.unwrap_model(model)
                        unwrapped_model.save_pretrained(
                            args.output_dir,
                            is_main_process=accelerator.is_main_process,
                            save_function=accelerator.save,
                        )
                        if accelerator.is_main_process:
                            feature_extractor.save_pretrained(args.output_dir)
                            repo.push_to_hub(
                                commit_message=f"Training in progress {completed_steps} steps",
                                blocking=False,
                                auto_lfs_prune=True,
                            )

            if completed_steps >= args.max_train_steps:
                break

        logger.info("***** Running evaluation *****")
        model.eval()
        samples_seen = 0
        for step, batch in enumerate(tqdm(eval_dataloader, disable=not accelerator.is_local_main_process)):
            with torch.no_grad():
                outputs = model(**batch)

            upsampled_logits = torch.nn.functional.interpolate(
                outputs.logits, size=batch["labels"].shape[-2:], mode="bilinear", align_corners=False
            )
            predictions = upsampled_logits.argmax(dim=1)

            predictions, references = accelerator.gather((predictions, batch["labels"]))

            # If we are in a multiprocess environment, the last batch has duplicates
            if accelerator.num_processes > 1:
                if step == len(eval_dataloader) - 1:
                    predictions = predictions[: len(eval_dataloader.dataset) - samples_seen]
                    references = references[: len(eval_dataloader.dataset) - samples_seen]
                else:
                    samples_seen += references.shape[0]

            metric.add_batch(
                predictions=predictions,
                references=references,
            )

        eval_metrics = metric.compute(
            num_labels=len(id2label),
            ignore_index=255,
            reduce_labels=False,  # we've already reduced the labels before
        )
        logger.info(f"epoch {epoch}: {eval_metrics}")

        if args.with_tracking:
            accelerator.log(
                {
                    "mean_iou": eval_metrics["mean_iou"],
                    "mean_accuracy": eval_metrics["mean_accuracy"],
                    "overall_accuracy": eval_metrics["overall_accuracy"],
                    "train_loss": total_loss.item() / len(train_dataloader),
                    "epoch": epoch,
                    "step": completed_steps,
                },
                step=completed_steps,
            )

        if args.push_to_hub and epoch < args.num_train_epochs - 1:
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(
                args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
            )
            if accelerator.is_main_process:
                feature_extractor.save_pretrained(args.output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
                )

        if args.checkpointing_steps == "epoch":
            output_dir = f"epoch_{epoch}"
            if args.output_dir is not None:
                output_dir = os.path.join(args.output_dir, output_dir)
            accelerator.save_state(output_dir)

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(
            args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
        )
        if accelerator.is_main_process:
            feature_extractor.save_pretrained(args.output_dir)
            if args.push_to_hub:
                repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)

            with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
                json.dump({"eval_overall_accuracy": eval_metrics["overall_accuracy"]}, f)
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_semantic_segmentation", model_args, data_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Load dataset
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    # TODO support datasets from local folders
    dataset = load_dataset(data_args.dataset_name, cache_dir=model_args.cache_dir)

    # Rename column names to standardized names (only "image" and "label" need to be present)
    if "pixel_values" in dataset["train"].column_names:
        dataset = dataset.rename_columns({"pixel_values": "image"})
    if "annotation" in dataset["train"].column_names:
        dataset = dataset.rename_columns({"annotation": "label"})

    # If we don't have a validation split, split off a percentage of train as validation.
    data_args.train_val_split = None if "validation" in dataset.keys() else data_args.train_val_split
    if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0:
        split = dataset["train"].train_test_split(data_args.train_val_split)
        dataset["train"] = split["train"]
        dataset["validation"] = split["test"]

    # Prepare label mappings.
    # We'll include these in the model's config to get human readable labels in the Inference API.
    if data_args.dataset_name == "scene_parse_150":
        repo_id = "datasets/huggingface/label-files"
        filename = "ade20k-id2label.json"
    else:
        repo_id = f"datasets/{data_args.dataset_name}"
        filename = "id2label.json"
    id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    label2id = {v: str(k) for k, v in id2label.items()}

    # Load the mean IoU metric from the datasets package
    metric = evaluate.load("mean_iou")

    # Define our compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
    # predictions and label_ids field) and has to return a dictionary string to float.
    @torch.no_grad()
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        logits_tensor = torch.from_numpy(logits)
        # scale the logits to the size of the label
        logits_tensor = nn.functional.interpolate(
            logits_tensor,
            size=labels.shape[-2:],
            mode="bilinear",
            align_corners=False,
        ).argmax(dim=1)

        pred_labels = logits_tensor.detach().cpu().numpy()
        metrics = metric.compute(
            predictions=pred_labels,
            references=labels,
            num_labels=len(id2label),
            ignore_index=0,
            reduce_labels=feature_extractor.reduce_labels,
        )
        # add per category metrics as individual key-value pairs
        per_category_accuracy = metrics.pop("per_category_accuracy").tolist()
        per_category_iou = metrics.pop("per_category_iou").tolist()

        metrics.update({f"accuracy_{id2label[i]}": v for i, v in enumerate(per_category_accuracy)})
        metrics.update({f"iou_{id2label[i]}": v for i, v in enumerate(per_category_iou)})

        return metrics

    config = AutoConfig.from_pretrained(
        model_args.config_name or model_args.model_name_or_path,
        label2id=label2id,
        id2label=id2label,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForSemanticSegmentation.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    feature_extractor = AutoFeatureExtractor.from_pretrained(
        model_args.feature_extractor_name or model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Define torchvision transforms to be applied to each image + target.
    # Not that straightforward in torchvision: https://github.com/pytorch/vision/issues/9
    # Currently based on official torchvision references: https://github.com/pytorch/vision/blob/main/references/segmentation/transforms.py
    train_transforms = Compose(
        [
            ReduceLabels() if data_args.reduce_labels else Identity(),
            RandomCrop(size=feature_extractor.size),
            RandomHorizontalFlip(flip_prob=0.5),
            PILToTensor(),
            ConvertImageDtype(torch.float),
            Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
        ]
    )
    # Define torchvision transform to be applied to each image.
    # jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1)
    val_transforms = Compose(
        [
            ReduceLabels() if data_args.reduce_labels else Identity(),
            Resize(size=(feature_extractor.size, feature_extractor.size)),
            PILToTensor(),
            ConvertImageDtype(torch.float),
            Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
        ]
    )

    def preprocess_train(example_batch):
        pixel_values = []
        labels = []
        for image, target in zip(example_batch["image"], example_batch["label"]):
            image, target = train_transforms(image.convert("RGB"), target)
            pixel_values.append(image)
            labels.append(target)

        encoding = dict()
        encoding["pixel_values"] = torch.stack(pixel_values)
        encoding["labels"] = torch.stack(labels)
        return encoding

    def preprocess_val(example_batch):
        pixel_values = []
        labels = []
        for image, target in zip(example_batch["image"], example_batch["label"]):
            image, target = val_transforms(image.convert("RGB"), target)
            pixel_values.append(image)
            labels.append(target)

        encoding = dict()
        encoding["pixel_values"] = torch.stack(pixel_values)
        encoding["labels"] = torch.stack(labels)
        return encoding

    if training_args.do_train:
        if "train" not in dataset:
            raise ValueError("--do_train requires a train dataset")
        if data_args.max_train_samples is not None:
            dataset["train"] = (
                dataset["train"].shuffle(seed=training_args.seed).select(range(data_args.max_train_samples))
            )
        # Set the training transforms
        dataset["train"].set_transform(preprocess_train)

    if training_args.do_eval:
        if "validation" not in dataset:
            raise ValueError("--do_eval requires a validation dataset")
        if data_args.max_eval_samples is not None:
            dataset["validation"] = (
                dataset["validation"].shuffle(seed=training_args.seed).select(range(data_args.max_eval_samples))
            )
        # Set the validation transforms
        dataset["validation"].set_transform(preprocess_val)

    # Initialize our trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"] if training_args.do_train else None,
        eval_dataset=dataset["validation"] if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=feature_extractor,
        data_collator=default_data_collator,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()
        trainer.log_metrics("train", train_result.metrics)
        trainer.save_metrics("train", train_result.metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        metrics = trainer.evaluate()
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Write model card and (optionally) push to hub
    kwargs = {
        "finetuned_from": model_args.model_name_or_path,
        "dataset": data_args.dataset_name,
        "tags": ["image-segmentation", "vision"],
    }
    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)
def convert_segformer_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our SegFormer structure.
    """
    # load default SegFormer configuration
    config = SegformerConfig()
    encoder_only = False

    # set attributes based on model_name
    repo_id = "datasets/huggingface/label-files"
    if "segformer" in model_name:
        size = model_name[len("segformer.") : len("segformer.") + 2]
        if "ade" in model_name:
            config.num_labels = 150
            filename = "ade20k-id2label.json"
            expected_shape = (1, 150, 128, 128)
        elif "city" in model_name:
            config.num_labels = 19
            filename = "cityscapes-id2label.json"
            expected_shape = (1, 19, 128, 128)
        else:
            raise ValueError(f"Model {model_name} not supported")
    elif "mit" in model_name:
        encoder_only = True
        size = model_name[4:6]
        config.num_labels = 1000
        filename = "imagenet-1k-id2label.json"
        expected_shape = (1, 1000)
    else:
        raise ValueError(f"Model {model_name} not supported")

    # set config attributes
    id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}
    if size == "b0":
        pass
    elif size == "b1":
        config.hidden_sizes = [64, 128, 320, 512]
        config.decoder_hidden_size = 256
    elif size == "b2":
        config.hidden_sizes = [64, 128, 320, 512]
        config.decoder_hidden_size = 768
        config.depths = [3, 4, 6, 3]
    elif size == "b3":
        config.hidden_sizes = [64, 128, 320, 512]
        config.decoder_hidden_size = 768
        config.depths = [3, 4, 18, 3]
    elif size == "b4":
        config.hidden_sizes = [64, 128, 320, 512]
        config.decoder_hidden_size = 768
        config.depths = [3, 8, 27, 3]
    elif size == "b5":
        config.hidden_sizes = [64, 128, 320, 512]
        config.decoder_hidden_size = 768
        config.depths = [3, 6, 40, 3]
    else:
        raise ValueError(f"Size {size} not supported")

    # load feature extractor (only resize + normalize)
    feature_extractor = SegformerFeatureExtractor(
        image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False
    )

    # prepare image
    image = prepare_img()
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values

    logger.info(f"Converting model {model_name}...")

    # load original state dict
    if encoder_only:
        state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"))
    else:
        state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"))["state_dict"]

    # rename keys
    state_dict = rename_keys(state_dict, encoder_only=encoder_only)
    if not encoder_only:
        del state_dict["decode_head.conv_seg.weight"]
        del state_dict["decode_head.conv_seg.bias"]

    # key and value matrices need special treatment
    read_in_k_v(state_dict, config)

    # create HuggingFace model and load state dict
    if encoder_only:
        config.reshape_last_stage = False
        model = SegformerForImageClassification(config)
    else:
        model = SegformerForSemanticSegmentation(config)
    model.load_state_dict(state_dict)
    model.eval()

    # forward pass
    outputs = model(pixel_values)
    logits = outputs.logits

    # set expected_slice based on model name
    # ADE20k checkpoints
    if model_name == "segformer.b0.512x512.ade.160k":
        expected_slice = torch.tensor(
            [
                [[-4.6310, -5.5232, -6.2356], [-5.1921, -6.1444, -6.5996], [-5.4424, -6.2790, -6.7574]],
                [[-12.1391, -13.3122, -13.9554], [-12.8732, -13.9352, -14.3563], [-12.9438, -13.8226, -14.2513]],
                [[-12.5134, -13.4686, -14.4915], [-12.8669, -14.4343, -14.7758], [-13.2523, -14.5819, -15.0694]],
            ]
        )
    elif model_name == "segformer.b1.512x512.ade.160k":
        expected_slice = torch.tensor(
            [
                [[-7.5820, -8.7231, -8.3215], [-8.0600, -10.3529, -10.0304], [-7.5208, -9.4103, -9.6239]],
                [[-12.6918, -13.8994, -13.7137], [-13.3196, -15.7523, -15.4789], [-12.9343, -14.8757, -14.9689]],
                [[-11.1911, -11.9421, -11.3243], [-11.3342, -13.6839, -13.3581], [-10.3909, -12.1832, -12.4858]],
            ]
        )
    elif model_name == "segformer.b2.512x512.ade.160k":
        expected_slice = torch.tensor(
            [
                [[-11.8173, -14.3850, -16.3128], [-14.5648, -16.5804, -18.6568], [-14.7223, -15.7387, -18.4218]],
                [[-15.7290, -17.9171, -19.4423], [-18.3105, -19.9448, -21.4661], [-17.9296, -18.6497, -20.7910]],
                [[-15.0783, -17.0336, -18.2789], [-16.8771, -18.6870, -20.1612], [-16.2454, -17.1426, -19.5055]],
            ]
        )
    elif model_name == "segformer.b3.512x512.ade.160k":
        expected_slice = torch.tensor(
            [
                [[-9.0878, -10.2081, -10.1891], [-9.3144, -10.7941, -10.9843], [-9.2294, -10.3855, -10.5704]],
                [[-12.2316, -13.9068, -13.6102], [-12.9161, -14.3702, -14.3235], [-12.5233, -13.7174, -13.7932]],
                [[-14.6275, -15.2490, -14.9727], [-14.3400, -15.9687, -16.2827], [-14.1484, -15.4033, -15.8937]],
            ]
        )
    elif model_name == "segformer.b4.512x512.ade.160k":
        expected_slice = torch.tensor(
            [
                [[-12.3144, -13.2447, -14.0802], [-13.3614, -14.5816, -15.6117], [-13.3340, -14.4433, -16.2219]],
                [[-19.2781, -20.4128, -20.7506], [-20.6153, -21.6566, -22.0998], [-19.9800, -21.0430, -22.1494]],
                [[-18.8739, -19.7804, -21.1834], [-20.1233, -21.6765, -23.2944], [-20.0315, -21.2641, -23.6944]],
            ]
        )
    elif model_name == "segformer.b5.640x640.ade.160k":
        expected_slice = torch.tensor(
            [
                [[-9.5524, -12.0835, -11.7348], [-10.5229, -13.6446, -14.5662], [-9.5842, -12.8851, -13.9414]],
                [[-15.3432, -17.5323, -17.0818], [-16.3330, -18.9255, -19.2101], [-15.1340, -17.7848, -18.3971]],
                [[-12.6072, -14.9486, -14.6631], [-13.7629, -17.0907, -17.7745], [-12.7899, -16.1695, -17.1671]],
            ]
        )
    # Cityscapes checkpoints
    elif model_name == "segformer.b0.1024x1024.city.160k":
        expected_slice = torch.tensor(
            [
                [[-11.9295, -13.4057, -14.8106], [-13.3431, -14.8179, -15.3781], [-14.2836, -15.5942, -16.1588]],
                [[-11.4906, -12.8067, -13.6564], [-13.1189, -14.0500, -14.1543], [-13.8748, -14.5136, -14.8789]],
                [[0.5374, 0.1067, -0.4742], [0.1141, -0.2255, -0.7099], [-0.3000, -0.5924, -1.3105]],
            ]
        )
    elif model_name == "segformer.b0.512x1024.city.160k":
        expected_slice = torch.tensor(
            [
                [[-7.8217, -9.8767, -10.1717], [-9.4438, -10.9058, -11.4047], [-9.7939, -12.3495, -12.1079]],
                [[-7.1514, -9.5336, -10.0860], [-9.7776, -11.6822, -11.8439], [-10.1411, -12.7655, -12.8972]],
                [[0.3021, 0.0805, -0.2310], [-0.0328, -0.1605, -0.2714], [-0.1408, -0.5477, -0.6976]],
            ]
        )
    elif model_name == "segformer.b0.640x1280.city.160k":
        expected_slice = torch.tensor(
            [
                [
                    [-1.1372e01, -1.2787e01, -1.3477e01],
                    [-1.2536e01, -1.4194e01, -1.4409e01],
                    [-1.3217e01, -1.4888e01, -1.5327e01],
                ],
                [
                    [-1.4791e01, -1.7122e01, -1.8277e01],
                    [-1.7163e01, -1.9192e01, -1.9533e01],
                    [-1.7897e01, -1.9991e01, -2.0315e01],
                ],
                [
                    [7.6723e-01, 4.1921e-01, -7.7878e-02],
                    [4.7772e-01, 9.5557e-03, -2.8082e-01],
                    [3.6032e-01, -2.4826e-01, -5.1168e-01],
                ],
            ]
        )
    elif model_name == "segformer.b0.768x768.city.160k":
        expected_slice = torch.tensor(
            [
                [[-9.4959, -11.3087, -11.7479], [-11.0025, -12.6540, -12.3319], [-11.4064, -13.0487, -12.9905]],
                [[-9.8905, -11.3084, -12.0854], [-11.1726, -12.7698, -12.9583], [-11.5985, -13.3278, -14.1774]],
                [[0.2213, 0.0192, -0.2466], [-0.1731, -0.4213, -0.4874], [-0.3126, -0.6541, -1.1389]],
            ]
        )
    elif model_name == "segformer.b1.1024x1024.city.160k":
        expected_slice = torch.tensor(
            [
                [[-13.5748, -13.9111, -12.6500], [-14.3500, -15.3683, -14.2328], [-14.7532, -16.0424, -15.6087]],
                [[-17.1651, -15.8725, -12.9653], [-17.2580, -17.3718, -14.8223], [-16.6058, -16.8783, -16.7452]],
                [[-3.6456, -3.0209, -1.4203], [-3.0797, -3.1959, -2.0000], [-1.8757, -1.9217, -1.6997]],
            ]
        )
    elif model_name == "segformer.b2.1024x1024.city.160k":
        expected_slice = torch.tensor(
            [
                [[-16.0976, -16.4856, -17.3962], [-16.6234, -19.0342, -19.7685], [-16.0900, -18.0661, -19.1180]],
                [[-18.4750, -18.8488, -19.5074], [-19.4030, -22.1570, -22.5977], [-19.1191, -20.8486, -22.3783]],
                [[-4.5178, -5.5037, -6.5109], [-5.0884, -7.2174, -8.0334], [-4.4156, -5.8117, -7.2970]],
            ]
        )
    elif model_name == "segformer.b3.1024x1024.city.160k":
        expected_slice = torch.tensor(
            [
                [[-14.2081, -14.4732, -14.1977], [-14.5867, -16.4423, -16.6356], [-13.4441, -14.9685, -16.8696]],
                [[-14.4576, -14.7073, -15.0451], [-15.0816, -17.6237, -17.9873], [-14.4213, -16.0199, -18.5992]],
                [[-4.7349, -4.9588, -5.0966], [-4.3210, -6.9325, -7.2591], [-3.4312, -4.7484, -7.1917]],
            ]
        )
    elif model_name == "segformer.b4.1024x1024.city.160k":
        expected_slice = torch.tensor(
            [
                [[-11.7737, -11.9526, -11.3273], [-13.6692, -14.4574, -13.8878], [-13.8937, -14.6924, -15.9345]],
                [[-14.6706, -14.5330, -14.1306], [-16.1502, -16.8180, -16.4269], [-16.8338, -17.8939, -20.1746]],
                [[1.0491, 0.8289, 1.0310], [1.1044, 0.5219, 0.8055], [1.0899, 0.6926, 0.5590]],
            ]
        )
    elif model_name == "segformer.b5.1024x1024.city.160k":
        expected_slice = torch.tensor(
            [
                [[-12.5641, -13.4777, -13.0684], [-13.9587, -15.8983, -16.6557], [-13.3109, -15.7350, -16.3141]],
                [[-14.7074, -15.4352, -14.5944], [-16.6353, -18.1663, -18.6120], [-15.1702, -18.0329, -18.1547]],
                [[-1.7990, -2.0951, -1.7784], [-2.6397, -3.8245, -3.9686], [-1.5264, -2.8126, -2.9316]],
            ]
        )
    else:
        predicted_class_idx = logits.argmax(-1).item()
        print("Predicted class:", model.config.id2label[predicted_class_idx])

    # verify logits
    if not encoder_only:
        assert logits.shape == expected_shape
        assert torch.allclose(logits[0, :3, :3, :3], expected_slice, atol=1e-2)

    # finally, save model and feature extractor
    logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...")
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
def cached_file(
    path_or_repo_id: Union[str, os.PathLike],
    filename: str,
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: bool = False,
    proxies: Optional[Dict[str, str]] = None,
    use_auth_token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    subfolder: str = "",
    user_agent: Optional[Union[str, Dict[str, str]]] = None,
    _raise_exceptions_for_missing_entries: bool = True,
    _raise_exceptions_for_connection_errors: bool = True,
    _commit_hash: Optional[str] = None,
):
    """
    Tries to locate a file in a local folder and repo, downloads and caches it if necessary.

    Args:
        path_or_repo_id (`str` or `os.PathLike`):
            This can be either:
            - a string, the *model id* of a model repo on huggingface.co.
            - a path to a *directory* potentially containing the file.
        filename (`str`):
            The name of the file to locate in `path_or_repo`.
        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
            cache should not be used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force to (re-)download the configuration files and override the cached versions if they
            exist.
        resume_download (`bool`, *optional*, defaults to `False`):
            Whether or not to delete incompletely received file. Attempts to resume the download if such a file
            exists.
        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
        use_auth_token (`str` or *bool*, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
            when running `huggingface-cli login` (stored in `~/.huggingface`).
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
            identifier allowed by git.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, will only try to load the tokenizer configuration from local files.
        subfolder (`str`, *optional*, defaults to `""`):
            In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
            specify the folder name here.

    <Tip>

    Passing `use_auth_token=True` is required when you want to use a private model.

    </Tip>

    Returns:
        `Optional[str]`: Returns the resolved file (to the cache folder if downloaded from a repo).

    Examples:

    ```python
    # Download a model weight from the Hub and cache it.
    model_weights_file = cached_file("bert-base-uncased", "pytorch_model.bin")
    ```"""
    # Private arguments
    #     _raise_exceptions_for_missing_entries: if False, do not raise an exception for missing entries but return
    #         None.
    #     _raise_exceptions_for_connection_errors: if False, do not raise an exception for connection errors but return
    #         None.
    #     _commit_hash: passed when we are chaining several calls to various files (e.g. when loading a tokenizer or
    #         a pipeline). If files are cached for this commit hash, avoid calls to head and get from the cache.
    if is_offline_mode() and not local_files_only:
        logger.info("Offline mode: forcing local_files_only=True")
        local_files_only = True
    if subfolder is None:
        subfolder = ""

    path_or_repo_id = str(path_or_repo_id)
    full_filename = os.path.join(subfolder, filename)
    if os.path.isdir(path_or_repo_id):
        resolved_file = os.path.join(os.path.join(path_or_repo_id, subfolder), filename)
        if not os.path.isfile(resolved_file):
            if _raise_exceptions_for_missing_entries:
                raise EnvironmentError(f"Could not locate {full_filename} inside {path_or_repo_id}.")
            else:
                return None
        return resolved_file

    if cache_dir is None:
        cache_dir = TRANSFORMERS_CACHE
    if isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    if _commit_hash is not None:
        # If the file is cached under that commit hash, we return it directly.
        resolved_file = try_to_load_from_cache(cache_dir, path_or_repo_id, full_filename, commit_hash=_commit_hash)
        if resolved_file is not None:
            return resolved_file

    user_agent = http_user_agent(user_agent)
    try:
        # Load from URL or cache if already cached
        with _patch_hf_hub_tqdm():
            resolved_file = hf_hub_download(
                path_or_repo_id,
                filename,
                subfolder=None if len(subfolder) == 0 else subfolder,
                revision=revision,
                cache_dir=cache_dir,
                user_agent=user_agent,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                use_auth_token=use_auth_token,
                local_files_only=local_files_only,
            )
    except RepositoryNotFoundError:
        raise EnvironmentError(
            f"{path_or_repo_id} is not a local folder and is not a valid model identifier "
            "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to "
            "pass a token having permission to this repo with `use_auth_token` or log in with "
            "`huggingface-cli login` and pass `use_auth_token=True`."
        )
    except RevisionNotFoundError:
        raise EnvironmentError(
            f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists "
            "for this model name. Check the model page at "
            f"'https://huggingface.co/{path_or_repo_id}' for available revisions."
        )
    except EntryNotFoundError:
        if not _raise_exceptions_for_missing_entries:
            return None
        if revision is None:
            revision = "main"
        raise EnvironmentError(
            f"{path_or_repo_id} does not appear to have a file named {full_filename}. Checkout "
            f"'https://huggingface.co/{path_or_repo_id}/{revision}' for available files."
        )
    except HTTPError as err:
        # First we try to see if we have a cached version (not up to date):
        resolved_file = try_to_load_from_cache(cache_dir, path_or_repo_id, full_filename, revision=revision)
        if resolved_file is not None:
            return resolved_file
        if not _raise_exceptions_for_connection_errors:
            return None

        raise EnvironmentError(f"There was a specific connection error when trying to load {path_or_repo_id}:\n{err}")
    except ValueError as err:
        # HuggingFace Hub returns a ValueError for a missing file when local_files_only=True; we need to catch it here.
        # This could be caught above along in `EntryNotFoundError` if hf_hub sent a different error message here.
        if (
            LOCAL_FILES_ONLY_HF_ERROR in err.args[0]
            and local_files_only
            and not _raise_exceptions_for_missing_entries
        ):
            return None

        # Otherwise we try to see if we have a cached version (not up to date):
        resolved_file = try_to_load_from_cache(cache_dir, path_or_repo_id, full_filename, revision=revision)
        if resolved_file is not None:
            return resolved_file
        if not _raise_exceptions_for_connection_errors:
            return None
        raise EnvironmentError(
            f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this file, couldn't find it in the"
            f" cached files and it looks like {path_or_repo_id} is not the path to a directory containing a file named"
            f" {full_filename}.\nCheckout your internet connection or see how to run the library in offline mode at"
            " 'https://huggingface.co/docs/transformers/installation#offline-mode'."
        )
    return resolved_file
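# Usage sketch for cached_file: resolve a standard weights file, then probe for
# an optional file without raising via the private flag documented above
# (repo id illustrative).
def _example_cached_file():
    weights = cached_file("bert-base-uncased", "pytorch_model.bin")
    maybe_index = cached_file(
        "bert-base-uncased",
        "pytorch_model.bin.index.json",
        _raise_exceptions_for_missing_entries=False,
    )  # None when the repo has no sharded-weights index
    return weights, maybe_index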
def convert_vilt_checkpoint(checkpoint_url, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our ViLT structure.
    """
    # define configuration and initialize HuggingFace model
    config = ViltConfig(image_size=384, patch_size=32, tie_word_embeddings=False)
    mlm_model = False
    vqa_model = False
    nlvr_model = False
    irtr_model = False
    if "vqa" in checkpoint_url:
        vqa_model = True
        config.num_labels = 3129
        repo_id = "datasets/huggingface/label-files"
        filename = "vqa2-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
        model = ViltForQuestionAnswering(config)
    elif "nlvr" in checkpoint_url:
        nlvr_model = True
        config.num_labels = 2
        config.id2label = {0: "False", 1: "True"}
        config.label2id = {v: k for k, v in config.id2label.items()}
        config.modality_type_vocab_size = 3
        model = ViltForImagesAndTextClassification(config)
    elif "irtr" in checkpoint_url:
        irtr_model = True
        model = ViltForImageAndTextRetrieval(config)
    elif "mlm_itm" in checkpoint_url:
        mlm_model = True
        model = ViltForMaskedLM(config)
    else:
        raise ValueError("Unknown model type")

    # load state_dict of original model, remove and rename some keys
    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["state_dict"]
    rename_keys = create_rename_keys(config, vqa_model, nlvr_model, irtr_model)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    read_in_q_k_v(state_dict, config)
    if mlm_model or irtr_model:
        ignore_keys = ["itm_score.fc.weight", "itm_score.fc.bias"]
        for k in ignore_keys:
            state_dict.pop(k, None)

    # load state dict into HuggingFace model
    model.eval()
    if mlm_model:
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
        assert missing_keys == ["mlm_score.decoder.bias"]
    else:
        model.load_state_dict(state_dict)

    # Define processor
    feature_extractor = ViltFeatureExtractor(size=384)
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    processor = ViltProcessor(feature_extractor, tokenizer)

    # Forward pass on example inputs (image + text)
    if nlvr_model:
        image1 = Image.open(requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg", stream=True).raw)
        image2 = Image.open(requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg", stream=True).raw)
        text = (
            "The left image contains twice the number of dogs as the right image, and at least two dogs in total are"
            " standing."
        )
        encoding_1 = processor(image1, text, return_tensors="pt")
        encoding_2 = processor(image2, text, return_tensors="pt")
        outputs = model(
            input_ids=encoding_1.input_ids,
            pixel_values=encoding_1.pixel_values,
            pixel_values_2=encoding_2.pixel_values,
        )
    else:
        image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
        if mlm_model:
            text = "a bunch of [MASK] laying on a [MASK]."
        else:
            text = "How many cats are there?"
        encoding = processor(image, text, return_tensors="pt")
        outputs = model(**encoding)

    # Verify outputs
    if mlm_model:
        expected_shape = torch.Size([1, 11, 30522])
        expected_slice = torch.tensor([-12.5061, -12.5123, -12.5174])
        assert outputs.logits.shape == expected_shape
        assert torch.allclose(outputs.logits[0, 0, :3], expected_slice, atol=1e-4)

        # verify masked token prediction equals "cats"
        predicted_id = outputs.logits[0, 4, :].argmax(-1).item()
        assert tokenizer.decode([predicted_id]) == "cats"
    elif vqa_model:
        expected_shape = torch.Size([1, 3129])
        expected_slice = torch.tensor([-15.9495, -18.1472, -10.3041])
        assert outputs.logits.shape == expected_shape
        assert torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)

        # verify vqa prediction equals "2"
        predicted_idx = outputs.logits.argmax(-1).item()
        assert model.config.id2label[predicted_idx] == "2"
    elif nlvr_model:
        expected_shape = torch.Size([1, 2])
        expected_slice = torch.tensor([-2.8721, 2.1291])
        assert outputs.logits.shape == expected_shape
        assert torch.allclose(outputs.logits[0, :2], expected_slice, atol=1e-4)

    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    print(f"Saving model and processor to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    processor.save_pretrained(pytorch_dump_folder_path)
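# A minimal CLI wrapper for the converter above; the flag names are an
# assumption, not taken from the original script.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint_url", type=str, required=True,
                        help="URL of the original ViLT checkpoint.")
    parser.add_argument("--pytorch_dump_folder_path", type=str, required=True,
                        help="Directory where the converted model and processor are saved.")
    args = parser.parse_args()
    convert_vilt_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path)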
# @author Loreto Parisi (loretoparisi at gmail dot com)
# Copyright (c) 2022 Loreto Parisi (loretoparisi at gmail dot com)
import os

from huggingface_hub import hf_hub_download

import fasttext


def prob2labels(predictions, decimal_places=2):
    labels = {}
    # predictions are ordered by descending probability
    int2label_dict = predictions[0]
    for index, value in enumerate(predictions[1]):
        label = int2label_dict[index].replace('__label__', '')
        labels[label] = round(value, decimal_places)
    return labels


model_path = hf_hub_download(
    "julien-c/fasttext-language-id",
    "lid.176.bin",
    cache_dir=os.getenv("cache_dir", "../../models"),
)
# Warning: `load_model` no longer returns a WordVectorModel or SupervisedModel,
# but a `FastText` object which is very similar.
# LP: ignore this warning: https://fasttext.cc/blog/2019/06/25/blog-post.html
model = fasttext.load_model(model_path)

sequence = "The head of the United Nations says there is no military solution in Syria"
num_labels = 3
predictions = model.predict(sequence, k=num_labels)
print(sequence, prob2labels(predictions))
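# A small variation on the call above: fastText's `predict` also takes a
# `threshold` argument; with k=-1 it returns every label whose probability
# clears the floor rather than a fixed top-k (0.05 here is an arbitrary choice).
thresholded = model.predict(sequence, k=-1, threshold=0.05)
print(sequence, prob2labels(thresholded))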
def convert_perceiver_checkpoint(pickle_file, pytorch_dump_folder_path, architecture="MLM"):
    with open(pickle_file, "rb") as f:
        checkpoint = pickle.loads(f.read())

    state = None
    if isinstance(checkpoint, dict) and architecture in [
        "image_classification",
        "image_classification_fourier",
        "image_classification_conv",
    ]:
        params = checkpoint["params"]
        state = checkpoint["state"]
    else:
        params = checkpoint

    state_dict = dict()
    for scope_name, parameters in hk.data_structures.to_mutable_dict(params).items():
        for param_name, param in parameters.items():
            state_dict[scope_name + "/" + param_name] = param
    if state is not None:
        for scope_name, parameters in hk.data_structures.to_mutable_dict(state).items():
            for param_name, param in parameters.items():
                state_dict[scope_name + "/" + param_name] = param

    rename_keys(state_dict, architecture=architecture)

    cfg = PerceiverConfig()
    subsampling = None
    repo_id = "datasets/huggingface/label-files"
    if architecture == "MLM":
        cfg.qk_channels = 8 * 32
        cfg.v_channels = 1280
        model = PerceiverForMaskedLM(cfg)
    elif "image_classification" in architecture:
        cfg.num_latents = 512
        cfg.d_latents = 1024
        cfg.d_model = 512
        cfg.num_blocks = 8
        cfg.num_self_attends_per_block = 6
        cfg.num_cross_attention_heads = 1
        cfg.num_self_attention_heads = 8
        cfg.qk_channels = None
        cfg.v_channels = None
        # set labels
        cfg.num_labels = 1000
        filename = "imagenet-1k-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        cfg.id2label = id2label
        cfg.label2id = {v: k for k, v in id2label.items()}
        if architecture == "image_classification":
            cfg.image_size = 224
            model = PerceiverForImageClassificationLearned(cfg)
        elif architecture == "image_classification_fourier":
            cfg.d_model = 261
            model = PerceiverForImageClassificationFourier(cfg)
        elif architecture == "image_classification_conv":
            cfg.d_model = 322
            model = PerceiverForImageClassificationConvProcessing(cfg)
        else:
            raise ValueError(f"Architecture {architecture} not supported")
    elif architecture == "optical_flow":
        cfg.num_latents = 2048
        cfg.d_latents = 512
        cfg.d_model = 322
        cfg.num_blocks = 1
        cfg.num_self_attends_per_block = 24
        cfg.num_self_attention_heads = 16
        cfg.num_cross_attention_heads = 1
        model = PerceiverForOpticalFlow(cfg)
    elif architecture == "multimodal_autoencoding":
        cfg.num_latents = 28 * 28 * 1
        cfg.d_latents = 512
        cfg.d_model = 704
        cfg.num_blocks = 1
        cfg.num_self_attends_per_block = 8
        cfg.num_self_attention_heads = 8
        cfg.num_cross_attention_heads = 1
        cfg.num_labels = 700
        # define dummy inputs + subsampling (as each forward pass is only on a chunk of image + audio data)
        images = torch.randn((1, 16, 3, 224, 224))
        audio = torch.randn((1, 30720, 1))
        nchunks = 128
        image_chunk_size = np.prod((16, 224, 224)) // nchunks
        audio_chunk_size = audio.shape[1] // cfg.samples_per_patch // nchunks
        # process the first chunk
        chunk_idx = 0
        subsampling = {
            "image": torch.arange(image_chunk_size * chunk_idx, image_chunk_size * (chunk_idx + 1)),
            "audio": torch.arange(audio_chunk_size * chunk_idx, audio_chunk_size * (chunk_idx + 1)),
            "label": None,
        }
        model = PerceiverForMultimodalAutoencoding(cfg)
        # set labels
        filename = "kinetics700-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        cfg.id2label = id2label
        cfg.label2id = {v: k for k, v in id2label.items()}
    else:
        raise ValueError(f"Architecture {architecture} not supported")
    model.eval()

    model.load_state_dict(state_dict)

    input_mask = None
    if architecture == "MLM":
        tokenizer = PerceiverTokenizer.from_pretrained("/Users/NielsRogge/Documents/Perceiver/Tokenizer files")
        text = "This is an incomplete sentence where some words are missing."
        encoding = tokenizer(text, padding="max_length", return_tensors="pt")
        # mask " missing.". Note that the model performs much better if the masked chunk starts with a space.
        encoding.input_ids[0, 51:60] = tokenizer.mask_token_id
        inputs = encoding.input_ids
        input_mask = encoding.attention_mask
    elif architecture in [
        "image_classification",
        "image_classification_fourier",
        "image_classification_conv",
    ]:
        feature_extractor = PerceiverFeatureExtractor()
        image = prepare_img()
        encoding = feature_extractor(image, return_tensors="pt")
        inputs = encoding.pixel_values
    elif architecture == "optical_flow":
        inputs = torch.randn(1, 2, 27, 368, 496)
    elif architecture == "multimodal_autoencoding":
        images = torch.randn((1, 16, 3, 224, 224))
        audio = torch.randn((1, 30720, 1))
        inputs = dict(image=images, audio=audio, label=torch.zeros((images.shape[0], 700)))

    if architecture == "multimodal_autoencoding":
        outputs = model(inputs=inputs, attention_mask=input_mask, subsampled_output_points=subsampling)
    else:
        outputs = model(inputs=inputs, attention_mask=input_mask)

    logits = outputs.logits
    if not isinstance(logits, dict):
        print("Shape of logits:", logits.shape)
    else:
        for k, v in logits.items():
            print(f"Shape of logits of modality {k}", v.shape)

    if architecture == "MLM":
        expected_slice = torch.tensor(
            [
                [-11.8336, -11.6850, -11.8483],
                [-12.8149, -12.5863, -12.7904],
                [-12.8440, -12.6410, -12.8646],
            ]
        )
        assert torch.allclose(logits[0, :3, :3], expected_slice)
        masked_tokens_predictions = logits[0, 51:60].argmax(dim=-1).tolist()
        expected_list = [38, 115, 111, 121, 121, 111, 116, 109, 52]
        assert masked_tokens_predictions == expected_list
        print("Greedy predictions:")
        print(masked_tokens_predictions)
        print()
        print("Predicted string:")
        print(tokenizer.decode(masked_tokens_predictions))
    elif architecture in [
        "image_classification",
        "image_classification_fourier",
        "image_classification_conv",
    ]:
        print("Predicted class:", model.config.id2label[logits.argmax(-1).item()])

    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    print(f"Saving model to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
def convert_weights_and_push(save_directory: Path, model_name: str = None, push_to_hub: bool = True):
    filename = "imagenet-1k-id2label.json"
    num_labels = 1000
    repo_id = "datasets/huggingface/label-files"

    id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    label2id = {v: k for k, v in id2label.items()}

    ImageNetPreTrainedConfig = partial(VanConfig, num_labels=num_labels, id2label=id2label, label2id=label2id)

    names_to_config = {
        "van-tiny": ImageNetPreTrainedConfig(
            hidden_sizes=[32, 64, 160, 256],
            depths=[3, 3, 5, 2],
            mlp_ratios=[8, 8, 4, 4],
        ),
        "van-small": ImageNetPreTrainedConfig(
            hidden_sizes=[64, 128, 320, 512],
            depths=[2, 2, 4, 2],
            mlp_ratios=[8, 8, 4, 4],
        ),
        "van-base": ImageNetPreTrainedConfig(
            hidden_sizes=[64, 128, 320, 512],
            depths=[3, 3, 12, 3],
            mlp_ratios=[8, 8, 4, 4],
        ),
        "van-large": ImageNetPreTrainedConfig(
            hidden_sizes=[64, 128, 320, 512],
            depths=[3, 5, 27, 3],
            mlp_ratios=[8, 8, 4, 4],
        ),
    }

    names_to_original_models = {
        "van-tiny": van_tiny,
        "van-small": van_small,
        "van-base": van_base,
        "van-large": van_large,
    }

    names_to_original_checkpoints = {
        "van-tiny": "https://huggingface.co/Visual-Attention-Network/VAN-Tiny-original/resolve/main/van_tiny_754.pth.tar",
        "van-small": "https://huggingface.co/Visual-Attention-Network/VAN-Small-original/resolve/main/van_small_811.pth.tar",
        "van-base": "https://huggingface.co/Visual-Attention-Network/VAN-Base-original/resolve/main/van_base_828.pth.tar",
        "van-large": "https://huggingface.co/Visual-Attention-Network/VAN-Large-original/resolve/main/van_large_839.pth.tar",
    }

    if model_name:
        convert_weight_and_push(
            model_name,
            names_to_config[model_name],
            checkpoint=names_to_original_checkpoints[model_name],
            from_model=names_to_original_models[model_name](),
            save_directory=save_directory,
            push_to_hub=push_to_hub,
        )
    else:
        for model_name, config in names_to_config.items():
            convert_weight_and_push(
                model_name,
                config,
                checkpoint=names_to_original_checkpoints[model_name],
                from_model=names_to_original_models[model_name](),
                save_directory=save_directory,
                push_to_hub=push_to_hub,
            )
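# Example invocation of the function above (the save directory is a
# placeholder; passing model_name=None converts every variant in
# names_to_config):
if __name__ == "__main__":
    convert_weights_and_push(Path("converted_van"), model_name="van-tiny", push_to_hub=False)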
def main():
    args = get_args()

    is_finetuned = "ft1k" in args.hf_checkpoint_name
    is_large = "large" in args.hf_checkpoint_name

    if is_finetuned:
        # To convert Beit's data2vec_vision to HF you need to copy
        # https://github.com/facebookresearch/data2vec_vision/blob/main/beit/modeling_finetune.py
        # into this folder.
        import modeling_finetune  # noqa: F401
    else:
        # To convert Beit's data2vec_vision to HF you need to copy
        # https://github.com/facebookresearch/data2vec_vision/blob/main/beit/modeling_cyclical.py
        # into this folder.
        # IMPORTANT: Note that for now we've only converted the down-stream
        # model and not the full pretrained model. This means for the integration
        # test you need to add a `return x` after the following line:
        # https://github.com/facebookresearch/data2vec_vision/blob/af9a36349aaed59ae66e69b5dabeef2d62fdc5da/beit/modeling_cyclical.py#L197
        # to make the integration test pass.
        import modeling_cyclical  # noqa: F401

    # 1. Create model config
    config = Data2VecVisionConfig()
    if is_finetuned:
        config.use_relative_position_bias = True
        config.use_shared_relative_position_bias = False
        config.use_mean_pooling = True
        config.num_labels = 1000

        repo_id = "datasets/huggingface/label-files"
        filename = "imagenet-1k-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
    else:
        config.use_relative_position_bias = False
        config.use_shared_relative_position_bias = True
        config.use_mean_pooling = False

    if is_large:
        config.hidden_size = 1024
        config.intermediate_size = 4096
        config.num_hidden_layers = 24
        config.num_attention_heads = 16

    # 2. Load Beit model
    orig_model = load_beit_model(args, is_finetuned, is_large)
    orig_model.eval()

    # 3. Forward Beit model
    feature_extractor = BeitFeatureExtractor(size=config.image_size, do_center_crop=False)
    image = Image.open("../../../../tests/fixtures/tests_samples/COCO/000000039769.png")
    encoding = feature_extractor(images=image, return_tensors="pt")
    pixel_values = encoding["pixel_values"]

    orig_args = (pixel_values,) if is_finetuned else (pixel_values, None)
    with torch.no_grad():
        orig_model_output = orig_model(*orig_args)

    # 4. Load HF Data2VecVision model
    if is_finetuned:
        hf_model = Data2VecVisionForImageClassification(config)
        hf_model.eval()
        has_lm_head = False
        hf_prefix = "data2vec_vision."
    else:
        hf_model = Data2VecVisionModel(config)
        hf_model.eval()
        has_lm_head = True
        hf_prefix = ""

    rename_keys = create_rename_keys(config, hf_prefix=hf_prefix, has_lm_head=has_lm_head)
    state_dict = orig_model.state_dict()
    for src, dest in rename_keys:
        val = state_dict.pop(src)
        state_dict[dest] = val
    read_in_q_k_v(state_dict, config, hf_prefix=hf_prefix, has_lm_head=has_lm_head)
    missing_keys, unexpected_keys = hf_model.load_state_dict(state_dict, strict=False)
    print("HF missing", missing_keys)
    print("HF unexpected_keys", unexpected_keys)

    # 5. Forward HF Data2VecVision model
    with torch.no_grad():
        hf_model_output = hf_model(pixel_values)
    hf_output = hf_model_output.logits if is_finetuned else hf_model_output.last_hidden_state

    # 6. Compare
    max_absolute_diff = torch.max(torch.abs(hf_output - orig_model_output)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")
    success = torch.allclose(hf_output, orig_model_output, atol=1e-3)
    print("Do both models output the same tensors?", "🔥" if success else "💩")
    if not success:
        raise Exception("Something went wRoNg")

    # 7. Save
    print(f"Saving to {args.hf_checkpoint_name}")
    hf_model.save_pretrained(args.hf_checkpoint_name)
    feature_extractor.save_pretrained(args.hf_checkpoint_name)
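# `get_args` is referenced above but not shown here; a plausible minimal
# version (the flag names are assumptions, not taken from the original script):
def get_args():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--beit_checkpoint", type=str, required=True,
                        help="Path to the original data2vec_vision (Beit) checkpoint.")
    parser.add_argument("--hf_checkpoint_name", type=str, required=True,
                        help="Output path; 'ft1k' and 'large' in the name select the config above.")
    return parser.parse_args()


if __name__ == "__main__":
    main()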
def convert_poolformer_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our PoolFormer structure.
    """
    # load default PoolFormer configuration
    config = PoolFormerConfig()

    # set attributes based on model_name
    repo_id = "datasets/huggingface/label-files"
    size = model_name[-3:]
    config.num_labels = 1000
    filename = "imagenet-1k-id2label.json"
    expected_shape = (1, 1000)

    # set config attributes
    id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}
    if size == "s12":
        config.depths = [2, 2, 6, 2]
        config.hidden_sizes = [64, 128, 320, 512]
        config.mlp_ratio = 4.0
        crop_pct = 0.9
    elif size == "s24":
        config.depths = [4, 4, 12, 4]
        config.hidden_sizes = [64, 128, 320, 512]
        config.mlp_ratio = 4.0
        crop_pct = 0.9
    elif size == "s36":
        config.depths = [6, 6, 18, 6]
        config.hidden_sizes = [64, 128, 320, 512]
        config.mlp_ratio = 4.0
        config.layer_scale_init_value = 1e-6
        crop_pct = 0.9
    elif size == "m36":
        config.depths = [6, 6, 18, 6]
        config.hidden_sizes = [96, 192, 384, 768]
        config.mlp_ratio = 4.0
        config.layer_scale_init_value = 1e-6
        crop_pct = 0.95
    elif size == "m48":
        config.depths = [8, 8, 24, 8]
        config.hidden_sizes = [96, 192, 384, 768]
        config.mlp_ratio = 4.0
        config.layer_scale_init_value = 1e-6
        crop_pct = 0.95
    else:
        raise ValueError(f"Size {size} not supported")

    # load feature extractor and prepare example image
    feature_extractor = PoolFormerFeatureExtractor(crop_pct=crop_pct)
    image = prepare_img()
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values

    logger.info(f"Converting model {model_name}...")

    # load original state dict
    state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"))

    # rename keys
    state_dict = rename_keys(state_dict)

    # create HuggingFace model and load state dict
    model = PoolFormerForImageClassification(config)
    model.load_state_dict(state_dict)
    model.eval()

    # forward pass
    outputs = model(pixel_values)
    logits = outputs.logits

    # define expected logit slices for different models
    if size == "s12":
        expected_slice = torch.tensor([-0.3045, -0.6758, -0.4869])
    elif size == "s24":
        expected_slice = torch.tensor([0.4402, -0.1374, -0.8045])
    elif size == "s36":
        expected_slice = torch.tensor([-0.6080, -0.5133, -0.5898])
    elif size == "m36":
        expected_slice = torch.tensor([0.3952, 0.2263, -1.2668])
    elif size == "m48":
        expected_slice = torch.tensor([0.1167, -0.0656, -0.3423])
    else:
        raise ValueError(f"Size {size} not supported")

    # verify logits
    assert logits.shape == expected_shape
    assert torch.allclose(logits[0, :3], expected_slice, atol=1e-2)

    # finally, save model and feature extractor
    logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...")
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
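# Example invocation of the converter above (both paths are placeholders;
# the trailing size suffix of model_name selects the configuration):
if __name__ == "__main__":
    convert_poolformer_checkpoint(
        model_name="poolformer_s12",
        checkpoint_path="poolformer_s12.pth.tar",
        pytorch_dump_folder_path="converted_poolformer_s12",
    )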
def convert_beit_checkpoint(checkpoint_url, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our BEiT structure.
    """
    # define default BEiT configuration
    config = BeitConfig()
    has_lm_head = False
    is_semantic = False
    repo_id = "datasets/huggingface/label-files"

    # set config parameters based on URL
    if checkpoint_url[-9:-4] == "pt22k":
        # masked image modeling
        config.use_shared_relative_position_bias = True
        config.use_mask_token = True
        has_lm_head = True
    elif checkpoint_url[-9:-4] == "ft22k":
        # intermediate fine-tuning on ImageNet-22k
        config.use_relative_position_bias = True
        config.num_labels = 21841
        filename = "imagenet-22k-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        # this dataset contains 21843 labels but the model only has 21841
        # we delete the classes as mentioned in https://github.com/google-research/big_transfer/issues/18
        del id2label[9205]
        del id2label[15027]
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
    elif checkpoint_url[-8:-4] == "to1k":
        # fine-tuning on ImageNet-1k
        config.use_relative_position_bias = True
        config.num_labels = 1000
        filename = "imagenet-1k-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
        if "384" in checkpoint_url:
            config.image_size = 384
        if "512" in checkpoint_url:
            config.image_size = 512
    elif "ade20k" in checkpoint_url:
        # fine-tuning
        config.use_relative_position_bias = True
        config.num_labels = 150
        filename = "ade20k-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
        config.image_size = 640
        is_semantic = True
    else:
        raise ValueError("Checkpoint not supported, URL should either end with 'pt22k', 'ft22k', 'to1k' or 'ade20k'")

    # size of the architecture
    if "base" in checkpoint_url:
        pass
    elif "large" in checkpoint_url:
        config.hidden_size = 1024
        config.intermediate_size = 4096
        config.num_hidden_layers = 24
        config.num_attention_heads = 16
        if "ade20k" in checkpoint_url:
            config.image_size = 640
            config.out_indices = [7, 11, 15, 23]
    else:
        raise ValueError("Should either find 'base' or 'large' in checkpoint URL")

    # load state_dict of original model, remove and rename some keys
    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", check_hash=True)
    state_dict = state_dict["model"] if "ade20k" not in checkpoint_url else state_dict["state_dict"]

    rename_keys = create_rename_keys(config, has_lm_head=has_lm_head, is_semantic=is_semantic)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    read_in_q_k_v(state_dict, config, has_lm_head=has_lm_head, is_semantic=is_semantic)
    if is_semantic:
        # add prefix to decoder keys
        for key, val in state_dict.copy().items():
            val = state_dict.pop(key)
            if key.startswith("backbone.fpn"):
                key = key.replace("backbone.fpn", "fpn")
            state_dict[key] = val

    # load HuggingFace model
    if checkpoint_url[-9:-4] == "pt22k":
        model = BeitForMaskedImageModeling(config)
    elif "ade20k" in checkpoint_url:
        model = BeitForSemanticSegmentation(config)
    else:
        model = BeitForImageClassification(config)
    model.eval()
    model.load_state_dict(state_dict)

    # Check outputs on an image
    if is_semantic:
        feature_extractor = BeitFeatureExtractor(size=config.image_size, do_center_crop=False)
        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
        image = Image.open(ds[0]["file"])
    else:
        feature_extractor = BeitFeatureExtractor(size=config.image_size, resample=Image.BILINEAR, do_center_crop=False)
        image = prepare_img()

    encoding = feature_extractor(images=image, return_tensors="pt")
    pixel_values = encoding["pixel_values"]

    outputs = model(pixel_values)
    logits = outputs.logits

    # verify logits
    expected_shape = torch.Size([1, 1000])
    if checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k"):
        expected_shape = torch.Size([1, 196, 8192])
    elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k"):
        expected_shape = torch.Size([1, 196, 8192])
    elif checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k_ft22k"):
        expected_shape = torch.Size([1, 21841])
        expected_logits = torch.tensor([2.2288, 2.4671, 0.7395])
        expected_class_idx = 2397
    elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k_ft22k"):
        expected_shape = torch.Size([1, 21841])
        expected_logits = torch.tensor([1.6881, -0.2787, 0.5901])
        expected_class_idx = 2396
    elif checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k_ft1k"):
        expected_logits = torch.tensor([0.1241, 0.0798, -0.6569])
        expected_class_idx = 285
    elif checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k_ft22kto1k"):
        expected_logits = torch.tensor([-1.2385, -1.0987, -1.0108])
        expected_class_idx = 281
    elif checkpoint_url[:-4].endswith("beit_base_patch16_384_pt22k_ft22kto1k"):
        expected_logits = torch.tensor([-1.5303, -0.9484, -0.3147])
        expected_class_idx = 761
    elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k_ft1k"):
        expected_logits = torch.tensor([0.4610, -0.0928, 0.2086])
        expected_class_idx = 761
    elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k_ft22kto1k"):
        expected_logits = torch.tensor([-0.4804, 0.6257, -0.1837])
        expected_class_idx = 761
    elif checkpoint_url[:-4].endswith("beit_large_patch16_384_pt22k_ft22kto1k"):
        expected_logits = torch.tensor([-0.5122, 0.5117, -0.2113])
        expected_class_idx = 761
    elif checkpoint_url[:-4].endswith("beit_large_patch16_512_pt22k_ft22kto1k"):
        expected_logits = torch.tensor([-0.3062, 0.7261, 0.4852])
        expected_class_idx = 761
    elif checkpoint_url[:-4].endswith("beit_base_patch16_640_pt22k_ft22ktoade20k"):
        expected_shape = (1, 150, 160, 160)
        expected_logits = torch.tensor(
            [
                [[-4.9225, -2.3954, -3.0522], [-2.8822, -1.0046, -1.7561], [-2.9549, -1.3228, -2.1347]],
                [[-5.8168, -3.4129, -4.0778], [-3.8651, -2.2214, -3.0277], [-3.8356, -2.4643, -3.3535]],
                [[-0.0078, 3.9952, 4.0754], [2.9856, 4.6944, 5.0035], [3.2413, 4.7813, 4.9969]],
            ]
        )
    elif checkpoint_url[:-4].endswith("beit_large_patch16_640_pt22k_ft22ktoade20k"):
        expected_shape = (1, 150, 160, 160)
        expected_logits = torch.tensor(
            [
                [[-4.3305, -2.3049, -3.0161], [-2.9591, -1.5305, -2.2251], [-3.4198, -1.8004, -2.9062]],
                [[-5.8922, -3.7435, -4.3978], [-4.2063, -2.7872, -3.4755], [-4.2791, -3.1874, -4.1681]],
                [[0.9895, 4.3467, 4.7663], [4.2476, 5.6830, 6.1518], [4.5550, 6.2495, 6.5154]],
            ]
        )
    else:
        raise ValueError("Can't verify logits as model is not supported")

    assert logits.shape == expected_shape, "Shape of logits not as expected"
    if not has_lm_head:
        if is_semantic:
            assert torch.allclose(logits[0, :3, :3, :3], expected_logits, atol=1e-3), "First elements of logits not as expected"
        else:
            print("Predicted class idx:", logits.argmax(-1).item())
            assert torch.allclose(logits[0, :3], expected_logits, atol=1e-3), "First elements of logits not as expected"
            assert logits.argmax(-1).item() == expected_class_idx, "Predicted class index not as expected"

    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    print(f"Saving model to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
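# After conversion, the dump folder can be reloaded like any other checkpoint;
# a sketch for a classification checkpoint ("converted_beit" is a placeholder
# path, and pt22k/ade20k checkpoints would use their respective model classes):
#
#   model = BeitForImageClassification.from_pretrained("converted_beit")
#   feature_extractor = BeitFeatureExtractor.from_pretrained("converted_beit")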
def __init__(self, model_id: str):
    model_path = hf_hub_download(model_id, "model.bin", library_name="fasttext")
    self.model = fasttext.load_model(model_path)
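# Hypothetical driver for the method above, assuming it lives on a small
# wrapper class (here called `FasttextWrapper`) and that the referenced repo
# ships a `model.bin` (both the class name and repo id are placeholders):
#
#   wrapper = FasttextWrapper("my-user/my-fasttext-model")
#   labels, probs = wrapper.model.predict("some input text", k=3)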
def prepare_video():
    file = hf_hub_download(
        repo_id="datasets/hf-internal-testing/spaghetti-video",
        filename="eating_spaghetti.npy",
    )
    video = np.load(file)
    return list(video)
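# Quick sanity check of the helper above (this assumes the .npy payload is a
# stack of HxWxC frames, which `list(video)` splits into per-frame arrays):
frames = prepare_video()
print(len(frames), frames[0].shape)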
def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path, model_name, push_to_hub):
    config = get_videomae_config(model_name)

    if "finetuned" in model_name:
        model = VideoMAEForVideoClassification(config)
    else:
        model = VideoMAEForPreTraining(config)

    # download original checkpoint, hosted on Google Drive
    output = "pytorch_model.bin"
    gdown.cached_download(checkpoint_url, output, quiet=False)
    files = torch.load(output, map_location="cpu")
    if "model" in files:
        state_dict = files["model"]
    else:
        state_dict = files["module"]
    new_state_dict = convert_state_dict(state_dict, config)

    model.load_state_dict(new_state_dict)
    model.eval()

    # verify model on basic input
    feature_extractor = VideoMAEFeatureExtractor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
    video = prepare_video()
    inputs = feature_extractor(video, return_tensors="pt")

    if "finetuned" not in model_name:
        local_path = hf_hub_download(repo_id="hf-internal-testing/bool-masked-pos", filename="bool_masked_pos.pt")
        inputs["bool_masked_pos"] = torch.load(local_path)

    outputs = model(**inputs)
    logits = outputs.logits

    model_names = [
        # Kinetics-400 checkpoints (short = pretrained only for 800 epochs instead of 1600)
        "videomae-base-short",
        "videomae-base-short-finetuned-kinetics",
        "videomae-base",
        "videomae-base-finetuned-kinetics",
        "videomae-large",
        "videomae-large-finetuned-kinetics",
        # Something-Something-v2 checkpoints (short = pretrained only for 800 epochs instead of 2400)
        "videomae-base-short-ssv2",
        "videomae-base-short-finetuned-ssv2",
        "videomae-base-ssv2",
        "videomae-base-finetuned-ssv2",
    ]

    # NOTE: logits were tested with image_mean and image_std equal to [0.5, 0.5, 0.5] and [0.5, 0.5, 0.5]
    if model_name == "videomae-base":
        expected_shape = torch.Size([1, 1408, 1536])
        expected_slice = torch.tensor([[0.7739, 0.7968, 0.7089], [0.6701, 0.7487, 0.6209], [0.4287, 0.5158, 0.4773]])
    elif model_name == "videomae-base-short":
        expected_shape = torch.Size([1, 1408, 1536])
        expected_slice = torch.tensor([[0.7994, 0.9612, 0.8508], [0.7401, 0.8958, 0.8302], [0.5862, 0.7468, 0.7325]])
        # we verified the loss both for normalized and unnormalized targets for this one
        expected_loss = torch.tensor([0.5142]) if config.norm_pix_loss else torch.tensor([0.6469])
    elif model_name == "videomae-large":
        expected_shape = torch.Size([1, 1408, 1536])
        expected_slice = torch.tensor([[0.7149, 0.7997, 0.6966], [0.6768, 0.7869, 0.6948], [0.5139, 0.6221, 0.5605]])
    elif model_name == "videomae-large-finetuned-kinetics":
        expected_shape = torch.Size([1, 400])
        expected_slice = torch.tensor([0.0771, 0.0011, -0.3625])
    elif model_name == "videomae-base-short-finetuned-kinetics":
        expected_shape = torch.Size([1, 400])
        expected_slice = torch.tensor([0.6588, 0.0990, -0.2493])
    elif model_name == "videomae-base-finetuned-kinetics":
        expected_shape = torch.Size([1, 400])
        expected_slice = torch.tensor([0.3669, -0.0688, -0.2421])
    elif model_name == "videomae-base-short-ssv2":
        expected_shape = torch.Size([1, 1408, 1536])
        expected_slice = torch.tensor([[0.4712, 0.5296, 0.5786], [0.2278, 0.2729, 0.4026], [0.0352, 0.0730, 0.2506]])
    elif model_name == "videomae-base-short-finetuned-ssv2":
        expected_shape = torch.Size([1, 174])
        expected_slice = torch.tensor([-0.0537, -0.1539, -0.3266])
    elif model_name == "videomae-base-ssv2":
        expected_shape = torch.Size([1, 1408, 1536])
        expected_slice = torch.tensor([[0.8131, 0.8727, 0.8546], [0.7366, 0.9377, 0.8870], [0.5935, 0.8874, 0.8564]])
    elif model_name == "videomae-base-finetuned-ssv2":
        expected_shape = torch.Size([1, 174])
        expected_slice = torch.tensor([0.1961, -0.8337, -0.6389])
    else:
        raise ValueError(f"Model name not supported. Should be one of {model_names}")

    # verify logits
    assert logits.shape == expected_shape
    if "finetuned" in model_name:
        assert torch.allclose(logits[0, :3], expected_slice, atol=1e-4)
    else:
        print("Logits:", logits[0, :3, :3])
        assert torch.allclose(logits[0, :3, :3], expected_slice, atol=1e-4)
    print("Logits ok!")

    # verify loss, if applicable
    if model_name == "videomae-base-short":
        loss = outputs.loss
        assert torch.allclose(loss, expected_loss, atol=1e-4)
        print("Loss ok!")

    if pytorch_dump_folder_path is not None:
        print(f"Saving model and feature extractor to {pytorch_dump_folder_path}")
        feature_extractor.save_pretrained(pytorch_dump_folder_path)
        model.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        print("Pushing to the hub...")
        model.push_to_hub(model_name, organization="nielsr")
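# A minimal CLI wrapper for the converter above; the flag names are an
# assumption, not taken from the original script.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint_url", type=str, required=True,
                        help="Google Drive URL of the original VideoMAE checkpoint.")
    parser.add_argument("--pytorch_dump_folder_path", type=str, default=None,
                        help="Directory where the converted model is saved (skipped if omitted).")
    parser.add_argument("--model_name", type=str, default="videomae-base",
                        help="One of the names listed in model_names above.")
    parser.add_argument("--push_to_hub", action="store_true",
                        help="Push the converted model to the hub.")
    args = parser.parse_args()
    convert_videomae_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.model_name, args.push_to_hub)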