def upload(self, filepaths: List[str], split: str, col_mapping: Dict[str, str]):
    """Upload local data files to the project's dataset repo and register them.

    The dataset repository ``self.dataset_id`` is cloned (or reused) under
    ``~/.huggingface/autonlp/projects/<dataset_id>``. Each existing file is copied
    into its ``raw/`` folder, passed to ``validate_file`` and tracked with git-lfs.
    The repository is then pushed, and every file that was actually copied is
    registered on the ``/projects/<proj_id>/data/add`` endpoint.

    Args:
        filepaths: Paths of the files to upload; ``~`` is expanded.
        split: Split identifier forwarded in the registration payload.
        col_mapping: Column mapping, used for validation and forwarded in the
            registration payload.

    Raises:
        OSError: If pushing fails for a reason other than "nothing to commit".
    """
    local_dataset_dir = os.path.expanduser(f"~/.huggingface/autonlp/projects/{self.dataset_id}")
    dataset_url = "https://huggingface.co/datasets/" + self.dataset_id
    if os.path.exists(local_dataset_dir):
        # Git stores its metadata in ".git"; an existing clone is reused,
        # anything else is wiped and cloned again.
        if os.path.isdir(os.path.join(local_dataset_dir, ".git")):
            clone_from = None
        else:
            shutil.rmtree(local_dataset_dir)
            clone_from = dataset_url
    else:
        clone_from = dataset_url
    dataset_repo = Repository(
        local_dir=local_dataset_dir,
        clone_from=clone_from,
        use_auth_token=self._token,
    )
    dataset_repo.git_pull()

    # Names of the files that were really copied into the repo; only those are
    # registered afterwards.
    uploaded_names = []
    for idx, file_path in enumerate(filepaths):
        # Expand "~" before the existence check so home-relative paths are accepted.
        src = os.path.expanduser(file_path)
        if not os.path.isfile(src):
            logger.error(f"[{idx + 1}/{len(filepaths)}] ❌ '{file_path}' does not exist or is not a file!")
            continue
        file_name = os.path.basename(src)
        file_extension = file_name.split(".")[-1]
        dst = os.path.join(local_dataset_dir, "raw", file_name)
        logger.info(f"[{idx + 1}/{len(filepaths)}] 📦 Copying {src} to {dst}...")
        os.makedirs(os.path.dirname(dst), exist_ok=True)
        shutil.copyfile(src, dst)

        logger.info(f"[{idx + 1}/{len(filepaths)}] 🔎 Validating {dst} and column mapping...")
        validate_file(path=dst, task=self.task, file_ext=file_extension, col_mapping=col_mapping)

        dataset_repo.lfs_track(patterns=[f"raw/*.{file_extension}"])
        uploaded_names.append(file_name)

    dataset_repo.git_pull()

    try:
        logger.info("☁ Uploading files to the dataset hub...")
        dataset_repo.push_to_hub(commit_message="Upload from AutoNLP CLI")
        logger.info("✅ Successfully uploaded the files!")
    except OSError as err:
        # str(err) instead of err.args[0]: args may be empty or start with an errno.
        if "nothing to commit, working tree clean" in str(err):
            logger.info("❔ Files did not change since last upload!")
            dataset_repo.git_push()
            return
        logger.error("❌ Something went wrong when uploading the files!")
        raise

    for idx, file_name in enumerate(uploaded_names):
        logger.info(
            f"[{idx + 1}/{len(uploaded_names)}] 📁 Registering file {file_name} into project '{self.proj_id}'..."
        )
        payload = {
            "split": split,
            "col_mapping": col_mapping,
            "data_files": [{"fname": file_name, "username": self.user}],
        }
        http_post(path=f"/projects/{self.proj_id}/data/add", payload=payload, token=self._token)
        logger.info(f"[{idx + 1}/{len(uploaded_names)}] ✅ Success!")
def convert_owlvit_checkpoint(pt_backbone, flax_params, attn_params, pytorch_dump_folder_path, config_path=None):
    """
    Build the transformers OWL-ViT model from the given weights and push it to the Hub.

    The repo `google/{pytorch_dump_folder_path}` is cloned into `pytorch_dump_folder_path` and pulled.
    A backbone and an object-detection model are then created from the config and filled in by the
    `copy_*` helpers. The model, feature extractor and processor are saved into the clone, which is
    finally committed and pushed.

    Args:
        pt_backbone: source model handed to the text/vision copy helpers; its `logit_scale` is reused.
        flax_params: parameters handed to the class-token and class/box head copy helpers.
        attn_params: parameters handed to `copy_flax_attn_params`.
        pytorch_dump_folder_path: local clone folder, also used as the repo name under `google/`.
        config_path: optional config to load; a default `OwlViTConfig` is used when it is None.
    """
    hub_repo = Repository(pytorch_dump_folder_path, clone_from=f"google/{pytorch_dump_folder_path}")
    hub_repo.git_pull()

    config = OwlViTConfig() if config_path is None else OwlViTConfig.from_pretrained(config_path)

    backbone = OwlViTModel(config).eval()
    detector = OwlViTForObjectDetection(config).eval()

    # Fill the backbone first, then plug it into the detection model.
    copy_text_model_and_projection(backbone, pt_backbone)
    copy_vision_model_and_projection(backbone, pt_backbone)
    backbone.logit_scale = pt_backbone.logit_scale
    copy_flax_attn_params(backbone, attn_params)
    detector.owlvit = backbone

    # Detection-specific parameters.
    copy_class_merge_token(detector, flax_params)
    copy_class_box_heads(detector, flax_params)

    output_dir = hub_repo.local_dir

    # Save HF model
    detector.save_pretrained(output_dir)

    # Pre-processing: the feature extractor resizes and crops to the vision model's image size.
    image_size = config.vision_config.image_size
    feature_extractor = OwlViTFeatureExtractor(size=image_size, crop_size=image_size)
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32", pad_token="!", model_max_length=16)
    processor = OwlViTProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)

    feature_extractor.save_pretrained(output_dir)
    processor.save_pretrained(output_dir)

    # Publish everything that was written into the clone.
    hub_repo.git_add()
    hub_repo.git_commit("Upload model and processor")
    hub_repo.git_push()
def run(self):
    """Convert a model's PyTorch weights to TensorFlow, validate them and optionally publish them.

    Steps:
      1. Clone ``self._model_name`` into ``self._local_dir`` and resolve the PT/TF classes from the
         config's ``architectures`` (auto classes when none is declared).
      2. Run the PyTorch model, cross-load the weights into TensorFlow and compare the outputs.
      3. Save TF weights (when none exist or ``self._new_weights`` is set), reload them and compare again.
      4. Push directly when ``self._push`` is set, otherwise open a Hub PR unless ``self._no_pr`` is set.

    Raises:
        ImportError: If ``huggingface_hub`` is older than 0.8.1.
        ValueError: If more than one architecture is declared, or a comparison exceeds ``MAX_ERROR``
            (outputs) or ``self._max_hidden_error`` (hidden layers).
        AttributeError: If ``transformers`` has no ``TF``-prefixed class for the architecture.
    """
    if version.parse(huggingface_hub.__version__) < version.parse("0.8.1"):
        raise ImportError(
            "The huggingface_hub version must be >= 0.8.1 to use this command. Please update your huggingface_hub"
            " installation."
        )
    # Imported only after the version check has passed.
    from huggingface_hub import Repository, create_commit
    from huggingface_hub._commit_api import CommitOperationAdd

    def _check_differences(differences, model_description):
        """Raise if any output/hidden difference exceeds its threshold; return both maxima."""
        output_differences = {k: v for k, v in differences.items() if "hidden" not in k}
        hidden_differences = {k: v for k, v in differences.items() if "hidden" in k}
        max_output_diff = max(output_differences.values())
        max_hidden_diff = max(hidden_differences.values())
        if max_output_diff > MAX_ERROR or max_hidden_diff > self._max_hidden_error:
            raise ValueError(
                f"The {model_description} TensorFlow model has different outputs, something went wrong!\n"
                + f"\nList of maximum output differences above the threshold ({MAX_ERROR}):\n"
                + "\n".join(f"{k}: {v:.3e}" for k, v in output_differences.items() if v > MAX_ERROR)
                + f"\n\nList of maximum hidden layer differences above the threshold ({self._max_hidden_error}):\n"
                + "\n".join(
                    f"{k}: {v:.3e}" for k, v in hidden_differences.items() if v > self._max_hidden_error
                )
            )
        return max_output_diff, max_hidden_diff

    # Fetch remote data
    repo = Repository(local_dir=self._local_dir, clone_from=self._model_name)

    # Load config and get the appropriate architecture -- the latter is needed to convert the head's weights
    config = AutoConfig.from_pretrained(self._local_dir)
    architectures = config.architectures
    transformers_module = import_module("transformers")
    if architectures is None:  # No architecture defined -- use auto classes
        pt_class = getattr(transformers_module, "AutoModel")
        tf_class = getattr(transformers_module, "TFAutoModel")
        self._logger.warning("No detected architecture, using AutoModel/TFAutoModel")
    else:  # Architecture defined -- use it
        if len(architectures) > 1:
            raise ValueError(f"More than one architecture was found, aborting. (architectures = {architectures})")
        self._logger.warning(f"Detected architecture: {architectures[0]}")
        pt_class = getattr(transformers_module, architectures[0])
        try:
            tf_class = getattr(transformers_module, "TF" + architectures[0])
        except AttributeError as err:
            raise AttributeError(
                f"The TensorFlow equivalent of {architectures[0]} doesn't exist in transformers."
            ) from err

    # Load the PyTorch model and acquire a basic input compatible with the model.
    pt_model = pt_class.from_pretrained(self._local_dir)
    pt_input, tf_input = self.get_inputs(pt_model, config)
    pt_outputs = pt_model(**pt_input, output_hidden_states=True)
    del pt_model  # will no longer be used, and may have a large memory footprint

    # The TF model is cross-loaded only now, once, so that it never shares memory with the PT model.
    tf_from_pt_model = tf_class.from_pretrained(self._local_dir, from_pt=True)
    tf_from_pt_outputs = tf_from_pt_model(**tf_input, output_hidden_states=True)

    # Confirms that cross loading PT weights into TF worked.
    crossload_differences = self.find_pt_tf_differences(pt_outputs, tf_from_pt_outputs)
    max_crossload_output_diff, max_crossload_hidden_diff = _check_differences(
        crossload_differences, "cross-loaded"
    )

    # Save the weights in a TF format (if needed) and confirms that the results are still good
    tf_weights_path = os.path.join(self._local_dir, TF2_WEIGHTS_NAME)
    tf_weights_index_path = os.path.join(self._local_dir, TF2_WEIGHTS_INDEX_NAME)
    if (not os.path.exists(tf_weights_path) and not os.path.exists(tf_weights_index_path)) or self._new_weights:
        tf_from_pt_model.save_pretrained(self._local_dir)
    del tf_from_pt_model  # will no longer be used, and may have a large memory footprint

    tf_model = tf_class.from_pretrained(self._local_dir)
    tf_outputs = tf_model(**tf_input, output_hidden_states=True)
    conversion_differences = self.find_pt_tf_differences(pt_outputs, tf_outputs)
    max_conversion_output_diff, max_conversion_hidden_diff = _check_differences(
        conversion_differences, "converted"
    )

    commit_message = "Update TF weights" if self._new_weights else "Add TF weights"
    if self._push:
        repo.git_add(auto_lfs_track=True)
        repo.git_commit(commit_message)
        repo.git_push(blocking=True)  # this prints a progress bar with the upload
        self._logger.warning(f"TF weights pushed into {self._model_name}")
    elif not self._no_pr:
        self._logger.warning("Uploading the weights into a new PR...")
        commit_description = (
            "Model converted by the [`transformers`' `pt_to_tf`"
            " CLI](https://github.com/huggingface/transformers/blob/main/src/transformers/commands/pt_to_tf.py). "
            "All converted model outputs and hidden layers were validated against its Pytorch counterpart.\n\n"
            f"Maximum crossload output difference={max_crossload_output_diff:.3e}; "
            f"Maximum crossload hidden layer difference={max_crossload_hidden_diff:.3e};\n"
            f"Maximum conversion output difference={max_conversion_output_diff:.3e}; "
            f"Maximum conversion hidden layer difference={max_conversion_hidden_diff:.3e};\n"
        )
        if self._extra_commit_description:
            commit_description += "\n\n" + self._extra_commit_description

        # sharded model -> adds all related files (index and .h5 shards)
        if os.path.exists(tf_weights_index_path):
            operations = [
                CommitOperationAdd(path_in_repo=TF2_WEIGHTS_INDEX_NAME, path_or_fileobj=tf_weights_index_path)
            ]
            operations.extend(
                CommitOperationAdd(path_in_repo=os.path.basename(shard_path), path_or_fileobj=shard_path)
                for shard_path in tf.io.gfile.glob(self._local_dir + "/tf_model-*.h5")
            )
        else:
            operations = [CommitOperationAdd(path_in_repo=TF2_WEIGHTS_NAME, path_or_fileobj=tf_weights_path)]

        hub_pr_url = create_commit(
            repo_id=self._model_name,
            operations=operations,
            commit_message=commit_message,
            commit_description=commit_description,
            repo_type="model",
            create_pr=True,
        )
        self._logger.warning(f"PR open in {hub_pr_url}")
def push_to_hf_hub(model: Any, model_name: str, task: str, **kwargs) -> None:
    """Save model and its configuration on HF hub

    >>> from doctr.models import login_to_hub, push_to_hf_hub
    >>> from doctr.models.recognition import crnn_mobilenet_v3_small
    >>> login_to_hub()
    >>> model = crnn_mobilenet_v3_small(pretrained=True)
    >>> push_to_hf_hub(model, 'my-model', 'recognition', arch='crnn_mobilenet_v3_small')

    Args:
        model: TF or PyTorch model to be saved
        model_name: name of the model which is also the repository name
        task: task name, one of classification, detection, recognition, obj_detection
        **kwargs: either `run_config` (an object whose `arch` attribute and `vars()` are used) or `arch`
            (the architecture name); when both are given, `run_config.arch` takes precedence

    Raises:
        ValueError: if neither `run_config` nor `arch` is given, if `task` is unknown, or if the
            architecture is not listed in `AVAILABLE_ARCHS[task]`
    """
    run_config = kwargs.get("run_config", None)
    arch = kwargs.get("arch", None)

    if run_config is None and arch is None:
        raise ValueError("run_config or arch must be specified")
    if task not in ["classification", "detection", "recognition", "obj_detection"]:
        raise ValueError("task must be one of classification, detection, recognition, obj_detection")

    # default readme
    readme = textwrap.dedent(
        f"""
    ---
    language: en
    ---

    <p align="center">
    <img src="https://github.com/mindee/doctr/releases/download/v0.3.1/Logo_doctr.gif" width="60%">
    </p>

    **Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch**

    ## Task: {task}

    https://github.com/mindee/doctr

    ### Example usage:

    ```python
    >>> from doctr.io import DocumentFile
    >>> from doctr.models import ocr_predictor, from_hub

    >>> img = DocumentFile.from_images(['<image_path>'])
    >>> # Load your model from the hub
    >>> model = from_hub('mindee/my-model')

    >>> # Pass it to the predictor
    >>> # If your model is a recognition model:
    >>> predictor = ocr_predictor(det_arch='db_mobilenet_v3_large',
    >>>                           reco_arch=model,
    >>>                           pretrained=True)

    >>> # If your model is a detection model:
    >>> predictor = ocr_predictor(det_arch=model,
    >>>                           reco_arch='crnn_mobilenet_v3_small',
    >>>                           pretrained=True)

    >>> # Get your predictions
    >>> res = predictor(img)
    ```
    """
    )

    # add run configuration to readme if available
    if run_config is not None:
        arch = run_config.arch
        readme += f"### Run Configuration\n\n{json.dumps(vars(run_config), indent=2, ensure_ascii=False)}"

    if arch not in AVAILABLE_ARCHS[task]:  # type: ignore
        # Implicit concatenation instead of a backslash continuation inside the literal, which
        # leaked stray characters into the message.
        raise ValueError(
            f"Architecture: {arch} for task: {task} not found."
            f"\nAvailable architectures: {AVAILABLE_ARCHS}"
        )

    commit_message = f"Add {model_name} model"

    local_cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface", "hub", model_name)
    repo_url = HfApi().create_repo(model_name, token=HfFolder.get_token(), exist_ok=False)
    repo = Repository(local_dir=local_cache_dir, clone_from=repo_url, use_auth_token=True)

    with repo.commit(commit_message):
        _save_model_and_config_for_hf_hub(model, repo.local_dir, arch=arch, task=task)
        readme_path = Path(repo.local_dir) / "README.md"
        # The run configuration is dumped with ensure_ascii=False, so the file must be written as
        # UTF-8 rather than with the platform default encoding.
        readme_path.write_text(readme, encoding="utf-8")

    repo.git_push()
def run(self):
    """Convert a model's PyTorch weights to TensorFlow, validate them and optionally publish them.

    Steps:
      1. Clone ``self._model_name`` into ``self._local_dir`` and resolve the PT/TF classes from the
         config's ``architectures`` (auto classes when none is declared).
      2. Build inputs according to the PyTorch model's ``main_input_name`` (text, image or audio),
         adding ``decoder_input_ids`` for encoder-decoder models.
      3. Cross-load the PT weights into TensorFlow and compare both models.
      4. Save TF weights (when missing or ``self._new_weights`` is set), reload them and compare again.
      5. Push directly when ``self._push`` is set, otherwise open a Hub PR unless ``self._no_pr`` is set.

    Raises:
        ImportError: If ``huggingface_hub`` is older than 0.8.1.
        ValueError: If more than one architecture is declared, the modality cannot be detected, or a
            comparison exceeds ``MAX_ERROR``.
        AttributeError: If ``transformers`` has no ``TF``-prefixed class for the architecture.
    """
    if version.parse(huggingface_hub.__version__) < version.parse("0.8.1"):
        raise ImportError(
            "The huggingface_hub version must be >= 0.8.1 to use this command. Please update your huggingface_hub"
            " installation."
        )
    # Imported only after the version check has passed.
    from huggingface_hub import Repository, create_commit
    from huggingface_hub._commit_api import CommitOperationAdd

    # Fetch remote data
    repo = Repository(local_dir=self._local_dir, clone_from=self._model_name)

    # Load config and get the appropriate architecture -- the latter is needed to convert the head's weights
    config = AutoConfig.from_pretrained(self._local_dir)
    architectures = config.architectures
    transformers_module = import_module("transformers")
    if architectures is None:  # No architecture defined -- use auto classes
        pt_class = getattr(transformers_module, "AutoModel")
        tf_class = getattr(transformers_module, "TFAutoModel")
        self._logger.warning("No detected architecture, using AutoModel/TFAutoModel")
    else:  # Architecture defined -- use it
        if len(architectures) > 1:
            raise ValueError(f"More than one architecture was found, aborting. (architectures = {architectures})")
        self._logger.warning(f"Detected architecture: {architectures[0]}")
        pt_class = getattr(transformers_module, architectures[0])
        try:
            tf_class = getattr(transformers_module, "TF" + architectures[0])
        except AttributeError as err:
            raise AttributeError(
                f"The TensorFlow equivalent of {architectures[0]} doesn't exist in transformers."
            ) from err

    # Load models and acquire a basic input for its modality.
    pt_model = pt_class.from_pretrained(self._local_dir)
    main_input_name = pt_model.main_input_name
    if main_input_name == "input_ids":
        pt_input, tf_input = self.get_text_inputs()
    elif main_input_name == "pixel_values":
        pt_input, tf_input = self.get_image_inputs()
    elif main_input_name == "input_features":
        pt_input, tf_input = self.get_audio_inputs()
    else:
        raise ValueError(f"Can't detect the model modality (`main_input_name` = {main_input_name})")
    tf_from_pt_model = tf_class.from_pretrained(self._local_dir, from_pt=True)

    # Extra input requirements, in addition to the input modality
    if config.is_encoder_decoder or (hasattr(pt_model, "encoder") and hasattr(pt_model, "decoder")):
        # `decoder_start_token_id` may be left as None in the config; fall back to 0 so the
        # multiplication below cannot raise a TypeError.
        decoder_start_token_id = pt_model.config.decoder_start_token_id or 0
        decoder_input_ids = np.asarray([[1], [1]], dtype=int) * decoder_start_token_id
        pt_input.update({"decoder_input_ids": torch.tensor(decoder_input_ids)})
        tf_input.update({"decoder_input_ids": tf.convert_to_tensor(decoder_input_ids)})

    # Confirms that cross loading PT weights into TF worked.
    crossload_differences = self.find_pt_tf_differences(pt_model, pt_input, tf_from_pt_model, tf_input)
    max_crossload_diff = max(crossload_differences.values())
    if max_crossload_diff > MAX_ERROR:
        raise ValueError(
            "The cross-loaded TensorFlow model has different outputs, something went wrong! Exaustive list of"
            f" maximum tensor differences above the error threshold ({MAX_ERROR}):\n"
            + "\n".join(f"{key}: {value:.3e}" for key, value in crossload_differences.items() if value > MAX_ERROR)
        )

    # Save the weights in a TF format (if needed) and confirms that the results are still good
    tf_weights_path = os.path.join(self._local_dir, TF_WEIGHTS_NAME)
    if not os.path.exists(tf_weights_path) or self._new_weights:
        tf_from_pt_model.save_weights(tf_weights_path)
    del tf_from_pt_model  # will no longer be used, and may have a large memory footprint

    tf_model = tf_class.from_pretrained(self._local_dir)
    conversion_differences = self.find_pt_tf_differences(pt_model, pt_input, tf_model, tf_input)
    max_conversion_diff = max(conversion_differences.values())
    if max_conversion_diff > MAX_ERROR:
        raise ValueError(
            "The converted TensorFlow model has different outputs, something went wrong! Exaustive list of maximum"
            f" tensor differences above the error threshold ({MAX_ERROR}):\n"
            + "\n".join(
                f"{key}: {value:.3e}" for key, value in conversion_differences.items() if value > MAX_ERROR
            )
        )

    commit_message = "Update TF weights" if self._new_weights else "Add TF weights"
    if self._push:
        repo.git_add(auto_lfs_track=True)
        repo.git_commit(commit_message)
        repo.git_push(blocking=True)  # this prints a progress bar with the upload
        self._logger.warning(f"TF weights pushed into {self._model_name}")
    elif not self._no_pr:
        self._logger.warning("Uploading the weights into a new PR...")
        commit_description = (
            "Model converted by the [`transformers`' `pt_to_tf`"
            " CLI](https://github.com/huggingface/transformers/blob/main/src/transformers/commands/pt_to_tf.py)."
            "\n\nAll converted model outputs and hidden layers were validated against its Pytorch counterpart."
            f" Maximum crossload output difference={max_crossload_diff:.3e}; Maximum converted output"
            f" difference={max_conversion_diff:.3e}."
        )
        if self._extra_commit_description:
            commit_description += "\n\n" + self._extra_commit_description
        hub_pr_url = create_commit(
            repo_id=self._model_name,
            operations=[CommitOperationAdd(path_in_repo=TF_WEIGHTS_NAME, path_or_fileobj=tf_weights_path)],
            commit_message=commit_message,
            commit_description=commit_description,
            repo_type="model",
            create_pr=True,
        )
        self._logger.warning(f"PR open in {hub_pr_url}")