Exemple #1
1
    def upload(self, filepaths: List[str], split: str, col_mapping: Dict[str, str]):
        """Copy local files into the project's dataset repo, push them, then register them.

        The dataset repository is cloned under
        ``~/.huggingface/autonlp/projects/<dataset_id>`` (an existing clone is
        reused). Each valid file is copied to ``raw/`` inside the clone,
        validated with ``validate_file`` and tracked with git-lfs by extension.
        After a successful push, every file that was actually copied is
        registered through a POST to ``/projects/<proj_id>/data/add``.

        Args:
            filepaths: Local paths to upload. A leading ``~`` is expanded.
                Entries that do not point to an existing file are logged and
                skipped (they are neither copied nor registered).
            split: Sent as ``"split"`` in the registration payload.
            col_mapping: Passed to ``validate_file`` and sent as
                ``"col_mapping"`` in the registration payload.

        Raises:
            OSError: Re-raised when the push fails for any reason other than
                "nothing to commit, working tree clean". In that specific case
                the method pushes any pending commits and returns without
                registering anything.
        """
        remote_url = "https://huggingface.co/datasets/" + self.dataset_id
        local_dataset_dir = os.path.expanduser(f"~/.huggingface/autonlp/projects/{self.dataset_id}")

        clone_from = remote_url
        if os.path.exists(local_dataset_dir):
            # A git clone keeps its metadata in ".git"; only then can the
            # directory be reused as-is. Anything else is wiped and re-cloned.
            if os.path.isdir(os.path.join(local_dataset_dir, ".git")):
                clone_from = None
            else:
                shutil.rmtree(local_dataset_dir)

        dataset_repo = Repository(
            local_dir=local_dataset_dir,
            clone_from=clone_from,
            use_auth_token=self._token,
        )
        dataset_repo.git_pull()

        total = len(filepaths)
        # Names of the files that were really copied; only these get registered.
        uploaded_names: List[str] = []
        for position, file_path in enumerate(filepaths, start=1):
            # Expand "~" before the existence check, otherwise "~/data.csv"
            # would always be reported as missing.
            src = os.path.expanduser(file_path)
            if not os.path.isfile(src):
                logger.error(f"[{position}/{total}] ❌ '{file_path}' does not exist or is not a file!")
                continue

            file_name = os.path.basename(src)
            file_extension = file_name.split(".")[-1]
            dst = os.path.join(local_dataset_dir, "raw", file_name)
            logger.info(f"[{position}/{total}] 📦 Copying {src} to {dst}...")
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            shutil.copyfile(src, dst)

            logger.info(f"[{position}/{total}] 🔎 Validating {dst} and column mapping...")
            validate_file(path=dst, task=self.task, file_ext=file_extension, col_mapping=col_mapping)

            dataset_repo.lfs_track(patterns=[f"raw/*.{file_extension}"])
            uploaded_names.append(file_name)

        dataset_repo.git_pull()

        try:
            logger.info("☁ Uploading files to the dataset hub...")
            dataset_repo.push_to_hub(commit_message="Upload from AutoNLP CLI")
            logger.info("✅ Successfully uploaded  the files!")
        except OSError as err:
            # str(err) instead of err.args[0]: args may be empty or start with
            # an errno integer, which would mask the real error.
            if "nothing to commit, working tree clean" in str(err):
                logger.info("❔ Files did not change since last upload!")
                dataset_repo.git_push()
                return
            logger.error("❌ Something went wrong when uploading the files!")
            raise

        registered_total = len(uploaded_names)
        for position, file_name in enumerate(uploaded_names, start=1):
            logger.info(
                f"[{position}/{registered_total}] 📁 Registering file {file_name} into project '{self.proj_id}'..."
            )
            payload = {
                "split": split,
                "col_mapping": col_mapping,
                "data_files": [{"fname": file_name, "username": self.user}],
            }
            http_post(path=f"/projects/{self.proj_id}/data/add", payload=payload, token=self._token)
            logger.info(f"[{position}/{registered_total}] ✅ Success!")
Exemple #2
0
def convert_owlvit_checkpoint(pt_backbone, flax_params, attn_params, pytorch_dump_folder_path, config_path=None):
    """
    Port OWL-ViT weights into the transformers classes and push them to the hub.

    Clones ``google/<pytorch_dump_folder_path>`` into
    ``pytorch_dump_folder_path``, builds an ``OwlViTModel`` backbone and an
    ``OwlViTForObjectDetection`` model from the config, fills them through the
    ``copy_*`` helpers, saves the model, feature extractor and processor into
    the clone, then commits and pushes.

    Args:
        pt_backbone: Source backbone handed to the text/vision copy helpers;
            its ``logit_scale`` is assigned to the new backbone.
        flax_params: Source parameters handed to the class-token and
            class/box-head copy helpers.
        attn_params: Source parameters handed to ``copy_flax_attn_params``.
        pytorch_dump_folder_path: Local clone directory, also used as the
            repository name under the ``google`` namespace.
        config_path: Optional config location; a default ``OwlViTConfig`` is
            used when it is ``None``.
    """
    hub_repo = Repository(pytorch_dump_folder_path, clone_from=f"google/{pytorch_dump_folder_path}")
    hub_repo.git_pull()

    config = OwlViTConfig() if config_path is None else OwlViTConfig.from_pretrained(config_path)

    backbone = OwlViTModel(config).eval()
    detector = OwlViTForObjectDetection(config).eval()

    # Fill the backbone first: text tower, vision tower, logit scale, attention.
    for copy_into_backbone in (copy_text_model_and_projection, copy_vision_model_and_projection):
        copy_into_backbone(backbone, pt_backbone)
    backbone.logit_scale = pt_backbone.logit_scale
    copy_flax_attn_params(backbone, attn_params)

    # The detection model wraps the populated backbone and adds its own heads.
    detector.owlvit = backbone
    for copy_into_detector in (copy_class_merge_token, copy_class_box_heads):
        copy_into_detector(detector, flax_params)

    output_dir = hub_repo.local_dir

    # Save HF model
    detector.save_pretrained(output_dir)

    # Feature extractor resizes and crops to the vision tower's image size.
    image_size = config.vision_config.image_size
    feature_extractor = OwlViTFeatureExtractor(size=image_size, crop_size=image_size)
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32", pad_token="!", model_max_length=16)
    processor = OwlViTProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)

    for artifact in (feature_extractor, processor):
        artifact.save_pretrained(output_dir)

    hub_repo.git_add()
    hub_repo.git_commit("Upload model and processor")
    hub_repo.git_push()
    def run(self):
        """Convert a PyTorch checkpoint to TensorFlow weights, validate them and publish them.

        Steps, in order:

        1. Clone ``self._model_name`` into ``self._local_dir``.
        2. Pick the PyTorch/TensorFlow classes from ``config.architectures``
           (``AutoModel``/``TFAutoModel`` when none is declared).
        3. Run the PyTorch model and the TensorFlow model cross-loaded from the
           PyTorch weights on the inputs from ``self.get_inputs`` and compare
           their outputs.
        4. Save the TensorFlow weights when none exist locally (or when
           ``self._new_weights`` is set), reload them and compare again.
        5. Push the commit directly when ``self._push`` is set; otherwise open
           a Hub pull request unless ``self._no_pr`` is set.

        Raises:
            ImportError: ``huggingface_hub`` is older than 0.8.1.
            ValueError: More than one architecture is declared, or a difference
                exceeds ``MAX_ERROR`` / ``self._max_hidden_error``.
            AttributeError: No ``TF``-prefixed class exists for the architecture.
        """
        if version.parse(huggingface_hub.__version__) < version.parse("0.8.1"):
            raise ImportError(
                "The huggingface_hub version must be >= 0.8.1 to use this command. Please update your huggingface_hub"
                " installation.")
        # Imported lazily: only reached once the installed version is recent enough.
        from huggingface_hub import Repository, create_commit
        from huggingface_hub._commit_api import CommitOperationAdd

        # Fetch remote data
        repo = Repository(local_dir=self._local_dir,
                          clone_from=self._model_name)

        # Load config and get the appropriate architecture -- the latter is needed to convert the head's weights
        config = AutoConfig.from_pretrained(self._local_dir)
        architectures = config.architectures
        transformers_module = import_module("transformers")
        if architectures is None:  # No architecture defined -- use auto classes
            pt_class = getattr(transformers_module, "AutoModel")
            tf_class = getattr(transformers_module, "TFAutoModel")
            self._logger.warn(
                "No detected architecture, using AutoModel/TFAutoModel")
        else:  # Architecture defined -- use it
            if len(architectures) > 1:
                raise ValueError(
                    f"More than one architecture was found, aborting. (architectures = {architectures})"
                )
            self._logger.warn(f"Detected architecture: {architectures[0]}")
            pt_class = getattr(transformers_module, architectures[0])
            try:
                tf_class = getattr(transformers_module,
                                   "TF" + architectures[0])
            except AttributeError as err:
                raise AttributeError(
                    f"The TensorFlow equivalent of {architectures[0]} doesn't exist in transformers."
                ) from err

        # Load the PyTorch model and acquire a basic input compatible with it.
        pt_model = pt_class.from_pretrained(self._local_dir)
        pt_input, tf_input = self.get_inputs(pt_model, config)
        pt_outputs = pt_model(**pt_input, output_hidden_states=True)
        del pt_model  # will no longer be used, and may have a large memory footprint

        # Cross-load the PyTorch weights into TensorFlow exactly once; a second
        # load would only cost time and memory.
        tf_from_pt_model = tf_class.from_pretrained(self._local_dir,
                                                    from_pt=True)
        tf_from_pt_outputs = tf_from_pt_model(**tf_input,
                                              output_hidden_states=True)

        # Confirms that cross loading PT weights into TF worked.
        crossload_differences = self.find_pt_tf_differences(
            pt_outputs, tf_from_pt_outputs)
        max_crossload_output_diff, max_crossload_hidden_diff = self._max_output_and_hidden_differences(
            crossload_differences, "cross-loaded")

        # Save the weights in a TF format (if needed) and confirms that the results are still good
        tf_weights_path = os.path.join(self._local_dir, TF2_WEIGHTS_NAME)
        tf_weights_index_path = os.path.join(self._local_dir,
                                             TF2_WEIGHTS_INDEX_NAME)
        has_tf_weights = os.path.exists(tf_weights_path) or os.path.exists(
            tf_weights_index_path)
        if not has_tf_weights or self._new_weights:
            tf_from_pt_model.save_pretrained(self._local_dir)
        del tf_from_pt_model  # will no longer be used, and may have a large memory footprint

        tf_model = tf_class.from_pretrained(self._local_dir)
        tf_outputs = tf_model(**tf_input, output_hidden_states=True)

        conversion_differences = self.find_pt_tf_differences(
            pt_outputs, tf_outputs)
        max_conversion_output_diff, max_conversion_hidden_diff = self._max_output_and_hidden_differences(
            conversion_differences, "converted")

        commit_message = "Update TF weights" if self._new_weights else "Add TF weights"
        if self._push:
            repo.git_add(auto_lfs_track=True)
            repo.git_commit(commit_message)
            repo.git_push(
                blocking=True)  # this prints a progress bar with the upload
            self._logger.warn(f"TF weights pushed into {self._model_name}")
        elif not self._no_pr:
            self._logger.warn("Uploading the weights into a new PR...")
            commit_description = (
                "Model converted by the [`transformers`' `pt_to_tf`"
                " CLI](https://github.com/huggingface/transformers/blob/main/src/transformers/commands/pt_to_tf.py). "
                "All converted model outputs and hidden layers were validated against its Pytorch counterpart.\n\n"
                f"Maximum crossload output difference={max_crossload_output_diff:.3e}; "
                f"Maximum crossload hidden layer difference={max_crossload_hidden_diff:.3e};\n"
                f"Maximum conversion output difference={max_conversion_output_diff:.3e}; "
                f"Maximum conversion hidden layer difference={max_conversion_hidden_diff:.3e};\n"
            )
            if self._extra_commit_description:
                commit_description += "\n\n" + self._extra_commit_description

            # sharded model -> adds all related files (index and .h5 shards)
            if os.path.exists(tf_weights_index_path):
                operations = [
                    CommitOperationAdd(path_in_repo=TF2_WEIGHTS_INDEX_NAME,
                                       path_or_fileobj=tf_weights_index_path)
                ]
                operations += [
                    CommitOperationAdd(
                        path_in_repo=os.path.basename(shard_path),
                        path_or_fileobj=shard_path)
                    for shard_path in tf.io.gfile.glob(self._local_dir +
                                                       "/tf_model-*.h5")
                ]
            else:
                operations = [
                    CommitOperationAdd(path_in_repo=TF2_WEIGHTS_NAME,
                                       path_or_fileobj=tf_weights_path)
                ]

            hub_pr_url = create_commit(
                repo_id=self._model_name,
                operations=operations,
                commit_message=commit_message,
                commit_description=commit_description,
                repo_type="model",
                create_pr=True,
            )
            self._logger.warn(f"PR open in {hub_pr_url}")

    def _max_output_and_hidden_differences(self, differences, model_description):
        """Split differences into output/hidden groups, enforce the thresholds, return both maxima.

        Args:
            differences: Mapping from tensor name to its maximum difference, as
                returned by ``self.find_pt_tf_differences``. Entries whose name
                contains ``"hidden"`` are checked against
                ``self._max_hidden_error``, all others against ``MAX_ERROR``.
            model_description: Adjective inserted in the error message
                (``"cross-loaded"`` or ``"converted"``).

        Returns:
            Tuple ``(max_output_difference, max_hidden_difference)``.

        Raises:
            ValueError: Either maximum is above its threshold.
        """
        output_differences = {
            name: diff
            for name, diff in differences.items() if "hidden" not in name
        }
        hidden_differences = {
            name: diff
            for name, diff in differences.items() if "hidden" in name
        }
        max_output_diff = max(output_differences.values())
        # No hidden-state entry to compare means no hidden-layer violation;
        # without the default, max() would fail with an opaque ValueError.
        max_hidden_diff = max(hidden_differences.values(), default=0.0)
        if max_output_diff > MAX_ERROR or max_hidden_diff > self._max_hidden_error:
            raise ValueError(
                f"The {model_description} TensorFlow model has different outputs, something went wrong!\n"
                +
                f"\nList of maximum output differences above the threshold ({MAX_ERROR}):\n"
                + "\n".join([
                    f"{k}: {v:.3e}"
                    for k, v in output_differences.items() if v > MAX_ERROR
                ]) +
                f"\n\nList of maximum hidden layer differences above the threshold ({self._max_hidden_error}):\n"
                + "\n".join([
                    f"{k}: {v:.3e}" for k, v in hidden_differences.items()
                    if v > self._max_hidden_error
                ]))
        return max_output_diff, max_hidden_diff
Exemple #4
0
def push_to_hf_hub(model: Any, model_name: str, task: str, **kwargs) -> None:
    """Save model and its configuration on HF hub

    >>> from doctr.models import login_to_hub, push_to_hf_hub
    >>> from doctr.models.recognition import crnn_mobilenet_v3_small
    >>> login_to_hub()
    >>> model = crnn_mobilenet_v3_small(pretrained=True)
    >>> push_to_hf_hub(model, 'my-model', 'recognition', arch='crnn_mobilenet_v3_small')

    Args:
        model: TF or PyTorch model to be saved
        model_name: name of the model which is also the repository name
        task: task name, one of ``classification``, ``detection``,
            ``recognition`` or ``obj_detection``
        **kwargs: ``arch`` (architecture name) and/or ``run_config`` (object
            exposing ``.arch``; its ``vars()`` are appended to the README as
            JSON). When ``run_config`` is given, its ``arch`` takes precedence.

    Raises:
        ValueError: if neither ``run_config`` nor ``arch`` is given, if ``task``
            is not supported, or if the architecture is not listed in
            ``AVAILABLE_ARCHS[task]``.
    """
    supported_tasks = ("classification", "detection", "recognition", "obj_detection")

    run_config = kwargs.get("run_config")
    arch = kwargs.get("arch")

    if run_config is None and arch is None:
        raise ValueError("run_config or arch must be specified")
    if task not in supported_tasks:
        raise ValueError(
            "task must be one of classification, detection, recognition, obj_detection"
        )

    # default readme
    readme = textwrap.dedent(f"""
    ---
    language: en
    ---

    <p align="center">
    <img src="https://github.com/mindee/doctr/releases/download/v0.3.1/Logo_doctr.gif" width="60%">
    </p>

    **Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch**

    ## Task: {task}

    https://github.com/mindee/doctr

    ### Example usage:

    ```python
    >>> from doctr.io import DocumentFile
    >>> from doctr.models import ocr_predictor, from_hub

    >>> img = DocumentFile.from_images(['<image_path>'])
    >>> # Load your model from the hub
    >>> model = from_hub('mindee/my-model')

    >>> # Pass it to the predictor
    >>> # If your model is a recognition model:
    >>> predictor = ocr_predictor(det_arch='db_mobilenet_v3_large',
    >>>                           reco_arch=model,
    >>>                           pretrained=True)

    >>> # If your model is a detection model:
    >>> predictor = ocr_predictor(det_arch=model,
    >>>                           reco_arch='crnn_mobilenet_v3_small',
    >>>                           pretrained=True)

    >>> # Get your predictions
    >>> res = predictor(img)
    ```
    """)

    # add run configuration to readme if available
    if run_config is not None:
        arch = run_config.arch
        run_config_json = json.dumps(vars(run_config), indent=2, ensure_ascii=False)
        readme += f"### Run Configuration\n\n{run_config_json}"

    if arch not in AVAILABLE_ARCHS[task]:  # type: ignore
        # Adjacent literals instead of a backslash continuation, which would
        # embed the next line's indentation in the message.
        raise ValueError(
            f"Architecture: {arch} for task: {task} not found."
            f"\nAvailable architectures: {AVAILABLE_ARCHS}"
        )

    commit_message = f"Add {model_name} model"

    local_cache_dir = os.path.join(os.path.expanduser("~"), ".cache",
                                   "huggingface", "hub", model_name)
    repo_url = HfApi().create_repo(model_name,
                                   token=HfFolder.get_token(),
                                   exist_ok=False)
    repo = Repository(local_dir=local_cache_dir,
                      clone_from=repo_url,
                      use_auth_token=True)

    with repo.commit(commit_message):
        _save_model_and_config_for_hf_hub(model,
                                          repo.local_dir,
                                          arch=arch,
                                          task=task)
        readme_path = Path(repo.local_dir) / "README.md"
        # Explicit UTF-8: the run configuration is dumped with
        # ensure_ascii=False, so the locale default encoding could fail on it.
        readme_path.write_text(readme, encoding="utf-8")

    # NOTE(review): `Repository.commit` may already push when the context
    # exits -- confirm whether this extra push is still needed.
    repo.git_push()
Exemple #5
0
    def run(self):
        """Convert a PyTorch checkpoint to TensorFlow weights, validate them and publish them.

        Steps, in order:

        1. Clone ``self._model_name`` into ``self._local_dir``.
        2. Pick the PyTorch/TensorFlow classes from ``config.architectures``
           (``AutoModel``/``TFAutoModel`` when none is declared).
        3. Build inputs for the model's modality, chosen from
           ``pt_model.main_input_name``; add ``decoder_input_ids`` when the
           model is an encoder-decoder.
        4. Compare the PyTorch model with the TensorFlow model cross-loaded
           from the PyTorch weights.
        5. Save the TensorFlow weights when missing (or when
           ``self._new_weights`` is set), reload them and compare again.
        6. Push the commit directly when ``self._push`` is set; otherwise open
           a Hub pull request unless ``self._no_pr`` is set.

        Raises:
            ImportError: ``huggingface_hub`` is older than 0.8.1.
            ValueError: More than one architecture is declared, the modality is
                not recognised, or a difference exceeds ``MAX_ERROR``.
            AttributeError: No ``TF``-prefixed class exists for the architecture.
        """
        if version.parse(huggingface_hub.__version__) < version.parse("0.8.1"):
            raise ImportError(
                "The huggingface_hub version must be >= 0.8.1 to use this command. Please update your huggingface_hub"
                " installation.")
        # Imported lazily: only reached once the installed version is recent enough.
        from huggingface_hub import Repository, create_commit
        from huggingface_hub._commit_api import CommitOperationAdd

        # Fetch remote data
        repo = Repository(local_dir=self._local_dir,
                          clone_from=self._model_name)

        # Load config and get the appropriate architecture -- the latter is needed to convert the head's weights
        config = AutoConfig.from_pretrained(self._local_dir)
        architectures = config.architectures
        transformers_module = import_module("transformers")
        if architectures is None:  # No architecture defined -- use auto classes
            pt_class = getattr(transformers_module, "AutoModel")
            tf_class = getattr(transformers_module, "TFAutoModel")
            self._logger.warn(
                "No detected architecture, using AutoModel/TFAutoModel")
        else:  # Architecture defined -- use it
            if len(architectures) > 1:
                raise ValueError(
                    f"More than one architecture was found, aborting. (architectures = {architectures})"
                )
            self._logger.warn(f"Detected architecture: {architectures[0]}")
            pt_class = getattr(transformers_module, architectures[0])
            try:
                tf_class = getattr(transformers_module,
                                   "TF" + architectures[0])
            except AttributeError as err:
                raise AttributeError(
                    f"The TensorFlow equivalent of {architectures[0]} doesn't exist in transformers."
                ) from err

        # Load models and acquire a basic input for its modality.
        pt_model = pt_class.from_pretrained(self._local_dir)
        input_builders = {
            "input_ids": self.get_text_inputs,
            "pixel_values": self.get_image_inputs,
            "input_features": self.get_audio_inputs,
        }
        main_input_name = pt_model.main_input_name
        if main_input_name not in input_builders:
            raise ValueError(
                f"Can't detect the model modality (`main_input_name` = {main_input_name})"
            )
        pt_input, tf_input = input_builders[main_input_name]()
        tf_from_pt_model = tf_class.from_pretrained(self._local_dir,
                                                    from_pt=True)

        # Extra input requirements, in addition to the input modality
        if config.is_encoder_decoder or (hasattr(pt_model, "encoder")
                                         and hasattr(pt_model, "decoder")):
            # `decoder_start_token_id` may be unset (None); multiplying the
            # array by None would raise a TypeError, so fall back to 0.
            start_token_id = pt_model.config.decoder_start_token_id or 0
            decoder_input_ids = np.asarray([[1], [1]],
                                           dtype=int) * start_token_id
            pt_input.update(
                {"decoder_input_ids": torch.tensor(decoder_input_ids)})
            tf_input.update(
                {"decoder_input_ids": tf.convert_to_tensor(decoder_input_ids)})

        # Confirms that cross loading PT weights into TF worked.
        crossload_differences = self.find_pt_tf_differences(
            pt_model, pt_input, tf_from_pt_model, tf_input)
        max_crossload_diff = max(crossload_differences.values())
        if max_crossload_diff > MAX_ERROR:
            raise ValueError(
                "The cross-loaded TensorFlow model has different outputs, something went wrong! Exaustive list of"
                f" maximum tensor differences above the error threshold ({MAX_ERROR}):\n"
                + "\n".join([
                    f"{key}: {value:.3e}"
                    for key, value in crossload_differences.items()
                    if value > MAX_ERROR
                ]))

        # Save the weights in a TF format (if needed) and confirms that the results are still good
        tf_weights_path = os.path.join(self._local_dir, TF_WEIGHTS_NAME)
        if not os.path.exists(tf_weights_path) or self._new_weights:
            tf_from_pt_model.save_weights(tf_weights_path)
        del tf_from_pt_model  # will no longer be used, and may have a large memory footprint
        tf_model = tf_class.from_pretrained(self._local_dir)
        conversion_differences = self.find_pt_tf_differences(
            pt_model, pt_input, tf_model, tf_input)
        max_conversion_diff = max(conversion_differences.values())
        if max_conversion_diff > MAX_ERROR:
            raise ValueError(
                "The converted TensorFlow model has different outputs, something went wrong! Exaustive list of maximum"
                f" tensor differences above the error threshold ({MAX_ERROR}):\n"
                + "\n".join([
                    f"{key}: {value:.3e}"
                    for key, value in conversion_differences.items()
                    if value > MAX_ERROR
                ]))

        commit_message = "Update TF weights" if self._new_weights else "Add TF weights"
        if self._push:
            repo.git_add(auto_lfs_track=True)
            repo.git_commit(commit_message)
            repo.git_push(
                blocking=True)  # this prints a progress bar with the upload
            self._logger.warn(f"TF weights pushed into {self._model_name}")
        elif not self._no_pr:
            self._logger.warn("Uploading the weights into a new PR...")
            commit_description = (
                "Model converted by the [`transformers`' `pt_to_tf`"
                " CLI](https://github.com/huggingface/transformers/blob/main/src/transformers/commands/pt_to_tf.py)."
                "\n\nAll converted model outputs and hidden layers were validated against its Pytorch counterpart."
                f" Maximum crossload output difference={max_crossload_diff:.3e}; Maximum converted output"
                f" difference={max_conversion_diff:.3e}.")
            if self._extra_commit_description:
                commit_description += "\n\n" + self._extra_commit_description
            hub_pr_url = create_commit(
                repo_id=self._model_name,
                operations=[
                    CommitOperationAdd(path_in_repo=TF_WEIGHTS_NAME,
                                       path_or_fileobj=tf_weights_path)
                ],
                commit_message=commit_message,
                commit_description=commit_description,
                repo_type="model",
                create_pr=True,
            )
            self._logger.warn(f"PR open in {hub_pr_url}")