Example #1
    def upload(self, filepaths: List[str], split: str, col_mapping: Dict[str, str]):
        """Uploads files to the project"""
        local_dataset_dir = os.path.expanduser(f"~/.huggingface/autonlp/projects/{self.dataset_id}")
        if os.path.exists(local_dataset_dir):
            if os.path.isdir(os.path.join(local_dataset_dir, ".git")):
                clone_from = None
            else:
                shutil.rmtree(local_dataset_dir)
                clone_from = "https://huggingface.co/datasets/" + self.dataset_id
        else:
            clone_from = "https://huggingface.co/datasets/" + self.dataset_id
        dataset_repo = Repository(
            local_dir=local_dataset_dir,
            clone_from=clone_from,
            use_auth_token=self._token,
        )
        dataset_repo.git_pull()

        for idx, file_path in enumerate(filepaths):
            if not os.path.isfile(file_path):
                logger.error(f"[{idx + 1}/{len(filepaths)}] ❌ '{file_path}' does not exist or is not a file!")
                continue
            file_name = os.path.basename(file_path)
            file_extension = file_name.split(".")[-1]
            src = os.path.expanduser(file_path)
            dst = os.path.join(local_dataset_dir, "raw", file_name)
            logger.info(f"[{idx + 1}/{len(filepaths)}] 📦 Copying {src} to {dst}...")
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            shutil.copyfile(src, dst)

            logger.info(f"[{idx + 1}/{len(filepaths)}] 🔎 Validating {dst} and column mapping...")
            validate_file(path=dst, task=self.task, file_ext=file_extension, col_mapping=col_mapping)

            dataset_repo.lfs_track(patterns=[f"raw/*.{file_extension}"])

        dataset_repo.git_pull()

        try:
            logger.info("☁ Uploading files to the dataset hub...")
            dataset_repo.push_to_hub(commit_message="Upload from AutoNLP CLI")
            logger.info("✅ Successfully uploaded  the files!")
        except OSError as err:
            if "nothing to commit, working tree clean" in err.args[0]:
                logger.info("❔ Files did not change since last upload!")
                dataset_repo.git_push()
                return
            logger.error("❌ Something went wrong when uploading the files!")
            raise

        for idx, file_path in enumerate(filepaths):
            file_name = os.path.basename(file_path)
            logger.info(f"[{idx + 1}/{len(filepaths)}] 📁 Registering file {file_name} into project '{file_name}'...")
            payload = {
                "split": split,
                "col_mapping": col_mapping,
                "data_files": [{"fname": file_name, "username": self.user}],
            }
            http_post(path=f"/projects/{self.proj_id}/data/add", payload=payload, token=self._token)
            logger.info(f"[{idx + 1}/{len(filepaths)}] ✅ Success!")
Example #2
    @classmethod
    def _create_or_get_repo(
        cls,
        repo_path_or_name: Optional[str] = None,
        repo_url: Optional[str] = None,
        organization: Optional[str] = None,
        private: Optional[bool] = None,
        use_auth_token: Optional[Union[bool, str]] = None,
    ) -> Repository:
        if repo_path_or_name is None and repo_url is None:
            raise ValueError(
                "You need to specify a `repo_path_or_name` or a `repo_url`.")

        if use_auth_token is None and repo_url is None:
            use_auth_token = True

        if repo_path_or_name is None:
            repo_path_or_name = repo_url.split("/")[-1]

        if repo_url is None and not os.path.exists(repo_path_or_name):
            repo_name = Path(repo_path_or_name).name
            repo_url = cls._get_repo_url_from_name(
                repo_name,
                organization=organization,
                private=private,
                use_auth_token=use_auth_token)

        # Create a working directory if it does not exist.
        if not os.path.exists(repo_path_or_name):
            os.makedirs(repo_path_or_name)

        repo = Repository(repo_path_or_name,
                          clone_from=repo_url,
                          use_auth_token=use_auth_token)
        repo.git_pull()
        return repo
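A minimal usage sketch for _create_or_get_repo; `MyModel` and the repository name are assumptions, and `use_auth_token=True` reuses the token saved by `huggingface-cli login`.

# Hypothetical call; `MyModel` is assumed to expose _create_or_get_repo.
repo = MyModel._create_or_get_repo(
    repo_path_or_name="my-model",  # local working dir and Hub repo name
    organization=None,             # None pushes under the user namespace
    private=False,
    use_auth_token=True,           # read the locally saved Hub token
)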

    def make_model_card(self, key: str):
        model_root: Path = self.root / key
        try:
            repo = Repository(
                model_root,
                clone_from=f"https://huggingface.co/glasses/{key}.git")
            repo.git_pull()
            model: nn.Module = AutoModel.from_name(key)
            doc: str = model.__doc__
            file_path: Path = model_root / "README.rst"
            file_path_md: Path = model_root / "README.md"

            with open(file_path, "w") as f:
                f.write(doc)

            os.system(f"pandoc -s -o {str(file_path_md)} {str(file_path)}")

            with open(file_path_md, "r") as f:
                text: str = f.read()
                text = text.replace(">", "")
                text = text.replace("{.sourceCode .python}", "python")

            text = f"# {key}\n" + text
            text = text.split("Args:")[0]
            # prepend the tags and datasets
            with open("./glasses/utils/prepend.md", "r") as f:
                text = f"{f.read()}{text}"

            with open(file_path_md, "w") as f:
                f.write(text)

            file_path.unlink()
            repo.push_to_hub()
        except OSError as e:
            print(key, e)
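The os.system call above silently ignores pandoc failures. A small sketch of the same conversion step using subprocess.run, so a non-zero exit raises; the helper name is an assumption and pandoc must be on PATH.

import subprocess
from pathlib import Path

def rst_to_md(src: Path, dst: Path) -> None:
    # Convert an .rst file to Markdown with pandoc; check=True raises
    # CalledProcessError instead of failing silently.
    subprocess.run(["pandoc", "-s", "-o", str(dst), str(src)], check=True)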
Example #4
    def _clone_dataset_repo(self) -> Repository:
        local_dataset_dir = os.path.expanduser(f"~/.huggingface/autonlp/projects/{self.dataset_id}")
        if os.path.exists(local_dataset_dir):
            if os.path.isdir(os.path.join(local_dataset_dir, ".git")):
                clone_from = None
            else:
                shutil.rmtree(local_dataset_dir)
                clone_from = "https://huggingface.co/datasets/" + self.dataset_id
        else:
            clone_from = "https://huggingface.co/datasets/" + self.dataset_id
        dataset_repo = Repository(
            local_dir=local_dataset_dir,
            clone_from=clone_from,
            use_auth_token=self._token,
        )
        try:
            subprocess.run(
                "git reset --hard".split(),
                stderr=subprocess.PIPE,
                stdout=subprocess.PIPE,
                check=True,
                encoding="utf-8",
                cwd=dataset_repo.local_dir,
            )
        except subprocess.CalledProcessError as exc:
            raise EnvironmentError(exc.stderr)
        dataset_repo.git_pull()
        return dataset_repo
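The reset-then-pull step generalizes to any cloned repo. A sketch under the same assumptions (an already-cloned legacy Repository object; the helper name is hypothetical):

import subprocess
from huggingface_hub import Repository

def hard_reset_and_pull(repo: Repository) -> None:
    # Discard local modifications, then fast-forward to the remote,
    # mirroring the try/except block above.
    try:
        subprocess.run(
            ["git", "reset", "--hard"],
            capture_output=True,
            check=True,
            encoding="utf-8",
            cwd=repo.local_dir,
        )
    except subprocess.CalledProcessError as exc:
        raise EnvironmentError(exc.stderr) from exc
    repo.git_pull()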
Example #5
    def _clone_dataset_repo(self) -> Repository:
        local_dataset_dir = os.path.expanduser(
            f"~/.huggingface/autonlp/projects/{self.dataset_id}")
        if os.path.exists(local_dataset_dir):
            if os.path.isdir(os.path.join(local_dataset_dir, ".git")):
                clone_from = None
            else:
                shutil.rmtree(local_dataset_dir)
                clone_from = "https://huggingface.co/datasets/" + self.dataset_id
        else:
            clone_from = "https://huggingface.co/datasets/" + self.dataset_id
        dataset_repo = Repository(
            local_dir=local_dataset_dir,
            clone_from=clone_from,
            use_auth_token=self._token,
        )
        dataset_repo.git_pull()
        return dataset_repo
Example #6
def convert_owlvit_checkpoint(pt_backbone, flax_params, attn_params, pytorch_dump_folder_path, config_path=None):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    repo = Repository(pytorch_dump_folder_path, clone_from=f"google/{pytorch_dump_folder_path}")
    repo.git_pull()

    if config_path is not None:
        config = OwlViTConfig.from_pretrained(config_path)
    else:
        config = OwlViTConfig()

    hf_backbone = OwlViTModel(config).eval()
    hf_model = OwlViTForObjectDetection(config).eval()

    copy_text_model_and_projection(hf_backbone, pt_backbone)
    copy_vision_model_and_projection(hf_backbone, pt_backbone)
    hf_backbone.logit_scale = pt_backbone.logit_scale
    copy_flax_attn_params(hf_backbone, attn_params)

    hf_model.owlvit = hf_backbone
    copy_class_merge_token(hf_model, flax_params)
    copy_class_box_heads(hf_model, flax_params)

    # Save HF model
    hf_model.save_pretrained(repo.local_dir)

    # Initialize feature extractor
    feature_extractor = OwlViTFeatureExtractor(
        size=config.vision_config.image_size, crop_size=config.vision_config.image_size
    )
    # Initialize tokenizer
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32", pad_token="!", model_max_length=16)

    # Initialize processor
    processor = OwlViTProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
    feature_extractor.save_pretrained(repo.local_dir)
    processor.save_pretrained(repo.local_dir)

    repo.git_add()
    repo.git_commit("Upload model and processor")
    repo.git_push()
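The closing git_add / git_commit / git_push sequence is the manual spelling of a single push_to_hub call on this (legacy) Repository class; a one-line equivalent, assuming the same commit message:

# Equivalent to the three calls above.
repo.push_to_hub(commit_message="Upload model and processor")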
Example #7
def push_to_hf(
    repo_name: str,
    serialization_dir: Optional[Union[str, PathLike]] = None,
    archive_path: Optional[Union[str, PathLike]] = None,
    organization: Optional[str] = None,
    commit_message: str = "Update repository",
    local_repo_path: Union[str, PathLike] = "hub",
    use_auth_token: Union[bool, str] = True,
) -> str:
    """Pushes model and related files to the Hugging Face Hub ([hf.co](https://hf.co/))

    # Parameters

    repo_name: `str`
        Name of the repository in the Hugging Face Hub.

    serialization_dir : `Union[str, PathLike]`, optional (default = `None`)
        Full path to a directory with the serialized model.

    archive_path : `Union[str, PathLike]`, optional (default = `None`)
        Full path to the zipped model (e.g. model/model.tar.gz). Use `serialization_dir` if possible.

    organization : `Optional[str]`, optional (default = `None`)
        Name of organization to which the model should be uploaded.

    commit_message: `str` (default=`Update repository`)
        Commit message to use for the push.

    local_repo_path : `Union[str, Path]`, optional (default=`hub`)
        Local directory where the repository will be saved.

    use_auth_token : `Union[bool, str]`, optional (default = `True`)
        The token can be extracted from ``HfApi().login(username, password)`` and is used to authenticate
        against the Hugging Face Hub (useful from Google Colab, for instance). It's automatically retrieved
        if you've run `huggingface-cli login` before.
    """

    if serialization_dir is not None:
        working_dir = Path(serialization_dir)
        if archive_path is not None:
            raise ValueError(
                "serialization_dir and archive_path are mutually exclusive, please just use one."
            )
        if not working_dir.exists() or not working_dir.is_dir():
            raise ValueError(
                f"Can't find path: {serialization_dir}, please point"
                "to a directory with the serialized model.")
    elif archive_path is not None:
        working_dir = Path(archive_path)
        if not working_dir.exists() or (
                not zipfile.is_zipfile(working_dir)
                and not tarfile.is_tarfile(working_dir)):
            raise ValueError(
                f"Can't find path: {archive_path}, please point to a .tar.gz archive "
                "or to a directory with the serialized model.")
        else:
            logging.info(
                "Using the archive_path is discouraged. Using the serialization_dir"
                "will also upload metrics and TensorBoard traces to the Hugging Face Hub."
            )
    else:
        raise ValueError(
            "please specify either serialization_dir or archive_path")

    info_msg = f"Preparing repository '{use_auth_token}'"
    if isinstance(use_auth_token, str):
        huggingface_token = use_auth_token
    elif use_auth_token:
        huggingface_token = HfFolder.get_token()

    # Create the repo (or clone its content if it's nonempty)
    api = HfApi()
    repo_url = api.create_repo(
        name=repo_name,
        token=huggingface_token,
        organization=organization,
        private=False,
        exist_ok=True,
    )

    repo_local_path = Path(local_repo_path) / repo_name
    repo = Repository(repo_local_path,
                      clone_from=repo_url,
                      use_auth_token=use_auth_token)
    repo.git_pull(rebase=True)

    # Model file should be tracked with Git LFS
    repo.lfs_track(["*.th"])
    info_msg = f"Preparing repository '{repo_name}'"
    if organization is not None:
        info_msg += f" ({organization})"
    logging.info(info_msg)

    # Extract information from either serializable directory or a
    # .tar.gz file
    if serialization_dir is not None:
        for filename in working_dir.iterdir():
            _copy_allowed_file(Path(filename), repo_local_path)
    else:
        with tempfile.TemporaryDirectory() as temp_dir:
            extracted_dir = Path(
                cached_path(working_dir, temp_dir, extract_archive=True))
            for filename in extracted_dir.iterdir():
                _copy_allowed_file(Path(filename), repo_local_path)

    _create_model_card(repo_local_path)

    logging.info(f"Pushing repo {repo_name} to the Hugging Face Hub")
    repo.push_to_hub(commit_message=commit_message)

    logging.info(f"View your model in {repo_url}")
    return repo_url
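A hedged example invocation of push_to_hf; the repository name and paths are placeholders.

# Hypothetical invocation; names and paths are placeholders.
url = push_to_hf(
    repo_name="my-allennlp-model",
    serialization_dir="/tmp/my-model/serialization",
    organization=None,              # defaults to the logged-in user
    commit_message="Initial model upload",
    use_auth_token=True,            # token saved by `huggingface-cli login`
)
print(f"View the model at {url}")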
Example #8
    def push_to_huggingface_hub(self):
        """Creates a downstream repository on the Hub and pushes training artifacts to it."""
        if self.args.hf_hub_org.lower() != "none":
            organization = self.args.hf_hub_org
        else:
            organization = os.environ.get("HF_USERNAME")
        huggingface_token = HfFolder.get_token()
        print(f"[Runner] - Organisation to push fine-tuned model to: {organization}")
        
        # Extract upstream repository metadata
        if self.args.hub == "huggingface":
            model_info = HfApi().model_info(self.args.upstream, token=huggingface_token)
            downstream_model_id = model_info.sha
            # Exclude "/" characters from downstream repo ID
            upstream_model_id = model_info.modelId.replace("/", "__")
        else:
            upstream_model_id = self.args.upstream.replace("/", "__")
            downstream_model_id = str(uuid.uuid4())[:8]
        repo_name = f"{upstream_model_id}__{downstream_model_id}"
        # Create downstream repo on the Hub
        repo_url = HfApi().create_repo(
            token=huggingface_token,
            name=repo_name,
            organization=organization,
            exist_ok=True,
            private=False,
        )
        print(f"[Runner] - Created Hub repo: {repo_url}")

        # Download repo
        HF_HUB_DIR = "hf_hub"
        REPO_ROOT_DIR = os.path.join(self.args.expdir, HF_HUB_DIR, repo_name)
        REPO_TASK_DIR = os.path.join(REPO_ROOT_DIR, self.args.downstream, self.args.expname)
        print(f"[Runner] - Cloning Hub repo to {REPO_ROOT_DIR}")
        model_repo = Repository(
            local_dir=REPO_ROOT_DIR, clone_from=repo_url, use_auth_token=huggingface_token
        )
        # Pull latest changes if they exist
        model_repo.git_pull()

        # Copy checkpoints, tensorboard logs, and args / configs
        # Note that this copies all files from the experiment directory,
        # including those from multiple runs
        shutil.copytree(self.args.expdir, REPO_TASK_DIR, dirs_exist_ok=True, ignore=shutil.ignore_patterns(HF_HUB_DIR))

        # By default we use model.ckpt in the PreTrainedModel interface, so
        # rename the best checkpoint to match this convention
        checkpoints = list(Path(REPO_TASK_DIR).glob("*best*.ckpt"))
        if len(checkpoints) == 0:
            print("[Runner] - Did not find a best checkpoint! Using the final checkpoint instead ...")
            CKPT_PATH = os.path.join(
                REPO_TASK_DIR, f"states-{self.config['runner']['total_steps']}.ckpt"
            )
        elif len(checkpoints) > 1:
            print(f"[Runner] - More than one best checkpoint found! Using {checkpoints[0]} as default ...")
            CKPT_PATH = checkpoints[0]
        else:
            print(f"[Runner] - Found best checkpoint {checkpoints[0]}!")
            CKPT_PATH = checkpoints[0]
        shutil.move(CKPT_PATH, os.path.join(REPO_TASK_DIR, "model.ckpt"))
        model_repo.lfs_track("*.ckpt")

        # Write model card
        self._create_model_card(REPO_ROOT_DIR)

        # Push everything to the Hub
        print("[Runner] - Pushing model files to the Hub ...")
        model_repo.push_to_hub()
        print("[Runner] - Training run complete!")