def upload(self, filepaths: List[str], split: str, col_mapping: Dict[str, str]):
    """Uploads files to the project"""
    local_dataset_dir = os.path.expanduser(f"~/.huggingface/autonlp/projects/{self.dataset_id}")
    if os.path.exists(local_dataset_dir):
        if os.path.isdir(os.path.join(local_dataset_dir, ".git")):
            clone_from = None
        else:
            shutil.rmtree(local_dataset_dir)
            clone_from = "https://huggingface.co/datasets/" + self.dataset_id
    else:
        clone_from = "https://huggingface.co/datasets/" + self.dataset_id

    dataset_repo = Repository(
        local_dir=local_dataset_dir,
        clone_from=clone_from,
        use_auth_token=self._token,
    )
    dataset_repo.git_pull()

    for idx, file_path in enumerate(filepaths):
        if not os.path.isfile(file_path):
            logger.error(f"[{idx + 1}/{len(filepaths)}] ❌ '{file_path}' does not exist or is not a file!")
            continue
        file_name = os.path.basename(file_path)
        file_extension = file_name.split(".")[-1]
        src = os.path.expanduser(file_path)
        dst = os.path.join(local_dataset_dir, "raw", file_name)
        logger.info(f"[{idx + 1}/{len(filepaths)}] 📦 Copying {src} to {dst}...")
        os.makedirs(os.path.dirname(dst), exist_ok=True)
        shutil.copyfile(src, dst)
        logger.info(f"[{idx + 1}/{len(filepaths)}] 🔎 Validating {dst} and column mapping...")
        validate_file(path=dst, task=self.task, file_ext=file_extension, col_mapping=col_mapping)
        dataset_repo.lfs_track(patterns=[f"raw/*.{file_extension}"])

    dataset_repo.git_pull()

    try:
        logger.info("☁ Uploading files to the dataset hub...")
        dataset_repo.push_to_hub(commit_message="Upload from AutoNLP CLI")
        logger.info("✅ Successfully uploaded the files!")
    except OSError as err:
        if "nothing to commit, working tree clean" in err.args[0]:
            logger.info("❔ Files did not change since last upload!")
            dataset_repo.git_push()
            return
        logger.error("❌ Something went wrong when uploading the files!")
        raise

    for idx, file_path in enumerate(filepaths):
        file_name = os.path.basename(file_path)
        logger.info(f"[{idx + 1}/{len(filepaths)}] 📁 Registering file {file_name} into project '{self.proj_id}'...")
        payload = {
            "split": split,
            "col_mapping": col_mapping,
            "data_files": [{"fname": file_name, "username": self.user}],
        }
        http_post(path=f"/projects/{self.proj_id}/data/add", payload=payload, token=self._token)
        logger.info(f"[{idx + 1}/{len(filepaths)}] ✅ Success!")

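# --- Usage sketch (illustrative, not from the source) ---
# A minimal example of calling the `upload` method above. The `project` object,
# file path, and column mapping are hypothetical placeholders; the method itself
# clones or refreshes the dataset repo, validates each file, pushes via Git LFS,
# and then registers the files with the AutoNLP API.
project.upload(
    filepaths=["~/datasets/train.csv"],
    split="train",
    col_mapping={"text": "text", "label": "target"},
)
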
def _create_or_get_repo(
    cls,
    repo_path_or_name: Optional[str] = None,
    repo_url: Optional[str] = None,
    organization: Optional[str] = None,
    private: Optional[bool] = None,
    use_auth_token: Optional[Union[bool, str]] = None,
) -> Repository:
    if repo_path_or_name is None and repo_url is None:
        raise ValueError("You need to specify a `repo_path_or_name` or a `repo_url`.")

    if use_auth_token is None and repo_url is None:
        use_auth_token = True

    if repo_path_or_name is None:
        repo_path_or_name = repo_url.split("/")[-1]

    if repo_url is None and not os.path.exists(repo_path_or_name):
        repo_name = Path(repo_path_or_name).name
        repo_url = cls._get_repo_url_from_name(
            repo_name, organization=organization, private=private, use_auth_token=use_auth_token
        )

    # Create a working directory if it does not exist.
    if not os.path.exists(repo_path_or_name):
        os.makedirs(repo_path_or_name)

    repo = Repository(repo_path_or_name, clone_from=repo_url, use_auth_token=use_auth_token)
    repo.git_pull()
    return repo

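# --- Usage sketch (illustrative, not from the source) ---
# How `_create_or_get_repo` above might be invoked. `MyModelClass` is a
# hypothetical class assumed to define the method as a classmethod (it takes
# `cls`); the repo name is a placeholder. The call either clones the existing
# Hub repo into `./my-finetuned-model` or creates it, then pulls the latest
# changes and returns the `Repository` handle.
repo = MyModelClass._create_or_get_repo(
    repo_path_or_name="my-finetuned-model",
    organization=None,
    private=False,
    use_auth_token=True,
)
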
def make_model_card(self, key: str):
    model_root: Path = self.root / key
    try:
        repo = Repository(model_root, clone_from=f"https://huggingface.co/glasses/{key}.git")
        repo.git_pull()
        model: nn.Module = AutoModel.from_name(key)
        doc: str = model.__doc__
        file_path: Path = model_root / "README.rst"
        file_path_md: Path = model_root / "README.md"
        with open(file_path, "w") as f:
            f.write(doc)
        os.system(f"pandoc -s -o {str(file_path_md)} {str(file_path)}")
        with open(file_path_md, "r") as f:
            text: str = f.read()
        text = text.replace(">", "")
        text = text.replace("{.sourceCode .python}", "python")
        text = f"# {key}\n" + text
        text = text.split("Args:")[0]
        # prepend the tags, datasets
        with open("./glasses/utils/prepend.md", "r") as f:
            text = f"{f.read()}{text}"
        with open(file_path_md, "w") as f:
            f.write(text)
        file_path.unlink()
        repo.push_to_hub()
    except OSError as e:
        print(key, e)

def _clone_dataset_repo(self) -> Repository:
    local_dataset_dir = os.path.expanduser(f"~/.huggingface/autonlp/projects/{self.dataset_id}")
    if os.path.exists(local_dataset_dir):
        if os.path.isdir(os.path.join(local_dataset_dir, ".git")):
            clone_from = None
        else:
            shutil.rmtree(local_dataset_dir)
            clone_from = "https://huggingface.co/datasets/" + self.dataset_id
    else:
        clone_from = "https://huggingface.co/datasets/" + self.dataset_id

    dataset_repo = Repository(
        local_dir=local_dataset_dir,
        clone_from=clone_from,
        use_auth_token=self._token,
    )

    try:
        subprocess.run(
            "git reset --hard".split(),
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE,
            check=True,
            encoding="utf-8",
            cwd=dataset_repo.local_dir,
        )
    except subprocess.CalledProcessError as exc:
        raise EnvironmentError(exc.stderr)

    dataset_repo.git_pull()
    return dataset_repo

def _clone_dataset_repo(self) -> Repository:
    local_dataset_dir = os.path.expanduser(f"~/.huggingface/autonlp/projects/{self.dataset_id}")
    if os.path.exists(local_dataset_dir):
        if os.path.isdir(os.path.join(local_dataset_dir, ".git")):
            clone_from = None
        else:
            shutil.rmtree(local_dataset_dir)
            clone_from = "https://huggingface.co/datasets/" + self.dataset_id
    else:
        clone_from = "https://huggingface.co/datasets/" + self.dataset_id

    dataset_repo = Repository(
        local_dir=local_dataset_dir,
        clone_from=clone_from,
        use_auth_token=self._token,
    )
    dataset_repo.git_pull()
    return dataset_repo

def convert_owlvit_checkpoint(pt_backbone, flax_params, attn_params, pytorch_dump_folder_path, config_path=None):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    repo = Repository(pytorch_dump_folder_path, clone_from=f"google/{pytorch_dump_folder_path}")
    repo.git_pull()

    if config_path is not None:
        config = OwlViTConfig.from_pretrained(config_path)
    else:
        config = OwlViTConfig()

    hf_backbone = OwlViTModel(config).eval()
    hf_model = OwlViTForObjectDetection(config).eval()

    copy_text_model_and_projection(hf_backbone, pt_backbone)
    copy_vision_model_and_projection(hf_backbone, pt_backbone)
    hf_backbone.logit_scale = pt_backbone.logit_scale
    copy_flax_attn_params(hf_backbone, attn_params)

    hf_model.owlvit = hf_backbone
    copy_class_merge_token(hf_model, flax_params)
    copy_class_box_heads(hf_model, flax_params)

    # Save HF model
    hf_model.save_pretrained(repo.local_dir)

    # Initialize feature extractor
    feature_extractor = OwlViTFeatureExtractor(
        size=config.vision_config.image_size, crop_size=config.vision_config.image_size
    )
    # Initialize tokenizer
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32", pad_token="!", model_max_length=16)
    # Initialize processor
    processor = OwlViTProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
    feature_extractor.save_pretrained(repo.local_dir)
    processor.save_pretrained(repo.local_dir)

    repo.git_add()
    repo.git_commit("Upload model and processor")
    repo.git_push()

def push_to_hf(
    repo_name: str,
    serialization_dir: Optional[Union[str, PathLike]] = None,
    archive_path: Optional[Union[str, PathLike]] = None,
    organization: Optional[str] = None,
    commit_message: str = "Update repository",
    local_repo_path: Union[str, PathLike] = "hub",
    use_auth_token: Union[bool, str] = True,
) -> str:
    """
    Pushes model and related files to the Hugging Face Hub ([hf.co](https://hf.co/)).

    # Parameters

    repo_name : `str`
        Name of the repository in the Hugging Face Hub.
    serialization_dir : `Union[str, PathLike]`, optional (default = `None`)
        Full path to a directory with the serialized model.
    archive_path : `Union[str, PathLike]`, optional (default = `None`)
        Full path to the zipped model (e.g. model/model.tar.gz). Use `serialization_dir` if possible.
    organization : `Optional[str]`, optional (default = `None`)
        Name of the organization to which the model should be uploaded.
    commit_message : `str` (default = `"Update repository"`)
        Commit message to use for the push.
    local_repo_path : `Union[str, PathLike]`, optional (default = `"hub"`)
        Local directory where the repository will be saved.
    use_auth_token : `Union[bool, str]`, optional (default = `True`)
        The Hugging Face token can be extracted from `HfApi().login(username, password)` and is used
        to authenticate against the Hugging Face Hub (useful from Google Colab, for instance). It's
        automatically retrieved if you've done `huggingface-cli login` before.
    """
    if serialization_dir is not None:
        working_dir = Path(serialization_dir)
        if archive_path is not None:
            raise ValueError(
                "serialization_dir and archive_path are mutually exclusive, please just use one."
            )
        if not working_dir.exists() or not working_dir.is_dir():
            raise ValueError(
                f"Can't find path: {serialization_dir}, please point "
                "to a directory with the serialized model."
            )
    elif archive_path is not None:
        working_dir = Path(archive_path)
        if (
            not working_dir.exists()
            or not zipfile.is_zipfile(working_dir)
            and not tarfile.is_tarfile(working_dir)
        ):
            raise ValueError(
                f"Can't find path: {archive_path}, please point to a .tar.gz archive "
                "or to a directory with the serialized model."
            )
        else:
            logging.info(
                "Using the archive_path is discouraged. Using the serialization_dir "
                "will also upload metrics and TensorBoard traces to the Hugging Face Hub."
            )
    else:
        raise ValueError("please specify either serialization_dir or archive_path")

    if isinstance(use_auth_token, str):
        huggingface_token = use_auth_token
    elif use_auth_token:
        huggingface_token = HfFolder.get_token()

    # Create the repo (or clone its content if it's nonempty)
    api = HfApi()
    repo_url = api.create_repo(
        name=repo_name,
        token=huggingface_token,
        organization=organization,
        private=False,
        exist_ok=True,
    )

    repo_local_path = Path(local_repo_path) / repo_name
    repo = Repository(repo_local_path, clone_from=repo_url, use_auth_token=use_auth_token)
    repo.git_pull(rebase=True)

    # Model file should be tracked with Git LFS
    repo.lfs_track(["*.th"])
    info_msg = f"Preparing repository '{repo_name}'"
    if organization is not None:
        info_msg += f" ({organization})"
    logging.info(info_msg)

    # Extract information from either the serialization directory or a .tar.gz file
    if serialization_dir is not None:
        for filename in working_dir.iterdir():
            _copy_allowed_file(Path(filename), repo_local_path)
    else:
        with tempfile.TemporaryDirectory() as temp_dir:
            extracted_dir = Path(cached_path(working_dir, temp_dir, extract_archive=True))
            for filename in extracted_dir.iterdir():
                _copy_allowed_file(Path(filename), repo_local_path)

    _create_model_card(repo_local_path)

    logging.info(f"Pushing repo {repo_name} to the Hugging Face Hub")
    repo.push_to_hub(commit_message=commit_message)

    logging.info(f"View your model in {repo_url}")
    return repo_url

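# --- Usage sketch (illustrative, not from the source) ---
# One way `push_to_hf` above could be called after training. The repo name and
# serialization directory are hypothetical placeholders; with these arguments
# the function creates (or reuses) the Hub repo, copies the serialized files,
# writes a model card, and pushes everything in a single commit.
repo_url = push_to_hf(
    repo_name="my-tagger",
    serialization_dir="/tmp/my-tagger/serialization",
    commit_message="Initial model upload",
)
print(f"Model pushed to {repo_url}")
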
def push_to_huggingface_hub(self):
    """Creates a downstream repository on the Hub and pushes training artifacts to it."""
    if self.args.hf_hub_org.lower() != "none":
        organization = self.args.hf_hub_org
    else:
        organization = os.environ.get("HF_USERNAME")
    huggingface_token = HfFolder.get_token()
    print(f"[Runner] - Organisation to push fine-tuned model to: {organization}")

    # Extract upstream repository metadata
    if self.args.hub == "huggingface":
        model_info = HfApi().model_info(self.args.upstream, token=huggingface_token)
        downstream_model_id = model_info.sha
        # Exclude "/" characters from downstream repo ID
        upstream_model_id = model_info.modelId.replace("/", "__")
    else:
        upstream_model_id = self.args.upstream.replace("/", "__")
        downstream_model_id = str(uuid.uuid4())[:8]

    repo_name = f"{upstream_model_id}__{downstream_model_id}"
    # Create downstream repo on the Hub
    repo_url = HfApi().create_repo(
        token=huggingface_token,
        name=repo_name,
        organization=organization,
        exist_ok=True,
        private=False,
    )
    print(f"[Runner] - Created Hub repo: {repo_url}")

    # Download repo
    HF_HUB_DIR = "hf_hub"
    REPO_ROOT_DIR = os.path.join(self.args.expdir, HF_HUB_DIR, repo_name)
    REPO_TASK_DIR = os.path.join(REPO_ROOT_DIR, self.args.downstream, self.args.expname)
    print(f"[Runner] - Cloning Hub repo to {REPO_ROOT_DIR}")
    model_repo = Repository(
        local_dir=REPO_ROOT_DIR, clone_from=repo_url, use_auth_token=huggingface_token
    )
    # Pull latest changes if they exist
    model_repo.git_pull()

    # Copy checkpoints, tensorboard logs, and args / configs
    # Note that this copies all files from the experiment directory,
    # including those from multiple runs
    shutil.copytree(
        self.args.expdir,
        REPO_TASK_DIR,
        dirs_exist_ok=True,
        ignore=shutil.ignore_patterns(HF_HUB_DIR),
    )

    # By default we use model.ckpt in the PreTrainedModel interface, so
    # rename the best checkpoint to match this convention
    checkpoints = list(Path(REPO_TASK_DIR).glob("*best*.ckpt"))
    if len(checkpoints) == 0:
        print("[Runner] - Did not find a best checkpoint! Using the final checkpoint instead ...")
        CKPT_PATH = os.path.join(
            REPO_TASK_DIR, f"states-{self.config['runner']['total_steps']}.ckpt"
        )
    elif len(checkpoints) > 1:
        print(f"[Runner] - More than one best checkpoint found! Using {checkpoints[0]} as default ...")
        CKPT_PATH = checkpoints[0]
    else:
        print(f"[Runner] - Found best checkpoint {checkpoints[0]}!")
        CKPT_PATH = checkpoints[0]

    shutil.move(CKPT_PATH, os.path.join(REPO_TASK_DIR, "model.ckpt"))
    model_repo.lfs_track("*.ckpt")

    # Write model card
    self._create_model_card(REPO_ROOT_DIR)

    # Push everything to the Hub
    print("[Runner] - Pushing model files to the Hub ...")
    model_repo.push_to_hub()
    print("[Runner] - Training run complete!")