# AutoNLP CLI: upload dataset files into a project's dataset repo on the Hub.
# `logger`, `validate_file`, and `http_post` are helpers defined elsewhere in
# the autonlp package.
import os
import shutil
from typing import Dict, List

from huggingface_hub import Repository


def upload(self, filepaths: List[str], split: str, col_mapping: Dict[str, str]):
    """Uploads files to the project"""
    local_dataset_dir = os.path.expanduser(f"~/.huggingface/autonlp/projects/{self.dataset_id}")
    if os.path.exists(local_dataset_dir):
        # Reuse the local directory only if it is already a git clone;
        # otherwise wipe it and clone the dataset repo from scratch.
        if os.path.isdir(os.path.join(local_dataset_dir, ".git")):
            clone_from = None
        else:
            shutil.rmtree(local_dataset_dir)
            clone_from = "https://huggingface.co/datasets/" + self.dataset_id
    else:
        clone_from = "https://huggingface.co/datasets/" + self.dataset_id

    dataset_repo = Repository(
        local_dir=local_dataset_dir,
        clone_from=clone_from,
        use_auth_token=self._token,
    )
    dataset_repo.git_pull()

    for idx, file_path in enumerate(filepaths):
        if not os.path.isfile(file_path):
            logger.error(f"[{idx + 1}/{len(filepaths)}] ❌ '{file_path}' does not exist or is not a file!")
            continue
        file_name = os.path.basename(file_path)
        file_extension = file_name.split(".")[-1]
        src = os.path.expanduser(file_path)
        dst = os.path.join(local_dataset_dir, "raw", file_name)
        logger.info(f"[{idx + 1}/{len(filepaths)}] 📦 Copying {src} to {dst}...")
        os.makedirs(os.path.dirname(dst), exist_ok=True)
        shutil.copyfile(src, dst)
        logger.info(f"[{idx + 1}/{len(filepaths)}] 🔎 Validating {dst} and column mapping...")
        validate_file(path=dst, task=self.task, file_ext=file_extension, col_mapping=col_mapping)
        dataset_repo.lfs_track(patterns=[f"raw/*.{file_extension}"])

    dataset_repo.git_pull()

    try:
        logger.info("☁ Uploading files to the dataset hub...")
        dataset_repo.push_to_hub(commit_message="Upload from AutoNLP CLI")
        logger.info("✅ Successfully uploaded the files!")
    except OSError as err:
        if "nothing to commit, working tree clean" in err.args[0]:
            logger.info("❔ Files did not change since last upload!")
            dataset_repo.git_push()
            return
        logger.error("❌ Something went wrong when uploading the files!")
        raise

    for idx, file_path in enumerate(filepaths):
        file_name = os.path.basename(file_path)
        logger.info(f"[{idx + 1}/{len(filepaths)}] 📁 Registering file {file_name} into project '{self.proj_id}'...")
        payload = {
            "split": split,
            "col_mapping": col_mapping,
            "data_files": [{"fname": file_name, "username": self.user}],
        }
        http_post(path=f"/projects/{self.proj_id}/data/add", payload=payload, token=self._token)
        logger.info(f"[{idx + 1}/{len(filepaths)}] ✅ Success!")
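# A hedged usage sketch of the method above. It assumes `project` is an
# instance of the AutoNLP project class this method belongs to (obtained
# elsewhere via the AutoNLP client); the file path and column names are made
# up. Note that paths are checked before tilde expansion, so pass concrete
# paths rather than "~/...".
project.upload(
    filepaths=["data/train.csv"],              # hypothetical local CSV
    split="train",
    col_mapping={"review": "text", "sentiment": "target"},  # made-up columns
)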
# sentence-transformers: push a SentenceTransformer model to the Hub.
import logging
import os
import shutil
import stat
import tempfile
from distutils.dir_util import copy_tree
from typing import Optional

from huggingface_hub import HfApi, HfFolder, Repository


def save_to_hub(self,
                repo_name: str,
                organization: Optional[str] = None,
                private: Optional[bool] = None,
                commit_message: str = "Add new SentenceTransformer model.",
                local_model_path: Optional[str] = None,
                exist_ok: bool = False,
                replace_model_card: bool = False):
    """
    Uploads all elements of this Sentence Transformer to a new HuggingFace Hub repository.

    :param repo_name: Repository name for your model in the Hub.
    :param organization: Organization in which you want to push your model or tokenizer (you must be a member of this organization).
    :param private: Set to true for hosting a private model.
    :param commit_message: Message to commit while pushing.
    :param local_model_path: Path of the model locally. If set, this file path will be uploaded. Otherwise, the current model will be uploaded.
    :param exist_ok: If true, saving to an existing repository is OK. If false, saving only to a new repository is possible.
    :param replace_model_card: If true, replace an existing model card in the hub with the automatically created model card.
    :return: The url of the commit of your model in the given repository.
    """
    token = HfFolder.get_token()
    if token is None:
        raise ValueError(
            "You must login to the Hugging Face hub on this computer by typing `transformers-cli login`."
        )

    if '/' in repo_name:
        splits = repo_name.split('/', maxsplit=1)
        if organization is None or organization == splits[0]:
            organization = splits[0]
            repo_name = splits[1]
        else:
            raise ValueError("You passed an invalid repository name: {}.".format(repo_name))

    endpoint = "https://huggingface.co"
    repo_url = HfApi(endpoint=endpoint).create_repo(
        token,
        repo_name,
        organization=organization,
        private=private,
        repo_type=None,
        exist_ok=exist_ok,
    )
    full_model_name = repo_url[len(endpoint) + 1:].strip("/")

    with tempfile.TemporaryDirectory() as tmp_dir:
        # First create the repo (and clone its content if it's nonempty).
        logging.info("Create repository and clone it if it exists")
        repo = Repository(tmp_dir, clone_from=repo_url)

        # If the user provides local files, copy them.
        if local_model_path:
            copy_tree(local_model_path, tmp_dir)
        else:
            # Else, save the model directly into the local repo.
            create_model_card = replace_model_card or not os.path.exists(os.path.join(tmp_dir, 'README.md'))
            self.save(tmp_dir, model_name=full_model_name, create_model_card=create_model_card)

        # Find files larger than 5 MB and track them with git-lfs.
        large_files = []
        for root, dirs, files in os.walk(tmp_dir):
            for filename in files:
                file_path = os.path.join(root, filename)
                rel_path = os.path.relpath(file_path, tmp_dir)
                if os.path.getsize(file_path) > (5 * 1024 * 1024):
                    large_files.append(rel_path)

        if len(large_files) > 0:
            logging.info("Track files with git lfs: {}".format(", ".join(large_files)))
            repo.lfs_track(large_files)

        logging.info("Push model to the hub. This might take a while")
        push_return = repo.push_to_hub(commit_message=commit_message)

        def on_rm_error(func, path, exc_info):
            # path contains the path of the file that couldn't be removed;
            # assume that it's read-only and unlink it.
            try:
                os.chmod(path, stat.S_IWRITE)
                os.unlink(path)
            except Exception:
                pass

        # Remove the .git folder. On Windows, the .git folder might be read-only
        # and cannot be deleted. Hence, try to set write permissions on error.
        try:
            for f in os.listdir(tmp_dir):
                shutil.rmtree(os.path.join(tmp_dir, f), onerror=on_rm_error)
        except Exception as e:
            logging.warning("Error when deleting temp folder: {}".format(str(e)))

    return push_return
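# A hedged usage sketch of `save_to_hub`, assuming sentence-transformers is
# installed and you have already logged in (e.g. via `huggingface-cli login`).
# The repo name is made up; "all-MiniLM-L6-v2" is a real pretrained model.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
url = model.save_to_hub(
    repo_name="my-finetuned-model",   # hypothetical repo name
    private=True,
    exist_ok=True,
)
print(url)  # URL of the commit in the new repository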
# AllenNLP: push a trained model (serialization directory or archive) to the Hub.
# `cached_path`, `_copy_allowed_file`, and `_create_model_card` are AllenNLP
# helpers defined elsewhere in the module.
import logging
import tarfile
import tempfile
import zipfile
from os import PathLike
from pathlib import Path
from typing import Optional, Union

from huggingface_hub import HfApi, HfFolder, Repository


def push_to_hf(
    repo_name: str,
    serialization_dir: Optional[Union[str, PathLike]] = None,
    archive_path: Optional[Union[str, PathLike]] = None,
    organization: Optional[str] = None,
    commit_message: str = "Update repository",
    local_repo_path: Union[str, PathLike] = "hub",
    use_auth_token: Union[bool, str] = True,
) -> str:
    """Pushes model and related files to the Hugging Face Hub ([hf.co](https://hf.co/))

    # Parameters

    repo_name : `str`
        Name of the repository in the Hugging Face Hub.
    serialization_dir : `Union[str, PathLike]`, optional (default = `None`)
        Full path to a directory with the serialized model.
    archive_path : `Union[str, PathLike]`, optional (default = `None`)
        Full path to the zipped model (e.g. model/model.tar.gz). Use `serialization_dir` if possible.
    organization : `Optional[str]`, optional (default = `None`)
        Name of the organization to which the model should be uploaded.
    commit_message : `str` (default = `Update repository`)
        Commit message to use for the push.
    local_repo_path : `Union[str, PathLike]`, optional (default = `hub`)
        Local directory where the repository will be saved.
    use_auth_token : `Union[str, bool]`, optional (default = `True`)
        Token used to authenticate against the Hugging Face Hub (useful from Google Colab,
        for instance). It can be extracted from `HfApi().login(username, password)` and is
        automatically retrieved if you've done `huggingface-cli login` before.
    """
    if serialization_dir is not None:
        working_dir = Path(serialization_dir)
        if archive_path is not None:
            raise ValueError(
                "serialization_dir and archive_path are mutually exclusive, please just use one."
            )
        if not working_dir.exists() or not working_dir.is_dir():
            raise ValueError(
                f"Can't find path: {serialization_dir}, please point "
                "to a directory with the serialized model."
            )
    elif archive_path is not None:
        working_dir = Path(archive_path)
        if not working_dir.exists() or (
            not zipfile.is_zipfile(working_dir) and not tarfile.is_tarfile(working_dir)
        ):
            raise ValueError(
                f"Can't find path: {archive_path}, please point to a .tar.gz archive "
                "or to a directory with the serialized model."
            )
        else:
            logging.info(
                "Using the archive_path is discouraged. Using the serialization_dir "
                "will also upload metrics and TensorBoard traces to the Hugging Face Hub."
            )
    else:
        raise ValueError("Please specify either serialization_dir or archive_path.")

    if isinstance(use_auth_token, str):
        huggingface_token = use_auth_token
    elif use_auth_token:
        huggingface_token = HfFolder.get_token()
    else:
        # Fall back to anonymous access so the name is always bound.
        huggingface_token = None

    # Create the repo (or clone its content if it's nonempty)
    api = HfApi()
    repo_url = api.create_repo(
        name=repo_name,
        token=huggingface_token,
        organization=organization,
        private=False,
        exist_ok=True,
    )

    repo_local_path = Path(local_repo_path) / repo_name
    repo = Repository(repo_local_path, clone_from=repo_url, use_auth_token=use_auth_token)
    repo.git_pull(rebase=True)

    # Model file should be tracked with Git LFS
    repo.lfs_track(["*.th"])

    info_msg = f"Preparing repository '{repo_name}'"
    if organization is not None:
        info_msg += f" ({organization})"
    logging.info(info_msg)

    # Extract information from either the serialization directory or a .tar.gz archive
    if serialization_dir is not None:
        for filename in working_dir.iterdir():
            _copy_allowed_file(Path(filename), repo_local_path)
    else:
        with tempfile.TemporaryDirectory() as temp_dir:
            extracted_dir = Path(cached_path(working_dir, temp_dir, extract_archive=True))
            for filename in extracted_dir.iterdir():
                _copy_allowed_file(Path(filename), repo_local_path)

    _create_model_card(repo_local_path)

    logging.info(f"Pushing repo {repo_name} to the Hugging Face Hub")
    repo.push_to_hub(commit_message=commit_message)

    logging.info(f"View your model at {repo_url}")
    return repo_url
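# A hedged usage sketch of `push_to_hf`. The import path below is an
# assumption about AllenNLP's module layout and may differ between versions;
# the repo and directory names are made up.
from allennlp.common.push_to_hf import push_to_hf  # import path may vary

repo_url = push_to_hf(
    repo_name="my-allennlp-model",       # hypothetical repo name
    serialization_dir="runs/my_model",   # directory produced by `allennlp train`
    commit_message="Initial model upload",
)
print(repo_url)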
# s3prl: push fine-tuned downstream training artifacts to the Hub.
import os
import shutil
import uuid
from pathlib import Path

from huggingface_hub import HfApi, HfFolder, Repository


def push_to_huggingface_hub(self):
    """Creates a downstream repository on the Hub and pushes training artifacts to it."""
    if self.args.hf_hub_org.lower() != "none":
        organization = self.args.hf_hub_org
    else:
        organization = os.environ.get("HF_USERNAME")
    huggingface_token = HfFolder.get_token()
    print(f"[Runner] - Organization to push fine-tuned model to: {organization}")

    # Extract upstream repository metadata
    if self.args.hub == "huggingface":
        model_info = HfApi().model_info(self.args.upstream, token=huggingface_token)
        downstream_model_id = model_info.sha
        # Exclude "/" characters from the upstream model ID
        upstream_model_id = model_info.modelId.replace("/", "__")
    else:
        upstream_model_id = self.args.upstream.replace("/", "__")
        downstream_model_id = str(uuid.uuid4())[:8]
    repo_name = f"{upstream_model_id}__{downstream_model_id}"

    # Create the downstream repo on the Hub
    repo_url = HfApi().create_repo(
        token=huggingface_token,
        name=repo_name,
        organization=organization,
        exist_ok=True,
        private=False,
    )
    print(f"[Runner] - Created Hub repo: {repo_url}")

    # Download the repo
    HF_HUB_DIR = "hf_hub"
    REPO_ROOT_DIR = os.path.join(self.args.expdir, HF_HUB_DIR, repo_name)
    REPO_TASK_DIR = os.path.join(REPO_ROOT_DIR, self.args.downstream, self.args.expname)
    print(f"[Runner] - Cloning Hub repo to {REPO_ROOT_DIR}")
    model_repo = Repository(
        local_dir=REPO_ROOT_DIR, clone_from=repo_url, use_auth_token=huggingface_token
    )
    # Pull latest changes if they exist
    model_repo.git_pull()

    # Copy checkpoints, TensorBoard logs, and args / configs.
    # Note that this copies all files from the experiment directory,
    # including those from multiple runs.
    shutil.copytree(
        self.args.expdir,
        REPO_TASK_DIR,
        dirs_exist_ok=True,
        ignore=shutil.ignore_patterns(HF_HUB_DIR),
    )

    # By default we use model.ckpt in the PreTrainedModel interface, so
    # rename the best checkpoint to match this convention.
    checkpoints = list(Path(REPO_TASK_DIR).glob("*best*.ckpt"))
    if len(checkpoints) == 0:
        print("[Runner] - Did not find a best checkpoint! Using the final checkpoint instead ...")
        CKPT_PATH = os.path.join(REPO_TASK_DIR, f"states-{self.config['runner']['total_steps']}.ckpt")
    elif len(checkpoints) > 1:
        print(f"[Runner] - More than one best checkpoint found! Using {checkpoints[0]} as default ...")
        CKPT_PATH = checkpoints[0]
    else:
        print(f"[Runner] - Found best checkpoint {checkpoints[0]}!")
        CKPT_PATH = checkpoints[0]

    shutil.move(CKPT_PATH, os.path.join(REPO_TASK_DIR, "model.ckpt"))
    model_repo.lfs_track("*.ckpt")

    # Write the model card
    self._create_model_card(REPO_ROOT_DIR)

    # Push everything to the Hub
    print("[Runner] - Pushing model files to the Hub ...")
    model_repo.push_to_hub()
    print("[Runner] - Training run complete!")
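# All of the snippets above share one `Repository` workflow: create (or reuse)
# a Hub repo, clone it locally, copy artifacts in, track large binaries with
# Git LFS, and push. Below is a minimal distilled sketch of that pattern,
# assuming the same legacy `create_repo(token=..., name=...)` signature used
# by the snippets above; all names here are made up.
import shutil

from huggingface_hub import HfApi, HfFolder, Repository

token = HfFolder.get_token()  # assumes `huggingface-cli login` was run
repo_url = HfApi().create_repo(token=token, name="my-artifacts", exist_ok=True)

repo = Repository(local_dir="hub/my-artifacts", clone_from=repo_url, use_auth_token=token)
repo.git_pull()                            # sync before adding new files
shutil.copy("model.ckpt", "hub/my-artifacts/model.ckpt")
repo.lfs_track(["*.ckpt"])                 # route large binaries through Git LFS
repo.push_to_hub(commit_message="Upload checkpoint")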