def get_hash(self):
    if not self.use_cache:
        return ostage(
            self.repo.odb.local,
            self.path_info,
            self.fs,
            self.fs.PARAM_CHECKSUM,
        ).hash_info
    return ostage(
        self.odb, self.path_info, self.fs, self.odb.fs.PARAM_CHECKSUM
    ).hash_info
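# A minimal stand-in for the `hash_info` that `ostage(...)` returns above.
# This is a sketch only: the real class is dvc.hash_info.HashInfo, and the
# exact field set shown here is an assumption.
from dataclasses import dataclass


@dataclass
class HashInfoSketch:
    name: str   # checksum name, e.g. "md5"
    value: str  # hex digest; directory objects carry a ".dir" suffix


hi = HashInfoSketch(name="md5", value="8c7dd922ad47494fc02c388e12c00eac")
print(hi.name, hi.value)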
def save(self):
    if not self.exists:
        raise self.DoesNotExistError(self)

    # isfile/isdir are methods and must be called; the bare bound method
    # is always truthy, so the un-called form could never raise here.
    if not self.isfile() and not self.isdir():
        raise self.IsNotFileOrDirError(self)

    if self.is_empty:
        logger.warning(f"'{self}' is empty.")

    self.ignore()

    if self.metric or self.plot:
        self.verify_metric()

    if not self.use_cache:
        self.hash_info = self.get_hash()
        if not self.IS_DEPENDENCY:
            logger.debug(
                "Output '%s' doesn't use cache. Skipping saving.", self
            )
        return

    assert not self.IS_DEPENDENCY

    if not self.changed():
        logger.debug("Output '%s' didn't change. Skipping saving.", self)
        return

    self.obj = ostage(self.odb, self.path_info, self.fs)
    self.hash_info = self.obj.hash_info
    self.isexec = self.isfile() and self.fs.isexec(self.path_info)
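# Why the parentheses on isfile()/isdir() matter above: a bound method
# object is always truthy, so `not self.isfile` is always False. A quick
# self-contained illustration:
class _Demo:
    def isfile(self):
        return False


d = _Demo()
print(bool(d.isfile))    # True  -- the method object itself is truthy
print(bool(d.isfile()))  # False -- the value the method returns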
def commit(self, filter_info=None):
    if not self.exists:
        raise self.DoesNotExistError(self)

    assert self.hash_info

    if self.use_cache:
        obj = ostage(
            self.odb,
            filter_info or self.path_info,
            self.fs,
            self.odb.fs.PARAM_CHECKSUM,
            dvcignore=self.dvcignore,
        )
        objects.save(self.odb, obj)
        checkout(
            filter_info or self.path_info,
            self.fs,
            obj,
            self.odb,
            relink=True,
            dvcignore=self.dvcignore,
            state=self.repo.state,
        )
        self.set_exec()
def _to_checksum(output):
    # Relies on `on_working_fs` and `repo` from the enclosing scope.
    if on_working_fs:
        return ostage(
            repo.odb.local,
            output.path_info,
            repo.odb.local.fs,
        ).hash_info.value
    return output.hash_info.value
def _output_paths(repo, targets):
    from dvc.fs.local import LocalFileSystem
    from dvc.objects.stage import stage as ostage

    on_working_fs = isinstance(repo.fs, LocalFileSystem)

    def _exists(output):
        if on_working_fs:
            return output.exists
        return True

    def _to_path(output):
        # Directory outputs are reported with a trailing separator.
        return (
            str(output)
            if not output.is_dir_checksum
            else os.path.join(str(output), "")
        )

    for output in repo.index.outs:
        if _exists(output):
            yield_output = targets is None or any(
                output.path_info.isin_or_eq(target) for target in targets
            )

            if on_working_fs:
                _, _, obj = ostage(
                    repo.odb.local,
                    output.path_info,
                    repo.odb.local.fs,
                    "md5",
                    dry_run=True,
                    dvcignore=output.dvcignore,
                )
                hash_info = obj.hash_info
            else:
                hash_info = output.hash_info
                obj = output.get_obj()

            if yield_output:
                yield _to_path(output), hash_info.value

            if not obj:
                continue

            if output.is_dir_checksum and (
                yield_output
                or any(target.isin(output.path_info) for target in targets)
            ):
                yield from _dir_output_paths(output.path_info, obj, targets)
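# A quick check of the `_to_path` convention above: joining with an empty
# final component appends the platform separator, which is how directory
# outputs are distinguished downstream.
import os

print(os.path.join("data", ""))  # "data/" on POSIX ("data\\" on Windows)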
def commit(self, filter_info=None):
    if not self.exists:
        raise self.DoesNotExistError(self)

    assert self.hash_info

    if self.use_cache:
        obj = ostage(self.odb, filter_info or self.path_info, self.fs)
        objects.save(self.odb, obj)
        checkout(
            filter_info or self.path_info,
            self.fs,
            obj,
            self.odb,
            relink=True,
        )
        self.set_exec()
def transfer(
    self, source, odb=None, jobs=None, update=False, no_progress_bar=False
):
    from dvc.fs import get_cloud_fs

    if odb is None:
        odb = self.odb

    cls, config = get_cloud_fs(self.repo, url=source)
    from_fs = cls(**config)
    from_info = from_fs.path_info

    # When running import-url --to-remote / add --to-remote/-o ..., we
    # assume it is unlikely that the odb already contains the majority of
    # the hashes, so we transfer everything as-is (even if a file might
    # already be in the cache) rather than waste an upload scanning the
    # layout of the source location. But when doing update --to-remote,
    # there is a high probability that the odb already contains some of
    # the hashes, so we first calculate all the hashes (without
    # transferring anything) and then upload only the missing cache files.
    upload = not (update and from_fs.isdir(from_info))
    jobs = jobs or min((from_fs.jobs, odb.fs.jobs))
    obj = ostage(
        odb,
        from_info,
        from_fs,
        "md5",
        upload=upload,
        jobs=jobs,
        no_progress_bar=no_progress_bar,
    )
    osave(odb, obj, jobs=jobs, move=upload)

    self.hash_info = obj.hash_info
    return obj
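# A self-contained sketch of the `upload` heuristic described in the comment
# above. The helper name is hypothetical, not DVC API: `update` mirrors the
# update --to-remote case, `is_dir` is whether the source URL is a directory.
def _should_upload_directly(update: bool, is_dir: bool) -> bool:
    # Only update --to-remote on a directory stages hashes first and then
    # transfers the missing objects; everything else uploads as-is.
    return not (update and is_dir)


assert _should_upload_directly(update=False, is_dir=True)     # add --to-remote
assert _should_upload_directly(update=False, is_dir=False)    # import-url --to-remote
assert not _should_upload_directly(update=True, is_dir=True)  # update --to-remote, dir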
def get_hash(self):
    if not self.use_cache:
        # Calls the module-level get_hash helper, not this method.
        return get_hash(self.path_info, self.fs, self.fs.PARAM_CHECKSUM)
    return ostage(self.odb, self.path_info, self.fs).hash_info
def _process_stages(
    repo, sub_targets, stages, no_commit, pbar, to_remote, to_cache, **kwargs
):
    link_failures = []

    from dvc.dvcfile import Dvcfile

    from ..output.base import OutputDoesNotExistError

    if to_remote or to_cache:
        # Already verified in add(): exactly one stage, target, and output.
        (stage,) = stages
        (target,) = sub_targets
        (out,) = stage.outs

        if to_remote:
            out.hash_info = repo.cloud.transfer(
                target,
                jobs=kwargs.get("jobs"),
                remote=kwargs.get("remote"),
                command="add",
            )
        else:
            from dvc.fs import get_cloud_fs
            from dvc.objects import save as osave
            from dvc.objects.stage import stage as ostage

            from_fs = get_cloud_fs(repo, url=target)
            jobs = kwargs.get("jobs", min((from_fs.jobs, out.odb.fs.jobs)))
            obj = ostage(
                out.odb,
                from_fs.path_info,
                from_fs,
                "md5",
                upload=True,
                jobs=jobs,
            )
            osave(out.odb, obj, jobs=jobs, move=False)
            out.hash_info = obj.hash_info
            out.checkout()

        Dvcfile(repo, stage.path).dump(stage)
        return link_failures

    with Tqdm(
        total=len(stages),
        desc="Processing",
        unit="file",
        disable=len(stages) == 1,
    ) as pbar_stages:
        for stage in stages:
            try:
                stage.save()
            except OutputDoesNotExistError:
                pbar.n -= 1
                raise

            try:
                if not no_commit:
                    stage.commit()
            except CacheLinkError:
                link_failures.append(stage)

            Dvcfile(repo, stage.path).dump(stage)
            pbar_stages.update()

    return link_failures
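# The one-element unpacking used above doubles as a sanity check: it raises
# ValueError if add() ever hands this helper more than one stage. A minimal
# illustration with plain lists:
(only,) = ["stage"]  # ok
try:
    (only,) = ["a", "b"]
except ValueError as exc:
    print(exc)  # too many values to unpack (expected 1)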