Example #1
    def get_hash(self):
        # Outputs that bypass the cache are hashed into the repo's local
        # ODB with the filesystem's native checksum; cached outputs are
        # staged through their own ODB.
        if not self.use_cache:
            return ostage(
                self.repo.odb.local,
                self.path_info,
                self.fs,
                self.fs.PARAM_CHECKSUM,
            ).hash_info
        return ostage(
            self.odb,
            self.path_info,
            self.fs,
            self.odb.fs.PARAM_CHECKSUM,
        ).hash_info
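
Across the examples in this section, `ostage` is `dvc.objects.stage.stage` (see the imports in Examples #5 and #9). Reconstructed purely from the call sites shown here, the signature they assume looks roughly like the stub below; this is a reading aid, not the actual dvc/objects/stage.py definition, and the parameter names and defaults are assumptions.

def stage(odb, path_info, fs, name=None, upload=False, jobs=None,
          no_progress_bar=False, dry_run=False, dvcignore=None):
    """Hash the data at path_info on fs into odb's object layout and
    return a staged object exposing `.hash_info` (one call site in
    Example #5 unpacks a 3-tuple instead, a later variant)."""
    raise NotImplementedError  # illustration only
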
Example #2
    def save(self):
        if not self.exists:
            raise self.DoesNotExistError(self)

        if not self.isfile and not self.isdir:
            raise self.IsNotFileOrDirError(self)

        if self.is_empty:
            logger.warning(f"'{self}' is empty.")

        self.ignore()

        if self.metric or self.plot:
            self.verify_metric()

        if not self.use_cache:
            self.hash_info = self.get_hash()
            if not self.IS_DEPENDENCY:
                logger.debug("Output '%s' doesn't use cache. Skipping saving.",
                             self)
            return

        assert not self.IS_DEPENDENCY

        if not self.changed():
            logger.debug("Output '%s' didn't change. Skipping saving.", self)
            return

        self.obj = ostage(self.odb, self.path_info, self.fs)
        self.hash_info = self.obj.hash_info
        self.isexec = self.isfile and self.fs.isexec(self.path_info)
Example #3
    def commit(self, filter_info=None):
        if not self.exists:
            raise self.DoesNotExistError(self)

        assert self.hash_info

        if self.use_cache:
            obj = ostage(
                self.odb,
                filter_info or self.path_info,
                self.fs,
                self.odb.fs.PARAM_CHECKSUM,
                dvcignore=self.dvcignore,
            )
            # Save the staged object into the ODB, then relink the
            # workspace copy against the freshly cached data.
            objects.save(self.odb, obj)
            checkout(
                filter_info or self.path_info,
                self.fs,
                obj,
                self.odb,
                relink=True,
                dvcignore=self.dvcignore,
                state=self.repo.state,
            )
            self.set_exec()
Example #4
    def _to_checksum(output):
        # On a local workspace, rehash the file in place; otherwise fall
        # back to the hash already recorded for the output.
        if on_working_fs:
            return ostage(
                repo.odb.local,
                output.path_info,
                repo.odb.local.fs,
            ).hash_info.value
        return output.hash_info.value
Example #5
def _output_paths(repo, targets):
    import os

    from dvc.fs.local import LocalFileSystem
    from dvc.objects.stage import stage as ostage

    on_working_fs = isinstance(repo.fs, LocalFileSystem)

    def _exists(output):
        if on_working_fs:
            return output.exists
        return True

    def _to_path(output):
        return (
            str(output)
            if not output.is_dir_checksum
            else os.path.join(str(output), "")
        )

    for output in repo.index.outs:
        if _exists(output):
            yield_output = targets is None or any(
                output.path_info.isin_or_eq(target) for target in targets
            )

            if on_working_fs:
                # dry_run: compute hashes without writing anything to the
                # ODB.
                _, _, obj = ostage(
                    repo.odb.local,
                    output.path_info,
                    repo.odb.local.fs,
                    "md5",
                    dry_run=True,
                    dvcignore=output.dvcignore,
                )
                hash_info = obj.hash_info
            else:
                hash_info = output.hash_info
                obj = output.get_obj()

            if yield_output:
                yield _to_path(output), hash_info.value

            if not obj:
                continue

            if output.is_dir_checksum and (
                yield_output
                or any(target.isin(output.path_info) for target in targets)
            ):
                yield from _dir_output_paths(output.path_info, obj, targets)
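
Example #5 delegates to `_dir_output_paths`, which is not shown. Below is a minimal sketch of what it might look like, assuming the staged directory object iterates as (key, entry_obj) pairs whose entries carry their own hash_info, and reusing the `isin_or_eq` matching seen above; a hypothetical reconstruction, not the real helper.

def _dir_output_paths(path_info, obj, targets=None):
    # Hypothetical: walk the directory object's entries and yield
    # (path, hash) pairs for entries that match the requested targets.
    for key, entry_obj in obj:
        entry_path = path_info.joinpath(*key)
        if targets is None or any(
            entry_path.isin_or_eq(target) for target in targets
        ):
            yield str(entry_path), entry_obj.hash_info.value
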
Example #6
    def commit(self, filter_info=None):
        if not self.exists:
            raise self.DoesNotExistError(self)

        assert self.hash_info

        if self.use_cache:
            obj = ostage(self.odb, filter_info or self.path_info, self.fs)
            objects.save(self.odb, obj)
            checkout(
                filter_info or self.path_info,
                self.fs,
                obj,
                self.odb,
                relink=True,
            )
            self.set_exec()
Example #7
    def transfer(
        self,
        source,
        odb=None,
        jobs=None,
        update=False,
        no_progress_bar=False,
    ):
        from dvc.fs import get_cloud_fs
        from dvc.objects import save as osave
        from dvc.objects.stage import stage as ostage

        if odb is None:
            odb = self.odb

        cls, config = get_cloud_fs(self.repo, url=source)
        from_fs = cls(**config)
        from_info = from_fs.path_info

        # When running import-url --to-remote / add --to-remote/-o ... we
        # assume that it is unlikely that the odb will contain the majority
        # of the hashes, so we transfer everything as is (even if a file
        # might already be in the cache) and don't waste time scanning the
        # layout of the source location first. But when doing update
        # --to-remote, there is a high probability that the odb already
        # contains some of the hashes, so we first calculate all the hashes
        # (but don't transfer anything) and then only upload the missing
        # cache files.

        upload = not (update and from_fs.isdir(from_info))
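        # For reference, how this heuristic resolves (derived only from
        # the expression above):
        #   import-url/add --to-remote -> update=False     -> upload=True
        #   update --to-remote (dir)   -> update and isdir -> upload=False
        #   update --to-remote (file)  -> not isdir        -> upload=True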
        jobs = jobs or min((from_fs.jobs, odb.fs.jobs))
        obj = ostage(
            odb,
            from_info,
            from_fs,
            "md5",
            upload=upload,
            jobs=jobs,
            no_progress_bar=no_progress_bar,
        )
        osave(odb, obj, jobs=jobs, move=upload)

        self.hash_info = obj.hash_info
        return obj
Example #8
    def get_hash(self):
        # Hash directly when the output bypasses the cache; otherwise
        # stage through the ODB.
        if not self.use_cache:
            return get_hash(self.path_info, self.fs, self.fs.PARAM_CHECKSUM)
        return ostage(self.odb, self.path_info, self.fs).hash_info
Example #9
def _process_stages(
    repo, sub_targets, stages, no_commit, pbar, to_remote, to_cache, **kwargs
):
    from dvc.dvcfile import Dvcfile
    from dvc.exceptions import CacheLinkError
    from dvc.progress import Tqdm

    from ..output.base import OutputDoesNotExistError

    link_failures = []

    if to_remote or to_cache:
        # Already verified in the add()
        (stage,) = stages
        (target,) = sub_targets
        (out,) = stage.outs

        if to_remote:
            out.hash_info = repo.cloud.transfer(
                target,
                jobs=kwargs.get("jobs"),
                remote=kwargs.get("remote"),
                command="add",
            )
        else:
            from dvc.fs import get_cloud_fs
            from dvc.objects import save as osave
            from dvc.objects.stage import stage as ostage

            from_fs = get_cloud_fs(repo, url=target)
            jobs = kwargs.get("jobs") or min((from_fs.jobs, out.odb.fs.jobs))
            obj = ostage(
                out.odb,
                from_fs.path_info,
                from_fs,
                "md5",
                upload=True,
                jobs=jobs,
            )
            osave(out.odb, obj, jobs=jobs, move=False)
            out.hash_info = obj.hash_info
            out.checkout()

        Dvcfile(repo, stage.path).dump(stage)
        return link_failures

    with Tqdm(
        total=len(stages),
        desc="Processing",
        unit="file",
        disable=len(stages) == 1,
    ) as pbar_stages:
        for stage in stages:
            try:
                stage.save()
            except OutputDoesNotExistError:
                pbar.n -= 1
                raise

            try:
                if not no_commit:
                    stage.commit()
            except CacheLinkError:
                link_failures.append(stage)

            Dvcfile(repo, stage.path).dump(stage)
            pbar_stages.update()

    return link_failures