Example #1
def file_md5(fname):
    """ get the (md5 hexdigest, md5 digest) of a file """
    from dvc.progress import Tqdm
    from dvc.istextfile import istextfile

    if os.path.exists(fname):
        hash_md5 = hashlib.md5()
        binary = not istextfile(fname)
        size = os.path.getsize(fname)
        bar = False
        if size >= LARGE_FILE_SIZE:
            bar = True
            msg = "Computing md5 for a large file {}. This is only done once."
            logger.info(msg.format(relpath(fname)))
        name = relpath(fname)

        with Tqdm(
            desc_truncate=name,
            disable=not bar,
            total=size,
            bytes=True,
            leave=False,
        ) as pbar:
            with open(fname, "rb") as fobj:
                while True:
                    data = fobj.read(LOCAL_CHUNK_SIZE)
                    if not data:
                        break

                    if binary:
                        chunk = data
                    else:
                        chunk = dos2unix(data)

                    hash_md5.update(chunk)
                    pbar.update(len(data))

        return (hash_md5.hexdigest(), hash_md5.digest())

    return (None, None)
Example #2
    def _download_dir(self, from_info, to_info, name, no_progress_bar,
                      file_mode, dir_mode):
        from_infos = list(self.walk_files(from_info))
        to_infos = (to_info / info.relative_to(from_info)
                    for info in from_infos)

        with Tqdm(
                total=len(from_infos),
                desc="Downloading directory",
                unit="Files",
                disable=no_progress_bar,
        ) as pbar:
            download_files = pbar.wrap_fn(
                partial(
                    self._download_file,
                    name=name,
                    no_progress_bar=True,
                    file_mode=file_mode,
                    dir_mode=dir_mode,
                ))
            with ThreadPoolExecutor(max_workers=self.JOBS) as executor:
                futures = [
                    executor.submit(download_files, from_info, to_info)
                    for from_info, to_info in zip(from_infos, to_infos)
                ]

                # NOTE: unlike pulling/fetching cache, where we need to
                # download everything we can, not raising an error here could
                # turn very ugly: the user might think they have downloaded a
                # complete directory while actually having a partial one,
                # which might cause unexpected results in their pipeline.
                for future in as_completed(futures):
                    # NOTE: executor won't let us raise until all futures that
                    # it has are finished, so we need to cancel them ourselves
                    # before re-raising.
                    exc = future.exception()
                    if exc:
                        for entry in futures:
                            entry.cancel()
                        raise exc
Example #3
def file_md5(fname, tree=None):
    """ get the (md5 hexdigest, md5 digest) of a file """
    from dvc.istextfile import istextfile
    from dvc.progress import Tqdm

    if tree:
        exists_func = tree.exists
        stat_func = tree.stat
        open_func = tree.open
    else:
        exists_func = os.path.exists
        stat_func = os.stat
        open_func = open

    if exists_func(fname):
        hash_md5 = hashlib.md5()
        binary = not istextfile(fname, tree=tree)
        size = stat_func(fname).st_size
        no_progress_bar = True
        if size >= LARGE_FILE_SIZE:
            no_progress_bar = False
            msg = (
                "Computing md5 for a large file '{}'. This is only done once."
            )
            logger.info(msg.format(relpath(fname)))
        name = relpath(fname)

        with Tqdm(
            desc=name,
            disable=no_progress_bar,
            total=size,
            bytes=True,
            leave=False,
        ) as pbar:
            with open_func(fname, "rb") as fobj:
                _fobj_md5(fobj, hash_md5, binary, pbar.update)

        return (hash_md5.hexdigest(), hash_md5.digest())

    return (None, None)
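
Example #3 calls a helper, _fobj_md5(), that is not shown in the snippet. The following is a minimal sketch of what it plausibly does, inferred from the chunked loop in Example #1; the chunk size and the dos2unix helper are assumptions, not the actual DVC definitions.

LOCAL_CHUNK_SIZE = 2 ** 20  # assumed chunk size (1 MiB)


def dos2unix(data):
    # assumed line-ending normalization, mirroring its use in Example #1
    return data.replace(b"\r\n", b"\n")


def _fobj_md5(fobj, hash_md5, binary, progress_func):
    # Hash a file object in chunks, reporting progress after each read
    # (sketch only; see Example #1 for the equivalent inline loop).
    while True:
        data = fobj.read(LOCAL_CHUNK_SIZE)
        if not data:
            break
        chunk = data if binary else dos2unix(data)
        hash_md5.update(chunk)
        progress_func(len(data))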
Example #4
    def _get_plans(self, download, remote, status_info, status):
        cache = []
        path_infos = []
        names = []
        checksums = []
        for md5, info in Tqdm(
            status_info.items(), desc="Analysing status", unit="file"
        ):
            if info["status"] == status:
                cache.append(self.checksum_to_path_info(md5))
                path_infos.append(remote.checksum_to_path_info(md5))
                names.append(info["name"])
                checksums.append(md5)

        if download:
            to_infos = cache
            from_infos = path_infos
        else:
            to_infos = path_infos
            from_infos = cache

        return from_infos, to_infos, names, checksums
Example #5
    def cache_exists(self, checksums, jobs=None):
        """This is older implementation used in remote/base.py
        We are reusing it in RemoteSSH, because SSH's batch_exists proved to be
        faster than current approach (relying on exists(path_info)) applied in
        remote/base.
        """
        if not self.no_traverse:
            return list(set(checksums) & set(self.all()))

        with Tqdm(total=len(checksums), unit="md5") as pbar:

            def exists_with_progress(chunks):
                return self.batch_exists(chunks, callback=pbar.update_desc)

            with ThreadPoolExecutor(max_workers=jobs or self.JOBS) as executor:
                path_infos = [self.checksum_to_path_info(x) for x in checksums]
                chunks = to_chunks(path_infos, num_chunks=self.JOBS)
                results = executor.map(exists_with_progress, chunks)
                in_remote = itertools.chain.from_iterable(results)
                ret = list(itertools.compress(checksums, in_remote))
                pbar.update_desc("", 0)  # clear path name description
                return ret
Example #6
    def _upload(self,
                from_file,
                to_info,
                name=None,
                no_progress_bar=False,
                **pbar_args):
        total = os.path.getsize(from_file)
        with Tqdm(
                disable=no_progress_bar,
                total=total,
                bytes=True,
                desc=name,
                **pbar_args,
        ) as pbar:
            obj = self._get_obj(to_info)
            obj.upload_file(
                from_file,
                Callback=pbar.update,
                ExtraArgs=self.fs_args.get("s3_additional_kwargs"),
                Config=self._transfer_config,
            )
        self.fs.invalidate_cache(self._with_bucket(to_info.parent))
Example #7
    def _save_dir(self, path_info, tree, hash_info, save_link=True, **kwargs):
        if not hash_info.dir_info:
            hash_info.dir_info = tree.cache.get_dir_cache(hash_info)
        hi = self.save_dir_info(hash_info.dir_info, hash_info)
        for entry_info, entry_hash in Tqdm(
                hi.dir_info.items(path_info),
                desc="Saving " + path_info.name,
                unit="file",
        ):
            self._save_file(entry_info,
                            tree,
                            entry_hash,
                            save_link=False,
                            **kwargs)

        if save_link:
            self.tree.state.save_link(path_info)
        if self.tree.exists(path_info):
            self.tree.state.save(path_info, hi)

        cache_info = self.tree.hash_to_path_info(hi.value)
        self.tree.state.save(cache_info, hi)
Example #8
    def _save_dir(self, path_info, tree, hash_info, save_link=True, **kwargs):
        if not hash_info.dir_info:
            hash_info.dir_info = tree.cache.get_dir_cache(hash_info)
        hi = self.save_dir_info(hash_info.dir_info, hash_info)
        for entry in Tqdm(
            hi.dir_info, desc="Saving " + path_info.name, unit="file"
        ):
            entry_info = path_info / entry[self.tree.PARAM_RELPATH]
            entry_hash = HashInfo(
                self.tree.PARAM_CHECKSUM, entry[self.tree.PARAM_CHECKSUM]
            )
            self._save_file(
                entry_info, tree, entry_hash, save_link=False, **kwargs
            )

        if save_link:
            self.tree.state.save_link(path_info)
        if self.tree.exists(path_info):
            self.tree.state.save(path_info, hi.value)

        cache_info = self.tree.hash_to_path_info(hi.value)
        self.tree.state.save(cache_info, hi.value)
Example #9
    def gdrive_upload_file(
        self, args, no_progress_bar=True, from_file="", progress_name=""
    ):
        item = self.drive.CreateFile(
            {"title": args["title"], "parents": [{"id": args["parent_id"]}]}
        )

        with open(from_file, "rb") as fobj:
            total = os.path.getsize(from_file)
            with Tqdm.wrapattr(
                fobj,
                "read",
                desc=progress_name,
                total=total,
                disable=no_progress_bar,
            ) as wrapped:
                # PyDrive doesn't like content property setting for empty files
                # https://github.com/gsuitedevs/PyDrive/issues/121
                if total:
                    item.content = wrapped
                item.Upload()
        return item
Example #10
def _checkout(
    self, targets=None, with_deps=False, force=False, recursive=False
):
    from dvc.stage import StageFileDoesNotExistError, StageFileBadNameError

    stages = set()
    targets = targets or [None]
    for target in targets:
        try:
            new = self.collect(
                target, with_deps=with_deps, recursive=recursive
            )
            stages.update(new)
        except (StageFileDoesNotExistError, StageFileBadNameError) as exc:
            if not target:
                raise
            raise CheckoutErrorSuggestGit(target, exc)

    _cleanup_unused_links(self, self.stages)
    total = get_all_files_numbers(stages)
    if total == 0:
        logger.info("Nothing to do")
    failed = []
    with Tqdm(
        total=total, unit="file", desc="Checkout", disable=total == 0
    ) as pbar:
        for stage in stages:
            if stage.locked:
                logger.warning(
                    "DVC-file '{path}' is locked. Its dependencies are"
                    " not going to be checked out.".format(path=stage.relpath)
                )

            failed.extend(
                stage.checkout(force=force, progress_callback=pbar.update_desc)
            )
    if failed:
        raise CheckoutError(failed)
Example #11
    def _cache_exists_traverse(
        self, checksums, remote_checksums, jobs=None, name=None
    ):
        logger.debug(
            "Querying {} checksums via threaded traverse".format(
                len(checksums)
            )
        )

        traverse_prefixes = ["{:02x}".format(i) for i in range(1, 256)]
        if self.TRAVERSE_PREFIX_LEN > 2:
            traverse_prefixes += [
                "{0:0{1}x}".format(i, self.TRAVERSE_PREFIX_LEN)
                for i in range(1, pow(16, self.TRAVERSE_PREFIX_LEN - 2))
            ]
        with Tqdm(
            desc="Querying "
            + ("cache in " + name if name else "remote cache"),
            total=len(traverse_prefixes),
            unit="dir",
        ) as pbar:

            def list_with_update(prefix):
                ret = map(
                    self.path_to_checksum,
                    list(self.list_cache_paths(prefix=prefix)),
                )
                pbar.update_desc(
                    "Querying cache in '{}'".format(self.path_info / prefix)
                )
                return ret

            with ThreadPoolExecutor(max_workers=jobs or self.JOBS) as executor:
                in_remote = executor.map(list_with_update, traverse_prefixes)
                remote_checksums.update(
                    itertools.chain.from_iterable(in_remote)
                )
            return list(checksums & remote_checksums)
Example #12
    def _gdrive_download_file(self, item_id, to_file, progress_desc,
                              no_progress_bar):
        param = {"id": item_id}
        # it does not create a file on the remote
        gdrive_file = self._drive.CreateFile(param)

        gdrive_file.FetchMetadata(fields="fileSize")
        size = gdrive_file["fileSize"]

        with Tqdm(
                desc=progress_desc,
                disable=no_progress_bar,
                bytes=True,
                # explicit `bar_format` as `total` will be set by `update_to`
                bar_format=Tqdm.BAR_FMT_DEFAULT,
        ) as pbar:
            if size:
                gdrive_file.GetContentFile(to_file, callback=pbar.update_to)
            else:
                # PyDrive doesn't like downloading empty files
                # https://github.com/iterative/dvc/issues/4286
                with open(to_file, "w"):
                    pass
Example #13
    def list_hashes_exists(self, hashes, jobs=None, name=None):
        """Return list of the specified hashes which exist in this tree.
        Hashes will be queried individually.
        """
        logger.debug("Querying {} hashes via object_exists".format(
            len(hashes)))
        with Tqdm(
                desc="Querying "
                + ("cache in " + name if name else "remote cache"),
                total=len(hashes),
                unit="file",
        ) as pbar:

            def exists_with_progress(path_info):
                ret = self.exists(path_info)
                pbar.update_msg(str(path_info))
                return ret

            with ThreadPoolExecutor(max_workers=jobs or self.JOBS) as executor:
                path_infos = map(self.hash_to_path_info, hashes)
                in_remote = executor.map(exists_with_progress, path_infos)
                ret = list(itertools.compress(hashes, in_remote))
                return ret
Example #14
    def _upload(self,
                from_file,
                to_info,
                name=None,
                no_progress_bar=False,
                **_kwargs):
        # First try to create parent directories
        self.makedirs(to_info.parent)

        file_size = os.path.getsize(from_file)
        with open(from_file, "rb") as fd:
            progress_context = (
                nullcontext(fd) if file_size == 0 else Tqdm.wrapattr(
                    fd,
                    "read",
                    total=None if no_progress_bar else file_size,
                    leave=False,
                    desc=to_info.url if name is None else name,
                    disable=no_progress_bar,
                ))
            with progress_context as fd_wrapped:
                self._client.upload_to(buff=fd_wrapped,
                                       remote_path=to_info.path)
Example #15
    def _download_dir(self, from_info, to_info, name, no_progress_bar,
                      file_mode, dir_mode):
        from_infos = list(self.walk_files(from_info))
        to_infos = (to_info / info.relative_to(from_info)
                    for info in from_infos)

        with ThreadPoolExecutor(max_workers=self.JOBS) as executor:
            download_files = partial(
                self._download_file,
                name=name,
                no_progress_bar=True,
                file_mode=file_mode,
                dir_mode=dir_mode,
            )
            futures = executor.map(download_files, from_infos, to_infos)
            with Tqdm(
                    futures,
                    total=len(from_infos),
                    desc="Downloading directory",
                    unit="Files",
                    disable=no_progress_bar,
            ) as futures:
                return sum(futures)
Example #16
def copyfile(src, dest, no_progress_bar=False, name=None):
    """Copy file with progress bar"""
    from dvc.exceptions import DvcException
    from dvc.progress import Tqdm
    from dvc.system import System

    name = name if name else os.path.basename(dest)
    total = os.stat(src).st_size

    if os.path.isdir(dest):
        dest = os.path.join(dest, os.path.basename(src))

    try:
        System.reflink(src, dest)
    except DvcException:
        with Tqdm(desc=name, disable=no_progress_bar, total=total,
                  bytes=True) as pbar:
            with open(src, "rb") as fsrc, open(dest, "wb+") as fdest:
                while True:
                    buf = fsrc.read(LOCAL_CHUNK_SIZE)
                    if not buf:
                        break
                    fdest.write(buf)
                    pbar.update(len(buf))
Example #17
    def _save_dir(
        self,
        path_info,
        tree,
        hash_info,
        filter_info=None,
        **kwargs,
    ):
        if not hash_info.dir_info:
            hash_info.dir_info = tree.cache.get_dir_cache(hash_info)
        hi = self.save_dir_info(hash_info.dir_info, hash_info)
        for entry_info, entry_hash in Tqdm(
                hi.dir_info.items(path_info),
                desc="Saving " + path_info.name,
                unit="file",
        ):
            if filter_info and not entry_info.isin_or_eq(filter_info):
                continue

            self._save_file(entry_info, tree, entry_hash, **kwargs)

        cache_info = self.tree.hash_to_path_info(hi.value)
        self.tree.state.save(cache_info, hi)
        tree.state.save(path_info, hi)
Example #18
def _checkout(self,
              targets=None,
              with_deps=False,
              force=False,
              recursive=False):
    from dvc.stage import StageFileDoesNotExistError, StageFileBadNameError

    stages = set()

    if not targets:
        targets = [None]
        _cleanup_unused_links(self)

    for target in targets:
        try:
            new = self.collect(target,
                               with_deps=with_deps,
                               recursive=recursive)
            stages.update(new)
        except (StageFileDoesNotExistError, StageFileBadNameError) as exc:
            if not target:
                raise
            raise CheckoutErrorSuggestGit(target, exc)

    total = get_all_files_numbers(stages)
    if total == 0:
        logger.info("Nothing to do")
    failed = []
    with Tqdm(total=total, unit="file", desc="Checkout",
              disable=total == 0) as pbar:
        for stage in stages:
            failed.extend(
                stage.checkout(force=force,
                               progress_callback=pbar.update_desc))
    if failed:
        raise CheckoutError(failed)
Example #19
def save(
    odb: "ObjectDB",
    obj: "HashFile",
    jobs: Optional[int] = None,
    **kwargs,
):
    if isinstance(obj, Tree):
        with ThreadPoolExecutor(max_workers=jobs) as executor:
            for future in Tqdm(
                    as_completed(
                        executor.submit(
                            odb.add,
                            entry.path_info,
                            entry.fs,
                            entry.hash_info,
                            **kwargs,
                        ) for _, entry in obj),
                    desc="Saving files",
                    total=len(obj),
                    unit="file",
            ):
                future.result()

    odb.add(obj.path_info, obj.fs, obj.hash_info, **kwargs)
Example #20
    def _gdrive_upload_file(
        self,
        parent_id,
        title,
        no_progress_bar=False,
        from_file="",
        progress_name="",
    ):
        item = self._drive.CreateFile(
            {"title": title, "parents": [{"id": parent_id}]}
        )

        with open(from_file, "rb") as fobj:
            total = os.path.getsize(from_file)
            with Tqdm.wrapattr(
                fobj,
                "read",
                desc=progress_name,
                total=total,
                disable=no_progress_bar,
            ) as wrapped:
                item.content = wrapped
                item.Upload()
        return item
Example #21
    def progress(*args, **kwargs) -> "Tqdm":
        from dvc.progress import Tqdm

        return Tqdm(*args, **kwargs)
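
Several of the longer examples (#2, #24, #27) combine Tqdm with a thread pool by wrapping the worker function with pbar.wrap_fn(), so the bar advances once per completed call. Below is a minimal standalone sketch of that pattern, assuming wrap_fn behaves as those examples suggest; process_one is a hypothetical worker.

from concurrent.futures import ThreadPoolExecutor

from dvc.progress import Tqdm


def process_one(item):
    # placeholder for real per-item work (download, hash check, etc.);
    # returns 0 on success, mirroring the fail-count convention above
    return 0


def process_all(items, jobs=4, no_progress_bar=False):
    with Tqdm(total=len(items), unit="file", disable=no_progress_bar) as pbar:
        # wrap_fn() is assumed to return a callable that updates the bar
        # after each call, as in Examples #2 and #24
        wrapped = pbar.wrap_fn(process_one)
        with ThreadPoolExecutor(max_workers=jobs) as executor:
            return sum(executor.map(wrapped, items))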
Example #22
    def fetch_refspecs(
        self,
        url: str,
        refspecs: Iterable[str],
        force: Optional[bool] = False,
        on_diverged: Optional[Callable[[str, str], bool]] = None,
    ):
        from dulwich.client import get_transport_and_path
        from dulwich.objectspec import parse_reftuples
        from dulwich.porcelain import (
            DivergedBranches,
            check_diverged,
            get_remote_repo,
        )

        fetch_refs = []

        def determine_wants(remote_refs):
            fetch_refs.extend(
                parse_reftuples(
                    remote_refs,
                    self.repo.refs,
                    [os.fsencode(refspec) for refspec in refspecs],
                    force=force,
                ))
            return [
                remote_refs[lh] for (lh, _, _) in fetch_refs
                if remote_refs[lh] not in self.repo.object_store
            ]

        try:
            _remote, location = get_remote_repo(self.repo, url)
            client, path = get_transport_and_path(location)
        except Exception as exc:
            raise SCMError(
                f"'{url}' is not a valid Git remote or URL") from exc

        with Tqdm(desc="Fetching git refs",
                  bar_format=self.BAR_FMT_NOTOTAL) as pbar:

            def progress(msg_b):
                msg = msg_b.decode("ascii").strip()
                pbar.update_msg(msg)
                pbar.refresh()
                logger.trace(msg)

            fetch_result = client.fetch(
                path,
                self.repo,
                progress=progress,
                determine_wants=determine_wants,
            )
        for (lh, rh, _) in fetch_refs:
            try:
                if rh in self.repo.refs:
                    check_diverged(self.repo, self.repo.refs[rh],
                                   fetch_result.refs[lh])
            except DivergedBranches:
                if not force:
                    overwrite = False
                    if on_diverged:
                        overwrite = on_diverged(
                            os.fsdecode(rh),
                            os.fsdecode(fetch_result.refs[lh]))
                    if not overwrite:
                        continue
            self.repo.refs[rh] = fetch_result.refs[lh]
Example #23
    def push_refspec(
        self,
        url: str,
        src: Optional[str],
        dest: str,
        force: bool = False,
        on_diverged: Optional[Callable[[str, str], bool]] = None,
    ):
        from dulwich.client import get_transport_and_path
        from dulwich.errors import NotGitRepository, SendPackError
        from dulwich.porcelain import (
            DivergedBranches,
            check_diverged,
            get_remote_repo,
        )

        dest_refs, values = self._push_dest_refs(src, dest)

        try:
            _remote, location = get_remote_repo(self.repo, url)
            client, path = get_transport_and_path(location)
        except Exception as exc:
            raise SCMError(
                f"'{url}' is not a valid Git remote or URL") from exc

        def update_refs(refs):
            new_refs = {}
            for ref, value in zip(dest_refs, values):
                if ref in refs:
                    local_sha = self.repo.refs[ref]
                    remote_sha = refs[ref]
                    try:
                        check_diverged(self.repo, remote_sha, local_sha)
                    except DivergedBranches:
                        if not force:
                            overwrite = False
                            if on_diverged:
                                overwrite = on_diverged(
                                    os.fsdecode(ref),
                                    os.fsdecode(remote_sha),
                                )
                            if not overwrite:
                                continue
                new_refs[ref] = value
            return new_refs

        try:
            with Tqdm(desc="Pushing git refs",
                      bar_format=self.BAR_FMT_NOTOTAL) as pbar:

                def progress(msg_b):
                    msg = msg_b.decode("ascii").strip()
                    pbar.update_msg(msg)
                    pbar.refresh()
                    logger.trace(msg)

                client.send_pack(
                    path,
                    update_refs,
                    self.repo.object_store.generate_pack_data,
                    progress=progress,
                )
        except (NotGitRepository, SendPackError) as exc:
            raise SCMError("Git failed to push '{src}' to '{url}'") from exc
Example #24
    def _process(
        self,
        named_cache,
        remote,
        jobs=None,
        show_checksums=False,
        download=False,
    ):
        logger.debug("Preparing to {} '{}'".format(
            "download data from" if download else "upload data to",
            remote.tree.path_info,
        ))

        if download:
            func = partial(
                _log_exceptions(remote.tree.download, "download"),
                dir_mode=self.tree.dir_mode,
                file_mode=self.tree.file_mode,
            )
            status = STATUS_DELETED
            desc = "Downloading"
        else:
            func = _log_exceptions(remote.tree.upload, "upload")
            status = STATUS_NEW
            desc = "Uploading"

        if jobs is None:
            jobs = remote.tree.JOBS

        dir_status, file_status, dir_contents = self._status(
            named_cache,
            remote,
            jobs=jobs,
            show_checksums=show_checksums,
            download=download,
        )

        dir_plans, _ = self._get_plans(download, remote, dir_status, status)
        file_plans, missing_files = self._get_plans(download, remote,
                                                    file_status, status)

        total = len(dir_plans[0]) + len(file_plans[0])
        if total == 0:
            return 0

        with Tqdm(total=total, unit="file", desc=desc) as pbar:
            func = pbar.wrap_fn(func)
            with ThreadPoolExecutor(max_workers=jobs) as executor:
                if download:
                    from_infos, to_infos, names, _ = (
                        d + f for d, f in zip(dir_plans, file_plans))
                    fails = sum(executor.map(func, from_infos, to_infos,
                                             names))
                else:
                    # for uploads, push files first, and any .dir files last

                    file_futures = {}
                    for from_info, to_info, name, hash_ in zip(*file_plans):
                        file_futures[hash_] = executor.submit(
                            func, from_info, to_info, name)
                    dir_futures = {}
                    for from_info, to_info, name, dir_hash in zip(*dir_plans):
                        # if for some reason a file contained in this dir is
                        # missing both locally and in the remote, we want to
                        # push whatever file content we have, but should not
                        # push .dir file
                        for file_hash in missing_files:
                            if file_hash in dir_contents[dir_hash]:
                                logger.debug(
                                    "directory '%s' contains missing files,"
                                    "skipping .dir file upload",
                                    name,
                                )
                                break
                        else:
                            wait_futures = {
                                future
                                for file_hash, future in file_futures.items()
                                if file_hash in dir_contents[dir_hash]
                            }
                            dir_futures[dir_hash] = executor.submit(
                                self._dir_upload,
                                func,
                                wait_futures,
                                from_info,
                                to_info,
                                name,
                            )
                    fails = sum(future.result() for future in concat(
                        file_futures.values(), dir_futures.values()))

        if fails:
            if download:
                remote.index.clear()
                raise DownloadError(fails)
            raise UploadError(fails)

        if not download:
            # index successfully pushed dirs
            for dir_hash, future in dir_futures.items():
                if future.result() == 0:
                    file_hashes = dir_contents[dir_hash]
                    logger.debug("Indexing pushed dir '{}' with "
                                 "'{}' nested files".format(
                                     dir_hash, len(file_hashes)))
                    remote.index.update([dir_hash], file_hashes)

        return len(dir_plans[0]) + len(file_plans[0])
Example #25
    def progress(*args, **kwargs) -> Tqdm:
        return Tqdm(*args, **kwargs)
Example #26
    def progress(self, *args, **kwargs) -> Tqdm:
        kwargs.setdefault("file", self.error_output)
        return Tqdm(*args, **kwargs)
Example #27
    def _process(
        self,
        named_cache,
        remote,
        jobs=None,
        show_checksums=False,
        download=False,
    ):
        logger.debug(
            "Preparing to {} '{}'".format(
                "download data from" if download else "upload data to",
                remote.path_info,
            )
        )

        if download:
            func = partial(
                remote.download,
                dir_mode=self._dir_mode,
                file_mode=self._file_mode,
            )
            status = STATUS_DELETED
            desc = "Downloading"
        else:
            func = remote.upload
            status = STATUS_NEW
            desc = "Uploading"

        if jobs is None:
            jobs = remote.JOBS

        dir_status, file_status, dir_contents = self._status(
            named_cache,
            remote,
            jobs=jobs,
            show_checksums=show_checksums,
            download=download,
        )

        dir_plans = self._get_plans(download, remote, dir_status, status)
        file_plans = self._get_plans(download, remote, file_status, status)

        total = len(dir_plans[0]) + len(file_plans[0])
        if total == 0:
            return 0

        with Tqdm(total=total, unit="file", desc=desc) as pbar:
            func = pbar.wrap_fn(func)
            with ThreadPoolExecutor(max_workers=jobs) as executor:
                if download:
                    fails = sum(executor.map(func, *dir_plans))
                    fails += sum(executor.map(func, *file_plans))
                else:
                    # for uploads, push files first, and any .dir files last

                    file_futures = {}
                    for from_info, to_info, name, checksum in zip(*file_plans):
                        file_futures[checksum] = executor.submit(
                            func, from_info, to_info, name
                        )
                    dir_futures = {}
                    for from_info, to_info, name, dir_checksum in zip(
                        *dir_plans
                    ):
                        wait_futures = {
                            future
                            for file_checksum, future in file_futures.items()
                            if file_checksum in dir_contents[dir_checksum]
                        }
                        dir_futures[dir_checksum] = executor.submit(
                            self._dir_upload,
                            func,
                            wait_futures,
                            from_info,
                            to_info,
                            name,
                        )
                    fails = sum(
                        future.result()
                        for future in concat(
                            file_futures.values(), dir_futures.values()
                        )
                    )

        if fails:
            if download:
                remote.index.clear()
                raise DownloadError(fails)
            raise UploadError(fails)

        if not download:
            # index successfully pushed dirs
            for dir_checksum, future in dir_futures.items():
                if future.result() == 0:
                    file_checksums = dir_contents[dir_checksum]
                    logger.debug(
                        "Indexing pushed dir '{}' with "
                        "'{}' nested files".format(
                            dir_checksum, len(file_checksums)
                        )
                    )
                    remote.index.update([dir_checksum], file_checksums)

        return len(dir_plans[0]) + len(file_plans[0])
Example #28
def checkout(
    self,
    targets=None,
    with_deps=False,
    force=False,
    relink=False,
    recursive=False,
    allow_persist_missing=False,
):
    from dvc.stage.exceptions import (
        StageFileBadNameError,
        StageFileDoesNotExistError,
    )

    unused = []
    stats = {
        "added": [],
        "deleted": [],
        "modified": [],
        "failed": [],
    }
    if not targets:
        targets = [None]
        unused = _get_unused_links(self)

    stats["deleted"] = [_fspath_dir(u) for u in unused]
    self.state.remove_links(unused)

    if isinstance(targets, str):
        targets = [targets]

    pairs = set()
    for target in targets:
        try:
            pairs.update(
                self.collect_granular(
                    target, with_deps=with_deps, recursive=recursive
                )
            )
        except (
            StageFileDoesNotExistError,
            StageFileBadNameError,
            NoOutputOrStageError,
        ) as exc:
            if not target:
                raise
            raise CheckoutErrorSuggestGit(target) from exc

    total = get_all_files_numbers(pairs)
    with Tqdm(
        total=total, unit="file", desc="Checkout", disable=total == 0
    ) as pbar:
        for stage, filter_info in pairs:
            result = stage.checkout(
                force=force,
                progress_callback=pbar.update_msg,
                relink=relink,
                filter_info=filter_info,
                allow_persist_missing=allow_persist_missing,
            )
            for key, items in result.items():
                stats[key].extend(_fspath_dir(path) for path in items)

    if stats.get("failed"):
        raise CheckoutError(stats["failed"], stats)

    del stats["failed"]
    return stats
Example #29
    def cache_exists(self, checksums, jobs=None):
        return [
            checksum for checksum in Tqdm(checksums, unit="md5")
            if not self.changed_cache_file(checksum)
        ]
Example #30
    def upload_fobj(self, fobj, to_info, no_progress_bar=False, **pbar_args):
        from dvc.progress import Tqdm

        with Tqdm(bytes=True, disable=no_progress_bar, **pbar_args) as pbar:
            with pbar.wrapattr(fobj, "read") as fobj:
                self.copy_fobj(fobj, to_info, chunk_size=self.CHUNK_SIZE)
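
The upload examples (#9, #14, #20, #30) all rely on Tqdm.wrapattr() to wrap a file object's read method so that progress is reported as the consumer reads from it. The following is a self-contained sketch of that pattern; upload_stream is a hypothetical consumer that reads from the wrapped file object.

import os

from dvc.progress import Tqdm


def upload_with_progress(from_file, upload_stream, no_progress_bar=False):
    total = os.path.getsize(from_file)
    with open(from_file, "rb") as fobj:
        with Tqdm.wrapattr(
            fobj,
            "read",
            desc=os.path.basename(from_file),
            total=total,
            disable=no_progress_bar,
        ) as wrapped:
            # every wrapped.read() call advances the bar transparently
            upload_stream(wrapped)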