Example 1
    def _process(dest_tree, src_tree, collected_files, download=False):
        from dvc.cache.local import _log_exceptions

        from_infos = []
        to_infos = []
        names = []
        for from_info in collected_files:
            from_infos.append(from_info)
            fname = from_info.relative_to(src_tree.path_info)
            names.append(str(fname))
            to_infos.append(dest_tree.path_info / fname)
        total = len(from_infos)

        if download:
            func = partial(
                _log_exceptions(src_tree.download, "download"),
                dir_mode=dest_tree.dir_mode,
                file_mode=dest_tree.file_mode,
            )
            desc = "Downloading"
        else:
            func = partial(_log_exceptions(dest_tree.upload, "upload"))
            desc = "Uploading"

        with Tqdm(total=total, unit="file", desc=desc) as pbar:
            func = pbar.wrap_fn(func)
            # TODO: parallelize this, currently --jobs for repro applies to
            # number of repro executors not download threads
            with ThreadPoolExecutor(max_workers=1) as dl_executor:
                # each wrapped call returns 0 on success and 1 on
                # failure, so the sum is the total failure count
                fails = sum(dl_executor.map(func, from_infos, to_infos, names))

        if fails:
            if download:
                raise DownloadError(fails)
            raise UploadError(fails)
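
Examples 1 and 4 rely on a `_log_exceptions` wrapper that turns exceptions from a transfer call into a failure count, which is what lets `sum(dl_executor.map(...))` total the failures. A minimal, self-contained sketch of that pattern; the wrapper below is illustrative, not DVC's actual implementation:

    import logging
    from functools import wraps

    logger = logging.getLogger(__name__)

    def _log_exceptions(func, operation):
        # Return 0 on success and 1 on failure so the caller can
        # total failures with sum(executor.map(...)).
        @wraps(func)
        def wrapper(from_info, to_info, *args, **kwargs):
            try:
                func(from_info, to_info, *args, **kwargs)
                return 0
            except Exception:
                logger.exception(
                    "failed to %s '%s' to '%s'", operation, from_info, to_info
                )
                return 1

        return wrapper
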
Example 2
    def _upload_plans(
        self, dir_plans, file_plans, dir_contents, missing_files, processor
    ):
        total_fails = 0
        succeeded_dir_hashes = []
        # transpose the per-column plan lists into per-file tuples
        all_file_plans = list(zip(*file_plans))
        for dir_from_info, dir_to_info, dir_name, dir_hash in zip(*dir_plans):
            bound_file_plans = []
            directory_hashes = dir_contents[dir_hash]

            for file_plan in all_file_plans.copy():
                if file_plan[-1] in directory_hashes:
                    bound_file_plans.append(file_plan)
                    all_file_plans.remove(file_plan)

            dir_fails = processor(bound_file_plans)
            if dir_fails:
                logger.debug(
                    "failed to upload full contents of '{}', "
                    "aborting .dir file upload".format(dir_name)
                )
                logger.error(
                    f"failed to upload '{dir_from_info}'"
                    f" to '{dir_to_info}'"
                )
                total_fails += dir_fails + 1
            elif directory_hashes.intersection(missing_files):
                # if for some reason a file contained in this dir is
                # missing both locally and in the remote, we want to
                # push whatever file content we have, but should not
                # push .dir file
                logger.debug(
                    "directory '%s' contains missing files,"
                    "skipping .dir file upload",
                    dir_name,
                )
            else:
                is_dir_failed = processor.transfer_func(
                    dir_from_info, dir_to_info, dir_name
                )
                total_fails += is_dir_failed
                if not is_dir_failed:
                    succeeded_dir_hashes.append(dir_hash)

        # insert the rest
        total_fails += processor(all_file_plans)
        if total_fails:
            raise UploadError(total_fails)

        # index successfully pushed dirs
        for dir_hash in succeeded_dir_hashes:
            file_hashes = dir_contents[dir_hash]
            logger.debug(
                "Indexing pushed dir '{}' with "
                "'{}' nested files".format(dir_hash, len(file_hashes))
            )
            self.index.update([dir_hash], file_hashes)
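
The key move above is `list(zip(*file_plans))`, which transposes the per-column plan lists (from_infos, to_infos, names, hashes) into per-file tuples so the `file_plan[-1]` membership test can bind files to their directory. A toy illustration with made-up values:

    file_plans = (
        ["/cache/aa", "/cache/bb", "/cache/cc"],  # from_infos
        ["s3://r/aa", "s3://r/bb", "s3://r/cc"],  # to_infos
        ["aa", "bb", "cc"],                       # names
        ["h1", "h2", "h3"],                       # file hashes
    )
    all_file_plans = list(zip(*file_plans))
    # -> [("/cache/aa", "s3://r/aa", "aa", "h1"), ...]

    dir_contents = {"d1": {"h1", "h3"}}
    bound = [p for p in all_file_plans if p[-1] in dir_contents["d1"]]
    assert [p[-1] for p in bound] == ["h1", "h3"]
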
Example 3
    def _process(
        self,
        named_cache,
        remote,
        jobs=None,
        show_checksums=False,
        download=False,
    ):
        logger.debug(
            "Preparing to {} '{}'".format(
                "download data from" if download else "upload data to",
                remote.path_info,
            )
        )

        if download:
            func = partial(
                remote.download,
                dir_mode=self._dir_mode,
                file_mode=self._file_mode,
            )
            status = STATUS_DELETED
        else:
            func = remote.upload
            status = STATUS_NEW

        if jobs is None:
            jobs = remote.JOBS

        status_info = self.status(
            named_cache,
            remote,
            jobs=jobs,
            show_checksums=show_checksums,
            download=download,
        )

        plans = self._get_plans(download, remote, status_info, status)

        if len(plans[0]) == 0:
            return 0

        if jobs > 1:
            with ThreadPoolExecutor(max_workers=jobs) as executor:
                fails = sum(executor.map(func, *plans))
        else:
            fails = sum(map(func, *plans))

        if fails:
            if download:
                raise DownloadError(fails)
            raise UploadError(fails)

        return len(plans[0])
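
`executor.map(func, *plans)` and the serial `map` fallback behave identically: each calls `func` once per row, drawing one element from each plan column, and the sum of the 0/1 return values is the failure count. A self-contained demonstration with a stand-in transfer function:

    from concurrent.futures import ThreadPoolExecutor

    def transfer(src, dst):
        # stand-in for remote.upload/remote.download: 0 = ok, 1 = failed
        return 1 if src == "bad" else 0

    plans = (["a", "bad", "c"], ["ra", "rb", "rc"])

    with ThreadPoolExecutor(max_workers=2) as executor:
        fails = sum(executor.map(transfer, *plans))
    assert fails == sum(map(transfer, *plans)) == 1
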
Example 4
    def _process(dest_tree, src_tree, collected_files, download=False):
        from dvc.remote.base import _log_exceptions

        from_infos = []
        to_infos = []
        names = []
        for from_info in collected_files:
            from_infos.append(from_info)
            fname = from_info.relative_to(src_tree.path_info)
            names.append(str(fname))
            to_infos.append(dest_tree.path_info / fname)
        total = len(from_infos)

        if download:
            func = partial(
                _log_exceptions(src_tree.download, "download"),
                dir_mode=dest_tree.dir_mode,
            )
            desc = "Downloading"
        else:
            func = partial(_log_exceptions(dest_tree.upload, "upload"))
            desc = "Uploading"

        with Tqdm(total=total, unit="file", desc=desc) as pbar:
            func = pbar.wrap_fn(func)
            # TODO: parallelize this, currently --jobs for repro applies to
            # number of repro executors not download threads
            with ThreadPoolExecutor(max_workers=1) as dl_executor:
                mode = None
                stat_func = getattr(src_tree, "stat", None)
                futures = []
                for from_info, to_info, name in zip(
                    from_infos, to_infos, names
                ):
                    if stat_func:
                        # preserve the source file's permission bits
                        mode = stat.S_IMODE(stat_func(from_info).st_mode)
                    futures.append(
                        dl_executor.submit(
                            func, from_info, to_info, name, file_mode=mode
                        )
                    )

                fails = sum(
                    future.result() for future in as_completed(futures)
                )

        if fails:
            if download:
                raise DownloadError(fails)
            raise UploadError(fails)
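
This variant stats each source file so the transfer can reproduce its permission bits; `stat.S_IMODE` masks `st_mode` down to just the permission portion. A small standalone example (the file name is made up; the printed mode assumes a POSIX system):

    import os
    import stat

    path = "example.txt"
    with open(path, "w") as fobj:
        fobj.write("data")
    os.chmod(path, 0o640)

    # S_IMODE masks st_mode down to its permission bits
    mode = stat.S_IMODE(os.stat(path).st_mode)
    print(oct(mode))  # 0o640 on POSIX systems
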
Example 5
def push(
    self,
    targets=None,
    jobs=None,
    remote=None,
    all_branches=False,
    with_deps=False,
    all_tags=False,
    recursive=False,
    all_commits=False,
    run_cache=False,
    revs=None,
    glob=False,
):
    used_run_cache = self.stage_cache.push(remote) if run_cache else []

    if isinstance(targets, str):
        targets = [targets]

    expanded_targets = glob_targets(targets, glob=glob)

    used = self.used_objs(
        expanded_targets,
        all_branches=all_branches,
        all_tags=all_tags,
        all_commits=all_commits,
        with_deps=with_deps,
        force=True,
        remote=remote,
        jobs=jobs,
        recursive=recursive,
        used_run_cache=used_run_cache,
        revs=revs,
    )

    pushed = len(used_run_cache)
    for odb, objs in used.items():
        if odb and odb.read_only:
            continue
        try:
            pushed += self.cloud.push(objs, jobs, remote=remote, odb=odb)
        except FileTransferError as exc:
            raise UploadError(exc.amount)
    return pushed
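
The `except FileTransferError` clause translates a low-level transfer failure into an `UploadError` that carries the failure count. The classes and helper below are minimal stand-ins written for illustration, not DVC's real definitions:

    class FileTransferError(Exception):
        # minimal stand-in for DVC's exception; carries a failure count
        def __init__(self, amount):
            self.amount = amount
            super().__init__(f"{amount} file(s) failed to transfer")

    class UploadError(FileTransferError):
        pass

    def do_push(batch):
        # hypothetical transfer helper: items named "bad" fail
        failed = sum(1 for item in batch if item == "bad")
        if failed:
            raise FileTransferError(failed)
        return len(batch)

    try:
        do_push(["a", "bad", "c"])
    except FileTransferError as exc:
        # the snippet re-raises this as UploadError(exc.amount)
        print(f"UploadError would carry amount={exc.amount}")
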
Example 6
    def _process(
        self,
        named_cache,
        remote,
        jobs=None,
        show_checksums=False,
        download=False,
    ):
        logger.debug("Preparing to {} '{}'".format(
            "download data from" if download else "upload data to",
            remote.tree.path_info,
        ))

        if download:
            func = partial(
                _log_exceptions(remote.tree.download, "download"),
                dir_mode=self.tree.dir_mode,
                file_mode=self.tree.file_mode,
            )
            status = STATUS_DELETED
            desc = "Downloading"
        else:
            func = _log_exceptions(remote.tree.upload, "upload")
            status = STATUS_NEW
            desc = "Uploading"

        if jobs is None:
            jobs = remote.tree.JOBS

        dir_status, file_status, dir_contents = self._status(
            named_cache,
            remote,
            jobs=jobs,
            show_checksums=show_checksums,
            download=download,
        )

        dir_plans, _ = self._get_plans(download, remote, dir_status, status)
        file_plans, missing_files = self._get_plans(download, remote,
                                                    file_status, status)

        total = len(dir_plans[0]) + len(file_plans[0])
        if total == 0:
            return 0

        with Tqdm(total=total, unit="file", desc=desc) as pbar:
            func = pbar.wrap_fn(func)
            with ThreadPoolExecutor(max_workers=jobs) as executor:
                if download:
                    # merge dir and file plan columns; the trailing
                    # hash column is unused for downloads
                    from_infos, to_infos, names, _ = (
                        d + f for d, f in zip(dir_plans, file_plans))
                    fails = sum(executor.map(func, from_infos, to_infos,
                                             names))
                else:
                    # for uploads, push files first, and any .dir files last

                    file_futures = {}
                    for from_info, to_info, name, hash_ in zip(*file_plans):
                        file_futures[hash_] = executor.submit(
                            func, from_info, to_info, name)
                    dir_futures = {}
                    for from_info, to_info, name, dir_hash in zip(*dir_plans):
                        # if for some reason a file contained in this dir is
                        # missing both locally and in the remote, we want to
                        # push whatever file content we have, but should not
                        # push .dir file
                        for file_hash in missing_files:
                            if file_hash in dir_contents[dir_hash]:
                                logger.debug(
                                    "directory '%s' contains missing files,"
                                    "skipping .dir file upload",
                                    name,
                                )
                                break
                        else:
                            wait_futures = {
                                future
                                for file_hash, future in file_futures.items()
                                if file_hash in dir_contents[dir_hash]
                            }
                            dir_futures[dir_hash] = executor.submit(
                                self._dir_upload,
                                func,
                                wait_futures,
                                from_info,
                                to_info,
                                name,
                            )
                    fails = sum(future.result() for future in concat(
                        file_futures.values(), dir_futures.values()))

        if fails:
            if download:
                remote.index.clear()
                raise DownloadError(fails)
            raise UploadError(fails)

        if not download:
            # index successfully pushed dirs
            for dir_hash, future in dir_futures.items():
                if future.result() == 0:
                    file_hashes = dir_contents[dir_hash]
                    logger.debug("Indexing pushed dir '{}' with "
                                 "'{}' nested files".format(
                                     dir_hash, len(file_hashes)))
                    remote.index.update([dir_hash], file_hashes)

        return len(dir_plans[0]) + len(file_plans[0])
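
Both this example and Example 9 submit `self._dir_upload` together with the set of file futures the `.dir` upload must wait on, but that helper is not shown in these excerpts. A plausible sketch, assuming a single failed file upload aborts the `.dir` upload and counts as one extra failure:

    from concurrent.futures import as_completed

    def _dir_upload(func, futures, from_info, to_info, name):
        # Wait for every file upload belonging to this directory and
        # only upload the .dir file once all of them succeeded.
        for future in as_completed(futures):
            if future.result():
                # a file failed: skip the .dir upload and count it
                # as one more failure
                return 1
        return func(from_info, to_info, name)
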
Example 7
    def _upload_plans(
        self,
        dir_plans,
        file_plans,
        dir_contents,
        missing_files,
        executor,
        jobs,
        func,
    ):
        total_fails = 0

        def insert_batched(file_plans):
            fails = 0
            file_plan_iterator = iter(file_plans)

            def create_taskset(amount):
                return {
                    executor.submit(func, from_info, to_info, name)
                    for from_info, to_info, name, _ in itertools.islice(
                        file_plan_iterator, amount)
                }

            tasks = create_taskset(jobs * 5)
            while tasks:
                done, tasks = futures.wait(tasks,
                                           return_when=futures.FIRST_COMPLETED)
                fails += sum(task.result() for task in done)
                tasks.update(create_taskset(len(done)))
            return fails

        succeeded_dir_hashes = []
        all_file_plans = list(zip(*file_plans))
        for dir_from_info, dir_to_info, dir_name, dir_hash in zip(*dir_plans):
            bound_file_plans = []
            directory_hashes = dir_contents[dir_hash]

            for file_plan in all_file_plans.copy():
                if file_plan[-1] in directory_hashes:
                    bound_file_plans.append(file_plan)
                    all_file_plans.remove(file_plan)

            dir_fails = insert_batched(bound_file_plans)
            if dir_fails:
                logger.debug("failed to upload full contents of '{}', "
                             "aborting .dir file upload".format(dir_name))
                logger.error(f"failed to upload '{dir_from_info}'"
                             f" to '{dir_to_info}'")
                total_fails += dir_fails + 1
            elif directory_hashes.intersection(missing_files):
                # if for some reason a file contained in this dir is
                # missing both locally and in the remote, we want to
                # push whatever file content we have, but should not
                # push .dir file
                logger.debug(
                    "directory '%s' contains missing files,"
                    "skipping .dir file upload",
                    dir_name,
                )
            else:
                is_dir_failed = func(dir_from_info, dir_to_info, dir_name)
                total_fails += is_dir_failed
                if not is_dir_failed:
                    succeeded_dir_hashes.append(dir_hash)

        # insert the rest
        total_fails += insert_batched(all_file_plans)
        if total_fails:
            raise UploadError(total_fails)

        # index successfully pushed dirs
        for dir_hash in succeeded_dir_hashes:
            file_hashes = dir_contents[dir_hash]
            logger.debug("Indexing pushed dir '{}' with "
                         "'{}' nested files".format(dir_hash,
                                                    len(file_hashes)))
            self.index.update([dir_hash], file_hashes)
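
`insert_batched` implements a bounded-window submission pattern: keep at most `jobs * 5` tasks in flight and top the window up as tasks complete, so a huge plan list is never queued all at once. The same pattern extracted into a standalone, runnable form:

    import itertools
    from concurrent import futures
    from concurrent.futures import ThreadPoolExecutor

    def run_windowed(func, items, jobs):
        it = iter(items)
        with ThreadPoolExecutor(max_workers=jobs) as executor:
            def submit(amount):
                # pull at most `amount` more items off the iterator
                return {
                    executor.submit(func, item)
                    for item in itertools.islice(it, amount)
                }

            fails = 0
            tasks = submit(jobs * 5)  # prime the window
            while tasks:
                done, tasks = futures.wait(
                    tasks, return_when=futures.FIRST_COMPLETED
                )
                fails += sum(task.result() for task in done)
                tasks |= submit(len(done))  # top the window back up
        return fails

    print(run_windowed(lambda x: x % 7 == 0, range(100), jobs=4))  # 15
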
Example 8
def push(
    self,
    targets=None,
    jobs=None,
    remote=None,
    all_branches=False,
    with_deps=False,
    all_tags=False,
    recursive=False,
    all_commits=False,
    run_cache=False,
    revs=None,
    glob=False,
    odb: Optional["ObjectDB"] = None,
    include_imports=False,
):
    used_run_cache = (self.stage_cache.push(remote, odb=odb)
                      if run_cache else [])

    if isinstance(targets, str):
        targets = [targets]

    expanded_targets = glob_targets(targets, glob=glob)

    used = self.used_objs(
        expanded_targets,
        all_branches=all_branches,
        all_tags=all_tags,
        all_commits=all_commits,
        with_deps=with_deps,
        force=True,
        remote=remote,
        jobs=jobs,
        recursive=recursive,
        used_run_cache=used_run_cache,
        revs=revs,
    )

    pushed = len(used_run_cache)
    if odb:
        all_ids = set()
        for dest_odb, obj_ids in used.items():
            if not include_imports and dest_odb and dest_odb.read_only:
                continue
            all_ids.update(obj_ids)
        try:
            pushed += self.cloud.push(all_ids, jobs, remote=remote, odb=odb)
        except FileTransferError as exc:
            raise UploadError(exc.amount)
    else:
        for dest_odb, obj_ids in used.items():
            if dest_odb and dest_odb.read_only:
                continue
            try:
                pushed += self.cloud.push(obj_ids,
                                          jobs,
                                          remote=remote,
                                          odb=odb or dest_odb)
            except FileTransferError as exc:
                raise UploadError(exc.amount)
    return pushed
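
The `odb: Optional["ObjectDB"]` annotation is a string forward reference, normally paired with an import guarded by `typing.TYPE_CHECKING` so the class is only imported for static analysis. A sketch of that convention; the import path below is an assumption, not verified against this DVC revision:

    from typing import TYPE_CHECKING, Optional

    if TYPE_CHECKING:
        # imported only for type checkers; this path is hypothetical
        from dvc.objects.db.base import ObjectDB

    def push(odb: Optional["ObjectDB"] = None) -> int:
        # the string annotation means the module imports cleanly even
        # though ObjectDB is absent at runtime
        return 0
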
Example 9
    def _process(
        self,
        named_cache,
        remote,
        jobs=None,
        show_checksums=False,
        download=False,
    ):
        logger.debug(
            "Preparing to {} '{}'".format(
                "download data from" if download else "upload data to",
                remote.path_info,
            )
        )

        if download:
            func = partial(
                remote.download,
                dir_mode=self._dir_mode,
                file_mode=self._file_mode,
            )
            status = STATUS_DELETED
            desc = "Downloading"
        else:
            func = remote.upload
            status = STATUS_NEW
            desc = "Uploading"

        if jobs is None:
            jobs = remote.JOBS

        dir_status, file_status, dir_contents = self._status(
            named_cache,
            remote,
            jobs=jobs,
            show_checksums=show_checksums,
            download=download,
        )

        dir_plans = self._get_plans(download, remote, dir_status, status)
        file_plans = self._get_plans(download, remote, file_status, status)

        total = len(dir_plans[0]) + len(file_plans[0])
        if total == 0:
            return 0

        with Tqdm(total=total, unit="file", desc=desc) as pbar:
            func = pbar.wrap_fn(func)
            with ThreadPoolExecutor(max_workers=jobs) as executor:
                if download:
                    fails = sum(executor.map(func, *dir_plans))
                    fails += sum(executor.map(func, *file_plans))
                else:
                    # for uploads, push files first, and any .dir files last

                    file_futures = {}
                    for from_info, to_info, name, checksum in zip(*file_plans):
                        file_futures[checksum] = executor.submit(
                            func, from_info, to_info, name
                        )
                    dir_futures = {}
                    for from_info, to_info, name, dir_checksum in zip(
                        *dir_plans
                    ):
                        wait_futures = {
                            future
                            for file_checksum, future in file_futures.items()
                            if file_checksum in dir_contents[dir_checksum]
                        }
                        dir_futures[dir_checksum] = executor.submit(
                            self._dir_upload,
                            func,
                            wait_futures,
                            from_info,
                            to_info,
                            name,
                        )
                    fails = sum(
                        future.result()
                        for future in concat(
                            file_futures.values(), dir_futures.values()
                        )
                    )

        if fails:
            if download:
                remote.index.clear()
                raise DownloadError(fails)
            raise UploadError(fails)

        if not download:
            # index successfully pushed dirs
            for dir_checksum, future in dir_futures.items():
                if future.result() == 0:
                    file_checksums = dir_contents[dir_checksum]
                    logger.debug(
                        "Indexing pushed dir '{}' with "
                        "'{}' nested files".format(
                            dir_checksum, len(file_checksums)
                        )
                    )
                    remote.index.update([dir_checksum], file_checksums)

        return len(dir_plans[0]) + len(file_plans[0])
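
`pbar.wrap_fn(func)` (used in Examples 1, 4, 6, and 9) returns a function that advances the progress bar once per call. A rough equivalent built on plain `tqdm`, sketched here rather than taken from DVC's `Tqdm` class:

    from functools import wraps
    from tqdm import tqdm

    def wrap_fn(pbar, func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            result = func(*args, **kwargs)
            pbar.update(1)  # one tick per completed transfer
            return result
        return wrapped

    with tqdm(total=3, unit="file", desc="Uploading") as pbar:
        upload = wrap_fn(pbar, lambda src, dst: 0)
        fails = sum(
            upload(s, d) for s, d in [("a", "ra"), ("b", "rb"), ("c", "rc")]
        )
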
Example 10
    def _process(
        self,
        named_cache,
        remote,
        jobs=None,
        show_checksums=False,
        download=False,
    ):
        logger.debug("Preparing to {} '{}'".format(
            "download data from" if download else "upload data to",
            remote.path_info,
        ))

        if download:
            func = partial(
                remote.download,
                dir_mode=self._dir_mode,
                file_mode=self._file_mode,
            )
            status = STATUS_DELETED
        else:
            func = remote.upload
            status = STATUS_NEW

        if jobs is None:
            jobs = remote.JOBS

        dir_status, file_status, dir_paths = self._status(
            named_cache,
            remote,
            jobs=jobs,
            show_checksums=show_checksums,
            download=download,
        )

        dir_plans = self._get_plans(download, remote, dir_status, status)
        file_plans = self._get_plans(download, remote, file_status, status)

        if len(dir_plans[0]) + len(file_plans[0]) == 0:
            return 0

        with ThreadPoolExecutor(max_workers=jobs) as executor:
            if download:
                fails = sum(executor.map(func, *dir_plans))
                fails += sum(executor.map(func, *file_plans))
            else:
                # for uploads, push files first, and any .dir files last

                file_futures = {}
                for from_info, to_info, name in zip(*file_plans):
                    file_futures[to_info] = executor.submit(
                        func, from_info, to_info, name)
                dir_futures = {}
                for from_info, to_info, name in zip(*dir_plans):
                    wait_futures = {
                        future
                        for file_path, future in file_futures.items()
                        if file_path in dir_paths[to_info]
                    }
                    dir_futures[to_info] = executor.submit(
                        self._dir_upload,
                        func,
                        wait_futures,
                        from_info,
                        to_info,
                        name,
                    )
                fails = sum(future.result() for future in concat(
                    file_futures.values(), dir_futures.values()))

        if fails:
            if download:
                raise DownloadError(fails)
            raise UploadError(fails)

        return len(dir_plans[0]) + len(file_plans[0])
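
`concat` in Examples 6, 9, and 10 presumably comes from funcy; the standard-library equivalent is `itertools.chain`, which lazily joins the file and directory future collections before their results are summed:

    from concurrent.futures import ThreadPoolExecutor
    from itertools import chain

    with ThreadPoolExecutor(max_workers=2) as executor:
        file_futures = {i: executor.submit(lambda: 0) for i in range(3)}
        dir_futures = {i: executor.submit(lambda: 1) for i in range(2)}
        # stdlib equivalent of concat(file_futures.values(), dir_futures.values())
        fails = sum(
            future.result()
            for future in chain(file_futures.values(), dir_futures.values())
        )
    assert fails == 2
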