Esempio n. 1
0
    def _collect_dir(self, path_info):
        """Collect a checksum entry for every file found under ``path_info``.

        The tree is traversed with ``self.walk``. Each entry is a dict with
        the file's path relative to ``path_info.path`` (passed through
        ``self.to_posixpath``) under ``PARAM_RELPATH`` and the result of
        ``self.get_file_checksum`` under ``PARAM_CHECKSUM``.

        Returns the entries as a list sorted by relative path.
        """
        entries = []

        # A single copied path info is reused for every file; only its
        # ``path`` attribute is reassigned before each checksum call.
        file_info = copy(path_info)
        base_path = file_info.path

        for root, _dirs, files in self.walk(path_info):
            if len(files) > LARGE_DIR_SIZE:
                # Announce big directories and wrap the listing in progress().
                shown_root = self.ospath.relpath(root)
                notice = (
                    "Computing md5 for a large directory {}. "
                    "This is only done once."
                ).format(shown_root)
                logger.info(notice)
                files = progress(files, name=shown_root)

            for fname in files:
                full_path = self.ospath.join(root, fname)
                file_info.path = full_path
                posix_relpath = self.to_posixpath(
                    self.ospath.relpath(full_path, base_path)
                )
                file_checksum = self.get_file_checksum(file_info)

                entries.append({
                    self.PARAM_RELPATH: posix_relpath,
                    self.PARAM_CHECKSUM: file_checksum,
                })

        # NOTE: ordering by relative path keeps the result reproducible
        entries.sort(key=itemgetter(self.PARAM_RELPATH))
        return entries
Esempio n. 2
0
File: base.py Progetto: vasinkd/dvc
    def _collect_dir(self, path_info):
        """Collect a checksum entry for every file found under ``path_info``.

        The tree is traversed with ``self.walk``. Each entry is a dict with
        the file's POSIX-style path relative to ``path_info`` under
        ``PARAM_RELPATH`` and the result of ``self.get_file_checksum`` under
        ``PARAM_CHECKSUM``.

        Returns the entries as a list sorted by relative path.
        """
        entries = []

        for root, _dirs, files in self.walk(path_info):
            if len(files) > LARGE_DIR_SIZE:
                # Announce big directories and wrap the listing in progress().
                label = str(self.path_cls(root))
                notice = (
                    "Computing md5 for a large directory {}. "
                    "This is only done once."
                ).format(label)
                logger.info(notice)
                files = progress(files, name=label)

            for fname in files:
                file_info = self.path_cls(root) / fname

                # NOTE: as_posix() is a lossy transformation:
                #   "hey\there" -> "hey/there"
                #   "hey/there" -> "hey/there"
                # The latter is a fine filename on Windows, which will turn
                # into dir/file when transformed back.
                #
                # Yes, this is a BUG, as long as we permit "/" in filenames
                # on Windows and "\" on Unix.
                posix_relpath = file_info.relative_to(path_info).as_posix()
                file_checksum = self.get_file_checksum(file_info)

                entries.append({
                    self.PARAM_RELPATH: posix_relpath,
                    self.PARAM_CHECKSUM: file_checksum,
                })

        # NOTE: ordering by relative path keeps the result reproducible
        entries.sort(key=itemgetter(self.PARAM_RELPATH))
        return entries
Esempio n. 3
0
    def _create_unpacked_dir(self, checksum, dir_info, unpacked_dir_info):
        """Fill ``unpacked_dir_info`` with links to the cached directory files.

        Creates ``unpacked_dir_info`` via ``self.makedirs``, then for every
        entry of ``dir_info`` links the cache path of the entry's checksum to
        the entry's relative path inside the unpacked directory, using the
        "hardlink" link type. Finally saves ``checksum`` for the unpacked
        directory through ``self.state.save``.
        """
        self.makedirs(unpacked_dir_info)

        for item in progress(dir_info, name="Created unpacked dir"):
            source = self.checksum_to_path_info(item[self.PARAM_CHECKSUM])
            target = unpacked_dir_info / item[self.PARAM_RELPATH]
            self.link(source, target, "hardlink")

        self.state.save(unpacked_dir_info, checksum)
Esempio n. 4
0
File: base.py Progetto: kss682/dvc
    def _calculate_checksums(self, file_infos):
        """Compute checksums for ``file_infos`` using a thread pool.

        ``self.get_file_checksum`` is mapped over the file infos with
        ``self.checksum_jobs`` worker threads. When there are more than
        ``LARGE_DIR_SIZE`` files, an info message is logged and the result
        iterator is wrapped in ``progress``.

        Returns a dict mapping each file info to its checksum.
        """
        file_infos = list(file_infos)
        with ThreadPoolExecutor(max_workers=self.checksum_jobs) as executor:
            tasks = executor.map(self.get_file_checksum, file_infos)

            if len(file_infos) > LARGE_DIR_SIZE:
                msg = ("Computing md5 for a large number of files. "
                       "This is only done once.")
                logger.info(msg)
                tasks = progress(tasks, total=len(file_infos))

            # Consume the results while the pool is still running: leaving
            # the ``with`` block waits for every task to finish, so iterating
            # afterwards would only advance the progress wrapper once all
            # the work was already done.
            # ``executor.map`` yields results in input order, so zipping
            # pairs every file info with its own checksum.
            checksums = dict(zip(file_infos, tasks))

        return checksums
Esempio n. 5
0
    def _create_unpacked_dir(self, checksum, dir_info, unpacked_dir_info):
        """Fill ``unpacked_dir_info`` with links to the cached directory files.

        Creates ``unpacked_dir_info`` via ``self.makedirs``, then for every
        entry of ``dir_info`` calls ``self._link`` from the cache path of the
        entry's checksum to the entry's relative path inside the unpacked
        directory, passing "hardlink" and "symlink" as the link types.
        Finally saves ``checksum`` for the unpacked directory through
        ``self.state.save``.
        """
        self.makedirs(unpacked_dir_info)

        for item in progress(dir_info, name="Created unpacked dir"):
            source = self.checksum_to_path_info(item[self.PARAM_CHECKSUM])
            target = unpacked_dir_info / item[self.PARAM_RELPATH]
            # In shared cache mode some cache files might not be owned by the
            # current user. Unless /proc/sys/fs/protected_hardlinks is
            # disabled, users may not hardlink files they don't own, so
            # symlinks are offered as a second link type.
            self._link(source, target, ["hardlink", "symlink"])

        self.state.save(unpacked_dir_info, checksum)
Esempio n. 6
0
    def _collect_dir(self, path_info):
        """Collect a checksum entry for every file found under ``path_info``.

        The tree is traversed with ``self.walk`` and one
        ``self.get_file_checksum`` job per file is submitted to a thread pool
        of ``self.checksum_jobs`` workers. Each entry is a dict with the
        file's POSIX-style path relative to ``path_info`` under
        ``PARAM_RELPATH`` and its checksum under ``PARAM_CHECKSUM``.

        Returns the entries as a list sorted by relative path.

        Raises:
            DvcIgnoreInCollectedDirError: if a file named
                ``DvcIgnore.DVCIGNORE_FILE`` is found during the walk.
        """
        # Maps each submitted future to the entry it will complete.
        dir_info = {}

        with ThreadPoolExecutor(max_workers=self.checksum_jobs) as executor:
            for root, _dirs, files in self.walk(path_info):
                root_info = path_info / root

                for fname in files:
                    if fname == DvcIgnore.DVCIGNORE_FILE:
                        raise DvcIgnoreInCollectedDirError(root)

                    file_info = root_info / fname
                    relative_path = file_info.relative_to(path_info)
                    future = executor.submit(
                        self.get_file_checksum, file_info
                    )
                    dir_info[future] = {
                        # NOTE: this is lossy transformation:
                        #   "hey\there" -> "hey/there"
                        #   "hey/there" -> "hey/there"
                        # The latter is fine filename on Windows, which
                        # will transform to dir/file on back transform.
                        #
                        # Yes, this is a BUG, as long as we permit "/" in
                        # filenames on Windows and "\" on Unix
                        self.PARAM_RELPATH: relative_path.as_posix()
                    }

            # Resolve the futures while the pool is still running: leaving
            # the ``with`` block waits for every job, so doing this afterwards
            # would log the message and advance the progress wrapper only
            # once all checksums had already been computed.
            pending = as_completed(dir_info)
            if len(dir_info) > LARGE_DIR_SIZE:
                msg = (
                    "Computing md5 for a large number of files. "
                    "This is only done once."
                )
                logger.info(msg)
                pending = progress(pending, total=len(dir_info))

            # NOTE: resolving futures
            for future in pending:
                dir_info[future][self.PARAM_CHECKSUM] = future.result()

        # NOTE: sorting the list by path to ensure reproducibility
        return sorted(dir_info.values(), key=itemgetter(self.PARAM_RELPATH))
Esempio n. 7
0
    def _get_plans(self, download, remote, status_info, status):
        """Build the source/destination path lists for entries with ``status``.

        For every ``(checksum, info)`` item of ``status_info`` whose
        ``info["status"]`` equals ``status``, the path info from this object
        and the one from ``remote`` are computed with
        ``checksum_to_path_info``, and ``info["name"]`` is recorded.

        Returns a ``(from_infos, to_infos, names)`` tuple of lists. When
        ``download`` is truthy the ``remote`` paths are the sources and this
        object's paths the destinations; otherwise the roles are swapped.
        """
        local_infos, remote_infos, names = [], [], []

        items = progress(status_info.items(), name="Analysing status")
        for checksum, details in items:
            if details["status"] != status:
                continue
            local_infos.append(self.checksum_to_path_info(checksum))
            remote_infos.append(remote.checksum_to_path_info(checksum))
            names.append(details["name"])

        if download:
            return remote_infos, local_infos, names
        return local_infos, remote_infos, names
Esempio n. 8
0
 def cache_exists(self, md5s):
     """Return the items of ``md5s`` for which ``changed_cache_file`` is falsy.

     The input is iterated through ``progress``; the order of the input is
     preserved in the returned list.
     """
     unchanged = []
     for md5 in progress(md5s):
         if self.changed_cache_file(md5):
             continue
         unchanged.append(md5)
     return unchanged
Esempio n. 9
0
 def cache_exists(self, checksums, jobs=None):
     """Return the ``checksums`` for which ``changed_cache_file`` is falsy.

     The input is iterated through ``progress``; the order of the input is
     preserved in the returned list. ``jobs`` is accepted but not used by
     this implementation.
     """
     unchanged = []
     for checksum in progress(checksums):
         if self.changed_cache_file(checksum):
             continue
         unchanged.append(checksum)
     return unchanged