Example #1
    def collect_dir_cache(self, dname):
        dir_info = []

        for root, dirs, files in os.walk(dname):
            for fname in files:
                path = os.path.join(root, fname)
                relpath = self.unixpath(os.path.relpath(path, dname))

                # FIXME: we could've used md5 = state.update(path, dump=False)
                # here, but it is around twice as slow (on SSD; unknown for
                # HDD) for a directory with small files. What we could do here
                # is introduce a file-size limit above which we would actually
                # register the file in our state file.
                md5 = file_md5(path)[0]
                dir_info.append({self.PARAM_RELPATH: relpath,
                                 self.PARAM_MD5: md5})

        # NOTE: sorting the list by path to ensure reproducibility
        dir_info = sorted(dir_info, key=itemgetter(self.PARAM_RELPATH))

        md5 = dict_md5(dir_info) + self.MD5_DIR_SUFFIX
        if self.changed(md5):
            self.dump_dir_cache(md5, dir_info)

        return (md5, dir_info)
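
The NOTE about sorting above is load-bearing: dict_md5 over dir_info is order-sensitive (a JSON-style serialization preserves list order), and os.walk gives no ordering guarantee across runs or platforms. Below is a minimal illustration; list_md5 is a hypothetical stand-in for dict_md5, assuming canonical-JSON hashing, and is not DVC code:

    import hashlib
    import json
    from operator import itemgetter

    def list_md5(lst):
        # Hypothetical stand-in for dict_md5: canonical JSON -> md5 hexdigest.
        data = json.dumps(lst, sort_keys=True).encode("utf-8")
        return hashlib.md5(data).hexdigest()

    entries = [{"relpath": "a", "md5": "x"}, {"relpath": "b", "md5": "y"}]
    shuffled = list(reversed(entries))

    # The same contents in a different order hash differently...
    assert list_md5(entries) != list_md5(shuffled)

    # ...unless both lists are sorted by relpath first, as collect_dir_cache does.
    key = itemgetter("relpath")
    assert list_md5(sorted(entries, key=key)) == list_md5(sorted(shuffled, key=key))
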
Example #2
    def _compute_md5(self):
        from dvc.output.base import OutputBase

        d = self.dumpd()

        # NOTE: removing md5 manually in order to not affect md5s in deps/outs
        if self.PARAM_MD5 in d:
            del d[self.PARAM_MD5]

        # Ignore the wdir default value. This way a stage file w/o
        # wdir has the same md5 as a file with the default value specified.
        # It's important for backward compatibility with pipelines that
        # didn't have WDIR in their stage files.
        if d.get(self.PARAM_WDIR) == ".":
            del d[self.PARAM_WDIR]

        # NOTE: excluding parameters that don't affect the state of the
        # pipeline. Not excluding `OutputLOCAL.PARAM_CACHE`, because if
        # it has changed, we might not have that output in our cache.
        m = dict_md5(
            d,
            exclude=[
                self.PARAM_LOCKED,
                OutputBase.PARAM_METRIC,
                OutputBase.PARAM_TAGS,
                OutputBase.PARAM_PERSIST,
            ],
        )
        logger.debug("Computed stage '{}' md5: '{}'".format(self.relpath, m))
        return m
Example #3
    def test_dict_md5(self):
        d = {
            'cmd': 'python code.py foo file1',
            'locked': 'true',
            'outs': [{
                'path': 'file1',
                'metric': {'type': 'raw'},
                'cache': False,
                'md5': 'acbd18db4cc2f85cedef654fccc4a4d8'
            }],
            'deps': [{
                'path': 'foo',
                'md5': 'acbd18db4cc2f85cedef654fccc4a4d8'
            }, {
                'path': 'code.py',
                'md5': 'd05447644b89960913c7eee5fd776adb'
            }]
        }

        md5 = '8b263fa05ede6c3145c164829be694b4'

        self.assertEqual(md5, utils.dict_md5(d, exclude=['metric', 'locked']))
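
None of the examples show dict_md5 itself. A workable mental model: recursively drop the excluded keys, serialize what remains to canonical JSON (sorted keys), and md5 the bytes. The sketch below is an assumption about the utility's shape, not DVC's actual implementation, and is not guaranteed to reproduce the exact digest asserted above. Recursive filtering is what lets exclude=['metric', 'locked'] drop 'metric' even though it is nested inside an entry of 'outs':

    import hashlib
    import json

    def dict_md5_sketch(d, exclude=()):
        # Hypothetical: drop excluded keys at every nesting level, then hash
        # a canonical (sorted-keys) JSON serialization of what remains.
        def _filter(obj):
            if isinstance(obj, dict):
                return {k: _filter(v) for k, v in obj.items() if k not in exclude}
            if isinstance(obj, list):
                return [_filter(x) for x in obj]
            return obj

        data = json.dumps(_filter(d), sort_keys=True).encode("utf-8")
        return hashlib.md5(data).hexdigest()
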
Example #4
    def _compute_md5(self):
        from dvc.output.base import BaseOutput

        d = self.dumpd()

        # Remove md5 and meta, these should not affect stage md5
        d.pop(self.PARAM_MD5, None)
        d.pop(self.PARAM_META, None)

        # Ignore the wdir default value. This way a DVC-file w/o
        # wdir has the same md5 as a file with the default value specified.
        # It's important for backward compatibility with pipelines that
        # didn't have WDIR in their DVC-files.
        if d.get(self.PARAM_WDIR) == ".":
            del d[self.PARAM_WDIR]

        # NOTE: excluding parameters that don't affect the state of the
        # pipeline. Not excluding `LocalOutput.PARAM_CACHE`, because if
        # it has changed, we might not have that output in our cache.
        m = dict_md5(
            d,
            exclude=[
                self.PARAM_LOCKED,
                BaseOutput.PARAM_METRIC,
                BaseOutput.PARAM_PERSIST,
            ],
        )
        logger.debug("Computed {} md5: '{}'".format(self, m))
        return m
Example #5
def get_mtime_and_size(path, dvcignore):
    if os.path.isdir(fspath_py35(path)):
        size = 0
        files_mtimes = {}
        for file_path in walk_files(path, dvcignore):
            try:
                stat = os.stat(file_path)
            except OSError as exc:
                # NOTE: broken symlink case.
                if exc.errno != errno.ENOENT:
                    raise
                continue
            size += stat.st_size
            files_mtimes[file_path] = stat.st_mtime

        # We track file changes and moves, which cannot be detected by simply
        # taking max(mtime(f) for f in non_ignored_files).
        mtime = dict_md5(files_mtimes)
    else:
        base_stat = os.stat(fspath_py35(path))
        size = base_stat.st_size
        mtime = base_stat.st_mtime
        mtime = int(nanotime.timestamp(mtime))

    # State of files handled by dvc is stored in db as TEXT.
    # We cast results to string for later comparisons with stored values.
    return str(mtime), str(size)
Example #6
def get_mtime_and_size(path, fs, dvcignore=None):
    import nanotime

    if fs.isdir(path):
        size = 0
        files_mtimes = {}
        if dvcignore:
            walk_iterator = dvcignore.find(fs, path)
        else:
            walk_iterator = fs.find(path)
        for file_path in walk_iterator:
            try:
                stats = fs.info(file_path)
            except OSError as exc:
                # NOTE: broken symlink case.
                if exc.errno != errno.ENOENT:
                    raise
                continue
            size += stats["size"]
            files_mtimes[file_path] = stats["mtime"]

        # We track file changes and moves, which cannot be detected by simply
        # taking max(mtime(f) for f in non_ignored_files).
        mtime = dict_md5(files_mtimes)
    else:
        base_stat = fs.info(path)
        size = base_stat["size"]
        mtime = base_stat["mtime"]
        mtime = int(nanotime.timestamp(mtime))

    return str(mtime), size
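
The comment about moves in Examples #5 and #6 is worth unpacking: renaming a file leaves every mtime value intact, so any aggregate of the values alone (such as their maximum) cannot see the rename, whereas hashing the whole path-to-mtime mapping can. A standalone illustration; mapping_md5 is a hypothetical stand-in for dict_md5:

    import hashlib
    import json

    def mapping_md5(d):
        # Hypothetical stand-in for dict_md5 over a {path: mtime} mapping.
        data = json.dumps(d, sort_keys=True).encode("utf-8")
        return hashlib.md5(data).hexdigest()

    before = {"data/a.csv": 1700000000.0, "data/b.csv": 1700000100.0}
    after = {"data/renamed.csv": 1700000000.0, "data/b.csv": 1700000100.0}

    assert max(before.values()) == max(after.values())  # the move is invisible here
    assert mapping_md5(before) != mapping_md5(after)    # but the mapping hash changes
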
Example #7
    def test_dict_md5(self):
        d = {
            "cmd": "python code.py foo file1",
            "locked": "true",
            "outs": [{
                "path": "file1",
                "metric": {"type": "raw"},
                "cache": False,
                "md5": "acbd18db4cc2f85cedef654fccc4a4d8",
            }],
            "deps": [
                {"path": "foo", "md5": "acbd18db4cc2f85cedef654fccc4a4d8"},
                {"path": "code.py", "md5": "d05447644b89960913c7eee5fd776adb"},
            ],
        }

        md5 = "8b263fa05ede6c3145c164829be694b4"

        self.assertEqual(md5, utils.dict_md5(d, exclude=["metric", "locked"]))
Example #8
    def identifier(self) -> str:
        """Unique identifier for the index.

        We can use this to optimize and skip opening some indices
        e.g. on push/pull/fetch/gc --all-commits.

        Currently, it is unique to the platform (windows vs posix).
        """
        return dict_md5(self.dumpd())
Example #9
    def _compute_md5(self):
        from dvc.output.local import OutputLOCAL

        d = self.dumpd()

        # NOTE: removing md5 manually in order to not affect md5s in deps/outs
        if self.PARAM_MD5 in d:
            del d[self.PARAM_MD5]

        # NOTE: excluding parameters that don't affect the state of the
        # pipeline. Not excluding `OutputLOCAL.PARAM_CACHE`, because if
        # it has changed, we might not have that output in our cache.
        return dict_md5(d,
                        exclude=[self.PARAM_LOCKED, OutputLOCAL.PARAM_METRIC])
Example #10
    def collect_dir_cache(self, dname):
        dir_info = []

        for root, dirs, files in os.walk(dname):
            bar = False
            if len(files) > LARGE_DIR_SIZE:
                msg = "Computing md5 for a large directory {}. " \
                      "This is only done once."
                Logger.info(msg.format(os.path.relpath(dname)))
                bar = True
                title = os.path.relpath(dname)
                processed = 0
                total = len(files)
                progress.update_target(title, 0, total)

            for fname in files:
                path = os.path.join(root, fname)
                relpath = self.unixpath(os.path.relpath(path, dname))

                if bar:
                    progress.update_target(title, processed, total)
                    processed += 1

                # FIXME: we could've used md5 = state.update(path, dump=False)
                # here, but it is around twice as slow (on SSD; unknown for
                # HDD) for a directory with small files. What we could do here
                # is introduce a file-size limit above which we would actually
                # register the file in our state file.
                md5 = self.state.update(path, dump=False)
                dir_info.append({
                    self.PARAM_RELPATH: relpath,
                    self.PARAM_MD5: md5
                })

            self.state.dump()

            if bar:
                progress.finish_target(title)

        # NOTE: sorting the list by path to ensure reproducibility
        dir_info = sorted(dir_info, key=itemgetter(self.PARAM_RELPATH))

        md5 = dict_md5(dir_info) + self.MD5_DIR_SUFFIX
        if self.changed(md5):
            self.dump_dir_cache(md5, dir_info)

        return (md5, dir_info)
Example #11
    def dumpd(self):
        deps = [x.dumpd() for x in self.deps]
        outs = [x.dumpd() for x in self.outs]

        ret = {}
        if self.cmd is not None:
            ret[Stage.PARAM_CMD] = self.cmd

        if deps:
            ret[Stage.PARAM_DEPS] = deps

        if outs:
            ret[Stage.PARAM_OUTS] = outs

        ret[Stage.PARAM_MD5] = dict_md5(ret)

        return ret
Example #12
    def collect_dir_cache(self, dname):
        dir_info = []

        for root, dirs, files in os.walk(dname):
            bar = False

            if len(files) > LARGE_DIR_SIZE:
                msg = (
                    "Computing md5 for a large directory {}. "
                    "This is only done once."
                )
                logger.info(msg.format(os.path.relpath(root)))
                bar = True
                title = os.path.relpath(root)
                processed = 0
                total = len(files)
                progress.update_target(title, 0, total)

            for fname in files:
                path = os.path.join(root, fname)
                relpath = self.unixpath(os.path.relpath(path, dname))

                if bar:
                    progress.update_target(title, processed, total)
                    processed += 1

                md5 = self.state.update(path)
                dir_info.append(
                    {self.PARAM_RELPATH: relpath, self.PARAM_CHECKSUM: md5}
                )

            if bar:
                progress.finish_target(title)

        # NOTE: sorting the list by path to ensure reproducibility
        dir_info = sorted(dir_info, key=itemgetter(self.PARAM_RELPATH))

        md5 = dict_md5(dir_info) + self.MD5_DIR_SUFFIX
        if self.changed_cache_file(md5):
            self.dump_dir_cache(md5, dir_info)

        return (md5, dir_info)
Example #13
    def _collect_dir(self, dname):
        dir_info = []

        for root, dirs, files in os.walk(dname):
            for fname in files:
                path = os.path.join(root, fname)
                relpath = os.path.relpath(path, dname)

                # FIXME: we could've used `md5 = self.update(path, dump=False)`
                # here, but it is around twice as slow (on SSD; unknown for
                # HDD) for a directory with small files. What we could do here
                # is introduce a file-size limit above which we would actually
                # register the file in our state file.
                md5 = file_md5(path)[0]
                dir_info.append({
                    self.PARAM_RELPATH: relpath,
                    self.PARAM_MD5: md5
                })

        md5 = dict_md5(dir_info) + self.MD5_DIR_SUFFIX
        if self.project.cache.local.changed(md5):
            self.project.cache.local.dump_dir_cache(md5, dir_info)

        return (md5, dir_info)
Example #14
    def compute_md5(self, path):
        if os.path.isdir(path):
            dir_info = self.collect_dir(path)
            return dict_md5(dir_info) + self.MD5_DIR_SUFFIX
        else:
            return file_md5(path)[0]
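
Example #14 is the dispatch that ties the rest together: a file hashes to a plain md5, while a directory hashes to dict_md5 of its sorted entry list plus a suffix. In DVC that suffix is ".dir", so the checksum string alone tells the two kinds of objects apart; the helper below builds on that assumption, and is_dir_checksum is hypothetical here:

    MD5_DIR_SUFFIX = ".dir"

    def is_dir_checksum(checksum):
        # Directory checksums carry the suffix; plain file md5s do not.
        return checksum.endswith(MD5_DIR_SUFFIX)

    assert is_dir_checksum("d41d8cd98f00b204e9800998ecf8427e" + MD5_DIR_SUFFIX)
    assert not is_dir_checksum("d41d8cd98f00b204e9800998ecf8427e")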