def collect_dir_cache(self, dname):
    dir_info = []

    for root, dirs, files in os.walk(dname):
        for fname in files:
            path = os.path.join(root, fname)
            relpath = self.unixpath(os.path.relpath(path, dname))

            # FIXME: we could've used md5 = state.update(path, dump=False)
            # here, but it is around twice as slow (on SSD, don't know about
            # HDD) for a directory with small files. What we could do here
            # is introduce some kind of a limit for file size, after which
            # we would actually register it in our state file.
            md5 = file_md5(path)[0]
            dir_info.append({self.PARAM_RELPATH: relpath, self.PARAM_MD5: md5})

    # NOTE: sorting the list by path to ensure reproducibility
    dir_info = sorted(dir_info, key=itemgetter(self.PARAM_RELPATH))

    md5 = dict_md5(dir_info) + self.MD5_DIR_SUFFIX
    if self.changed(md5):
        self.dump_dir_cache(md5, dir_info)

    return (md5, dir_info)
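Every variant below leans on a dict_md5 helper that isn't shown in this section. As a point of reference, here is a minimal sketch of how such a helper can work, assuming it hashes a canonical sorted-keys JSON serialization after recursively dropping excluded keys; the real implementation may differ in details:

import hashlib
import json


def dict_md5(d, exclude=()):
    # Hypothetical sketch. Recursively drop excluded keys so parameters
    # like "locked" or "metric" never influence the digest (see the
    # exclude= calls in the versions below).
    def _strip(obj):
        if isinstance(obj, dict):
            return {k: _strip(v) for k, v in obj.items() if k not in exclude}
        if isinstance(obj, list):
            return [_strip(v) for v in obj]
        return obj

    # sort_keys=True makes the serialization, and thus the digest,
    # independent of dict insertion order.
    serialized = json.dumps(_strip(d), sort_keys=True).encode("utf-8")
    return hashlib.md5(serialized).hexdigest()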
def _compute_md5(self):
    from dvc.output.base import OutputBase

    d = self.dumpd()

    # NOTE: removing md5 manually in order to not affect md5s in deps/outs
    if self.PARAM_MD5 in d.keys():
        del d[self.PARAM_MD5]

    # Ignore the wdir default value. In this case stage file w/o
    # wdir has the same md5 as a file with the default value specified.
    # It's important for backward compatibility with pipelines that
    # didn't have WDIR in their stage files.
    if d.get(self.PARAM_WDIR) == ".":
        del d[self.PARAM_WDIR]

    # NOTE: excluding parameters that don't affect the state of the
    # pipeline. Not excluding `OutputLOCAL.PARAM_CACHE`, because if
    # it has changed, we might not have that output in our cache.
    m = dict_md5(
        d,
        exclude=[
            self.PARAM_LOCKED,
            OutputBase.PARAM_METRIC,
            OutputBase.PARAM_TAGS,
            OutputBase.PARAM_PERSIST,
        ],
    )
    logger.debug("Computed stage '{}' md5: '{}'".format(self.relpath, m))
    return m
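The wdir rule above can be illustrated directly. Using the dict_md5 sketch from earlier (the stage dicts here are made up), a stage dumped without wdir hashes the same as one dumped with the default value:

# A stage dict without "wdir" and one carrying the default "." must
# produce the same md5, so old stage files keep their original checksums.
old_style = {"cmd": "python train.py"}
new_style = {"cmd": "python train.py", "wdir": "."}

if new_style.get("wdir") == ".":
    del new_style["wdir"]

assert dict_md5(old_style) == dict_md5(new_style)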
def test_dict_md5(self):
    d = {
        'cmd': 'python code.py foo file1',
        'locked': 'true',
        'outs': [{
            'path': 'file1',
            'metric': {
                'type': 'raw'
            },
            'cache': False,
            'md5': 'acbd18db4cc2f85cedef654fccc4a4d8'
        }],
        'deps': [{
            'path': 'foo',
            'md5': 'acbd18db4cc2f85cedef654fccc4a4d8'
        }, {
            'path': 'code.py',
            'md5': 'd05447644b89960913c7eee5fd776adb'
        }]
    }

    md5 = '8b263fa05ede6c3145c164829be694b4'

    self.assertEqual(md5, utils.dict_md5(d, exclude=['metric', 'locked']))
def _compute_md5(self):
    from dvc.output.base import BaseOutput

    d = self.dumpd()

    # Remove md5 and meta, these should not affect stage md5
    d.pop(self.PARAM_MD5, None)
    d.pop(self.PARAM_META, None)

    # Ignore the wdir default value. In this case DVC-file w/o
    # wdir has the same md5 as a file with the default value specified.
    # It's important for backward compatibility with pipelines that
    # didn't have WDIR in their DVC-files.
    if d.get(self.PARAM_WDIR) == ".":
        del d[self.PARAM_WDIR]

    # NOTE: excluding parameters that don't affect the state of the
    # pipeline. Not excluding `LocalOutput.PARAM_CACHE`, because if
    # it has changed, we might not have that output in our cache.
    m = dict_md5(
        d,
        exclude=[
            self.PARAM_LOCKED,
            BaseOutput.PARAM_METRIC,
            BaseOutput.PARAM_PERSIST,
        ],
    )
    logger.debug("Computed {} md5: '{}'".format(self, m))
    return m
def get_mtime_and_size(path, dvcignore):
    if os.path.isdir(fspath_py35(path)):
        size = 0
        files_mtimes = {}
        for file_path in walk_files(path, dvcignore):
            try:
                stat = os.stat(file_path)
            except OSError as exc:
                # NOTE: broken symlink case.
                if exc.errno != errno.ENOENT:
                    raise
                continue
            size += stat.st_size
            files_mtimes[file_path] = stat.st_mtime

        # We track file changes and moves, which cannot be detected with simply
        # max(mtime(f) for f in non_ignored_files)
        mtime = dict_md5(files_mtimes)
    else:
        base_stat = os.stat(fspath_py35(path))
        size = base_stat.st_size
        mtime = base_stat.st_mtime
        mtime = int(nanotime.timestamp(mtime))

    # State of files handled by dvc is stored in db as TEXT.
    # We cast results to string for later comparisons with stored values.
    return str(mtime), str(size)
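The comment about moves deserves a concrete example: hashing the whole {path: mtime} mapping changes when a file is renamed even if no mtime does, while max() over the mtimes stays the same. A small illustration with made-up paths and timestamps, reusing the dict_md5 sketch from above:

before = {"data/a.csv": 1700000000.0, "data/b.csv": 1700000050.0}
# "b.csv" is renamed to "c.csv"; a rename typically preserves the mtime.
after = {"data/a.csv": 1700000000.0, "data/c.csv": 1700000050.0}

assert max(before.values()) == max(after.values())  # move goes unnoticed
assert dict_md5(before) != dict_md5(after)          # move is detected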
def get_mtime_and_size(path, fs, dvcignore=None):
    import nanotime

    if fs.isdir(path):
        size = 0
        files_mtimes = {}
        if dvcignore:
            walk_iterator = dvcignore.find(fs, path)
        else:
            walk_iterator = fs.find(path)
        for file_path in walk_iterator:
            try:
                stats = fs.info(file_path)
            except OSError as exc:
                # NOTE: broken symlink case.
                if exc.errno != errno.ENOENT:
                    raise
                continue
            size += stats["size"]
            files_mtimes[file_path] = stats["mtime"]

        # We track file changes and moves, which cannot be detected with simply
        # max(mtime(f) for f in non_ignored_files)
        mtime = dict_md5(files_mtimes)
    else:
        base_stat = fs.info(path)
        size = base_stat["size"]
        mtime = base_stat["mtime"]
        mtime = int(nanotime.timestamp(mtime))

    return str(mtime), size
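A hypothetical call of this filesystem-abstracted version, assuming an fsspec-style local filesystem whose info() result includes "size" and "mtime" keys (DVC ships its own filesystem wrappers, so this is only an approximation):

from fsspec.implementations.local import LocalFileSystem

fs = LocalFileSystem()
# No dvcignore passed, so the function falls back to plain fs.find().
mtime, size = get_mtime_and_size("data", fs)
print(mtime, size)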
def test_dict_md5(self): d = { "cmd": "python code.py foo file1", "locked": "true", "outs": [{ "path": "file1", "metric": { "type": "raw" }, "cache": False, "md5": "acbd18db4cc2f85cedef654fccc4a4d8", }], "deps": [ { "path": "foo", "md5": "acbd18db4cc2f85cedef654fccc4a4d8" }, { "path": "code.py", "md5": "d05447644b89960913c7eee5fd776adb" }, ], } md5 = "8b263fa05ede6c3145c164829be694b4" self.assertEqual(md5, utils.dict_md5(d, exclude=["metric", "locked"]))
def identifier(self) -> str: """Unique identifier for the index. We can use this to optimize and skip opening some indices eg: on push/pull/fetch/gc --all-commits. Currently, it is unique to the platform (windows vs posix). """ return dict_md5(self.dumpd())
def _compute_md5(self):
    from dvc.output.local import OutputLOCAL

    d = self.dumpd()

    # NOTE: removing md5 manually in order to not affect md5s in deps/outs
    if self.PARAM_MD5 in d.keys():
        del d[self.PARAM_MD5]

    # NOTE: excluding parameters that don't affect the state of the
    # pipeline. Not excluding `OutputLOCAL.PARAM_CACHE`, because if
    # it has changed, we might not have that output in our cache.
    return dict_md5(d, exclude=[self.PARAM_LOCKED, OutputLOCAL.PARAM_METRIC])
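What the exclude= list buys here: locking a stage or marking an output as a metric should not invalidate the stage. A quick check against the dict_md5 sketch above, with made-up stage dicts:

plain = {"cmd": "python code.py", "outs": [{"path": "out.bin"}]}
decorated = {
    "cmd": "python code.py",
    "locked": "true",
    "outs": [{"path": "out.bin", "metric": {"type": "raw"}}],
}

# With "locked" and "metric" excluded, the two stages hash identically.
assert dict_md5(plain, exclude=["locked", "metric"]) == \
    dict_md5(decorated, exclude=["locked", "metric"])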
def collect_dir_cache(self, dname):
    dir_info = []

    for root, dirs, files in os.walk(dname):
        bar = False

        if len(files) > LARGE_DIR_SIZE:
            msg = "Computing md5 for a large directory {}. " \
                  "This is only done once."
            Logger.info(msg.format(os.path.relpath(dname)))
            bar = True
            title = os.path.relpath(dname)
            processed = 0
            total = len(files)
            progress.update_target(title, 0, total)

        for fname in files:
            path = os.path.join(root, fname)
            relpath = self.unixpath(os.path.relpath(path, dname))

            if bar:
                progress.update_target(title, processed, total)
                processed += 1

            md5 = self.state.update(path, dump=False)
            dir_info.append({
                self.PARAM_RELPATH: relpath,
                self.PARAM_MD5: md5
            })

        self.state.dump()

        if bar:
            progress.finish_target(title)

    # NOTE: sorting the list by path to ensure reproducibility
    dir_info = sorted(dir_info, key=itemgetter(self.PARAM_RELPATH))

    md5 = dict_md5(dir_info) + self.MD5_DIR_SUFFIX
    if self.changed(md5):
        self.dump_dir_cache(md5, dir_info)

    return (md5, dir_info)
def dumpd(self):
    deps = [x.dumpd() for x in self.deps]
    outs = [x.dumpd() for x in self.outs]

    ret = {}
    if self.cmd is not None:
        ret[Stage.PARAM_CMD] = self.cmd

    if deps:
        ret[Stage.PARAM_DEPS] = deps

    if outs:
        ret[Stage.PARAM_OUTS] = outs

    ret[Stage.PARAM_MD5] = dict_md5(ret)

    return ret
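Note that dict_md5(ret) runs before PARAM_MD5 is inserted, so the stored digest covers everything except itself and can be re-verified by popping it off first. A self-contained simulation with plain dicts and the dict_md5 sketch from earlier:

# Simulate dumpd(): the digest is computed over the dict *before* the
# "md5" key is attached, exactly as in the method above.
ret = {"cmd": "python code.py", "deps": [{"path": "code.py"}]}
ret["md5"] = dict_md5(ret)

# Verification: drop the stored digest and recompute over the rest.
stored = ret.pop("md5")
assert stored == dict_md5(ret)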
def collect_dir_cache(self, dname):
    dir_info = []

    for root, dirs, files in os.walk(dname):
        bar = False

        if len(files) > LARGE_DIR_SIZE:
            msg = (
                "Computing md5 for a large directory {}. "
                "This is only done once."
            )
            logger.info(msg.format(os.path.relpath(root)))
            bar = True
            title = os.path.relpath(root)
            processed = 0
            total = len(files)
            progress.update_target(title, 0, total)

        for fname in files:
            path = os.path.join(root, fname)
            relpath = self.unixpath(os.path.relpath(path, dname))

            if bar:
                progress.update_target(title, processed, total)
                processed += 1

            md5 = self.state.update(path)
            dir_info.append(
                {self.PARAM_RELPATH: relpath, self.PARAM_CHECKSUM: md5}
            )

        if bar:
            progress.finish_target(title)

    # NOTE: sorting the list by path to ensure reproducibility
    dir_info = sorted(dir_info, key=itemgetter(self.PARAM_RELPATH))

    md5 = dict_md5(dir_info) + self.MD5_DIR_SUFFIX
    if self.changed_cache_file(md5):
        self.dump_dir_cache(md5, dir_info)

    return (md5, dir_info)
def _collect_dir(self, dname):
    dir_info = []

    for root, dirs, files in os.walk(dname):
        for fname in files:
            path = os.path.join(root, fname)
            relpath = os.path.relpath(path, dname)

            # FIXME: we could've used `md5 = self.update(path, dump=False)`
            # here, but it is around twice as slow (on SSD, don't know about
            # HDD) for a directory with small files. What we could do here is
            # introduce some kind of a limit for file size, after which we
            # would actually register it in our state file.
            md5 = file_md5(path)[0]
            dir_info.append({
                self.PARAM_RELPATH: relpath,
                self.PARAM_MD5: md5
            })

    md5 = dict_md5(dir_info) + self.MD5_DIR_SUFFIX
    if self.project.cache.local.changed(md5):
        self.project.cache.local.dump_dir_cache(md5, dir_info)

    return (md5, dir_info)
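Several versions above call file_md5(path)[0], which implies the helper returns a sequence whose first element is the hex digest. A plausible sketch, assuming it streams the file in chunks and returns both hex and raw digests:

import hashlib


def file_md5(fname, chunk_size=1024 * 1024):
    # Hypothetical sketch. Stream in fixed-size chunks so large files
    # never have to fit in memory at once.
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as fobj:
        for chunk in iter(lambda: fobj.read(chunk_size), b""):
            hash_md5.update(chunk)
    # Callers take [0] for the hex digest; [1] is the raw 16-byte digest.
    return (hash_md5.hexdigest(), hash_md5.digest())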
def compute_md5(self, path):
    if os.path.isdir(path):
        dir_info = self.collect_dir(path)
        return dict_md5(dir_info) + self.MD5_DIR_SUFFIX
    else:
        return file_md5(path)[0]
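The MD5_DIR_SUFFIX convention lets callers distinguish directory checksums from file checksums by inspection alone. A hypothetical helper built on that idea, assuming the suffix is ".dir":

MD5_DIR_SUFFIX = ".dir"  # assumed value for illustration


def is_dir_checksum(md5):
    # A checksum produced by compute_md5() on a directory carries the
    # suffix; plain file checksums are bare 32-character hex strings.
    return md5.endswith(MD5_DIR_SUFFIX)


assert is_dir_checksum("8b263fa05ede6c3145c164829be694b4" + MD5_DIR_SUFFIX)
assert not is_dir_checksum("8b263fa05ede6c3145c164829be694b4")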