def file_md5(fname, fs):
    """Return the md5 hexdigest (a hex string) of a file's contents.

    NOTE(review): an earlier docstring claimed a ``(hexdigest, digest)``
    tuple was returned; this version returns only the hexdigest string.

    Args:
        fname: path of the file to hash.
        fs: filesystem object providing ``getsize`` and ``open``.
    """
    from dvc.istextfile import istextfile
    from dvc.progress import Tqdm

    hash_md5 = hashlib.md5()
    # `binary` tells _fobj_md5 whether to hash the bytes as-is; for text
    # files it presumably normalizes line endings first — TODO confirm.
    binary = not istextfile(fname, fs=fs)
    size = fs.getsize(fname) or 0  # `or 0` guards against a falsy/None size
    no_progress_bar = True
    if size >= LARGE_FILE_SIZE:
        # Hashing big files is slow; show a progress bar and tell the user
        # the cost is one-time.
        no_progress_bar = False
        msg = (f"Computing md5 for a large file '{fname}'. " "This is only done once.")
        logger.info(msg)
    with Tqdm(
        desc=str(fname),
        disable=no_progress_bar,
        total=size,
        bytes=True,
        leave=False,
    ) as pbar:
        with fs.open(fname, "rb") as fobj:
            _fobj_md5(fobj, hash_md5, binary, pbar.update)
    return hash_md5.hexdigest()
def _verify_metric(self): if not self.metric: return if os.path.isdir(self.path): msg = 'Directory \'{}\' cannot be used as metrics.' raise DvcException(msg.format(self.rel_path)) if not istextfile(self.path): msg = 'Binary file \'{}\' cannot be used as metrics.' raise DvcException(msg.format(self.rel_path))
def verify_metric(self):
    """Raise ``DvcException`` if this output cannot serve as a metrics file.

    Non-metric outputs and paths that do not exist yet are skipped;
    directories and binary files are rejected.
    """
    # Short-circuit keeps self.path untouched when the output is not a metric.
    if not self.metric or not os.path.exists(self.path):
        return

    if os.path.isdir(self.path):
        raise DvcException(
            "directory '{}' cannot be used as metrics.".format(self.rel_path)
        )

    if not istextfile(self.path):
        raise DvcException(
            "binary file '{}' cannot be used as metrics.".format(self.rel_path)
        )
def verify_metric(self):
    """Validate that this output can be used as a metrics file.

    Raises:
        IsADirectoryError: if the path is a directory.
        DvcException: if the path is a binary file.

    Non-metric outputs, plots, and not-yet-existing paths are skipped
    (presumably existence is enforced elsewhere — TODO confirm).
    """
    # Plots are validated elsewhere; only plain metric outputs are checked.
    if not self.metric or self.plot:
        return

    path = os.fspath(self.path_info)
    if not os.path.exists(path):
        return

    # The guard above guarantees self.metric is truthy, so the original
    # `"metrics" if self.metric else "plot"` ternary was dead code: the
    # label is always "metrics".
    name = "metrics"

    if os.path.isdir(path):
        raise IsADirectoryError(
            "directory '{}' cannot be used as {}.".format(self.path_info, name)
        )

    if not istextfile(path):
        raise DvcException(
            "binary file '{}' cannot be used as {}.".format(self.path_info, name)
        )
def verify_metric(self):
    """Validate that this output can be used as a metrics file.

    Raises:
        DvcException: if the path is a binary file.

    Non-metric outputs, plots, and not-yet-existing paths are skipped;
    a directory only logs a debug message (best-effort, not an error).
    """
    # Plots are validated elsewhere; only plain metric outputs are checked.
    if not self.metric or self.plot:
        return

    path = os.fspath(self.path_info)
    if not os.path.exists(path):
        return

    # The guard above guarantees self.metric is truthy, so the original
    # `"metrics" if self.metric else "plot"` ternary was dead code: the
    # label is always "metrics".
    name = "metrics"

    if os.path.isdir(path):
        # Deliberately tolerated: log (lazy %-formatting) and bail out.
        msg = "directory '%s' cannot be used as %s."
        logger.debug(msg, str(self.path_info), name)
        return

    if not istextfile(path, self.fs):
        msg = "binary file '{}' cannot be used as {}."
        raise DvcException(msg.format(self.path_info, name))
def test_hashed_stream_reader_compatibility(tmp_dir, contents):
    """HashedStreamReader must agree with file_md5/istextfile on the same data."""
    # Always read more than the DEFAULT_CHUNK_SIZE (512 bytes); this
    # imitates the read actions performed by upload_fobj.
    read_size = DEFAULT_CHUNK_SIZE * 2

    tmp_dir.gen("data", contents)
    data = tmp_dir / "data"

    with open(data, "rb") as fobj:
        reader = HashedStreamReader(fobj)
        reader.read(read_size)

    fs = LocalFileSystem()
    expected_digest = file_md5(data, fs)

    assert reader.is_text_file is istextfile(data, fs)
    assert reader.hash_info.value == expected_digest
def file_md5(fname):
    """Return the ``(md5 hexdigest, md5 digest)`` tuple of a file.

    Returns ``(None, None)`` when the file does not exist.
    """
    from dvc.progress import Tqdm
    from dvc.istextfile import istextfile

    fname = fspath_py35(fname)
    # Guard clause instead of wrapping the whole body in `if exists`.
    if not os.path.exists(fname):
        return (None, None)

    hash_md5 = hashlib.md5()
    # Text files are presumably hashed after dos2unix line-ending
    # normalization so the digest is platform-stable — TODO confirm.
    binary = not istextfile(fname)
    size = os.path.getsize(fname)
    # Fix: relpath(fname) was computed twice (once for the log message,
    # once for the bar description); compute it once and reuse.
    name = relpath(fname)
    no_progress_bar = True
    if size >= LARGE_FILE_SIZE:
        no_progress_bar = False
        msg = "Computing md5 for a large file '{}'. This is only done once."
        logger.info(msg.format(name))

    with Tqdm(
        desc=name,
        disable=no_progress_bar,
        total=size,
        bytes=True,
        leave=False,
    ) as pbar:
        with open(fname, "rb") as fobj:
            while True:
                data = fobj.read(LOCAL_CHUNK_SIZE)
                if not data:
                    break
                chunk = data if binary else dos2unix(data)
                hash_md5.update(chunk)
                pbar.update(len(data))

    return (hash_md5.hexdigest(), hash_md5.digest())
def file_md5(fname, tree=None):
    """Return the ``(md5 hexdigest, md5 digest)`` tuple of a file.

    Args:
        fname: path of the file to hash.
        tree: optional tree object providing ``exists``/``stat``/``open``;
            when given, the content is hashed as-is (binary).

    Returns ``(None, None)`` when the file does not exist.
    """
    from dvc.progress import Tqdm
    from dvc.istextfile import istextfile

    if tree:
        exists_func = tree.exists
        stat_func = tree.stat
        open_func = tree.open
        # Assume we don't need to run dos2unix when comparing git blobs.
        binary = True
    else:
        exists_func = os.path.exists
        stat_func = os.stat
        open_func = open
        binary = False

    # Guard clause instead of wrapping the whole body in `if exists`.
    if not exists_func(fname):
        return (None, None)

    hash_md5 = hashlib.md5()
    if not binary:
        binary = not istextfile(fname)
    size = stat_func(fname).st_size
    # Fix: relpath(fname) was computed twice (log message + bar
    # description); compute it once and reuse.
    name = relpath(fname)
    no_progress_bar = True
    if size >= LARGE_FILE_SIZE:
        no_progress_bar = False
        msg = "Computing md5 for a large file '{}'. This is only done once."
        logger.info(msg.format(name))

    with Tqdm(
        desc=name,
        disable=no_progress_bar,
        total=size,
        bytes=True,
        leave=False,
    ) as pbar:
        with open_func(fname, "rb") as fobj:
            _fobj_md5(fobj, hash_md5, binary, pbar.update)

    return (hash_md5.hexdigest(), hash_md5.digest())
def file_md5(fname):
    """Return the ``(md5 hexdigest, md5 digest)`` tuple of a file.

    Returns ``(None, None)`` when the file does not exist.
    """
    import dvc.logger as logger
    from dvc.progress import progress
    from dvc.istextfile import istextfile

    # Guard clause instead of wrapping the whole body in `if exists`.
    if not os.path.exists(fname):
        return (None, None)

    hash_md5 = hashlib.md5()
    # Text files are presumably hashed after dos2unix line-ending
    # normalization so the digest is platform-stable — TODO confirm.
    binary = not istextfile(fname)
    size = os.path.getsize(fname)
    # Fix: os.path.relpath(fname) was computed twice (log message +
    # progress-target name); compute it once and reuse.
    name = os.path.relpath(fname)
    bar = False  # show progress only for large files
    if size >= LARGE_FILE_SIZE:
        bar = True
        msg = "Computing md5 for a large file {}. This is only done once."
        logger.info(msg.format(name))

    total = 0
    with open(fname, "rb") as fobj:
        while True:
            data = fobj.read(LOCAL_CHUNK_SIZE)
            if not data:
                break
            if bar:
                total += len(data)
                progress.update_target(name, total, size)
            chunk = data if binary else dos2unix(data)
            hash_md5.update(chunk)

    if bar:
        progress.finish_target(name)

    return (hash_md5.hexdigest(), hash_md5.digest())
def file_md5(fname):
    """Return the ``(md5 hexdigest, md5 digest)`` tuple of a file.

    Returns ``(None, None)`` when the file does not exist.
    """
    if not os.path.exists(fname):
        return (None, None)

    hash_md5 = hashlib.md5()
    binary = not istextfile(fname)
    with open(fname, 'rb') as fobj:
        while True:
            data = fobj.read(LOCAL_CHUNK_SIZE)
            if not data:
                break
            # Text files go through dos2unix before hashing; binary data
            # is hashed as-is.
            chunk = data if binary else dos2unix(data)
            hash_md5.update(chunk)

    return (hash_md5.hexdigest(), hash_md5.digest())