Ejemplo n.º 1
0
def file_md5(fname, fs):
    """Return the md5 hexdigest of ``fname`` as read through ``fs``.

    The opened file object is handed to ``_fobj_md5`` together with a flag
    telling whether ``istextfile`` classified the file as binary.  The
    progress bar is enabled (and an info message logged) only when the size
    reported by ``fs.getsize`` is at least ``LARGE_FILE_SIZE``.
    """
    from dvc.istextfile import istextfile
    from dvc.progress import Tqdm

    digest = hashlib.md5()
    is_binary = not istextfile(fname, fs=fs)
    # A falsy size reported by the filesystem is treated as 0.
    total_size = fs.getsize(fname) or 0

    is_large = total_size >= LARGE_FILE_SIZE
    if is_large:
        logger.info(
            f"Computing md5 for a large file '{fname}'. "
            "This is only done once."
        )

    progress_bar = Tqdm(
        desc=str(fname),
        disable=not is_large,
        total=total_size,
        bytes=True,
        leave=False,
    )
    with progress_bar as pbar, fs.open(fname, "rb") as fobj:
        _fobj_md5(fobj, digest, is_binary, pbar.update)

    return digest.hexdigest()
Ejemplo n.º 2
0
Archivo: local.py Proyecto: wellic/dvc
    def _verify_metric(self):
        if not self.metric:
            return

        if os.path.isdir(self.path):
            msg = 'Directory \'{}\' cannot be used as metrics.'
            raise DvcException(msg.format(self.rel_path))

        if not istextfile(self.path):
            msg = 'Binary file \'{}\' cannot be used as metrics.'
            raise DvcException(msg.format(self.rel_path))
Ejemplo n.º 3
0
    def verify_metric(self):
        """Reject existing outputs that cannot be used as metrics.

        Does nothing when ``self.metric`` is falsy or when ``self.path`` does
        not exist.

        Raises:
            DvcException: if ``self.path`` is a directory, or if
                ``istextfile`` does not report it as a text file.
        """
        if not self.metric or not os.path.exists(self.path):
            return

        # Pick the complaint first; both failures raise the same exception.
        if os.path.isdir(self.path):
            problem = "directory '{}' cannot be used as metrics."
        elif not istextfile(self.path):
            problem = "binary file '{}' cannot be used as metrics."
        else:
            return

        raise DvcException(problem.format(self.rel_path))
Ejemplo n.º 4
0
    def verify_metric(self):
        """Reject existing metric outputs that are directories or binary.

        Does nothing when ``self.metric`` is falsy, when ``self.plot`` is
        truthy, or when the path does not exist.

        Raises:
            IsADirectoryError: if the path is a directory.
            DvcException: if ``istextfile`` does not report the path as text.
        """
        # NOTE(review): the original guard parses as
        # ``(not self.metric) or self.plot``, so plot outputs are skipped and
        # the "plot" label below can never be selected -- confirm whether
        # ``not (self.metric or self.plot)`` was intended.
        if not self.metric:
            return
        if self.plot:
            return

        target = os.fspath(self.path_info)
        if not os.path.exists(target):
            return

        kind = "metrics" if self.metric else "plot"

        if os.path.isdir(target):
            raise IsADirectoryError(
                "directory '{}' cannot be used as {}.".format(
                    self.path_info, kind
                )
            )

        if not istextfile(target):
            raise DvcException(
                "binary file '{}' cannot be used as {}.".format(
                    self.path_info, kind
                )
            )
Ejemplo n.º 5
0
    def verify_metric(self):
        """Reject existing metric outputs that are binary files.

        Does nothing when ``self.metric`` is falsy, when ``self.plot`` is
        truthy, or when the path does not exist.  A directory is only
        reported through a debug log message and is otherwise accepted.

        Raises:
            DvcException: if ``istextfile`` does not report the path as text.
        """
        # NOTE(review): the original guard parses as
        # ``(not self.metric) or self.plot``, so plot outputs are skipped and
        # the "plot" label below can never be selected -- confirm whether
        # ``not (self.metric or self.plot)`` was intended.
        if not self.metric:
            return
        if self.plot:
            return

        target = os.fspath(self.path_info)
        if not os.path.exists(target):
            return

        kind = "metrics" if self.metric else "plot"

        if os.path.isdir(target):
            logger.debug(
                "directory '%s' cannot be used as %s.",
                str(self.path_info),
                kind,
            )
            return

        if not istextfile(target, self.fs):
            raise DvcException(
                "binary file '{}' cannot be used as {}.".format(
                    self.path_info, kind
                )
            )
Ejemplo n.º 6
0
def test_hashed_stream_reader_compatibility(tmp_dir, contents):
    """Check HashedStreamReader against file_md5 and istextfile.

    After a single read, the reader's text-file flag and hash value must
    match what ``istextfile`` and ``file_md5`` report for the same file.
    """
    tmp_dir.gen("data", contents)
    data_path = tmp_dir / "data"

    with open(data_path, "rb") as stream:
        reader = HashedStreamReader(stream)
        # Always read more than the DEFAULT_CHUNK_SIZE (512 bytes).
        # This imitates the read actions performed by upload_fobj.
        reader.read(DEFAULT_CHUNK_SIZE * 2)

    fs = LocalFileSystem()
    expected_md5 = file_md5(data_path, fs)

    assert reader.is_text_file is istextfile(data_path, fs)
    assert reader.hash_info.value == expected_md5
Ejemplo n.º 7
0
def file_md5(fname):
    """Return ``(md5 hexdigest, md5 digest)`` of a file.

    Returns ``(None, None)`` when ``fname`` does not exist.  Chunks of files
    that ``istextfile`` reports as text are passed through ``dos2unix``
    before hashing; other files are hashed as read.  The progress bar is
    enabled only for files of at least ``LARGE_FILE_SIZE``.
    """
    from dvc.progress import Tqdm
    from dvc.istextfile import istextfile

    fname = fspath_py35(fname)

    if not os.path.exists(fname):
        return (None, None)

    digest = hashlib.md5()
    is_binary = not istextfile(fname)
    total_size = os.path.getsize(fname)

    is_large = total_size >= LARGE_FILE_SIZE
    if is_large:
        logger.info(
            "Computing md5 for a large file '{}'. "
            "This is only done once.".format(relpath(fname))
        )
    display_name = relpath(fname)

    progress_bar = Tqdm(
        desc=display_name,
        disable=not is_large,
        total=total_size,
        bytes=True,
        leave=False,
    )
    with progress_bar as pbar, open(fname, "rb") as fobj:
        # iter() with a sentinel stops at the first empty read (EOF).
        for block in iter(lambda: fobj.read(LOCAL_CHUNK_SIZE), b""):
            digest.update(block if is_binary else dos2unix(block))
            # Progress counts the raw bytes read, not the converted chunk.
            pbar.update(len(block))

    return (digest.hexdigest(), digest.digest())
Ejemplo n.º 8
0
def file_md5(fname, tree=None):
    """Return ``(md5 hexdigest, md5 digest)`` of a file.

    When ``tree`` is given, its ``exists``/``stat``/``open`` are used and the
    content is always hashed as binary; otherwise the local filesystem is
    used and ``istextfile`` decides whether the file is binary.  Returns
    ``(None, None)`` when the file does not exist.
    """
    from dvc.progress import Tqdm
    from dvc.istextfile import istextfile

    if tree:
        exists, stat, opener = tree.exists, tree.stat, tree.open
        # assume we don't need to run dos2unix when comparing git blobs
        force_binary = True
    else:
        exists, stat, opener = os.path.exists, os.stat, open
        force_binary = False

    if not exists(fname):
        return (None, None)

    digest = hashlib.md5()
    # istextfile is consulted only when binary mode is not already forced.
    is_binary = force_binary or not istextfile(fname)
    total_size = stat(fname).st_size

    is_large = total_size >= LARGE_FILE_SIZE
    if is_large:
        logger.info(
            "Computing md5 for a large file '{}'. "
            "This is only done once.".format(relpath(fname))
        )
    display_name = relpath(fname)

    progress_bar = Tqdm(
        desc=display_name,
        disable=not is_large,
        total=total_size,
        bytes=True,
        leave=False,
    )
    with progress_bar as pbar, opener(fname, "rb") as fobj:
        _fobj_md5(fobj, digest, is_binary, pbar.update)

    return (digest.hexdigest(), digest.digest())
Ejemplo n.º 9
0
def file_md5(fname):
    """Return ``(md5 hexdigest, md5 digest)`` of a file.

    Returns ``(None, None)`` when ``fname`` does not exist.  Chunks of files
    that ``istextfile`` reports as text are passed through ``dos2unix``
    before hashing.  Progress is reported through ``progress`` only for
    files of at least ``LARGE_FILE_SIZE``.
    """
    import dvc.logger as logger
    from dvc.progress import progress
    from dvc.istextfile import istextfile

    if not os.path.exists(fname):
        return (None, None)

    digest = hashlib.md5()
    is_binary = not istextfile(fname)
    file_size = os.path.getsize(fname)

    show_progress = file_size >= LARGE_FILE_SIZE
    if show_progress:
        logger.info(
            "Computing md5 for a large file {}. "
            "This is only done once.".format(os.path.relpath(fname))
        )
        # These two names are only used while show_progress is true.
        target = os.path.relpath(fname)
        bytes_read = 0

    with open(fname, "rb") as fobj:
        # iter() with a sentinel stops at the first empty read (EOF).
        for block in iter(lambda: fobj.read(LOCAL_CHUNK_SIZE), b""):
            if show_progress:
                bytes_read += len(block)
                progress.update_target(target, bytes_read, file_size)

            digest.update(block if is_binary else dos2unix(block))

    if show_progress:
        progress.finish_target(target)

    return (digest.hexdigest(), digest.digest())
Ejemplo n.º 10
0
Archivo: utils.py Proyecto: tonyle9/dvc
def file_md5(fname):
    """Return ``(md5 hexdigest, md5 digest)`` of a file.

    Returns ``(None, None)`` when ``fname`` does not exist.  Chunks of files
    that ``istextfile`` reports as text are passed through ``dos2unix``
    before hashing; other files are hashed as read.
    """
    if not os.path.exists(fname):
        return (None, None)

    digest = hashlib.md5()
    is_binary = not istextfile(fname)

    with open(fname, "rb") as fobj:
        # iter() with a sentinel stops at the first empty read (EOF).
        for block in iter(lambda: fobj.read(LOCAL_CHUNK_SIZE), b""):
            digest.update(block if is_binary else dos2unix(block))

    return (digest.hexdigest(), digest.digest())