Example #1
File: local.py Project: Faadal/dvc
    def push(self, checksum_infos, remote, jobs=1):
        md5s = [
            info[self.PARAM_MD5] for info in self._collect(checksum_infos)[0]
        ]

        # NOTE: verifying that our cache is not corrupted
        md5s = list(filter(lambda md5: not self.changed(md5), md5s))

        # NOTE: filter files that are already uploaded
        path_infos = remote.md5s_to_path_infos(md5s)
        md5s_exist = filter(lambda x: not x[1],
                            list(zip(md5s, remote.exists(path_infos))))
        md5s = [md5 for md5, exists in md5s_exist]

        cache = [self.get(md5) for md5 in md5s]
        path_infos = remote.md5s_to_path_infos(md5s)

        assert len(path_infos) == len(cache) == len(md5s)

        chunks = list(
            zip(to_chunks(path_infos, jobs), to_chunks(cache, jobs),
                to_chunks(md5s, jobs)))

        progress.set_n_total(len(md5s))

        if len(chunks) == 0:
            return

        with ThreadPoolExecutor(max_workers=len(chunks)) as executor:
            for path_infos, paths, md5s in chunks:
                executor.submit(remote.upload, paths, path_infos, names=md5s)
Example #2
    def _get_chunks(self, download, remote, status_info, status, jobs):
        title = "Analysing status."

        progress.set_n_total(1)
        total = len(status_info)
        current = 0

        cache = []
        path_infos = []
        names = []
        for md5, info in status_info.items():
            if info["status"] == status:
                cache.append(self.checksum_to_path_info(md5))
                path_infos.append(remote.checksum_to_path_info(md5))
                names.append(info["name"])
            current += 1
            progress.update_target(title, current, total)

        progress.finish_target(title)

        progress.set_n_total(len(names))

        if download:
            to_infos = cache
            from_infos = path_infos
        else:
            to_infos = path_infos
            from_infos = cache

        return list(
            zip(
                to_chunks(from_infos, jobs),
                to_chunks(to_infos, jobs),
                to_chunks(names, jobs),
            ))
Example #3
File: local.py Project: hfchong/dvc
    def _do_pull(self, checksum_infos, remote, jobs=1, no_progress_bar=False):
        md5s = [info[self.PARAM_MD5] for info in checksum_infos]

        # NOTE: filter files that are not corrupted
        md5s = list(filter(lambda md5: self.changed(md5), md5s))

        cache = [{'scheme': 'local', 'path': self.get(md5)} for md5 in md5s]
        path_infos = remote.md5s_to_path_infos(md5s)

        assert len(path_infos) == len(cache) == len(md5s)

        chunks = list(
            zip(to_chunks(path_infos, jobs), to_chunks(cache, jobs),
                to_chunks(md5s, jobs)))

        progress.set_n_total(len(md5s))

        if len(chunks) == 0:
            return

        futures = []
        with ThreadPoolExecutor(max_workers=len(chunks)) as executor:
            for from_infos, to_infos, md5s in chunks:
                res = executor.submit(remote.download,
                                      from_infos,
                                      to_infos,
                                      names=md5s,
                                      no_progress_bar=no_progress_bar)
                futures.append(res)

        for f in futures:
            f.result()
Example #4
    def _do_pull(self, checksum_infos, remote, jobs=1, show_checksums=False):
        title = "Collecting information"

        progress.set_n_total(1)
        progress.update_target(title, 0, 100)

        grouped = zip(
            *self._group(checksum_infos, show_checksums=show_checksums))

        progress.update_target(title, 10, 100)

        md5s = []
        names = []
        # NOTE: filter files that are not corrupted
        for md5, name in grouped:
            if self.changed_cache(md5):
                md5s.append(md5)
                names.append(name)

        progress.update_target(title, 30, 100)

        cache = [{'scheme': 'local', 'path': self.get(md5)} for md5 in md5s]

        progress.update_target(title, 50, 100)

        path_infos = remote.md5s_to_path_infos(md5s)

        progress.update_target(title, 60, 100)

        # NOTE: dummy call to try to establish a connection
        # to see if we need to ask user for a password.
        remote.exists(remote.md5s_to_path_infos(['000']))

        progress.update_target(title, 70, 100)

        assert len(path_infos) == len(cache) == len(md5s) == len(names)

        chunks = list(
            zip(to_chunks(path_infos, jobs), to_chunks(cache, jobs),
                to_chunks(names, jobs)))

        progress.finish_target(title)

        progress.set_n_total(len(names))

        if len(chunks) == 0:
            return

        futures = []
        with ThreadPoolExecutor(max_workers=len(chunks)) as executor:
            for from_infos, to_infos, names in chunks:
                res = executor.submit(remote.download,
                                      from_infos,
                                      to_infos,
                                      names=names)
                futures.append(res)

        for f in futures:
            f.result()
Example #5
File: ssh.py Project: zang3tsu/dvc
    def hashes_exist(self, hashes, jobs=None, name=None):
        """This is older implementation used in remote/base.py
        We are reusing it in RemoteSSH, because SSH's batch_exists proved to be
        faster than current approach (relying on exists(path_info)) applied in
        remote/base.
        """
        if not self.tree.CAN_TRAVERSE:
            return list(set(hashes) & set(self.tree.all()))

        # possibly prompt for credentials before "Querying" progress output
        self.tree.ensure_credentials()

        with Tqdm(
                desc="Querying " +
            ("cache in " + name if name else "remote cache"),
                total=len(hashes),
                unit="file",
        ) as pbar:

            def exists_with_progress(chunks):
                return self.batch_exists(chunks, callback=pbar.update_msg)

            with ThreadPoolExecutor(
                    max_workers=jobs or self.tree.JOBS) as executor:
                path_infos = [self.tree.hash_to_path_info(x) for x in hashes]
                chunks = to_chunks(path_infos, num_chunks=self.tree.JOBS)
                results = executor.map(exists_with_progress, chunks)
                in_remote = itertools.chain.from_iterable(results)
                ret = list(itertools.compress(hashes, in_remote))
                return ret
Example #6
    def cache_exists(self, checksums, jobs=None, name=None):
        """This is older implementation used in remote/base.py
        We are reusing it in RemoteSSH, because SSH's batch_exists proved to be
        faster than current approach (relying on exists(path_info)) applied in
        remote/base.
        """
        if not self.no_traverse:
            return list(set(checksums) & set(self.all()))

        with Tqdm(
                desc="Querying " +
            ("cache in " + name if name else "remote cache"),
                total=len(checksums),
                unit="file",
        ) as pbar:

            def exists_with_progress(chunks):
                return self.batch_exists(chunks, callback=pbar.update_desc)

            with ThreadPoolExecutor(max_workers=jobs or self.JOBS) as executor:
                path_infos = [self.checksum_to_path_info(x) for x in checksums]
                chunks = to_chunks(path_infos, num_chunks=self.JOBS)
                results = executor.map(exists_with_progress, chunks)
                in_remote = itertools.chain.from_iterable(results)
                ret = list(itertools.compress(checksums, in_remote))
                return ret
Example #7
File: base.py Project: vibhor98/dvc
    def cache_exists(self, checksums):
        """Check if the given checksums are stored in the remote.

        There are two ways of performing this check:

        - Traverse: Get a list of all the files in the remote
            (traversing the cache directory) and compare it with
            the given checksums.

        - No traverse: For each given checksum, run the `exists`
            method and filter the checksums that aren't on the remote.
            This is done in parallel threads.

        The reason for such an odd logic is that most of the remotes
        take much shorter time to just retrieve everything they have under
        a certain prefix (e.g. s3, gs, ssh, hdfs). Other remotes that can
        check if particular file exists much quicker, use their own
        implementation of cache_exists (see http, local).

        Returns:
            A list with checksums that were found in the remote
        """
        if self.no_traverse and self.SUPPORTS_NO_TRAVERSE:
            with ThreadPoolExecutor(max_workers=self.JOBS) as executor:
                path_infos = [self.checksum_to_path_info(x) for x in checksums]
                chunks = to_chunks(path_infos, self.JOBS)
                results = executor.map(self.exists, chunks)
                in_remote = itertools.chain.from_iterable(results)
                return list(itertools.compress(checksums, in_remote))

        return list(set(checksums) & set(self.all()))
Example #8
File: ssh.py Project: zang3tsu/dvc
    def batch_exists(self, path_infos, callback):
        def _exists(chunk_and_channel):
            chunk, channel = chunk_and_channel
            ret = []
            for path in chunk:
                try:
                    channel.stat(path)
                    ret.append(True)
                except OSError as exc:
                    if exc.errno != errno.ENOENT:
                        raise
                    ret.append(False)
                callback(path)
            return ret

        with self.tree.ssh(path_infos[0]) as ssh:
            channels = ssh.open_max_sftp_channels()
            max_workers = len(channels)

            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                paths = [path_info.path for path_info in path_infos]
                chunks = to_chunks(paths, num_chunks=max_workers)
                chunks_and_channels = zip(chunks, channels)
                outcome = executor.map(_exists, chunks_and_channels)
                results = list(itertools.chain.from_iterable(outcome))

            return results
Example #9
    def remove_unused_links(self, used):
        """Removes all saved links except the ones that are used.

        Args:
            used (list): list of used links that should not be removed.
        """
        unused = []

        self._execute("SELECT * FROM {}".format(self.LINK_STATE_TABLE))
        for row in self.cursor:
            relpath, inode, mtime = row
            inode = self._from_sqlite(inode)
            path = os.path.join(self.root_dir, relpath)

            if path in used:
                continue

            if not os.path.exists(path):
                continue

            actual_inode = get_inode(path)
            actual_mtime, _ = get_mtime_and_size(path, self.repo.dvcignore)

            if inode == actual_inode and mtime == actual_mtime:
                logger.debug("Removing '{}' as unused link.".format(path))
                remove(path)
                unused.append(relpath)

        for chunk_unused in to_chunks(unused,
                                      chunk_size=SQLITE_MAX_VARIABLES_NUMBER):
            cmd = "DELETE FROM {} WHERE path IN ({})".format(
                self.LINK_STATE_TABLE, ",".join(["?"] * len(chunk_unused)))
            self._execute(cmd, tuple(chunk_unused))
Example #10
    def push(self, checksum_infos, remote, jobs=1, show_checksums=False):
        checksum_infos = self._collect(checksum_infos)[0]

        # NOTE: verifying that our cache is not corrupted
        def func(info):
            return not self.changed(info[self.PARAM_MD5])
        checksum_infos = list(filter(func, checksum_infos))

        # NOTE: filter files that are already uploaded
        md5s = [i[self.PARAM_MD5] for i in checksum_infos]
        exists = remote.exists(remote.md5s_to_path_infos(md5s))

        def func(entry):
            return not entry[0]

        assert len(exists) == len(checksum_infos)
        infos_exist = list(filter(func, zip(exists, checksum_infos)))
        checksum_infos = [i for e, i in infos_exist]

        md5s, names = self._group(checksum_infos,
                                  show_checksums=show_checksums)
        cache = [{'scheme': 'local', 'path': self.get(md5)} for md5 in md5s]
        path_infos = remote.md5s_to_path_infos(md5s)

        assert len(path_infos) == len(cache) == len(md5s) == len(names)

        chunks = list(zip(to_chunks(path_infos, jobs),
                          to_chunks(cache, jobs),
                          to_chunks(names, jobs)))

        progress.set_n_total(len(names))

        if len(chunks) == 0:
            return

        futures = []
        with ThreadPoolExecutor(max_workers=len(chunks)) as executor:
            for to_infos, from_infos, names in chunks:
                res = executor.submit(remote.upload,
                                      from_infos,
                                      to_infos,
                                      names=names)
                futures.append(res)

        for f in futures:
            f.result()
Example #11
    def remove_links(self, unused):
        for path in unused:
            remove(path)

        for chunk_unused in to_chunks(unused,
                                      chunk_size=SQLITE_MAX_VARIABLES_NUMBER):
            cmd = "DELETE FROM {} WHERE path IN ({})".format(
                self.LINK_STATE_TABLE, ",".join(["?"] * len(chunk_unused)))
            self._execute(cmd, tuple(chunk_unused))
Example #12
    def _do_pull(self,
                 checksum_infos,
                 remote,
                 jobs=1,
                 no_progress_bar=False,
                 show_checksums=False):
        md5s = []
        names = []
        # NOTE: filter files that are not corrupted
        for md5, name in zip(*self._group(checksum_infos,
                                          show_checksums=show_checksums)):
            if self.changed(md5):
                md5s.append(md5)
                names.append(name)

        cache = [{'scheme': 'local', 'path': self.get(md5)} for md5 in md5s]
        path_infos = remote.md5s_to_path_infos(md5s)

        assert len(path_infos) == len(cache) == len(md5s) == len(names)

        chunks = list(zip(to_chunks(path_infos, jobs),
                          to_chunks(cache, jobs),
                          to_chunks(names, jobs)))

        progress.set_n_total(len(names))

        if len(chunks) == 0:
            return

        futures = []
        with ThreadPoolExecutor(max_workers=len(chunks)) as executor:
            for from_infos, to_infos, names in chunks:
                res = executor.submit(remote.download,
                                      from_infos,
                                      to_infos,
                                      names=names,
                                      no_progress_bar=no_progress_bar)
                futures.append(res)

        for f in futures:
            f.result()
Example #13
    def push(self, checksum_infos, remote, jobs=1):
        checksum_infos = self._collect(checksum_infos)[0]
        md5s = [info[self.PARAM_MD5] for info in checksum_infos]

        # NOTE: verifying that our cache is not corrupted
        md5s = list(filter(lambda md5: not self.changed(md5), md5s))

        # NOTE: filter files that are already uploaded
        path_infos = remote.md5s_to_path_infos(md5s)
        lexists = remote.exists(path_infos)
        md5s_exist = filter(lambda x: not x[1], list(zip(md5s, lexists)))
        md5s = [md5 for md5, exists in md5s_exist]

        cache = [{'scheme': 'local', 'path': self.get(md5)} for md5 in md5s]
        path_infos = remote.md5s_to_path_infos(md5s)

        assert len(path_infos) == len(cache) == len(md5s)

        chunks = list(
            zip(to_chunks(path_infos, jobs), to_chunks(cache, jobs),
                to_chunks(md5s, jobs)))

        progress.set_n_total(len(md5s))

        if len(chunks) == 0:
            return

        futures = []
        with ThreadPoolExecutor(max_workers=len(chunks)) as executor:
            for to_infos, from_infos, md5s in chunks:
                res = executor.submit(remote.upload,
                                      from_infos,
                                      to_infos,
                                      names=md5s)
                futures.append(res)

        for f in futures:
            f.result()
Example #14
    def _get_chunks(self, download, remote, status_info, status, jobs):
        cache = []
        path_infos = []
        names = []
        for md5, info in progress(status_info.items(),
                                  name="Analysing status"):
            if info["status"] == status:
                cache.append(self.checksum_to_path_info(md5))
                path_infos.append(remote.checksum_to_path_info(md5))
                names.append(info["name"])

        if download:
            to_infos = cache
            from_infos = path_infos
        else:
            to_infos = path_infos
            from_infos = cache

        return (
            to_chunks(from_infos, jobs),
            to_chunks(to_infos, jobs),
            to_chunks(names, jobs),
        )
Example #15
    def remove_links(self, unused, fs):
        if not isinstance(fs, LocalFileSystem):
            return

        for path in unused:
            remove(path)

        for chunk_unused in to_chunks(
            unused, chunk_size=SQLITE_MAX_VARIABLES_NUMBER
        ):
            cmd = "DELETE FROM {} WHERE path IN ({})".format(
                self.LINK_STATE_TABLE, ",".join(["?"] * len(chunk_unused))
            )
            self._execute(cmd, tuple(chunk_unused))
Example #16
    def cache_exists(self, checksums, jobs=None):
        """This is older implementation used in remote/base.py
        We are reusing it in RemoteSSH, because SSH's batch_exists proved to be
        faster than current approach (relying on exists(path_info)) applied in
        remote/base.
        """
        progress_callback = ProgressCallback(len(checksums))

        def exists_with_progress(chunks):
            return self.batch_exists(chunks, callback=progress_callback)

        if self.no_traverse:
            with ThreadPoolExecutor(max_workers=jobs or self.JOBS) as executor:
                path_infos = [self.checksum_to_path_info(x) for x in checksums]
                chunks = to_chunks(path_infos, num_chunks=self.JOBS)
                results = executor.map(exists_with_progress, chunks)
                in_remote = itertools.chain.from_iterable(results)
                ret = list(itertools.compress(checksums, in_remote))
                progress_callback.finish("")
                return ret

        return list(set(checksums) & set(self.all()))
Example #17
File: base.py Project: kss682/dvc
    def cache_exists(self, checksums, jobs=None):
        """Check if the given checksums are stored in the remote.

        There are two ways of performing this check:

        - Traverse: Get a list of all the files in the remote
            (traversing the cache directory) and compare it with
            the given checksums.

        - No traverse: For each given checksum, run the `exists`
            method and filter the checksums that aren't on the remote.
            This is done in parallel threads.
            It also shows a progress bar when performing the check.

        The reason for such an odd logic is that most of the remotes
        take much shorter time to just retrieve everything they have under
        a certain prefix (e.g. s3, gs, ssh, hdfs). Other remotes that can
        check if particular file exists much quicker, use their own
        implementation of cache_exists (see http, local).

        Returns:
            A list with checksums that were found in the remote
        """
        progress_callback = ProgressCallback(len(checksums))

        def exists_with_progress(chunks):
            return self.batch_exists(chunks, callback=progress_callback)

        if self.no_traverse and hasattr(self, "batch_exists"):
            with ThreadPoolExecutor(max_workers=jobs or self.JOBS) as executor:
                path_infos = [self.checksum_to_path_info(x) for x in checksums]
                chunks = to_chunks(path_infos, num_chunks=self.JOBS)
                results = executor.map(exists_with_progress, chunks)
                in_remote = itertools.chain.from_iterable(results)
                ret = list(itertools.compress(checksums, in_remote))
                progress_callback.finish("")
                return ret

        return list(set(checksums) & set(self.all()))
Example #18
    def push(self, checksum_infos, remote, jobs=None, show_checksums=False):
        Logger.info("Preparing to push data to {}".format(remote.url))
        title = "Collecting information"

        progress.set_n_total(1)
        progress.update_target(title, 0, 100)

        checksum_infos = self._collect(checksum_infos)[0]

        progress.update_target(title, 10, 100)

        # NOTE: verifying that our cache is not corrupted
        def func(info):
            return not self.changed_cache_file(info[self.PARAM_MD5])

        checksum_infos = list(filter(func, checksum_infos))

        progress.update_target(title, 20, 100)

        # NOTE: filter files that are already uploaded
        md5s = [i[self.PARAM_MD5] for i in checksum_infos]
        exists = remote.exists(remote.md5s_to_path_infos(md5s))

        progress.update_target(title, 30, 100)

        def func(entry):
            return not entry[0]

        assert len(exists) == len(checksum_infos)
        infos_exist = list(filter(func, zip(exists, checksum_infos)))
        checksum_infos = [i for e, i in infos_exist]

        progress.update_target(title, 70, 100)

        md5s, names = self._group(checksum_infos,
                                  show_checksums=show_checksums)
        cache = [{'scheme': 'local', 'path': self.get(md5)} for md5 in md5s]

        progress.update_target(title, 80, 100)

        path_infos = remote.md5s_to_path_infos(md5s)

        assert len(path_infos) == len(cache) == len(md5s) == len(names)

        progress.update_target(title, 90, 100)

        if jobs is None:
            jobs = remote.JOBS

        chunks = list(
            zip(to_chunks(path_infos, jobs), to_chunks(cache, jobs),
                to_chunks(names, jobs)))

        progress.finish_target(title)

        progress.set_n_total(len(names))

        if len(chunks) == 0:
            return

        futures = []
        with ThreadPoolExecutor(max_workers=len(chunks)) as executor:
            for to_infos, from_infos, names in chunks:
                res = executor.submit(remote.upload,
                                      from_infos,
                                      to_infos,
                                      names=names)
                futures.append(res)

        for f in futures:
            f.result()
Example #19
def test_to_chunks_chunk_size(chunk_size, expected_chunks):
    list_to_chunk = [1, 2, 3, 4]
    result = list(to_chunks(list_to_chunk, chunk_size=chunk_size))
    assert result == expected_chunks
Example #20
def test_to_chunks_should_raise(num_chunks, chunk_size):
    list_to_chunk = [1, 2, 3]
    with pytest.raises(ValueError):
        to_chunks(list_to_chunk, num_chunks, chunk_size)
Example #21
def test_to_chunks_num_chunks(num_chunks, expected_chunks):
    list_to_chunk = [1, 2, 3, 4]
    result = to_chunks(list_to_chunk, num_chunks=num_chunks)
    assert result == expected_chunks
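
The three tests above pin down the calling convention that every example relies on: pass the sequence plus exactly one of num_chunks or chunk_size, and get back a list of sub-lists. Below is a minimal sketch of such a helper, written only to illustrate that contract; it is an assumption, not the actual to_chunks implementation from the dvc project.

    import math


    def to_chunks(seq, num_chunks=None, chunk_size=None):
        # Sketch only: splits seq into sub-lists, mirroring how the examples
        # above call to_chunks(seq, jobs) or to_chunks(seq, chunk_size=...).
        if (num_chunks is None) == (chunk_size is None):
            # Exactly one of num_chunks / chunk_size must be provided,
            # matching test_to_chunks_should_raise above.
            raise ValueError("pass exactly one of num_chunks or chunk_size")

        if num_chunks is not None:
            # Derive a chunk size from the requested number of chunks.
            chunk_size = max(1, math.ceil(len(seq) / num_chunks))

        return [seq[i:i + chunk_size] for i in range(0, len(seq), chunk_size)]

Under this sketch, to_chunks([1, 2, 3, 4], chunk_size=2) yields [[1, 2], [3, 4]], and the resulting chunks can be handed out to worker threads, which is the pattern the ThreadPoolExecutor examples above use to parallelize uploads, downloads, and existence checks.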