def push(self, checksum_infos, remote, jobs=1):
    """Upload locally cached files that ``remote`` does not have yet.

    Args:
        checksum_infos: checksum entries to push. They are passed through
            ``self._collect`` and only the first element of its result is
            used.
        remote: object providing ``md5s_to_path_infos``, ``exists`` and
            ``upload``.
        jobs: value handed to ``to_chunks`` to split the work into
            batches; the thread pool is sized to the number of batches.

    Raises:
        Any exception raised by ``remote.upload`` in a worker thread. The
        first failure (in submission order) is re-raised once all uploads
        have finished.
    """
    md5s = [
        info[self.PARAM_MD5] for info in self._collect(checksum_infos)[0]
    ]

    # NOTE: verifying that our cache is not corrupted
    md5s = [md5 for md5 in md5s if not self.changed(md5)]

    # NOTE: filter files that are already uploaded
    path_infos = remote.md5s_to_path_infos(md5s)
    already_there = remote.exists(path_infos)
    md5s = [md5 for md5, exists in zip(md5s, already_there) if not exists]

    cache = [self.get(md5) for md5 in md5s]
    path_infos = remote.md5s_to_path_infos(md5s)

    assert len(path_infos) == len(cache) == len(md5s)

    chunks = list(
        zip(
            to_chunks(path_infos, jobs),
            to_chunks(cache, jobs),
            to_chunks(md5s, jobs),
        )
    )

    progress.set_n_total(len(md5s))

    if not chunks:
        return

    futures = []
    with ThreadPoolExecutor(max_workers=len(chunks)) as executor:
        for to_infos, from_paths, names in chunks:
            futures.append(
                executor.submit(
                    remote.upload, from_paths, to_infos, names=names
                )
            )

    # Without this, an exception raised inside a worker would stay stored
    # in its Future and the push would appear to have succeeded.
    for future in futures:
        future.result()
def _get_chunks(self, download, remote, status_info, status, jobs):
    """Build transfer batches for the entries whose status equals ``status``.

    Args:
        download: when truthy the remote path infos are the sources and
            the local ones the destinations; otherwise the reverse.
        remote: object providing ``checksum_to_path_info``.
        status_info: mapping of md5 to a dict with ``"status"`` and
            ``"name"`` keys.
        status: status value an entry must have to be selected.
        jobs: value handed to ``to_chunks`` for each of the three lists.

    Returns:
        A list of ``(from_infos, to_infos, names)`` chunk triples.
    """
    title = "Analysing status."

    progress.set_n_total(1)
    total = len(status_info)

    local_infos = []
    remote_infos = []
    names = []
    # Progress is reported for every entry, selected or not.
    for current, (md5, info) in enumerate(status_info.items(), 1):
        if info["status"] == status:
            local_infos.append(self.checksum_to_path_info(md5))
            remote_infos.append(remote.checksum_to_path_info(md5))
            names.append(info["name"])
        progress.update_target(title, current, total)

    progress.finish_target(title)
    progress.set_n_total(len(names))

    if download:
        from_infos, to_infos = remote_infos, local_infos
    else:
        from_infos, to_infos = local_infos, remote_infos

    batches = zip(
        to_chunks(from_infos, jobs),
        to_chunks(to_infos, jobs),
        to_chunks(names, jobs),
    )
    return list(batches)
def _do_pull(self, checksum_infos, remote, jobs=1, no_progress_bar=False):
    """Download from ``remote`` every entry that ``self.changed`` reports.

    Args:
        checksum_infos: iterable of dicts holding an md5 under
            ``self.PARAM_MD5``.
        remote: object providing ``md5s_to_path_infos`` and ``download``.
        jobs: value handed to ``to_chunks`` to split the work; the thread
            pool is sized to the number of resulting batches.
        no_progress_bar: forwarded unchanged to ``remote.download``.

    Raises:
        Whatever ``remote.download`` raises in a worker thread.
    """
    all_md5s = [info[self.PARAM_MD5] for info in checksum_infos]

    # Only entries for which self.changed() is true are fetched.
    wanted = [md5 for md5 in all_md5s if self.changed(md5)]

    local_infos = [
        {'scheme': 'local', 'path': self.get(md5)} for md5 in wanted
    ]
    remote_infos = remote.md5s_to_path_infos(wanted)

    assert len(remote_infos) == len(local_infos) == len(wanted)

    batches = list(
        zip(
            to_chunks(remote_infos, jobs),
            to_chunks(local_infos, jobs),
            to_chunks(wanted, jobs),
        )
    )

    progress.set_n_total(len(wanted))

    if not batches:
        return

    with ThreadPoolExecutor(max_workers=len(batches)) as pool:
        pending = [
            pool.submit(
                remote.download,
                sources,
                targets,
                names=batch_names,
                no_progress_bar=no_progress_bar,
            )
            for sources, targets, batch_names in batches
        ]

    # Surface any exception raised inside a worker.
    for job in pending:
        job.result()
def _do_pull(self, checksum_infos, remote, jobs=1, show_checksums=False):
    """Download from ``remote`` the entries ``self.changed_cache`` reports.

    Progress for the preparation phase is reported under the
    "Collecting information" target before the downloads start.

    Args:
        checksum_infos: entries handed to ``self._group``.
        remote: object providing ``md5s_to_path_infos``, ``exists`` and
            ``download``.
        jobs: value handed to ``to_chunks`` to split the work; the thread
            pool is sized to the number of resulting batches.
        show_checksums: forwarded to ``self._group``.

    Raises:
        Whatever ``remote.download`` raises in a worker thread.
    """
    title = "Collecting information"

    def report(percent):
        progress.update_target(title, percent, 100)

    progress.set_n_total(1)
    report(0)

    pairs = zip(*self._group(checksum_infos, show_checksums=show_checksums))
    report(10)

    # Keep only the (md5, name) pairs that changed_cache() flags.
    selected = [
        (md5, name) for md5, name in pairs if self.changed_cache(md5)
    ]
    md5s = [md5 for md5, _ in selected]
    names = [name for _, name in selected]
    report(30)

    local_infos = [
        {'scheme': 'local', 'path': self.get(md5)} for md5 in md5s
    ]
    report(50)

    remote_infos = remote.md5s_to_path_infos(md5s)
    report(60)

    # NOTE: dummy call to try to establish a connection
    # to see if we need to ask user for a password.
    remote.exists(remote.md5s_to_path_infos(['000']))
    report(70)

    assert len(remote_infos) == len(local_infos) == len(md5s) == len(names)

    batches = list(
        zip(
            to_chunks(remote_infos, jobs),
            to_chunks(local_infos, jobs),
            to_chunks(names, jobs),
        )
    )

    progress.finish_target(title)
    progress.set_n_total(len(names))

    if not batches:
        return

    with ThreadPoolExecutor(max_workers=len(batches)) as pool:
        pending = [
            pool.submit(remote.download, sources, targets, names=batch_names)
            for sources, targets, batch_names in batches
        ]

    # Surface any exception raised inside a worker.
    for job in pending:
        job.result()
def hashes_exist(self, hashes, jobs=None, name=None):
    """Return the subset of ``hashes`` that is present in the remote.

    This is the older implementation from remote/base.py. It is reused
    for RemoteSSH because SSH's ``batch_exists`` proved faster than the
    newer approach in remote/base that relies on ``exists(path_info)``.

    Args:
        hashes: sized collection of hashes to look up.
        jobs: thread pool size; falls back to ``self.tree.JOBS``.
        name: optional cache name shown in the progress description.

    Returns:
        A list of the hashes that were found.
    """
    if not self.tree.CAN_TRAVERSE:
        found = set(hashes) & set(self.tree.all())
        return list(found)

    # possibly prompt for credentials before "Querying" progress output
    self.tree.ensure_credentials()

    target = "cache in " + name if name else "remote cache"

    with Tqdm(
        desc="Querying " + target, total=len(hashes), unit="file"
    ) as pbar:

        def exists_with_progress(chunk):
            return self.batch_exists(chunk, callback=pbar.update_msg)

        workers = jobs or self.tree.JOBS
        with ThreadPoolExecutor(max_workers=workers) as executor:
            path_infos = [self.tree.hash_to_path_info(h) for h in hashes]
            chunks = to_chunks(path_infos, num_chunks=self.tree.JOBS)
            per_chunk = executor.map(exists_with_progress, chunks)
            flags = itertools.chain.from_iterable(per_chunk)
            # Pair each hash with its existence flag, in input order.
            return [h for h, present in zip(hashes, flags) if present]
def cache_exists(self, checksums, jobs=None, name=None):
    """Return the subset of ``checksums`` that is present in the remote.

    This is the older implementation from remote/base.py. It is reused
    for RemoteSSH because SSH's ``batch_exists`` proved faster than the
    newer approach in remote/base that relies on ``exists(path_info)``.

    Args:
        checksums: sized collection of checksums to look up.
        jobs: thread pool size; falls back to ``self.JOBS``.
        name: optional cache name shown in the progress description.

    Returns:
        A list of the checksums that were found.
    """
    if not self.no_traverse:
        found = set(checksums) & set(self.all())
        return list(found)

    target = "cache in " + name if name else "remote cache"

    with Tqdm(
        desc="Querying " + target, total=len(checksums), unit="file"
    ) as pbar:

        def exists_with_progress(chunk):
            return self.batch_exists(chunk, callback=pbar.update_desc)

        workers = jobs or self.JOBS
        with ThreadPoolExecutor(max_workers=workers) as executor:
            path_infos = [self.checksum_to_path_info(c) for c in checksums]
            chunks = to_chunks(path_infos, num_chunks=self.JOBS)
            per_chunk = executor.map(exists_with_progress, chunks)
            flags = itertools.chain.from_iterable(per_chunk)
            # Pair each checksum with its existence flag, in input order.
            return [c for c, present in zip(checksums, flags) if present]
def cache_exists(self, checksums):
    """Check if the given checksums are stored in the remote.

    Two strategies are available:

    - Traverse: list everything the remote holds (``self.all()``) and
      intersect it with the given checksums.

    - No traverse: split the checksums' path infos into chunks and run
      ``self.exists`` on the chunks in parallel threads, keeping only the
      checksums reported as present.

    The reason for this split is that most remotes are much quicker at
    retrieving everything under a prefix (e.g. s3, gs, ssh, hdfs) than at
    probing files one by one. Remotes that can probe a single file
    quickly provide their own ``cache_exists`` (see http, local).

    Returns:
        A list with checksums that were found in the remote
    """
    use_no_traverse = self.no_traverse and self.SUPPORTS_NO_TRAVERSE
    if not use_no_traverse:
        return list(set(checksums) & set(self.all()))

    with ThreadPoolExecutor(max_workers=self.JOBS) as executor:
        path_infos = [self.checksum_to_path_info(c) for c in checksums]
        chunks = to_chunks(path_infos, self.JOBS)
        per_chunk = executor.map(self.exists, chunks)
        flags = itertools.chain.from_iterable(per_chunk)
        # Pair each checksum with its existence flag, in input order.
        return [c for c, present in zip(checksums, flags) if present]
def batch_exists(self, path_infos, callback):
    """Check which of ``path_infos`` exist, using several SFTP channels.

    The paths are split into as many chunks as channels were opened, and
    each chunk is checked with ``stat`` on its own channel in a worker
    thread.

    Args:
        path_infos: sequence of objects exposing a ``path`` attribute.
            The first one is used to open the SSH connection.
        callback: called with each path after it has been checked.

    Returns:
        A list of booleans, one per checked path, chained chunk by chunk.

    Raises:
        OSError: if ``stat`` fails with an errno other than ``ENOENT``.
    """
    # Nothing to check: avoid IndexError on path_infos[0] and do not open
    # a connection at all.
    if not path_infos:
        return []

    def _exists(chunk_and_channel):
        chunk, channel = chunk_and_channel
        ret = []
        for path in chunk:
            try:
                channel.stat(path)
                ret.append(True)
            except OSError as exc:
                # Only "no such file" means absent; anything else is a
                # real failure and must propagate.
                if exc.errno != errno.ENOENT:
                    raise
                ret.append(False)
            callback(path)
        return ret

    with self.tree.ssh(path_infos[0]) as ssh:
        channels = ssh.open_max_sftp_channels()
        max_workers = len(channels)

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            paths = [path_info.path for path_info in path_infos]
            chunks = to_chunks(paths, num_chunks=max_workers)
            # Each chunk gets a dedicated channel.
            chunks_and_channels = zip(chunks, channels)
            outcome = executor.map(_exists, chunks_and_channels)
            results = list(itertools.chain.from_iterable(outcome))

        return results
def remove_unused_links(self, used):
    """Delete every saved link that is not in use, on disk and in the table.

    A saved link is removed only if its path is not in ``used``, the path
    still exists, and its current inode and mtime match the saved values.

    Args:
        used (list): list of used links that should not be removed.
    """
    self._execute("SELECT * FROM {}".format(self.LINK_STATE_TABLE))

    stale = []
    for relpath, saved_inode, saved_mtime in self.cursor:
        saved_inode = self._from_sqlite(saved_inode)
        path = os.path.join(self.root_dir, relpath)

        if path in used or not os.path.exists(path):
            continue

        current_inode = get_inode(path)
        current_mtime, _ = get_mtime_and_size(path, self.repo.dvcignore)

        untouched = (
            saved_inode == current_inode and saved_mtime == current_mtime
        )
        if not untouched:
            continue

        logger.debug("Removing '{}' as unused link.".format(path))
        remove(path)
        stale.append(relpath)

    # Delete in batches so one statement never carries more than
    # SQLITE_MAX_VARIABLES_NUMBER placeholders.
    delete_template = "DELETE FROM {} WHERE path IN ({})"
    for batch in to_chunks(stale, chunk_size=SQLITE_MAX_VARIABLES_NUMBER):
        placeholders = ",".join("?" * len(batch))
        cmd = delete_template.format(self.LINK_STATE_TABLE, placeholders)
        self._execute(cmd, tuple(batch))
def push(self, checksum_infos, remote, jobs=1, show_checksums=False):
    """Upload cached files that ``remote`` does not have yet.

    Args:
        checksum_infos: entries passed through ``self._collect``; only the
            first element of its result is used.
        remote: object providing ``md5s_to_path_infos``, ``exists`` and
            ``upload``.
        jobs: value handed to ``to_chunks`` to split the work; the thread
            pool is sized to the number of resulting batches.
        show_checksums: forwarded to ``self._group``.

    Raises:
        Whatever ``remote.upload`` raises in a worker thread.
    """
    collected = self._collect(checksum_infos)[0]

    # NOTE: verifying that our cache is not corrupted
    intact = [
        info for info in collected if not self.changed(info[self.PARAM_MD5])
    ]

    # NOTE: filter files that are already uploaded
    intact_md5s = [info[self.PARAM_MD5] for info in intact]
    exists = remote.exists(remote.md5s_to_path_infos(intact_md5s))

    assert len(exists) == len(intact)

    missing = [info for present, info in zip(exists, intact) if not present]

    md5s, names = self._group(missing, show_checksums=show_checksums)

    local_infos = [
        {'scheme': 'local', 'path': self.get(md5)} for md5 in md5s
    ]
    remote_infos = remote.md5s_to_path_infos(md5s)

    assert len(remote_infos) == len(local_infos) == len(md5s) == len(names)

    batches = list(
        zip(
            to_chunks(remote_infos, jobs),
            to_chunks(local_infos, jobs),
            to_chunks(names, jobs),
        )
    )

    progress.set_n_total(len(names))

    if not batches:
        return

    with ThreadPoolExecutor(max_workers=len(batches)) as pool:
        pending = [
            pool.submit(remote.upload, sources, targets, names=batch_names)
            for targets, sources, batch_names in batches
        ]

    # Surface any exception raised inside a worker.
    for job in pending:
        job.result()
def remove_links(self, unused):
    """Remove the given paths and delete their rows from the link table.

    Args:
        unused: collection of paths; each is passed to ``remove`` and then
            used as a ``path`` value in the DELETE statements.
    """
    for path in unused:
        remove(path)

    # Delete in batches so one statement never carries more than
    # SQLITE_MAX_VARIABLES_NUMBER placeholders.
    delete_template = "DELETE FROM {} WHERE path IN ({})"
    for batch in to_chunks(unused, chunk_size=SQLITE_MAX_VARIABLES_NUMBER):
        placeholders = ",".join("?" * len(batch))
        cmd = delete_template.format(self.LINK_STATE_TABLE, placeholders)
        self._execute(cmd, tuple(batch))
def _do_pull(self, checksum_infos, remote, jobs=1, no_progress_bar=False, show_checksums=False):
    """Download from ``remote`` every entry that ``self.changed`` reports.

    Args:
        checksum_infos: entries handed to ``self._group``.
        remote: object providing ``md5s_to_path_infos`` and ``download``.
        jobs: value handed to ``to_chunks`` to split the work; the thread
            pool is sized to the number of resulting batches.
        no_progress_bar: forwarded unchanged to ``remote.download``.
        show_checksums: forwarded to ``self._group``.

    Raises:
        Whatever ``remote.download`` raises in a worker thread.
    """
    grouped = self._group(checksum_infos, show_checksums=show_checksums)

    # Keep only the (md5, name) pairs that self.changed() flags.
    selected = [
        (md5, name) for md5, name in zip(*grouped) if self.changed(md5)
    ]
    md5s = [md5 for md5, _ in selected]
    names = [name for _, name in selected]

    local_infos = [
        {'scheme': 'local', 'path': self.get(md5)} for md5 in md5s
    ]
    remote_infos = remote.md5s_to_path_infos(md5s)

    assert len(remote_infos) == len(local_infos) == len(md5s) == len(names)

    batches = list(
        zip(
            to_chunks(remote_infos, jobs),
            to_chunks(local_infos, jobs),
            to_chunks(names, jobs),
        )
    )

    progress.set_n_total(len(names))

    if not batches:
        return

    with ThreadPoolExecutor(max_workers=len(batches)) as pool:
        pending = [
            pool.submit(
                remote.download,
                sources,
                targets,
                names=batch_names,
                no_progress_bar=no_progress_bar,
            )
            for sources, targets, batch_names in batches
        ]

    # Surface any exception raised inside a worker.
    for job in pending:
        job.result()
def push(self, checksum_infos, remote, jobs=1):
    """Upload cached files that ``remote`` does not have yet.

    Args:
        checksum_infos: entries passed through ``self._collect``; only the
            first element of its result is used.
        remote: object providing ``md5s_to_path_infos``, ``exists`` and
            ``upload``.
        jobs: value handed to ``to_chunks`` to split the work; the thread
            pool is sized to the number of resulting batches.

    Raises:
        Whatever ``remote.upload`` raises in a worker thread.
    """
    collected = self._collect(checksum_infos)[0]
    all_md5s = [info[self.PARAM_MD5] for info in collected]

    # NOTE: verifying that our cache is not corrupted
    intact = [md5 for md5 in all_md5s if not self.changed(md5)]

    # NOTE: filter files that are already uploaded
    present_flags = remote.exists(remote.md5s_to_path_infos(intact))
    md5s = [md5 for md5, present in zip(intact, present_flags) if not present]

    local_infos = [
        {'scheme': 'local', 'path': self.get(md5)} for md5 in md5s
    ]
    remote_infos = remote.md5s_to_path_infos(md5s)

    assert len(remote_infos) == len(local_infos) == len(md5s)

    batches = list(
        zip(
            to_chunks(remote_infos, jobs),
            to_chunks(local_infos, jobs),
            to_chunks(md5s, jobs),
        )
    )

    progress.set_n_total(len(md5s))

    if not batches:
        return

    with ThreadPoolExecutor(max_workers=len(batches)) as pool:
        pending = [
            pool.submit(remote.upload, sources, targets, names=batch_names)
            for targets, sources, batch_names in batches
        ]

    # Surface any exception raised inside a worker.
    for job in pending:
        job.result()
def _get_chunks(self, download, remote, status_info, status, jobs):
    """Build transfer chunks for the entries whose status equals ``status``.

    Args:
        download: when truthy the remote path infos are the sources and
            the local ones the destinations; otherwise the reverse.
        remote: object providing ``checksum_to_path_info``.
        status_info: mapping of md5 to a dict with ``"status"`` and
            ``"name"`` keys.
        status: status value an entry must have to be selected.
        jobs: value handed to ``to_chunks`` for each of the three lists.

    Returns:
        A 3-tuple ``(from_chunks, to_chunks, name_chunks)`` holding the
        results of the three ``to_chunks`` calls.
    """
    selected = []
    for md5, info in progress(status_info.items(), name="Analysing status"):
        if info["status"] == status:
            selected.append(
                (
                    self.checksum_to_path_info(md5),
                    remote.checksum_to_path_info(md5),
                    info["name"],
                )
            )

    local_infos = [entry[0] for entry in selected]
    remote_infos = [entry[1] for entry in selected]
    names = [entry[2] for entry in selected]

    if download:
        from_infos, to_infos = remote_infos, local_infos
    else:
        from_infos, to_infos = local_infos, remote_infos

    return (
        to_chunks(from_infos, jobs),
        to_chunks(to_infos, jobs),
        to_chunks(names, jobs),
    )
def remove_links(self, unused, fs):
    """Remove the given paths and delete their rows from the link table.

    Nothing is done unless ``fs`` is a ``LocalFileSystem``.

    Args:
        unused: collection of paths; each is passed to ``remove`` and then
            used as a ``path`` value in the DELETE statements.
        fs: filesystem object the links belong to.
    """
    if isinstance(fs, LocalFileSystem):
        for path in unused:
            remove(path)

        # Delete in batches so one statement never carries more than
        # SQLITE_MAX_VARIABLES_NUMBER placeholders.
        delete_template = "DELETE FROM {} WHERE path IN ({})"
        for batch in to_chunks(
            unused, chunk_size=SQLITE_MAX_VARIABLES_NUMBER
        ):
            placeholders = ",".join("?" * len(batch))
            cmd = delete_template.format(self.LINK_STATE_TABLE, placeholders)
            self._execute(cmd, tuple(batch))
def cache_exists(self, checksums, jobs=None):
    """Return the subset of ``checksums`` that is present in the remote.

    This is the older implementation from remote/base.py. It is reused
    for RemoteSSH because SSH's ``batch_exists`` proved faster than the
    newer approach in remote/base that relies on ``exists(path_info)``.

    Args:
        checksums: sized collection of checksums to look up.
        jobs: thread pool size; falls back to ``self.JOBS``.

    Returns:
        A list of the checksums that were found.
    """
    progress_callback = ProgressCallback(len(checksums))

    if not self.no_traverse:
        return list(set(checksums) & set(self.all()))

    def exists_with_progress(chunk):
        return self.batch_exists(chunk, callback=progress_callback)

    workers = jobs or self.JOBS
    with ThreadPoolExecutor(max_workers=workers) as executor:
        path_infos = [self.checksum_to_path_info(c) for c in checksums]
        chunks = to_chunks(path_infos, num_chunks=self.JOBS)
        per_chunk = executor.map(exists_with_progress, chunks)
        flags = itertools.chain.from_iterable(per_chunk)
        # Pair each checksum with its existence flag, in input order.
        found = [c for c, present in zip(checksums, flags) if present]
        progress_callback.finish("")
        return found
def cache_exists(self, checksums, jobs=None):
    """Check if the given checksums are stored in the remote.

    Two strategies are available:

    - Traverse: list everything the remote holds (``self.all()``) and
      intersect it with the given checksums.

    - No traverse: split the checksums' path infos into chunks and run
      ``self.batch_exists`` on the chunks in parallel threads, keeping
      only the checksums reported as present. Progress is reported
      through a ``ProgressCallback`` while the check runs. This path is
      taken only when the object has a ``batch_exists`` attribute.

    The reason for this split is that most remotes are much quicker at
    retrieving everything under a prefix (e.g. s3, gs, ssh, hdfs) than at
    probing files one by one. Remotes that can probe a single file
    quickly provide their own ``cache_exists`` (see http, local).

    Returns:
        A list with checksums that were found in the remote
    """
    progress_callback = ProgressCallback(len(checksums))

    can_batch = self.no_traverse and hasattr(self, "batch_exists")
    if not can_batch:
        return list(set(checksums) & set(self.all()))

    def exists_with_progress(chunk):
        return self.batch_exists(chunk, callback=progress_callback)

    workers = jobs or self.JOBS
    with ThreadPoolExecutor(max_workers=workers) as executor:
        path_infos = [self.checksum_to_path_info(c) for c in checksums]
        chunks = to_chunks(path_infos, num_chunks=self.JOBS)
        per_chunk = executor.map(exists_with_progress, chunks)
        flags = itertools.chain.from_iterable(per_chunk)
        # Pair each checksum with its existence flag, in input order.
        found = [c for c, present in zip(checksums, flags) if present]
        progress_callback.finish("")
        return found
def push(self, checksum_infos, remote, jobs=None, show_checksums=False):
    """Upload cached files that ``remote`` does not have yet.

    Progress for the preparation phase is reported under the
    "Collecting information" target before the uploads start.

    Args:
        checksum_infos: entries passed through ``self._collect``; only the
            first element of its result is used.
        remote: object providing ``url``, ``JOBS``, ``md5s_to_path_infos``,
            ``exists`` and ``upload``.
        jobs: value handed to ``to_chunks`` to split the work; defaults
            to ``remote.JOBS`` when ``None``.
        show_checksums: forwarded to ``self._group``.

    Raises:
        Whatever ``remote.upload`` raises in a worker thread.
    """
    Logger.info("Preparing to push data to {}".format(remote.url))

    title = "Collecting information"

    def report(percent):
        progress.update_target(title, percent, 100)

    progress.set_n_total(1)
    report(0)

    collected = self._collect(checksum_infos)[0]
    report(10)

    # NOTE: verifying that our cache is not corrupted
    intact = [
        info
        for info in collected
        if not self.changed_cache_file(info[self.PARAM_MD5])
    ]
    report(20)

    # NOTE: filter files that are already uploaded
    intact_md5s = [info[self.PARAM_MD5] for info in intact]
    exists = remote.exists(remote.md5s_to_path_infos(intact_md5s))
    report(30)

    assert len(exists) == len(intact)

    missing = [info for present, info in zip(exists, intact) if not present]
    report(70)

    md5s, names = self._group(missing, show_checksums=show_checksums)
    local_infos = [
        {'scheme': 'local', 'path': self.get(md5)} for md5 in md5s
    ]
    report(80)

    remote_infos = remote.md5s_to_path_infos(md5s)

    assert len(remote_infos) == len(local_infos) == len(md5s) == len(names)
    report(90)

    if jobs is None:
        jobs = remote.JOBS

    batches = list(
        zip(
            to_chunks(remote_infos, jobs),
            to_chunks(local_infos, jobs),
            to_chunks(names, jobs),
        )
    )

    progress.finish_target(title)
    progress.set_n_total(len(names))

    if not batches:
        return

    with ThreadPoolExecutor(max_workers=len(batches)) as pool:
        pending = [
            pool.submit(remote.upload, sources, targets, names=batch_names)
            for targets, sources, batch_names in batches
        ]

    # Surface any exception raised inside a worker.
    for job in pending:
        job.result()
def test_to_chunks_chunk_size(chunk_size, expected_chunks):
    """Splitting ``[1, 2, 3, 4]`` by ``chunk_size`` yields ``expected_chunks``."""
    chunked = to_chunks([1, 2, 3, 4], chunk_size=chunk_size)
    # Materialise first so a lazy result can be compared with the list.
    assert list(chunked) == expected_chunks
def test_to_chunks_should_raise(num_chunks, chunk_size):
    """``to_chunks`` raises ValueError for the given argument combination."""
    items = [1, 2, 3]
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        to_chunks(items, num_chunks, chunk_size)
def test_to_chunks_num_chunks(num_chunks, expected_chunks):
    """Splitting ``[1, 2, 3, 4]`` into ``num_chunks`` gives ``expected_chunks``."""
    chunked = to_chunks([1, 2, 3, 4], num_chunks=num_chunks)
    # Compared directly, without wrapping the result in list().
    assert chunked == expected_chunks