def cache_exists(self, checksums, jobs=None): """Check if the given checksums are stored in the remote. There are two ways of performing this check: - Traverse: Get a list of all the files in the remote (traversing the cache directory) and compare it with the given checksums. - No traverse: For each given checksum, run the `exists` method and filter the checksums that aren't on the remote. This is done in parallel threads. It also shows a progress bar when performing the check. The reason for such an odd logic is that most of the remotes take much shorter time to just retrieve everything they have under a certain prefix (e.g. s3, gs, ssh, hdfs). Other remotes that can check if particular file exists much quicker, use their own implementation of cache_exists (see http, local). Returns: A list with checksums that were found in the remote """ progress_callback = ProgressCallback(len(checksums)) def exists_with_progress(chunks): return self.batch_exists(chunks, callback=progress_callback) if self.no_traverse and hasattr(self, "batch_exists"): with ThreadPoolExecutor(max_workers=jobs or self.JOBS) as executor: path_infos = [self.checksum_to_path_info(x) for x in checksums] chunks = to_chunks(path_infos, num_chunks=self.JOBS) results = executor.map(exists_with_progress, chunks) in_remote = itertools.chain.from_iterable(results) ret = list(itertools.compress(checksums, in_remote)) progress_callback.finish("") return ret return list(set(checksums) & set(self.all()))
def cache_exists(self, checksums, jobs=None): """This is older implementation used in remote/base.py We are reusing it in RemoteSSH, because SSH's batch_exists proved to be faster than current approach (relying on exists(path_info)) applied in remote/base. """ progress_callback = ProgressCallback(len(checksums)) def exists_with_progress(chunks): return self.batch_exists(chunks, callback=progress_callback) if self.no_traverse: with ThreadPoolExecutor(max_workers=jobs or self.JOBS) as executor: path_infos = [self.checksum_to_path_info(x) for x in checksums] chunks = to_chunks(path_infos, num_chunks=self.JOBS) results = executor.map(exists_with_progress, chunks) in_remote = itertools.chain.from_iterable(results) ret = list(itertools.compress(checksums, in_remote)) progress_callback.finish("") return ret return list(set(checksums) & set(self.all()))
def get_progress_callback(stages):
    total_files_num = get_all_files_numbers(stages)
    if total_files_num == 0:
        return None
    return ProgressCallback(total_files_num)
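# Minimal usage sketch for the zero-files guard above (FakeProgress and
# get_progress_callback_sketch are hypothetical stand-ins, not DVC's real
# classes): when the stages carry no files, the caller gets None and can skip
# progress reporting entirely instead of rendering an empty progress bar.
class FakeProgress:
    def __init__(self, total):
        self.total = total
        self.done = 0

    def update(self, n=1):
        self.done += n
        print("{}/{}".format(self.done, self.total))


def get_progress_callback_sketch(file_counts):
    total = sum(file_counts)
    if total == 0:
        return None
    return FakeProgress(total)


callback = get_progress_callback_sketch([2, 1])
if callback is not None:   # with [] or [0, 0] this would be None
    for _ in range(3):
        callback.update()  # prints 1/3, 2/3, 3/3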
def test_should_init_reset_progress(self, progress_mock):
    total_files_num = 1

    ProgressCallback(total_files_num)

    assert [mock.call.reset()] == progress_mock.method_calls