def test_download(snapshotter, uploader, storage, tmpdir):
    with snapshotter.lock:
        snapshotter.create_4foobar()
        ss1 = snapshotter.get_snapshot_state()
        hashes = snapshotter.get_snapshot_hashes()
        uploader.write_hashes_to_storage(snapshotter=snapshotter, hashes=hashes, progress=Progress(), parallel=1)

    # Download the old backup from storage
    dst2 = Path(tmpdir / "dst2")
    dst2.mkdir()
    dst3 = Path(tmpdir / "dst3")
    dst3.mkdir()
    snapshotter = Snapshotter(src=dst2, dst=dst3, globs=["*"], parallel=1)
    downloader = Downloader(storage=storage, snapshotter=snapshotter, dst=dst2, parallel=1)
    with snapshotter.lock:
        downloader.download_from_storage(progress=Progress(), snapshotstate=ss1)

        # And ensure we get the same snapshot state by snapshotting it
        assert snapshotter.snapshot(progress=Progress()) > 0
        ss2 = snapshotter.get_snapshot_state()

    # Ensure the files are the same (modulo mtime_ns, which is not
    # guaranteed to end up identical)
    for ssfile1, ssfile2 in zip(ss1.files, ss2.files):
        assert ssfile1.equals_excluding_mtime(ssfile2)
def create_4foobar(self):
    (self.src / "foo").write_text("foobar")
    (self.src / "foo2").write_text("foobar")
    (self.src / "foobig").write_text("foobar" * magic.EMBEDDED_FILE_SIZE)
    (self.src / "foobig2").write_text("foobar" * magic.EMBEDDED_FILE_SIZE)
    progress = Progress()
    assert self.snapshot(progress=progress) > 0
    ss1 = self.get_snapshot_state()
    assert self.snapshot(progress=Progress()) == 0
    ss2 = self.get_snapshot_state()
    assert ss1 == ss2
def test_snapshot_error_filenotfound(snapshotter, mocker, test):
    (obj, fun, exp_progress_1, exp_progress_2) = test

    def _not_really_found(*a, **kw):
        raise FileNotFoundError

    obj = obj or snapshotter
    mocker.patch.object(obj, fun, new=_not_really_found)
    (snapshotter.src / "foo").write_text("foobar")
    (snapshotter.src / "bar").write_text("foobar")
    with snapshotter.lock:
        progress = Progress()
        assert snapshotter.snapshot(progress=progress) == exp_progress_1
        progress = Progress()
        assert snapshotter.snapshot(progress=progress) == exp_progress_2
async def wait_successful_results(self, start_results, *, result_class, all_nodes=True):
    urls = []
    for i, result in enumerate(start_results, 1):
        if not result or isinstance(result, Exception):
            logger.info("wait_successful_results: Incorrect start result for #%d/%d: %r", i, len(start_results), result)
            return []
        parsed_result = op.Op.StartResult.parse_obj(result)
        urls.append(parsed_result.status_url)
    if all_nodes and len(urls) != len(self.nodes):
        return []
    delay = self.config.poll.delay_start
    results = [None] * len(urls)
    # Note that we don't have a timeout mechanism here as such;
    # however, if re-locking times out, we will bail out. TBD whether
    # we need a timeout mechanism here anyway.
    failures = {}

    def _event_awaitable_factory():
        return self.subresult_received_event.wait()

    async for _ in utils.exponential_backoff(
            initial=delay,
            multiplier=self.config.poll.delay_multiplier,
            maximum=self.config.poll.delay_max,
            duration=self.config.poll.duration,
            event_awaitable_factory=_event_awaitable_factory,
    ):
        self.subresult_received_event.clear()
        for i, (url, result) in enumerate(zip(urls, results)):
            # TBD: This could be done in parallel too
            if result is not None and result.progress.final:
                continue
            r = await utils.httpx_request(url, caller="CoordinatorOp.wait_successful_results",
                                          timeout=self.config.poll.result_timeout)
            if r is None:
                failures[i] = failures.get(i, 0) + 1
                if failures[i] >= self.config.poll.maximum_failures:
                    return []
                continue
            # We got something -> decode the result
            result = result_class.parse_obj(r)
            results[i] = result
            failures[i] = 0
            assert self.current_step
            self.step_progress[self.current_step] = Progress.merge(r.progress for r in results if r is not None)
            if result.progress.finished_failed:
                return []
        if not any(True for result in results if result is None or not result.progress.final):
            break
    else:
        logger.debug("wait_successful_results timed out")
        return []
    return results
def write_hashes_to_storage(self, *, snapshotter: Snapshotter, hashes, parallel: int, progress: Progress,
                            still_running_callback=lambda: True):
    todo = set(hash.hexdigest for hash in hashes)
    progress.start(len(todo))
    sizes = {"total": 0, "stored": 0}

    def _upload_hexdigest_in_thread(hexdigest):
        storage = self.local_storage

        assert hexdigest
        files = snapshotter.hexdigest_to_snapshotfiles.get(hexdigest, [])
        for snapshotfile in files:
            path = snapshotter.dst / snapshotfile.relative_path
            if not path.is_file():
                logger.warning("%s disappeared post-snapshot", path)
                continue
            with snapshotfile.open_for_reading(snapshotter.dst) as f:
                current_hexdigest = hash_hexdigest_readable(f)
            if current_hexdigest != snapshotfile.hexdigest:
                logger.info("Hash of %s changed before upload", snapshotfile.relative_path)
                continue
            try:
                with snapshotfile.open_for_reading(snapshotter.dst) as f:
                    upload_result = storage.upload_hexdigest_from_file(hexdigest, f)
            except exceptions.TransientException as ex:
                # Do not pollute logs with transient exceptions
                logger.debug("Transient exception uploading %r: %r", path, ex)
                return progress.upload_failure, 0, 0
            except exceptions.AstacusException:
                # Report failure - the whole step will be retried later
                logger.exception("Exception uploading %r", path)
                return progress.upload_failure, 0, 0
            with snapshotfile.open_for_reading(snapshotter.dst) as f:
                current_hexdigest = hash_hexdigest_readable(f)
            if current_hexdigest != snapshotfile.hexdigest:
                logger.info("Hash of %s changed after upload", snapshotfile.relative_path)
                storage.delete_hexdigest(hexdigest)
                continue
            return progress.upload_success, upload_result.size, upload_result.stored_size

        # We didn't find a single file with the matching hexdigest.
        # Report it as missing but keep uploading the other files.
        return progress.upload_missing, 0, 0

    def _result_cb(*, map_in, map_out):
        # progress callback in the 'main' thread
        progress_callback, total, stored = map_out
        sizes["total"] += total
        sizes["stored"] += stored
        progress_callback(map_in)  # hexdigest
        return still_running_callback()

    sorted_todo = sorted(todo, key=lambda hexdigest: -snapshotter.hexdigest_to_snapshotfiles[hexdigest][0].file_size)
    if not utils.parallel_map_to(fun=_upload_hexdigest_in_thread, iterable=sorted_todo, result_callback=_result_cb,
                                 n=parallel):
        progress.add_fail()
    return sizes["total"], sizes["stored"]
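# The helper utils.parallel_map_to is used above (and again in snapshot() below) but its
# implementation is not part of this excerpt. Judging purely from those call sites, it
# appears to apply `fun` to each item of `iterable` in up to `n` worker threads and to
# invoke `result_callback` in the calling thread with map_in/map_out keyword arguments;
# returning False from the callback presumably aborts the run and makes parallel_map_to
# return False. A minimal sketch of that assumed contract (names and values below are
# illustrative only, not part of the project):
def _example_parallel_map_to_usage():
    collected = []

    def _square(x):
        return x * x

    def _collect(*, map_in, map_out):
        # called in the main thread with the original item and the mapped result
        collected.append((map_in, map_out))
        return True  # returning False would stop further processing

    completed = utils.parallel_map_to(fun=_square, iterable=range(4), result_callback=_collect, n=2)
    return completed, collected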
def snapshot(self, *, progress: Optional[Progress] = None):
    assert self.lock.locked()
    if progress is None:
        progress = Progress()
    src_dirs, src_files = self._list_dirs_and_files(self.src)
    progress.start(1)
    if self.src == self.dst:
        # The src=dst mode should be used if and only if it is
        # known that files will not disappear between snapshot and
        # upload steps (e.g. Astacus controls the lifecycle of the
        # files within). In that case, there is little point in
        # making extra symlinks and we can just use the src
        # directory contents as-is.
        dst_dirs, dst_files = src_dirs, src_files
        changes = 0
    else:
        progress.add_total(3)
        dst_dirs, dst_files = self._list_dirs_and_files(self.dst)

        # Create missing directories
        changes = self._snapshot_create_missing_directories(src_dirs=src_dirs, dst_dirs=dst_dirs)
        progress.add_success()

        # Remove extra files
        changes += self._snapshot_remove_extra_files(src_files=src_files, dst_files=dst_files)
        progress.add_success()

        # Add missing files
        changes += self._snapshot_add_missing_files(src_files=src_files, dst_files=dst_files)
        progress.add_success()

        # We COULD also remove extra directories, but it is probably not
        # really worth it, and due to ignored files it actually might not
        # even work.

        # Then, create/update the corresponding snapshotfile objects (old
        # ones were already removed)
        dst_dirs, dst_files = self._list_dirs_and_files(self.dst)

    snapshotfiles = list(self._get_snapshot_hash_list(dst_files))
    progress.add_total(len(snapshotfiles))

    def _cb(snapshotfile):
        # src may or may not be present; dst is present as it is in the snapshot
        with snapshotfile.open_for_reading(self.dst) as f:
            if snapshotfile.file_size <= magic.EMBEDDED_FILE_SIZE:
                snapshotfile.content_b64 = base64.b64encode(f.read()).decode()
            else:
                snapshotfile.hexdigest = hash_hexdigest_readable(f)
        return snapshotfile

    def _result_cb(*, map_in, map_out):
        self._add_snapshotfile(map_out)
        progress.add_success()
        return True

    changes += len(snapshotfiles)
    utils.parallel_map_to(iterable=snapshotfiles, fun=_cb, result_callback=_result_cb, n=self.parallel)

    # We initially started with 1 extra
    progress.add_success()
    return changes
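# The src == dst shortcut documented in snapshot() above can be exercised simply by
# constructing the Snapshotter with the same path for both arguments; the constructor
# keywords below are taken from the tests in this excerpt, while the paths and helper
# name are illustrative only:
def _example_src_eq_dst_snapshotter(tmpdir):
    live_dir = Path(tmpdir / "live")
    live_dir.mkdir()
    # src == dst: no symlink farm is maintained, the live directory is snapshotted as-is
    snapshotter = Snapshotter(src=live_dir, dst=live_dir, globs=["*"], parallel=1)
    with snapshotter.lock:
        changes = snapshotter.snapshot(progress=Progress())
    return changes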
def progress(self):
    return Progress.merge(self.step_progress.values())
def test_snapshot(snapshotter, uploader):
    with snapshotter.lock:
        # Start with empty
        assert snapshotter.snapshot(progress=Progress()) == 0
        src = snapshotter.src
        dst = snapshotter.dst
        assert not (dst / "foo").is_file()

        # Create files in src, run snapshot
        snapshotter.create_4foobar()
        ss2 = snapshotter.get_snapshot_state()
        assert (dst / "foo").is_file()
        assert (dst / "foo").read_text() == "foobar"
        assert (dst / "foo2").read_text() == "foobar"

        hashes = snapshotter.get_snapshot_hashes()
        assert len(hashes) == 1
        assert hashes == [
            ipc.SnapshotHash(hexdigest='326827fe6fd23503bf16eed91861766df522748794814a1bf46d479d9feae1a0', size=600)
        ]

        while True:
            (src / "foo").write_text("barfoo")  # same length
            if snapshotter.snapshot(progress=Progress()) > 0:
                # Sometimes fails on first iteration(s) due to same mtime
                # (inaccurate timestamps)
                break
        ss3 = snapshotter.get_snapshot_state()
        assert ss2 != ss3
        assert snapshotter.snapshot(progress=Progress()) == 0
        assert (dst / "foo").is_file()
        assert (dst / "foo").read_text() == "barfoo"

        uploader.write_hashes_to_storage(snapshotter=snapshotter, hashes=hashes, parallel=1, progress=Progress())

        # Remove file from src, run snapshot
        for filename in ["foo", "foo2", "foobig", "foobig2"]:
            (src / filename).unlink()
            assert snapshotter.snapshot(progress=Progress()) > 0
            assert snapshotter.snapshot(progress=Progress()) == 0
            assert not (dst / filename).is_file()

        # Now shouldn't have any data hashes
        hashes_empty = snapshotter.get_snapshot_hashes()
        assert not hashes_empty

    with pytest.raises(AssertionError):
        snapshotter.snapshot(progress=Progress())

    with pytest.raises(AssertionError):
        snapshotter.get_snapshot_state()

    with pytest.raises(AssertionError):
        snapshotter.get_snapshot_hashes()
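# The tests above rely on snapshotter/uploader/storage pytest fixtures that are not part
# of this excerpt. A rough sketch of what a snapshotter fixture could look like, assuming
# the constructor keywords seen in test_download(); the fixture name matches the tests,
# but the construction details are assumptions, not the project's actual conftest:
import pytest

@pytest.fixture(name="snapshotter")
def fixture_snapshotter(tmpdir):
    src = Path(tmpdir / "src")
    src.mkdir()
    dst = Path(tmpdir / "dst")
    dst.mkdir()
    # In the real test suite the returned object also carries the create_4foobar()
    # helper shown earlier, e.g. via a small Snapshotter subclass.
    return Snapshotter(src=src, dst=dst, globs=["*"], parallel=1)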