def fetch_all(self):
    for retry in range(3):
        try:
            with self.manager_class() as manager:
                self._setup_progress_tracking(manager)
                with self.pool_class(processes=self._process_count()) as pool:
                    self._queue_jobs(pool)
                    self._wait_for_jobs_to_complete()
            break
        except TimeoutError:
            self.pending_jobs.clear()
            self.last_progress_ts = time.monotonic()
            if self.errors:
                break
            if retry == 2:
                self.log.error("Download stalled despite retries, aborting")
                self.errors = 1
                break

    if self.errors:
        raise RestoreError("Backup download/extraction failed with {} errors".format(self.errors))

    self._create_tablespace_symlinks()
    with compat.suppress(OSError):
        os.rmdir(os.path.join(self.pgdata, "pgdata"))

def _process_completed_download_operations(self, timeout=None):
    while True:
        try:
            result = self.server.download_results.get(block=timeout is not None, timeout=timeout)
            key = result["opaque"]
            with self.server.lock:
                op = self.server.pending_download_ops.pop(key, None)
                if not op:
                    self.server.log.warning("Orphaned download operation %r completed: %r", key, result)
                    if result["success"]:
                        with suppress(OSError):
                            os.unlink(result["target_path"])
                    continue
                if result["success"]:
                    if os.path.isfile(op["target_path"]):
                        self.server.log.warning("Target path for %r already exists, skipping", key)
                        continue
                    os.rename(result["target_path"], op["target_path"])
                    metadata = result["metadata"] or {}
                    self.server.log.info(
                        "Renamed %s to %s. Original upload from %r, hash %s:%s",
                        result["target_path"], op["target_path"], metadata.get("host"),
                        metadata.get("hash-algorithm"), metadata.get("hash"))
                else:
                    ex = result.get("exception", Error)
                    if isinstance(ex, FileNotFoundFromStorageError):
                        # don't try prefetching this file again
                        self.server.prefetch_404.append(key)
                    else:
                        self.server.log.warning("Fetching %r failed (%s), took: %.3fs",
                                                key, ex.__class__.__name__, time.monotonic() - op["started_at"])
        except Empty:
            return

def handle_upload(self, site, key, file_to_transfer): try: storage = self.get_object_storage(site) unlink_local = False if "blob" in file_to_transfer: storage.store_file_from_memory( key, file_to_transfer["blob"], metadata=file_to_transfer["metadata"]) else: # Basebackups may be multipart uploads, depending on the driver. # Swift needs to know about this so it can do possible cleanups. multipart = file_to_transfer["filetype"] == "basebackup" try: storage.store_file_from_disk( key, file_to_transfer["local_path"], metadata=file_to_transfer["metadata"], multipart=multipart) unlink_local = True except LocalFileIsRemoteFileError: pass if unlink_local: try: self.log.debug( "Deleting file: %r since it has been uploaded", file_to_transfer["local_path"]) os.unlink(file_to_transfer["local_path"]) metadata_path = file_to_transfer["local_path"] + ".metadata" with suppress(FileNotFoundError): os.unlink(metadata_path) except Exception as ex: # pylint: disable=broad-except self.log.exception("Problem in deleting file: %r", file_to_transfer["local_path"]) self.stats.unexpected_exception( ex, where="handle_upload_unlink") return {"success": True, "opaque": file_to_transfer.get("opaque")} except Exception as ex: # pylint: disable=broad-except if file_to_transfer.get("retry_number", 0) > 0: self.log.exception("Problem in moving file: %r, need to retry", file_to_transfer["local_path"]) # Ignore the exception the first time round as some object stores have frequent Internal Errors # and the upload usually goes through without any issues the second time round self.stats.unexpected_exception(ex, where="handle_upload") else: self.log.warning( "Problem in moving file: %r, need to retry (%s: %s)", file_to_transfer["local_path"], ex.__class__.__name__, ex) # Sleep for a bit to avoid busy looping time.sleep(0.5) file_to_transfer["retry_number"] = file_to_transfer.get( "retry_number", 0) + 1 if file_to_transfer["retry_number"] > self.config[ "upload_retries_warning_limit"]: create_alert_file(self.config, "upload_retries_warning") self.transfer_queue.put(file_to_transfer) return {"success": False, "call_callback": False, "exception": ex}
def handle_site(self, site, site_config): self.set_state_defaults(site) xlog_path, basebackup_path = self.create_backup_site_paths(site) if not site_config["active"]: return # If a site has been marked inactive, don't bother checking anything self._cleanup_inactive_receivexlogs(site) chosen_backup_node = random.choice(site_config["nodes"]) if site not in self.receivexlogs and site not in self.walreceivers: if site_config["active_backup_mode"] == "pg_receivexlog": self.receivexlog_listener(site, chosen_backup_node, xlog_path + "_incoming") elif site_config["active_backup_mode"] == "walreceiver": state_file_path = self.config["json_state_file_path"] walreceiver_state = {} with suppress(FileNotFoundError): with open(state_file_path, "r") as fp: old_state_file = json.load(fp) walreceiver_state = old_state_file.get( "walreceivers", {}).get(site, {}) self.start_walreceiver( site=site, chosen_backup_node=chosen_backup_node, last_flushed_lsn=walreceiver_state.get("last_flushed_lsn")) last_check_time = self.time_of_last_backup_check.get(site) if not last_check_time or (time.monotonic() - self.time_of_last_backup_check[site]) > 300: self.refresh_backup_list_and_delete_old(site) self.time_of_last_backup_check[site] = time.monotonic() # check if a basebackup is running, or if a basebackup has just completed if site in self.basebackups: try: result = self.basebackups_callbacks[site].get(block=False) except Empty: # previous basebackup (or its compression and upload) still in progress return if self.basebackups[site].is_alive(): self.basebackups[site].join() del self.basebackups[site] del self.basebackups_callbacks[site] self.log.debug("Basebackup has finished for %r: %r", site, result) self.refresh_backup_list_and_delete_old(site) self.time_of_last_backup_check[site] = time.monotonic() metadata = self.get_new_backup_details(site=site, site_config=site_config) if metadata and not os.path.exists( self.config["maintenance_mode_file"]): self.basebackups_callbacks[site] = Queue() self.create_basebackup(site, chosen_backup_node, basebackup_path, self.basebackups_callbacks[site], metadata)
def _transfer_agent_op(self, site, filename, filetype, method, *, retries=2, target_path=None):
    start_time = time.time()

    tmp_target_path = None
    if method == "DOWNLOAD":
        # NOTE: we request download on a temporary download path so we can atomically overwrite the file if /
        # when we successfully receive it.
        try:
            fd, tmp_target_path = tempfile.mkstemp(prefix="{}.".format(target_path), suffix=".pghoard.tmp")
            os.close(fd)
        except OSError as ex:
            raise HttpResponse("Unable to create temporary file for {0!r}: {1.__class__.__name__}: {1}"
                               .format(target_path, ex), status=400)

    self.server.log.debug("Requesting site: %r, filename: %r, filetype: %r, target_path: %r",
                          site, filename, filetype, target_path)

    callback_queue = Queue()
    self.server.transfer_queue.put({
        "callback_queue": callback_queue,
        "filetype": filetype,
        "local_path": filename,
        "site": site,
        "target_path": tmp_target_path,
        "type": method,
    })

    try:
        try:
            response = callback_queue.get(timeout=30.0)
            self.server.log.debug("Handled a %s request for: %r %r, took: %.3fs",
                                  method, site, target_path, time.time() - start_time)
        except Empty:
            self.server.log.exception("Timeout on a %s request for: %r %r, took: %.3fs",
                                      method, site, target_path, time.time() - start_time)
            raise HttpResponse("TIMEOUT", status=500)

        if not response["success"]:
            if isinstance(response.get("exception"), FileNotFoundFromStorageError):
                raise HttpResponse("{0.__class__.__name__}: {0}".format(response["exception"]), status=404)
            raise HttpResponse(status=500)
    except HttpResponse as ex:
        if tmp_target_path:
            with suppress(Exception):
                os.unlink(tmp_target_path)
        if ex.status == 500 and retries:
            self.server.log.warning("Transfer operation failed, retrying (%r retries left)", retries)
            return self._transfer_agent_op(site, filename, filetype, method,
                                           retries=retries - 1, target_path=target_path)
        raise

    if tmp_target_path:
        self._save_and_verify_restored_file(filetype, filename, tmp_target_path, target_path)
    return response

def _try_save_and_verify_restored_file(self, filetype, filename, prefetch_target_path, target_path, unlink=True):
    try:
        self._save_and_verify_restored_file(filetype, filename, prefetch_target_path, target_path)
        self.server.log.info("Renamed %s to %s", prefetch_target_path, target_path)
        return None
    except (ValueError, HttpResponse) as e:
        # Just try loading the file again
        with suppress(OSError):
            self.server.log.warning("Verification of prefetch file %s failed: %r", prefetch_target_path, e)
            if unlink:
                os.unlink(prefetch_target_path)
        return e

def check_command_success(self, proc, output_file):
    rc = terminate_subprocess(proc, log=self.log)
    msg = "Ran: {!r}, took: {:.3f}s to run, returncode: {}".format(
        proc.args, time.monotonic() - proc.basebackup_start_time, rc)
    if rc == 0 and os.path.exists(output_file):
        self.log.info(msg)
        return True

    if output_file:
        with suppress(FileNotFoundError):
            os.unlink(output_file)
    raise BackupFailure(msg)

def fetch_all(self):
    with self.manager_class() as manager:
        self._setup_progress_tracking(manager)
        with self.pool_class(processes=self._process_count()) as pool:
            self._queue_jobs(pool)
            self._wait_for_jobs_to_complete()

    if self.errors:
        raise RestoreError("Backup download/extraction failed with {} errors".format(self.errors))

    self._create_tablespace_symlinks()
    with compat.suppress(OSError):
        os.rmdir(os.path.join(self.pgdata, "pgdata"))

def get_paths_for_backup(basebackup_path):
    i = 0
    while True:
        tsdir = datetime.datetime.utcnow().strftime("%Y-%m-%d") + "_" + str(i)
        raw_basebackup = os.path.join(basebackup_path + "_incoming", tsdir)
        compressed_basebackup = os.path.join(basebackup_path, tsdir)
        # The backup directory names need not be a sequence, so we lean towards skipping over any
        # partial or leftover progress below. Make sure we only return paths if we're able to create
        # the raw_basebackup directory.
        if not os.path.exists(raw_basebackup) and not os.path.exists(compressed_basebackup):
            with suppress(FileExistsError):
                os.makedirs(raw_basebackup)
                return raw_basebackup, compressed_basebackup
        i += 1

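# Usage sketch (not part of the original module, shown for illustration only): assuming
# get_paths_for_backup() above and its usual imports (datetime, os, contextlib.suppress)
# are in scope, it returns a date-stamped directory pair and creates the raw "_incoming"
# directory as a side effect. The base path below is hypothetical.
raw_dir, compressed_dir = get_paths_for_backup("/var/lib/pghoard/example-site/basebackup")
# e.g. raw_dir        -> .../basebackup_incoming/<YYYY-MM-DD>_0
#      compressed_dir -> .../basebackup/<YYYY-MM-DD>_0
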
def handle_upload(self, site, key, file_to_transfer): try: storage = self.get_object_storage(site) unlink_local = False if "blob" in file_to_transfer: storage.store_file_from_memory(key, file_to_transfer["blob"], metadata=file_to_transfer["metadata"]) else: # Basebackups may be multipart uploads, depending on the driver. # Swift needs to know about this so it can do possible cleanups. multipart = file_to_transfer["filetype"] in {"basebackup", "basebackup_chunk"} try: storage.store_file_from_disk(key, file_to_transfer["local_path"], metadata=file_to_transfer["metadata"], multipart=multipart) unlink_local = True except LocalFileIsRemoteFileError: pass if unlink_local: try: self.log.debug("Deleting file: %r since it has been uploaded", file_to_transfer["local_path"]) os.unlink(file_to_transfer["local_path"]) metadata_path = file_to_transfer["local_path"] + ".metadata" with suppress(FileNotFoundError): os.unlink(metadata_path) except Exception as ex: # pylint: disable=broad-except self.log.exception("Problem in deleting file: %r", file_to_transfer["local_path"]) self.metrics.unexpected_exception(ex, where="handle_upload_unlink") return {"success": True, "opaque": file_to_transfer.get("opaque")} except Exception as ex: # pylint: disable=broad-except if file_to_transfer.get("retry_number", 0) > 0: self.log.exception("Problem in moving file: %r, need to retry", file_to_transfer["local_path"]) # Ignore the exception the first time round as some object stores have frequent Internal Errors # and the upload usually goes through without any issues the second time round self.metrics.unexpected_exception(ex, where="handle_upload") else: self.log.warning("Problem in moving file: %r, need to retry (%s: %s)", file_to_transfer["local_path"], ex.__class__.__name__, ex) file_to_transfer["retry_number"] = file_to_transfer.get("retry_number", 0) + 1 if file_to_transfer["retry_number"] > self.config["upload_retries_warning_limit"]: create_alert_file(self.config, "upload_retries_warning") # Sleep for a bit to avoid busy looping. Increase sleep time if the op fails multiple times self.sleep(min(0.5 * 2 ** (file_to_transfer["retry_number"] - 1), 20)) self.transfer_queue.put(file_to_transfer) return {"success": False, "call_callback": False, "exception": ex}
def get_command_success(self, proc, output_file):
    rc = terminate_subprocess(proc, log=self.log)
    msg = "Ran: {!r}, took: {:.3f}s to run, returncode: {}".format(
        proc.args, time.monotonic() - proc.basebackup_start_time, rc)
    if rc == 0 and os.path.exists(output_file):
        self.log.info(msg)
        return True

    self.log.error(msg)
    if output_file:
        with suppress(FileNotFoundError):
            os.unlink(output_file)
    if self.callback_queue:
        # post a failure event
        self.callback_queue.put({"success": False})
    self.running = False

def _proc_success(self, proc, output_file):
    rc = terminate_subprocess(proc, log=self.log)
    msg = "Ran: {!r}, took: {:.3f}s to run, returncode: {}".format(
        proc.args, time.monotonic() - proc.basebackup_start_time, rc)
    if rc == 0 and os.path.exists(output_file):
        self.log.info(msg)
        return True

    self.log.error(msg)
    if output_file:
        with suppress(FileNotFoundError):
            os.unlink(output_file)
    if self.callback_queue:
        # post a failure event
        self.callback_queue.put({"success": False})
    self.running = False

def _create_tablespace_symlinks(self):
    if not self.tablespaces:
        return
    tblspc_dir = os.path.join(self.pgdata, "pg_tblspc")
    os.makedirs(tblspc_dir, exist_ok=True)
    for settings in self.tablespaces.values():
        if os.path.isdir(settings["path"]):
            link_name = os.path.join(self.pgdata, "pg_tblspc", str(settings["oid"]))
            try:
                os.symlink(settings["path"], link_name)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
    # Remove empty directories that could not be excluded when extracting tar due to
    # tar's limitations in exclude parameter behavior
    tsnames = [os.path.join("tablespaces", tsname) for tsname in self.tablespaces.keys()]
    for exclude in tsnames + ["tablespaces"]:
        with compat.suppress(OSError):
            os.rmdir(os.path.join(self.pgdata, exclude))

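# For reference, a hedged sketch of the mapping shape that _create_tablespace_symlinks()
# above iterates over: keys are tablespace names and only the "oid" and "path" entries are
# read. The name, OID and path values below are hypothetical.
example_tablespaces = {
    "ts_example": {"oid": 16385, "path": "/var/lib/pghoard/tablespaces/ts_example"},
}
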
def handle_upload(self, site, key, file_to_transfer): try: storage = self.get_object_storage(site) unlink_local = False if "blob" in file_to_transfer: storage.store_file_from_memory(key, file_to_transfer["blob"], metadata=file_to_transfer["metadata"]) else: # Basebackups may be multipart uploads, depending on the driver. # Swift needs to know about this so it can do possible cleanups. multipart = file_to_transfer["filetype"] == "basebackup" try: storage.store_file_from_disk( key, file_to_transfer["local_path"], metadata=file_to_transfer["metadata"], multipart=multipart ) unlink_local = True except LocalFileIsRemoteFileError: pass if unlink_local: try: self.log.debug("Deleting file: %r since it has been uploaded", file_to_transfer["local_path"]) os.unlink(file_to_transfer["local_path"]) metadata_path = file_to_transfer["local_path"] + ".metadata" with suppress(FileNotFoundError): os.unlink(metadata_path) except Exception as ex: # pylint: disable=broad-except self.log.exception("Problem in deleting file: %r", file_to_transfer["local_path"]) self.stats.unexpected_exception(ex, where="handle_upload_unlink") return {"success": True, "opaque": file_to_transfer.get("opaque")} except Exception as ex: # pylint: disable=broad-except self.log.exception("Problem in moving file: %r, need to retry", file_to_transfer["local_path"]) self.stats.unexpected_exception(ex, where="handle_upload") # Sleep for a bit to avoid busy looping time.sleep(0.5) file_to_transfer["retries"] = file_to_transfer.get("retries", 0) + 1 if file_to_transfer["retries"] > self.config["upload_retries_warning_limit"]: create_alert_file(self.config, "upload_retries_warning") self.transfer_queue.put(file_to_transfer) return {"success": False, "call_callback": False, "exception": ex}
def handle_upload(self, site, key, file_to_transfer): try: storage = self.get_object_storage(site) unlink_local = False if "blob" in file_to_transfer: storage.store_file_from_memory(key, file_to_transfer["blob"], metadata=file_to_transfer["metadata"]) else: # Basebackups may be multipart uploads, depending on the driver. # Swift needs to know about this so it can do possible cleanups. multipart = file_to_transfer["filetype"] == "basebackup" try: storage.store_file_from_disk(key, file_to_transfer["local_path"], metadata=file_to_transfer["metadata"], multipart=multipart) unlink_local = True except LocalFileIsRemoteFileError: pass if unlink_local: try: self.log.debug("Deleting file: %r since it has been uploaded", file_to_transfer["local_path"]) os.unlink(file_to_transfer["local_path"]) metadata_path = file_to_transfer["local_path"] + ".metadata" with suppress(FileNotFoundError): os.unlink(metadata_path) except: # pylint: disable=bare-except self.log.exception("Problem in deleting file: %r", file_to_transfer["local_path"]) return {"success": True, "opaque": file_to_transfer.get("opaque")} except Exception as ex: # pylint: disable=broad-except self.log.exception("Problem in moving file: %r, need to retry", file_to_transfer["local_path"]) # Sleep for a bit to avoid busy looping time.sleep(0.5) file_to_transfer["retries"] = file_to_transfer.get("retries", 0) + 1 if file_to_transfer["retries"] > self.config["upload_retries_warning_limit"]: create_alert_file(self.config, "upload_retries_warning") self.transfer_queue.put(file_to_transfer) return {"success": False, "call_callback": False, "exception": ex}
def fetch_all(self):
    for retry in range(3):
        try:
            with self.manager_class() as manager:
                self._setup_progress_tracking(manager)
                with self.pool_class(processes=self._process_count()) as pool:
                    self._queue_jobs(pool)
                    self._wait_for_jobs_to_complete()
            break
        except TimeoutError:
            self.pending_jobs.clear()
            self.last_progress_ts = time.monotonic()
            if self.errors:
                break
            elif retry == 2:
                self.log.error("Download stalled despite retries, aborting")
                self.errors = 1
                break

    if self.errors:
        raise RestoreError("Backup download/extraction failed with {} errors".format(self.errors))

    self._create_tablespace_symlinks()
    with compat.suppress(OSError):
        os.rmdir(os.path.join(self.pgdata, "pgdata"))

def _prefetch(self, site, filetype, names): if not names: return start_time = time.monotonic() callback_queue = Queue() site_config = self.server.config["backup_sites"][site] xlog_dir = site_config["pg_xlog_directory"] downloads = {} for obname in names: if obname in self.server.prefetch_404: continue # previously failed to prefetch this file, don't try again prefetch_target_path = os.path.join(xlog_dir, "{}.pghoard.prefetch".format(obname)) if os.path.exists(prefetch_target_path): continue # already fetched this file try: fd, tmp_target_path = tempfile.mkstemp(prefix="{}/{}.".format(xlog_dir, obname), suffix=".pghoard.tmp") os.close(fd) except OSError as ex: self.server.log.error("Unable to create temporary file to prefetch %r: %s: %s", obname, ex.__class__.__name__, ex) continue self.server.log.debug("Prefetching site: %r, filename: %r, filetype: %r, tmp_target_path: %r", site, obname, filetype, tmp_target_path) downloads[obname] = tmp_target_path self.server.transfer_queue.put({ "callback_queue": callback_queue, "filetype": filetype, "local_path": obname, "opaque": obname, "site": site, "target_path": tmp_target_path, "type": "DOWNLOAD", }) # allow something else to happen try: yield finally: # process results (timeout is 30 seconds after start but at least 5 seconds) timeout_at = max(start_time + 30, time.monotonic() + 5) while downloads: time_left = timeout_at - time.monotonic() try: response = callback_queue.get(timeout=time_left) except Empty: break # timeout obname = response["opaque"] tmp_target_path = downloads.pop(response["opaque"]) if response["success"]: prefetch_target_path = os.path.join(xlog_dir, "{}.pghoard.prefetch".format(obname)) os.rename(tmp_target_path, prefetch_target_path) self.server.log.debug("Prefetched %r %r to %r, took: %.3fs", site, obname, prefetch_target_path, time.monotonic() - start_time) else: ex = response.get("exception", Error) if isinstance(ex, FileNotFoundFromStorageError): # don't try prefetching this file again self.server.prefetch_404.append(obname) self.server.log.debug("Prefetching %r %r failed (%s), took: %.3fs", site, obname, ex.__class__.__name__, time.monotonic() - start_time) with suppress(Exception): os.unlink(tmp_target_path) # everything else timed out while downloads: obname, tmp_target_path = downloads.popitem() self.server.log.debug("Prefetching %r %r timed out, took: %.3fs", site, obname, time.monotonic() - start_time) with suppress(Exception): os.unlink(tmp_target_path)
def setup_pg(): tmpdir_obj = py_path.local(tempfile.mkdtemp(prefix="pghoard_dbtest_")) tmpdir = str(tmpdir_obj) # try to find the binaries for these versions in some path pgdata = os.path.join(tmpdir, "pgdata") db = PGTester(pgdata) # pylint: disable=redefined-outer-name db.run_cmd("initdb", "-D", pgdata, "--encoding", "utf-8") # NOTE: does not use TCP ports, no port conflicts db.user = dict(host=pgdata, user="******", password="******", dbname="postgres", port="5432") # NOTE: point $HOME to tmpdir - $HOME shouldn't affect most tests, but # psql triest to find .pgpass file from there as do our functions that # manipulate pgpass. By pointing $HOME there we make sure we're not # making persistent changes to the environment. os.environ["HOME"] = tmpdir # allow replication connections with open(os.path.join(pgdata, "pg_hba.conf"), "w") as fp: fp.write( "local all disabled reject\n" "local all passwordy md5\n" "local all all trust\n" "local replication disabled reject\n" "local replication passwordy md5\n" "local replication all trust\n" ) # rewrite postgresql.conf with open(os.path.join(pgdata, "postgresql.conf"), "r+") as fp: lines = fp.read().splitlines() fp.seek(0) fp.truncate() config = {} for line in lines: line = line.strip() if not line or line.startswith("#"): continue key, val = re.split(r"\s*=\s*", line, 1) config[key] = re.sub(r"\s*(#.*)?$", "", val) config.update({ "hot_standby": "on", "logging_collector": "off", "max_wal_senders": 2, "wal_keep_segments": 100, "wal_level": "hot_standby", # disable fsync and synchronous_commit to speed up the tests a bit "fsync": "off", "synchronous_commit": "off", # don't need to wait for autovacuum workers when shutting down "autovacuum": "off", }) lines = ["{} = {}\n".format(key, val) for key, val in sorted(config.items())] # noqa fp.write("".join(lines)) # now start pg and create test users db.run_pg() try: db.run_cmd("createuser", "-h", db.user["host"], "-p", db.user["port"], "disabled") db.run_cmd("createuser", "-h", db.user["host"], "-p", db.user["port"], "passwordy") db.run_cmd("createuser", "-h", db.user["host"], "-p", db.user["port"], "-s", db.user["user"]) yield db finally: db.kill() with suppress(Exception): tmpdir_obj.remove(rec=1)
def _prefetch(self, site, filetype, names): if not names: return start_time = time.monotonic() callback_queue = Queue() site_config = self.server.config["backup_sites"][site] xlog_dir = get_pg_wal_directory(site_config) downloads = {} for obname in names: if obname in self.server.prefetch_404: continue # previously failed to prefetch this file, don't try again prefetch_target_path = os.path.join( xlog_dir, "{}.pghoard.prefetch".format(obname)) if os.path.exists(prefetch_target_path): continue # already fetched this file try: fd, tmp_target_path = tempfile.mkstemp(prefix="{}/{}.".format( xlog_dir, obname), suffix=".pghoard.tmp") os.close(fd) except OSError as ex: self.server.log.error( "Unable to create temporary file to prefetch %r: %s: %s", obname, ex.__class__.__name__, ex) continue self.server.log.debug( "Prefetching site: %r, filename: %r, filetype: %r, tmp_target_path: %r", site, obname, filetype, tmp_target_path) downloads[obname] = tmp_target_path self.server.transfer_queue.put({ "callback_queue": callback_queue, "filetype": filetype, "local_path": obname, "opaque": obname, "site": site, "target_path": tmp_target_path, "type": "DOWNLOAD", }) # allow something else to happen try: yield finally: # process results (timeout is 30 seconds after start but at least 5 seconds) timeout_at = max(start_time + 30, time.monotonic() + 5) while downloads: time_left = timeout_at - time.monotonic() try: response = callback_queue.get(timeout=time_left) except Empty: break # timeout obname = response["opaque"] tmp_target_path = downloads.pop(response["opaque"]) if response["success"]: prefetch_target_path = os.path.join( xlog_dir, "{}.pghoard.prefetch".format(obname)) os.rename(tmp_target_path, prefetch_target_path) self.server.log.debug( "Prefetched %r %r to %r, took: %.3fs", site, obname, prefetch_target_path, time.monotonic() - start_time) else: ex = response.get("exception", Error) if isinstance(ex, FileNotFoundFromStorageError): # don't try prefetching this file again self.server.prefetch_404.append(obname) self.server.log.debug( "Prefetching %r %r failed (%s), took: %.3fs", site, obname, ex.__class__.__name__, time.monotonic() - start_time) with suppress(Exception): os.unlink(tmp_target_path) # everything else timed out while downloads: obname, tmp_target_path = downloads.popitem() self.server.log.debug( "Prefetching %r %r timed out, took: %.3fs", site, obname, time.monotonic() - start_time) with suppress(Exception): os.unlink(tmp_target_path)
def get_wal_or_timeline_file(self, site, filename, filetype): target_path = self.headers.get("x-pghoard-target-path") if not target_path: raise HttpResponse("x-pghoard-target-path header missing from download", status=400) self._process_completed_download_operations() # See if we have already prefetched the file site_config = self.server.config["backup_sites"][site] xlog_dir = get_pg_wal_directory(site_config) prefetch_target_path = os.path.join(xlog_dir, "{}.pghoard.prefetch".format(filename)) if os.path.exists(prefetch_target_path): ex = self._try_save_and_verify_restored_file(filetype, filename, prefetch_target_path, target_path) if not ex: self._create_prefetch_operations(site, filetype, filename) raise HttpResponse(status=201) # After reaching a recovery_target and restart of a PG server, PG wants to replay and refetch # files from the archive starting from the latest checkpoint. We have potentially fetched these files # already earlier. Check if we have the files already and if we do, don't go over the network to refetch # them yet again but just rename them to the path that PG is requesting. xlog_path = os.path.join(xlog_dir, filename) if os.path.exists(xlog_path): self.server.log.info("Requested %r, found it in pg_xlog directory as: %r, returning directly", filename, xlog_path) ex = self._try_save_and_verify_restored_file(filetype, filename, xlog_path, target_path, unlink=False) if ex: self.server.log.warning("Found file: %r but it was invalid: %s", xlog_path, ex) else: raise HttpResponse(status=201) key = self._make_file_key(site, filetype, filename) with suppress(ValueError): self.server.prefetch_404.remove(key) self._create_fetch_operation(key, site, filetype, filename, max_age=5, suppress_error=False) self._create_prefetch_operations(site, filetype, filename) last_schedule_call = time.monotonic() start_time = time.monotonic() retries = 2 while (time.monotonic() - start_time) <= 30: self._process_completed_download_operations(timeout=0.01) with self.server.lock: if os.path.isfile(prefetch_target_path): ex = self._try_save_and_verify_restored_file(filetype, filename, prefetch_target_path, target_path) if not ex: raise HttpResponse(status=201) elif ex and retries == 0: raise ex # pylint: disable=raising-bad-type retries -= 1 if key in self.server.prefetch_404: raise HttpResponse(status=404) with self.server.lock: if key not in self.server.pending_download_ops: if retries == 0: raise HttpResponse(status=500) retries -= 1 self._create_fetch_operation(key, site, filetype, filename, suppress_error=False) if time.monotonic() - last_schedule_call >= 1: last_schedule_call = time.monotonic() # Replace existing download operation if it has been executing for too long self._create_fetch_operation(key, site, filetype, filename, max_age=10, suppress_error=False) raise HttpResponse("TIMEOUT", status=500)
def get_wal_or_timeline_file(self, site, filename, filetype): target_path = self.headers.get("x-pghoard-target-path") if not target_path: raise HttpResponse( "x-pghoard-target-path header missing from download", status=400) self._process_completed_download_operations() # See if we have already prefetched the file site_config = self.server.config["backup_sites"][site] xlog_dir = get_pg_wal_directory(site_config) prefetch_target_path = os.path.join( xlog_dir, "{}.pghoard.prefetch".format(filename)) if os.path.exists(prefetch_target_path): ex = self._try_save_and_verify_restored_file( filetype, filename, prefetch_target_path, target_path) if not ex: self._create_prefetch_operations(site, filetype, filename) raise HttpResponse(status=201) # After reaching a recovery_target and restart of a PG server, PG wants to replay and refetch # files from the archive starting from the latest checkpoint. We have potentially fetched these files # already earlier. Check if we have the files already and if we do, don't go over the network to refetch # them yet again but just rename them to the path that PG is requesting. xlog_path = os.path.join(xlog_dir, filename) if os.path.exists(xlog_path): self.server.log.info( "Requested %r, found it in pg_xlog directory as: %r, returning directly", filename, xlog_path) ex = self._try_save_and_verify_restored_file(filetype, filename, xlog_path, target_path, unlink=False) if ex: self.server.log.warning( "Found file: %r but it was invalid: %s", xlog_path, ex) else: raise HttpResponse(status=201) key = self._make_file_key(site, filetype, filename) with suppress(ValueError): self.server.prefetch_404.remove(key) self._create_fetch_operation(key, site, filetype, filename, max_age=5, suppress_error=False) self._create_prefetch_operations(site, filetype, filename) last_schedule_call = time.monotonic() start_time = time.monotonic() retries = 2 while (time.monotonic() - start_time) <= 30: self._process_completed_download_operations(timeout=0.01) with self.server.lock: if os.path.isfile(prefetch_target_path): ex = self._try_save_and_verify_restored_file( filetype, filename, prefetch_target_path, target_path) if not ex: raise HttpResponse(status=201) elif ex and retries == 0: raise ex # pylint: disable=raising-bad-type retries -= 1 if key in self.server.prefetch_404: raise HttpResponse(status=404) with self.server.lock: if key not in self.server.pending_download_ops: if retries == 0: raise HttpResponse(status=500) retries -= 1 self._create_fetch_operation(key, site, filetype, filename, suppress_error=False) if time.monotonic() - last_schedule_call >= 1: last_schedule_call = time.monotonic() # Replace existing download operation if it has been executing for too long self._create_fetch_operation(key, site, filetype, filename, max_age=10, suppress_error=False) raise HttpResponse("TIMEOUT", status=500)
def handle_site(self, site, site_config): self.set_state_defaults(site) xlog_path, basebackup_path = self.create_backup_site_paths(site) if not site_config["active"]: return # If a site has been marked inactive, don't bother checking anything self._cleanup_inactive_receivexlogs(site) chosen_backup_node = random.choice(site_config["nodes"]) if site not in self.receivexlogs and site not in self.walreceivers: if site_config["active_backup_mode"] == "pg_receivexlog": self.receivexlog_listener(site, chosen_backup_node, xlog_path + "_incoming") elif site_config["active_backup_mode"] == "walreceiver": state_file_path = self.config["json_state_file_path"] walreceiver_state = {} with suppress(FileNotFoundError): with open(state_file_path, "r") as fp: old_state_file = json.load(fp) walreceiver_state = old_state_file.get( "walreceivers", {}).get(site, {}) self.start_walreceiver( site=site, chosen_backup_node=chosen_backup_node, last_flushed_lsn=walreceiver_state.get("last_flushed_lsn")) if site not in self.time_of_last_backup_check or \ time.monotonic() - self.time_of_last_backup_check[site] > 300: self.time_of_last_backup[site] = self.check_backup_count_and_state( site) self.time_of_last_backup_check[site] = time.monotonic() # check if a basebackup is running, or if a basebackup has just completed if site in self.basebackups: try: result = self.basebackups_callbacks[site].get(block=False) except Empty: # previous basebackup (or its compression and upload) still in progress return if self.basebackups[site].is_alive(): self.basebackups[site].join() del self.basebackups[site] del self.basebackups_callbacks[site] self.log.debug("Basebackup has finished for %r: %r", site, result) self.time_of_last_backup[site] = self.check_backup_count_and_state( site) self.time_of_last_backup_check[site] = time.monotonic() new_backup_needed = False if site in self.requested_basebackup_sites: self.log.info("Creating a new basebackup for %r due to request", site) self.requested_basebackup_sites.discard(site) new_backup_needed = True elif site_config["basebackup_interval_hours"] is None: # Basebackups are disabled for this site (but they can still be requested over the API.) pass elif self.time_of_last_backup.get(site) is None: self.log.info( "Creating a new basebackup for %r because there are currently none", site) new_backup_needed = True else: delta_since_last_backup = datetime.datetime.now( datetime.timezone.utc) - self.time_of_last_backup[site] if delta_since_last_backup >= datetime.timedelta( hours=site_config["basebackup_interval_hours"]): self.log.info( "Creating a new basebackup for %r by schedule (%s from previous)", site, delta_since_last_backup) new_backup_needed = True if new_backup_needed: self.basebackups_callbacks[site] = Queue() self.create_basebackup(site, chosen_backup_node, basebackup_path, self.basebackups_callbacks[site])
def setup_pg(): tmpdir_obj = py_path.local(tempfile.mkdtemp(prefix="pghoard_dbtest_")) tmpdir = str(tmpdir_obj) # try to find the binaries for these versions in some path pgdata = os.path.join(tmpdir, "pgdata") db = TestPG(pgdata) # pylint: disable=redefined-outer-name db.run_cmd("initdb", "-D", pgdata, "--encoding", "utf-8") # NOTE: does not use TCP ports, no port conflicts db.user = dict(host=pgdata, user="******", password="******", dbname="postgres", port="5432") # NOTE: point $HOME to tmpdir - $HOME shouldn't affect most tests, but # psql triest to find .pgpass file from there as do our functions that # manipulate pgpass. By pointing $HOME there we make sure we're not # making persistent changes to the environment. os.environ["HOME"] = tmpdir # allow replication connections with open(os.path.join(pgdata, "pg_hba.conf"), "w") as fp: fp.write("local all disabled reject\n" "local all passwordy md5\n" "local all all trust\n" "local replication disabled reject\n" "local replication passwordy md5\n" "local replication all trust\n") # rewrite postgresql.conf with open(os.path.join(pgdata, "postgresql.conf"), "r+") as fp: lines = fp.read().splitlines() fp.seek(0) fp.truncate() config = {} for line in lines: line = line.strip() if not line or line.startswith("#"): continue key, val = re.split(r"\s*=\s*", line, 1) config[key] = re.sub(r"\s*(#.*)?$", "", val) config.update({ "hot_standby": "on", "logging_collector": "off", "max_wal_senders": 2, "wal_keep_segments": 100, "wal_level": "hot_standby", # disable fsync and synchronous_commit to speed up the tests a bit "fsync": "off", "synchronous_commit": "off", # don't need to wait for autovacuum workers when shutting down "autovacuum": "off", }) lines = [ "{} = {}\n".format(key, val) for key, val in sorted(config.items()) ] # noqa fp.write("".join(lines)) # now start pg and create test users db.run_pg() try: db.run_cmd("createuser", "-h", db.user["host"], "-p", db.user["port"], "disabled") db.run_cmd("createuser", "-h", db.user["host"], "-p", db.user["port"], "passwordy") db.run_cmd("createuser", "-h", db.user["host"], "-p", db.user["port"], "-s", db.user["user"]) yield db finally: db.kill() with suppress(Exception): tmpdir_obj.remove(rec=1)
def handle_site(self, site, site_config): self.set_state_defaults(site) xlog_path, basebackup_path = self.create_backup_site_paths(site) if not site_config["active"]: return # If a site has been marked inactive, don't bother checking anything self._cleanup_inactive_receivexlogs(site) chosen_backup_node = random.choice(site_config["nodes"]) if site not in self.receivexlogs and site not in self.walreceivers: if site_config["active_backup_mode"] == "pg_receivexlog": self.receivexlog_listener(site, chosen_backup_node, xlog_path + "_incoming") elif site_config["active_backup_mode"] == "walreceiver": state_file_path = self.config["json_state_file_path"] walreceiver_state = {} with suppress(FileNotFoundError): with open(state_file_path, "r") as fp: old_state_file = json.load(fp) walreceiver_state = old_state_file.get("walreceivers", {}).get(site, {}) self.start_walreceiver( site=site, chosen_backup_node=chosen_backup_node, last_flushed_lsn=walreceiver_state.get("last_flushed_lsn")) last_check_time = self.time_of_last_backup_check.get(site) if not last_check_time or (time.monotonic() - self.time_of_last_backup_check[site]) > 300: self.time_of_last_backup[site] = self.check_backup_count_and_state(site) self.time_of_last_backup_check[site] = time.monotonic() # check if a basebackup is running, or if a basebackup has just completed if site in self.basebackups: try: result = self.basebackups_callbacks[site].get(block=False) except Empty: # previous basebackup (or its compression and upload) still in progress return if self.basebackups[site].is_alive(): self.basebackups[site].join() del self.basebackups[site] del self.basebackups_callbacks[site] self.log.debug("Basebackup has finished for %r: %r", site, result) self.time_of_last_backup[site] = self.check_backup_count_and_state(site) self.time_of_last_backup_check[site] = time.monotonic() new_backup_needed = False if site in self.requested_basebackup_sites: self.log.info("Creating a new basebackup for %r due to request", site) self.requested_basebackup_sites.discard(site) new_backup_needed = True elif site_config["basebackup_interval_hours"] is None: # Basebackups are disabled for this site (but they can still be requested over the API.) pass elif self.time_of_last_backup.get(site) is None: self.log.info("Creating a new basebackup for %r because there are currently none", site) new_backup_needed = True else: delta_since_last_backup = datetime.datetime.now(datetime.timezone.utc) - self.time_of_last_backup[site] if delta_since_last_backup >= datetime.timedelta(hours=site_config["basebackup_interval_hours"]): self.log.info("Creating a new basebackup for %r by schedule (%s from previous)", site, delta_since_last_backup) new_backup_needed = True if new_backup_needed and not os.path.exists(self.config["maintenance_mode_file"]): self.basebackups_callbacks[site] = Queue() self.create_basebackup(site, chosen_backup_node, basebackup_path, self.basebackups_callbacks[site])
def handle_site(self, site, site_config): self.set_state_defaults(site) xlog_path, basebackup_path = self.create_backup_site_paths(site) if not site_config["active"]: return # If a site has been marked inactive, don't bother checking anything self._cleanup_inactive_receivexlogs(site) chosen_backup_node = random.choice(site_config["nodes"]) if site not in self.receivexlogs and site not in self.walreceivers: if site_config["active_backup_mode"] == "pg_receivexlog": self.receivexlog_listener(site, chosen_backup_node, xlog_path + "_incoming") elif site_config["active_backup_mode"] == "walreceiver": state_file_path = self.config["json_state_file_path"] walreceiver_state = {} with suppress(FileNotFoundError): with open(state_file_path, "r") as fp: old_state_file = json.load(fp) walreceiver_state = old_state_file.get( "walreceivers", {}).get(site, {}) self.start_walreceiver( site=site, chosen_backup_node=chosen_backup_node, last_flushed_lsn=walreceiver_state.get("last_flushed_lsn")) last_check_time = self.time_of_last_backup_check.get(site) if not last_check_time or (time.monotonic() - self.time_of_last_backup_check[site]) > 300: self.refresh_backup_list_and_delete_old(site) self.time_of_last_backup_check[site] = time.monotonic() # check if a basebackup is running, or if a basebackup has just completed if site in self.basebackups: try: result = self.basebackups_callbacks[site].get(block=False) if result["success"]: # No matter which mode, if succeeded reset the counter self.delta_backup_failures.pop(site, None) elif site_config["basebackup_mode"] == BaseBackupMode.delta: last_failed_time = utc_now() if site not in self.delta_backup_failures: self.delta_backup_failures[ site] = DeltaBaseBackupFailureInfo( retries=0, last_failed_time=last_failed_time) else: self.delta_backup_failures[site].retries += 1 self.delta_backup_failures[ site].last_failed_time = last_failed_time except Empty: # previous basebackup (or its compression and upload) still in progress return if self.basebackups[site].is_alive(): self.basebackups[site].join() del self.basebackups[site] del self.basebackups_callbacks[site] self.log.debug("Basebackup has finished for %r: %r", site, result) self.refresh_backup_list_and_delete_old(site) self.time_of_last_backup_check[site] = time.monotonic() metadata = self.get_new_backup_details(site=site, site_config=site_config) if metadata and not os.path.exists( self.config["maintenance_mode_file"]): if site in self.delta_backup_failures: retries = self.delta_backup_failures[site].retries if retries > site_config["basebackup_delta_mode_max_retries"]: self.log.info( "Giving up backup after exceeding max retries: %r", retries) return else: # Start from ~2 min with cap of one hour retry_interval = min(2**(retries + 7), 60 * 60) if utc_now() >= self.delta_backup_failures[ site].last_failed_time + datetime.timedelta( seconds=retry_interval): self.log.info("Re-trying delta basebackup") else: self.log.info( "Waiting for backoff time before re-trying new delta backup due to previous failures" ) return self.basebackups_callbacks[site] = Queue() self.create_basebackup(site, chosen_backup_node, basebackup_path, self.basebackups_callbacks[site], metadata)
def delete_alert_file(config, filename):
    filepath = os.path.join(config["alert_file_dir"], filename)
    with suppress(FileNotFoundError):
        os.unlink(filepath)

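# Usage sketch (illustrative, not from the original source): a missing alert file is
# silently ignored, so the call is safe to make unconditionally. The config dict and
# directory below are hypothetical; only "alert_file_dir" is read by the function above.
example_config = {"alert_file_dir": "/var/lib/pghoard/alerts"}
delete_alert_file(example_config, "upload_retries_warning")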