Example #1
    def fetch_all(self):
        for retry in range(3):
            try:
                with self.manager_class() as manager:
                    self._setup_progress_tracking(manager)
                    with self.pool_class(
                            processes=self._process_count()) as pool:
                        self._queue_jobs(pool)
                        self._wait_for_jobs_to_complete()
                        break
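            # NOTE: a stalled download is assumed to surface here as TimeoutError; reset state and retry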
            except TimeoutError:
                self.pending_jobs.clear()
                self.last_progress_ts = time.monotonic()
                if self.errors:
                    break

                if retry == 2:
                    self.log.error(
                        "Download stalled despite retries, aborting")
                    self.errors = 1
                    break

        if self.errors:
            raise RestoreError(
                "Backup download/extraction failed with {} errors".format(
                    self.errors))
        self._create_tablespace_symlinks()
        with compat.suppress(OSError):
            os.rmdir(os.path.join(self.pgdata, "pgdata"))
Example #2
 def _process_completed_download_operations(self, timeout=None):
     while True:
         try:
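             # With the default timeout=None this does not block; it only drains already-completed results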
             result = self.server.download_results.get(block=timeout is not None, timeout=timeout)
             key = result["opaque"]
             with self.server.lock:
                 op = self.server.pending_download_ops.pop(key, None)
                 if not op:
                     self.server.log.warning("Orphaned download operation %r completed: %r", key, result)
                     if result["success"]:
                         with suppress(OSError):
                             os.unlink(result["target_path"])
                     continue
                 if result["success"]:
                     if os.path.isfile(op["target_path"]):
                         self.server.log.warning("Target path for %r already exists, skipping", key)
                         continue
                     os.rename(result["target_path"], op["target_path"])
                     metadata = result["metadata"] or {}
                     self.server.log.info(
                         "Renamed %s to %s. Original upload from %r, hash %s:%s", result["target_path"],
                         op["target_path"], metadata.get("host"), metadata.get("hash-algorithm"), metadata.get("hash")
                     )
                 else:
                     ex = result.get("exception", Error)
                     if isinstance(ex, FileNotFoundFromStorageError):
                         # don't try prefetching this file again
                         self.server.prefetch_404.append(key)
                     else:
                         self.server.log.warning("Fetching %r failed (%s), took: %.3fs",
                                                 key, ex.__class__.__name__, time.monotonic() - op["started_at"])
         except Empty:
             return
Example #3
    def handle_upload(self, site, key, file_to_transfer):
        try:
            storage = self.get_object_storage(site)
            unlink_local = False
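            # "blob" payloads are uploaded straight from memory; everything else is streamed from a local file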
            if "blob" in file_to_transfer:
                storage.store_file_from_memory(
                    key,
                    file_to_transfer["blob"],
                    metadata=file_to_transfer["metadata"])
            else:
                # Basebackups may be multipart uploads, depending on the driver.
                # Swift needs to know about this so it can do possible cleanups.
                multipart = file_to_transfer["filetype"] == "basebackup"
                try:
                    storage.store_file_from_disk(
                        key,
                        file_to_transfer["local_path"],
                        metadata=file_to_transfer["metadata"],
                        multipart=multipart)
                    unlink_local = True
                except LocalFileIsRemoteFileError:
                    pass
            if unlink_local:
                try:
                    self.log.debug(
                        "Deleting file: %r since it has been uploaded",
                        file_to_transfer["local_path"])
                    os.unlink(file_to_transfer["local_path"])
                    metadata_path = file_to_transfer["local_path"] + ".metadata"
                    with suppress(FileNotFoundError):
                        os.unlink(metadata_path)
                except Exception as ex:  # pylint: disable=broad-except
                    self.log.exception("Problem in deleting file: %r",
                                       file_to_transfer["local_path"])
                    self.stats.unexpected_exception(
                        ex, where="handle_upload_unlink")
            return {"success": True, "opaque": file_to_transfer.get("opaque")}
        except Exception as ex:  # pylint: disable=broad-except
            if file_to_transfer.get("retry_number", 0) > 0:
                self.log.exception("Problem in moving file: %r, need to retry",
                                   file_to_transfer["local_path"])
                # Ignore the exception the first time round as some object stores have frequent Internal Errors
                # and the upload usually goes through without any issues the second time round
                self.stats.unexpected_exception(ex, where="handle_upload")
            else:
                self.log.warning(
                    "Problem in moving file: %r, need to retry (%s: %s)",
                    file_to_transfer["local_path"], ex.__class__.__name__, ex)
            # Sleep for a bit to avoid busy looping
            time.sleep(0.5)

            file_to_transfer["retry_number"] = file_to_transfer.get(
                "retry_number", 0) + 1
            if file_to_transfer["retry_number"] > self.config[
                    "upload_retries_warning_limit"]:
                create_alert_file(self.config, "upload_retries_warning")

            self.transfer_queue.put(file_to_transfer)
            return {"success": False, "call_callback": False, "exception": ex}
Example #4
    def handle_site(self, site, site_config):
        self.set_state_defaults(site)
        xlog_path, basebackup_path = self.create_backup_site_paths(site)

        if not site_config["active"]:
            return  # If a site has been marked inactive, don't bother checking anything

        self._cleanup_inactive_receivexlogs(site)

        chosen_backup_node = random.choice(site_config["nodes"])

        if site not in self.receivexlogs and site not in self.walreceivers:
            if site_config["active_backup_mode"] == "pg_receivexlog":
                self.receivexlog_listener(site, chosen_backup_node,
                                          xlog_path + "_incoming")
            elif site_config["active_backup_mode"] == "walreceiver":
                state_file_path = self.config["json_state_file_path"]
                walreceiver_state = {}
                with suppress(FileNotFoundError):
                    with open(state_file_path, "r") as fp:
                        old_state_file = json.load(fp)
                        walreceiver_state = old_state_file.get(
                            "walreceivers", {}).get(site, {})
                self.start_walreceiver(
                    site=site,
                    chosen_backup_node=chosen_backup_node,
                    last_flushed_lsn=walreceiver_state.get("last_flushed_lsn"))

        last_check_time = self.time_of_last_backup_check.get(site)
        if not last_check_time or (time.monotonic() - last_check_time) > 300:
            self.refresh_backup_list_and_delete_old(site)
            self.time_of_last_backup_check[site] = time.monotonic()

        # check if a basebackup is running, or if a basebackup has just completed
        if site in self.basebackups:
            try:
                result = self.basebackups_callbacks[site].get(block=False)
            except Empty:
                # previous basebackup (or its compression and upload) still in progress
                return
            if self.basebackups[site].is_alive():
                self.basebackups[site].join()
            del self.basebackups[site]
            del self.basebackups_callbacks[site]
            self.log.debug("Basebackup has finished for %r: %r", site, result)
            self.refresh_backup_list_and_delete_old(site)
            self.time_of_last_backup_check[site] = time.monotonic()

        metadata = self.get_new_backup_details(site=site,
                                               site_config=site_config)
        if metadata and not os.path.exists(
                self.config["maintenance_mode_file"]):
            self.basebackups_callbacks[site] = Queue()
            self.create_basebackup(site, chosen_backup_node, basebackup_path,
                                   self.basebackups_callbacks[site], metadata)
Example #5
    def _transfer_agent_op(self, site, filename, filetype, method, *, retries=2, target_path=None):
        start_time = time.time()
        tmp_target_path = None

        if method == "DOWNLOAD":
            # NOTE: we request download on a temporary download path so we can atomically overwrite the
            # file if/when we successfully receive it.
            try:
                fd, tmp_target_path = tempfile.mkstemp(prefix="{}.".format(target_path), suffix=".pghoard.tmp")
                os.close(fd)
            except OSError as ex:
                raise HttpResponse("Unable to create temporary file for {0!r}: {1.__class__.__name__}: {1}"
                                   .format(target_path, ex), status=400)

        self.server.log.debug("Requesting site: %r, filename: %r, filetype: %r, target_path: %r",
                              site, filename, filetype, target_path)

        callback_queue = Queue()
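        # Hand the operation to the transfer agent; the result arrives on our private callback queue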
        self.server.transfer_queue.put({
            "callback_queue": callback_queue,
            "filetype": filetype,
            "local_path": filename,
            "site": site,
            "target_path": tmp_target_path,
            "type": method,
        })

        try:
            try:
                response = callback_queue.get(timeout=30.0)
                self.server.log.debug("Handled a %s request for: %r %r, took: %.3fs",
                                      method, site, target_path, time.time() - start_time)
            except Empty:
                self.server.log.exception("Timeout on a %s request for: %r %r, took: %.3fs",
                                          method, site, target_path, time.time() - start_time)
                raise HttpResponse("TIMEOUT", status=500)

            if not response["success"]:
                if isinstance(response.get("exception"), FileNotFoundFromStorageError):
                    raise HttpResponse("{0.__class__.__name__}: {0}".format(response["exception"]), status=404)
                raise HttpResponse(status=500)
        except HttpResponse as ex:
            if tmp_target_path:
                with suppress(Exception):
                    os.unlink(tmp_target_path)
            if ex.status == 500 and retries:
                self.server.log.warning("Transfer operation failed, retrying (%r retries left)", retries)
                return self._transfer_agent_op(site, filename, filetype, method,
                                               retries=retries - 1, target_path=target_path)
            raise

        if tmp_target_path:
            self._save_and_verify_restored_file(filetype, filename, tmp_target_path, target_path)
        return response
Example #6
 def _try_save_and_verify_restored_file(self, filetype, filename, prefetch_target_path, target_path, unlink=True):
     try:
         self._save_and_verify_restored_file(filetype, filename, prefetch_target_path, target_path)
         self.server.log.info("Renamed %s to %s", prefetch_target_path, target_path)
         return None
     except (ValueError, HttpResponse) as e:
         # Verification failed; log it and remove the invalid file so it can be fetched again
         self.server.log.warning("Verification of prefetch file %s failed: %r", prefetch_target_path, e)
         if unlink:
             with suppress(OSError):
                 os.unlink(prefetch_target_path)
         return e
Example #7
    def check_command_success(self, proc, output_file):
        rc = terminate_subprocess(proc, log=self.log)
        msg = "Ran: {!r}, took: {:.3f}s to run, returncode: {}".format(
            proc.args, time.monotonic() - proc.basebackup_start_time, rc)
        if rc == 0 and os.path.exists(output_file):
            self.log.info(msg)
            return True

        if output_file:
            with suppress(FileNotFoundError):
                os.unlink(output_file)
        raise BackupFailure(msg)
Example #8
    def fetch_all(self):
        with self.manager_class() as manager:
            self._setup_progress_tracking(manager)
            with self.pool_class(processes=self._process_count()) as pool:
                self._queue_jobs(pool)
                self._wait_for_jobs_to_complete()

        if self.errors:
            raise RestoreError("Backup download/extraction failed with {} errors".format(self.errors))
        self._create_tablespace_symlinks()
        with compat.suppress(OSError):
            os.rmdir(os.path.join(self.pgdata, "pgdata"))
Example #9
 def get_paths_for_backup(basebackup_path):
     i = 0
     while True:
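         # Candidate directory names are date-based with a numeric suffix, e.g. "2024-01-31_0"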
         tsdir = datetime.datetime.utcnow().strftime("%Y-%m-%d") + "_" + str(i)
         raw_basebackup = os.path.join(basebackup_path + "_incoming", tsdir)
         compressed_basebackup = os.path.join(basebackup_path, tsdir)
         # The backup directory names need not form a sequence, so we lean towards skipping over any
         # partial or leftover progress below.  Make sure we only return paths if we're able to create the
         # raw_basebackup directory.
         if not os.path.exists(raw_basebackup) and not os.path.exists(compressed_basebackup):
             with suppress(FileExistsError):
                 os.makedirs(raw_basebackup)
                 return raw_basebackup, compressed_basebackup
         i += 1
Example #10
    def handle_upload(self, site, key, file_to_transfer):
        try:
            storage = self.get_object_storage(site)
            unlink_local = False
            if "blob" in file_to_transfer:
                storage.store_file_from_memory(key, file_to_transfer["blob"],
                                               metadata=file_to_transfer["metadata"])
            else:
                # Basebackups may be multipart uploads, depending on the driver.
                # Swift needs to know about this so it can do possible cleanups.
                multipart = file_to_transfer["filetype"] in {"basebackup", "basebackup_chunk"}
                try:
                    storage.store_file_from_disk(key, file_to_transfer["local_path"],
                                                 metadata=file_to_transfer["metadata"],
                                                 multipart=multipart)
                    unlink_local = True
                except LocalFileIsRemoteFileError:
                    pass
            if unlink_local:
                try:
                    self.log.debug("Deleting file: %r since it has been uploaded", file_to_transfer["local_path"])
                    os.unlink(file_to_transfer["local_path"])
                    metadata_path = file_to_transfer["local_path"] + ".metadata"
                    with suppress(FileNotFoundError):
                        os.unlink(metadata_path)
                except Exception as ex:  # pylint: disable=broad-except
                    self.log.exception("Problem in deleting file: %r", file_to_transfer["local_path"])
                    self.metrics.unexpected_exception(ex, where="handle_upload_unlink")
            return {"success": True, "opaque": file_to_transfer.get("opaque")}
        except Exception as ex:  # pylint: disable=broad-except
            if file_to_transfer.get("retry_number", 0) > 0:
                self.log.exception("Problem in moving file: %r, need to retry", file_to_transfer["local_path"])
                # Ignore the exception the first time round as some object stores have frequent Internal Errors
                # and the upload usually goes through without any issues the second time round
                self.metrics.unexpected_exception(ex, where="handle_upload")
            else:
                self.log.warning("Problem in moving file: %r, need to retry (%s: %s)",
                                 file_to_transfer["local_path"], ex.__class__.__name__, ex)

            file_to_transfer["retry_number"] = file_to_transfer.get("retry_number", 0) + 1
            if file_to_transfer["retry_number"] > self.config["upload_retries_warning_limit"]:
                create_alert_file(self.config, "upload_retries_warning")

            # Sleep for a bit to avoid busy looping. Increase sleep time if the op fails multiple times
            self.sleep(min(0.5 * 2 ** (file_to_transfer["retry_number"] - 1), 20))

            self.transfer_queue.put(file_to_transfer)
            return {"success": False, "call_callback": False, "exception": ex}
Example #11
    def get_command_success(self, proc, output_file):
        rc = terminate_subprocess(proc, log=self.log)
        msg = "Ran: {!r}, took: {:.3f}s to run, returncode: {}".format(
            proc.args, time.monotonic() - proc.basebackup_start_time, rc)
        if rc == 0 and os.path.exists(output_file):
            self.log.info(msg)
            return True

        self.log.error(msg)
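        # Clean up any partial output file and signal failure to the waiting caller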
        if output_file:
            with suppress(FileNotFoundError):
                os.unlink(output_file)
        if self.callback_queue:
            # post a failure event
            self.callback_queue.put({"success": False})
        self.running = False
Example #12
    def _proc_success(self, proc, output_file):
        rc = terminate_subprocess(proc, log=self.log)
        msg = "Ran: {!r}, took: {:.3f}s to run, returncode: {}".format(
            proc.args,
            time.monotonic() - proc.basebackup_start_time, rc)
        if rc == 0 and os.path.exists(output_file):
            self.log.info(msg)
            return True

        self.log.error(msg)
        if output_file:
            with suppress(FileNotFoundError):
                os.unlink(output_file)
        if self.callback_queue:
            # post a failure event
            self.callback_queue.put({"success": False})
        self.running = False
Example #13
 def _create_tablespace_symlinks(self):
     if not self.tablespaces:
         return
     tblspc_dir = os.path.join(self.pgdata, "pg_tblspc")
     os.makedirs(tblspc_dir, exist_ok=True)
     for settings in self.tablespaces.values():
         if os.path.isdir(settings["path"]):
             link_name = os.path.join(self.pgdata, "pg_tblspc", str(settings["oid"]))
             try:
                 os.symlink(settings["path"], link_name)
             except OSError as e:
                 if e.errno != errno.EEXIST:
                     raise
     # Remove empty directories that could not be excluded when extracting tar due to
     # tar's limitations in exclude parameter behavior
     tsnames = [os.path.join("tablespaces", tsname) for tsname in self.tablespaces.keys()]
     for exclude in tsnames + ["tablespaces"]:
         with compat.suppress(OSError):
             os.rmdir(os.path.join(self.pgdata, exclude))
Example #14
    def handle_upload(self, site, key, file_to_transfer):
        try:
            storage = self.get_object_storage(site)
            unlink_local = False
            if "blob" in file_to_transfer:
                storage.store_file_from_memory(key, file_to_transfer["blob"], metadata=file_to_transfer["metadata"])
            else:
                # Basebackups may be multipart uploads, depending on the driver.
                # Swift needs to know about this so it can do possible cleanups.
                multipart = file_to_transfer["filetype"] == "basebackup"
                try:
                    storage.store_file_from_disk(
                        key, file_to_transfer["local_path"], metadata=file_to_transfer["metadata"], multipart=multipart
                    )
                    unlink_local = True
                except LocalFileIsRemoteFileError:
                    pass
            if unlink_local:
                try:
                    self.log.debug("Deleting file: %r since it has been uploaded", file_to_transfer["local_path"])
                    os.unlink(file_to_transfer["local_path"])
                    metadata_path = file_to_transfer["local_path"] + ".metadata"
                    with suppress(FileNotFoundError):
                        os.unlink(metadata_path)
                except Exception as ex:  # pylint: disable=broad-except
                    self.log.exception("Problem in deleting file: %r", file_to_transfer["local_path"])
                    self.stats.unexpected_exception(ex, where="handle_upload_unlink")
            return {"success": True, "opaque": file_to_transfer.get("opaque")}
        except Exception as ex:  # pylint: disable=broad-except
            self.log.exception("Problem in moving file: %r, need to retry", file_to_transfer["local_path"])
            self.stats.unexpected_exception(ex, where="handle_upload")
            # Sleep for a bit to avoid busy looping
            time.sleep(0.5)

            file_to_transfer["retries"] = file_to_transfer.get("retries", 0) + 1
            if file_to_transfer["retries"] > self.config["upload_retries_warning_limit"]:
                create_alert_file(self.config, "upload_retries_warning")

            self.transfer_queue.put(file_to_transfer)
            return {"success": False, "call_callback": False, "exception": ex}
Example #15
    def handle_upload(self, site, key, file_to_transfer):
        try:
            storage = self.get_object_storage(site)
            unlink_local = False
            if "blob" in file_to_transfer:
                storage.store_file_from_memory(key, file_to_transfer["blob"],
                                               metadata=file_to_transfer["metadata"])
            else:
                # Basebackups may be multipart uploads, depending on the driver.
                # Swift needs to know about this so it can do possible cleanups.
                multipart = file_to_transfer["filetype"] == "basebackup"
                try:
                    storage.store_file_from_disk(key, file_to_transfer["local_path"],
                                                 metadata=file_to_transfer["metadata"],
                                                 multipart=multipart)
                    unlink_local = True
                except LocalFileIsRemoteFileError:
                    pass
            if unlink_local:
                try:
                    self.log.debug("Deleting file: %r since it has been uploaded", file_to_transfer["local_path"])
                    os.unlink(file_to_transfer["local_path"])
                    metadata_path = file_to_transfer["local_path"] + ".metadata"
                    with suppress(FileNotFoundError):
                        os.unlink(metadata_path)
                except:  # pylint: disable=bare-except
                    self.log.exception("Problem in deleting file: %r", file_to_transfer["local_path"])
            return {"success": True, "opaque": file_to_transfer.get("opaque")}
        except Exception as ex:  # pylint: disable=broad-except
            self.log.exception("Problem in moving file: %r, need to retry", file_to_transfer["local_path"])
            # Sleep for a bit to avoid busy looping
            time.sleep(0.5)

            file_to_transfer["retries"] = file_to_transfer.get("retries", 0) + 1
            if file_to_transfer["retries"] > self.config["upload_retries_warning_limit"]:
                create_alert_file(self.config, "upload_retries_warning")

            self.transfer_queue.put(file_to_transfer)
            return {"success": False, "call_callback": False, "exception": ex}
Example #16
    def fetch_all(self):
        for retry in range(3):
            try:
                with self.manager_class() as manager:
                    self._setup_progress_tracking(manager)
                    with self.pool_class(processes=self._process_count()) as pool:
                        self._queue_jobs(pool)
                        self._wait_for_jobs_to_complete()
                        break
            except TimeoutError:
                self.pending_jobs.clear()
                self.last_progress_ts = time.monotonic()
                if self.errors:
                    break
                elif retry == 2:
                    self.log.error("Download stalled despite retries, aborting")
                    self.errors = 1
                    break

        if self.errors:
            raise RestoreError("Backup download/extraction failed with {} errors".format(self.errors))
        self._create_tablespace_symlinks()
        with compat.suppress(OSError):
            os.rmdir(os.path.join(self.pgdata, "pgdata"))
Example #17
    def _prefetch(self, site, filetype, names):
        if not names:
            return

        start_time = time.monotonic()
        callback_queue = Queue()

        site_config = self.server.config["backup_sites"][site]
        xlog_dir = site_config["pg_xlog_directory"]
        downloads = {}
        for obname in names:
            if obname in self.server.prefetch_404:
                continue  # previously failed to prefetch this file, don't try again
            prefetch_target_path = os.path.join(xlog_dir, "{}.pghoard.prefetch".format(obname))
            if os.path.exists(prefetch_target_path):
                continue  # already fetched this file
            try:
                fd, tmp_target_path = tempfile.mkstemp(prefix="{}/{}.".format(xlog_dir, obname), suffix=".pghoard.tmp")
                os.close(fd)
            except OSError as ex:
                self.server.log.error("Unable to create temporary file to prefetch %r: %s: %s",
                                      obname, ex.__class__.__name__, ex)
                continue
            self.server.log.debug("Prefetching site: %r, filename: %r, filetype: %r, tmp_target_path: %r",
                                  site, obname, filetype, tmp_target_path)
            downloads[obname] = tmp_target_path
            self.server.transfer_queue.put({
                "callback_queue": callback_queue,
                "filetype": filetype,
                "local_path": obname,
                "opaque": obname,
                "site": site,
                "target_path": tmp_target_path,
                "type": "DOWNLOAD",
            })

        # allow something else to happen
        try:
            yield
        finally:
            # process results (timeout is 30 seconds after start but at least 5 seconds)
            timeout_at = max(start_time + 30, time.monotonic() + 5)
            while downloads:
                time_left = timeout_at - time.monotonic()
                try:
                    response = callback_queue.get(timeout=time_left)
                except Empty:
                    break  # timeout
                obname = response["opaque"]
                tmp_target_path = downloads.pop(response["opaque"])
                if response["success"]:
                    prefetch_target_path = os.path.join(xlog_dir, "{}.pghoard.prefetch".format(obname))
                    os.rename(tmp_target_path, prefetch_target_path)
                    self.server.log.debug("Prefetched %r %r to %r, took: %.3fs",
                                          site, obname, prefetch_target_path, time.monotonic() - start_time)
                else:
                    ex = response.get("exception", Error)
                    if isinstance(ex, FileNotFoundFromStorageError):
                        # don't try prefetching this file again
                        self.server.prefetch_404.append(obname)
                    self.server.log.debug("Prefetching %r %r failed (%s), took: %.3fs",
                                          site, obname, ex.__class__.__name__, time.monotonic() - start_time)
                    with suppress(Exception):
                        os.unlink(tmp_target_path)

            # everything else timed out
            while downloads:
                obname, tmp_target_path = downloads.popitem()
                self.server.log.debug("Prefetching %r %r timed out, took: %.3fs",
                                      site, obname, time.monotonic() - start_time)
                with suppress(Exception):
                    os.unlink(tmp_target_path)
Example #18
def setup_pg():
    tmpdir_obj = py_path.local(tempfile.mkdtemp(prefix="pghoard_dbtest_"))
    tmpdir = str(tmpdir_obj)
    # try to find the binaries for these versions in some path
    pgdata = os.path.join(tmpdir, "pgdata")
    db = PGTester(pgdata)  # pylint: disable=redefined-outer-name
    db.run_cmd("initdb", "-D", pgdata, "--encoding", "utf-8")
    # NOTE: does not use TCP ports, no port conflicts
    db.user = dict(host=pgdata, user="******", password="******", dbname="postgres", port="5432")
    # NOTE: point $HOME to tmpdir - $HOME shouldn't affect most tests, but
    # psql tries to find .pgpass file from there, as do our functions that
    # manipulate pgpass.  By pointing $HOME there we make sure we're not
    # making persistent changes to the environment.
    os.environ["HOME"] = tmpdir
    # allow replication connections
    with open(os.path.join(pgdata, "pg_hba.conf"), "w") as fp:
        fp.write(
            "local all disabled reject\n"
            "local all passwordy md5\n"
            "local all all trust\n"
            "local replication disabled reject\n"
            "local replication passwordy md5\n"
            "local replication all trust\n"
        )
    # rewrite postgresql.conf
    with open(os.path.join(pgdata, "postgresql.conf"), "r+") as fp:
        lines = fp.read().splitlines()
        fp.seek(0)
        fp.truncate()
        config = {}
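        # Parse the existing key = value lines (dropping comments) so the overrides below replace them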
        for line in lines:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            key, val = re.split(r"\s*=\s*", line, maxsplit=1)
            config[key] = re.sub(r"\s*(#.*)?$", "", val)
        config.update({
            "hot_standby": "on",
            "logging_collector": "off",
            "max_wal_senders": 2,
            "wal_keep_segments": 100,
            "wal_level": "hot_standby",
            # disable fsync and synchronous_commit to speed up the tests a bit
            "fsync": "off",
            "synchronous_commit": "off",
            # don't need to wait for autovacuum workers when shutting down
            "autovacuum": "off",
        })
        lines = ["{} = {}\n".format(key, val) for key, val in sorted(config.items())]  # noqa
        fp.write("".join(lines))
    # now start pg and create test users
    db.run_pg()
    try:
        db.run_cmd("createuser", "-h", db.user["host"], "-p", db.user["port"], "disabled")
        db.run_cmd("createuser", "-h", db.user["host"], "-p", db.user["port"], "passwordy")
        db.run_cmd("createuser", "-h", db.user["host"], "-p", db.user["port"], "-s", db.user["user"])
        yield db
    finally:
        db.kill()
        with suppress(Exception):
            tmpdir_obj.remove(rec=1)
Example #19
    def _prefetch(self, site, filetype, names):
        if not names:
            return

        start_time = time.monotonic()
        callback_queue = Queue()

        site_config = self.server.config["backup_sites"][site]
        xlog_dir = get_pg_wal_directory(site_config)
        downloads = {}
        for obname in names:
            if obname in self.server.prefetch_404:
                continue  # previously failed to prefetch this file, don't try again
            prefetch_target_path = os.path.join(
                xlog_dir, "{}.pghoard.prefetch".format(obname))
            if os.path.exists(prefetch_target_path):
                continue  # already fetched this file
            try:
                fd, tmp_target_path = tempfile.mkstemp(prefix="{}/{}.".format(
                    xlog_dir, obname),
                                                       suffix=".pghoard.tmp")
                os.close(fd)
            except OSError as ex:
                self.server.log.error(
                    "Unable to create temporary file to prefetch %r: %s: %s",
                    obname, ex.__class__.__name__, ex)
                continue
            self.server.log.debug(
                "Prefetching site: %r, filename: %r, filetype: %r, tmp_target_path: %r",
                site, obname, filetype, tmp_target_path)
            downloads[obname] = tmp_target_path
            self.server.transfer_queue.put({
                "callback_queue": callback_queue,
                "filetype": filetype,
                "local_path": obname,
                "opaque": obname,
                "site": site,
                "target_path": tmp_target_path,
                "type": "DOWNLOAD",
            })

        # allow something else to happen
        try:
            yield
        finally:
            # process results (timeout is 30 seconds after start but at least 5 seconds)
            timeout_at = max(start_time + 30, time.monotonic() + 5)
            while downloads:
                time_left = timeout_at - time.monotonic()
                try:
                    response = callback_queue.get(timeout=time_left)
                except Empty:
                    break  # timeout
                obname = response["opaque"]
                tmp_target_path = downloads.pop(response["opaque"])
                if response["success"]:
                    prefetch_target_path = os.path.join(
                        xlog_dir, "{}.pghoard.prefetch".format(obname))
                    os.rename(tmp_target_path, prefetch_target_path)
                    self.server.log.debug(
                        "Prefetched %r %r to %r, took: %.3fs", site, obname,
                        prefetch_target_path,
                        time.monotonic() - start_time)
                else:
                    ex = response.get("exception", Error)
                    if isinstance(ex, FileNotFoundFromStorageError):
                        # don't try prefetching this file again
                        self.server.prefetch_404.append(obname)
                    self.server.log.debug(
                        "Prefetching %r %r failed (%s), took: %.3fs", site,
                        obname, ex.__class__.__name__,
                        time.monotonic() - start_time)
                    with suppress(Exception):
                        os.unlink(tmp_target_path)

            # everything else timed out
            while downloads:
                obname, tmp_target_path = downloads.popitem()
                self.server.log.debug(
                    "Prefetching %r %r timed out, took: %.3fs", site, obname,
                    time.monotonic() - start_time)
                with suppress(Exception):
                    os.unlink(tmp_target_path)
Example #20
    def get_wal_or_timeline_file(self, site, filename, filetype):
        target_path = self.headers.get("x-pghoard-target-path")
        if not target_path:
            raise HttpResponse("x-pghoard-target-path header missing from download", status=400)

        self._process_completed_download_operations()

        # See if we have already prefetched the file
        site_config = self.server.config["backup_sites"][site]
        xlog_dir = get_pg_wal_directory(site_config)
        prefetch_target_path = os.path.join(xlog_dir, "{}.pghoard.prefetch".format(filename))
        if os.path.exists(prefetch_target_path):
            ex = self._try_save_and_verify_restored_file(filetype, filename, prefetch_target_path, target_path)
            if not ex:
                self._create_prefetch_operations(site, filetype, filename)
                raise HttpResponse(status=201)

        # After reaching a recovery_target and restart of a PG server, PG wants to replay and refetch
        # files from the archive starting from the latest checkpoint. We have potentially fetched these files
        # already earlier. Check if we have the files already and if we do, don't go over the network to refetch
        # them yet again but just rename them to the path that PG is requesting.
        xlog_path = os.path.join(xlog_dir, filename)
        if os.path.exists(xlog_path):
            self.server.log.info("Requested %r, found it in pg_xlog directory as: %r, returning directly",
                                 filename, xlog_path)
            ex = self._try_save_and_verify_restored_file(filetype, filename, xlog_path, target_path, unlink=False)
            if ex:
                self.server.log.warning("Found file: %r but it was invalid: %s", xlog_path, ex)
            else:
                raise HttpResponse(status=201)

        key = self._make_file_key(site, filetype, filename)
        with suppress(ValueError):
            self.server.prefetch_404.remove(key)
        self._create_fetch_operation(key, site, filetype, filename, max_age=5, suppress_error=False)
        self._create_prefetch_operations(site, filetype, filename)

        last_schedule_call = time.monotonic()
        start_time = time.monotonic()
        retries = 2
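        # Poll for up to 30 seconds, re-issuing the fetch if the pending operation disappears or grows stale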
        while (time.monotonic() - start_time) <= 30:
            self._process_completed_download_operations(timeout=0.01)
            with self.server.lock:
                if os.path.isfile(prefetch_target_path):
                    ex = self._try_save_and_verify_restored_file(filetype, filename, prefetch_target_path, target_path)
                    if not ex:
                        raise HttpResponse(status=201)
                    elif retries == 0:
                        raise ex  # pylint: disable=raising-bad-type
                    retries -= 1
            if key in self.server.prefetch_404:
                raise HttpResponse(status=404)
            with self.server.lock:
                if key not in self.server.pending_download_ops:
                    if retries == 0:
                        raise HttpResponse(status=500)
                    retries -= 1
                    self._create_fetch_operation(key, site, filetype, filename, suppress_error=False)
            if time.monotonic() - last_schedule_call >= 1:
                last_schedule_call = time.monotonic()
                # Replace existing download operation if it has been executing for too long
                self._create_fetch_operation(key, site, filetype, filename, max_age=10, suppress_error=False)

        raise HttpResponse("TIMEOUT", status=500)
Example #21
    def get_wal_or_timeline_file(self, site, filename, filetype):
        target_path = self.headers.get("x-pghoard-target-path")
        if not target_path:
            raise HttpResponse(
                "x-pghoard-target-path header missing from download",
                status=400)

        self._process_completed_download_operations()

        # See if we have already prefetched the file
        site_config = self.server.config["backup_sites"][site]
        xlog_dir = get_pg_wal_directory(site_config)
        prefetch_target_path = os.path.join(
            xlog_dir, "{}.pghoard.prefetch".format(filename))
        if os.path.exists(prefetch_target_path):
            ex = self._try_save_and_verify_restored_file(
                filetype, filename, prefetch_target_path, target_path)
            if not ex:
                self._create_prefetch_operations(site, filetype, filename)
                raise HttpResponse(status=201)

        # After reaching a recovery_target and restart of a PG server, PG wants to replay and refetch
        # files from the archive starting from the latest checkpoint. We have potentially fetched these files
        # already earlier. Check if we have the files already and if we do, don't go over the network to refetch
        # them yet again but just rename them to the path that PG is requesting.
        xlog_path = os.path.join(xlog_dir, filename)
        if os.path.exists(xlog_path):
            self.server.log.info(
                "Requested %r, found it in pg_xlog directory as: %r, returning directly",
                filename, xlog_path)
            ex = self._try_save_and_verify_restored_file(filetype,
                                                         filename,
                                                         xlog_path,
                                                         target_path,
                                                         unlink=False)
            if ex:
                self.server.log.warning(
                    "Found file: %r but it was invalid: %s", xlog_path, ex)
            else:
                raise HttpResponse(status=201)

        key = self._make_file_key(site, filetype, filename)
        with suppress(ValueError):
            self.server.prefetch_404.remove(key)
        self._create_fetch_operation(key,
                                     site,
                                     filetype,
                                     filename,
                                     max_age=5,
                                     suppress_error=False)
        self._create_prefetch_operations(site, filetype, filename)

        last_schedule_call = time.monotonic()
        start_time = time.monotonic()
        retries = 2
        while (time.monotonic() - start_time) <= 30:
            self._process_completed_download_operations(timeout=0.01)
            with self.server.lock:
                if os.path.isfile(prefetch_target_path):
                    ex = self._try_save_and_verify_restored_file(
                        filetype, filename, prefetch_target_path, target_path)
                    if not ex:
                        raise HttpResponse(status=201)
                    elif retries == 0:
                        raise ex  # pylint: disable=raising-bad-type
                    retries -= 1
            if key in self.server.prefetch_404:
                raise HttpResponse(status=404)
            with self.server.lock:
                if key not in self.server.pending_download_ops:
                    if retries == 0:
                        raise HttpResponse(status=500)
                    retries -= 1
                    self._create_fetch_operation(key,
                                                 site,
                                                 filetype,
                                                 filename,
                                                 suppress_error=False)
            if time.monotonic() - last_schedule_call >= 1:
                last_schedule_call = time.monotonic()
                # Replace existing download operation if it has been executing for too long
                self._create_fetch_operation(key,
                                             site,
                                             filetype,
                                             filename,
                                             max_age=10,
                                             suppress_error=False)

        raise HttpResponse("TIMEOUT", status=500)
Example #22
    def handle_site(self, site, site_config):
        self.set_state_defaults(site)
        xlog_path, basebackup_path = self.create_backup_site_paths(site)

        if not site_config["active"]:
            return  # If a site has been marked inactive, don't bother checking anything

        self._cleanup_inactive_receivexlogs(site)

        chosen_backup_node = random.choice(site_config["nodes"])

        if site not in self.receivexlogs and site not in self.walreceivers:
            if site_config["active_backup_mode"] == "pg_receivexlog":
                self.receivexlog_listener(site, chosen_backup_node,
                                          xlog_path + "_incoming")
            elif site_config["active_backup_mode"] == "walreceiver":
                state_file_path = self.config["json_state_file_path"]
                walreceiver_state = {}
                with suppress(FileNotFoundError):
                    with open(state_file_path, "r") as fp:
                        old_state_file = json.load(fp)
                        walreceiver_state = old_state_file.get(
                            "walreceivers", {}).get(site, {})
                self.start_walreceiver(
                    site=site,
                    chosen_backup_node=chosen_backup_node,
                    last_flushed_lsn=walreceiver_state.get("last_flushed_lsn"))

        if site not in self.time_of_last_backup_check or \
                time.monotonic() - self.time_of_last_backup_check[site] > 300:
            self.time_of_last_backup[site] = self.check_backup_count_and_state(
                site)
            self.time_of_last_backup_check[site] = time.monotonic()

        # check if a basebackup is running, or if a basebackup has just completed
        if site in self.basebackups:
            try:
                result = self.basebackups_callbacks[site].get(block=False)
            except Empty:
                # previous basebackup (or its compression and upload) still in progress
                return
            if self.basebackups[site].is_alive():
                self.basebackups[site].join()
            del self.basebackups[site]
            del self.basebackups_callbacks[site]
            self.log.debug("Basebackup has finished for %r: %r", site, result)
            self.time_of_last_backup[site] = self.check_backup_count_and_state(
                site)
            self.time_of_last_backup_check[site] = time.monotonic()

        new_backup_needed = False
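        # Decide whether a new basebackup is needed: explicit request, none yet, or schedule elapsed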
        if site in self.requested_basebackup_sites:
            self.log.info("Creating a new basebackup for %r due to request",
                          site)
            self.requested_basebackup_sites.discard(site)
            new_backup_needed = True
        elif site_config["basebackup_interval_hours"] is None:
            # Basebackups are disabled for this site (but they can still be requested over the API.)
            pass
        elif self.time_of_last_backup.get(site) is None:
            self.log.info(
                "Creating a new basebackup for %r because there are currently none",
                site)
            new_backup_needed = True
        else:
            delta_since_last_backup = datetime.datetime.now(
                datetime.timezone.utc) - self.time_of_last_backup[site]
            if delta_since_last_backup >= datetime.timedelta(
                    hours=site_config["basebackup_interval_hours"]):
                self.log.info(
                    "Creating a new basebackup for %r by schedule (%s from previous)",
                    site, delta_since_last_backup)
                new_backup_needed = True

        if new_backup_needed:
            self.basebackups_callbacks[site] = Queue()
            self.create_basebackup(site, chosen_backup_node, basebackup_path,
                                   self.basebackups_callbacks[site])
Example #23
def setup_pg():
    tmpdir_obj = py_path.local(tempfile.mkdtemp(prefix="pghoard_dbtest_"))
    tmpdir = str(tmpdir_obj)
    # try to find the binaries for these versions in some path
    pgdata = os.path.join(tmpdir, "pgdata")
    db = TestPG(pgdata)  # pylint: disable=redefined-outer-name
    db.run_cmd("initdb", "-D", pgdata, "--encoding", "utf-8")
    # NOTE: does not use TCP ports, no port conflicts
    db.user = dict(host=pgdata,
                   user="******",
                   password="******",
                   dbname="postgres",
                   port="5432")
    # NOTE: point $HOME to tmpdir - $HOME shouldn't affect most tests, but
    # psql tries to find .pgpass file from there, as do our functions that
    # manipulate pgpass.  By pointing $HOME there we make sure we're not
    # making persistent changes to the environment.
    os.environ["HOME"] = tmpdir
    # allow replication connections
    with open(os.path.join(pgdata, "pg_hba.conf"), "w") as fp:
        fp.write("local all disabled reject\n"
                 "local all passwordy md5\n"
                 "local all all trust\n"
                 "local replication disabled reject\n"
                 "local replication passwordy md5\n"
                 "local replication all trust\n")
    # rewrite postgresql.conf
    with open(os.path.join(pgdata, "postgresql.conf"), "r+") as fp:
        lines = fp.read().splitlines()
        fp.seek(0)
        fp.truncate()
        config = {}
        for line in lines:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            key, val = re.split(r"\s*=\s*", line, maxsplit=1)
            config[key] = re.sub(r"\s*(#.*)?$", "", val)
        config.update({
            "hot_standby": "on",
            "logging_collector": "off",
            "max_wal_senders": 2,
            "wal_keep_segments": 100,
            "wal_level": "hot_standby",
            # disable fsync and synchronous_commit to speed up the tests a bit
            "fsync": "off",
            "synchronous_commit": "off",
            # don't need to wait for autovacuum workers when shutting down
            "autovacuum": "off",
        })
        lines = [
            "{} = {}\n".format(key, val) for key, val in sorted(config.items())
        ]  # noqa
        fp.write("".join(lines))
    # now start pg and create test users
    db.run_pg()
    try:
        db.run_cmd("createuser", "-h", db.user["host"], "-p", db.user["port"],
                   "disabled")
        db.run_cmd("createuser", "-h", db.user["host"], "-p", db.user["port"],
                   "passwordy")
        db.run_cmd("createuser", "-h", db.user["host"], "-p", db.user["port"],
                   "-s", db.user["user"])
        yield db
    finally:
        db.kill()
        with suppress(Exception):
            tmpdir_obj.remove(rec=1)
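
Because setup_pg yields a live database and then cleans up in a finally block, it has the shape of a pytest fixture. A sketch of how it might be registered and consumed (the fixture wrapper, the test, and the psycopg2 usage are assumptions, not shown in the source):

import pytest
import psycopg2  # assumption: any libpq-based client would do

@pytest.fixture(scope="session", name="db")
def db_fixture():
    # Delegate to the generator above; teardown runs after the yield.
    yield from setup_pg()

def test_can_connect(db):
    # db.user maps directly onto libpq connection keyword arguments;
    # the pg_hba.conf written above trusts local connections for this user.
    conn = psycopg2.connect(**db.user)
    with conn.cursor() as cursor:
        cursor.execute("SELECT 1")
        assert cursor.fetchone() == (1,)
    conn.close()
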
Example #30
0
    def handle_site(self, site, site_config):
        self.set_state_defaults(site)
        xlog_path, basebackup_path = self.create_backup_site_paths(site)

        if not site_config["active"]:
            return  # If a site has been marked inactive, don't bother checking anything

        self._cleanup_inactive_receivexlogs(site)

        chosen_backup_node = random.choice(site_config["nodes"])

        if site not in self.receivexlogs and site not in self.walreceivers:
            if site_config["active_backup_mode"] == "pg_receivexlog":
                self.receivexlog_listener(site, chosen_backup_node, xlog_path + "_incoming")
            elif site_config["active_backup_mode"] == "walreceiver":
                state_file_path = self.config["json_state_file_path"]
                walreceiver_state = {}
                with suppress(FileNotFoundError):
                    with open(state_file_path, "r") as fp:
                        old_state_file = json.load(fp)
                        walreceiver_state = old_state_file.get("walreceivers", {}).get(site, {})
                self.start_walreceiver(
                    site=site,
                    chosen_backup_node=chosen_backup_node,
                    last_flushed_lsn=walreceiver_state.get("last_flushed_lsn"))

        last_check_time = self.time_of_last_backup_check.get(site)
        if not last_check_time or (time.monotonic() - self.time_of_last_backup_check[site]) > 300:
            self.time_of_last_backup[site] = self.check_backup_count_and_state(site)
            self.time_of_last_backup_check[site] = time.monotonic()

        # check if a basebackup is running, or if a basebackup has just completed
        if site in self.basebackups:
            try:
                result = self.basebackups_callbacks[site].get(block=False)
            except Empty:
                # previous basebackup (or its compression and upload) still in progress
                return
            if self.basebackups[site].is_alive():
                self.basebackups[site].join()
            del self.basebackups[site]
            del self.basebackups_callbacks[site]
            self.log.debug("Basebackup has finished for %r: %r", site, result)
            self.time_of_last_backup[site] = self.check_backup_count_and_state(site)
            self.time_of_last_backup_check[site] = time.monotonic()

        new_backup_needed = False
        if site in self.requested_basebackup_sites:
            self.log.info("Creating a new basebackup for %r due to request", site)
            self.requested_basebackup_sites.discard(site)
            new_backup_needed = True
        elif site_config["basebackup_interval_hours"] is None:
            # Basebackups are disabled for this site (but they can still be requested over the API).
            pass
        elif self.time_of_last_backup.get(site) is None:
            self.log.info("Creating a new basebackup for %r because there are currently none", site)
            new_backup_needed = True
        else:
            delta_since_last_backup = datetime.datetime.now(datetime.timezone.utc) - self.time_of_last_backup[site]
            if delta_since_last_backup >= datetime.timedelta(hours=site_config["basebackup_interval_hours"]):
                self.log.info("Creating a new basebackup for %r by schedule (%s from previous)",
                              site, delta_since_last_backup)
                new_backup_needed = True

        if new_backup_needed and not os.path.exists(self.config["maintenance_mode_file"]):
            self.basebackups_callbacks[site] = Queue()
            self.create_basebackup(site, chosen_backup_node, basebackup_path, self.basebackups_callbacks[site])
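
Compared to the earlier version, this one adds a single guard: no new basebackup is started while a maintenance-mode marker file exists. The mechanism is just a filesystem flag; a minimal sketch of the pattern (the path literal below is illustrative):

import os

def backups_paused(config):
    # Only the existence of the marker file matters, not its contents.
    return os.path.exists(config["maintenance_mode_file"])

# Operator sketch: touch the file to pause scheduling, remove it to resume.
config = {"maintenance_mode_file": "/tmp/pghoard_maintenance"}  # illustrative
open(config["maintenance_mode_file"], "w").close()  # pause
os.unlink(config["maintenance_mode_file"])          # resume
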
Example #31
0
    def handle_site(self, site, site_config):
        self.set_state_defaults(site)
        xlog_path, basebackup_path = self.create_backup_site_paths(site)

        if not site_config["active"]:
            return  # If a site has been marked inactive, don't bother checking anything

        self._cleanup_inactive_receivexlogs(site)

        chosen_backup_node = random.choice(site_config["nodes"])

        if site not in self.receivexlogs and site not in self.walreceivers:
            if site_config["active_backup_mode"] == "pg_receivexlog":
                self.receivexlog_listener(site, chosen_backup_node,
                                          xlog_path + "_incoming")
            elif site_config["active_backup_mode"] == "walreceiver":
                state_file_path = self.config["json_state_file_path"]
                walreceiver_state = {}
                with suppress(FileNotFoundError):
                    with open(state_file_path, "r") as fp:
                        old_state_file = json.load(fp)
                        walreceiver_state = old_state_file.get(
                            "walreceivers", {}).get(site, {})
                self.start_walreceiver(
                    site=site,
                    chosen_backup_node=chosen_backup_node,
                    last_flushed_lsn=walreceiver_state.get("last_flushed_lsn"))

        last_check_time = self.time_of_last_backup_check.get(site)
        if not last_check_time or (time.monotonic() -
                                   self.time_of_last_backup_check[site]) > 300:
            self.refresh_backup_list_and_delete_old(site)
            self.time_of_last_backup_check[site] = time.monotonic()

        # check if a basebackup is running, or if a basebackup has just completed
        if site in self.basebackups:
            try:
                result = self.basebackups_callbacks[site].get(block=False)
                if result["success"]:
                    # Regardless of backup mode, reset the failure counter on success
                    self.delta_backup_failures.pop(site, None)
                elif site_config["basebackup_mode"] == BaseBackupMode.delta:
                    last_failed_time = utc_now()
                    if site not in self.delta_backup_failures:
                        self.delta_backup_failures[site] = DeltaBaseBackupFailureInfo(
                            retries=0, last_failed_time=last_failed_time)
                    else:
                        self.delta_backup_failures[site].retries += 1
                        self.delta_backup_failures[site].last_failed_time = last_failed_time
            except Empty:
                # previous basebackup (or its compression and upload) still in progress
                return
            if self.basebackups[site].is_alive():
                self.basebackups[site].join()
            del self.basebackups[site]
            del self.basebackups_callbacks[site]
            self.log.debug("Basebackup has finished for %r: %r", site, result)
            self.refresh_backup_list_and_delete_old(site)
            self.time_of_last_backup_check[site] = time.monotonic()

        metadata = self.get_new_backup_details(site=site, site_config=site_config)
        if metadata and not os.path.exists(self.config["maintenance_mode_file"]):
            if site in self.delta_backup_failures:
                retries = self.delta_backup_failures[site].retries
                if retries > site_config["basebackup_delta_mode_max_retries"]:
                    self.log.info("Giving up backup after exceeding max retries: %r", retries)
                    return
                else:
                    # Start from ~2 min, with a cap of one hour
                    retry_interval = min(2 ** (retries + 7), 60 * 60)
                    next_attempt = (self.delta_backup_failures[site].last_failed_time +
                                    datetime.timedelta(seconds=retry_interval))
                    if utc_now() >= next_attempt:
                        self.log.info("Re-trying delta basebackup")
                    else:
                        self.log.info("Waiting for backoff time before re-trying new delta backup "
                                      "due to previous failures")
                        return

            self.basebackups_callbacks[site] = Queue()
            self.create_basebackup(site, chosen_backup_node, basebackup_path,
                                   self.basebackups_callbacks[site], metadata)
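
The retry_interval formula above, min(2 ** (retries + 7), 60 * 60), doubles the wait after each failed delta backup, starting at 128 seconds (about two minutes) and capping at one hour. A quick standalone check of the schedule it produces:

# Worked example of the delta-backup backoff used above: retries -> seconds
for retries in range(7):
    print(retries, min(2 ** (retries + 7), 60 * 60))
# prints: 0 128, 1 256, 2 512, 3 1024, 4 2048, 5 3600 (capped), 6 3600
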
Example #32
0
def delete_alert_file(config, filename):
    filepath = os.path.join(config["alert_file_dir"], filename)
    with suppress(FileNotFoundError):
        os.unlink(filepath)
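
Because suppress(FileNotFoundError) swallows the missing-file case, delete_alert_file is idempotent: clearing an alert that was never raised is a no-op. A hypothetical call (both the directory and the alert name below are illustrative):

config = {"alert_file_dir": "/var/lib/pghoard/alerts"}  # illustrative path
delete_alert_file(config, "replication_delay_error")    # hypothetical alert name
delete_alert_file(config, "replication_delay_error")    # repeating it is harmless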