Example #1
def http_restore(pghoard):
    pgdata = get_pg_wal_directory(
        pghoard.config["backup_sites"][pghoard.test_site])
    return HTTPRestore("localhost",
                       pghoard.config["http_port"],
                       site=pghoard.test_site,
                       pgdata=pgdata)
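
All of these snippets resolve a backup site's configuration to PostgreSQL's WAL directory via get_pg_wal_directory. As a rough sketch of what such a helper does (the key names "pg_data_directory" and "pg_data_directory_version" are assumptions here, not pghoard's confirmed schema):

import os

def get_pg_wal_directory_sketch(site_config):
    # Hypothetical stand-in for pghoard's helper; the config key names are assumed.
    pgdata = site_config["pg_data_directory"]
    version = str(site_config.get("pg_data_directory_version") or "10")
    # PostgreSQL 10 renamed the WAL directory from pg_xlog to pg_wal.
    wal_subdir = "pg_wal" if int(version.split(".")[0]) >= 10 else "pg_xlog"
    return os.path.join(pgdata, wal_subdir)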
Example #2
 def test_archive_command_with_invalid_file(self, pghoard):
     # only WAL and timeline (.history) files can be archived
     bl_label = "000000010000000000000002.00000028.backup"
     bl_file = "xlog/{}".format(bl_label)
     wal_path = os.path.join(
         get_pg_wal_directory(
             pghoard.config["backup_sites"][pghoard.test_site]), bl_label)
     backup_wal_path = os.path.join(pghoard.config["backup_location"],
                                    pghoard.test_site, bl_file)
     with open(wal_path, "w") as fp:
         fp.write("jee")
     # backup labels are ignored - archiving returns success but file won't appear on disk
     archive_command(host="127.0.0.1",
                     port=pghoard.config["http_port"],
                     site=pghoard.test_site,
                     xlog=bl_label)
     assert not os.path.exists(backup_wal_path)
     # any other files raise an error
     with pytest.raises(postgres_command.PGCError) as excinfo:
         archive_command(host="127.0.0.1",
                         port=pghoard.config["http_port"],
                         site=pghoard.test_site,
                         xlog=bl_label + ".x")
     assert excinfo.value.exit_code == postgres_command.EXIT_ARCHIVE_FAIL
     assert not os.path.exists(backup_wal_path + ".x")
Example #3
 def test_get_encrypted_archived_file(self, pghoard):
     wal_seg = "000000090000000000000010"
     content = wal_header_for_file(wal_seg)
     compressor = pghoard.Compressor()
     compressed_content = compressor.compress(content) + (compressor.flush() or b"")
     encryptor = Encryptor(CONSTANT_TEST_RSA_PUBLIC_KEY)
     encrypted_content = encryptor.update(compressed_content) + encryptor.finalize()
     wal_dir = get_pg_wal_directory(pghoard.config["backup_sites"][pghoard.test_site])
     archive_path = os.path.join(pghoard.test_site, "xlog", wal_seg)
     metadata = {
         "compression-algorithm": pghoard.config["compression"]["algorithm"],
         "original-file-size": len(content),
         "encryption-key-id": "testkey",
     }
     store = pghoard.transfer_agents[0].get_object_storage(pghoard.test_site)
     store.store_file_from_memory(archive_path, encrypted_content, metadata=metadata)
     pghoard.webserver.config["backup_sites"][pghoard.test_site]["encryption_keys"] = {
         "testkey": {
             "public": CONSTANT_TEST_RSA_PUBLIC_KEY,
             "private": CONSTANT_TEST_RSA_PRIVATE_KEY,
         },
     }
     restore_target = os.path.join(wal_dir, wal_seg)
     restore_command(site=pghoard.test_site, xlog=wal_seg, output=restore_target,
                     host="127.0.0.1", port=pghoard.config["http_port"])
     assert os.path.exists(restore_target)
     with open(restore_target, "rb") as fp:
         restored_data = fp.read()
     assert content == restored_data
Example #4
    def check_and_upload_missing_local_files(self):
        current_wal_file = self.get_current_wal_file()
        first_required_wal_file, _ = self.get_first_required_wal_segment()

        # Find relevant WAL files.  We do this by checking archival status
        # of all WAL files older than the one currently open (i.e. reverse
        # sorted list from newest file that should've been archived to the
        # oldest on disk) and appending missing files to a list.  After
        # collecting a list we start archiving them from oldest to newest.
        # This is done so we don't break our missing archive detection logic
        # if sync is interrupted for some reason.
        # Sort all timeline files first to make sure they're always
        # archived, otherwise the timeline files are processed only after
        # all WAL files for a given timeline have been handled.
        wal_dir = get_pg_wal_directory(self.backup_site)
        wal_files = os.listdir(wal_dir)
        wal_files.sort(key=lambda f: (f.endswith(".history"), f), reverse=True)
        need_archival = []
        for wal_file in wal_files:
            archive_type = None
            if wal.TIMELINE_RE.match(wal_file):
                # We want all timeline files
                archive_type = "TIMELINE"
            elif not wal.WAL_RE.match(wal_file):
                pass  # not a WAL or timeline file
            elif wal_file == current_wal_file:
                self.log.info("Skipping currently open WAL file %r", wal_file)
            elif wal_file > current_wal_file:
                self.log.debug("Skipping recycled WAL file %r", wal_file)
            elif first_required_wal_file is not None and wal_file < first_required_wal_file:
                self.log.info(
                    "WAL file %r is not needed for the latest basebackup",
                    wal_file)
                break
            else:
                # WAL file in range first_required_wal_file .. current_wal_file
                archive_type = "WAL"

            if archive_type:
                resp = requests.head("{base}/archive/{file}".format(
                    base=self.base_url, file=wal_file))
                if resp.status_code == 200:
                    self.log.info("%s file %r already archived", archive_type,
                                  wal_file)
                    continue
                self.log.info("%s file %r needs to be archived", archive_type,
                              wal_file)
                need_archival.append(wal_file)

        for wal_file in sorted(need_archival):  # sort oldest to newest
            resp = requests.put("{base}/archive/{file}".format(
                base=self.base_url, file=wal_file))
            archive_type = "TIMELINE" if ".history" in wal_file else "WAL"
            if resp.status_code != 201:
                self.log.error(
                    "%s file %r archival failed with status code %r",
                    archive_type, wal_file, resp.status_code)
            else:
                self.log.info("%s file %r archived", archive_type, wal_file)
Example #5
    def test_get_archived_file(self, pghoard):
        wal_seg_prev_tli = "00000001000000000000000F"
        wal_seg = "00000002000000000000000F"
        wal_file = "xlog/{}".format(wal_seg)
        # NOTE: create WAL header for the "previous" timeline, this should be accepted
        content = wal_header_for_file(wal_seg_prev_tli)
        wal_dir = get_pg_wal_directory(
            pghoard.config["backup_sites"][pghoard.test_site])
        archive_path = os.path.join(pghoard.test_site, wal_file)
        compressor = pghoard.Compressor()
        compressed_content = compressor.compress(content) + (
            compressor.flush() or b"")
        metadata = {
            "compression-algorithm":
            pghoard.config["compression"]["algorithm"],
            "original-file-size": len(content),
        }
        store = pghoard.transfer_agents[0].get_object_storage(
            pghoard.test_site)
        store.store_file_from_memory(archive_path,
                                     compressed_content,
                                     metadata=metadata)

        restore_command(site=pghoard.test_site,
                        xlog=wal_seg,
                        output=None,
                        host="127.0.0.1",
                        port=pghoard.config["http_port"])
        restore_target = os.path.join(wal_dir, wal_seg)

        restore_command(site=pghoard.test_site,
                        xlog=wal_seg,
                        output=restore_target,
                        host="127.0.0.1",
                        port=pghoard.config["http_port"])
        assert os.path.exists(restore_target) is True
        with open(restore_target, "rb") as fp:
            restored_data = fp.read()
        assert content == restored_data

        # test the same thing using restore as 'pghoard_postgres_command'
        tmp_out = os.path.join(wal_dir, restore_target + ".cmd")
        postgres_command.main([
            "--host",
            "localhost",
            "--port",
            str(pghoard.config["http_port"]),
            "--site",
            pghoard.test_site,
            "--mode",
            "restore",
            "--output",
            tmp_out,
            "--xlog",
            wal_seg,
        ])
        with open(tmp_out, "rb") as fp:
            restored_data = fp.read()
        assert content == restored_data
Example #6
    def _create_fetch_operation(self,
                                key,
                                site,
                                filetype,
                                obname,
                                max_age=-1,
                                suppress_error=True):
        with self.server.lock:
            # Don't fetch again if we already have a pending fetch operation unless the operation
            # has been ongoing longer than the given max age and has potentially become stale
            existing = self.server.pending_download_ops.get(key)
            if existing and (
                    max_age < 0
                    or time.monotonic() - existing["started_at"] <= max_age):
                return

        xlog_dir = get_pg_wal_directory(
            self.server.config["backup_sites"][site])
        prefetch_target_path = os.path.join(
            xlog_dir, "{}.pghoard.prefetch".format(obname))
        if os.path.exists(prefetch_target_path):
            return

        try:
            fd, tmp_target_path = tempfile.mkstemp(prefix="{}/{}.".format(
                xlog_dir, obname),
                                                   suffix=".pghoard.tmp")
            os.close(fd)
        except OSError as ex:
            self.server.log.error(
                "Unable to create temporary file to fetch %r: %s: %s", obname,
                ex.__class__.__name__, ex)
            if suppress_error:
                return
            else:
                raise HttpResponse(
                    "Unable to create temporary file for {0!r}: {1.__class__.__name__}: {1}"
                    .format(key, ex),
                    status=400)

        self.server.log.debug(
            "Fetching site: %r, filename: %r, filetype: %r, tmp_target_path: %r",
            site, obname, filetype, tmp_target_path)
        target_path = os.path.join(xlog_dir,
                                   "{}.pghoard.prefetch".format(obname))
        self.server.pending_download_ops[key] = dict(
            started_at=time.monotonic(),
            target_path=target_path,
        )
        self.server.transfer_queue.put({
            "callback_queue": self.server.download_results,
            "filetype": filetype,
            "local_path": obname,
            "opaque": key,
            "site": site,
            "target_path": tmp_target_path,
            "type": "DOWNLOAD",
        })
Example #7
    def test_wal_fetch_optimization(self, pghoard):
        # inject fake WAL and timeline files for testing
        invalid_wal_name = "000000060000000000000001"
        valid_wal_name = "000000060000000000000002"
        wal_name_output = "optimization_output_filename"
        output_path = os.path.join(
            get_pg_wal_directory(
                pghoard.config["backup_sites"][pghoard.test_site]),
            wal_name_output)
        invalid_wal_path = os.path.join(
            get_pg_wal_directory(
                pghoard.config["backup_sites"][pghoard.test_site]),
            invalid_wal_name)
        valid_wal_path = os.path.join(
            get_pg_wal_directory(
                pghoard.config["backup_sites"][pghoard.test_site]),
            valid_wal_name)

        with open(valid_wal_path, "wb") as out_file:
            out_file.write(
                wal_header_for_file(os.path.basename(valid_wal_path)))
        with open(invalid_wal_path, "wb") as out_file:
            # We use the wrong WAL file's name to generate the header on purpose to see that our check works
            out_file.write(
                wal_header_for_file(os.path.basename(valid_wal_path)))

        restore_command(site=pghoard.test_site,
                        xlog=os.path.basename(valid_wal_name),
                        host="127.0.0.1",
                        port=pghoard.config["http_port"],
                        output=output_path,
                        retry_interval=0.1)
        assert os.path.exists(output_path)
        os.unlink(output_path)

        with pytest.raises(postgres_command.PGCError):
            restore_command(site=pghoard.test_site,
                            xlog=os.path.basename(invalid_wal_name),
                            host="127.0.0.1",
                            port=pghoard.config["http_port"],
                            output=output_path,
                            retry_interval=0.1)
        assert not os.path.exists(output_path)
        os.unlink(invalid_wal_path)
Example #8
    def handle_archival_request(self, site, filename, filetype):
        if filetype == "basebackup":
            # Request a basebackup to be made for site
            self.server.log.debug(
                "Requesting a new basebackup for site: %r to be made", site)
            self.server.requested_basebackup_sites.add(site)
            raise HttpResponse(status=201)

        start_time = time.time()

        site_config = self.server.config["backup_sites"][site]
        xlog_dir = get_pg_wal_directory(site_config)
        xlog_path = os.path.join(xlog_dir, filename)
        self.server.log.debug("Got request to archive: %r %r %r, %r", site,
                              filetype, filename, xlog_path)
        if not os.path.exists(xlog_path):
            self.server.log.debug(
                "xlog_path: %r did not exist, cannot archive, returning 404",
                xlog_path)
            raise HttpResponse("N/A", status=404)

        self._verify_wal(filetype, filename, xlog_path)

        callback_queue = Queue()
        if not self.server.config["backup_sites"][site]["object_storage"]:
            compress_to_memory = False
        else:
            compress_to_memory = True
        compression_event = {
            "type": "MOVE",
            "callback_queue": callback_queue,
            "compress_to_memory": compress_to_memory,
            "delete_file_after_compression": False,
            "full_path": xlog_path,
            "site": site,
            "src_path": "{}.partial".format(xlog_path),
        }
        self.server.compression_queue.put(compression_event)
        try:
            response = callback_queue.get(timeout=30)
            self.server.log.debug(
                "Handled an archival request for: %r %r, took: %.3fs", site,
                xlog_path,
                time.time() - start_time)
        except Empty:
            self.server.log.exception(
                "Problem in getting a response in time, returning 404, took: %.2fs",
                time.time() - start_time)
            raise HttpResponse("TIMEOUT", status=500)

        if not response["success"]:
            raise HttpResponse(status=500)
        raise HttpResponse(status=201)
Example #9
    def test_wal_fetch_optimization(self, pghoard):
        # inject fake WAL and timeline files for testing
        invalid_wal_name = "000000060000000000000001"
        valid_wal_name = "000000060000000000000002"
        wal_name_output = "optimization_output_filename"
        output_path = os.path.join(
            get_pg_wal_directory(pghoard.config["backup_sites"][pghoard.test_site]), wal_name_output)
        invalid_wal_path = os.path.join(
            get_pg_wal_directory(pghoard.config["backup_sites"][pghoard.test_site]), invalid_wal_name)
        valid_wal_path = os.path.join(
            get_pg_wal_directory(pghoard.config["backup_sites"][pghoard.test_site]), valid_wal_name)

        with open(valid_wal_path, "wb") as out_file:
            out_file.write(wal_header_for_file(os.path.basename(valid_wal_path)))
        with open(invalid_wal_path, "wb") as out_file:
            # We use the wrong WAL file's name to generate the header on purpose to see that our check works
            out_file.write(wal_header_for_file(os.path.basename(valid_wal_path)))

        restore_command(
            site=pghoard.test_site,
            xlog=os.path.basename(valid_wal_name),
            host="127.0.0.1",
            port=pghoard.config["http_port"],
            output=output_path,
            retry_interval=0.1)
        assert os.path.exists(output_path)
        os.unlink(output_path)

        with pytest.raises(postgres_command.PGCError):
            restore_command(
                site=pghoard.test_site,
                xlog=os.path.basename(invalid_wal_name),
                host="127.0.0.1",
                port=pghoard.config["http_port"],
                output=output_path,
                retry_interval=0.1)
        assert not os.path.exists(output_path)
        os.unlink(invalid_wal_path)
Example #10
 def test_archiving(self, pghoard):
     store = pghoard.transfer_agents[0].get_object_storage(pghoard.test_site)
     # inject fake WAL and timeline files for testing
     for xlog_type, wal_name in [("xlog", "0000000000000000000000CC"), ("timeline", "0000000F.history")]:
         foo_path = os.path.join(get_pg_wal_directory(pghoard.config["backup_sites"][pghoard.test_site]), wal_name)
         with open(foo_path, "wb") as out_file:
             if xlog_type == "xlog":
                 out_file.write(wal_header_for_file(wal_name))
             else:
                 out_file.write(b"1 2 3\n")
         archive_command(host="localhost", port=pghoard.config["http_port"], site=pghoard.test_site, xlog=wal_name)
         archive_path = os.path.join(pghoard.test_site, xlog_type, wal_name)
         store.get_metadata_for_key(archive_path)
         store.delete_key(archive_path)
         os.unlink(foo_path)
Example #11
    def handle_archival_request(self, site, filename, filetype):
        if filetype == "basebackup":
            # Request a basebackup to be made for site
            self.server.log.debug("Requesting a new basebackup for site: %r to be made", site)
            self.server.requested_basebackup_sites.add(site)
            raise HttpResponse(status=201)

        start_time = time.time()

        site_config = self.server.config["backup_sites"][site]
        xlog_dir = get_pg_wal_directory(site_config)
        xlog_path = os.path.join(xlog_dir, filename)
        self.server.log.debug("Got request to archive: %r %r %r, %r", site, filetype,
                              filename, xlog_path)
        if not os.path.exists(xlog_path):
            self.server.log.debug("xlog_path: %r did not exist, cannot archive, returning 404", xlog_path)
            raise HttpResponse("N/A", status=404)

        self._verify_wal(filetype, filename, xlog_path)

        callback_queue = Queue()
        if not self.server.config["backup_sites"][site]["object_storage"]:
            compress_to_memory = False
        else:
            compress_to_memory = True
        compression_event = {
            "type": "MOVE",
            "callback_queue": callback_queue,
            "compress_to_memory": compress_to_memory,
            "delete_file_after_compression": False,
            "full_path": xlog_path,
            "site": site,
            "src_path": "{}.partial".format(xlog_path),
        }
        self.server.compression_queue.put(compression_event)
        try:
            response = callback_queue.get(timeout=30)
            self.server.log.debug("Handled an archival request for: %r %r, took: %.3fs",
                                  site, xlog_path, time.time() - start_time)
        except Empty:
            self.server.log.exception("Problem in getting a response in time, returning 404, took: %.2fs",
                                      time.time() - start_time)
            raise HttpResponse("TIMEOUT", status=500)

        if not response["success"]:
            raise HttpResponse(status=500)
        raise HttpResponse(status=201)
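
The handler above delegates the actual move/compression to a worker thread through self.server.compression_queue and blocks on a per-request callback queue with a 30 second timeout. A minimal sketch of that request/response pattern in isolation (the worker here is a stand-in, not pghoard's compressor thread):

from queue import Empty, Queue
from threading import Thread

def fake_compressor(task_queue):
    event = task_queue.get()
    # A real worker would compress or move the file described by the event.
    event["callback_queue"].put({"success": True})

task_queue = Queue()
Thread(target=fake_compressor, args=(task_queue,), daemon=True).start()

callback_queue = Queue()
task_queue.put({"type": "MOVE", "callback_queue": callback_queue})
try:
    response = callback_queue.get(timeout=30)
    print("archival result:", response["success"])
except Empty:
    print("timed out waiting for the worker")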
Example #12
 def test_archive_command_with_invalid_file(self, pghoard):
     # only WAL and timeline (.history) files can be archived
     bl_label = "000000010000000000000002.00000028.backup"
     bl_file = "xlog/{}".format(bl_label)
     wal_path = os.path.join(get_pg_wal_directory(pghoard.config["backup_sites"][pghoard.test_site]), bl_label)
     backup_wal_path = os.path.join(pghoard.config["backup_location"], pghoard.test_site, bl_file)
     with open(wal_path, "w") as fp:
         fp.write("jee")
     # backup labels are ignored - archiving returns success but file won't appear on disk
     archive_command(host="127.0.0.1", port=pghoard.config["http_port"],
                     site=pghoard.test_site, xlog=bl_label)
     assert not os.path.exists(backup_wal_path)
     # any other files raise an error
     with pytest.raises(postgres_command.PGCError) as excinfo:
         archive_command(host="127.0.0.1", port=pghoard.config["http_port"],
                         site=pghoard.test_site, xlog=bl_label + ".x")
     assert excinfo.value.exit_code == postgres_command.EXIT_ARCHIVE_FAIL
     assert not os.path.exists(backup_wal_path + ".x")
Example #13
 def test_archiving(self, pghoard):
     store = pghoard.transfer_agents[0].get_object_storage(pghoard.test_site)
     # inject fake WAL and timeline files for testing
     for xlog_type, wal_name in [
             ("xlog", "0000000000000000000000CC"),
             ("timeline", "0000000F.history")]:
         foo_path = os.path.join(get_pg_wal_directory(pghoard.config["backup_sites"][pghoard.test_site]), wal_name)
         with open(foo_path, "wb") as out_file:
             if xlog_type == "xlog":
                 out_file.write(wal_header_for_file(wal_name))
             else:
                 out_file.write(b"1 2 3\n")
         archive_command(host="localhost", port=pghoard.config["http_port"],
                         site=pghoard.test_site, xlog=wal_name)
         archive_path = os.path.join(pghoard.test_site, xlog_type, wal_name)
         store.get_metadata_for_key(archive_path)
         store.delete_key(archive_path)
         os.unlink(foo_path)
Example #14
    def _create_prefetch_operations(self, site, filetype, filename):
        if filetype not in {"timeline", "xlog"}:
            return
        prefetch_n = self.server.config["restore_prefetch"]
        if prefetch_n <= 0:
            return

        xlog_dir = get_pg_wal_directory(
            self.server.config["backup_sites"][site])
        names = []
        if filetype == "timeline":
            tli_num = int(filename.replace(".history", ""), 16)
            for _ in range(prefetch_n):
                tli_num += 1
                prefetch_name = "{:08X}.history".format(tli_num)
                if os.path.isfile(os.path.join(xlog_dir, prefetch_name)):
                    continue
                names.append(prefetch_name)
        elif filetype == "xlog":
            xlog_num = int(filename, 16)
            for _ in range(prefetch_n):
                if xlog_num & 0xFF == 0xFF:
                    xlog_num += 0xFFFFFF00
                xlog_num += 1
                prefetch_name = "{:024X}".format(xlog_num)
                xlog_path = os.path.join(xlog_dir, prefetch_name)
                if os.path.isfile(xlog_path):
                    try:
                        wal.verify_wal(wal_name=prefetch_name,
                                       filepath=xlog_path)
                        continue
                    except ValueError as e:
                        self.server.log.debug(
                            "(Prefetch) File %s already exists but is invalid: %r",
                            xlog_path, e)
                names.append(prefetch_name)

        for obname in names:
            key = self._make_file_key(site, filetype, obname)
            if key in self.server.prefetch_404:
                continue  # previously failed to prefetch this file, don't try again
            self._create_fetch_operation(key, site, filetype, obname)
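
The segment arithmetic above computes the next WAL file names by treating the 24-hex-digit name as one integer; because the segment field (the low eight hex digits) only counts up to 0xFF, the check bumps it so that +1 carries over into the log number instead of producing an invalid 0x100 segment. The same computation as a standalone sketch:

def next_wal_names(filename, count):
    # Parse the 24-hex-digit WAL name as a single integer and step forward,
    # rolling the segment field over into the log number exactly as above.
    num = int(filename, 16)
    names = []
    for _ in range(count):
        if num & 0xFF == 0xFF:
            num += 0xFFFFFF00
        num += 1
        names.append("{:024X}".format(num))
    return names

# next_wal_names("0000000100000000000000FF", 2)
# -> ['000000010000000100000000', '000000010000000100000001']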
Example #15
    def _create_fetch_operation(self, key, site, filetype, obname, max_age=-1, suppress_error=True):
        with self.server.lock:
            # Don't fetch again if we already have a pending fetch operation unless the operation
            # has been ongoing longer than the given max age and has potentially become stale
            existing = self.server.pending_download_ops.get(key)
            if existing and (max_age < 0 or time.monotonic() - existing["started_at"] <= max_age):
                return

        xlog_dir = get_pg_wal_directory(self.server.config["backup_sites"][site])
        prefetch_target_path = os.path.join(xlog_dir, "{}.pghoard.prefetch".format(obname))
        if os.path.exists(prefetch_target_path):
            return

        try:
            fd, tmp_target_path = tempfile.mkstemp(prefix="{}/{}.".format(xlog_dir, obname), suffix=".pghoard.tmp")
            os.close(fd)
        except OSError as ex:
            self.server.log.error("Unable to create temporary file to fetch %r: %s: %s",
                                  obname, ex.__class__.__name__, ex)
            if suppress_error:
                return
            else:
                raise HttpResponse("Unable to create temporary file for {0!r}: {1.__class__.__name__}: {1}"
                                   .format(key, ex), status=400)

        self.server.log.debug("Fetching site: %r, filename: %r, filetype: %r, tmp_target_path: %r",
                              site, obname, filetype, tmp_target_path)
        target_path = os.path.join(xlog_dir, "{}.pghoard.prefetch".format(obname))
        self.server.pending_download_ops[key] = dict(
            started_at=time.monotonic(),
            target_path=target_path,
        )
        self.server.transfer_queue.put({
            "callback_queue": self.server.download_results,
            "filetype": filetype,
            "local_path": obname,
            "opaque": key,
            "site": site,
            "target_path": tmp_target_path,
            "type": "DOWNLOAD",
        })
Example #16
    def test_get_archived_file(self, pghoard):
        wal_seg_prev_tli = "00000001000000000000000F"
        wal_seg = "00000002000000000000000F"
        wal_file = "xlog/{}".format(wal_seg)
        # NOTE: create WAL header for the "previous" timeline, this should be accepted
        content = wal_header_for_file(wal_seg_prev_tli)
        wal_dir = get_pg_wal_directory(pghoard.config["backup_sites"][pghoard.test_site])
        archive_path = os.path.join(pghoard.test_site, wal_file)
        compressor = pghoard.Compressor()
        compressed_content = compressor.compress(content) + (compressor.flush() or b"")
        metadata = {
            "compression-algorithm": pghoard.config["compression"]["algorithm"],
            "original-file-size": len(content),
        }
        store = pghoard.transfer_agents[0].get_object_storage(pghoard.test_site)
        store.store_file_from_memory(archive_path, compressed_content, metadata=metadata)

        restore_command(site=pghoard.test_site, xlog=wal_seg, output=None,
                        host="127.0.0.1", port=pghoard.config["http_port"])
        restore_target = os.path.join(wal_dir, wal_seg)

        restore_command(site=pghoard.test_site, xlog=wal_seg, output=restore_target,
                        host="127.0.0.1", port=pghoard.config["http_port"])
        assert os.path.exists(restore_target) is True
        with open(restore_target, "rb") as fp:
            restored_data = fp.read()
        assert content == restored_data

        # test the same thing using restore as 'pghoard_postgres_command'
        tmp_out = os.path.join(wal_dir, restore_target + ".cmd")
        postgres_command.main([
            "--host", "localhost",
            "--port", str(pghoard.config["http_port"]),
            "--site", pghoard.test_site,
            "--mode", "restore",
            "--output", tmp_out,
            "--xlog", wal_seg,
        ])
        with open(tmp_out, "rb") as fp:
            restored_data = fp.read()
        assert content == restored_data
Example #17
    def _create_prefetch_operations(self, site, filetype, filename):
        if filetype not in {"timeline", "xlog"}:
            return
        prefetch_n = self.server.config["restore_prefetch"]
        if prefetch_n <= 0:
            return

        xlog_dir = get_pg_wal_directory(self.server.config["backup_sites"][site])
        names = []
        if filetype == "timeline":
            tli_num = int(filename.replace(".history", ""), 16)
            for _ in range(prefetch_n):
                tli_num += 1
                prefetch_name = "{:08X}.history".format(tli_num)
                if os.path.isfile(os.path.join(xlog_dir, prefetch_name)):
                    continue
                names.append(prefetch_name)
        elif filetype == "xlog":
            xlog_num = int(filename, 16)
            for _ in range(prefetch_n):
                if xlog_num & 0xFF == 0xFF:
                    xlog_num += 0xFFFFFF00
                xlog_num += 1
                prefetch_name = "{:024X}".format(xlog_num)
                xlog_path = os.path.join(xlog_dir, prefetch_name)
                if os.path.isfile(xlog_path):
                    try:
                        wal.verify_wal(wal_name=prefetch_name, filepath=xlog_path)
                        continue
                    except ValueError as e:
                        self.server.log.debug("(Prefetch) File %s already exists but is invalid: %r", xlog_path, e)
                names.append(prefetch_name)

        for obname in names:
            key = self._make_file_key(site, filetype, obname)
            if key in self.server.prefetch_404:
                continue  # previously failed to prefetch this file, don't try again
            self._create_fetch_operation(key, site, filetype, obname)
Example #18
    def check_and_upload_missing_local_files(self, max_hash_checks):
        current_wal_file = self.get_current_wal_file()
        first_required_wal_file, _ = self.get_first_required_wal_segment()

        # Find relevant WAL files.  We do this by checking archival status
        # of all WAL files older than the one currently open (i.e. reverse
        # sorted list from newest file that should've been archived to the
        # oldest on disk) and appending missing files to a list.  After
        # collecting a list we start archiving them from oldest to newest.
        # This is done so we don't break our missing archive detection logic
        # if sync is interrupted for some reason.
        # Sort all timeline files first to make sure they're always
        # archived, otherwise the timeline files are processed only after
        # all WAL files for a given timeline have been handled.
        wal_dir = get_pg_wal_directory(self.backup_site)
        wal_files = os.listdir(wal_dir)
        wal_files.sort(key=lambda f: (f.endswith(".history"), f), reverse=True)
        need_archival = []
        hash_checks_done = 0
        existing_wal_without_checksum_count = 0
        for wal_file in wal_files:
            archive_type = None
            if wal.TIMELINE_RE.match(wal_file):
                # We want all timeline files
                archive_type = "TIMELINE"
            elif not wal.WAL_RE.match(wal_file):
                pass   # not a WAL or timeline file
            elif wal_file == current_wal_file:
                self.log.info("Skipping currently open WAL file %r", wal_file)
            elif wal_file > current_wal_file:
                self.log.debug("Skipping recycled WAL file %r", wal_file)
            elif first_required_wal_file is not None and wal_file < first_required_wal_file:
                self.log.info("WAL file %r is not needed for the latest basebackup", wal_file)
                break
            else:
                # WAL file in range first_required_wal_file .. current_wal_file
                archive_type = "WAL"

            if archive_type:
                resp = requests.head("{base}/archive/{file}".format(base=self.base_url, file=wal_file))
                if resp.status_code == 200:
                    remote_hash = resp.headers.get("metadata-hash")
                    hash_algorithm = resp.headers.get("metadata-hash-algorithm")
                    check_hash = bool(
                        archive_type == "WAL" and (hash_checks_done < max_hash_checks or max_hash_checks < 0) and remote_hash
                    )
                    if archive_type == "WAL" and not remote_hash:
                        # If we don't have hashes available (old pghoard was running on previous master), re-upload first
                        # file that already exists in remote storage and doesn't have a checksum since it might be the last
                        # WAL file of previous timeline uploaded by old master and invalid because it doesn't have the
                        # timeline switch event and may contain writes that are not valid for our timeline
                        existing_wal_without_checksum_count += 1
                        if existing_wal_without_checksum_count == 1:
                            self.log.info(
                                "%s file %r already archived but no hash is available, reuploading", archive_type, wal_file
                            )
                            need_archival.append(wal_file)
                            continue
                    if check_hash:
                        hash_checks_done += 1
                        our_hash = self.calculate_hash(os.path.join(wal_dir, wal_file), hash_algorithm)
                        if remote_hash.lower().strip() != our_hash.lower().strip():
                            self.log.warning(
                                "%s file %r already archived but existing hash %r differs from our hash %r, reuploading",
                                archive_type, wal_file, remote_hash, our_hash
                            )
                            need_archival.append(wal_file)
                        else:
                            self.log.info("%s file %r already archived and has valid hash", archive_type, wal_file)
                    else:
                        self.log.info("%s file %r already archived", archive_type, wal_file)
                    continue
                self.log.info("%s file %r needs to be archived", archive_type, wal_file)
                need_archival.append(wal_file)

        for wal_file in sorted(need_archival):  # sort oldest to newest
            resp = requests.put("{base}/archive/{file}".format(base=self.base_url, file=wal_file))
            archive_type = "TIMELINE" if ".history" in wal_file else "WAL"
            if resp.status_code != 201:
                self.log.error("%s file %r archival failed with status code %r",
                               archive_type, wal_file, resp.status_code)
            else:
                self.log.info("%s file %r archived", archive_type, wal_file)
Example #19
    def test_archive_sync(self, db, pghoard):
        log = logging.getLogger("test_archive_sync")
        store = pghoard.transfer_agents[0].get_object_storage(pghoard.test_site)

        def list_archive(folder):
            if folder == "timeline":
                matcher = wal.TIMELINE_RE.match
            else:
                matcher = wal.WAL_RE.match

            path_to_list = "{}/{}".format(pghoard.test_site, folder)
            files_found, files_total = 0, 0
            for obj in store.list_path(path_to_list):
                fname = os.path.basename(obj["name"])
                files_total += 1
                if matcher(fname):
                    files_found += 1
                    yield fname

            log.info("Listed %r, %r out of %r matched %r pattern", path_to_list, files_found, files_total, folder)

        # create a basebackup to start with
        self._run_and_wait_basebackup(pghoard, db, "pipe")

        # force a couple of wal segment switches
        start_wal, _ = self._switch_wal(db, 4)
        # we should have at least 4 WAL files now (there may be more in
        # case other tests created them -- we share a single PostgreSQL
        # cluster between all tests)
        pg_wal_dir = get_pg_wal_directory(pghoard.config["backup_sites"][pghoard.test_site])
        pg_wals = {f for f in os.listdir(pg_wal_dir) if wal.WAL_RE.match(f) and f > start_wal}
        assert len(pg_wals) >= 4

        # create a couple of "recycled" xlog files that we must ignore
        last_wal = sorted(pg_wals)[-1]
        dummy_data = b"x" * (16 * 2 ** 20)

        def write_dummy_wal(inc):
            filename = "{:024X}".format((int(last_wal, 16) + inc))
            print("Writing dummy WAL file", filename)
            with open(os.path.join(pg_wal_dir, filename), "wb") as out_file:
                out_file.write(dummy_data)
            return filename

        recycled1 = write_dummy_wal(1)
        recycled2 = write_dummy_wal(2)

        # check what we have archived, there should be at least the four
        # above WALs that are NOT there at the moment
        archived_wals = set(list_archive("xlog"))
        assert len(pg_wals - archived_wals) >= 4
        # now perform an archive sync
        arsy = ArchiveSync()
        arsy.run(["--site", pghoard.test_site, "--config", pghoard.config_path])
        # and now archive should include all our WALs
        archived_wals = set(list_archive("xlog"))

        # the recycled files must not appear in archived files
        assert recycled1 not in archived_wals
        assert recycled2 not in archived_wals

        # the regular wals must be archived
        assert archived_wals.issuperset(pg_wals)

        # if we delete a wal file that's not the latest archived one, it should
        # get synced to the archive as we don't have a basebackup newer than
        # it
        current_wal = arsy.get_current_wal_file()
        old_wals = sorted(wal for wal in pg_wals if wal < current_wal)
        store.delete_key(os.path.join(pghoard.test_site, "xlog", old_wals[-2]))
        arsy.run(["--site", pghoard.test_site, "--config", pghoard.config_path])
        archived_wals = set(list_archive("xlog"))
        assert archived_wals.issuperset(pg_wals)
        # delete the topmost wal file, this should cause resync too
        store.delete_key(os.path.join(pghoard.test_site, "xlog", old_wals[-1]))
        arsy.run(["--site", pghoard.test_site, "--config", pghoard.config_path])
        archived_wals = set(list_archive("xlog"))
        assert archived_wals.issuperset(pg_wals)
        # let's do a little dance to turn our DB into a standby and then
        # promote it, forcing a timeline switch
        db.kill(force=False)
        with open(os.path.join(db.pgdata, "recovery.conf"), "w") as fp:
            fp.write(
                "standby_mode = 'on'\n"
                "recovery_target_timeline = 'latest'\n"
                "restore_command = 'false'\n"
            )
        # start PG and promote it
        db.run_pg()
        db.run_cmd("pg_ctl", "-D", db.pgdata, "promote")
        time.sleep(5)  # TODO: instead of sleeping, poll the db until ready
        # we should have a single timeline file in pg_xlog/pg_wal now
        pg_wal_timelines = {f for f in os.listdir(pg_wal_dir) if wal.TIMELINE_RE.match(f)}
        assert len(pg_wal_timelines) > 0
        # but there should be nothing archived as archive_command wasn't set up
        archived_timelines = set(list_archive("timeline"))
        assert len(archived_timelines) == 0
        # let's hit archive sync
        arsy.run(["--site", pghoard.test_site, "--config", pghoard.config_path])
        # now we should have an archived timeline
        archived_timelines = set(list_archive("timeline"))
        assert archived_timelines.issuperset(pg_wal_timelines)
        assert "00000002.history" in archived_timelines

        # let's take a new basebackup
        self._run_and_wait_basebackup(pghoard, db, "basic")
        # nuke archives and resync them
        for name in list_archive(folder="timeline"):
            store.delete_key(os.path.join(pghoard.test_site, "timeline", name))
        for name in list_archive(folder="xlog"):
            store.delete_key(os.path.join(pghoard.test_site, "xlog", name))
        self._switch_wal(db, 1)

        arsy.run(["--site", pghoard.test_site, "--config", pghoard.config_path])

        archived_wals = set(list_archive("xlog"))
        # assume the same timeline file as before and one to three wal files
        assert len(archived_wals) >= 1
        assert len(archived_wals) <= 3
        archived_timelines = set(list_archive("timeline"))
        assert list(archived_timelines) == ["00000002.history"]
Example #20
    def get_wal_or_timeline_file(self, site, filename, filetype):
        target_path = self.headers.get("x-pghoard-target-path")
        if not target_path:
            raise HttpResponse(
                "x-pghoard-target-path header missing from download",
                status=400)

        # See if we have already prefetched the file
        site_config = self.server.config["backup_sites"][site]
        xlog_dir = get_pg_wal_directory(site_config)
        prefetch_target_path = os.path.join(
            xlog_dir, "{}.pghoard.prefetch".format(filename))
        if os.path.exists(prefetch_target_path):
            self._save_and_verify_restored_file(filetype, filename,
                                                prefetch_target_path,
                                                target_path)
            raise HttpResponse(status=201)

        # After reaching a recovery_target and restart of a PG server, PG wants to replay and refetch
        # files from the archive starting from the latest checkpoint. We have potentially fetched these files
        # already earlier. Check if we have the files already and if we do, don't go over the network to refetch
        # them yet again but just rename them to the path that PG is requesting.
        xlog_path = os.path.join(xlog_dir, filename)
        if os.path.exists(xlog_path):
            self.server.log.info(
                "Requested %r, found it in pg_xlog directory as: %r, returning directly",
                filename, xlog_path)
            success = False
            try:
                self._save_and_verify_restored_file(filetype, filename,
                                                    xlog_path, target_path)
                success = True
            except (ValueError, HttpResponse) as ex:
                self.server.log.warning(
                    "Found file: %r but it was invalid: %s", xlog_path, ex)
            if success:
                raise HttpResponse(status=201)

        prefetch_n = self.server.config["restore_prefetch"]
        prefetch = []
        if filetype == "timeline":
            tli_num = int(filename.replace(".history", ""), 16)
            for _ in range(prefetch_n):
                tli_num += 1
                prefetch.append("{:08X}.history".format(tli_num))
        elif filetype == "xlog":
            xlog_num = int(filename, 16)
            for _ in range(prefetch_n):
                if xlog_num & 0xFF == 0xFF:
                    xlog_num += 0xFFFFFF00
                xlog_num += 1
                prefetch.append("{:024X}".format(xlog_num))

        with self._prefetch(site, filetype, prefetch):
            self._transfer_agent_op(site,
                                    filename,
                                    filetype,
                                    "DOWNLOAD",
                                    target_path=target_path)

        raise HttpResponse(status=201)
Example #21
    def _prefetch(self, site, filetype, names):
        if not names:
            return

        start_time = time.monotonic()
        callback_queue = Queue()

        site_config = self.server.config["backup_sites"][site]
        xlog_dir = get_pg_wal_directory(site_config)
        downloads = {}
        for obname in names:
            if obname in self.server.prefetch_404:
                continue  # previously failed to prefetch this file, don't try again
            prefetch_target_path = os.path.join(
                xlog_dir, "{}.pghoard.prefetch".format(obname))
            if os.path.exists(prefetch_target_path):
                continue  # already fetched this file
            try:
                fd, tmp_target_path = tempfile.mkstemp(prefix="{}/{}.".format(
                    xlog_dir, obname),
                                                       suffix=".pghoard.tmp")
                os.close(fd)
            except OSError as ex:
                self.server.log.error(
                    "Unable to create temporary file to prefetch %r: %s: %s",
                    obname, ex.__class__.__name__, ex)
                continue
            self.server.log.debug(
                "Prefetching site: %r, filename: %r, filetype: %r, tmp_target_path: %r",
                site, obname, filetype, tmp_target_path)
            downloads[obname] = tmp_target_path
            self.server.transfer_queue.put({
                "callback_queue": callback_queue,
                "filetype": filetype,
                "local_path": obname,
                "opaque": obname,
                "site": site,
                "target_path": tmp_target_path,
                "type": "DOWNLOAD",
            })

        # allow something else to happen
        try:
            yield
        finally:
            # process results (timeout is 30 seconds after start but at least 5 seconds)
            timeout_at = max(start_time + 30, time.monotonic() + 5)
            while downloads:
                time_left = timeout_at - time.monotonic()
                try:
                    response = callback_queue.get(timeout=time_left)
                except Empty:
                    break  # timeout
                obname = response["opaque"]
                tmp_target_path = downloads.pop(response["opaque"])
                if response["success"]:
                    prefetch_target_path = os.path.join(
                        xlog_dir, "{}.pghoard.prefetch".format(obname))
                    os.rename(tmp_target_path, prefetch_target_path)
                    self.server.log.debug(
                        "Prefetched %r %r to %r, took: %.3fs", site, obname,
                        prefetch_target_path,
                        time.monotonic() - start_time)
                else:
                    ex = response.get("exception", Error)
                    if isinstance(ex, FileNotFoundFromStorageError):
                        # don't try prefetching this file again
                        self.server.prefetch_404.append(obname)
                    self.server.log.debug(
                        "Prefetching %r %r failed (%s), took: %.3fs", site,
                        obname, ex.__class__.__name__,
                        time.monotonic() - start_time)
                    with suppress(Exception):
                        os.unlink(tmp_target_path)

            # everything else timed out
            while downloads:
                obname, tmp_target_path = downloads.popitem()
                self.server.log.debug(
                    "Prefetching %r %r timed out, took: %.3fs", site, obname,
                    time.monotonic() - start_time)
                with suppress(Exception):
                    os.unlink(tmp_target_path)
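
The yield in the middle of a try/finally marks _prefetch as a generator-based context manager (presumably wrapped with contextlib.contextmanager, which is not visible in this snippet): downloads are queued before the yield, the caller's own transfer runs at the yield point, and the finally block collects or discards results even if the caller fails. A minimal sketch of that shape:

from contextlib import contextmanager

@contextmanager
def prefetch_like(downloads):
    # Queue background work here (the method above puts items on transfer_queue).
    try:
        yield  # the caller's own download runs while prefetches proceed
    finally:
        # Collect results or clean up temporary files, even on error.
        downloads.clear()

# usage: illustrative dummy paths only
pending = {"000000010000000000000002": "/tmp/000000010000000000000002.pghoard.tmp"}
with prefetch_like(pending):
    pass  # caller work happens here
assert not pending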
Example #22
    def get_wal_or_timeline_file(self, site, filename, filetype):
        target_path = self.headers.get("x-pghoard-target-path")
        if not target_path:
            raise HttpResponse("x-pghoard-target-path header missing from download", status=400)

        self._process_completed_download_operations()

        # See if we have already prefetched the file
        site_config = self.server.config["backup_sites"][site]
        xlog_dir = get_pg_wal_directory(site_config)
        prefetch_target_path = os.path.join(xlog_dir, "{}.pghoard.prefetch".format(filename))
        if os.path.exists(prefetch_target_path):
            ex = self._try_save_and_verify_restored_file(filetype, filename, prefetch_target_path, target_path)
            if not ex:
                self._create_prefetch_operations(site, filetype, filename)
                raise HttpResponse(status=201)

        # After reaching a recovery_target and restart of a PG server, PG wants to replay and refetch
        # files from the archive starting from the latest checkpoint. We have potentially fetched these files
        # already earlier. Check if we have the files already and if we do, don't go over the network to refetch
        # them yet again but just rename them to the path that PG is requesting.
        xlog_path = os.path.join(xlog_dir, filename)
        if os.path.exists(xlog_path):
            self.server.log.info("Requested %r, found it in pg_xlog directory as: %r, returning directly",
                                 filename, xlog_path)
            ex = self._try_save_and_verify_restored_file(filetype, filename, xlog_path, target_path, unlink=False)
            if ex:
                self.server.log.warning("Found file: %r but it was invalid: %s", xlog_path, ex)
            else:
                raise HttpResponse(status=201)

        key = self._make_file_key(site, filetype, filename)
        with suppress(ValueError):
            self.server.prefetch_404.remove(key)
        self._create_fetch_operation(key, site, filetype, filename, max_age=5, suppress_error=False)
        self._create_prefetch_operations(site, filetype, filename)

        last_schedule_call = time.monotonic()
        start_time = time.monotonic()
        retries = 2
        while (time.monotonic() - start_time) <= 30:
            self._process_completed_download_operations(timeout=0.01)
            with self.server.lock:
                if os.path.isfile(prefetch_target_path):
                    ex = self._try_save_and_verify_restored_file(filetype, filename, prefetch_target_path, target_path)
                    if not ex:
                        raise HttpResponse(status=201)
                    elif ex and retries == 0:
                        raise ex  # pylint: disable=raising-bad-type
                    retries -= 1
            if key in self.server.prefetch_404:
                raise HttpResponse(status=404)
            with self.server.lock:
                if key not in self.server.pending_download_ops:
                    if retries == 0:
                        raise HttpResponse(status=500)
                    retries -= 1
                    self._create_fetch_operation(key, site, filetype, filename, suppress_error=False)
            if time.monotonic() - last_schedule_call >= 1:
                last_schedule_call = time.monotonic()
                # Replace existing download operation if it has been executing for too long
                self._create_fetch_operation(key, site, filetype, filename, max_age=10, suppress_error=False)

        raise HttpResponse("TIMEOUT", status=500)
Ejemplo n.º 24
0
    def test_archive_sync(self, db, pghoard):
        log = logging.getLogger("test_archive_sync")
        store = pghoard.transfer_agents[0].get_object_storage(pghoard.test_site)

        def list_archive(folder):
            if folder == "timeline":
                matcher = wal.TIMELINE_RE.match
            else:
                matcher = wal.WAL_RE.match

            path_to_list = "{}/{}".format(pghoard.test_site, folder)
            files_found, files_total = 0, 0
            for obj in store.list_path(path_to_list):
                fname = os.path.basename(obj["name"])
                files_total += 1
                if matcher(fname):
                    files_found += 1
                    yield fname

            log.info("Listed %r, %r out of %r matched %r pattern", path_to_list, files_found, files_total, folder)

        # create a basebackup to start with
        self._run_and_wait_basebackup(pghoard, db, "pipe")

        # force a couple of wal segment switches
        start_wal, _ = self._switch_wal(db, 4)
        # we should have at least 4 WAL files now (there may be more in
        # case other tests created them -- we share a single postresql
        # cluster between all tests)
        pg_wal_dir = get_pg_wal_directory(pghoard.config["backup_sites"][pghoard.test_site])
        pg_wals = {f for f in os.listdir(pg_wal_dir) if wal.WAL_RE.match(f) and f > start_wal}
        assert len(pg_wals) >= 4

        # create a couple of "recycled" xlog files that we must ignore
        last_wal = sorted(pg_wals)[-1]
        dummy_data = b"x" * (16 * 2 ** 20)

        def write_dummy_wal(inc):
            filename = "{:024X}".format((int(last_wal, 16) + inc))
            print("Writing dummy WAL file", filename)
            open(os.path.join(pg_wal_dir, filename), "wb").write(dummy_data)
            return filename

        recycled1 = write_dummy_wal(1)
        recycled2 = write_dummy_wal(2)

        # check what we have archived, there should be at least the three
        # above WALs that are NOT there at the moment
        archived_wals = set(list_archive("xlog"))
        assert len(pg_wals - archived_wals) >= 4
        # now perform an archive sync
        arsy = ArchiveSync()
        arsy.run(["--site", pghoard.test_site, "--config", pghoard.config_path])
        # and now archive should include all our WALs
        archived_wals = set(list_archive("xlog"))

        # the recycled files must not appear in archived files
        assert recycled1 not in archived_wals
        assert recycled2 not in archived_wals

        # the regular wals must be archived
        assert archived_wals.issuperset(pg_wals)

        # if we delete a wal file that's not the latest archival it should
        # get synced to the archive as we don't have a basebackup newer than
        # it
        current_wal = arsy.get_current_wal_file()
        old_wals = sorted(wal for wal in pg_wals if wal < current_wal)
        store.delete_key(os.path.join(pghoard.test_site, "xlog", old_wals[-2]))
        arsy.run(["--site", pghoard.test_site, "--config", pghoard.config_path])
        archived_wals = set(list_archive("xlog"))
        assert archived_wals.issuperset(pg_wals)
        # delete the topmost wal file, this should cause resync too
        store.delete_key(os.path.join(pghoard.test_site, "xlog", old_wals[-1]))
        arsy.run(["--site", pghoard.test_site, "--config", pghoard.config_path])
        archived_wals = set(list_archive("xlog"))
        assert archived_wals.issuperset(pg_wals)
        # let's do a little dance to turn our DB into a standby and then
        # promote it, forcing a timeline switch
        db.kill(force=False)
        with open(os.path.join(db.pgdata, "recovery.conf"), "w") as fp:
            fp.write(
                "standby_mode = 'on'\n"
                "recovery_target_timeline = 'latest'\n"
                "restore_command = 'false'\n"
            )
        # start PG and promote it
        db.run_pg()
        db.run_cmd("pg_ctl", "-D", db.pgdata, "promote")
        time.sleep(5)  # TODO: instead of sleeping, poll the db until ready
        # we should have at least one timeline file in pg_xlog/pg_wal now
        pg_wal_timelines = {f for f in os.listdir(pg_wal_dir) if wal.TIMELINE_RE.match(f)}
        assert len(pg_wal_timelines) > 0
        # but there should be nothing archived as archive_command wasn't set up
        archived_timelines = set(list_archive("timeline"))
        assert len(archived_timelines) == 0
        # let's hit archive sync
        arsy.run(["--site", pghoard.test_site, "--config", pghoard.config_path])
        # now we should have an archived timeline
        archived_timelines = set(list_archive("timeline"))
        assert archived_timelines.issuperset(pg_wal_timelines)
        assert "00000002.history" in archived_timelines

        # let's take a new basebackup
        self._run_and_wait_basebackup(pghoard, db, "basic")
        # nuke archives and resync them
        for name in list_archive(folder="timeline"):
            store.delete_key(os.path.join(pghoard.test_site, "timeline", name))
        for name in list_archive(folder="xlog"):
            store.delete_key(os.path.join(pghoard.test_site, "xlog", name))
        self._switch_wal(db, 1)

        arsy.run(["--site", pghoard.test_site, "--config", pghoard.config_path])

        archived_wals = set(list_archive("xlog"))
        # we expect the same timeline file as before and one to three WAL files
        assert len(archived_wals) >= 1
        assert len(archived_wals) <= 3
        archived_timelines = set(list_archive("timeline"))
        assert list(archived_timelines) == ["00000002.history"]
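
# The dummy "recycled" segments created in the test above are detected purely by
# name: WAL segment names are 24 hexadecimal digits, so any candidate that sorts
# lexicographically above the segment currently being written can be skipped.
# A minimal, self-contained sketch of that check follows; the regex and helper
# are illustrative only, not pghoard's own wal module (whose WAL_RE is stricter).
import re

WAL_NAME_RE = re.compile(r"^[0-9A-F]{24}$")  # simplified: "24 uppercase hex digits"

def is_recycled_segment(filename, current_wal_file):
    # A pre-allocated ("recycled") segment looks like a WAL file but sorts
    # after the segment PostgreSQL is currently writing, mirroring the
    # wal_file > current_wal_file branch in the ArchiveSync example further down.
    return bool(WAL_NAME_RE.match(filename)) and filename > current_wal_file

# With 00000001000000000000000A currently open, 00000001000000000000000C is
# recycled and must not be archived, while 000000010000000000000009 must be.
assert is_recycled_segment("00000001000000000000000C", "00000001000000000000000A")
assert not is_recycled_segment("000000010000000000000009", "00000001000000000000000A")
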
Ejemplo n.º 25
0
    def get_wal_or_timeline_file(self, site, filename, filetype):
        target_path = self.headers.get("x-pghoard-target-path")
        if not target_path:
            raise HttpResponse(
                "x-pghoard-target-path header missing from download",
                status=400)

        self._process_completed_download_operations()

        # See if we have already prefetched the file
        site_config = self.server.config["backup_sites"][site]
        xlog_dir = get_pg_wal_directory(site_config)
        prefetch_target_path = os.path.join(
            xlog_dir, "{}.pghoard.prefetch".format(filename))
        if os.path.exists(prefetch_target_path):
            ex = self._try_save_and_verify_restored_file(
                filetype, filename, prefetch_target_path, target_path)
            if not ex:
                self._create_prefetch_operations(site, filetype, filename)
                raise HttpResponse(status=201)

        # After reaching a recovery_target and restarting the PG server, PG wants to replay and refetch
        # files from the archive starting from the latest checkpoint. We may have already fetched these
        # files earlier. If we have them, don't go over the network to refetch them yet again; just
        # rename them to the path that PG is requesting.
        xlog_path = os.path.join(xlog_dir, filename)
        if os.path.exists(xlog_path):
            self.server.log.info(
                "Requested %r, found it in pg_xlog directory as: %r, returning directly",
                filename, xlog_path)
            ex = self._try_save_and_verify_restored_file(filetype,
                                                         filename,
                                                         xlog_path,
                                                         target_path,
                                                         unlink=False)
            if ex:
                self.server.log.warning(
                    "Found file: %r but it was invalid: %s", xlog_path, ex)
            else:
                raise HttpResponse(status=201)

        key = self._make_file_key(site, filetype, filename)
        with suppress(ValueError):
            self.server.prefetch_404.remove(key)
        self._create_fetch_operation(key,
                                     site,
                                     filetype,
                                     filename,
                                     max_age=5,
                                     suppress_error=False)
        self._create_prefetch_operations(site, filetype, filename)

        last_schedule_call = time.monotonic()
        start_time = time.monotonic()
        retries = 2
        while (time.monotonic() - start_time) <= 30:
            self._process_completed_download_operations(timeout=0.01)
            with self.server.lock:
                if os.path.isfile(prefetch_target_path):
                    ex = self._try_save_and_verify_restored_file(
                        filetype, filename, prefetch_target_path, target_path)
                    if not ex:
                        raise HttpResponse(status=201)
                    elif ex and retries == 0:
                        raise ex  # pylint: disable=raising-bad-type
                    retries -= 1
            if key in self.server.prefetch_404:
                raise HttpResponse(status=404)
            with self.server.lock:
                if key not in self.server.pending_download_ops:
                    if retries == 0:
                        raise HttpResponse(status=500)
                    retries -= 1
                    self._create_fetch_operation(key,
                                                 site,
                                                 filetype,
                                                 filename,
                                                 suppress_error=False)
            if time.monotonic() - last_schedule_call >= 1:
                last_schedule_call = time.monotonic()
                # Replace existing download operation if it has been executing for too long
                self._create_fetch_operation(key,
                                             site,
                                             filetype,
                                             filename,
                                             max_age=10,
                                             suppress_error=False)

        raise HttpResponse("TIMEOUT", status=500)
Ejemplo n.º 26
0
def http_restore(pghoard):
    pgdata = get_pg_wal_directory(pghoard.config["backup_sites"][pghoard.test_site])
    return HTTPRestore("localhost", pghoard.config["http_port"], site=pghoard.test_site, pgdata=pgdata)
Ejemplo n.º 27
0
    def check_and_upload_missing_local_files(self, max_hash_checks):
        current_wal_file = self.get_current_wal_file()
        first_required_wal_file, _ = self.get_first_required_wal_segment()

        # Find relevant WAL files.  We do this by checking the archival status
        # of all WAL files older than the one currently open (i.e. a reverse
        # sorted list from the newest file that should've been archived to the
        # oldest on disk) and appending missing files to a list.  After
        # collecting the list we start archiving them from oldest to newest.
        # This is done so we don't break our missing-archive detection logic
        # if the sync is interrupted for some reason.
        # Sort all timeline files first to make sure they're always
        # archived; otherwise the timeline files would be processed only after
        # all WAL files for a given timeline have been handled.
        wal_dir = get_pg_wal_directory(self.backup_site)
        wal_files = os.listdir(wal_dir)
        wal_files.sort(key=lambda f: (f.endswith(".history"), f), reverse=True)
        need_archival = []
        hash_checks_done = 0
        existing_wal_without_checksum_count = 0
        for wal_file in wal_files:
            archive_type = None
            if wal.TIMELINE_RE.match(wal_file):
                # We want all timeline files
                archive_type = "TIMELINE"
            elif not wal.WAL_RE.match(wal_file):
                pass  # not a WAL or timeline file
            elif wal_file == current_wal_file:
                self.log.info("Skipping currently open WAL file %r", wal_file)
            elif wal_file > current_wal_file:
                self.log.debug("Skipping recycled WAL file %r", wal_file)
            elif first_required_wal_file is not None and wal_file < first_required_wal_file:
                self.log.info(
                    "WAL file %r is not needed for the latest basebackup",
                    wal_file)
                break
            else:
                # WAL file in range first_required_wal_file .. current_wal_file
                archive_type = "WAL"

            if archive_type:
                resp = requests.head("{base}/archive/{file}".format(
                    base=self.base_url, file=wal_file))
                if resp.status_code == 200:
                    remote_hash = resp.headers.get("metadata-hash")
                    hash_algorithm = resp.headers.get(
                        "metadata-hash-algorithm")
                    check_hash = bool(archive_type == "WAL"
                                      and (hash_checks_done < max_hash_checks
                                           or max_hash_checks < 0)
                                      and remote_hash)
                    if archive_type == "WAL" and not remote_hash:
                        # If no hash is available (an old pghoard was running on the previous master), re-upload the
                        # first file that already exists in remote storage without a checksum: it might be the last
                        # WAL file of the previous timeline uploaded by the old master, and invalid because it lacks
                        # the timeline switch event and may contain writes that are not valid for our timeline
                        existing_wal_without_checksum_count += 1
                        if existing_wal_without_checksum_count == 1:
                            self.log.info(
                                "%s file %r already archived but no hash is available, reuploading",
                                archive_type, wal_file)
                            need_archival.append(wal_file)
                            continue
                    if check_hash:
                        hash_checks_done += 1
                        our_hash = self.calculate_hash(
                            os.path.join(wal_dir, wal_file), hash_algorithm)
                        if not our_hash:
                            self.log.info(
                                "%s file %r already archived (file deleted before getting hash)",
                                archive_type, wal_file)
                        elif remote_hash.lower().strip() != our_hash.lower().strip():
                            self.log.warning(
                                "%s file %r already archived but existing hash %r differs from our hash %r, reuploading",
                                archive_type, wal_file, remote_hash, our_hash)
                            need_archival.append(wal_file)
                        else:
                            self.log.info(
                                "%s file %r already archived and has valid hash",
                                archive_type, wal_file)
                    else:
                        self.log.info("%s file %r already archived",
                                      archive_type, wal_file)
                    continue
                self.log.info("%s file %r needs to be archived", archive_type,
                              wal_file)
                need_archival.append(wal_file)

        for wal_file in sorted(need_archival):  # sort oldest to newest
            resp = requests.put("{base}/archive/{file}".format(
                base=self.base_url, file=wal_file))
            archive_type = "TIMELINE" if ".history" in wal_file else "WAL"
            if resp.status_code != 201:
                self.log.error(
                    "%s file %r archival failed with status code %r",
                    archive_type, wal_file, resp.status_code)
            else:
                self.log.info("%s file %r archived", archive_type, wal_file)