Example 1
def test_dynamically_emptied_directories(tmpdir):
    """Ensure empty directories in the base backup are created

    This matters particularly when PostgreSQL empties the files in
    those directories in parallel.  This emptying can happen after the
    files are partitioned into their tarballs but before the tar and
    upload process is complete.

    """
    # Create a directory structure with a file in it, e.g:
    #
    #   ./adir/bdir/afile
    adir = tmpdir.join('adir').ensure(dir=True)
    bdir = adir.join('bdir').ensure(dir=True)
    some_file = bdir.join('afile')
    some_file.write('1234567890')

    # Generate the partition for the tar files
    base_dir = adir.strpath
    spec, parts = tar_partition.partition(base_dir)
    tar_paths = []
    for part in parts:
        for tar_info in part:
            rel_path = os.path.relpath(tar_info.submitted_path, base_dir)
            tar_paths.append(rel_path)

    # Ensure the "bdir" directory is included in the partition so
    # "bdir" is created even if postgres removes "afile" during the
    # tarring process.
    assert 'bdir' in tar_paths
Example 2
def test_dynamically_emptied_directories(tmpdir):
    '''
    Ensure we create directories in the base backup even when PostgreSQL
    empties the files in those directories.  This emptying can happen after we
    partition the files into their tarballs but before the tar and upload
    process is complete.
    '''
    # Create a directory structure with a file in it:
    #   adir/bdir/afile
    adir = tmpdir.join('adir').ensure(dir=True)
    bdir = adir.join('bdir').ensure(dir=True)
    some_file = bdir.join('afile')
    some_file.write('1234567890')
    # Generate the partition for the tar files
    base_dir = adir.strpath
    spec, parts = tar_partition.partition(base_dir)
    tar_paths = []
    for part in parts:
        for tar_info in part:
            rel_path = os.path.relpath(tar_info.submitted_path, base_dir)
            tar_paths.append(rel_path)
    # Ensure we include the bdir directory in the partition so we'll still
    # create bdir even if postgres removes afile during the tar process
    assert 'bdir' in tar_paths
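For readers who want to exercise the partition API outside pytest, here is a minimal standalone sketch. It assumes the module import path wal_e.tar_partition and the two-value (spec, parts) return shown in the examples above; the directory layout is a hypothetical stand-in for the test fixture.

import os
import tempfile

from wal_e import tar_partition  # assumed import path

# Hypothetical layout mirroring the test above: <base>/bdir/afile
base_dir = tempfile.mkdtemp()
os.makedirs(os.path.join(base_dir, 'bdir'))
with open(os.path.join(base_dir, 'bdir', 'afile'), 'w') as f:
    f.write('1234567890')

spec, parts = tar_partition.partition(base_dir)
submitted = [os.path.relpath(ti.submitted_path, base_dir)
             for part in parts for ti in part]

# The directory entry itself is part of the partition, so 'bdir' can be
# recreated on restore even if 'afile' vanishes before tarring.
assert 'bdir' in submitted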
Example 3
    def _upload_pg_cluster_dir(self, start_backup_info, pg_cluster_dir,
                               version, pool_size, rate_limit=None):
        """
        Upload to url_prefix from pg_cluster_dir

        This function ignores the directory pg_xlog, which contains WAL
        files that are not generally part of a base backup.

        Note that this also lzo-compresses the files: each pooled process
        does a full sequential scan of an uncompressed Postgres heap file,
        pipelined into lzo. Once lzo is completely finished (necessary to
        have access to the file size) the file is sent to S3 or WABS.

        TODO: Investigate an optimization to decouple the compression and
        upload steps to make sure that the most efficient possible use of
        pipelining of network and disk resources occurs.  Right now it is
        possible to bounce back and forth between bottlenecking on reading
        from the database block device and subsequently the S3/WABS sending
        steps should the processes be at the same stage of the upload
        pipeline: this can have a very negative impact on being able to
        make full use of system resources.

        Furthermore, it is desirable to avoid overflowing the page cache:
        having separate tunables for the number of simultaneous compression
        jobs (which occupy /tmp space and page cache) and the number of
        uploads (which affect upload throughput) would help.

        """
        spec, parts = tar_partition.partition(pg_cluster_dir)

        # TODO :: Move arbitrary path construction to StorageLayout Object
        backup_prefix = '{0}/basebackups_{1}/base_{file_name}_{file_offset}'\
                .format(self.layout.prefix.rstrip('/'), FILE_STRUCTURE_VERSION,
                        **start_backup_info)

        if rate_limit is None:
            per_process_limit = None
        else:
            per_process_limit = int(rate_limit / pool_size)

        # Reject tiny per-process rate limits.  They should be
        # rejected more nicely elsewhere.
        assert per_process_limit is None or per_process_limit > 0

        total_size = 0

        # Make an attempt to upload extended version metadata
        extended_version_url = backup_prefix + '/extended_version.txt'
        logger.info(
            msg='start upload postgres version metadata',
            detail=('Uploading to {extended_version_url}.'
                    .format(extended_version_url=extended_version_url)))
        uri_put_file(self.creds,
                     extended_version_url, StringIO(version),
                     content_encoding='text/plain')

        logger.info(msg='postgres version metadata upload complete')

        uploader = PartitionUploader(self.creds, backup_prefix,
                                     per_process_limit, self.gpg_key_id)

        pool = TarUploadPool(uploader, pool_size)

        # Enqueue uploads for parallel execution
        for tpart in parts:
            total_size += tpart.total_member_size

            # 'put' can raise an exception for a just-failed upload,
            # aborting the process.
            pool.put(tpart)

        # Wait for remaining parts to upload.  An exception can be
        # raised to signal failure of the upload.
        pool.join()

        return spec, backup_prefix, total_size
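As an aside, the backup_prefix template above mixes positional and keyword substitution, with the keyword fields unpacked from start_backup_info. A small illustration with hypothetical prefix, version, and WAL-position values:

# All values here are hypothetical; start_backup_info only needs
# 'file_name' and 'file_offset' keys for the template to resolve.
start_backup_info = {'file_name': '000000010000000000000002',
                     'file_offset': '00000040'}
backup_prefix = '{0}/basebackups_{1}/base_{file_name}_{file_offset}'.format(
    's3://my-bucket/pg', '005', **start_backup_info)
# -> 's3://my-bucket/pg/basebackups_005/base_000000010000000000000002_00000040'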
Example 4
    def _upload_pg_cluster_dir(self,
                               start_backup_info,
                               pg_cluster_dir,
                               version,
                               pool_size,
                               rate_limit=None):
        """
        Upload to url_prefix from pg_cluster_dir

        This function ignores the directory pg_xlog, which contains WAL
        files that are not generally part of a base backup.

        Note that this also lzo-compresses the files: each pooled process
        does a full sequential scan of an uncompressed Postgres heap file,
        pipelined into lzo. Once lzo is completely finished (necessary to
        have access to the file size) the file is sent to S3 or WABS.

        TODO: Investigate an optimization to decouple the compression and
        upload steps to make sure that the most efficient possible use of
        pipelining of network and disk resources occurs.  Right now it is
        possible to bounce back and forth between bottlenecking on reading
        from the database block device and subsequently the S3/WABS sending
        steps should the processes be at the same stage of the upload
        pipeline: this can have a very negative impact on being able to
        make full use of system resources.

        Furthermore, it is desirable to avoid overflowing the page cache:
        having separate tunables for the number of simultaneous compression
        jobs (which occupy /tmp space and page cache) and the number of
        uploads (which affect upload throughput) would help.

        """
        spec, parts = tar_partition.partition(pg_cluster_dir)

        # TODO :: Move arbitrary path construction to StorageLayout Object
        backup_prefix = '{0}/basebackups_{1}/base_{file_name}_{file_offset}'\
            .format(self.layout.prefix.rstrip('/'), FILE_STRUCTURE_VERSION,
                    **start_backup_info)

        if rate_limit is None:
            per_process_limit = None
        else:
            per_process_limit = int(rate_limit / pool_size)

        # Reject tiny per-process rate limits.  They should be
        # rejected more nicely elsewhere.
        assert per_process_limit is None or per_process_limit > 0

        total_size = 0

        # Make an attempt to upload extended version metadata
        extended_version_url = backup_prefix + '/extended_version.txt'
        logger.info(msg='start upload postgres version metadata',
                    detail=('Uploading to {extended_version_url}.'.format(
                        extended_version_url=extended_version_url)))
        uri_put_file(self.creds,
                     extended_version_url,
                     BytesIO(version.encode("utf8")),
                     content_type='text/plain')

        logger.info(msg='postgres version metadata upload complete')

        uploader = PartitionUploader(self.creds, backup_prefix,
                                     per_process_limit, self.gpg_key_id)

        pool = TarUploadPool(uploader, pool_size)

        # Enqueue uploads for parallel execution
        for tpart in parts:
            total_size += tpart.total_member_size

            # 'put' can raise an exception for a just-failed upload,
            # aborting the process.
            pool.put(tpart)

        # Wait for remaining parts to upload.  An exception can be
        # raised to signal failure of the upload.
        pool.join()

        return spec, backup_prefix, total_size
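A side note on the per-process limit computed above: int(rate_limit / pool_size) truncates toward zero, so a total limit smaller than the pool size yields 0, which the assertion is meant to reject. A tiny illustration with made-up numbers:

# Hypothetical numbers: ~10 MiB/s split across 4 upload processes.
rate_limit, pool_size = 10 * 1024 * 1024, 4
per_process_limit = int(rate_limit / pool_size)
assert per_process_limit == 2621440  # 2.5 MiB/s per process

# A total limit below the pool size truncates to zero, the case the
# assertion in the method above guards against.
assert int(3 / 4) == 0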
Example 5
    def _s3_upload_pg_cluster_dir(self, start_backup_info, pg_cluster_dir,
                                  version, pool_size, rate_limit=None):
        """
        Upload to s3_url_prefix from pg_cluster_dir

        This function ignores the directory pg_xlog, which contains WAL
        files that are not generally part of a base backup.

        Note that this also lzo-compresses the files: each pooled process
        does a full sequential scan of an uncompressed Postgres heap file,
        pipelined into lzo. Once lzo is completely finished (necessary to
        have access to the file size) the file is sent to S3.

        TODO: Investigate an optimization to decouple the compression and
        upload steps to make sure that the most efficient possible use of
        pipelining of network and disk resources occurs.  Right now it is
        possible to bounce back and forth between bottlenecking on reading
        from the database block device and subsequently the S3 sending
        steps should the processes be at the same stage of the upload
        pipeline: this can have a very negative impact on being able to
        make full use of system resources.

        Furthermore, it is desirable to avoid overflowing the page cache:
        having separate tunables for the number of simultaneous compression
        jobs (which occupy /tmp space and page cache) and the number of
        uploads (which affect upload throughput) would help.

        """
        parts = tar_partition.partition(pg_cluster_dir)

        backup_s3_prefix = ('{0}/basebackups_{1}/'
                            'base_{file_name}_{file_offset}'
                            .format(self.s3_prefix, FILE_STRUCTURE_VERSION,
                                    **start_backup_info))

        if rate_limit is None:
            per_process_limit = None
        else:
            per_process_limit = int(rate_limit / pool_size)

        # Reject tiny per-process rate limits.  They should be
        # rejected more nicely elsewhere.
        assert per_process_limit is None or per_process_limit > 0

        # a list to accumulate async upload jobs
        uploads = []

        total_size = 0

        # Make an attempt to upload extended version metadata
        extended_version_url = backup_s3_prefix + '/extended_version.txt'
        logger.info(
            msg='start upload postgres version metadata',
            detail=('Uploading to {extended_version_url}.'
                    .format(extended_version_url=extended_version_url)))
        s3_worker.uri_put_file(extended_version_url, StringIO(version),
                               content_encoding='text/plain')
        logger.info(msg='postgres version metadata upload complete')

        pool = gevent.pool.Pool(size=pool_size)

        # Enqueue uploads for parallel execution
        try:
            for tpart in parts:
                total_size += tpart.total_member_size
                uploads.append(pool.apply_async(
                        s3_worker.do_partition_put,
                        [backup_s3_prefix, tpart, per_process_limit,
                         self.gpg_key_id]))
        finally:
            while uploads:
                uploads.pop().get()

            pool.join()

        return backup_s3_prefix, total_size
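The try/finally around the gevent pool above is what surfaces worker failures: each Greenlet returned by apply_async is drained with .get(), which re-raises any exception from the worker. A minimal, self-contained sketch of the same enqueue-then-drain pattern with a stand-in worker (plain gevent, no WAL-E code):

import gevent
import gevent.pool


def do_work(n):
    # Stand-in for s3_worker.do_partition_put; returns a fake partition size.
    gevent.sleep(0)
    return n * n


pool = gevent.pool.Pool(size=2)
jobs = []
total = 0
try:
    for n in range(5):
        jobs.append(pool.apply_async(do_work, [n]))
finally:
    # Greenlet.get() blocks until the job finishes and re-raises any
    # exception it raised, so a failed job is never silently dropped.
    while jobs:
        total += jobs.pop().get()
    pool.join()

print(total)  # 0 + 1 + 4 + 9 + 16 = 30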