def _s3_upload_pg_cluster_dir(self, start_backup_info, pg_cluster_dir,
                              version, pool_size, rate_limit=None):
    """
    Upload to s3_url_prefix from pg_cluster_dir

    This function ignores the directory pg_xlog, which contains WAL
    files that are not generally part of a base backup.

    Note that this also openssl-encrypts and compresses the files:
    thus, each pooled process performs a full sequential scan of the
    uncompressed Postgres heap file, pipelined into openssl.  Only
    once openssl is completely finished (necessary to have access to
    the compressed file size) is the file sent to S3.

    TODO: Investigate an optimization to decouple the compression and
    upload steps to make sure that the most efficient possible use of
    pipelining of network and disk resources occurs.  Right now it is
    possible to bounce back and forth between bottlenecking on
    reading from the database block device and subsequently the S3
    sending steps should the processes be at the same stage of the
    upload pipeline: this can have a very negative impact on being
    able to make full use of system resources.

    Furthermore, it is desirable to avoid overflowing the page cache:
    having separate tunables for the number of simultaneous
    compression jobs (which occupy /tmp space and page cache) and the
    number of uploads (which affect upload throughput) would help.

    """
    # Get a manifest of files first.
    matches = []

    def raise_walk_error(e):
        raise e

    walker = os.walk(pg_cluster_dir, onerror=raise_walk_error)
    for root, dirnames, filenames in walker:
        is_cluster_toplevel = (os.path.abspath(root) ==
                               os.path.abspath(pg_cluster_dir))

        # Do not capture any WAL files, although we do want to
        # capture the WAL directory or symlink
        if is_cluster_toplevel:
            if 'pg_xlog' in dirnames:
                dirnames.remove('pg_xlog')
                matches.append(os.path.join(root, 'pg_xlog'))

        for filename in filenames:
            if is_cluster_toplevel and filename in ('server.key',
                                                    'server.crt',
                                                    'postmaster.pid',
                                                    'postgresql.conf'):
                # Do not include the postmaster pid file, the SSL key
                # and certificate, or the configuration file in the
                # backup.
                pass
            else:
                matches.append(os.path.join(root, filename))

        # Special case for empty directories
        if not filenames:
            matches.append(root)

    backup_s3_prefix = ('{0}/basebackups_{1}/'
                        'base_{file_name}_{file_offset}'
                        .format(self.s3_prefix, FILE_STRUCTURE_VERSION,
                                **start_backup_info))

    # absolute upload paths are used for telling openssl what to compress
    local_abspaths = [os.path.abspath(match) for match in matches]

    # computed to subtract out extra extraneous absolute path
    # information when storing on S3
    common_local_prefix = os.path.commonprefix(local_abspaths)

    partitions = tar_partition.tar_partitions_plan(
        common_local_prefix, local_abspaths,

        # 1610612736 bytes == 1.5 gigabytes, per partition,
        # non-tunable
        1610612736)

    # A multiprocessing pool to do the uploads with
    pool = multiprocessing.Pool(processes=pool_size)

    if rate_limit is None:
        per_process_limit = None
    else:
        per_process_limit = int(rate_limit / pool_size)

    # Reject tiny per-process rate limits.  They should be
    # rejected more nicely elsewhere.
    assert per_process_limit is None or per_process_limit > 0

    # a list to accumulate async upload jobs
    uploads = []

    total_size = 0
    with self.s3cmd_temp_config as s3cmd_config:

        # Make an attempt to upload extended version metadata
        with tempfile.NamedTemporaryFile(mode='w') as version_tempf:
            version_tempf.write(unicode(version))
            version_tempf.flush()

            check_call_wait_sigint(
                [S3CMD_BIN, '-c', s3cmd_config.name,
                 '--mime-type=text/plain', 'put',
                 version_tempf.name,
                 backup_s3_prefix + '/extended_version.txt'])

        # Enqueue uploads for parallel execution
        try:
            for tpart_number, tpart in enumerate(partitions):
                total_size += tpart.total_member_size
                uploads.append(pool.apply_async(
                    worker.do_partition_put,
                    [backup_s3_prefix, tpart_number, tpart,
                     per_process_limit, s3cmd_config.name]))

            pool.close()
        finally:
            # Necessary in case the finally block gets hit before
            # .close()
            pool.close()

            while uploads:
                # XXX: Need timeout to work around Python bug:
                #
                # http://bugs.python.org/issue8296
                uploads.pop().get(1e100)

            pool.join()

    return backup_s3_prefix, total_size
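
# A minimal standalone sketch of the rate-limit split used above, assuming
# the same semantics (one overall bytes-per-second budget divided evenly
# across the upload pool); the helper name and the example numbers are
# hypothetical and only illustrate the arithmetic.
def _sketch_per_process_limit(rate_limit, pool_size):
    if rate_limit is None:
        return None
    per_process_limit = int(rate_limit / pool_size)
    # e.g. a 20 MiB/s budget over a pool of 4 processes gives 5 MiB/s per
    # process; a budget smaller than pool_size bytes per second floors to
    # zero, which is the "tiny per-process rate limit" the assert above
    # rejects.
    assert per_process_limit > 0
    return per_process_limit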
def _s3_upload_pg_cluster_dir(self, start_backup_info, pg_cluster_dir,
                              version, pool_size, rate_limit=None):
    """
    Upload to s3_url_prefix from pg_cluster_dir

    This function ignores the directory pg_xlog, which contains WAL
    files that are not generally part of a base backup.

    Note that this also lzo-compresses the files: thus, each pooled
    process performs a full sequential scan of the uncompressed
    Postgres heap file, pipelined into lzo.  Only once lzo is
    completely finished (necessary to have access to the compressed
    file size) is the file sent to S3.

    TODO: Investigate an optimization to decouple the compression and
    upload steps to make sure that the most efficient possible use of
    pipelining of network and disk resources occurs.  Right now it is
    possible to bounce back and forth between bottlenecking on
    reading from the database block device and subsequently the S3
    sending steps should the processes be at the same stage of the
    upload pipeline: this can have a very negative impact on being
    able to make full use of system resources.

    Furthermore, it is desirable to avoid overflowing the page cache:
    having separate tunables for the number of simultaneous
    compression jobs (which occupy /tmp space and page cache) and the
    number of uploads (which affect upload throughput) would help.

    """
    # Get a manifest of files first.
    matches = []

    def raise_walk_error(e):
        raise e

    walker = os.walk(pg_cluster_dir, onerror=raise_walk_error)
    for root, dirnames, filenames in walker:
        is_cluster_toplevel = (os.path.abspath(root) ==
                               os.path.abspath(pg_cluster_dir))

        # Do not capture any WAL files, although we do want to
        # capture the WAL directory or symlink
        if is_cluster_toplevel:
            if 'pg_xlog' in dirnames:
                dirnames.remove('pg_xlog')
                matches.append(os.path.join(root, 'pg_xlog'))

        for filename in filenames:
            if is_cluster_toplevel and filename in ('postmaster.pid',
                                                    'postgresql.conf'):
                # Do not include the postmaster pid file or the
                # configuration file in the backup.
                pass
            else:
                matches.append(os.path.join(root, filename))

        # Special case for empty directories
        if not filenames:
            matches.append(root)

    backup_s3_prefix = ('{0}/basebackups_{1}/'
                        'base_{file_name}_{file_offset}'
                        .format(self.s3_prefix, FILE_STRUCTURE_VERSION,
                                **start_backup_info))

    # absolute upload paths are used for telling lzop what to compress
    local_abspaths = [os.path.abspath(match) for match in matches]

    # computed to subtract out extra extraneous absolute path
    # information when storing on S3
    common_local_prefix = os.path.commonprefix(local_abspaths)

    partitions = tar_partition.tar_partitions_plan(
        common_local_prefix, local_abspaths,

        # 1610612736 bytes == 1.5 gigabytes, per partition,
        # non-tunable
        1610612736)

    if rate_limit is None:
        per_process_limit = None
    else:
        per_process_limit = int(rate_limit / pool_size)

    # Reject tiny per-process rate limits.  They should be
    # rejected more nicely elsewhere.
    assert per_process_limit is None or per_process_limit > 0

    # a list to accumulate async upload jobs
    uploads = []

    total_size = 0

    # Make an attempt to upload extended version metadata
    extended_version_url = backup_s3_prefix + '/extended_version.txt'
    logger.info(
        msg='start upload postgres version metadata',
        detail=('Uploading to {extended_version_url}.'
                .format(extended_version_url=extended_version_url)))
    s3_worker.uri_put_file(extended_version_url, StringIO(version),
                           content_encoding='text/plain')
    logger.info(msg='postgres version metadata upload complete')

    pool = gevent.pool.Pool(size=pool_size)

    # Enqueue uploads for parallel execution
    try:
        for tpart in partitions:
            total_size += tpart.total_member_size
            uploads.append(pool.apply_async(
                s3_worker.do_partition_put,
                [backup_s3_prefix, tpart, per_process_limit,
                 self.gpg_key_id]))
    finally:
        while uploads:
            uploads.pop().get()

        pool.join()

    return backup_s3_prefix, total_size
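
# The TODO in the docstring above suggests decoupling compression from
# upload so that the two stages get independent concurrency tunables.  The
# sketch below only illustrates that idea under gevent and is not part of
# this module: compress_partition() and upload_compressed() are
# hypothetical stand-ins for the tar/lzo pipeline and the S3 put, and the
# tunable names are invented.
def _sketch_pipelined_partition_put(partitions, backup_s3_prefix,
                                    compression_jobs=2, upload_jobs=8):
    compressors = gevent.pool.Pool(size=compression_jobs)
    uploaders = gevent.pool.Pool(size=upload_jobs)

    def compress_then_enqueue(tpart):
        # Hypothetical helper: run the tar + lzo pipeline for one
        # partition and return a handle to the compressed temporary file.
        compressed = compress_partition(tpart)

        # Hand off to the upload pool; spawn() blocks while all
        # upload_jobs slots are busy, so at most
        # compression_jobs + upload_jobs compressed partitions occupy
        # /tmp and the page cache at any one time.
        uploaders.spawn(upload_compressed, backup_s3_prefix, compressed)

    for tpart in partitions:
        # Blocks while all compression_jobs slots are busy, bounding the
        # sequential-scan load on the database block device independently
        # of upload throughput.
        compressors.spawn(compress_then_enqueue, tpart)

    compressors.join()
    uploaders.join()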