def psql_csv_run(sql_command, error_handler=None):
    """
    Runs psql and returns a CSVReader object from the query

    This CSVReader includes header names as the first record in all
    situations.  The output is fully buffered into Python.

    """
    csv_query = ('COPY ({query}) TO STDOUT WITH CSV HEADER;'
                 .format(query=sql_command))

    psql_proc = popen_sp([PSQL_BIN, '-p', os.getenv('WALE_PG_PORT'),
                          '-d', 'postgres', '-c', csv_query],
                         stdout=subprocess.PIPE)
    stdout = psql_proc.communicate()[0]

    if psql_proc.returncode != 0:
        if error_handler is not None:
            error_handler(psql_proc)
        else:
            assert error_handler is None
            raise UserException(
                'could not csv-execute a query successfully via psql',
                'Query was "{query}".'.format(query=sql_command),
                'You may have to set some libpq environment '
                'variables if you are sure the server is running.')

    # Previous code must raise any desired exceptions for non-zero
    # exit codes
    assert psql_proc.returncode == 0

    # Fake enough iterator interface to get a CSV Reader object
    # that works.
    return csv.reader(iter(stdout.strip().split('\n')))

def do_lzo_aes_s3_put(s3_url, path, s3cmd_config_path):
    """
    Synchronous version of the s3-upload wrapper

    Nominally intended to be used through a pool, but exposed here
    for testing and experimentation.

    """
    with tempfile.NamedTemporaryFile(mode='w') as tf:
        compression_p = popen_sp([ENCODE_BIN, path], stdout=tf,
                                 bufsize=BUFSIZE_HT)
        compression_p.wait()
        if compression_p.returncode != 0:
            raise UserCritical(
                'could not properly compress heap file',
                'the heap file is at {path}'.format(path=path))

        # Not to be confused with fsync: the point is to make
        # sure any Python-buffered output is visible to other
        # processes, but *NOT* force a write to disk.
        tf.flush()

        check_call_wait_sigint([S3CMD_BIN, '-c', s3cmd_config_path,
                                'put', tf.name, s3_url + '.lzo.aes'])

def invoke_program():
    with open(os.devnull, 'w') as devnull:
        proc = piper.popen_sp(['python', '--version'],
                              stdout=devnull, stderr=devnull)
        if proc:
            proc.wait()

def start(self):
    if self._process is not None:
        raise StandardError(
            'BUG: Tried to .start on a PipelineCommand twice')

    self._process = popen_sp(self._command,
                             stdin=self._stdin, stdout=self._stdout,
                             bufsize=BUFSIZE_HT, close_fds=True)

def start(self):
    if self._process is not None:
        raise UserCritical(
            'BUG: Tried to .start on a PipelineCommand twice')

    self._process = popen_sp(self._command,
                             stdin=self._stdin, stdout=self._stdout,
                             close_fds=True)

def _read_controldata(self):
    controldata_proc = popen_sp([self._controldata_bin,
                                 self.data_directory], stdout=PIPE)
    stdout = controldata_proc.communicate()[0]

    controldata = {}
    for line in stdout.split("\n"):
        split_values = line.split(":")
        if len(split_values) == 2:
            key, val = split_values
            controldata[key.strip()] = val.strip()

    return controldata

def _read_controldata(self):
    controldata_proc = popen_sp(
        [self._controldata_bin, self.data_directory], stdout=PIPE)
    stdout = controldata_proc.communicate()[0].decode('utf-8')

    controldata = {}
    for line in stdout.split('\n'):
        split_values = line.split(':')
        if len(split_values) == 2:
            key, val = split_values
            controldata[key.strip()] = val.strip()

    return controldata

def external_program_check(to_check=frozenset([PSQL_BIN, LZOP_BIN, PV_BIN])):
    """
    Validates the existence and basic working-ness of other programs

    Implemented because it is easy to get confusing error output when
    one does not install a dependency because of the fork-worker model
    that is both necessary for throughput and makes more obscure the
    cause of failures.

    This is intended to be a time and frustration saving measure.
    This problem has confused The Author in practice when switching
    rapidly between machines.

    """
    could_not_run = []
    error_msgs = []

    def psql_err_handler(popen):
        assert popen.returncode != 0
        error_msgs.append(
            textwrap.fill("Could not get a connection to the database: "
                          "note that superuser access is required")
        )

        # Bogus error message that is re-caught and re-raised
        raise EnvironmentError("INTERNAL: Had problems running psql "
                               "from external_program_check")

    with open(os.devnull, "wb") as nullf:
        for program in to_check:
            try:
                if program is PSQL_BIN:
                    psql_csv_run("SELECT 1",
                                 error_handler=psql_err_handler)
                else:
                    if program is PV_BIN:
                        extra_args = ["--quiet"]
                    else:
                        extra_args = []

                    proc = popen_sp([program] + extra_args,
                                    stdout=nullf, stderr=nullf,
                                    stdin=subprocess.PIPE)

                    # Close stdin for processes that default to
                    # reading from the pipe; the programs WAL-E uses
                    # of this kind will terminate in this case.
                    proc.stdin.close()
                    proc.wait()
            except EnvironmentError:
                could_not_run.append(program)

    if could_not_run:
        error_msgs.append("Could not run the following programs, "
                          "are they installed? " + ", ".join(could_not_run))

    if error_msgs:
        raise UserException(
            "could not run one or more external programs WAL-E depends upon",
            "\n".join(error_msgs))

    return None

def check_call_wait_sigint(*popenargs, **kwargs):
    got_sigint = False
    wait_sigint_proc = None

    try:
        wait_sigint_proc = popen_sp(*popenargs, **kwargs)
    except KeyboardInterrupt, e:
        got_sigint = True
        if wait_sigint_proc is not None:
            wait_sigint_proc.send_signal(signal.SIGINT)
            wait_sigint_proc.wait()
            raise e

def __init__(self, data_directory):
    self.data_directory = data_directory

    pg_config_proc = popen_sp([CONFIG_BIN], stdout=PIPE)
    output = pg_config_proc.communicate()[0]

    for line in output.split('\n'):
        parts = line.split('=')
        if len(parts) != 2:
            continue

        key, val = map(lambda x: x.strip(), parts)
        if key == 'BINDIR':
            self._controldata_bin = os.path.join(val, CONTROLDATA_BIN)
        elif key == 'VERSION':
            self._pg_version = val

def __init__(self, data_directory):
    self.data_directory = data_directory

    pg_config_proc = popen_sp([CONFIG_BIN], stdout=PIPE)
    output = pg_config_proc.communicate()[0]

    for line in output.split("\n"):
        parts = line.split("=")
        if len(parts) != 2:
            continue

        key, val = [x.strip() for x in parts]
        if key == "BINDIR":
            self._controldata_bin = os.path.join(val, CONTROLDATA_BIN)
        elif key == "VERSION":
            self._pg_version = val

def do_lzop_s3_put(s3_url, local_path):
    """
    Compress and upload a given local path.

    :type s3_url: string
    :param s3_url: A s3://bucket/key style URL that is the destination

    :type local_path: string
    :param local_path: a path to a file to be compressed

    """
    assert not s3_url.endswith('.lzo')
    s3_url += '.lzo'

    with tempfile.NamedTemporaryFile(mode='rwb') as tf:
        compression_p = popen_sp([LZOP_BIN, '--stdout', local_path],
                                 stdout=tf, bufsize=BUFSIZE_HT)
        compression_p.wait()
        if compression_p.returncode != 0:
            raise UserCritical(
                'could not properly compress file',
                'the file is at {path}'.format(path=local_path))

        tf.flush()

        logger.info(msg='begin archiving a file',
                    detail=('Uploading "{local_path}" to "{s3_url}".'
                            .format(**locals())))

        clock_start = time.clock()
        k = uri_put_file(s3_url, tf)
        clock_finish = time.clock()

        kib_per_second = format_kib_per_second(clock_start, clock_finish,
                                               k.size)
        logger.info(
            msg='completed archiving to a file ',
            detail=('Archiving to "{s3_url}" complete at '
                    '{kib_per_second}KiB/s. ')
            .format(s3_url=s3_url, kib_per_second=kib_per_second))

def do_partition_put(backup_s3_prefix, tpart_number, tpart, rate_limit,
                     s3cmd_config_path):
    """
    Synchronous version of the s3-upload wrapper

    Nominally intended to be used through a pool, but exposed here
    for testing and experimentation.

    """
    with tempfile.NamedTemporaryFile(mode='w') as tf:
        compression_p = popen_sp([ENCODE_BIN], stdin=subprocess.PIPE,
                                 stdout=tf, bufsize=BUFSIZE_HT)
        tpart.tarfile_write(compression_p.stdin, rate_limit=rate_limit)
        compression_p.stdin.flush()
        compression_p.stdin.close()
        compression_p.wait()
        if compression_p.returncode != 0:
            raise UserCritical(
                'could not properly compress tar partition',
                'The partition failed is {tpart_number}. '
                'It has the following manifest:\n '
                '{error_manifest}'
                .format(error_manifest=tpart.format_manifest(),
                        tpart_number=tpart_number))

        # Not to be confused with fsync: the point is to make
        # sure any Python-buffered output is visible to other
        # processes, but *NOT* force a write to disk.
        tf.flush()

        check_call_wait_sigint(
            [S3CMD_BIN, '-c', s3cmd_config_path, 'put', tf.name,
             '/'.join([backup_s3_prefix, 'tar_partitions',
                       'part_{tpart_number}.tar.lzo.aes'.format(
                           tpart_number=tpart_number)])])

def external_program_check(to_check=frozenset([PSQL_BIN, LZOP_BIN, PV_BIN])):
    """
    Validates the existence and basic working-ness of other programs

    Implemented because it is easy to get confusing error output when
    one does not install a dependency because of the fork-worker model
    that is both necessary for throughput and makes more obscure the
    cause of failures.

    This is intended to be a time and frustration saving measure.
    This problem has confused The Author in practice when switching
    rapidly between machines.

    """
    could_not_run = []
    error_msgs = []

    def psql_err_handler(popen):
        assert popen.returncode != 0
        error_msgs.append(
            textwrap.fill('Could not get a connection to the database: '
                          'note that superuser access is required'))

        # Bogus error message that is re-caught and re-raised
        raise EnvironmentError('INTERNAL: Had problems running psql '
                               'from external_program_check')

    with open(os.devnull, 'wb') as nullf:
        for program in to_check:
            try:
                if program is PSQL_BIN:
                    psql_csv_run('SELECT 1',
                                 error_handler=psql_err_handler)
                else:
                    if program is PV_BIN:
                        extra_args = ['--quiet']
                    else:
                        extra_args = []

                    proc = popen_sp([program] + extra_args,
                                    stdout=nullf, stderr=nullf,
                                    stdin=subprocess.PIPE)

                    # Close stdin for processes that default to
                    # reading from the pipe; the programs WAL-E uses
                    # of this kind will terminate in this case.
                    proc.stdin.close()
                    proc.wait()
            except EnvironmentError:
                could_not_run.append(program)

    if could_not_run:
        error_msgs.append(
            'Could not run the following programs, are they installed? ' +
            ', '.join(could_not_run))

    if error_msgs:
        raise UserException(
            'could not run one or more external programs WAL-E depends upon',
            '\n'.join(error_msgs))

    return None

def start(self, command, stdin, stdout):
    self._command = command
    self._process = popen_sp(command, stdin=stdin, stdout=stdout,
                             bufsize=BUFSIZE_HT, close_fds=True)

def database_s3_fetch(self, pg_cluster_dir, backup_name, pool_size):
    basebackups_prefix = '/'.join(
        [self.s3_prefix, 'basebackups_' + FILE_STRUCTURE_VERSION])

    with self.s3cmd_temp_config as s3cmd_config:

        # Verify sane looking input for backup_name
        if backup_name == 'LATEST':
            # "LATEST" is a special backup name that is always valid
            # to always find the lexically-largest backup, with the
            # intent of getting the freshest database as soon as
            # possible.
            backup_find = popen_sp(
                [S3CMD_BIN, '-c', s3cmd_config.name,
                 'ls', basebackups_prefix + '/'],
                stdout=subprocess.PIPE)
            stdout, stderr = backup_find.communicate()

            sentinel_suffix = '_backup_stop_sentinel.json'
            # Find sentinel files as markers of guaranteed good backups
            sentinel_urls = []

            for line in (l.strip() for l in stdout.split('\n')):
                if line.endswith(sentinel_suffix):
                    sentinel_urls.append(line.split()[-1])

            if not sentinel_urls:
                raise UserException(
                    'no base backups found',
                    'The prefix searched was "{0}"'
                    .format(basebackups_prefix),
                    'Consider checking to make sure that you have taken a '
                    'base backup and that the prefix is correct. '
                    'New versions of WAL-E with new file formats will '
                    'fail to recognize old backups, too.')
            else:
                sentinel_urls.sort()

                # Slice away the extra URL cruft to locate just
                # the base backup name.
                #
                # NB: '... + 1' is for trailing slash
                begin_slice = len(basebackups_prefix) + 1
                end_slice = -len(sentinel_suffix)
                backup_name = sentinel_urls[-1][begin_slice:end_slice]

        base_backup_regexp = (r'base'
                              r'_(?P<segment>[0-9a-zA-Z.]{0,60})'
                              r'_(?P<position>[0-9A-F]{8})')
        match = re.match(base_backup_regexp, backup_name)
        if match is None:
            raise UserException('non-conformant backup name passed',
                                'The invalid name was "{0}"'
                                .format(backup_name))

        assert backup_name != 'LATEST', ('Must be rewritten to the actual '
                                         'name of the last base backup')

        backup_s3_prefix = '/'.join([basebackups_prefix, backup_name])
        backup_s3_cluster_prefix = '/'.join(
            [backup_s3_prefix, 'tar_partitions', ''])

        ls_proc = popen_sp(
            [S3CMD_BIN, '-c', s3cmd_config.name,
             'ls', backup_s3_cluster_prefix],
            stdout=subprocess.PIPE)
        stdout, stderr = ls_proc.communicate()

        pool = multiprocessing.Pool(processes=pool_size)
        results = []
        cluster_abspath = os.path.abspath(pg_cluster_dir)
        try:
            partitions = set()

            for line in stdout.split('\n'):
                # Skip any blank lines
                if not line.strip():
                    continue

                pos = line.rfind('s3://')
                if pos > 0:
                    s3_url = line[pos:]
                    assert s3_url.startswith(backup_s3_cluster_prefix)
                    base, partition_name = s3_url.rsplit('/', 1)
                    match = re.match(r'part_(\d+).tar.lzo.aes',
                                     partition_name)
                    if not match:
                        raise UserCritical(
                            'Malformed tar partition in base backup',
                            'The offensive line from s3cmd is ' + line)
                    else:
                        partition_number = int(match.group(1))
                        assert partition_number not in partitions, \
                            'Must not have duplicates'
                        partitions.add(partition_number)
                else:
                    raise UserCritical(
                        'could not parse s3cmd output',
                        'Could not find "s3://" in {0}'.format(line))

            # Verify that partitions look sane: they should be
            # numbered contiguously from 0 to some number.
            expected_partitions = set(xrange(max(partitions) + 1))
            if partitions != expected_partitions:
                raise UserCritical(
                    'some tar partitions are missing in the base backup',
                    'The following is a list of missing partition '
                    'numbers: ' + unicode(expected_partitions - partitions),
                    'If this is a backup with a sentinel, S3 may be '
                    'inconsistent -- try again later, and note how long '
                    'it takes. If not, there is a bug somewhere.')
            else:
                assert expected_partitions == partitions

                for tpart_number in partitions:
                    results.append(pool.apply_async(
                        worker.do_partition_get,
                        [backup_s3_prefix, cluster_abspath,
                         tpart_number, s3cmd_config.name]))

                pool.close()
        finally:
            # Necessary in case finally block gets hit before
            # .close()
            pool.close()

            while results:
                # XXX: Need timeout to work around Python bug:
                #
                # http://bugs.python.org/issue8296
                results.pop().get(1e100)

            pool.join()

def invoke_program():
    with open(os.devnull, 'w') as devnull:
        piper.popen_sp(['python', '--version'],
                       stdout=devnull, stderr=devnull)

def external_program_check(
        to_check=frozenset([PSQL_BIN, ENCODE_BIN, DECODE_BIN,
                            S3CMD_BIN, MBUFFER_BIN])):
    """
    Validates the existence and basic working-ness of other programs

    Implemented because it is easy to get confusing error output when
    one does not install a dependency because of the fork-worker model
    that is both necessary for throughput and makes more obscure the
    cause of failures.

    This is intended to be a time and frustration saving measure.
    This problem has confused The Author in practice when switching
    rapidly between machines.

    """
    could_not_run = []
    error_msgs = []

    def psql_err_handler(popen):
        assert popen.returncode != 0
        error_msgs.append(textwrap.fill(
            'Could not get a connection to the database: '
            'note that superuser access is required'))

        # Bogus error message that is re-caught and re-raised
        raise Exception('INTERNAL: Had problems running psql '
                        'from external_program_check')

    with open(os.devnull, 'w') as nullf:
        for program in to_check:
            try:
                if program is PSQL_BIN:
                    psql_csv_run('SELECT 1',
                                 error_handler=psql_err_handler)
                else:
                    if program is MBUFFER_BIN:
                        # Prevent noise on the TTY, as mbuffer writes
                        # text there: suppressing stdout/stderr
                        # doesn't seem to work.
                        extra_args = ['-q']
                    else:
                        extra_args = []

                    proc = popen_sp([program] + extra_args,
                                    stdout=nullf, stderr=nullf,
                                    stdin=subprocess.PIPE)

                    # Close stdin for processes that default to
                    # reading from the pipe; the programs WAL-E uses
                    # of this kind will terminate in this case.
                    proc.stdin.close()
                    proc.wait()
            except EnvironmentError:
                could_not_run.append(program)

    if could_not_run:
        error_msgs.append(
            'Could not run the following programs, are they installed? ' +
            ', '.join(could_not_run))

    if error_msgs:
        raise UserException(
            'could not run one or more external programs WAL-E depends upon',
            '\n'.join(error_msgs))

    return None

def do_partition_put(backup_s3_prefix, tpart, rate_limit):
    """
    Synchronous version of the s3-upload wrapper

    """
    logger.info(msg='beginning volume compression',
                detail='Building volume {name}.'.format(name=tpart.name))

    with tempfile.NamedTemporaryFile(mode='rwb') as tf:
        compression_p = popen_sp([LZOP_BIN, '--stdout'],
                                 stdin=subprocess.PIPE, stdout=tf,
                                 bufsize=BUFSIZE_HT)
        tpart.tarfile_write(compression_p.stdin, rate_limit=rate_limit)
        compression_p.stdin.flush()
        compression_p.stdin.close()

        # Poll for process completion, avoid .wait() as to allow other
        # greenlets a chance to execute.  Calling .wait() will result
        # in deadlock.
        while True:
            if compression_p.poll() is not None:
                break
            else:
                # Give other stacks a chance to continue progress
                gevent.sleep(0.1)

        if compression_p.returncode != 0:
            raise UserCritical(
                'could not properly compress tar',
                'The volume that failed is {volume}. '
                'It has the following manifest:\n '
                '{error_manifest}'
                .format(error_manifest=tpart.format_manifest(),
                        volume=tpart.name))
        tf.flush()

        s3_url = '/'.join([backup_s3_prefix, 'tar_partitions',
                           'part_{number}.tar.lzo'
                           .format(number=tpart.name)])

        logger.info(
            msg='begin uploading a base backup volume',
            detail=('Uploading to "{s3_url}".')
            .format(s3_url=s3_url))

        def log_volume_failures_on_error(exc_tup, exc_processor_cxt):
            def standard_detail_message(prefix=''):
                return (prefix + ' There have been {n} attempts to send the '
                        'volume {name} so far.'.format(n=exc_processor_cxt,
                                                       name=tpart.name))

            typ, value, tb = exc_tup
            del exc_tup

            # Screen for certain kinds of known-errors to retry from
            if issubclass(typ, socket.error):
                socketmsg = value[1] if isinstance(value, tuple) else value

                logger.info(
                    msg='Retrying send because of a socket error',
                    detail=standard_detail_message(
                        "The socket error's message is '{0}'."
                        .format(socketmsg)))
            elif (issubclass(typ, boto.exception.S3ResponseError) and
                  value.error_code == 'RequestTimeTooSkewed'):
                logger.info(msg='Retrying send because of a Request Skew time',
                            detail=standard_detail_message())
            else:
                # This type of error is unrecognized as a retry-able
                # condition, so propagate it, original stacktrace and
                # all.
                raise typ, value, tb

        @retry(retry_with_count(log_volume_failures_on_error))
        def put_file_helper():
            return uri_put_file(s3_url, tf)

        # Actually do work, retrying if necessary, and timing how long
        # it takes.
        clock_start = time.clock()
        k = put_file_helper()
        clock_finish = time.clock()

        kib_per_second = format_kib_per_second(clock_start, clock_finish,
                                               k.size)
        logger.info(
            msg='finish uploading a base backup volume',
            detail=('Uploading to "{s3_url}" complete at '
                    '{kib_per_second}KiB/s. ')
            .format(s3_url=s3_url, kib_per_second=kib_per_second))

def do_partition_put(backup_s3_prefix, tpart, rate_limit):
    """
    Synchronous version of the s3-upload wrapper

    """
    logger.info(msg='beginning volume compression')

    with tempfile.NamedTemporaryFile(mode='rwb') as tf:
        compression_p = popen_sp([LZOP_BIN, '--stdout'],
                                 stdin=subprocess.PIPE, stdout=tf,
                                 bufsize=BUFSIZE_HT)
        tpart.tarfile_write(compression_p.stdin, rate_limit=rate_limit)
        compression_p.stdin.flush()
        compression_p.stdin.close()

        # Poll for process completion, avoid .wait() as to allow other
        # greenlets a chance to execute.  Calling .wait() will result
        # in deadlock.
        while True:
            if compression_p.poll() is not None:
                break
            else:
                # Give other stacks a chance to continue progress
                gevent.sleep(0.1)

        if compression_p.returncode != 0:
            raise UserCritical(
                'could not properly compress tar',
                'The volume that failed is {volume}. '
                'It has the following manifest:\n '
                '{error_manifest}'
                .format(error_manifest=tpart.format_manifest(),
                        volume=tpart.name))
        tf.flush()

        s3_url = '/'.join([backup_s3_prefix, 'tar_partitions',
                           'part_{number}.tar.lzo'
                           .format(number=tpart.name)])

        logger.info(
            msg='begin uploading a base backup volume',
            detail=('Uploading to "{s3_url}".')
            .format(s3_url=s3_url))

        def put_volume_exception_processor(exc_tup, exc_processor_cxt):
            typ, value, tb = exc_tup

            # Screen for certain kinds of known-errors to retry
            # from
            if issubclass(typ, socket.error):
                # This branch is for conditions that are retry-able.
                if value[0] == errno.ECONNRESET:
                    # "Connection reset by peer"
                    if exc_processor_cxt is None:
                        exc_processor_cxt = 1
                    else:
                        exc_processor_cxt += 1

                    logger.info(
                        msg='Connection reset detected, retrying send',
                        detail=('There have been {n} attempts to send the '
                                'volume {name} so far.'
                                .format(n=exc_processor_cxt,
                                        name=tpart.name)))

                    return exc_processor_cxt
            else:
                # This type of error is unrecognized as a
                # retry-able condition, so propagate it, original
                # stacktrace and all.
                raise typ, value, tb

        @retry(put_volume_exception_processor)
        def put_file_helper():
            return uri_put_file(s3_url, tf)

        # Actually do work, retrying if necessary, and timing how long
        # it takes.
        clock_start = time.clock()
        k = put_file_helper()
        clock_finish = time.clock()

        kib_per_second = compute_kib_per_second(clock_start, clock_finish,
                                                k.size)
        logger.info(
            msg='finish uploading a base backup volume',
            detail=('Uploading to "{s3_url}" complete at '
                    '{kib_per_second:02g}KiB/s. ')
            .format(s3_url=s3_url, kib_per_second=kib_per_second))

def __init__(self, stdin=PIPE, stdout=PIPE):
    self._decompression_p = popen_sp(
        [LZOP_BIN, '-d', '--stdout', '-'],
        stdin=stdin, stdout=stdout, bufsize=BUFSIZE_HT)