def open_checkpoint(cls, filename, force=False, state=None):
    """Acquire a locked checkpoint stream.

    Calls safe_mkdir on the parent directory of ``filename`` and then tries to
    lock the file in "a+" mode. A lock_file result of None or False is treated
    as "lock not acquired".

    When the lock is not acquired and ``force`` is true, the current runner is
    killed via ``cls.kill_runner`` (using ``state``, or the state loaded from
    ``filename`` when none was given) and, if that succeeds, the lock is
    requested again in blocking mode. Without ``force`` the failure is only
    logged.

    Returns:
      A ThriftRecordWriter wrapping the locked file, with sync enabled.

    Raises:
      cls.PermissionError: if the lock could not be obtained.
    """
    safe_mkdir(os.path.dirname(filename))
    stream = lock_file(filename, "a+")

    if stream in (None, False):
        if not force:
            log.error('Found existing runner, cannot take control.')
        else:
            log.info('Found existing runner, forcing leadership forfeit.')
            state = state or CheckpointDispatcher.from_file(filename)
            if cls.kill_runner(state):
                log.info('Successfully killed leader.')
                # TODO(wickman) Blocking may not be the best idea here. Perhaps block up to
                # a maximum timeout. But blocking is necessary because os.kill does not
                # immediately release the lock if we're in force mode.
                stream = lock_file(filename, "a+", blocking=True)

    if stream in (None, False):
        details = (filename, stream)
        raise cls.PermissionError(
            'Could not open locked checkpoint: %s, lock_file = %s' % details)

    writer = ThriftRecordWriter(stream)
    writer.set_sync(True)
    return writer
def _setup_ckpt(self):
    """Set up the checkpoint: must be run on the parent.

    Locks the checkpoint file in "a+" mode, records its current size in
    ``self._ckpt_head``, seeks the handle to that offset and stores a
    sync-enabled ThriftRecordWriter over it in ``self._ckpt``.

    Raises:
      self.CheckpointError: if lock_file returns None or False.
    """
    self._log('initializing checkpoint file: %s' % self.ckpt_file())

    locked_fp = lock_file(self.ckpt_file(), "a+")
    if locked_fp in (None, False):
        failure = 'Could not acquire checkpoint permission or lock for %s!' % self.ckpt_file()
        raise self.CheckpointError(failure)

    # Remember where the file ended before this process wrote anything.
    self._ckpt_head = os.path.getsize(self.ckpt_file())
    locked_fp.seek(self._ckpt_head)

    self._ckpt = ThriftRecordWriter(locked_fp)
    self._ckpt.set_sync(True)
def test_basic_thriftrecordwriter_write():
    """A single record written by ThriftRecordWriter reads back equal."""
    expected = StringType("hello world")

    with EphemeralFile('w') as tmp:
        path = tmp.name

        writer = ThriftRecordWriter(tmp)
        writer.write(expected)
        writer.close()

        # Read back while still inside the EphemeralFile context.
        with open(path) as source:
            reader = ThriftRecordReader(source, StringType)
            actual = reader.read()
            assert actual == expected
def test_thrift_recordwriter_type_mismatch():
    """Reading a StringType record as IntType yields a default IntType()."""
    written = StringType("hello world")

    with EphemeralFile('w') as tmp:
        path = tmp.name

        writer = ThriftRecordWriter(tmp)
        writer.write(written)
        writer.close()

        with open(path) as source:
            reader = ThriftRecordReader(source, IntType)
            # This is a peculiar behavior of Thrift in that it just returns
            # ThriftType() with no serialization applied
            decoded = reader.read()
            assert decoded == IntType()
def test_paranoid_thrift_append_framing():
    """Two records appended by filename read back in order."""
    first = StringType("hello world")
    second = StringType("ahoy ahoy, bonjour")

    with EphemeralFile('w') as tmp:
        path = tmp.name

        # ThriftRecordWriter.append is given the path, not the open handle.
        for record in (first, second):
            ThriftRecordWriter.append(path, record)

        with open(path) as source:
            reader = ThriftRecordReader(source, StringType)
            assert reader.read() == first
            assert reader.read() == second
def test_thriftrecordreader_iteration():
    """Iterating a ThriftRecordReader yields every written record in order."""
    first = StringType("hello world")
    second = StringType("ahoy ahoy, bonjour")

    with EphemeralFile('w') as tmp:
        path = tmp.name

        writer = ThriftRecordWriter(tmp)
        writer.write(first)
        writer.write(second)
        writer.close()

        with open(path) as source:
            reader = ThriftRecordReader(source, StringType)
            seen = [record for record in reader]
            assert seen == [first, second]
def test_thriftrecordwriter_framing():
    """A record appended through a second writer reads back after the first."""
    first = StringType("hello world")
    second = StringType("ahoy ahoy, bonjour")

    with EphemeralFile('w') as tmp:
        path = tmp.name

        initial_writer = ThriftRecordWriter(tmp)
        initial_writer.write(first)
        initial_writer.close()

        # Reopen the same path in append mode and add the second record.
        with open(path, 'a') as appender:
            append_writer = ThriftRecordWriter(appender)
            append_writer.write(second)

        with open(path) as source:
            reader = ThriftRecordReader(source, StringType)
            assert reader.read() == first
            assert reader.read() == second
def open_checkpoint(cls, filename, force=False, state=None):
    """Acquire a locked checkpoint stream.

    Calls safe_mkdir on the parent directory of ``filename`` and attempts a
    lock on the file in "a+" mode; a lock_file result of None or False counts
    as failure. On failure with ``force`` set, the existing runner is killed
    through ``cls.kill_runner`` and, if that reports success, the lock is
    requested again with ``blocking=True``. On failure without ``force`` an
    error is logged.

    Returns:
      A ThriftRecordWriter over the locked file with sync turned on.

    Raises:
      cls.PermissionError: if no lock is held once the attempts are done.
    """
    safe_mkdir(os.path.dirname(filename))
    handle = lock_file(filename, "a+")
    contended = handle in (None, False)

    if contended and not force:
        log.error("Found existing runner, cannot take control.")
    elif contended:
        log.info("Found existing runner, forcing leadership forfeit.")
        state = state or CheckpointDispatcher.from_file(filename)
        if cls.kill_runner(state):
            log.info("Successfully killed leader.")
            # TODO(wickman) Blocking may not be the best idea here. Perhaps block up to
            # a maximum timeout. But blocking is necessary because os.kill does not
            # immediately release the lock if we're in force mode.
            handle = lock_file(filename, "a+", blocking=True)

    if handle in (None, False):
        message = "Could not open locked checkpoint: %s, lock_file = %s" % (filename, handle)
        raise cls.PermissionError(message)

    checkpoint_writer = ThriftRecordWriter(handle)
    checkpoint_writer.set_sync(True)
    return checkpoint_writer
class ProcessBase(object):
    """Encapsulate a running process for a task.

    ``start`` forks a coordinator through the supplied platform. Parent and
    child hand off through the process checkpoint file: the parent locks the
    file before forking and appends a FORKED record afterwards; the child polls
    the file until it reads that record and then calls ``execute``. Subclasses
    implement ``execute`` and may override ``finish``.
    """

    class Error(Exception):
        """Base class for errors raised by ProcessBase."""

    class UnknownUserError(Error):
        """A uid or user name could not be resolved through ``pwd``."""

    class CheckpointError(Error):
        """The checkpoint stream could not be locked, read or validated."""

    class UnspecifiedSandbox(Error):
        """Declared for callers/subclasses; not raised within this class."""

    class PermissionError(Error):
        """Running as another user was requested without an effective uid of 0."""

    # How long the child sleeps between polls of the checkpoint file, and the
    # total time it is willing to wait for the parent's FORKED record.
    CONTROL_WAIT_CHECK_INTERVAL = Amount(100, Time.MILLISECONDS)
    MAXIMUM_CONTROL_WAIT = Amount(1, Time.MINUTES)

    def __init__(self, name, cmdline, sequence, pathspec, sandbox_dir, user=None, platform=None,
                 logger_destination=LoggerDestination.FILE, logger_mode=LoggerMode.STANDARD,
                 rotate_log_size=None, rotate_log_backups=None):
        """
        required:
          name        = name of the process
          cmdline     = cmdline of the process
          sequence    = the next available sequence number for state updates
          pathspec    = TaskPath object for synthesizing path names
          sandbox_dir = the sandbox in which to run the process
          platform    = Platform providing fork, clock, getpid

        optional:
          user               = the user to run as (if unspecified, will default to current user.)
                               if specified to a user that is not the current user, you must have
                               root access
          logger_destination = The destination for logs output.
          logger_mode        = The type of logger to use for the process.
          rotate_log_size    = The maximum size of the rotated stdout/stderr logs.
          rotate_log_backups = The maximum number of rotated stdout/stderr log backups.

        raises:
          ValueError if platform is None, if the logger destination or mode is
          invalid, or if logger_mode is ROTATE and rotate_log_size /
          rotate_log_backups is missing or not positive.
        """
        self._name = name
        self._cmdline = cmdline
        self._pathspec = pathspec
        self._seq = sequence
        self._sandbox = sandbox_dir
        if self._sandbox:
            safe_mkdir(self._sandbox)
        self._pid = None
        self._fork_time = None
        self._user = user
        self._ckpt = None
        self._ckpt_head = -1
        if platform is None:
            raise ValueError("Platform must be specified")
        self._platform = platform
        self._logger_destination = logger_destination
        self._logger_mode = logger_mode
        self._rotate_log_size = rotate_log_size
        self._rotate_log_backups = rotate_log_backups

        if not LoggerDestination.is_valid(self._logger_destination):
            raise ValueError("Logger destination %s is invalid." % self._logger_destination)

        if not LoggerMode.is_valid(self._logger_mode):
            raise ValueError("Logger mode %s is invalid." % self._logger_mode)

        if self._logger_mode == LoggerMode.ROTATE:
            # Both rotation settings default to None; reject that explicitly so the
            # caller gets a ValueError rather than an AttributeError/TypeError.
            if self._rotate_log_size is None or self._rotate_log_size.as_(Data.BYTES) <= 0:
                raise ValueError('Log size cannot be less than one byte.')
            if self._rotate_log_backups is None or self._rotate_log_backups <= 0:
                raise ValueError('Log backups cannot be less than one.')

    def _log(self, msg, exc_info=None):
        """Emit a debug log line prefixed with the coordinator pid and process name."""
        log.debug('[process:%5s=%s]: %s' % (self._pid, self.name(), msg), exc_info=exc_info)

    def _getpwuid(self):
        """Returns a tuple of the user (i.e. --user) and current user.

        Both elements are ``pwd`` entries. When no user was configured the
        current user's entry is returned for both.

        Raises UnknownUserError if either lookup fails with KeyError.
        """
        uid = os.getuid()
        try:
            current_user = pwd.getpwuid(uid)
        except KeyError:
            raise self.UnknownUserError('Unknown uid %s!' % uid)
        try:
            user = pwd.getpwnam(self._user) if self._user is not None else current_user
        except KeyError:
            raise self.UnknownUserError('Unable to get pwent information!')
        return user, current_user

    def _ckpt_write(self, msg):
        """Write ``msg`` to the checkpoint stream, opening the stream first if needed."""
        self._init_ckpt_if_necessary()
        self._log("child state transition [%s] <= %s" % (self.ckpt_file(), msg))
        self._ckpt.write(msg)

    def _write_process_update(self, **kw):
        """Write a process update to the coordinator's checkpoint stream.

        ``kw`` is passed to ProcessStatus; ``seq`` and ``process`` are filled in
        here, and the sequence number is advanced after the write.
        """
        process_status = ProcessStatus(**kw)
        process_status.seq = self._seq
        process_status.process = self.name()
        self._ckpt_write(RunnerCkpt(process_status=process_status))
        self._seq += 1

    def _write_initial_update(self):
        """Write the FORKED record carrying the fork time and coordinator pid."""
        self._write_process_update(state=ProcessState.FORKED,
                                   fork_time=self._fork_time,
                                   coordinator_pid=self._pid)

    def cmdline(self):
        """Return the command line given at construction."""
        return self._cmdline

    def name(self):
        """Return the process name given at construction."""
        return self._name

    def pid(self):
        """pid of the coordinator"""
        return self._pid

    def rebind(self, pid, fork_time):
        """rebind Process to an existing coordinator pid without forking"""
        self._pid = pid
        self._fork_time = fork_time

    def ckpt_file(self):
        """Return the pathspec's 'process_checkpoint' path."""
        return self._pathspec.getpath('process_checkpoint')

    def process_logdir(self):
        """Return the pathspec's 'process_logdir' path."""
        return self._pathspec.getpath('process_logdir')

    def _setup_ckpt(self):
        """Set up the checkpoint: must be run on the parent.

        Locks the checkpoint file, records its current size in ``_ckpt_head``
        and stores a sync-enabled ThriftRecordWriter in ``_ckpt``.

        Raises CheckpointError if lock_file returns None or False.
        """
        self._log('initializing checkpoint file: %s' % self.ckpt_file())
        ckpt_fp = lock_file(self.ckpt_file(), "a+")
        if ckpt_fp in (None, False):
            raise self.CheckpointError('Could not acquire checkpoint permission or lock for %s!' %
                                       self.ckpt_file())
        self._ckpt_head = os.path.getsize(self.ckpt_file())
        ckpt_fp.seek(self._ckpt_head)
        self._ckpt = ThriftRecordWriter(ckpt_fp)
        self._ckpt.set_sync(True)

    def _init_ckpt_if_necessary(self):
        """Open the checkpoint stream unless one is already held."""
        if self._ckpt is None:
            self._setup_ckpt()

    def _wait_for_control(self):
        """Wait for control of the checkpoint stream: must be run in the child.

        Polls the checkpoint file from ``_ckpt_head`` until a record can be
        read. The record must be this process's FORKED update with matching
        fork time and coordinator pid; on success ``_seq`` is advanced past it
        and True is returned.

        Raises CheckpointError if the record has no process status, if it does
        not match this coordinator, or if MAXIMUM_CONTROL_WAIT elapses first.
        """
        total_wait_time = Amount(0, Time.SECONDS)

        with open(self.ckpt_file(), 'r') as fp:
            fp.seek(self._ckpt_head)
            rr = ThriftRecordReader(fp, RunnerCkpt)
            while total_wait_time < self.MAXIMUM_CONTROL_WAIT:
                checkpoint = None
                # Only attempt a read once the file has grown past the head.
                if os.path.getsize(self.ckpt_file()) != self._ckpt_head:
                    checkpoint = rr.try_read()
                if not checkpoint:
                    # Nothing readable yet (file unchanged, or try_read produced no
                    # record). Sleep and charge the interval against the timeout in
                    # both cases so this loop can neither spin nor run forever.
                    self._platform.clock().sleep(
                        self.CONTROL_WAIT_CHECK_INTERVAL.as_(Time.SECONDS))
                    total_wait_time += self.CONTROL_WAIT_CHECK_INTERVAL
                    continue

                if not checkpoint.process_status:
                    raise self.CheckpointError('No process status in checkpoint!')
                status = checkpoint.process_status
                if (status.process != self.name() or
                        status.state != ProcessState.FORKED or
                        status.fork_time != self._fork_time or
                        status.coordinator_pid != self._pid):
                    self._log('Losing control of the checkpoint stream:')
                    self._log(' fork_time [%s] vs self._fork_time [%s]' % (
                        status.fork_time, self._fork_time))
                    self._log(' coordinator_pid [%s] vs self._pid [%s]' % (
                        status.coordinator_pid, self._pid))
                    raise self.CheckpointError('Lost control of the checkpoint stream!')
                self._log('Taking control of the checkpoint stream at record: %s' % status)
                self._seq = status.seq + 1
                return True

        raise self.CheckpointError('Timed out waiting for checkpoint stream!')

    def _prepare_fork(self):
        """Validate the target user, lock the checkpoint and prepare the log dir.

        Raises UnknownUserError (via _getpwuid), PermissionError when a
        different user is requested without an effective uid of 0, and
        CheckpointError (via _setup_ckpt).
        """
        user, current_user = self._getpwuid()
        if self._user:
            if user != current_user and os.geteuid() != 0:
                raise self.PermissionError('Must be root to run processes as other users!')
        self._fork_time = self._platform.clock().time()
        self._setup_ckpt()
        # Since the forked process is responsible for creating log files, it needs to own
        # the log dir.
        safe_mkdir(self.process_logdir())
        os.chown(self.process_logdir(), user.pw_uid, user.pw_gid)

    def _finalize_fork(self):
        """Parent side of the fork: write the FORKED record and release the stream."""
        self._write_initial_update()
        self._ckpt.close()
        self._ckpt = None

    def start(self):
        """
        This is the main call point from the runner, and forks a co-ordinator process to run
        the target process (i.e. self.cmdline())

        The parent returns immediately and populates information about the pid of the
        co-ordinator. The child (co-ordinator) will launch the target process in a subprocess.
        """
        # calls _setup_ckpt which can raise CheckpointError
        # calls _getpwuid which can raise:
        #    UnknownUserError
        #    PermissionError
        self._prepare_fork()
        self._pid = self._platform.fork()
        if self._pid == 0:
            self._pid = self._platform.getpid()
            self._wait_for_control()  # can raise CheckpointError
            try:
                self.execute()
            except Exception as e:
                self._log('Error trying to execute %s: %s' % (self._name, e))
                # Bare raise keeps the original traceback intact.
                raise
            finally:
                self._ckpt.close()
                self.finish()
        else:
            self._finalize_fork()  # can raise CheckpointError

    def execute(self):
        """Run the target process; must be implemented by subclasses."""
        raise NotImplementedError

    def finish(self):
        """Hook called in the child after execute() returns or raises; no-op here."""
        pass
class ProcessBase(object):
    """Encapsulate a running process for a task.

    ``start`` forks a coordinator through the supplied platform. Parent and
    child hand off through the process checkpoint file: the parent locks the
    file before forking and appends a FORKED record afterwards; the child polls
    the file until it reads that record and then calls ``execute``. Subclasses
    implement ``execute`` and may override ``finish``.
    """

    class Error(Exception):
        """Base class for errors raised by ProcessBase."""

    class UnknownUserError(Error):
        """A uid or user name could not be resolved through ``pwd``."""

    class CheckpointError(Error):
        """The checkpoint stream could not be locked, read or validated."""

    class UnspecifiedSandbox(Error):
        """Declared for callers/subclasses; not raised within this class."""

    class PermissionError(Error):
        """Running as another user was requested without an effective uid of 0."""

    # How long the child sleeps between polls of the checkpoint file, and the
    # total time it is willing to wait for the parent's FORKED record.
    CONTROL_WAIT_CHECK_INTERVAL = Amount(100, Time.MILLISECONDS)
    MAXIMUM_CONTROL_WAIT = Amount(1, Time.MINUTES)

    def __init__(self, name, cmdline, sequence, pathspec, sandbox_dir, user=None, platform=None,
                 logger_destination=LoggerDestination.FILE, logger_mode=LoggerMode.STANDARD,
                 rotate_log_size=None, rotate_log_backups=None):
        """
        required:
          name        = name of the process
          cmdline     = cmdline of the process
          sequence    = the next available sequence number for state updates
          pathspec    = TaskPath object for synthesizing path names
          sandbox_dir = the sandbox in which to run the process
          platform    = Platform providing fork, clock, getpid

        optional:
          user               = the user to run as (if unspecified, will default to current user.)
                               if specified to a user that is not the current user, you must have
                               root access
          logger_destination = The destination for logs output.
          logger_mode        = The type of logger to use for the process.
          rotate_log_size    = The maximum size of the rotated stdout/stderr logs.
          rotate_log_backups = The maximum number of rotated stdout/stderr log backups.

        raises:
          ValueError if platform is None, if the logger destination or mode is
          invalid, or if logger_mode is ROTATE and rotate_log_size /
          rotate_log_backups is missing or not positive.
        """
        self._name = name
        self._cmdline = cmdline
        self._pathspec = pathspec
        self._seq = sequence
        self._sandbox = sandbox_dir
        if self._sandbox:
            safe_mkdir(self._sandbox)
        self._pid = None
        self._fork_time = None
        self._user = user
        self._ckpt = None
        self._ckpt_head = -1
        if platform is None:
            raise ValueError("Platform must be specified")
        self._platform = platform
        self._logger_destination = logger_destination
        self._logger_mode = logger_mode
        self._rotate_log_size = rotate_log_size
        self._rotate_log_backups = rotate_log_backups

        if not LoggerDestination.is_valid(self._logger_destination):
            raise ValueError("Logger destination %s is invalid." % self._logger_destination)

        if not LoggerMode.is_valid(self._logger_mode):
            raise ValueError("Logger mode %s is invalid." % self._logger_mode)

        if self._logger_mode == LoggerMode.ROTATE:
            # Both rotation settings default to None; reject that explicitly so the
            # caller gets a ValueError rather than an AttributeError/TypeError.
            if self._rotate_log_size is None or self._rotate_log_size.as_(Data.BYTES) <= 0:
                raise ValueError('Log size cannot be less than one byte.')
            if self._rotate_log_backups is None or self._rotate_log_backups <= 0:
                raise ValueError('Log backups cannot be less than one.')

    def _log(self, msg, exc_info=None):
        """Emit a debug log line prefixed with the coordinator pid and process name."""
        log.debug('[process:%5s=%s]: %s' % (self._pid, self.name(), msg), exc_info=exc_info)

    def _getpwuid(self):
        """Returns a tuple of the user (i.e. --user) and current user.

        Both elements are ``pwd`` entries. When no user was configured the
        current user's entry is returned for both.

        Raises UnknownUserError if either lookup fails with KeyError.
        """
        uid = os.getuid()
        try:
            current_user = pwd.getpwuid(uid)
        except KeyError:
            raise self.UnknownUserError('Unknown uid %s!' % uid)
        try:
            user = pwd.getpwnam(self._user) if self._user is not None else current_user
        except KeyError:
            raise self.UnknownUserError('Unable to get pwent information!')
        return user, current_user

    def _ckpt_write(self, msg):
        """Write ``msg`` to the checkpoint stream, opening the stream first if needed."""
        self._init_ckpt_if_necessary()
        self._log("child state transition [%s] <= %s" % (self.ckpt_file(), msg))
        self._ckpt.write(msg)

    def _write_process_update(self, **kw):
        """Write a process update to the coordinator's checkpoint stream.

        ``kw`` is passed to ProcessStatus; ``seq`` and ``process`` are filled in
        here, and the sequence number is advanced after the write.
        """
        process_status = ProcessStatus(**kw)
        process_status.seq = self._seq
        process_status.process = self.name()
        self._ckpt_write(RunnerCkpt(process_status=process_status))
        self._seq += 1

    def _write_initial_update(self):
        """Write the FORKED record carrying the fork time and coordinator pid."""
        self._write_process_update(state=ProcessState.FORKED,
                                   fork_time=self._fork_time,
                                   coordinator_pid=self._pid)

    def cmdline(self):
        """Return the command line given at construction."""
        return self._cmdline

    def name(self):
        """Return the process name given at construction."""
        return self._name

    def pid(self):
        """pid of the coordinator"""
        return self._pid

    def rebind(self, pid, fork_time):
        """rebind Process to an existing coordinator pid without forking"""
        self._pid = pid
        self._fork_time = fork_time

    def ckpt_file(self):
        """Return the pathspec's 'process_checkpoint' path."""
        return self._pathspec.getpath('process_checkpoint')

    def process_logdir(self):
        """Return the pathspec's 'process_logdir' path."""
        return self._pathspec.getpath('process_logdir')

    def _setup_ckpt(self):
        """Set up the checkpoint: must be run on the parent.

        Locks the checkpoint file, records its current size in ``_ckpt_head``
        and stores a sync-enabled ThriftRecordWriter in ``_ckpt``.

        Raises CheckpointError if lock_file returns None or False.
        """
        self._log('initializing checkpoint file: %s' % self.ckpt_file())
        ckpt_fp = lock_file(self.ckpt_file(), "a+")
        if ckpt_fp in (None, False):
            raise self.CheckpointError('Could not acquire checkpoint permission or lock for %s!' %
                                       self.ckpt_file())
        self._ckpt_head = os.path.getsize(self.ckpt_file())
        ckpt_fp.seek(self._ckpt_head)
        self._ckpt = ThriftRecordWriter(ckpt_fp)
        self._ckpt.set_sync(True)

    def _init_ckpt_if_necessary(self):
        """Open the checkpoint stream unless one is already held."""
        if self._ckpt is None:
            self._setup_ckpt()

    def _wait_for_control(self):
        """Wait for control of the checkpoint stream: must be run in the child.

        Polls the checkpoint file from ``_ckpt_head`` until a record can be
        read. The record must be this process's FORKED update with matching
        fork time and coordinator pid; on success ``_seq`` is advanced past it
        and True is returned.

        Raises CheckpointError if the record has no process status, if it does
        not match this coordinator, or if MAXIMUM_CONTROL_WAIT elapses first.
        """
        total_wait_time = Amount(0, Time.SECONDS)

        with open(self.ckpt_file(), 'r') as fp:
            fp.seek(self._ckpt_head)
            rr = ThriftRecordReader(fp, RunnerCkpt)
            while total_wait_time < self.MAXIMUM_CONTROL_WAIT:
                checkpoint = None
                # Only attempt a read once the file has grown past the head.
                if os.path.getsize(self.ckpt_file()) != self._ckpt_head:
                    checkpoint = rr.try_read()
                if not checkpoint:
                    # Nothing readable yet (file unchanged, or try_read produced no
                    # record). Sleep and charge the interval against the timeout in
                    # both cases so this loop can neither spin nor run forever.
                    self._platform.clock().sleep(
                        self.CONTROL_WAIT_CHECK_INTERVAL.as_(Time.SECONDS))
                    total_wait_time += self.CONTROL_WAIT_CHECK_INTERVAL
                    continue

                if not checkpoint.process_status:
                    raise self.CheckpointError('No process status in checkpoint!')
                status = checkpoint.process_status
                if (status.process != self.name() or
                        status.state != ProcessState.FORKED or
                        status.fork_time != self._fork_time or
                        status.coordinator_pid != self._pid):
                    self._log('Losing control of the checkpoint stream:')
                    self._log(' fork_time [%s] vs self._fork_time [%s]' % (
                        status.fork_time, self._fork_time))
                    self._log(' coordinator_pid [%s] vs self._pid [%s]' % (
                        status.coordinator_pid, self._pid))
                    raise self.CheckpointError('Lost control of the checkpoint stream!')
                self._log('Taking control of the checkpoint stream at record: %s' % status)
                self._seq = status.seq + 1
                return True

        raise self.CheckpointError('Timed out waiting for checkpoint stream!')

    def _prepare_fork(self):
        """Validate the target user, lock the checkpoint and prepare the log dir.

        Raises UnknownUserError (via _getpwuid), PermissionError when a
        different user is requested without an effective uid of 0, and
        CheckpointError (via _setup_ckpt).
        """
        user, current_user = self._getpwuid()
        if self._user:
            if user != current_user and os.geteuid() != 0:
                raise self.PermissionError('Must be root to run processes as other users!')
        self._fork_time = self._platform.clock().time()
        self._setup_ckpt()
        # Since the forked process is responsible for creating log files, it needs to own
        # the log dir.
        safe_mkdir(self.process_logdir())
        os.chown(self.process_logdir(), user.pw_uid, user.pw_gid)

    def _finalize_fork(self):
        """Parent side of the fork: write the FORKED record and release the stream."""
        self._write_initial_update()
        self._ckpt.close()
        self._ckpt = None

    def start(self):
        """
        This is the main call point from the runner, and forks a co-ordinator process to run
        the target process (i.e. self.cmdline())

        The parent returns immediately and populates information about the pid of the
        co-ordinator. The child (co-ordinator) will launch the target process in a subprocess.
        """
        # calls _setup_ckpt which can raise CheckpointError
        # calls _getpwuid which can raise:
        #    UnknownUserError
        #    PermissionError
        self._prepare_fork()
        self._pid = self._platform.fork()
        if self._pid == 0:
            self._pid = self._platform.getpid()
            self._wait_for_control()  # can raise CheckpointError
            try:
                self.execute()
            except Exception as e:
                self._log('Error trying to execute %s: %s' % (self._name, e))
                # Bare raise keeps the original traceback intact.
                raise
            finally:
                self._ckpt.close()
                self.finish()
        else:
            self._finalize_fork()  # can raise CheckpointError

    def execute(self):
        """Run the target process; must be implemented by subclasses."""
        raise NotImplementedError

    def finish(self):
        """Hook called in the child after execute() returns or raises; no-op here."""
        pass