def __init__(self,
             batch_size,
             restart_threshold,
             watch_secs,
             max_per_shard_failures,
             max_total_failures,
             rollback_on_failure=True,
             wait_for_batch_completion=False,
             pulse_interval_secs=None):
  if batch_size <= 0:
    raise ValueError("Batch size should be greater than 0")
  if watch_secs <= 0:
    raise ValueError("Watch seconds should be greater than 0")
  if pulse_interval_secs is not None and pulse_interval_secs < self.MIN_PULSE_INTERVAL_SECONDS:
    raise ValueError("Pulse interval seconds must be at least %s seconds."
                     % self.MIN_PULSE_INTERVAL_SECONDS)
  if restart_threshold:
    log.warn("restart_threshold has been deprecated and will be removed in a future release")
  self.batch_size = batch_size
  self.watch_secs = watch_secs
  self.max_total_failures = max_total_failures
  self.max_per_instance_failures = max_per_shard_failures
  self.rollback_on_failure = rollback_on_failure
  self.wait_for_batch_completion = wait_for_batch_completion
  self.pulse_interval_secs = pulse_interval_secs
def _maybe_scrubbed_env(cls):
  for env_var in cls._SCRUBBED_ENV:
    value = os.getenv(env_var)
    if value:
      log.warn('Scrubbing {env_var}={value}'.format(env_var=env_var, value=value))
  with environment_as(**cls._SCRUBBED_ENV):
    yield
def initialize(options):
  cwd_path = os.path.abspath(CWD)
  checkpoint_root = os.path.join(cwd_path, MesosPathDetector.DEFAULT_SANDBOX_PATH)

  # Status providers:
  status_providers = [
      HealthCheckerProvider(),
      ResourceManagerProvider(checkpoint_root=checkpoint_root)
  ]

  if options.announcer_enable:
    log.warn('Please remove the deprecated and no-op --announcer-enable flag in scheduler config!')

  if options.announcer_ensemble is not None:
    status_providers.append(DefaultAnnouncerCheckerProvider(
        options.announcer_ensemble,
        options.announcer_serverset_path,
        options.announcer_allow_custom_serverset_path,
        options.announcer_hostname,
        make_zk_auth(options.announcer_zookeeper_auth_config)
    ))

  # Create the executor stub.
  if options.execute_as_user or options.nosetuid:
    # If nosetuid is set, execute_as_user is also None.
    thermos_runner_provider = UserOverrideThermosTaskRunnerProvider(
        dump_runner_pex(),
        checkpoint_root,
        artifact_dir=cwd_path,
        process_logger_destination=options.runner_logger_destination,
        process_logger_mode=options.runner_logger_mode,
        rotate_log_size_mb=options.runner_rotate_log_size_mb,
        rotate_log_backups=options.runner_rotate_log_backups,
        preserve_env=options.preserve_env
    )
    thermos_runner_provider.set_role(None)

    thermos_executor = AuroraExecutor(
        runner_provider=thermos_runner_provider,
        status_providers=status_providers,
        sandbox_provider=UserOverrideDirectorySandboxProvider(options.execute_as_user)
    )
  else:
    thermos_runner_provider = DefaultThermosTaskRunnerProvider(
        dump_runner_pex(),
        checkpoint_root,
        artifact_dir=cwd_path,
        process_logger_destination=options.runner_logger_destination,
        process_logger_mode=options.runner_logger_mode,
        rotate_log_size_mb=options.runner_rotate_log_size_mb,
        rotate_log_backups=options.runner_rotate_log_backups,
        preserve_env=options.preserve_env
    )

    thermos_executor = AuroraExecutor(
        runner_provider=thermos_runner_provider,
        status_providers=status_providers
    )

  return thermos_executor
def run(self):
  # Re-run the election in a loop periodically until a master can be elected or the elector is
  # aborted.
  while not self._aborted.is_set() and not self._completed.wait(self._query_interval):
    if datetime.utcnow() < self._election_deadline:
      self._elect(timedout=False)
    else:
      log.info("Timed out waiting for all slaves to respond. Now elect from existing responses")
      self._elect(timedout=True)
      if not self._completed.is_set():
        log.warn("No slave is electable after timeout")

  if self._aborted.is_set():  # If asked to stop, directly return without triggering the callback.
    log.info("Asked to stop the elector thread for cluster %s. Stopping..." % self._cluster_name)
    return

  self._master_callback(self._master)  # Invoke the callback from the elector thread.
  log.info("Stopping the elector thread for cluster %s because the election has completed"
           % self._cluster_name)
def update(self, instances=None):
  """Performs the job update, blocking until it completes.

  A rollback will be performed if the update was considered a failure based on the
  update configuration.

  Arguments:
  instances -- (optional) instances to update. If not specified, all instances will be updated.

  Returns a response object with update result status.
  """
  resp = self._start()
  if resp.responseCode != ResponseCode.OK:
    return resp

  try:
    # Handle cron jobs separately from other jobs.
    if self._replace_template_if_cron():
      log.info('Cron template updated, next run will reflect changes')
      return self._finish()
    else:
      if not self._update(instances):
        log.warn('Update failures threshold reached')
        self._finish()
        return self._failed_response('Update reverted')
      else:
        log.info('Update successful')
        return self._finish()
  except self.Error as e:
    return self._failed_response('Aborting update without rollback! Fatal error: %s' % e)
def wait_for_accept(cls, port, tunnel_popen, timeout):
  total_time = Amount(0, Time.SECONDS)
  sleep = cls.MIN_RETRY
  warned = False  # Did we log a warning that shows we're waiting for the tunnel?
  while total_time < timeout and tunnel_popen.returncode is None:
    try:
      accepted_socket = socket.create_connection(('localhost', port), timeout=5.0)
      accepted_socket.close()
      return True
    except socket.error:
      total_time += sleep
      time.sleep(sleep.as_(Time.SECONDS))
      # Increase sleep exponentially until MAX_INTERVAL is reached.
      sleep = min(sleep * 2, cls.MAX_INTERVAL)
      if total_time > cls.WARN_THRESHOLD and not warned:
        log.warn('Still waiting for tunnel to be established after %s (timeout is %s)' % (
            total_time, cls.DEFAULT_TIMEOUT))
        warned = True
      tunnel_popen.poll()  # Needed to update tunnel_popen.returncode.
  if tunnel_popen.returncode is not None:
    cls.log('SSH returned prematurely with code %s' % str(tunnel_popen.returncode))
  else:
    cls.log('timed out initializing tunnel')
  return False
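# A minimal, self-contained sketch of the capped exponential backoff used by
# wait_for_accept() above, written with plain floats instead of the Amount/Time
# helpers. The function name, constants, and defaults here are illustrative
# assumptions, not part of the original code.
import socket
import time


def wait_for_port(port, timeout=30.0, min_retry=0.1, max_interval=2.0):
  """Poll localhost:port until it accepts a connection or `timeout` elapses."""
  total_time = 0.0
  sleep = min_retry
  while total_time < timeout:
    try:
      socket.create_connection(('localhost', port), timeout=5.0).close()
      return True
    except socket.error:
      time.sleep(sleep)
      total_time += sleep
      sleep = min(sleep * 2, max_interval)  # Double the delay, capped at max_interval.
  return False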
def _run_task(self, task):
  assert self._runner, "_runner should be created before this method is called"

  try:
    self._runner.start()
    log.info("Task runner for task %s started" % task.task_id)
    self._send_update(task.task_id.value, mesos_pb2.TASK_RUNNING)
  except TaskError as e:
    log.error("Task runner for task %s failed to start: %s" % (task.task_id, str(e)))
    # Send TASK_FAILED if the task failed to start.
    self._send_update(task.task_id.value, mesos_pb2.TASK_FAILED)
  except Exception as e:
    log.error("Error occurred while executing the task: %s" % e)
    log.error(traceback.format_exc())
    # Send TASK_LOST for unknown errors.
    self._send_update(task.task_id.value, mesos_pb2.TASK_LOST)
  else:
    # Wait for the task's return code (when it terminates).
    try:
      returncode = self._runner.join()
      # If '_runner' terminates, it has either failed or been killed.
      log.warn("Task process terminated with return code %s" % returncode)
    except TaskError as e:
      log.error("Task terminated: %s" % e)
    finally:
      if self._killed:
        self._send_update(task.task_id.value, mesos_pb2.TASK_KILLED)
      else:
        self._send_update(task.task_id.value, mesos_pb2.TASK_FAILED)
      self._terminated.set()
  finally:
    # No matter what happens above, when we reach here the executor has no task to run so it
    # should just commit seppuku.
    self._kill()
def start(self, env=None):
  if self._process:
    log.warn("start() called when a running task subprocess already exists")
    return

  command = (
      "%(cmd)s %(framework_user)s %(host)s %(port)s %(server_id)s %(data_dir)s %(log_dir)s "
      "%(tmp_dir)s %(conf_file)s %(buffer_pool_size)s" % dict(
          cmd=os.path.join(self._scripts_dir, "mysos_launch_mysqld.sh"),
          framework_user=self._framework_user,
          host=self._host,
          port=self._port,
          server_id=self._server_id,
          data_dir=self._sandbox.mysql_data_dir,
          log_dir=self._sandbox.mysql_log_dir,
          tmp_dir=self._sandbox.mysql_tmp_dir,
          conf_file=self._conf_file,
          buffer_pool_size=self._buffer_pool_size))
  log.info("Executing command: %s" % command)
  self._process = subprocess.Popen(command, shell=True, env=env, preexec_fn=os.setpgrp)

  # There is a delay before mysqld becomes available to accept requests. Wait for it.
  command = "%(cmd)s %(pid_file)s %(port)s %(timeout)s" % dict(
      cmd=os.path.join(self._scripts_dir, "mysos_wait_for_mysqld.sh"),
      pid_file=os.path.join(self._sandbox.mysql_log_dir, "mysqld.pid"),
      port=self._port,
      timeout=60)
  log.info("Executing command: %s" % command)
  subprocess.check_call(command, shell=True, env=env)

  return self._process
def __check_int(item):
  if item is not None:
    try:
      item = int(item)
    except ValueError:
      log.warn('Failed to deserialize value %r' % item)
      item = None
  return item
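# A brief usage sketch of the defensive int-coercion guard above (the call
# results illustrate the behavior; they are not taken from the original):
#
#   __check_int('42')    # -> 42
#   __check_int('oops')  # -> None, after logging a warning
#   __check_int(None)    # -> None, without logging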
def _request_agent_containers(self):
  try:
    resp = requests.get(self._url, timeout=self._request_timeout)
    resp.raise_for_status()
    return resp.json()
  except requests.exceptions.RequestException as ex:
    log.warn("MesosDiskCollector: Unexpected error talking to agent api: %s", ex)
    return []
def iterate(self):
  with self._lock:
    try:
      with open(self._filename, 'r') as fp:
        self._sample = json.load(fp)
    except (IOError, OSError, ValueError) as e:
      if log:
        log.warn('Failed to collect sample: %s' % e)
def add_to_queue(self, queue, item, label):
  """ Queue items sent to us by the sniffer. """
  count = len(queue)
  if count > queue.maxlength():  # pragma: no cover
    log.warn("Too many %s queued (%d)", label, count)
    return
  queue.appendleft(item)
def run(self, lock):
  if self.options.dry_run:
    print("****** Dry Run ******")

  logger = None
  if self.options.log or self.options.log_level:
    from twitter.common.log import init
    from twitter.common.log.options import LogOptions
    LogOptions.set_stderr_log_level((self.options.log_level or "info").upper())
    logdir = self.options.logdir or self.config.get("goals", "logdir", default=None)
    if logdir:
      safe_mkdir(logdir)
      LogOptions.set_log_dir(logdir)
      init("goals")
    else:
      init()
    logger = log

  if self.options.recursive_directory:
    log.warn("--all-recursive is deprecated, use a target spec with the form [dir]:: instead")
    for dir in self.options.recursive_directory:
      self.add_target_recursive(dir)

  if self.options.target_directory:
    log.warn("--all is deprecated, use a target spec with the form [dir]: instead")
    for dir in self.options.target_directory:
      self.add_target_directory(dir)

  context = Context(
      self.config,
      self.options,
      self.targets,
      requested_goals=self.requested_goals,
      lock=lock,
      log=logger,
      timer=self.timer if self.options.time else None)

  unknown = []
  for phase in self.phases:
    if not phase.goals():
      unknown.append(phase)

  if unknown:
    print("Unknown goal(s): %s" % " ".join(phase.name for phase in unknown))
    print("")
    return Phase.execute(context, "goals")

  if logger:
    logger.debug("Operating on targets: %s", self.targets)

  ret = Phase.attempt(context, self.phases)
  if self.options.time:
    print("Timing report")
    print("=============")
    self.timer.print_timings()
  return ret
def initialize(options):
  cwd_path = os.path.abspath(CWD)
  checkpoint_root = os.path.join(cwd_path, MesosPathDetector.DEFAULT_SANDBOX_PATH)

  # Status providers:
  status_providers = [
      HealthCheckerProvider(),
      ResourceManagerProvider(checkpoint_root=checkpoint_root)
  ]

  if options.announcer_enable:
    log.warn('Please remove the deprecated and no-op --announcer-enable flag in scheduler config!')

  if options.announcer_ensemble is not None:
    status_providers.append(DefaultAnnouncerCheckerProvider(
        options.announcer_ensemble,
        options.announcer_serverset_path,
        options.announcer_allow_custom_serverset_path,
        options.announcer_hostname))

  # Create the executor stub.
  if options.execute_as_user or options.nosetuid:
    # If nosetuid is set, execute_as_user is also None.
    thermos_runner_provider = UserOverrideThermosTaskRunnerProvider(
        dump_runner_pex(),
        checkpoint_root,
        artifact_dir=cwd_path,
        process_logger_destination=options.runner_logger_destination,
        process_logger_mode=options.runner_logger_mode,
        rotate_log_size_mb=options.runner_rotate_log_size_mb,
        rotate_log_backups=options.runner_rotate_log_backups,
        preserve_env=options.preserve_env)
    thermos_runner_provider.set_role(None)

    thermos_executor = AuroraExecutor(
        runner_provider=thermos_runner_provider,
        status_providers=status_providers,
        sandbox_provider=UserOverrideDirectorySandboxProvider(options.execute_as_user))
  else:
    thermos_runner_provider = DefaultThermosTaskRunnerProvider(
        dump_runner_pex(),
        checkpoint_root,
        artifact_dir=cwd_path,
        process_logger_destination=options.runner_logger_destination,
        process_logger_mode=options.runner_logger_mode,
        rotate_log_size_mb=options.runner_rotate_log_size_mb,
        rotate_log_backups=options.runner_rotate_log_backups,
        preserve_env=options.preserve_env)

    thermos_executor = AuroraExecutor(
        runner_provider=thermos_runner_provider,
        status_providers=status_providers)

  return thermos_executor
def _maybe_scrubbed_classpath(self):
  if self._scrub_classpath:
    classpath = os.getenv('CLASSPATH')
    if classpath:
      log.warn('Scrubbing CLASSPATH=%s' % classpath)
    with environment_as(CLASSPATH=None):
      yield
  else:
    yield
def run(self, lock):
  with self.check_errors("Target contains a dependency cycle") as error:
    for target in self.targets:
      try:
        InternalTarget.check_cycles(target)
      except InternalTarget.CycleException as e:
        error(target.id)

  timer = None
  if self.options.time:
    class Timer(object):
      def now(self):
        return time.time()

      def log(self, message):
        print(message)
    timer = Timer()

  logger = None
  if self.options.log or self.options.log_level:
    from twitter.common.log import init
    from twitter.common.log.options import LogOptions
    LogOptions.set_stderr_log_level((self.options.log_level or 'info').upper())
    logdir = self.options.logdir or self.config.get('goals', 'logdir', default=None)
    if logdir:
      safe_mkdir(logdir)
      LogOptions.set_log_dir(logdir)
      init('goals')
    else:
      init()
    logger = log

  if self.options.recursive_directory:
    log.warn('--all-recursive is deprecated, use a target spec with the form [dir]:: instead')
    for dir in self.options.recursive_directory:
      self.add_target_recursive(dir)

  if self.options.target_directory:
    log.warn('--all is deprecated, use a target spec with the form [dir]: instead')
    for dir in self.options.target_directory:
      self.add_target_directory(dir)

  context = Context(self.config, self.options, self.targets, lock=lock, log=logger)

  unknown = []
  for phase in self.phases:
    if not phase.goals():
      unknown.append(phase)

  if unknown:
    print('Unknown goal(s): %s' % ' '.join(phase.name for phase in unknown))
    print('')
    return Phase.execute(context, 'goals')

  if logger:
    logger.debug('Operating on targets: %s', self.targets)

  return Phase.attempt(context, self.phases, timer=timer)
def cpu_affinity(self):
  """
  Get the CPU affinity of this process

  :return: a list() of CPU cores this process is pinned to
  """
  try:
    return self.process.cpu_affinity()
  except AttributeError:
    log.warn('cpu affinity is not available on your platform')
def get_cpu_affinity(self):
  """
  Get the CPU affinity of this process

  :return: a list() of CPU cores this process is pinned to
  """
  try:
    return self.process.cpu_affinity()
  except AttributeError:
    log.warn('cpu affinity is not available on your platform')
def getpage(self, wiki_space, page_title):
  """ Fetches a page object.

  Returns None if the page does not exist or otherwise could not be fetched.
  """
  try:
    return self._server.confluence1.getPage(self._session_token, wiki_space, page_title)
  except Fault as e:
    log.warn('Failed to fetch page %s: %s' % (page_title, e))
    return None
def getpage(self, wiki_space, page_title):
  """ Fetches a page object.

  Returns None if the page does not exist or otherwise could not be fetched.
  """
  try:
    return self._api_entrypoint.getPage(self._session_token, wiki_space, page_title)
  except XMLRPCError as e:
    log.warn('Failed to fetch page %s: %s' % (page_title, e))
    return None
def add_to_queue(self, queue, item, label):
  """ Queue items sent to us by the sniffer. """
  with self._cv:
    count = len(queue)
    if count > queue.maxlength():
      log.warn("Too many %s queued (%d)", label, count)
      return
    queue.appendleft(item)
    self._cv.notify()
def run(self, lock):
  with self.check_errors("Target contains a dependency cycle") as error:
    with self.timer.timing("parse:check_cycles"):
      for target in self.targets:
        try:
          InternalTarget.check_cycles(target)
        except InternalTarget.CycleException as e:
          error(target.id)

  logger = None
  if self.options.log or self.options.log_level:
    from twitter.common.log import init
    from twitter.common.log.options import LogOptions
    LogOptions.set_stderr_log_level((self.options.log_level or "info").upper())
    logdir = self.options.logdir or self.config.get("goals", "logdir", default=None)
    if logdir:
      safe_mkdir(logdir)
      LogOptions.set_log_dir(logdir)
      init("goals")
    else:
      init()
    logger = log

  if self.options.recursive_directory:
    log.warn("--all-recursive is deprecated, use a target spec with the form [dir]:: instead")
    for dir in self.options.recursive_directory:
      self.add_target_recursive(dir)

  if self.options.target_directory:
    log.warn("--all is deprecated, use a target spec with the form [dir]: instead")
    for dir in self.options.target_directory:
      self.add_target_directory(dir)

  context = Context(self.config, self.options, self.targets, lock=lock, log=logger)

  unknown = []
  for phase in self.phases:
    if not phase.goals():
      unknown.append(phase)

  if unknown:
    print("Unknown goal(s): %s" % " ".join(phase.name for phase in unknown))
    print("")
    return Phase.execute(context, "goals")

  if logger:
    logger.debug("Operating on targets: %s", self.targets)

  ret = Phase.attempt(context, self.phases, timer=self.timer if self.options.time else None)
  if self.options.time:
    print("Timing report")
    print("=============")
    self.timer.print_timings()
  return ret
def run(self, lock):
  if self.options.dry_run:
    print('****** Dry Run ******')

  logger = None
  if self.options.log or self.options.log_level:
    from twitter.common.log import init
    from twitter.common.log.options import LogOptions
    LogOptions.set_stderr_log_level((self.options.log_level or 'info').upper())
    logdir = self.options.logdir or self.config.get('goals', 'logdir', default=None)
    if logdir:
      safe_mkdir(logdir)
      LogOptions.set_log_dir(logdir)
      init('goals')
    else:
      init()
    logger = log

  if self.options.recursive_directory:
    log.warn('--all-recursive is deprecated, use a target spec with the form [dir]:: instead')
    for dir in self.options.recursive_directory:
      self.add_target_recursive(dir)

  if self.options.target_directory:
    log.warn('--all is deprecated, use a target spec with the form [dir]: instead')
    for dir in self.options.target_directory:
      self.add_target_directory(dir)

  context = Context(
      self.config,
      self.options,
      self.targets,
      lock=lock,
      log=logger,
      timer=self.timer if self.options.time else None)

  unknown = []
  for phase in self.phases:
    if not phase.goals():
      unknown.append(phase)

  if unknown:
    print('Unknown goal(s): %s' % ' '.join(phase.name for phase in unknown))
    print('')
    return Phase.execute(context, 'goals')

  if logger:
    logger.debug('Operating on targets: %s', self.targets)

  ret = Phase.attempt(context, self.phases)
  if self.options.time:
    print('Timing report')
    print('=============')
    self.timer.print_timings()
  return ret
def run(self, lock):
  with self.check_errors("Target contains a dependency cycle") as error:
    with self.timer.timing('parse:check_cycles'):
      for target in self.targets:
        try:
          InternalTarget.check_cycles(target)
        except InternalTarget.CycleException as e:
          error(target.id)

  logger = None
  if self.options.log or self.options.log_level:
    from twitter.common.log import init
    from twitter.common.log.options import LogOptions
    LogOptions.set_stderr_log_level((self.options.log_level or 'info').upper())
    logdir = self.options.logdir or self.config.get('goals', 'logdir', default=None)
    if logdir:
      safe_mkdir(logdir)
      LogOptions.set_log_dir(logdir)
      init('goals')
    else:
      init()
    logger = log

  if self.options.recursive_directory:
    log.warn('--all-recursive is deprecated, use a target spec with the form [dir]:: instead')
    for dir in self.options.recursive_directory:
      self.add_target_recursive(dir)

  if self.options.target_directory:
    log.warn('--all is deprecated, use a target spec with the form [dir]: instead')
    for dir in self.options.target_directory:
      self.add_target_directory(dir)

  context = Context(self.config, self.options, self.targets, lock=lock, log=logger)

  unknown = []
  for phase in self.phases:
    if not phase.goals():
      unknown.append(phase)

  if unknown:
    print('Unknown goal(s): %s' % ' '.join(phase.name for phase in unknown))
    print('')
    return Phase.execute(context, 'goals')

  if logger:
    logger.debug('Operating on targets: %s', self.targets)

  ret = Phase.attempt(context, self.phases, timer=self.timer if self.options.time else None)
  if self.options.time:
    print('Timing report')
    print('=============')
    self.timer.print_timings()
  return ret
def set_niceness(self, nice_level):
  """
  Set the nice level of this process

  :param nice_level: the nice level to set
  """
  try:
    # TODO (phobos182): double check that psutil does not allow negative nice values
    if not 0 <= nice_level <= 20:
      raise ValueError('nice level must be between 0 and 20')
    self.process.nice(nice_level)
  except (EnvironmentError, ValueError, AccessDenied, NoSuchProcess) as e:
    log.warn('unable to set nice level on process: {}'.format(e))
def set_cpu_affinity(self, cpu_affinity_csv):
  """
  Set CPU affinity for this process

  :param cpu_affinity_csv: A comma-separated string representing CPU cores
  """
  try:
    cpu_list = self.parse_cpu_affinity(cpu_affinity_csv)
    self.process.cpu_affinity(cpu_list)
  except (OSError, ValueError) as e:
    log.warn('unable to set cpu affinity: {}, on process: {}'.format(cpu_affinity_csv, e))
  except AttributeError:
    log.warn('cpu affinity is not available on your platform')
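# The parse_cpu_affinity() helper referenced above is not shown in this
# section. A plausible sketch, assuming the CSV is a plain list of core
# numbers such as "0,1,4" (an illustration, not the original helper):
def parse_cpu_affinity(cpu_affinity_csv):
  """Parse a comma-separated string of CPU cores into a sorted list of ints."""
  # A malformed entry raises ValueError, which set_cpu_affinity() catches.
  return sorted(int(core) for core in cpu_affinity_csv.split(',') if core.strip())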
def callback():
  if hook_method is None:
    return True
  log.debug("Running %s in %s" % (hook_method.__name__, hook.__class__.__name__))
  hook_result = False
  try:
    hook_result = hook_method()
    if not hook_result:
      log.debug("%s in %s returned False" % (hook_method.__name__, hook.__class__.__name__))
  except Exception:
    log.warn("Error in %s in %s" % (hook_method.__name__, hook.__class__.__name__))
    log.warn(traceback.format_exc())
  return hook_result
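# A self-contained sketch of the fail-safe hook invocation pattern above: a
# missing hook vacuously succeeds, and any exception is logged and treated as
# failure rather than propagating. Names are illustrative, and the stdlib
# logging module stands in for twitter.common.log.
import logging
import traceback


def run_hook_safely(hook_method, logger=logging.getLogger(__name__)):
  if hook_method is None:
    return True  # A missing hook vacuously succeeds.
  try:
    return bool(hook_method())
  except Exception:
    logger.warning('Hook %s raised:\n%s', hook_method.__name__, traceback.format_exc())
    return False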
def _stop(self, timeout):
  """
  Stop the runner and wait for its thread (and the sub-processes) to exit.

  :param timeout: The timeout that the process should die before a hard SIGKILL is issued
                  (SIGTERM is used initially).
  :return: True if an active runner is stopped, False if the runner is not started or already
           stopping/stopped.
  """
  with self._lock:
    if not self._started:
      log.warn("Cannot stop the runner because it's not started")
      return False

    if not self._popen:
      log.info("The runner task did not start successfully so no need to kill it")
      return False

    try:
      log.info("Terminating process group: %s" % self._popen.pid)
      os.killpg(self._popen.pid, signal.SIGTERM)
    except OSError as e:
      log.info("The sub-processes are already terminated: %s" % e)
      return False

  log.info("Waiting for process to terminate due to SIGTERM")

  # Escalate to SIGKILL if SIGTERM is not sufficient.
  if not self._exited.wait(timeout=timeout):
    with self._lock:
      try:
        log.warn("Killing process group %s which failed to terminate cleanly within %s secs"
                 % (self._popen.pid, timeout))
        os.killpg(self._popen.pid, signal.SIGKILL)
      except OSError as e:
        log.info("The sub-processes are already terminated: %s" % e)
        return False
  else:
    return True

  log.info("Waiting for process to terminate due to SIGKILL")
  if not self._exited.wait(timeout=timeout):
    raise TaskError("Failed to kill process group %s" % self._popen.pid)

  return True
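# A minimal standalone sketch of the SIGTERM-then-SIGKILL escalation used by
# _stop() above, for a process group started with preexec_fn=os.setpgrp. The
# function name, timeout default, and polling approach are illustrative
# assumptions; the original waits on a threading.Event instead of polling.
import os
import signal
import time


def terminate_process_group(pgid, timeout=10.0, poll_interval=0.1):
  """Send SIGTERM to a process group, escalating to SIGKILL after `timeout`."""
  for sig in (signal.SIGTERM, signal.SIGKILL):
    try:
      os.killpg(pgid, sig)
    except OSError:
      return True  # The group is already gone.
    deadline = time.time() + timeout
    while time.time() < deadline:
      try:
        os.killpg(pgid, 0)  # Signal 0 only probes for existence.
      except OSError:
        return True
      time.sleep(poll_interval)
  return False  # Still alive even after SIGKILL.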
def stop(self, timeout=10):
  with self._lock:
    # stop() could be called by multiple threads. Locking so we only stop the runner once.
    if self._stopping:
      log.warn("The runner is already stopping/stopped")
      return False
    else:
      log.info("Stopping runner")
      self._stopping = True

  try:
    return self._stop(timeout)
  finally:
    self._kazoo.stop()
    log.info("Runner cleaned up")
def _validate_health_check_config(config):
  health_check_config = config.health_check_config().get()
  health_checker = health_check_config.get('health_checker', {})
  # Detect the old style of configuration.
  # TODO (AURORA-1563): Remove this code after we drop support for defining these directly in
  # HealthCheckConfig.
  for deprecated in {'endpoint', 'expected_response', 'expected_response_code'}:
    if deprecated in health_check_config:
      log.warn(HTTP_DEPRECATION_WARNING)
      break
  if SHELL_HEALTH_CHECK in health_checker:
    # Make sure we specified a shell_command if we chose a shell config.
    shell_health_checker = health_checker.get(SHELL_HEALTH_CHECK, {})
    shell_command = shell_health_checker.get('shell_command')
    if not shell_command:
      # Must define a command.
      die(MUST_PROVIDE_SHELL_COMMAND_ERROR)
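# An illustrative sketch of the two configuration shapes this validator
# distinguishes. The dict contents below are assumptions for illustration;
# only the key names checked above come from the code.
#
# Deprecated: HTTP fields defined directly in HealthCheckConfig:
#   {'endpoint': '/health', 'expected_response': 'ok', 'expected_response_code': 200}
#
# Current: a nested health_checker block; a shell checker must provide
# shell_command or the validator dies:
#   {'health_checker': {'shell': {'shell_command': 'exit 0'}}}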
def update(self, instances=None):
  """Performs the job update, blocking until it completes.

  A rollback will be performed if the update was considered a failure based on the
  update configuration.

  Arguments:
  instances -- (optional) instances to update. If not specified, all instances will be updated.

  Returns a response object with update result status.
  """
  try:
    resp = self._start()
    if resp.responseCode != ResponseCode.OK:
      return resp

    try:
      # Handle cron jobs separately from other jobs.
      if self._replace_template_if_cron():
        log.info('Cron template updated, next run will reflect changes')
        return self._finish()
      else:
        try:
          instance_configs = self._get_update_instructions(instances)
          self._check_and_log_response(self._validate_quota(instance_configs))
        except self.Error as e:
          # Safe to release the lock acquired above as no job mutation has happened yet.
          self._finish()
          return self._failed_response('Unable to start job update: %s' % e)

        if not self._update(instance_configs):
          log.warn('Update failures threshold reached')
          self._finish()
          return self._failed_response('Update reverted')
        else:
          log.info('Update successful')
          return self._finish()
    except (self.Error, ExecutionError, Exception) as e:
      return self._failed_response('Aborting update without rollback! Fatal error: %s' % e)
  finally:
    self._scheduler_mux.terminate()
def _await_nailgun_server(self, stdout, stderr):
  nailgun_timeout_seconds = 5
  max_socket_connect_attempts = 10
  nailgun = None
  port_parse_start = time.time()
  with safe_open(self._ng_out, 'r') as ng_out:
    while not nailgun:
      started = ng_out.readline()
      if started.find('Listening for transport dt_socket at address:') >= 0:
        nailgun_timeout_seconds = 60
        log.warn('Timeout extended to {timeout} seconds for debugger to attach to ng server.'
                 .format(timeout=nailgun_timeout_seconds))
        started = ng_out.readline()
      if started:
        port = self._parse_nailgun_port(started)
        nailgun = self._create_ngclient(port, stdout, stderr)
        log.debug('Detected ng server up on port %d' % port)
      elif time.time() - port_parse_start > nailgun_timeout_seconds:
        raise NailgunClient.NailgunError('Failed to read ng output after'
                                         ' %s seconds' % nailgun_timeout_seconds)

  attempt = 0
  while nailgun:
    sock = nailgun.try_connect()
    if sock:
      sock.close()
      endpoint = self._get_nailgun_endpoint()
      if endpoint:
        log.debug('Connected to ng server launched with %s fingerprint %s pid: %d @ port: %d'
                  % endpoint)
      else:
        raise NailgunClient.NailgunError('Failed to connect to ng server.')
      return nailgun
    elif attempt > max_socket_connect_attempts:
      raise nailgun.NailgunError('Failed to connect to ng output after %d connect attempts'
                                 % max_socket_connect_attempts)
    attempt += 1
    log.debug('Failed to connect on attempt %d' % attempt)
    time.sleep(0.1)
def bootstrap(self, task_control, env):
  """
  Bootstraps the executor state.

  :param task_control: Task control for carrying out state bootstrapping commands.
  :param env: The environment variables for task_control methods.
  """
  # 1. Directly return if the data folder is not empty.
  if os.listdir(self._sandbox.mysql_data_dir):
    # TODO(jyx): This will be expected when we use persistent volumes. Validate the state in that
    # case.
    log.warn("MySQL state already exists unexpectedly. Finishing bootstrap without restoration "
             "or initialization")
    return

  # 2. If the data folder is clean, restore state from the backup store. This can be a noop if
  # the user doesn't want to restore any state.
  try:
    backup_info = self._backup_store.restore()
  except BackupStore.Error as e:
    raise self.Error("Failed to restore MySQL state: %s" % e)

  if backup_info:
    # If some backup is restored, persist the backup info.
    log.info("Finished restoring the backup")
    backup_info_file = os.path.join(self._sandbox.var, BACKUP_INFO_FILENAME)
    # Useful for the user to check the result of the backup restore.
    with open(backup_info_file, 'w') as f:
      json.dump(backup_info.__dict__, f)
    log.info("Persisted backup info '%s' to file %s"
             % (backup_info.__dict__, backup_info_file))
  else:
    # If no recovery is necessary, initialize the data dirs.
    log.info("No MySQL backup is restored. Initializing a new MySQL instance")
    try:
      task_control.initialize(env)
    except subprocess.CalledProcessError as e:
      raise self.Error("Unable to initialize MySQL state: %s" % e)
def unpack_json(cls, blob):
  blob = json.loads(blob)
  for key in ('status', 'serviceEndpoint', 'additionalEndpoints'):
    if key not in blob:
      raise ValueError('Expected to find %s in ServiceInstance JSON!' % key)
  additional_endpoints = dict((name, Endpoint(value['host'], value['port']))
                              for name, value in blob['additionalEndpoints'].items())
  shard = blob.get('shard')
  if shard is not None:
    try:
      shard = int(shard)
    except ValueError:
      log.warn('Failed to deserialize shard from value %r' % shard)
      shard = None
  return cls(
      service_endpoint=Endpoint(blob['serviceEndpoint']['host'],
                                blob['serviceEndpoint']['port']),
      additional_endpoints=additional_endpoints,
      status=Status.from_string(blob['status']),
      shard=shard)
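# An example of the JSON shape unpack_json() accepts. The key names come from
# the validation loop above; the values are illustrative:
#
#   {
#     "status": "ALIVE",
#     "serviceEndpoint": {"host": "10.0.0.1", "port": 8080},
#     "additionalEndpoints": {"health": {"host": "10.0.0.1", "port": 8081}},
#     "shard": "3"
#   }
#
# The string shard "3" is coerced to the int 3; an unparseable shard is logged
# and mapped to None rather than failing the whole deserialization.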
def _await_nailgun_server(self, stdout, stderr, debug_desc):
  # TODO(Eric Ayers) Make these cmdline/config parameters once we have a global way to fetch
  # the global options scope.
  nailgun_timeout_seconds = 10
  max_socket_connect_attempts = 5
  nailgun = None
  port_parse_start = time.time()
  with safe_open(self._ng_out, 'r') as ng_out:
    while not nailgun:
      started = ng_out.readline()
      if started.find('Listening for transport dt_socket at address:') >= 0:
        nailgun_timeout_seconds = 60
        log.warn('Timeout extended to {timeout} seconds for debugger to attach to ng server.'
                 .format(timeout=nailgun_timeout_seconds))
        started = ng_out.readline()
      if started:
        port = self._parse_nailgun_port(started)
        nailgun = self._create_ngclient(port, stdout, stderr)
        log.debug('Detected ng server up on port {port}'.format(port=port))
      elif time.time() - port_parse_start > nailgun_timeout_seconds:
        raise NailgunClient.NailgunError(
            'Failed to read ng output after {sec} seconds.\n {desc}'
            .format(sec=nailgun_timeout_seconds, desc=debug_desc))

  attempt = 0
  while nailgun:
    sock = nailgun.try_connect()
    if sock:
      sock.close()
      endpoint = self._get_nailgun_endpoint()
      if endpoint:
        log.debug('Connected to ng server launched with {endpoint}'
                  .format(endpoint=repr(endpoint)))
      else:
        raise NailgunClient.NailgunError('Failed to connect to ng server.')
      return nailgun
    elif attempt > max_socket_connect_attempts:
      raise nailgun.NailgunError('Failed to connect to ng output after {count} connect attempts'
                                 .format(count=max_socket_connect_attempts))
    attempt += 1
    log.debug('Failed to connect on attempt {count}'.format(count=attempt))
    time.sleep(0.1)
def _is_restart_needed(self, failed_instances):
  """Checks if there are any failed instances recoverable via restart.

  Arguments:
  failed_instances -- Failed instance IDs.

  Returns True if restart is allowed, False otherwise (i.e. update failed).
  """
  if not failed_instances:
    return False

  log.info('Failed instances: %s' % failed_instances)

  with self._thread_lock:
    unretryable_instances = self.failure_threshold.update_failure_counts(failed_instances)
    if unretryable_instances:
      log.warn('Not restarting failed instances %s, which exceeded '
               'maximum allowed instance failure limit of %s'
               % (unretryable_instances, self._update_config.max_per_instance_failures))

  return False if unretryable_instances else True
def run(self):
  # Re-run the election in a loop periodically until a master can be elected or the elector is
  # aborted.
  while not self._aborted.is_set() and not self._completed.wait(self._query_interval):
    if datetime.utcnow() < self._election_deadline:
      self._elect(timedout=False)
    else:
      log.info("Timed out waiting for all slaves to respond. Now elect from existing responses")
      self._elect(timedout=True)
      if not self._completed.is_set():
        log.warn("No slave is electable after timeout")

  if self._aborted.is_set():  # If asked to stop, directly return without triggering the callback.
    log.info("Asked to stop the elector thread for cluster %s. Stopping..." % self._cluster_name)
    return

  self._master_callback(self._master)  # Invoke the callback from the elector thread.
  log.info("Stopping the elector thread for cluster %s (epoch %s) because the election has "
           "completed" % (self._cluster_name, self._epoch))
def check_duplicate_conflicting_protos(sources_by_base, sources, log):
  """Checks if proto files are duplicate or conflicting.

  There are sometimes two files with the same name on the .proto path. This causes the protobuf
  compiler to stop with an error. Some repos have legitimate cases for this, and so this task
  decides to just choose one to keep the entire build from failing. Sometimes the files are
  identical copies. That is harmless, but if there are two files with the same name but
  different contents, that is ambiguous and we want to complain loudly.

  :param dict sources_by_base: mapping of base to path
  :param list sources: list of sources
  :param Context.Log log: writes error messages to the console for conflicts
  """
  sources_by_genfile = {}
  for base in sources_by_base.keys():  # Need to iterate over /original/ bases.
    for path in sources_by_base[base]:
      if path not in sources:
        continue  # Check to make sure we haven't already removed it.
      source = path[len(base):]
      genfiles = calculate_genfiles(path, source)
      for key in genfiles.keys():
        for genfile in genfiles[key]:
          if genfile in sources_by_genfile:
            # Possible conflict!
            prev = sources_by_genfile[genfile]
            if prev not in sources:
              # Must have been culled by an earlier pass.
              continue
            if not _same_contents(path, prev):
              log.error('Proto conflict detected (.proto files are different):\n'
                        '1: {prev}\n2: {curr}'.format(prev=prev, curr=path))
            else:
              log.warn('Proto duplication detected (.proto files are identical):\n'
                       '1: {prev}\n2: {curr}'.format(prev=prev, curr=path))
            log.warn('  Arbitrarily favoring proto 1.')
            if path in sources:
              sources.remove(path)  # Favor the first version.
            continue
          sources_by_genfile[genfile] = path