def on_finalizing(self, task_update):
  log.debug('Task on_finalizing(%s)', task_update)
  if not self._runner._recovery:
    self._runner._kill()
  self._runner._plan = self._runner._finalizing_plan
  if self._runner._finalization_start is None:
    self._runner._finalization_start = task_update.timestamp_ms / 1000.0

def _spawn(self, cmd, **subprocess_args):
  with self._maybe_scrubbed_env():
    log.debug('Executing: %s' % ' '.join(cmd))
    try:
      return subprocess.Popen(cmd, cwd=self._buildroot, **subprocess_args)
    except OSError as e:
      raise self.Error('Problem executing %s: %s' % (self._distribution.java, e))

def on_active(self, task_update):
  log.debug('Task on_active(%s)', task_update)
  self._runner._plan = self._runner._regular_plan
  if self._runner._recovery:
    return
  TaskRunnerHelper.initialize_task(self._pathspec,
      ThermosTaskWrapper(self._runner._task).to_json())

def is_process_lost(self, process_name):
  """Determine whether or not we should mark a task as LOST and do so if necessary."""
  current_run = self._current_process_run(process_name)
  if not current_run:
    raise self.InternalError('No current_run for process %s!' % process_name)

  def forked_but_never_came_up():
    return current_run.state == ProcessState.FORKED and (
      self._clock.time() - current_run.fork_time > self.LOST_TIMEOUT.as_(Time.SECONDS))

  def running_but_coordinator_died():
    if current_run.state != ProcessState.RUNNING:
      return False
    coordinator_pid, _, _ = TaskRunnerHelper.scan_process(self.state, process_name)
    if coordinator_pid is not None:
      return False
    elif self._watcher.has_data(process_name):
      return False
    return True

  if forked_but_never_came_up() or running_but_coordinator_died():
    log.info('Detected a LOST task: %s', current_run)
    log.debug('  forked_but_never_came_up: %s', forked_but_never_came_up())
    log.debug('  running_but_coordinator_died: %s', running_but_coordinator_died())
    return True

  return False

def __init__(self,
             task_id,
             task_monitor,
             disk_collector_provider=DiskCollectorProvider(),
             process_collection_interval=PROCESS_COLLECTION_INTERVAL,
             disk_collection_interval=DiskCollectorSettings.DISK_COLLECTION_INTERVAL,
             history_time=HISTORY_TIME,
             history_provider=HistoryProvider()):
  """
    task_monitor: TaskMonitor object specifying the task whose resources should be monitored
  """
  self._task_monitor = task_monitor  # exposes PIDs, sandbox
  self._task_id = task_id
  log.debug('Initialising resource collection for task %s', self._task_id)
  self._process_collectors = dict()  # ProcessStatus => ProcessTreeCollector
  self._disk_collector_provider = disk_collector_provider
  self._disk_collector = None
  self._process_collection_interval = process_collection_interval.as_(Time.SECONDS)
  self._disk_collection_interval = disk_collection_interval.as_(Time.SECONDS)
  min_collection_interval = min(self._process_collection_interval, self._disk_collection_interval)
  self._history = history_provider.provides(history_time, min_collection_interval)
  self._kill_signal = threading.Event()
  ExceptionalThread.__init__(self, name='%s[%s]' % (self.__class__.__name__, task_id))
  self.daemon = True

def wait_start(self, timeout=MAX_WAIT):
  log.debug("Waiting for task to start.")

  def is_started():
    return self._monitor and (self._monitor.active or self._monitor.finished)

  waited = Amount(0, Time.SECONDS)
  while waited < timeout:
    if not is_started():
      log.debug("  - sleeping...")
      self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
      waited += self.POLL_INTERVAL
    else:
      break

    if not self.is_alive:
      if self._popen_rc != 0:
        raise TaskError("Task failed: %s" % self.compute_status().reason)
      else:
        # We can end up here if the process exited between the call to Popen and
        # waitpid (in is_alive), which is fine.
        log.info("Task runner exited: %s" % self.compute_status().reason)
        break

  if not is_started():
    log.error("Task did not start within deadline, forcing loss.")
    self.lose()
    raise TaskError("Task did not start within deadline.")

def _await_nailgun_server(self, workunit):
  nailgun_timeout_seconds = 5
  max_socket_connect_attempts = 10
  nailgun = None
  port_parse_start = time.time()
  with _safe_open(self._ng_out, 'r') as ng_out:
    while not nailgun:
      started = ng_out.readline()
      if started:
        port = self._parse_nailgun_port(started)
        with open(self._pidfile, 'a') as pidfile:
          pidfile.write(':%d\n' % port)
        nailgun = self._create_ngclient(port, workunit)
        log.debug('Detected ng server up on port %d' % port)
      elif time.time() - port_parse_start > nailgun_timeout_seconds:
        raise NailgunError('Failed to read ng output after %s seconds' % nailgun_timeout_seconds)

  attempt = 0
  while nailgun:
    sock = nailgun.try_connect()
    if sock:
      sock.close()
      log.info('Connected to ng server pid: %d @ port: %d' % self._get_nailgun_endpoint())
      return nailgun
    elif attempt > max_socket_connect_attempts:
      raise NailgunError('Failed to connect to ng output after %d connect attempts'
                         % max_socket_connect_attempts)
    attempt += 1
    log.debug('Failed to connect on attempt %d' % attempt)
    time.sleep(0.1)

def flush(self):
  if self.isOpen():
    self.close()
  self.open()

  data = self.__wbuf.getvalue()
  self.__wbuf = BytesIO()

  self._session.headers['Content-Type'] = 'application/x-thrift'
  self._session.headers['Content-Length'] = str(len(data))
  self._session.headers['Host'] = self.__urlparse.hostname

  response = None
  try:
    response = self._session.post(
        self.__uri,
        data=data,
        timeout=self.__timeout,
        auth=self.__auth)
    response.raise_for_status()
  except request_exceptions.Timeout:
    raise TTransportException(
        type=TTransportException.TIMED_OUT,
        message='Timed out talking to %s' % self.__uri)
  except request_exceptions.RequestException as e:
    if response:
      log.debug('Error connecting, logging response headers:')
      for field_name, field_value in response.headers.items():
        log.debug('  %s: %s' % (field_name, field_value))
    raise TTransportException(
        type=TTransportException.UNKNOWN,
        message='Unknown error talking to %s: %s' % (self.__uri, e))

  self.__rbuf = BytesIO(response.content)

def _get_process_resource_consumption(self, task_id, process_name):
  if task_id not in self.active_tasks:
    log.debug("Task %s not found in active tasks" % task_id)
    return ProcessSample.empty().to_dict()
  sample = self.active_tasks[task_id].resource_monitor.sample_by_process(process_name).to_dict()
  log.debug('Resource consumption (%s, %s) => %s' % (task_id, process_name, sample))
  return sample

def select_binary(base_path, version, name, config=None):
  """Selects a binary matching the current os and architecture.

  :raises: :class:`pants.binary_util.BinaryUtil.BinaryNotFound` if no binary of the given
    version and name could be found.
  """
  # TODO(John Sirois): finish doc of the path structure expected under base_path
  config = config or Config.load()
  bootstrap_dir = config.getdefault('pants_bootstrapdir')
  binary_path = select_binary_base_path(base_path, version, name)
  bootstrapped_binary_path = os.path.join(bootstrap_dir, binary_path)
  if not os.path.exists(bootstrapped_binary_path):
    downloadpath = bootstrapped_binary_path + '~'
    try:
      with select_binary_stream(base_path, version, name, config) as stream:
        with safe_open(downloadpath, 'wb') as bootstrapped_binary:
          bootstrapped_binary.write(stream())
        os.rename(downloadpath, bootstrapped_binary_path)
        chmod_plus_x(bootstrapped_binary_path)
    finally:
      safe_delete(downloadpath)

  log.debug('Selected {binary} binary bootstrapped to: {path}'
            .format(binary=name, path=bootstrapped_binary_path))
  return bootstrapped_binary_path

def genlang(self, lang, targets):
  bases, sources = self._calculate_sources(targets)

  if lang == 'java':
    safe_mkdir(self.java_out)
    gen = '--java_out=%s' % self.java_out
  elif lang == 'python':
    safe_mkdir(self.py_out)
    gen = '--python_out=%s' % self.py_out
  else:
    raise TaskError('Unrecognized protobuf gen lang: %s' % lang)

  args = [self.protobuf_binary, gen]

  for base in bases:
    args.append('--proto_path=%s' % base)

  args.extend(sources)
  log.debug('Executing: %s' % ' '.join(args))
  process = subprocess.Popen(args)
  result = process.wait()
  if result != 0:
    raise TaskError

def run(self):
  log.debug('Health checker thread started.')
  self._clock.sleep(self._initial_interval)
  log.debug('Initial interval expired.')
  while not self._dead.is_set():
    self._maybe_update_failure_count(*self._checker())
    self._clock.sleep(self._interval)

def _update_instances_in_parallel(self, target, instances_to_update):
  """Processes instance updates in parallel and waits for completion.

  Arguments:
  target -- target method to handle instance update.
  instances_to_update -- list of InstanceData with update details.

  Returns Queue with non-updated instance data.
  """
  log.info('Processing in parallel with %s worker thread(s)' % self._update_config.batch_size)
  instance_queue = Queue()
  for instance_to_update in instances_to_update:
    instance_queue.put(instance_to_update)

  try:
    threads = []
    for _ in range(self._update_config.batch_size):
      threads.append(spawn_worker(target, kwargs={'instance_queue': instance_queue}))

    for thread in threads:
      thread.join_and_raise()
  except Exception as e:
    log.debug('Caught unhandled exception: %s' % e)
    self._terminate()
    raise

  return instance_queue

def _construct_scheduler(self):
  """
    Populates:
      self._scheduler_client
      self._client
  """
  self._scheduler_client = SchedulerClient.get(self.cluster, verbose=self.verbose)
  assert self._scheduler_client, "Could not find scheduler (cluster = %s)" % self.cluster.name
  start = time.time()
  while (time.time() - start) < self.CONNECT_MAXIMUM_WAIT.as_(Time.SECONDS):
    try:
      # this can wind up generating any kind of error, because it turns into
      # a call to a dynamically set authentication module.
      self._client = self._scheduler_client.get_thrift_client()
      break
    except SchedulerClient.CouldNotConnect as e:
      log.warning('Could not connect to scheduler: %s' % e)
    except Exception as e:
      # turn any auth module exception into an auth error.
      log.debug('Warning: got an unknown exception during authentication:')
      log.debug(traceback.format_exc())
      raise self.AuthenticationError('Error connecting to scheduler: %s' % e)
  if not self._client:
    raise self.TimeoutError('Timed out trying to connect to scheduler at %s' % self.cluster.name)

  server_version = self._client.getVersion().result.getVersionResult
  if server_version != CURRENT_API_VERSION:
    raise self.APIVersionError("Client Version: %s, Server Version: %s" %
                               (CURRENT_API_VERSION, server_version))

def sample(self):
  """ Collate and aggregate ProcessSamples for process and children
      Returns None: result is stored in self.value
  """
  try:
    last_sample, last_stamp = self._sample, self._stamp
    if self._process is None:
      self._process = Process(self._pid)
    parent = self._process
    parent_sample = process_to_sample(parent)
    new_samples = dict(
        (proc.pid, process_to_sample(proc))
        for proc in parent.get_children(recursive=True))
    new_samples[self._pid] = parent_sample
  except PsutilError as e:
    log.warning("Error during process sampling: %s" % e)
    self._sample = ProcessSample.empty()
    self._rate = 0.0
  else:
    last_stamp = self._stamp
    self._stamp = time()
    # for most stats, calculate simple sum to aggregate
    self._sample = sum(new_samples.values(), ProcessSample.empty())
    # cpu consumption is more complicated
    # We require at least 2 generations of a process before we can calculate rate, so for all
    # current processes that were not running in the previous sample, compare to an empty sample
    if self._sampled_tree and last_stamp:
      new = new_samples.values()
      old = [self._sampled_tree.get(pid, ProcessSample.empty()) for pid in new_samples.keys()]
      new_user_sys = sum(map(attrgetter("user"), new)) + sum(map(attrgetter("system"), new))
      old_user_sys = sum(map(attrgetter("user"), old)) + sum(map(attrgetter("system"), old))
      self._rate = (new_user_sys - old_user_sys) / (self._stamp - last_stamp)
      log.debug("Calculated rate for pid=%s and children: %s" % (self._process.pid, self._rate))
    self._sampled_tree = new_samples

def setup_child_subreaping():
  """
  This uses the prctl(2) syscall to set the `PR_SET_CHILD_SUBREAPER` flag. This
  means if any child processes need to be reparented, they will be reparented to
  this process.

  More documentation here: http://man7.org/linux/man-pages/man2/prctl.2.html
  and here: https://lwn.net/Articles/474787/

  Callers should reap terminal children to prevent zombies.
  """
  log.debug("Calling prctl(2) with PR_SET_CHILD_SUBREAPER")
  # This constant is taken from prctl.h
  PR_SET_CHILD_SUBREAPER = 36
  try:
    library_name = ctypes.util.find_library('c')
    if library_name is None:
      log.warning("libc is not found. Unable to call prctl!")
      log.warning("Children subreaping is disabled!")
      return
    libc = ctypes.CDLL(library_name, use_errno=True)
    # If we are on a system where prctl doesn't exist, this will throw an
    # attribute error.
    ret = libc.prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0)
    if ret != 0:
      errno = ctypes.get_errno()
      raise OSError(errno, os.strerror(errno))
  except Exception as e:
    log.error("Unable to call prctl %s" % e)
    log.error("Children subreaping is disabled!")

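# The docstring above notes that callers must still reap terminated children to avoid
# zombies. A minimal sketch (not part of the original module) of such a non-blocking
# reaping loop using os.waitpid:
import errno
import os


def reap_terminal_children():
  """Hypothetical helper: reap any already-exited children without blocking."""
  while True:
    try:
      pid, _ = os.waitpid(-1, os.WNOHANG)
    except OSError as e:
      if e.errno == errno.ECHILD:  # no remaining children
        return
      raise
    if pid == 0:  # children exist, but none have exited yet
      return
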
def run(self): """ The internal thread for the observer. This periodically polls the checkpoint root for new tasks, or transitions of tasks from active to finished state. """ while not self._stop_event.is_set(): time.sleep(self.POLLING_INTERVAL.as_(Time.SECONDS)) active_tasks = [task_id for _, task_id in self._detector.get_task_ids(state='active')] finished_tasks = [task_id for _, task_id in self._detector.get_task_ids(state='finished')] with self.lock: # Ensure all tasks currently detected on the system are observed appropriately for active in active_tasks: if active not in self.active_tasks: log.debug('task_id %s (unknown) -> active' % active) self.add_active_task(active) for finished in finished_tasks: if finished in self.active_tasks: log.debug('task_id %s active -> finished' % finished) self.active_to_finished(finished) elif finished not in self.finished_tasks: log.debug('task_id %s (unknown) -> finished' % finished) self.add_finished_task(finished) # Remove ObservedTasks for tasks no longer detected on the system for unknown in set(self.active_tasks) - set(active_tasks + finished_tasks): log.debug('task_id %s active -> (unknown)' % unknown) self.remove_active_task(unknown) for unknown in set(self.finished_tasks) - set(active_tasks + finished_tasks): log.debug('task_id %s finished -> (unknown)' % unknown) self.remove_finished_task(unknown)
def _maybe_update_health_check_count(self, is_healthy, reason):
  if not is_healthy:
    log.warning('Health check failure: %s' % reason)

    if self.current_consecutive_successes > 0:
      log.debug('Reset consecutive successes counter.')
      self.current_consecutive_successes = 0

    if self._should_ignore_failure():
      return

    if self._should_fail_fast():
      log.warning('Not enough attempts left to prove health, failing fast.')
      self.healthy = False
      self.reason = reason

    self.current_consecutive_failures += 1
    if self.current_consecutive_failures > self.max_consecutive_failures:
      log.warning('Reached consecutive failure limit.')
      self.healthy = False
      self.reason = reason
  else:
    self.current_consecutive_successes += 1

    if not self.running:
      if self.current_consecutive_successes >= self.min_consecutive_successes:
        log.info('Reached consecutive success limit.')
        self.running = True

    if self.current_consecutive_failures > 0:
      log.debug('Reset consecutive failures counter.')
      self.current_consecutive_failures = 0

def _create_kill_add_lists(self, instance_ids, operation_configs):
  """Determines a particular action (kill or add) to use for every instance in instance_ids.

  Arguments:
  instance_ids -- current batch of IDs to process.
  operation_configs -- OperationConfigs with update details.

  Returns lists of instances to kill and to add.
  """
  to_kill = []
  to_add = []
  for instance_id in instance_ids:
    from_config = operation_configs.from_config.get(instance_id)
    to_config = operation_configs.to_config.get(instance_id)

    if from_config and to_config:
      diff_output = self._diff_configs(from_config, to_config)
      if diff_output:
        log.debug('Task configuration changed for instance [%s]:\n%s'
                  % (instance_id, diff_output))
        to_kill.append(instance_id)
        to_add.append(instance_id)
    elif from_config and not to_config:
      to_kill.append(instance_id)
    elif not from_config and to_config:
      to_add.append(instance_id)
    else:
      raise self.Error('Instance %s is outside of supported range' % instance_id)

  return to_kill, to_add

def flush(self):
  if not self.isOpen():
    self.open()

  data = self.__wbuf.getvalue()
  self.__wbuf = BytesIO()

  self._session.headers['Accept'] = 'application/vnd.apache.thrift.binary'
  self._session.headers['Content-Type'] = 'application/vnd.apache.thrift.binary'
  self._session.headers['Content-Length'] = str(len(data))
  self._session.headers['Host'] = self.__urlparse.hostname

  try:
    response = self._session.post(
        self.__uri,
        data=data,
        timeout=self.__timeout,
        auth=self.__auth)
    response.raise_for_status()
    self.__rbuf = BytesIO(response.content)
  except request_exceptions.Timeout:
    raise TTransportException(
        type=TTransportException.TIMED_OUT,
        message='Timed out talking to %s' % self.__uri)
  except request_exceptions.RequestException as e:
    if e.response is not None:
      log.debug('Request failed, response headers:')
      for field_name, field_value in e.response.headers.items():
        log.debug('  %s: %s' % (field_name, field_value))
      if e.response.status_code in (401, 403):
        raise self.AuthError(e)
    raise TTransportException(
        type=TTransportException.UNKNOWN,
        message='Unknown error talking to %s: %s' % (self.__uri, e))

def terminate_process(cls, state, process_name):
  log.debug('TaskRunnerHelper.terminate_process(%s)' % process_name)
  _, pid, _ = cls._get_process_tuple(state, process_name)
  if pid:
    log.debug('   => SIGTERM pid %s' % pid)
    cls.terminate_pid(pid)
  return bool(pid)

def genlang(self, lang, targets):
  protobuf_binary = select_binary(
      self.protoc_supportdir,
      self.protoc_version,
      'protoc',
      self.context.config)

  bases, sources = self._calculate_sources(targets)

  if lang == 'java':
    safe_mkdir(self.java_out)
    gen = '--java_out=%s' % self.java_out
  elif lang == 'python':
    safe_mkdir(self.py_out)
    gen = '--python_out=%s' % self.py_out
  else:
    raise TaskError('Unrecognized protobuf gen lang: %s' % lang)

  args = [self.protobuf_binary, gen]

  for base in bases:
    args.append('--proto_path=%s' % base)

  args.extend(sources)
  log.debug('Executing: %s' % ' '.join(args))
  process = subprocess.Popen(args)
  result = process.wait()
  if result != 0:
    raise TaskError('%s ... exited non-zero (%i)' % (self.protobuf_binary, result))

def write(self, slice_, data):
  log.debug('Disk writing %s' % slice_)
  if len(data) != slice_.length:
    raise self.WriteError('Block must be of appropriate size!')
  with open(slice_._filename, 'r+b') as fp:
    fp.seek(slice_.start)
    fp.write(data)

def genlang(self, lang, targets):
  bases, sources = self._calculate_sources(targets)
  bases = bases.union(self._proto_path_imports(targets))

  if lang == 'java':
    output_dir = self.java_out
    gen_flag = '--java_out'
  elif lang == 'python':
    output_dir = self.py_out
    gen_flag = '--python_out'
  else:
    raise TaskError('Unrecognized protobuf gen lang: %s' % lang)

  safe_mkdir(output_dir)
  gen = '%s=%s' % (gen_flag, output_dir)

  args = [self.protobuf_binary, gen]

  if self.plugins:
    for plugin in self.plugins:
      # TODO(Eric Ayers) Is it a good assumption that the generated source output dir is
      # acceptable for all plugins?
      args.append("--%s_protobuf_out=%s" % (plugin, output_dir))

  for base in bases:
    args.append('--proto_path=%s' % base)

  args.extend(sources)
  log.debug('Executing: %s' % ' '.join(args))
  process = subprocess.Popen(args)
  result = process.wait()
  if result != 0:
    raise TaskError('%s ... exited non-zero (%i)' % (self.protobuf_binary, result))

def __init__(self,
             task_monitor,
             sandbox,
             process_collector=ProcessTreeCollector,
             disk_collector=DiskCollector,
             process_collection_interval=Amount(20, Time.SECONDS),
             disk_collection_interval=Amount(1, Time.MINUTES),
             history_time=Amount(1, Time.HOURS)):
  """
    task_monitor: TaskMonitor object specifying the task whose resources should be monitored
    sandbox: Directory for which to monitor disk utilisation
  """
  self._task_monitor = task_monitor  # exposes PIDs, sandbox
  self._task_id = task_monitor._task_id
  log.debug('Initialising resource collection for task %s' % self._task_id)
  self._process_collectors = dict()  # ProcessStatus => ProcessTreeCollector
  # TODO(jon): sandbox is also available through task_monitor, but typically the first checkpoint
  # isn't written (and hence the header is not available) by the time we initialise here
  self._sandbox = sandbox
  self._process_collector_factory = process_collector
  self._disk_collector = disk_collector(self._sandbox)
  self._process_collection_interval = process_collection_interval.as_(Time.SECONDS)
  self._disk_collection_interval = disk_collection_interval.as_(Time.SECONDS)
  min_collection_interval = min(self._process_collection_interval, self._disk_collection_interval)
  history_length = int(history_time.as_(Time.SECONDS) / min_collection_interval)
  if history_length > self.MAX_HISTORY:
    raise ValueError("Requested history length too large")
  log.debug("Initialising ResourceHistory of length %s" % history_length)
  self._history = ResourceHistory(history_length)
  self._kill_signal = threading.Event()
  threading.Thread.__init__(self)
  self.daemon = True

def terminal_state(self):
  if self._terminal_state:
    log.debug('Forced terminal state: %s' %
        TaskState._VALUES_TO_NAMES.get(self._terminal_state, 'UNKNOWN'))
    return self._terminal_state
  else:
    return TaskState.SUCCESS if self.is_healthy() else TaskState.FAILED

def _get_user_topics(self):
  with CheatSheetsCache(self.ttl) as cs:
    if self.force or not cs.has_valid_user_cheat_sheets(self.user_name):
      log.debug('Fetching user cheats from server')
      user_sheet = self.api.sheets(self.user_name)
      cs.add_or_update_user_sheet(self.user_name, user_sheet)
    return cs.get_user_topics(self.user_name)

def on_killed(self, process_update):
  log.debug('Process on_killed %s', process_update)
  self._cleanup(process_update)
  self._runner._task_processes.pop(process_update.process)
  self._runner._watcher.unregister(process_update.process)
  log.debug('Process killed, marking it as a loss.')
  self._runner._plan.lost(process_update.process)

def select(self):
  """
    Read and multiplex checkpoint records from all the forked off process coordinators.

    Checkpoint records can come from one of two places:
      in-process: checkpoint records synthesized for FORKED and LOST events
      out-of-process: checkpoint records from file descriptors of forked coordinators

    Returns a list of RunnerCkpt objects that were successfully read, or an empty
    list if none were read.
  """
  self._bind_processes()
  updates = []
  for handle in filter(None, self._processes.values()):
    try:
      fstat = os.fstat(handle.fileno())
    except OSError as e:
      log.error('Unable to fstat %s!' % handle.name)
      continue
    if handle.tell() > fstat.st_size:
      log.error('Truncated checkpoint record detected on %s!' % handle.name)
    elif handle.tell() < fstat.st_size:
      rr = ThriftRecordReader(handle, RunnerCkpt)
      while True:
        process_update = rr.try_read()
        if process_update:
          updates.append(process_update)
        else:
          break
  if len(updates) > 0:
    log.debug('select() returning %s updates:' % len(updates))
    for update in updates:
      log.debug('  = %s' % update)
  return updates

def control(self, force=False):
  """
    Bind to the checkpoint associated with this task, position to the end of the log if
    it exists, or create it if it doesn't.  Fails if we cannot get "leadership" i.e. a
    file lock on the checkpoint stream.
  """
  if self.is_terminal():
    raise self.StateError('Cannot take control of a task in terminal state.')
  if self._sandbox:
    safe_mkdir(self._sandbox)
  ckpt_file = self._pathspec.getpath('runner_checkpoint')
  try:
    self._ckpt = TaskRunnerHelper.open_checkpoint(ckpt_file, force=force, state=self._state)
  except TaskRunnerHelper.PermissionError:
    raise self.PermissionError('Unable to open checkpoint %s' % ckpt_file)
  log.debug('Flipping recovery mode off.')
  self._recovery = False
  self._set_task_status(self.task_state())
  self._resume_task()
  try:
    yield
  except Exception as e:
    log.error('Caught exception in self.control(): %s', e)
    log.error('  %s', traceback.format_exc())
  self._ckpt.close()

def create(self):
  log.debug('DirectorySandbox: mkdir %s' % self.root)

  try:
    safe_mkdir(self.root)
  except (IOError, OSError) as e:
    raise self.CreationError('Failed to create the sandbox: %s' % e)

  if self._user:
    pwent, grent = self.get_user_and_group()

    try:
      log.debug('DirectorySandbox: chown %s:%s %s' % (self._user, grent.gr_name, self.root))
      os.chown(self.root, pwent.pw_uid, pwent.pw_gid)
      log.debug('DirectorySandbox: chmod 700 %s' % self.root)
      os.chmod(self.root, 0700)
    except (IOError, OSError) as e:
      raise self.CreationError('Failed to chown/chmod the sandbox: %s' % e)

def stats_uploader_daemon(self, stats):
  """
  Starts the StatsUploader as a daemon process if it is not already running
  """
  log.debug("Checking if the statsUploaderDaemon is already running")
  stats_pid = os.path.join("/tmp", self._user, ".pid_stats")
  stats_uploader_dir = os.path.join("/tmp", self._user)
  dirutil.safe_mkdir(stats_uploader_dir)
  if not os.path.exists(stats_pid):
    log.debug("Starting the daemon")
    stats_log_file = os.path.join("/tmp", self._user, "buildtime_uploader")
    log.debug("The logs are written to %s" % stats_log_file)
    if spawn_daemon(pidfile=stats_pid, quiet=True):
      force_stats_upload = False
      if "--force_stats_upload" in sys.argv:
        force_stats_upload = True
      su = StatsUploader(STATS_COLLECTION_URL, STATS_COLLECTION_PORT, STATS_COLLECTION_ENDPOINT,
                         self._max_delay, self._get_default_stats_file(), self._user,
                         force_stats_upload)
      su.upload_sync(stats)

def _await_nailgun_server(self, stdout, stderr):
  nailgun_timeout_seconds = 5
  max_socket_connect_attempts = 10
  nailgun = None
  port_parse_start = time.time()
  with safe_open(self._ng_out, 'r') as ng_out:
    while not nailgun:
      started = ng_out.readline()
      if started:
        port = self._parse_nailgun_port(started)
        nailgun = self._create_ngclient(port, stdout, stderr)
        log.debug('Detected ng server up on port %d' % port)
      elif time.time() - port_parse_start > nailgun_timeout_seconds:
        raise NailgunClient.NailgunError('Failed to read ng output after'
                                         ' %s seconds' % nailgun_timeout_seconds)

  attempt = 0
  while nailgun:
    sock = nailgun.try_connect()
    if sock:
      sock.close()
      endpoint = self._get_nailgun_endpoint()
      if endpoint:
        log.debug('Connected to ng server with fingerprint %s pid: %d @ port: %d' % endpoint)
      else:
        raise NailgunClient.NailgunError('Failed to connect to ng server.')
      return nailgun
    elif attempt > max_socket_connect_attempts:
      raise nailgun.NailgunError('Failed to connect to ng output after %d connect attempts'
                                 % max_socket_connect_attempts)
    attempt += 1
    log.debug('Failed to connect on attempt %d' % attempt)
    time.sleep(0.1)

def create(self):
  log.debug('DirectorySandbox: mkdir %s' % self.root)

  try:
    safe_mkdir(self.root)
  except (IOError, OSError) as e:
    raise self.CreationError('Failed to create the sandbox: %s' % e)

  if self._user:
    try:
      pwent = pwd.getpwnam(self._user)
      grent = grp.getgrgid(pwent.pw_gid)
    except KeyError:
      raise self.CreationError(
          'Could not create sandbox because user does not exist: %s' % self._user)

    try:
      log.debug('DirectorySandbox: chown %s:%s %s' % (self._user, grent.gr_name, self.root))
      os.chown(self.root, pwent.pw_uid, pwent.pw_gid)
      log.debug('DirectorySandbox: chmod 700 %s' % self.root)
      os.chmod(self.root, 0700)
    except (IOError, OSError) as e:
      raise self.CreationError('Failed to chown/chmod the sandbox: %s' % e)

def on_lost(self, task_update):
  log.debug('Task on_lost(%s)' % task_update)
  self._cleanup()

def shutdown(self):
  if log:
    log.debug('Shutting down metric sampler.')
  self._shutdown = True

def on_failed(self, task_update):
  log.debug('Task on_failed(%s)' % task_update)
  self._cleanup()

def schedule_cron(self, config, lock=None):
  log.info("Registering job %s with cron" % config.name())
  log.debug('Full configuration: %s' % config.job())
  log.debug('Lock %s' % lock)
  return self._scheduler_proxy.scheduleCronJob(config.job(), lock)

def run(self):
  while True:
    self._event.wait()
    log.debug('Join event triggered, joining serverset.')
    self._event.clear()
    self._joiner()

def on_initialization(self, header):
  log.debug('_on_initialization: %s' % header)
  ThermosTaskValidator.assert_valid_task(self._runner.task)
  ThermosTaskValidator.assert_valid_ports(self._runner.task, header.ports)
  self._checkpoint(RunnerCkpt(runner_header=header))

def on_forked(self, process_update):
  log.debug('Process on_forked %s' % process_update)
  task_process = self._runner._task_processes[process_update.process]
  task_process.rebind(process_update.coordinator_pid, process_update.fork_time)
  self._runner._plan.set_running(process_update.process)

def execute(self):
  """Perform final initialization and launch target process commandline in a subprocess."""
  user, _ = self._getpwuid()
  username, homedir = user.pw_name, user.pw_dir

  # TODO(wickman) reconsider setsid now that we're invoking in a subshell
  os.setsid()
  if self._use_chroot:
    self._chroot()

  # If the mesos containerizer path is set, then this process will be launched from within an
  # isolated filesystem image by the mesos-containerizer executable. This executable needs to be
  # run as root so that it can properly set up the filesystem; as such we'll skip calling setuid
  # at this point. We'll instead setuid after the process has been forked (mesos-containerizer
  # itself ensures the forked process is run as the correct user).
  taskfs_isolated = self._mesos_containerizer_path is not None
  if not taskfs_isolated:
    self._setuid()

  # start process
  start_time = self._platform.clock().time()

  if not self._sandbox:
    cwd = subprocess_cwd = sandbox = os.getcwd()
  else:
    if self._use_chroot:
      cwd = subprocess_cwd = sandbox = '/'
    elif taskfs_isolated:
      cwd = homedir = sandbox = self._container_sandbox
      subprocess_cwd = self._sandbox
    else:
      cwd = subprocess_cwd = homedir = sandbox = self._sandbox

  thermos_profile = os.path.join(sandbox, self.RCFILE)

  if self._preserve_env:
    env = deepcopy(os.environ)
  else:
    env = {}

  env.update({
    'HOME': homedir,
    'LOGNAME': username,
    'USER': username,
    'PATH': os.environ['PATH']
  })

  wrapped_cmdline = self.wrapped_cmdline(cwd)
  log.debug('Wrapped cmdline: %s' % wrapped_cmdline)

  real_thermos_profile_path = os.path.join(
      os.environ['MESOS_DIRECTORY'],
      TASK_FILESYSTEM_MOUNT_POINT,
      thermos_profile.lstrip('/')) if taskfs_isolated else thermos_profile

  if os.path.exists(real_thermos_profile_path):
    env.update(BASH_ENV=thermos_profile)

  log.debug('ENV is: %s' % env)
  subprocess_args = {
    'args': wrapped_cmdline,
    'close_fds': self.FD_CLOEXEC,
    'cwd': subprocess_cwd,
    'env': env,
    'pathspec': self._pathspec
  }

  log_destination_resolver = LogDestinationResolver(
      self._pathspec,
      destination=self._logger_destination,
      mode=self._logger_mode,
      rotate_log_size=self._rotate_log_size,
      rotate_log_backups=self._rotate_log_backups)
  stdout, stderr, handlers_are_files = log_destination_resolver.get_handlers()
  if handlers_are_files:
    executor = SubprocessExecutor(stdout=stdout, stderr=stderr, **subprocess_args)
  else:
    executor = PipedSubprocessExecutor(stdout=stdout, stderr=stderr, **subprocess_args)

  pid = executor.start()

  # Now that we've forked the process, if the task's filesystem is isolated it's now safe to
  # setuid.
  if taskfs_isolated:
    self._setuid()

  self._write_process_update(state=ProcessState.RUNNING, pid=pid, start_time=start_time)

  rc = executor.wait()

  # indicate that we have finished/failed
  if rc < 0:
    state = ProcessState.KILLED
  elif rc == 0:
    state = ProcessState.SUCCESS
  else:
    state = ProcessState.FAILED

  self._write_process_update(state=state, return_code=rc, stop_time=self._platform.clock().time())
  self._rc = rc

def watch(self, instance_ids, health_check=None):
  """Watches a set of instances and detects failures based on a delegated health check.

  Arguments:
  instance_ids -- set of instances to watch.

  Returns a set of instances that are considered failed.
  """
  log.info('Watching instances: %s' % instance_ids)
  instance_ids = set(instance_ids)
  health_check = health_check or StatusHealthCheck()

  instance_states = {}

  def finished_instances():
    return dict((s_id, s) for s_id, s in instance_states.items() if s.finished)

  def set_instance_healthy(instance_id, now):
    if instance_id not in instance_states:
      instance_states[instance_id] = Instance(now)
    instance = instance_states.get(instance_id)
    if now > (instance.birthday + self._watch_secs):
      log.info('Instance %s has been up and healthy for at least %d seconds' % (
          instance_id, self._watch_secs))
      instance.set_healthy(True)

  def set_instance_unhealthy(instance_id):
    log.info('Instance %s is unhealthy' % instance_id)
    if instance_id in instance_states:
      # An instance that was previously healthy and currently unhealthy has failed.
      instance_states[instance_id].set_healthy(False)
    else:
      # An instance never passed a health check (e.g.: failed before the first health check).
      instance_states[instance_id] = Instance(finished=True)

  while not self._terminating.is_set():
    running_tasks = self._status_helper.get_tasks(instance_ids)
    now = self._clock.time()
    tasks_by_instance = dict((task.assignedTask.instanceId, task) for task in running_tasks)
    for instance_id in instance_ids:
      if instance_id not in finished_instances():
        running_task = tasks_by_instance.get(instance_id)
        if running_task is not None:
          task_healthy = health_check.health(running_task)
          if task_healthy:
            set_instance_healthy(instance_id, now)
          else:
            set_instance_unhealthy(instance_id)

    log.debug('Instances health: %s' % ['%s: %s' % val for val in instance_states.items()])

    # Return if all tasks are finished.
    if set(finished_instances().keys()) == instance_ids:
      return set([s_id for s_id, s in instance_states.items() if not s.healthy])

    self._terminating.wait(self._health_check_interval_seconds)

def __on_removed(self, root, task_id):
  log.debug('on_removed(%r, %r)', root, task_id)
  active_task = self._active_tasks.pop(task_id, None)
  if active_task:
    active_task.resource_monitor.kill()
  self._finished_tasks.pop(task_id, None)

def _log(self, msg, exc_info=None):
  log.debug('[process:%5s=%s]: %s' % (self._pid, self.name(), msg), exc_info=exc_info)

def create(args, options):
  validate_common_options(options)

  if not options.num_nodes:
    app.error("--num_nodes is required")

  if not options.cluster_user:
    app.error("--cluster_user is required")

  url = 'http://%s:%s/clusters/%s' % (options.api_host, options.api_port, options.cluster_name)
  values = dict(
      num_nodes=int(options.num_nodes),
      cluster_user=options.cluster_user,
      size=options.size if options.size else '',  # 'urlencode()' doesn't accept None.
      backup_id=options.backup_id if options.backup_id else '',
      cluster_password=options.cluster_password if options.cluster_password else '')

  req = urllib2.Request(url, urllib.urlencode(values))
  try:
    response = urllib2.urlopen(req).read()
  except urllib2.HTTPError as e:
    log.error("POST request failed: %s, %s, %s" %
              (e.code, BaseHTTPServer.BaseHTTPRequestHandler.responses[e.code], e.read()))
    app.quit(1)

  try:
    result = json.loads(response)
    if not isinstance(result, dict):
      raise ValueError()
  except ValueError:
    log.error("Invalid response: %s" % response)
    app.quit(1)

  log.info("Cluster created. Cluster info: %s" % str(result))
  with open(options.password_file, 'w') as f:
    f.write(result["cluster_password"])

  log.info("Waiting for the master for this cluster to be elected...")
  master_endpoint = wait_for_master(result['cluster_url']).service_endpoint

  connection_str = "mysql://%s:%s@%s:%d/" % (
      options.cluster_user,
      result["cluster_password"],
      master_endpoint.host,
      master_endpoint.port)
  log.info("Connecting to the MySQL cluster master: %s" % connection_str)
  engine = create_engine(connection_str)

  for i in range(5):  # Loop for 5 times/seconds to wait for the master to be promoted.
    try:
      # TODO(jyx): Test writing to the master and reading from the slave.
      result = engine.execute("SELECT 1;").scalar()
      assert 1 == int(result), "Expecting result to be 1 but got %s" % result
      break
    except OperationalError:
      if i == 4:
        raise
      log.debug("MySQL master not ready yet. Sleep for 1 second...")
      time.sleep(1)

  log.info("Cluster successfully started")

def run(self): """Thread entrypoint. Loop indefinitely, polling collectors at self._collection_interval and collating samples.""" log.debug('Commencing resource monitoring for task "%s"' % self._task_id) next_process_collection = 0 next_disk_collection = 0 while not self._kill_signal.is_set(): now = time.time() if now > next_process_collection: next_process_collection = now + self._process_collection_interval actives = set(self._get_active_processes()) current = set(self._process_collectors) for process in current - actives: self._process_collectors.pop(process) for process in actives - current: self._process_collectors[process] = ProcessTreeCollector( process.pid) for process, collector in self._process_collectors.items(): collector.sample() if now > next_disk_collection: next_disk_collection = now + self._disk_collection_interval if not self._disk_collector: sandbox = self._task_monitor.get_sandbox() if sandbox: self._disk_collector = self._disk_collector_class( sandbox) if self._disk_collector: self._disk_collector.sample() else: log.debug('No sandbox detected yet for %s' % self._task_id) try: disk_usage = self._disk_collector.value if self._disk_collector else 0 proc_usage_dict = dict() for process, collector in self._process_collectors.items(): proc_usage_dict.update({ process: self.ProcResourceResult(collector.value, collector.procs) }) self._history.add( now, self.FullResourceResult(proc_usage_dict, disk_usage)) except ValueError as err: log.warning("Error recording resource sample: %s" % err) log.debug( "TaskResourceMonitor: finished collection of %s in %.2fs" % (self._task_id, (time.time() - now))) # Sleep until any of the following conditions are met: # - it's time for the next disk collection # - it's time for the next process collection # - the result from the last disk collection is available via the DiskCollector # - the TaskResourceMonitor has been killed via self._kill_signal now = time.time() next_collection = min(next_process_collection - now, next_disk_collection - now) if self._disk_collector: waiter = EventMuxer(self._kill_signal, self._disk_collector.completed_event) else: waiter = self._kill_signal if next_collection > 0: waiter.wait(timeout=next_collection) else: log.warning( 'Task resource collection is backlogged. Consider increasing ' 'process_collection_interval and disk_collection_interval.' ) log.debug('Stopping resource monitoring for task "%s"' % self._task_id)
def stop(self):
  log.debug('Health checker thread stopped.')
  self.dead.set()

def on_process_transition(self, state, process_update):
  log.debug('_on_process_transition: %s' % process_update)
  self._checkpoint(RunnerCkpt(process_status=process_update))

def read_opcode(data, offset):
  opcode, offset = read_number(data, offset)
  if opcode not in ZK_REQUEST_TYPES:
    log.debug("Bad request type: %s", opcode)
    raise DeserializationError("Invalid request type: %d" % (opcode))
  return (opcode, offset)

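# For context, a minimal sketch (an assumption, not the original implementation) of the
# read_number helper used above: ZooKeeper frames encode numbers as 4-byte big-endian
# signed integers, so such a helper would unpack one value and advance the offset.
import struct


def read_number_sketch(data, offset):
  """Hypothetical helper: read a 4-byte big-endian int from data at offset."""
  (value,) = struct.unpack_from('!i', data, offset)
  return value, offset + 4
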
def on_task_transition(self, state, task_update):
  log.debug('_on_task_transition: %s' % task_update)
  self._checkpoint(RunnerCkpt(task_status=task_update))

def on_running(self, process_update):
  log.debug('Process on_running %s' % process_update)
  self._runner._plan.set_running(process_update.process)

def close_ckpt(self):
  """Force close the checkpoint stream.  This is necessary for runners terminated through
     exception propagation."""
  log.debug('Closing the checkpoint stream.')
  self._ckpt.close()

def on_lost(self, process_update):
  log.debug('Process on_lost %s' % process_update)
  self._cleanup(process_update)
  self._on_abnormal(process_update)
  self._runner._plan.lost(process_update.process)

def genlang(self, lang, targets):
  bases, sources = calculate_compile_roots(targets, self.is_gentarget)

  if lang == 'java':
    gen = self.gen_java.gen
  elif lang == 'python':
    gen = self.gen_python.gen
  else:
    raise TaskError('Unrecognized thrift gen lang: %s' % lang)

  args = [
    self.thrift_binary,
    '--gen', gen,
    '-recurse',
  ]

  if self.strict:
    args.append('-strict')
  if self.verbose:
    args.append('-verbose')
  for base in bases:
    args.extend(('-I', base))

  sessions = []
  for source in sources:
    self.context.log.info('Generating thrift for %s\n' % source)
    # Create a unique session dir for this thrift root.  Sources may be full paths but we only
    # need the path relative to the build root to ensure uniqueness.
    # TODO(John Sirois): file paths should be normalized early on and uniformly, fix the need to
    # relpath here at all.
    relsource = os.path.relpath(source, get_buildroot())

    if lang == "python":
      copied_source = os.path.join(self._workdir, relsource)
      safe_mkdir(os.path.dirname(copied_source))
      shutil.copyfile(source, copied_source)
      replace_python_keywords_in_file(copied_source)
      source = relsource = copied_source

    outdir = os.path.join(self.session_dir, '.'.join(relsource.split(os.path.sep)))
    safe_mkdir(outdir)

    cmd = args[:]
    cmd.extend(('-o', outdir))
    cmd.append(source)
    log.debug('Executing: %s' % ' '.join(cmd))
    sessions.append(self.ThriftSession(outdir, cmd, subprocess.Popen(cmd)))

  result = 0
  for session in sessions:
    if result != 0:
      session.process.kill()
    else:
      result = session.process.wait()
      if result != 0:
        self.context.log.error('Failed: %s' % ' '.join(session.cmd))
      else:
        _copytree(session.outdir, self.combined_dir)
  if result != 0:
    raise TaskError('%s ... exited non-zero (%i)' % (self.thrift_binary, result))

def wrapped_func(self, *args):
  log.debug('%s(%s)' % (func.__name__, ', '.join(
      '%s=%s' % (name, arg) for (name, arg) in zip(arg_names, args))))
  return func(self, *args)

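# wrapped_func above is the inner function of a call-logging decorator; func and arg_names
# come from the enclosing scope. A minimal sketch (assumed context, not the original source)
# of what that enclosing decorator could look like:
import functools
import inspect
import logging as log


def log_call(func):
  """Hypothetical decorator: log each method call with its argument names."""
  arg_names = inspect.getargspec(func).args[1:]  # skip 'self'

  @functools.wraps(func)
  def wrapped_func(self, *args):
    log.debug('%s(%s)' % (func.__name__, ', '.join(
        '%s=%s' % (name, arg) for (name, arg) in zip(arg_names, args))))
    return func(self, *args)

  return wrapped_func
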
def create_job(self, config, lock=None):
  log.info('Creating job %s' % config.name())
  log.debug('Full configuration: %s' % config.job())
  log.debug('Lock %s' % lock)
  return self._scheduler_proxy.createJob(config.job(), lock)

def on_success(self, task_update):
  log.debug('Task on_success(%s)' % task_update)
  self._cleanup()
  log.info('Task succeeded.')

def on_cleaning(self, task_update):
  log.debug('Task on_cleaning(%s)' % task_update)
  self._runner._finalization_start = task_update.timestamp_ms / 1000.0
  self._runner._terminate_plan(self._runner._regular_plan)

def _log(self, msg):
  log.debug('[process:%5s=%s]: %s' % (self._pid, self.name(), msg))