def __iter__(self):
    """Iterate over the records of the stream from its beginning.

    The descriptor behind ``self._fp`` is duplicated and wrapped in a new
    file object opened with the same mode, which is then rewound to offset 0.
    Records are pulled with ``RecordIO.Reader.do_read`` until it returns a
    falsy value. The duplicate file object is closed when iteration ends.

    If wrapping or rewinding the duplicate fails with ``OSError`` the failure
    is logged, the raw descriptor is closed and nothing is yielded.

    May raise:
      RecordIO.PrematureEndOfStream
    """
    dup_fd = os.dup(self._fp.fileno())
    try:
        reader_fp = os.fdopen(dup_fd, self._fp.mode)
        reader_fp.seek(0)
    except OSError as dup_err:
        log.error('Failed to duplicate fd on %s, error = %s' % (self._fp.name, dup_err))
        try:
            os.close(dup_fd)
        except OSError as close_err:
            # EBADF means the descriptor is already closed; nothing to report.
            if close_err.errno != errno.EBADF:
                log.error('Failed to close duped fd on %s, error = %s' % (self._fp.name, close_err))
        return

    try:
        while True:
            record = RecordIO.Reader.do_read(reader_fp, self._codec)
            if not record:
                break
            yield record
    finally:
        reader_fp.close()
def delete(args, options):
    """Ask the API server to delete a cluster and wait for its termination.

    The password is read from ``options.password_file`` and sent as form data
    in an HTTP DELETE request to
    ``http://<api_host>:<api_port>/clusters/<cluster_name>``. The response has
    to be a JSON object; its ``cluster_url`` entry is passed to
    ``wait_for_termination``.

    Arguments:
      args -- positional command arguments (not used by this function).
      options -- parsed options; ``password_file``, ``api_host``, ``api_port``
                 and ``cluster_name`` are read.

    On an HTTP error or a response that is not a JSON object, the problem is
    logged and ``app.quit(1)`` is called.
    """
    validate_common_options(options)

    with open(options.password_file, 'r') as f:
        password = f.read().strip()
    if not password:
        app.error("Empty password file")

    url = 'http://%s:%s/clusters/%s' % (options.api_host, options.api_port, options.cluster_name)
    values = dict(password=password)

    req = urllib2.Request(url, urllib.urlencode(values))
    # Supplying data makes urllib2 issue a POST; force the DELETE verb instead.
    req.get_method = lambda: 'DELETE'

    try:
        response = urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
        # The server may answer with a status code missing from the stdlib
        # table. Look it up with a fallback so that a KeyError raised inside
        # this handler cannot hide the real HTTP failure.
        reason = BaseHTTPServer.BaseHTTPRequestHandler.responses.get(
            e.code, ('Unknown status code', ''))
        log.error("DELETE request failed: %s, %s, %s" % (e.code, reason, e.read()))
        app.quit(1)

    try:
        result = json.loads(response)
        if not isinstance(result, dict):
            raise ValueError()
    except ValueError:
        log.error("Invalid response: %s" % response)
        app.quit(1)

    log.info("Cluster deletion result: %s" % result)

    log.info("Waiting for the cluster to terminate...")
    wait_for_termination(result['cluster_url'])

    log.info("Cluster terminated/deleted")
def __on_active(self, root, task_id):
    """Start observing a task that was reported as active.

    A ``TaskMonitor`` is created for the task together with a resource
    monitor: ``NullTaskResourceMonitor`` when resource collection is disabled,
    otherwise a ``TaskResourceMonitor`` configured with a
    ``DiskCollectorProvider``. The resource monitor is started and both
    monitors are stored in ``self._active_tasks`` under ``task_id``.

    A task id that is already in ``self.finished_tasks`` is logged as an
    error and ignored.
    """
    log.debug('on_active(%r, %r)', root, task_id)
    if task_id in self.finished_tasks:
        log.error('Found an active task (%s) in finished tasks?', task_id)
        return

    monitor = TaskMonitor(root, task_id)

    if not self._disable_task_resource_collection:
        provider = DiskCollectorProvider(
            self._enable_mesos_disk_collector,
            self._disk_collector_settings)
        resources = TaskResourceMonitor(
            task_id,
            monitor,
            disk_collector_provider=provider,
            process_collection_interval=self._task_process_collection_interval,
            disk_collection_interval=self._disk_collector_settings.disk_collection_interval)
    else:
        resources = NullTaskResourceMonitor()

    resources.start()
    self._active_tasks[task_id] = ActiveObservedTask(root, task_id, monitor, resources)
def launchTask(self, driver, task):
    """Handle a task launch request delivered to this executor.

    Invoked when a task has been launched on this executor (initiated via
    Scheduler::launchTasks). The task may be realized with a thread, a
    process, or some simple computation; no other callbacks are invoked on
    this executor until this callback has returned.

    If a runner already exists the new task is reported as TASK_LOST. If the
    task cannot be deserialized it is reported as TASK_FAILED and a driver
    stop is deferred. Otherwise ``self._run`` is deferred with the decoded
    task.
    """
    self.launched.set()
    self.log('launchTask got task: %s:%s' % (task.name, task.task_id.value))

    # TODO(wickman) Update the tests to call registered(), then remove this line and issue
    # an assert if self._driver is not populated.
    self._driver = driver

    if self._runner:
        log.error('Already running a task! %s' % self._task_id)
        rejection = "Task already running on this executor: %s" % self._task_id
        self.send_update(driver, task.task_id.value, mesos_pb.TASK_LOST, rejection)
        return

    self._slave_id = task.slave_id.value
    self._task_id = task.task_id.value

    try:
        assigned_task = assigned_task_from_mesos_task(task)
        mesos_task = mesos_task_instance_from_assigned_task(assigned_task)
    except Exception as e:
        log.fatal('Could not deserialize AssignedTask')
        log.fatal(traceback.format_exc())
        failure = "Could not deserialize task: %s" % e
        self.send_update(driver, self._task_id, mesos_pb.TASK_FAILED, failure)
        defer(driver.stop, delay=self.STOP_WAIT)
        return

    def run_task():
        return self._run(driver, assigned_task, mesos_task)

    defer(run_task)
def _shutdown(self, status_result):
    """Stop the runner and the checkers, then report the final task state.

    The runner status is sampled before anything is stopped. The runner and
    the chained checker are each given ``STOP_TIMEOUT`` to stop; a timeout is
    logged and otherwise ignored. A final update is sent, ``self.terminated``
    is set and a driver stop is deferred by ``PERSISTENCE_WAIT``.

    Arguments:
      status_result -- status that triggered the shutdown; its ``reason`` is
                       always the message of the final update.
    """
    status_before_stop = self._runner.status

    def stop_within_deadline(stop_fn, timeout_message):
        try:
            deadline(stop_fn, timeout=self.STOP_TIMEOUT)
        except Timeout:
            log.error(timeout_message)

    stop_within_deadline(self._runner.stop, 'Failed to stop runner within deadline.')
    stop_within_deadline(
        self._chained_checker.stop, 'Failed to stop all checkers within deadline.')

    # If the runner was alive when _shutdown was called, defer to the status_result,
    # otherwise the runner's terminal state is the preferred state.
    if status_before_stop:
        exit_status = status_before_stop
    else:
        exit_status = status_result

    mesos_state = self.translate_exit_state_to_mesos(exit_status.status)
    self.send_update(self._driver, self._task_id, mesos_state, status_result.reason)

    self.terminated.set()
    defer(self._driver.stop, delay=self.PERSISTENCE_WAIT)
def send_request(self, endpoint, message, ttl):
    """Issue a GET to ``<target>/<endpoint>/<message>/<ttl>`` and discard the body.

    ``self._target`` supplies the two values (formatted with ``%s`` and
    ``%d``) used to build the base URL. Any exception raised while building
    the full URL or performing the request is logged and swallowed.
    """
    url_base = 'http://%s:%d' % self._target
    try:
        request_url = '%s/%s/%s/%d' % (url_base, endpoint, message, ttl)
        reply = urllib2.urlopen(request_url)
        reply.read()
    except Exception as e:
        log.error('Failed to query %s: %s' % (url_base, e))
def compute_status(self):
    """Return the exit state mapped from the task state, or None.

    Returns None while ``self.is_alive`` is true. Otherwise the current
    ``self.task_state()`` is looked up in ``self.EXIT_STATE_MAP``; a missing
    entry is logged as an error and None is returned.
    """
    if self.is_alive:
        return None

    state = self.EXIT_STATE_MAP.get(self.task_state())
    if state is None:
        log.error('Received unexpected exit state from TaskMonitor.')
    return state
def select(self):
    """Read and multiplex checkpoint records from the forked process coordinators.

    Checkpoint records can come from one of two places:
      in-process: checkpoint records synthesized for FORKED and LOST events
      out-of-process: checkpoint records from file descriptors of forked coordinators

    Every truthy handle in ``self._processes`` is fstat'ed. A handle whose
    position is past the file size is reported as truncated; a handle whose
    position is before the file size is drained with a ``ThriftRecordReader``.

    Returns a list of RunnerCkpt objects that were successfully read, or an
    empty list if none were read.
    """
    self._bind_processes()
    collected = []

    for handle in filter(None, self._processes.values()):
        try:
            size = os.fstat(handle.fileno()).st_size
        except OSError:
            log.error('Unable to fstat %s!' % handle.name)
            continue

        if handle.tell() > size:
            log.error('Truncated checkpoint record detected on %s!' % handle.name)
        elif handle.tell() < size:
            reader = ThriftRecordReader(handle, RunnerCkpt)
            while True:
                record = reader.try_read()
                if not record:
                    break
                collected.append(record)

    if len(collected) > 0:
        log.debug('select() returning %s updates:' % len(collected))
        for record in collected:
            log.debug(' = %s' % record)

    return collected
def handle_process(self, task_id, process_id):
    """Build the template data describing one process of a task.

    The current run and every earlier run of the process are fetched from
    ``self._observer`` and keyed by run number. ``HttpServer.abort(404, ...)``
    is called when the task/process pair is unknown or when the process
    cannot be recovered by name.

    Returns a dict with ``task_id``, ``process`` (name, status, cmdline plus
    the ``used`` entries of the current run) and ``runs``.
    """
    current_run = self._observer.process(task_id, process_id)
    if not current_run:
        HttpServer.abort(404, 'Invalid task/process combination: %s/%s' % (task_id, process_id))

    process = self._observer.process_from_name(task_id, process_id)
    if process is None:
        msg = 'Could not recover process: %s/%s' % (task_id, process_id)
        log.error(msg)
        HttpServer.abort(404, msg)

    latest = current_run['process_run']
    runs = {latest: current_run}
    for run_number in range(latest):
        runs[run_number] = self._observer.process(task_id, process_id, run_number)

    process_info = {
        'name': process_id,
        'status': runs[latest]["state"],
        'cmdline': process.cmdline().get()
    }
    process_info.update(**runs[latest].get('used', {}))

    template = {
        'task_id': task_id,
        'process': process_info,
        'runs': runs,
    }
    log.debug('Rendering template is: %s' % template)
    return template
def setup_child_subreaping():
    """Mark this process as a child subreaper through prctl(2).

    Sets the `PR_SET_CHILD_SUBREAPER` flag, which means that children
    processes that need to be reparented are reparented to this process.
    More documentation here: http://man7.org/linux/man-pages/man2/prctl.2.html
    and here: https://lwn.net/Articles/474787/

    Callers should reap terminal children to prevent zombies.

    Nothing is raised: when libc cannot be located a warning is logged, and
    any other failure is logged as an error.
    """
    log.debug("Calling prctl(2) with PR_SET_CHILD_SUBREAPER")
    # This constant is taken from prctl.h
    PR_SET_CHILD_SUBREAPER = 36
    try:
        libc_path = ctypes.util.find_library('c')
        if libc_path is None:
            log.warning("libc is not found. Unable to call prctl!")
            log.warning("Children subreaping is disabled!")
            return

        libc = ctypes.CDLL(libc_path, use_errno=True)
        # If we are on a system where prctl doesn't exist, this will throw an
        # attribute error.
        if libc.prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) != 0:
            err_code = ctypes.get_errno()
            raise OSError(err_code, os.strerror(err_code))
    except Exception as e:
        log.error("Unable to call prctl %s" % e)
        log.error("Children subreaping is disabled!")
def create(args, options):
    """Create a cluster through the API server and wait until its master answers.

    Form data (``num_nodes``, ``cluster_user``, ``size``, ``backup_id``) is
    POSTed to ``http://<api_host>:<api_port>/clusters/<cluster_name>``. The
    response has to be a JSON object; its ``cluster_password`` is written to
    ``options.password_file`` and its ``cluster_url`` is passed to
    ``wait_for_master``. Finally ``SELECT 1`` is run against the master, with
    up to five attempts one second apart.

    Arguments:
      args -- positional command arguments (not used by this function).
      options -- parsed options; ``num_nodes`` and ``cluster_user`` are
                 required.

    On an HTTP error or a response that is not a JSON object, the problem is
    logged and ``app.quit(1)`` is called. The ``OperationalError`` of the
    fifth failed probe is re-raised.
    """
    validate_common_options(options)

    if not options.num_nodes:
        app.error("--num_nodes is required")

    if not options.cluster_user:
        app.error("--cluster_user is required")

    url = 'http://%s:%s/clusters/%s' % (options.api_host, options.api_port, options.cluster_name)
    values = dict(
        num_nodes=int(options.num_nodes),
        cluster_user=options.cluster_user,
        size=options.size if options.size else '',
        backup_id=options.backup_id if options.backup_id else '')

    req = urllib2.Request(url, urllib.urlencode(values))
    try:
        response = urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
        # The server may answer with a status code missing from the stdlib
        # table. Look it up with a fallback so that a KeyError raised inside
        # this handler cannot hide the real HTTP failure.
        reason = BaseHTTPServer.BaseHTTPRequestHandler.responses.get(
            e.code, ('Unknown status code', ''))
        log.error("POST request failed: %s, %s, %s" % (e.code, reason, e.read()))
        app.quit(1)

    try:
        result = json.loads(response)
        if not isinstance(result, dict):
            raise ValueError()
    except ValueError:
        log.error("Invalid response: %s" % response)
        app.quit(1)

    # NOTE(review): `result` carries "cluster_password", so this message and
    # the connection string logged below expose the password in the logs.
    # Confirm that this is acceptable for this client.
    log.info("Cluster created. Cluster info: %s" % str(result))
    with open(options.password_file, 'w') as f:
        f.write(result["cluster_password"])

    log.info("Waiting for the master for this cluster to be elected...")
    master_endpoint = wait_for_master(result['cluster_url']).service_endpoint

    connection_str = "mysql://%s:%s@%s:%d/" % (
        options.cluster_user, result["cluster_password"], master_endpoint.host, master_endpoint.port)
    log.info("Connecting to the MySQL cluster master: %s" % connection_str)
    engine = create_engine(connection_str)

    for i in range(5):  # Loop for 5 times/seconds to wait for the master to be promoted.
        try:
            # TODO(jyx): Test writing to the master and reading from the slave.
            # Kept in its own name so the API response in `result` is not overwritten.
            probe = engine.execute("SELECT 1;").scalar()
            assert 1 == int(probe), "Expecting result to be 1 but got %s" % probe
            break
        except OperationalError:
            if i == 4:
                raise
            log.debug("MySQL master not ready yet. Sleep for 1 second...")
            time.sleep(1)

    log.info("Cluster successfully started")
def connect(self):
    """Create the Redis client and its pipeline.

    The pipeline is stored on ``self.redis_pipeline``.

    Returns the ``redis.StrictRedis`` client, or None after logging when
    anything raises while the client or the pipeline is being created.
    """
    try:
        client = redis.StrictRedis(host=self.host, port=self.port, db=self.db)
        self.redis_pipeline = client.pipeline()
    except Exception as _e:
        log.error("RedisSink: ConnectionError\n %s %s" % (self.config, str(_e)))
        return None
    return client
def connect(self):
    """Open a socket connection to ``(self.host, self.port)``.

    Returns the connected socket, or None after logging when the socket
    cannot be created or connected. A socket that was created but failed to
    connect is closed before returning, so its descriptor is not leaked.
    """
    sock = None
    try:
        sock = socket.socket()
        sock.connect((self.host, self.port))
        return sock
    except Exception as _e:
        log.error("Cannot connect to Graphite Sink with config:%s\n%s" % (self.config, str(_e)))
        if sock is not None:
            # The caller never receives this socket, so release it here.
            sock.close()
        return None
def _run_task(self, task):
    """Start the task runner, wait for it to terminate and report its state.

    Status updates sent:
      TASK_RUNNING -- the runner started.
      TASK_FAILED  -- the runner raised ``TaskError`` on start, or it
                      terminated without having been killed.
      TASK_LOST    -- the runner raised any other exception on start.
      TASK_KILLED  -- the runner terminated and ``self._killed`` is set.

    When the runner fails to start, the terminal update for that failure is
    the only one sent: the runner is not joined and no second terminal update
    follows. ``self._kill()`` is called on every path, including when an
    unexpected exception propagates out of ``join()``.
    """
    assert self._runner, "_runner should be created before this method is called"

    try:
        try:
            self._runner.start()
            log.info("Task runner for task %s started" % task.task_id)

            self._send_update(task.task_id.value, mesos_pb2.TASK_RUNNING)
        except TaskError as e:
            log.error("Task runner for task %s failed to start: %s" % (task.task_id, str(e)))
            # Send TASK_FAILED if the task failed to start.
            self._send_update(task.task_id.value, mesos_pb2.TASK_FAILED)
            return
        except Exception as e:
            log.error("Error occurred while executing the task: %s" % e)
            log.error(traceback.format_exc())
            # Send TASK_LOST for unknown errors.
            self._send_update(task.task_id.value, mesos_pb2.TASK_LOST)
            return

        # Wait for the task's return code (when it terminates).
        try:
            returncode = self._runner.join()
            # Regardless of the return code, if '_runner' terminates, it failed!
            log.error("Task process terminated with return code %s" % returncode)
        except TaskError as e:
            log.error("Task terminated: %s" % e)

        if self._killed:
            self._send_update(task.task_id.value, mesos_pb2.TASK_KILLED)
        else:
            self._send_update(task.task_id.value, mesos_pb2.TASK_FAILED)
    finally:
        # Whatever happened above, there is no task left to run.
        self._kill()
def method_wrapper(*args):
    """Call ``method_name`` on the scheduler client, retrying on transient failures.

    While holding ``self._lock``, the call is attempted until it succeeds,
    ``self._terminating`` is set, or ``RPC_MAXIMUM_WAIT`` has elapsed.

    Returns the attribute itself when it is not callable, otherwise the RPC
    response.

    Raises:
      self.AuthError -- the transport reported an authentication failure.
      self.ThriftInternalError -- any other unexpected exception during the
          call (only while not terminating).
      self.TimeoutError -- the retry window elapsed (only while not
          terminating).
    """
    with self._lock:
        started_at = time.time()
        while not self._terminating.is_set() and (
                time.time() - started_at) < self.RPC_MAXIMUM_WAIT.as_(Time.SECONDS):
            try:
                rpc = getattr(self.client(), method_name)
                if not callable(rpc):
                    return rpc

                response = rpc(*args)
                if response is not None and response.responseCode == ResponseCode.ERROR_TRANSIENT:
                    if response.details:
                        detail_messages = [m.message for m in response.details]
                    else:
                        detail_messages = []
                    raise self.TransientError(", ".join(detail_messages))
                return response
            except TRequestsTransport.AuthError as e:
                log.error(self.scheduler_client().get_failed_auth_message())
                raise self.AuthError(e)
            except (TTransport.TTransportException, self.TimeoutError, self.TransientError) as e:
                # Connection-level or transient failure: drop the client and
                # wait before the next attempt, unless shutdown has begun.
                if not self._terminating.is_set():
                    log.warning('Connection error with scheduler: %s, reconnecting...' % e)
                    self.invalidate()
                    self._terminating.wait(self.RPC_RETRY_INTERVAL.as_(Time.SECONDS))
            except Exception as e:
                # Take any error that occurs during the RPC call, and transform it
                # into something clients can handle.
                if not self._terminating.is_set():
                    raise self.ThriftInternalError(
                        "Error during thrift call %s to %s: %s" % (method_name, self.cluster.name, e))

        if not self._terminating.is_set():
            raise self.TimeoutError(
                'Timed out attempting to issue %s to %s' % (method_name, self.cluster.name))
def _update_endpoints(self, _1, event, state, _2):
    """Update endpoints from ZK.

    Acts only on a child event in the connected state, or on any event in the
    expired-session state. The children of ``self._endpoint`` are listed
    (re-registering this method as the watch), sorted and deserialized; the
    watcher, if any, is called with the old and new endpoint lists before the
    new list is stored.

    This function will block until the ZK servers respond or retry limit is hit.

    :raises ReconnectFailed: If reconnection fails.
    """
    child_change = state == zookeeper.CONNECTED_STATE and event == zookeeper.CHILD_EVENT
    session_expired = state == zookeeper.EXPIRED_SESSION_STATE
    if not (child_change or session_expired):
        return

    try:
        names = self._zk.get_children(self._endpoint, self._update_endpoints)
        names.sort()

        endpoints = []
        for name in names:
            data = self._zk.get(posixpath.join(self._endpoint, name))
            instance = serverset_types.ServiceInstance()
            endpoints.append(codec.deserialize(instance, data[0]))

        old = set(map(_format_endpoint, self._endpoints))
        new = set(map(_format_endpoint, endpoints))
        log.debug("ServerSet endpoints at %r changed to: %s" % (self._endpoint, ", ".join(new)))
        log.debug(" Added: %s" % ", ".join(new - old))
        log.debug(" Removed: %s" % ", ".join(old - new))

        with self._lock:
            if self._watcher:
                self._watcher(self._endpoint, self._endpoints, endpoints)
            self._endpoints = endpoints
    except ZooKeeper.Error as e:
        log.error("Lost connection to ZooKeeper: %s, reestablishing." % e)
        self._reconnect()
def run(self):
    """Run the packet sniffing loop until it stops, then interrupt the process.

    ``sniff`` is called with the configured filter, ``self.handle_packet`` as
    the per-packet callback and ``self.wants_stop`` as the stop filter. The
    interface is passed only when it is not ``"any"``. A ``socket.error`` is
    written to stderr or logged, depending on ``self._error_to_stderr``.

    Whatever happens, SIGINT is sent to the current process once the loop
    has exited.
    """
    try:
        log.info("Setting filter: %s", self.config.filter)
        common = dict(
            filter=self.config.filter,
            store=0,
            prn=self.handle_packet,
            stop_filter=self.wants_stop)
        if self.config.iface == "any":  # pragma: no cover
            sniff(**common)
        else:
            sniff(iface=self.config.iface, **common)
    except socket.error as ex:
        if not self._error_to_stderr:
            log.error("Error: %s, device: %s", ex, self.config.iface)
        else:
            sys.stderr.write("Error: %s, device: %s\n" % (ex, self.config.iface))
    finally:
        log.info("The sniff loop exited")
        os.kill(os.getpid(), signal.SIGINT)
def compute_status(self):
    """Translate the runner's exit information into a StatusResult.

    Returns None while ``self.is_alive`` is true. A non-zero
    ``self._popen_signal`` yields TASK_KILLED. Otherwise the result depends
    on ``self._popen_rc``: for 0 or ``TERMINAL_TASK`` the task state is
    looked up in ``self.EXIT_STATE_MAP`` (TASK_LOST if absent); the other
    known return codes map to fixed TASK_FAILED / TASK_LOST results, and any
    unrecognized code is reported as TASK_LOST.
    """
    if self.is_alive:
        return None

    if self._popen_signal != 0:
        return StatusResult(
            'Task killed by signal %s.' % self._popen_signal, mesos_pb2.TASK_KILLED)

    if self._popen_rc in (0, TERMINAL_TASK):
        exit_state = self.EXIT_STATE_MAP.get(self.task_state())
        if exit_state is not None:
            return exit_state
        log.error('Received unexpected exit state from TaskMonitor.')
        return StatusResult('Task checkpoint could not be read.', mesos_pb2.TASK_LOST)

    if self._popen_rc == UNKNOWN_USER:
        return StatusResult('Task started with unknown user.', mesos_pb2.TASK_FAILED)
    if self._popen_rc == INTERNAL_ERROR:
        return StatusResult('Thermos failed with internal error.', mesos_pb2.TASK_LOST)
    if self._popen_rc == INVALID_TASK:
        return StatusResult('Thermos received an invalid task.', mesos_pb2.TASK_FAILED)
    if self._popen_rc == UNKNOWN_ERROR:
        return StatusResult('Thermos failed with an unknown error.', mesos_pb2.TASK_LOST)

    return StatusResult(
        'Thermos exited for unknown reason (exit status: %s)' % self._popen_rc,
        mesos_pb2.TASK_LOST)
def handle_process(self, task_id, process_id):
    """Assemble the rendering data for a single process of a task.

    ``HttpServer.abort(404, ...)`` is called when the observer has no run for
    the task/process pair, or when the process cannot be recovered by name.
    All runs up to and including the current one are collected, keyed by run
    number.

    Returns a dict with ``task_id``, ``process`` (name, status, cmdline plus
    the ``used`` entries of the current run) and ``runs``.
    """
    all_processes = {}

    current_run = self._observer.process(task_id, process_id)
    if not current_run:
        HttpServer.abort(
            404, 'Invalid task/process combination: %s/%s' % (task_id, process_id))

    process = self._observer.process_from_name(task_id, process_id)
    if process is None:
        msg = 'Could not recover process: %s/%s' % (task_id, process_id)
        log.error(msg)
        HttpServer.abort(404, msg)

    current_number = current_run['process_run']
    all_processes[current_number] = current_run
    for earlier in range(current_number):
        all_processes[earlier] = self._observer.process(task_id, process_id, earlier)

    newest = all_processes[current_number]
    template = {
        'task_id': task_id,
        'process': {
            'name': process_id,
            'status': newest["state"],
            'cmdline': process.cmdline().get()
        },
    }
    usage = newest.get('used', {})
    template['process'].update(**usage)
    template['runs'] = all_processes

    log.debug('Rendering template is: %s', template)
    return template
def open_checkpoint(cls, filename, force=False, state=None):
    """Acquire a locked checkpoint stream.

    Arguments:
      filename -- path of the checkpoint file; its directory is created.
      force -- when the lock is held elsewhere, try to kill the existing
               runner and then block until the lock can be taken.
      state -- runner state used to kill the existing runner; loaded from
               ``filename`` when not given.

    Returns a ``ThriftRecordWriter`` over the locked file with sync enabled.

    Raises ``cls.PermissionError`` when the lock could not be obtained.
    """
    safe_mkdir(os.path.dirname(filename))

    lock_fp = lock_file(filename, "a+")
    if lock_fp in (None, False):
        if not force:
            log.error('Found existing runner, cannot take control.')
        else:
            log.info('Found existing runner, forcing leadership forfeit.')
            state = state or CheckpointDispatcher.from_file(filename)
            if cls.kill_runner(state):
                log.info('Successfully killed leader.')
                # TODO(wickman) Blocking may not be the best idea here. Perhaps block up to
                # a maximum timeout. But blocking is necessary because os.kill does not immediately
                # release the lock if we're in force mode.
                lock_fp = lock_file(filename, "a+", blocking=True)

    if lock_fp in (None, False):
        raise cls.PermissionError(
            'Could not open locked checkpoint: %s, lock_file = %s' % (filename, lock_fp))

    writer = ThriftRecordWriter(lock_fp)
    writer.set_sync(True)
    return writer
def _apply_states(self):
    """Apply any checkpoint records written since the last call.

    os.stat() the checkpoint stream of this task and, when it has grown past
    the high watermark ``self._ckpt_head``, read the new records and dispatch
    them to ``self._runnerstate``. Reading stops at the first record that
    cannot be read or that has an invalid sequence number. The watermark is
    then moved to the position reached.

    Returns True if the watermark moved, False otherwise (including when the
    checkpoint file does not exist yet).
    """
    try:
        stream_size = os.stat(self._runner_ckpt).st_size
        if not (self._ckpt_head < stream_size):
            return False

        with open(self._runner_ckpt, "r") as stream:
            stream.seek(self._ckpt_head)
            reader = ThriftRecordReader(stream, RunnerCkpt)
            while True:
                record = reader.try_read()
                if not record:
                    break
                try:
                    self._dispatcher.dispatch(self._runnerstate, record)
                except CheckpointDispatcher.InvalidSequenceNumber as e:
                    log.error("Checkpoint stream is corrupt: %s" % e)
                    break
            head_after = stream.tell()
            moved = self._ckpt_head != head_after
            self._ckpt_head = head_after
        return moved
    except OSError as e:
        if e.errno != errno.ENOENT:
            raise
        # The log doesn't yet exist, will retry later.
        log.warning("Could not read from checkpoint %s" % self._runner_ckpt)
        return False
def launchTask(self, driver, task):
    """Handle a task launch request delivered to this executor.

    Invoked when a task has been launched on this executor (initiated via
    Scheduler::launchTasks). The task may be realized with a thread, a
    process, or some simple computation; no other callbacks are invoked on
    this executor until this callback has returned.

    If a runner already exists the new task is reported as TASK_LOST. If
    ``self.validate_task`` returns a falsy value the task is reported as
    TASK_FAILED and a driver stop is deferred. Otherwise ``self._run`` is
    deferred with the assigned task and its mount paths.
    """
    self.launched.set()

    self.log('TaskInfo: %s' % task)
    self.log('launchTask got task: %s:%s' % (task.name, task.task_id.value))

    # TODO(wickman) Update the tests to call registered(), then remove this line and issue
    # an assert if self._driver is not populated.
    self._driver = driver

    if self._runner:
        log.error('Already running a task! %s' % self._task_id)
        rejection = "Task already running on this executor: %s" % self._task_id
        self.send_update(driver, task.task_id.value, mesos_pb2.TASK_LOST, rejection)
        return

    self._slave_id = task.slave_id.value
    self._task_id = task.task_id.value

    assigned_task = self.validate_task(task)
    self.log("Assigned task: %s" % assigned_task)
    if not assigned_task:
        self.send_update(
            driver, self._task_id, mesos_pb2.TASK_FAILED, 'Could not deserialize task.')
        defer(driver.stop, delay=self.STOP_WAIT)
        return

    def run_task():
        # The mount paths are extracted only when the deferred call runs.
        return self._run(driver, assigned_task, self.extract_mount_paths_from_task(task))

    defer(run_task)
def wait_start(self, timeout=MAX_WAIT):
    """Block until the task is seen as started or ``timeout`` has been waited.

    The task counts as started once ``self._monitor`` reports it active or
    finished. Between polls the clock sleeps for ``POLL_INTERVAL``.

    Raises ``TaskError`` when the runner dies with a non-zero return code
    while waiting, or when the task has still not started afterwards (in
    which case ``self.lose()`` is called first).
    """
    log.debug('Waiting for task to start.')

    def is_started():
        return self._monitor and (self._monitor.active or self._monitor.finished)

    waited = Amount(0, Time.SECONDS)
    while waited < timeout:
        if is_started():
            break

        log.debug(' - sleeping...')
        self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
        waited += self.POLL_INTERVAL

        if self.is_alive:
            continue
        if self._popen_rc != 0:
            raise TaskError('Task failed: %s' % self.compute_status().reason)
        # We can end up here if the process exited between the call to Popen and
        # waitpid (in is_alive), which is fine.
        log.info('Task runner exited: %s' % self.compute_status().reason)
        break

    if not is_started():
        log.error('Task did not start with in deadline, forcing loss.')
        self.lose()
        raise TaskError('Task did not start within deadline.')
def statusUpdate(self, driver, status):
    """Forward a task status update to its cluster launcher and update metrics.

    Runs under ``self._lock``. An update for a task without a launcher is
    logged and dropped. A ``MySQLClusterLauncher.Error`` raised by the
    launcher is logged and followed by ``self._stop()``; processing then
    continues with the metrics. A launcher that reports itself terminated is
    deleted.

    Arguments:
      driver -- scheduler driver delivering the update (not used here).
      status -- task status; ``task_id.value`` and ``state`` are read.
    """
    with self._lock:
        # Forward the status update to the corresponding launcher.
        task_id = status.task_id.value
        launcher = self._get_launcher_by_task_id(task_id)
        if not launcher:
            log.info("Cluster for task %s doesn't exist. It could have been removed" % task_id)
            return

        try:
            launcher.status_update(status)
        except MySQLClusterLauncher.Error as e:
            # Format the exception itself: `e.message` is deprecated by PEP 352
            # and does not exist on Python 3 exceptions.
            log.error("Status update failed due to launcher error: %s" % e)
            self._stop()

        # Update metrics.
        # TODO(xujyan): This doesn't rule out duplicates, etc. We can consider updating these metrics
        # in the launcher.
        if status.state == mesos_pb2.TASK_FINISHED:
            self._metrics.tasks_finished.increment()
        elif status.state == mesos_pb2.TASK_FAILED:
            self._metrics.tasks_failed.increment()
        elif status.state == mesos_pb2.TASK_KILLED:
            self._metrics.tasks_killed.increment()
        elif status.state == mesos_pb2.TASK_LOST:
            self._metrics.tasks_lost.increment()

        if launcher.terminated:
            log.info("Deleting the launcher for cluster %s because the cluster has terminated" %
                     launcher.cluster_name)
            self._delete_launcher(launcher)
def wait_start(self, timeout=MAX_WAIT):
    """Poll until the monitor reports the task as active or finished.

    Sleeps ``POLL_INTERVAL`` on the clock between polls, for at most
    ``timeout`` in total. Polling also ends when the runner is no longer
    alive.

    Raises ``TaskError`` when the runner exited with a non-zero return code,
    or when the task has not started by the end of the wait (``self.lose()``
    is called before raising in that case).
    """
    log.debug("Waiting for task to start.")

    def is_started():
        return self._monitor and (self._monitor.active or self._monitor.finished)

    poll_seconds_source = self.POLL_INTERVAL
    waited = Amount(0, Time.SECONDS)
    while waited < timeout and not is_started():
        log.debug(" - sleeping...")
        self._clock.sleep(poll_seconds_source.as_(Time.SECONDS))
        waited += poll_seconds_source

        if not self.is_alive:
            if self._popen_rc != 0:
                raise TaskError("Task failed: %s" % self.compute_status().reason)
            # We can end up here if the process exited between the call to Popen and
            # waitpid (in is_alive), which is fine.
            log.info("Task runner exited: %s" % self.compute_status().reason)
            break

    if not is_started():
        log.error("Task did not start with in deadline, forcing loss.")
        self.lose()
        raise TaskError("Task did not start within deadline.")
def statusUpdate(self, driver, status):
    """Pass a task status update to the owning launcher and count terminal states.

    Runs under ``self._lock``. When no launcher exists for the task the
    update is logged and ignored. A ``MySQLClusterLauncher.Error`` from the
    launcher is logged and ``self._stop()`` is called, after which the
    metrics are still updated. The launcher is deleted once it reports
    itself terminated.

    Arguments:
      driver -- scheduler driver delivering the update (not used here).
      status -- task status; ``task_id.value`` and ``state`` are read.
    """
    with self._lock:
        # Forward the status update to the corresponding launcher.
        task_id = status.task_id.value
        launcher = self._get_launcher_by_task_id(task_id)
        if not launcher:
            log.info(
                "Cluster for task %s doesn't exist. It could have been removed" % task_id)
            return

        try:
            launcher.status_update(status)
        except MySQLClusterLauncher.Error as e:
            # `e.message` is deprecated by PEP 352 and missing on Python 3;
            # formatting the exception itself works everywhere.
            log.error("Status update failed due to launcher error: %s" % e)
            self._stop()

        # Update metrics.
        # TODO(xujyan): This doesn't rule out duplicates, etc. We can consider updating these metrics
        # in the launcher.
        if status.state == mesos_pb2.TASK_FINISHED:
            self._metrics.tasks_finished.increment()
        elif status.state == mesos_pb2.TASK_FAILED:
            self._metrics.tasks_failed.increment()
        elif status.state == mesos_pb2.TASK_KILLED:
            self._metrics.tasks_killed.increment()
        elif status.state == mesos_pb2.TASK_LOST:
            self._metrics.tasks_lost.increment()

        if launcher.terminated:
            log.info(
                "Deleting the launcher for cluster %s because the cluster has terminated" %
                launcher.cluster_name)
            self._delete_launcher(launcher)
def is_alive(self):
    """Is the process underlying the Thermos task runner alive?

    Returns False when no process was spawned or when it is already known to
    be dead. Otherwise the child is polled with a non-blocking waitpid: True
    while it is still running; on termination the decoded signal and return
    code are stored, the dead flag is set and False is returned. An
    ``OSError`` other than ECHILD is re-raised.
    """
    if not self._popen or self._dead.is_set():
        return False

    # N.B. You cannot mix this code and any code that relies upon os.wait
    # mechanisms with blanket child process collection. One example is the
    # Thermos task runner which calls os.wait4 -- without refactoring, you
    # should not mix a Thermos task runner in the same process as this
    # thread.
    try:
        pid, status = os.waitpid(self._popen.pid, os.WNOHANG)
        if pid == 0:
            return True
        self._popen_signal, self._popen_rc = self._decode_status(status)
        log.info(
            'Detected runner termination: pid=%s, signal=%s, rc=%s' %
            (pid, self._popen_signal, self._popen_rc))
    except OSError as e:
        log.error('is_alive got OSError: %s' % e)
        if e.errno != errno.ECHILD:
            raise

    self._dead.set()
    return False
def get(cls, task_id, checkpoint_root):
    """Get a TaskRunner bound to the task_id in checkpoint_root.

    Returns None when the task file does not exist, when it cannot be loaded
    or holds no tasks, when the checkpoint or its header is missing, or when
    reconstructing the runner raises (the failure is logged with its
    traceback).
    """
    path = TaskPath(root=checkpoint_root, task_id=task_id, state="active")
    task_json = path.getpath("task_path")
    task_checkpoint = path.getpath("runner_checkpoint")

    if not os.path.exists(task_json):
        return None

    task = ThermosConfigLoader.load_json(task_json)
    if task is None or len(task.tasks()) == 0:
        return None

    try:
        checkpoint = CheckpointDispatcher.from_file(task_checkpoint)
        if checkpoint is None or checkpoint.header is None:
            return None
        header = checkpoint.header
        return cls(
            task.tasks()[0].task(),
            checkpoint_root,
            header.sandbox,
            log_dir=header.log_dir,
            task_id=task_id,
            portmap=header.ports,
            hostname=header.hostname,
        )
    except Exception as e:
        log.error("Failed to reconstitute checkpoint in TaskRunner.get: %s" % e, exc_info=True)
        return None
def select(self):
    """Gather the new checkpoint records of every forked process coordinator.

    Checkpoint records can come from one of two places:
      in-process: checkpoint records synthesized for FORKED and LOST events
      out-of-process: checkpoint records from file descriptors of forked coordinators

    Handles that cannot be fstat'ed are logged and skipped. A handle
    positioned beyond the end of its file is logged as truncated; a handle
    positioned before the end is read until no further record is available.

    Returns a list of RunnerCkpt objects that were successfully read, or an
    empty list if none were read.
    """
    self._bind_processes()
    updates = []

    live_handles = filter(None, self._processes.values())
    for handle in live_handles:
        try:
            stat_result = os.fstat(handle.fileno())
        except OSError:
            log.error('Unable to fstat %s!' % handle.name)
            continue

        file_size = stat_result.st_size
        if handle.tell() > file_size:
            log.error('Truncated checkpoint record detected on %s!' % handle.name)
            continue
        if not (handle.tell() < file_size):
            continue

        reader = ThriftRecordReader(handle, RunnerCkpt)
        update = reader.try_read()
        while update:
            updates.append(update)
            update = reader.try_read()

    if len(updates) > 0:
        log.debug('select() returning %s updates:' % len(updates))
        for update in updates:
            log.debug(' = %s' % update)

    return updates
def _update_endpoints(self, _1, event, state, _2):
    """Update endpoints from ZK.

    Events are ignored unless the session expired, or the state is connected
    and the event is a child event. Otherwise the children of
    ``self._endpoint`` are re-read (installing this method as the watch
    again), deserialized in sorted name order, passed to the watcher if one
    is set, and stored as the current endpoints.

    This function will block until the ZK servers respond or retry limit is hit.

    :raises ReconnectFailed: If reconnection fails.
    """
    if state != zookeeper.EXPIRED_SESSION_STATE:
        if not (state == zookeeper.CONNECTED_STATE and event == zookeeper.CHILD_EVENT):
            return

    try:
        endpoints = []
        endpoint_names = self._zk.get_children(self._endpoint, self._update_endpoints)
        endpoint_names.sort()
        for endpoint_name in endpoint_names:
            node_path = posixpath.join(self._endpoint, endpoint_name)
            payload = self._zk.get(node_path)
            service_endpoint = serverset_types.ServiceInstance()
            endpoints.append(codec.deserialize(service_endpoint, payload[0]))

        previous = set(map(_format_endpoint, self._endpoints))
        current = set(map(_format_endpoint, endpoints))
        log.debug('ServerSet endpoints at %r changed to: %s' % (self._endpoint, ', '.join(current)))
        log.debug(' Added: %s' % ', '.join(current - previous))
        log.debug(' Removed: %s' % ', '.join(previous - current))

        with self._lock:
            if self._watcher:
                self._watcher(self._endpoint, self._endpoints, endpoints)
            self._endpoints = endpoints
    except ZooKeeper.Error as e:
        log.error('Lost connection to ZooKeeper: %s, reestablishing.' % e)
        self._reconnect()
def _run(self, driver, assigned_task, mounted_volume_paths):
    """Commence running a Task.

    Steps, each of which ends the method when it fails:
      - Send TASK_STARTING
      - Initialize the sandbox
      - Create the runner from the assigned task; a ``TaskError`` sets
        ``self.runner_aborted`` and dies with TASK_FAILED
      - Check that the runner is a ``TaskRunner``
      - Start the runner
      - Start the status manager; any exception is logged and dies with
        TASK_FAILED / "Internal error"
    """
    self.send_update(driver, self._task_id, mesos_pb2.TASK_STARTING)

    sandbox_ready = self._initialize_sandbox(driver, assigned_task, mounted_volume_paths)
    if not sandbox_ready:
        return

    # start the process on a separate thread and give the message processing thread back
    # to the driver
    try:
        self._runner = self._runner_provider.from_assigned_task(assigned_task, self._sandbox)
    except TaskError as e:
        self.runner_aborted.set()
        self._die(driver, mesos_pb2.TASK_FAILED, str(e))
        return

    if not isinstance(self._runner, TaskRunner):
        self._die(driver, mesos_pb2.TASK_FAILED, 'Unrecognized task!')
        return

    runner_started = self._start_runner(driver, assigned_task)
    if not runner_started:
        return

    try:
        self._start_status_manager(driver, assigned_task)
    except Exception:
        log.error(traceback.format_exc())
        self._die(driver, mesos_pb2.TASK_FAILED, "Internal error")
def _rollback(self, instances_to_rollback, instance_configs):
    """Performs a rollback operation for the failed instances.

    Does nothing beyond logging when rollback on failure is disabled in the
    update config. Otherwise instances are processed in batches of
    ``batch_size``, highest instance id first; each batch is updated and then
    watched. Instances that fail while being watched are logged at the end.

    Note that ``instances_to_rollback`` is sorted in place.

    Arguments:
    instances_to_rollback -- instance ids to rollback.
    instance_configs -- instance configuration to use for rollback.
    """
    if not self._update_config.rollback_on_failure:
        log.info('Rollback on failure is disabled in config. Aborting rollback')
        return

    log.info('Reverting update for %s' % instances_to_rollback)
    rollback_operation = self.OperationConfigs(
        from_config=instance_configs.local_config_map,
        to_config=instance_configs.remote_config_map)

    instances_to_rollback.sort(reverse=True)
    pending = instances_to_rollback
    failed_instances = []
    while pending:
        batch = pending[0:self._update_config.batch_size]
        pending = sorted(set(pending) - set(batch), reverse=True)
        watched = self._update_instances(batch, rollback_operation)
        failed_instances += self._watcher.watch(watched)

    if failed_instances:
        log.error('Rollback failed for instances: %s' % failed_instances)
def control(self, force=False):
    """Take control of the task's checkpoint stream for the duration of a block.

    Bind to the checkpoint associated with this task, position to the end of
    the log if it exists, or create it if it doesn't. Fails if we cannot get
    "leadership" i.e. a file lock on the checkpoint stream.

    After the checkpoint is opened, recovery mode is switched off, the task
    status is set from the current task state and the task is resumed before
    control is yielded to the caller. An ``Exception`` raised in the caller's
    block is logged and not re-raised; the checkpoint is closed afterwards.

    Raises:
      self.StateError -- the task is already in a terminal state.
      self.PermissionError -- the checkpoint could not be opened.
    """
    if self.is_terminal():
        raise self.StateError(
            'Cannot take control of a task in terminal state.')

    if self._sandbox:
        safe_mkdir(self._sandbox)

    checkpoint_path = self._pathspec.getpath('runner_checkpoint')
    try:
        self._ckpt = TaskRunnerHelper.open_checkpoint(
            checkpoint_path, force=force, state=self._state)
    except TaskRunnerHelper.PermissionError:
        raise self.PermissionError('Unable to open checkpoint %s' % checkpoint_path)

    log.debug('Flipping recovery mode off.')
    self._recovery = False
    current_state = self.task_state()
    self._set_task_status(current_state)
    self._resume_task()

    try:
        yield
    except Exception as e:
        log.error('Caught exception in self.control(): %s' % e)
        log.error(' %s' % traceback.format_exc())

    self._ckpt.close()
def get(cls, task_id, checkpoint_root):
    """Get a TaskRunner bound to the task_id in checkpoint_root.

    The task definition and the runner checkpoint are located through a
    ``TaskPath`` in the 'active' state. None is returned when the task file
    is absent, cannot be loaded or is empty, when the checkpoint or its
    header is missing, or when any exception occurs while rebuilding the
    runner (logged with traceback).
    """
    path = TaskPath(root=checkpoint_root, task_id=task_id, state='active')
    task_json = path.getpath('task_path')
    task_checkpoint = path.getpath('runner_checkpoint')

    if not os.path.exists(task_json):
        return None

    task = ThermosConfigLoader.load_json(task_json)
    if task is None:
        return None
    if len(task.tasks()) == 0:
        return None

    try:
        checkpoint = CheckpointDispatcher.from_file(task_checkpoint)
        header_missing = checkpoint is None or checkpoint.header is None
        if header_missing:
            return None

        runner_kwargs = dict(
            log_dir=checkpoint.header.log_dir,
            task_id=task_id,
            portmap=checkpoint.header.ports,
            hostname=checkpoint.header.hostname)
        first_task = task.tasks()[0].task()
        return cls(first_task, checkpoint_root, checkpoint.header.sandbox, **runner_kwargs)
    except Exception as e:
        log.error(
            'Failed to reconstitute checkpoint in TaskRunner.get: %s' % e,
            exc_info=True)
        return None
def _initialize_ckpt_header(self):
    """Write the RunnerHeader of this checkpoint stream if it has none yet.

    Does nothing when ``self._state.header`` is already set. Otherwise a
    ``RunnerHeader`` is built from this runner's attributes, with the uid
    resolved from ``self._user`` (None when the user is unknown), and
    dispatched as a ``RunnerCkpt`` into ``self._state``.
    """
    if self._state.header is not None:
        return

    try:
        uid = pwd.getpwnam(self._user).pw_uid
    except KeyError:
        # This will cause failures downstream, but they will at least be correctly
        # reflected in the process state.
        log.error('Unknown user %s.' % self._user)
        uid = None

    header_fields = dict(
        task_id=self._task_id,
        launch_time_ms=int(self._launch_time * 1000),
        sandbox=self._sandbox,
        log_dir=self._log_dir,
        hostname=self._hostname,
        user=self._user,
        uid=uid,
        ports=self._portmap)
    header = RunnerHeader(**header_fields)

    self._dispatcher.dispatch(self._state, RunnerCkpt(runner_header=header))
def validate_quota_from_requested(self, job_key, production, released, acquired):
    """Validates requested change will not exceed the available quota.

    Non-production requests always pass. For production requests the role's quota is
    fetched from the scheduler and the check passes when
    allocated - consumed - (acquired - released) is still valid.

    Arguments:
      job_key -- job key.
      production -- production flag.
      released -- production CapacityRequest to be released (in case of job update).
      acquired -- production CapacityRequest to be acquired.

    Returns: a Response with ResponseCode.OK if the check is successful, the scheduler's
    own response if the quota lookup failed, or an INVALID_REQUEST Response if the
    quota would be exceeded.
    """
    success = Response(
        responseCode=ResponseCode.OK,
        messageDEPRECATED='Quota check successful.')
    if not production:
        return success

    quota_resp = self._scheduler.getQuota(job_key.role)
    if quota_resp.responseCode != ResponseCode.OK:
        log.error('Failed to get quota from scheduler: %s' % quota_resp.messageDEPRECATED)
        return quota_resp

    quota_result = quota_resp.result.getQuotaResult
    allocated = CapacityRequest(quota_result.quota)
    consumed = CapacityRequest(quota_result.prodConsumption)
    requested = acquired - released

    remaining = allocated - consumed - requested
    if remaining.valid():
        return success

    log.info('Not enough quota to create/update job.')
    print_quota(allocated.quota(), 'Total allocated quota', job_key.role)
    print_quota(consumed.quota(), 'Consumed quota', job_key.role)
    print_quota(requested.quota(), 'Requested', job_key.name)
    return Response(
        responseCode=ResponseCode.INVALID_REQUEST,
        messageDEPRECATED='Failed quota check.')
def is_alive(self):
    """
    Is the process underlying the Thermos task runner alive?

    Returns False when no process was started or it was already marked dead; otherwise
    polls the child without blocking and marks it dead once it has been reaped or
    is no longer a child (ECHILD). Any other OSError is re-raised.
    """
    if not self._popen or self._dead.is_set():
        return False

    # N.B. You cannot mix this code and any code that relies upon os.wait
    # mechanisms with blanket child process collection.  One example is the
    # Thermos task runner which calls os.wait4 -- without refactoring, you
    # should not mix a Thermos task runner in the same process as this
    # thread.
    try:
        reaped_pid, _ = os.waitpid(self._popen.pid, os.WNOHANG)
        if reaped_pid == 0:
            # Child exists and has not changed state yet.
            return True
        log.info('Detected runner termination: pid=%s' % reaped_pid)
    except OSError as e:
        log.error('is_alive got OSError: %s' % e)
        if e.errno != errno.ECHILD:
            raise

    self._dead.set()
    return False
def _initialize_ckpt_header(self): """ Initializes the RunnerHeader for this checkpoint stream if it has not already been constructed. """ if self._state.header is None: try: uid = pwd.getpwnam(self._user).pw_uid except KeyError: # This will cause failures downstream, but they will at least be correctly # reflected in the process state. log.error('Unknown user %s.', self._user) uid = None header = RunnerHeader( task_id=self._task_id, launch_time_ms=int(self._launch_time * 1000), sandbox=self._sandbox, log_dir=self._log_dir, hostname=self._hostname, user=self._user, uid=uid, ports=self._portmap) runner_ckpt = RunnerCkpt(runner_header=header) self._dispatcher.dispatch(self._state, runner_ckpt)
def launchTask(self, driver, task):
    """
    Invoked when a task has been launched on this executor (initiated via
    Scheduler::launchTasks). Note that this task can be realized with a thread, a
    process, or some simple computation, however, no other callbacks will be invoked
    on this executor until this callback has returned.

    A second launch while a runner exists is answered with TASK_LOST; a task that
    fails validate_task is answered with TASK_FAILED and the driver is stopped after
    STOP_WAIT. Otherwise the task is run via a deferred call to self._run.
    """
    self.launched.set()
    incoming_task_id = task.task_id.value
    self.log('launchTask got task: %s:%s' % (task.name, incoming_task_id))

    # TODO(wickman)  Update the tests to call registered(), then remove this line and issue
    # an assert if self._driver is not populated.
    self._driver = driver

    if self._runner:
        log.error('Already running a task! %s' % self._task_id)
        self.send_update(
            driver,
            incoming_task_id,
            mesos_pb2.TASK_LOST,
            "Task already running on this executor: %s" % self._task_id)
        return

    self._slave_id = task.slave_id.value
    self._task_id = incoming_task_id

    assigned_task = self.validate_task(task)
    if assigned_task:
        defer(lambda: self._run(driver, assigned_task))
    else:
        self.send_update(
            driver, self._task_id, mesos_pb2.TASK_FAILED, 'Could not deserialize task.')
        defer(driver.stop, delay=self.STOP_WAIT)
def _rollback(self, instances_to_rollback, instance_configs):
    """Performs a rollback operation for the failed instances.

    Instances are processed highest id first, in batches of
    self._update_config.batch_size; failures reported by the watcher are accumulated
    and logged at the end.

    Arguments:
      instances_to_rollback -- instance ids to rollback (sorted in place, descending).
      instance_configs -- instance configuration to use for rollback.
    """
    log.info('Reverting update for %s' % instances_to_rollback)
    rollback_operation = self.OperationConfigs(
        from_config=instance_configs.local_config_map,
        to_config=instance_configs.remote_config_map)

    # The caller's list is reordered here, exactly as before.
    instances_to_rollback.sort(reverse=True)
    pending = instances_to_rollback
    failures = []

    while pending:
        batch = pending[:self._update_config.batch_size]
        # Drop the batch from the pending ids and restore descending order.
        pending = sorted(set(pending) - set(batch), reverse=True)
        watched = self._update_instances(batch, rollback_operation)
        failures.extend(self._watcher.watch(watched))

    if failures:
        log.error('Rollback failed for instances: %s' % failures)
def control(self, force=False):
    """
    Bind to the checkpoint associated with this task, position to the end of the log if
    it exists, or create it if it doesn't. Fails if we cannot get "leadership" i.e. a
    file lock on the checkpoint stream.

    This is a generator that yields exactly once, after the checkpoint has been opened
    and the task resumed; the checkpoint is closed when the generator is resumed,
    closed, or has an exception thrown into it.

    Arguments:
      force -- forwarded unchanged to TaskRunnerHelper.open_checkpoint.

    Raises:
      self.StateError -- if self.is_terminal() is true.
      self.PermissionError -- if open_checkpoint raises TaskRunnerHelper.PermissionError.
    """
    if self.is_terminal():
        raise self.StateError('Cannot take control of a task in terminal state.')
    if self._sandbox:
        safe_mkdir(self._sandbox)
    ckpt_file = self._pathspec.getpath('runner_checkpoint')
    try:
        self._ckpt = TaskRunnerHelper.open_checkpoint(ckpt_file, force=force, state=self._state)
    except TaskRunnerHelper.PermissionError:
        raise self.PermissionError('Unable to open checkpoint %s' % ckpt_file)
    log.debug('Flipping recovery mode off.')
    self._recovery = False
    self._set_task_status(self.task_state())
    self._resume_task()
    try:
        yield
    except Exception as e:
        # Ordinary exceptions from the controlled section are logged and swallowed.
        log.error('Caught exception in self.control(): %s', e)
        log.error(' %s', traceback.format_exc())
    finally:
        # Release the checkpoint on every exit path, including GeneratorExit,
        # KeyboardInterrupt and SystemExit, which "except Exception" does not catch.
        self._ckpt.close()
def addattachment(self, page, filename):
    """Add an attachment to an existing page.

    Note: this will first read the entire file into memory.

    Arguments:
      page -- mapping describing the page; page['id'] is sent to the API and
              page.get('title') is used only for the error log.
      filename -- path of the file to attach; its MIME type is guessed from the name.

    Returns: the result of the addAttachment API call, or None if reading the file or
    the XML-RPC call failed (the failure is logged).

    Raises: ConfluenceError if the MIME type cannot be guessed from the filename.
    """
    mime_type = mimetypes.guess_type(filename, strict=False)[0]
    if not mime_type:
        raise ConfluenceError('Failed to detect MIME type of %s' % filename)

    try:
        with open(filename, 'rb') as f:
            file_data = f.read()
        attachment = dict(fileName=basename(filename), contentType=mime_type)
        return self._api_entrypoint.addAttachment(
            self._session_token, page['id'], attachment, Binary(file_data))
    except (IOError, OSError) as e:
        log.error('Failed to read data from file %s: %s' % (filename, str(e)))
        return None
    except XMLRPCError as e:
        # Include the fault itself; previously the cause of the failure was dropped.
        log.error('Failed to add file attachment %s to page: %s, error = %s' % (
            filename, page.get('title', '[unknown title]'), e))
        return None
def _apply_states(self): """ os.stat() the corresponding checkpoint stream of this task and determine if there are new ckpt records. Attempt to read those records and update the high watermark for that stream. Returns True if new states were applied, False otherwise. """ ckpt_offset = None try: ckpt_offset = os.stat(self._runner_ckpt).st_size updated = False if self._ckpt_head < ckpt_offset: with open(self._runner_ckpt, 'r') as fp: fp.seek(self._ckpt_head) rr = ThriftRecordReader(fp, RunnerCkpt) while True: runner_update = rr.try_read() if not runner_update: break try: self._dispatcher.dispatch(self._runnerstate, runner_update) except CheckpointDispatcher.InvalidSequenceNumber as e: log.error('Checkpoint stream is corrupt: %s' % e) break new_ckpt_head = fp.tell() updated = self._ckpt_head != new_ckpt_head self._ckpt_head = new_ckpt_head return updated except OSError as e: if e.errno == errno.ENOENT: # The log doesn't yet exist, will retry later. log.warning('Could not read from checkpoint %s' % self._runner_ckpt) return False else: raise
def _check_sla(self, hostnames, grouping_function, percentage, duration):
    """Check if the provided list of hosts passes the job uptime SLA check.

    This is an all-or-nothing check, meaning that all provided hosts must pass their job
    SLA check for the maintenance to proceed.

    :param hostnames: list of host names to check SLA for
    :type hostnames: list of strings
    :param grouping_function: grouping function to apply to the given hosts
    :type grouping_function: function
    :param percentage: SLA uptime percentage override
    :type percentage: float
    :param duration: SLA uptime duration override
    :type duration: twitter.common.quantity.Amount
    :rtype: set of unsafe hosts
    """
    vector = self._client.sla_get_safe_domain_vector(self.SLA_MIN_JOB_INSTANCE_COUNT, hostnames)
    host_groups = vector.probe_hosts(
        percentage,
        duration.as_(Time.SECONDS),
        grouping_function)

    if not host_groups:
        # No probe results: nothing is unsafe.
        return set()

    # Given that maintenance is performed 1 group at a time, any result longer than 1 group
    # should be considered a batch failure.
    if len(host_groups) > 1:
        log.error("Illegal multiple groups detected in SLA results. Skipping hosts: %s" % hostnames)
        return set(hostnames)

    results, unsafe_hostnames = format_sla_results(host_groups, unsafe_only=True)
    if results:
        print_results(results)
    return unsafe_hostnames
def safe_signal(cls, pid, sig=signal.SIGTERM):
    """
    Send sig to pid via os.kill without ever raising.

    ESRCH and EPERM failures are ignored silently; any other failure is logged.
    """
    ignored_errnos = (errno.ESRCH, errno.EPERM)
    try:
        os.kill(pid, sig)
    except OSError as e:
        if e.errno in ignored_errnos:
            return
        log.error('Unexpected error in os.kill: %s' % e)
    except Exception as e:
        log.error('Unexpected error in os.kill: %s' % e)
def resolve(self):
    """
    Yield every task returned by querying the API for each job in self._jobs.

    Jobs whose query does not come back with ResponseCode.OK are logged and skipped.
    """
    for job in self._jobs:
        job_query = self.query_from(self._role, self._env, job)
        resp = self._api.query(job_query)
        if resp.responseCode == ResponseCode.OK:
            for task in resp.result.scheduleStatusResult.tasks:
                yield task
        else:
            log.error("Failed to query job: %s" % job)
def delete(j, name):
    """
    Delete the job called name through j.delete_job.

    A JenkinsAPIException is logged rather than propagated.
    """
    try:
        j.delete_job(name)
    except JenkinsAPIException as err:
        failure = "error deleting job: %s" % err
        log.error(failure)
def resolve(self):
    """
    Generate the tasks of all jobs in self._jobs, one API query per job.

    A job whose response code is not ResponseCode.OK is logged and contributes no tasks.
    """
    for job in self._jobs:
        response = self._api.query(self.query_from(self._role, self._env, job))
        query_failed = response.responseCode != ResponseCode.OK
        if query_failed:
            log.error('Failed to query job: %s' % job)
            continue
        job_tasks = response.result.scheduleStatusResult.tasks
        for task in job_tasks:
            yield task