def run_in_thread(p):
    q = Queue()
    t = Thread(target=enqueue_output, args=(p, q))
    t.daemon = True  # thread dies with the program
    t.start()
    lines = ["\n"]
    line = ''
    alive = True
    app = wx.GetApp().TopWindow
    if sx_print_to_consol:
        write_cmd = app.proj_tree_viewer.consol.log.AppendText
    else:
        write_cmd = app.shell.WriteTextAndPrompt
    while True:
        time.sleep(0.01)
        try:
            line = q.get_nowait()  # or q.get(timeout=.1)
        except Empty:
            if len(lines) != 0:
                wx.CallAfter(write_cmd, ''.join(lines))
                lines = []
        except Exception:
            import traceback
            traceback.print_exc()
            break
        else:
            # got line
            lines.append(line)
            if line.startswith('process terminated'):
                if len(lines) > 1:
                    wx.CallAfter(write_cmd, ''.join(lines[:-1]))
                break
    return
class StoppableThread(threading.Thread):
    """This thread can be stopped.

    Note: Thread by default does not return the target function's result
    in any case, which is why this workaround with a built-in Queue is
    implemented.
    """

    def __init__(self, **kwargs):
        super(StoppableThread, self).__init__(**kwargs)
        self.__target = kwargs.get('target')
        self.__args = kwargs.get('args')
        if self.__args is None:
            self.__args = ()
        self.__kwargs = kwargs.get('kwargs')
        if self.__kwargs is None:
            self.__kwargs = {}
        self.__result_queue = Queue()
        self.__stopped = threading.Event()

    def stop(self):
        """Stop the thread.

        It will not terminate code, but set the flag that should be
        handled in the executed function.
        """
        self.__stopped.set()

    def is_stopped(self):
        """Check the status of the thread.

        It only monitors the flag state. If the task is stopped you have
        to pay attention to `.is_alive()`.
        """
        return self.__stopped.is_set()

    def run(self):
        """Run the target function, check the expected result and
        propagate exceptions.
        """
        try:
            self.__kwargs['_is_stopped'] = self.__stopped.is_set
            func_result = None
            try:
                if self.__target:
                    func_result = self.__target(*self.__args, **self.__kwargs)
            finally:
                # Avoid a refcycle if the thread is running a function with
                # an argument that has a member that points to the thread.
                del self.__target, self.__args, self.__kwargs
            if func_result is None:
                func_result = {}
            elif not isinstance(func_result, dict):
                raise TypeError("Task has to return a dict or None.")
        except Exception:  # pylint: disable=W0703
            self.__result_queue.put(traceback.format_exc())
        else:
            self.__result_queue.put(func_result)

    def get_result(self):
        """Return the results of the target function execution."""
        self.join()
        try:
            return self.__result_queue.get_nowait()
        except Empty:
            return None
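A minimal usage sketch for the class above (the target function `cooperative_task` is hypothetical; it receives the injected `_is_stopped` callable and polls it):

    import time

    def cooperative_task(n, _is_stopped=None):
        done = 0
        for _ in range(n):
            if _is_stopped():          # honor the flag set by .stop()
                break
            time.sleep(0.01)
            done += 1
        return {'iterations': done}    # must return a dict or None

    worker = StoppableThread(target=cooperative_task, args=(100,))
    worker.start()
    time.sleep(0.1)
    worker.stop()                      # request cooperative shutdown
    print(worker.get_result())         # joins, then drains the result queue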
def run_for_max_seconds(max_secs, _function, *args, **kwargs):
    """Run the given function for a maximum of `max_secs` seconds -
    continue running in a background thread if the function does not
    finish in time.
    """

    def _worker(*_args):
        result = None
        try:
            result = _function(*args, **kwargs)
        except Exception as e:
            result = e
        result = True if result is None else result
        q.put(result)
        return result

    start = now()
    q = Queue()
    start_worker_thread(_worker)
    for _ in range(max_secs * 2):
        result = None
        try:
            result = q.get_nowait()
        except Exception:
            pass
        if result is not None:
            if isinstance(result, Exception):
                raise result
            return result
        if now() - start >= max_secs:
            return
        time.sleep(0.5)
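The helpers `now` and `start_worker_thread` are not shown; a self-contained sketch of how they might be wired up (these definitions are assumptions, not the original):

    import time
    from queue import Queue
    from threading import Thread

    def now():
        return time.time()

    def start_worker_thread(target):
        t = Thread(target=target)
        t.daemon = True    # keep running in the background past the timeout
        t.start()
        return t

    def slow_add(a, b):
        time.sleep(1)
        return a + b

    print(run_for_max_seconds(3, slow_add, 2, 3))   # -> 5
    print(run_for_max_seconds(1, time.sleep, 5))    # -> None (timed out)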
def _handle_monitor_event_process(self, process, error_message=None):
    stdout_queue = Queue()
    stderr_queue = Queue()
    stream_thread_map = {
        'stdout': Thread(target=self._enqueue_stream,
                         args=(process.stdout, stdout_queue), daemon=True),
        'stderr': Thread(target=self._enqueue_stream,
                         args=(process.stderr, stderr_queue), daemon=True)
    }
    stream_thread_map['stdout'].start()
    stream_thread_map['stderr'].start()
    try:
        while not self._proc_terminated:
            if not process.poll():
                try:
                    self.output.echo(stdout_queue.get_nowait())
                except Empty:
                    pass
            else:
                err = None
                try:
                    err = stderr_queue.get_nowait()
                except Empty:
                    pass
                # Avoid empty sys.excepthook errors from underlying future
                # There is already a uAMQP issue in work for this
                # https://github.com/Azure/azure-uamqp-python/issues/30
                if err and "sys.excepthook" not in err:
                    err = err.lstrip()
                    # Strip the 'ERROR:' prefix (lstrip('ERROR:') would strip
                    # characters, not the prefix)
                    if err.startswith('ERROR:'):
                        err = err[len('ERROR:'):]
                    if error_message:
                        err = "{}: {}".format(error_message, err)
                    self.output.error(err)
                    return False
    except KeyboardInterrupt:
        self.output.info('Terminating process...')
        self._terminate_process_tree()
        return True
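`_enqueue_stream` is not shown here; a plausible implementation (an assumption, not the original) is the classic readline-into-queue pump that feeds the non-blocking `get_nowait()` polls above:

    def _enqueue_stream(self, stream, queue):
        # Pump lines from the subprocess pipe into the queue so the
        # monitor loop can poll it without blocking.
        for line in iter(stream.readline, b''):
            queue.put(line.decode('utf-8', errors='replace'))
        stream.close()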
def test_threading(self):
    result_queue = Queue()
    with OverrideDB(CommCareCase, self.other_db_1):
        obj = _override_db.class_to_db

        def run():
            with OverrideDB(CommCareCase, self.other_db_2):
                result_queue.put(_override_db.class_to_db)

        t = threading.Thread(target=run)
        t.start()
        t.join()
        result = result_queue.get_nowait()
        self.assertNotEqual(id(obj), id(result))
class ThreadedTransport(object):
    def __init__(self, spider, thread_number):
        self.spider = spider
        self.thread_number = thread_number
        self.task_queue = Queue()
        self.result_queue = Queue()
        self.workers = []
        self.freelist = []
        for _ in six.moves.range(self.thread_number):
            thread = Thread(target=worker_thread, args=[
                self.task_queue, self.result_queue, self.freelist,
                self.spider.shutdown_event
            ])
            thread.daemon = True
            self.workers.append(thread)
            self.freelist.append(1)
            thread.start()

    def ready_for_task(self):
        return len(self.freelist)

    def get_free_threads_number(self):
        return len(self.freelist)

    def get_active_threads_number(self):
        return self.thread_number - len(self.freelist)

    def start_task_processing(self, task, grab, grab_config_backup):
        self.task_queue.put((task, grab, grab_config_backup))

    def process_handlers(self):
        pass

    def iterate_results(self):
        while True:
            try:
                result = self.result_queue.get_nowait()
            except Empty:
                break
            else:
                # FORMAT: {ok, grab, grab_config_backup, task,
                #          emsg, error_abbr}
                # grab.doc.error_code = None
                # grab.doc.error_msg = None
                yield result
class FluentdEvent(object):
    def __init__(self, app=None):
        self.app = app
        if app is not None:
            self.init_app(app)
            # Send events after every request finishes
            app.after_request(self.send_events)
        # Unbounded queue for sent events
        self.queue = Queue()

    def init_app(self, app):
        tag_prefix = app.config.get("FLUENTD_EVENT_TAG_PREFIX",
                                    "flask.fluentd_event")
        host = app.config.get("FLUENTD_EVENT_HOST", "localhost")
        port = int(app.config.get("FLUENTD_EVENT_PORT", 24224))
        self._sender = sender.FluentSender(tag_prefix, host=host, port=port)
        # Use the newstyle teardown_appcontext if it's available,
        # otherwise fall back to the request context
        if hasattr(app, "teardown_appcontext"):
            app.teardown_appcontext(self.send_events)
        else:
            app.teardown_request(self.send_events)

    def event(self, tag, event):
        self.queue.put((tag, event))

    def send_events(self, exception):
        """Makes a best effort to send all the events that were pushed
        during a request, but may miss some.
        """
        pumping = True
        while pumping:
            try:
                tag, event = self.queue.get_nowait()
                self._sender.emit(tag, event)
                self.queue.task_done()
            except Empty:
                pumping = False
            except Exception as e:
                # This is bad, but it's worse to foul the request because
                # of a logging issue
                logging.exception(e)
                self.queue.task_done()
        return exception
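A minimal usage sketch, assuming `flask` and the `fluent-logger` package (`from fluent import sender`) are installed; the route and tag names are hypothetical:

    from flask import Flask
    from fluent import sender  # fluent-logger package

    app = Flask(__name__)
    app.config["FLUENTD_EVENT_HOST"] = "localhost"
    fluentd = FluentdEvent(app)

    @app.route("/")
    def index():
        # Queued now; flushed by send_events() when the request tears down
        fluentd.event("page_view", {"path": "/"})
        return "ok"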
class Fluentd(object):
    def __init__(self, app=None):
        self.app = app
        # Unbounded queue for sent events
        self.queue = Queue()
        if app is not None:
            self.init_app(app)
            # Send events after every request finishes
            app.after_request(self.send_events)
            tag_label = app.config.get('EVENT_TAG_PREFIX', 'flask.fluentd')
            self._sender = sender.FluentSender(tag_label)

    def init_app(self, app):
        # Use the newstyle teardown_appcontext if it's available,
        # otherwise fall back to the request context
        if hasattr(app, 'teardown_appcontext'):
            app.teardown_appcontext(self.send_events)
        else:
            app.teardown_request(self.send_events)

    def event(self, pair):
        tag, evt = pair
        self.queue.put((tag, evt))

    def send_events(self, exception):
        """Makes a best effort to send all the events that were pushed
        during a request, but may miss some.
        """
        pumping = True
        while pumping:
            try:
                tag, evt = self.queue.get_nowait()
                self._sender.emit(tag, evt)
                self.queue.task_done()
            except Empty:
                pumping = False
            except Exception as e:
                # This is bad, but it's worse to foul the request because
                # of a logging issue
                logging.exception(e)
                self.queue.task_done()
        return exception
class Scheduler():
    def __init__(self):
        self.q = Queue()

    def add_request(self, request):
        # Put the request into the queue
        self.q.put(request)

    def get_request(self):
        # Pop a request and return it; return None if the queue is empty
        try:
            request = self.q.get_nowait()
        except Empty:
            request = None
        return request

    def _filter_request(self):
        """Request deduplication (not implemented yet)."""
        pass
class NLSocketPool(object):
    """Pool of netlink sockets."""

    def __init__(self, size):
        if size <= 0:
            raise ValueError('Invalid socket pool size %r. Must be positive' % size)
        self._semaphore = BoundedSemaphore(size)
        self._sockets = Queue(maxsize=size)

    @contextmanager
    def socket(self):
        """Returns a socket from the pool (creating it when needed)."""
        with self._semaphore:
            try:
                sock = self._sockets.get_nowait()
            except Empty:
                sock = _open_socket()
            try:
                yield sock
            finally:
                self._sockets.put_nowait(sock)
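The semaphore-plus-queue pattern above works for any expensive resource. A self-contained sketch, with a hypothetical `_open_socket` stand-in defined in the same namespace as the class (real code would open a netlink socket):

    import itertools
    from contextlib import contextmanager
    from queue import Queue, Empty
    from threading import BoundedSemaphore

    _counter = itertools.count()

    def _open_socket():
        # Hypothetical stand-in for opening a real socket
        return "sock-%d" % next(_counter)

    pool = NLSocketPool(2)
    with pool.socket() as sock:
        print(sock)    # sock-0: created lazily, returned to the pool on exit
    with pool.socket() as sock:
        print(sock)    # sock-0 again: reused from the queue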
class Fluentd(object):
    def __init__(self, app=None):
        self.app = app
        # Unbounded queue for sent events
        self.queue = Queue()
        if app is not None:
            self.init_app(app)
            # Send events after every request finishes
            app.after_request(self.send_events)
            tag_label = app.config.get('EVENT_TAG_PREFIX', 'flask.fluentd')
            self._sender = sender.FluentSender(tag_label)

    def init_app(self, app):
        # Use the newstyle teardown_appcontext if it's available,
        # otherwise fall back to the request context
        if hasattr(app, 'teardown_appcontext'):
            app.teardown_appcontext(self.send_events)
        else:
            app.teardown_request(self.send_events)

    def event(self, pair):
        tag, evt = pair
        self.queue.put((tag, evt))

    def send_events(self, exception):
        """Makes a best effort to send all the events that were pushed
        during a request, but may miss some.
        """
        pumping = True
        while pumping:
            try:
                tag, evt = self.queue.get_nowait()
                self._sender.emit(tag, evt)
                self.queue.task_done()
            except Empty:
                pumping = False
        return exception
class AmqpSubscriber(Subscriber):
    def __init__(self, amqp_chan, exchanges):
        self.channel = amqp_chan
        self.messages = Queue(maxsize=0)
        qname, _, _ = self.channel.queue_declare()
        for exchange in exchanges:
            self.channel.queue_bind(qname, exchange)
        self.channel.basic_consume(queue=qname, callback=self.callback)

    def callback(self, msg):
        self.channel.basic_ack(msg.delivery_tag)
        self.messages.put_nowait(msg.body)

    def __iter__(self):
        return self

    def next(self):
        while self.messages.empty():
            self.channel.wait()
        return self.messages.get_nowait()

    __next__ = next  # PY3
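A usage sketch, assuming an amqplib-style channel object `chan` and hypothetical exchange names; iteration blocks in `channel.wait()` until the broker delivers something:

    sub = AmqpSubscriber(chan, ['logs', 'metrics'])
    for body in sub:       # never terminates on its own
        print(body)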
class Scheduler():
    def __init__(self):
        self.q = Queue()
        self.fp_set = set()
        self.total_repeat_nums = 0

    def add_request(self, request):
        # Put the request into the queue, but only if its fingerprint
        # is not already in the set
        if self._filter_request(request):
            self.q.put(request)

    def get_request(self):
        # Pop a request; return None if the queue is empty
        try:
            request = self.q.get_nowait()
        except Empty:
            request = None
        return request

    def _filter_request(self, request):
        """Request deduplication: if the fingerprint is not in the set,
        add it to the set and return True."""
        fp = self._gen_fp(request)
        if fp not in self.fp_set:
            self.fp_set.add(fp)
            return True
        self.total_repeat_nums += 1  # one more duplicate request
        logger.info("Found duplicate request: <{} {}>".format(
            request.method, request.url))
        return False

    def _gen_fp(self, request):
        # Return the request's fingerprint string
        url = canonicalize_url(request.url)
        method = request.method.upper()
        data = request.data if request.data else {}
        # Sort the data dict's (k, v) pairs by key so equivalent payloads
        # hash identically; e.g. the result is [('a', 1), ('b', 2)]
        data = sorted(data.items(), key=lambda x: x[0])
        sha1 = hashlib.sha1()
        sha1.update(self._to_bytes(url))
        sha1.update(self._to_bytes(method))
        sha1.update(self._to_bytes(str(data)))
        fp = sha1.hexdigest()
        return fp

    def _to_bytes(self, string):
        """The py2 and py3 cases are exactly opposite!"""
        if six.PY2:  # running under Python 2
            if isinstance(string, str):
                return string
            else:
                return string.encode()
        elif six.PY3:  # running under Python 3
            if isinstance(string, str):
                return string.encode()
            else:
                return string
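A quick demonstration of the deduplication above, using a hypothetical Request stand-in (assumes `six`, `hashlib`, `w3lib.url.canonicalize_url`, and a configured `logger` are available):

    from collections import namedtuple

    Request = namedtuple('Request', 'url method data')

    scheduler = Scheduler()
    scheduler.add_request(Request('http://example.com/?b=2&a=1', 'GET', None))
    # Same fingerprint: canonicalize_url sorts the query arguments
    scheduler.add_request(Request('http://example.com/?a=1&b=2', 'GET', None))
    print(scheduler.total_repeat_nums)   # 1: the second request was filtered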
class YoutubeDLDownloader(object):
    """Python class for downloading videos using youtube-dl & subprocess.

    Attributes:
        OK, ERROR, STOPPED, ALREADY, FILESIZE_ABORT, WARNING (int): Integers
            that describe the return code from the download() method. The
            larger the number the higher is the hierarchy of the code.
            Codes with smaller hierarchy cannot overwrite codes with higher
            hierarchy.

    Args:
        youtubedl_path (string): Absolute path to youtube-dl binary.

        data_hook (function): Optional callback function to retrieve download
            process data.

        log_data (function): Optional callback function to write data to
            the log file.

    Warnings:
        The caller is responsible for calling the close() method after he has
        finished with the object in order for the object to be able to
        properly close down itself.

    Example:
        How to use YoutubeDLDownloader from a python script.

            from downloaders import YoutubeDLDownloader

            def data_hook(data):
                print(data)

            downloader = YoutubeDLDownloader('/usr/bin/youtube-dl', data_hook)

            downloader.download(<URL STRING>, ['-f', 'flv'])

    """

    OK = 0
    WARNING = 1
    ERROR = 2
    FILESIZE_ABORT = 3
    ALREADY = 4
    STOPPED = 5

    def __init__(self, youtubedl_path, data_hook=None, log_data=None):
        self.youtubedl_path = youtubedl_path
        self.data_hook = data_hook
        self.log_data = log_data

        self._return_code = self.OK
        self._proc = None

        self._stderr_queue = Queue()
        self._stderr_reader = PipeReader(self._stderr_queue)

    def download(self, url, options):
        """Download url using given options.

        Args:
            url (string): URL string to download.
            options (list): Python list that contains youtube-dl options.

        Returns:
            An integer that shows the status of the download process.
            There are 6 different return codes.

            OK (0): The download process completed successfully.
            WARNING (1): A warning occurred during the download process.
            ERROR (2): An error occurred during the download process.
            FILESIZE_ABORT (3): The corresponding url video file was larger
                or smaller than the given filesize limit.
            ALREADY (4): The given url is already downloaded.
            STOPPED (5): The download process was stopped by the user.

        """
        self._return_code = self.OK

        cmd = self._get_cmd(url, options)
        self._create_process(cmd)

        if self._proc is not None:
            self._stderr_reader.attach_filedescriptor(self._proc.stderr)

        while self._proc_is_alive():
            stdout = self._proc.stdout.readline().rstrip()
            stdout = convert_item(stdout, to_unicode=True)

            if stdout:
                data_dict = extract_data(stdout)
                self._extract_info(data_dict)
                self._hook_data(data_dict)

        # Read stderr after download process has been completed
        # We don't need to read stderr in real time
        while not self._stderr_queue.empty():
            stderr = self._stderr_queue.get_nowait()
            if len(stderr) == 0:
                break
            stderr = convert_item(stderr.rstrip(), to_unicode=True)

            self._log(stderr)

            if self._is_warning(stderr):
                self._set_returncode(self.WARNING)
            else:
                self._set_returncode(self.ERROR)

        # Set return code to ERROR if we could not start the download process
        # or the child's return code is greater than zero
        # NOTE: In Linux if the called script is just empty Python exits
        # normally (ret=0), so we can't detect this or similar cases
        # using the code below
        # NOTE: In Unix a negative return code (-N) indicates that the child
        # was terminated by signal N (e.g. -9 = SIGKILL)
        if self._proc is None or self._proc.returncode > 0:
            self._return_code = self.ERROR

        if self._proc is not None and self._proc.returncode > 0:
            self._log('Child process exited with non-zero code: {}'.format(
                self._proc.returncode))

        self._last_data_hook()

        return self._return_code

    def stop(self):
        """Stop the download process and set return code to STOPPED."""
        if self._proc_is_alive():
            if os.name == 'nt':
                # os.killpg is not available on Windows
                # See: https://bugs.python.org/issue5115
                self._proc.kill()

                # When we kill the child process on Windows the return code
                # gets set to 1, so we want to reset the return code back to
                # 0 in order to avoid creating logging output in the
                # download(...) method
                self._proc.returncode = 0
            else:
                os.killpg(self._proc.pid, signal.SIGKILL)

            self._set_returncode(self.STOPPED)

    def close(self):
        """Destructor like function for the object."""
        self._stderr_reader.join()

    def _set_returncode(self, code):
        """Set self._return_code only if the hierarchy of the given code
        is higher than the current self._return_code."""
        if code >= self._return_code:
            self._return_code = code

    def _is_warning(self, stderr):
        return stderr.split(':')[0] == 'WARNING'

    def _last_data_hook(self):
        """Set the last data information based on the return code."""
        data_dictionary = {}

        if self._return_code == self.OK:
            data_dictionary['status'] = 'Finished'
        elif self._return_code == self.ERROR:
            data_dictionary['status'] = 'Error'
            data_dictionary['speed'] = ''
            data_dictionary['eta'] = ''
        elif self._return_code == self.WARNING:
            data_dictionary['status'] = 'Warning'
            data_dictionary['speed'] = ''
            data_dictionary['eta'] = ''
        elif self._return_code == self.STOPPED:
            data_dictionary['status'] = 'Stopped'
            data_dictionary['speed'] = ''
            data_dictionary['eta'] = ''
        elif self._return_code == self.ALREADY:
            data_dictionary['status'] = 'Already Downloaded'
        else:
            data_dictionary['status'] = 'Filesize Abort'

        self._hook_data(data_dictionary)

    def _extract_info(self, data):
        """Extract information about the download process from the given
        data.

        Args:
            data (dict): Python dictionary that contains different keys.
                The keys are not standard; the dictionary can also be empty
                when there are no data to extract. See extract_data().

        """
        if 'status' in data:
            if data['status'] == 'Already Downloaded':
                # Set self._return_code to already downloaded
                # and trash that key
                self._set_returncode(self.ALREADY)
                data['status'] = None

            if data['status'] == 'Filesize Abort':
                # Set self._return_code to filesize abort
                # and trash that key
                self._set_returncode(self.FILESIZE_ABORT)
                data['status'] = None

    def _log(self, data):
        """Log data using the callback function."""
        if self.log_data is not None:
            self.log_data(data)

    def _hook_data(self, data):
        """Pass data back to the caller."""
        if self.data_hook is not None:
            self.data_hook(data)

    def _proc_is_alive(self):
        """Returns True if self._proc is alive else False."""
        if self._proc is None:
            return False

        return self._proc.poll() is None

    def _get_cmd(self, url, options):
        """Build the subprocess command.

        Args:
            url (string): URL string to download.
            options (list): Python list that contains youtube-dl options.

        Returns:
            Python list that contains the command to execute.

        """
        if os.name == 'nt':
            cmd = [self.youtubedl_path] + options + [url]
        else:
            cmd = ['python', self.youtubedl_path] + options + [url]

        return cmd

    def _create_process(self, cmd):
        """Create new subprocess.

        Args:
            cmd (list): Python list that contains the command to execute.

        """
        info = preexec = None

        # Keep a unicode copy of cmd for the log
        ucmd = cmd

        if os.name == 'nt':
            # Hide subprocess window
            info = subprocess.STARTUPINFO()
            info.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        else:
            # Make subprocess the process group leader
            # in order to kill the whole process group with os.killpg
            preexec = os.setsid

        # Encode command for subprocess
        # Refer to http://stackoverflow.com/a/9951851/35070
        if sys.version_info < (3, 0):
            cmd = convert_item(cmd, to_unicode=False)

        try:
            self._proc = subprocess.Popen(cmd,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE,
                                          preexec_fn=preexec,
                                          startupinfo=info)
        except (ValueError, OSError) as error:
            self._log('Failed to start process: {}'.format(ucmd))
            self._log(convert_item(str(error), to_unicode=True))
class BatchTaskQueue(TaskQueue):
    """A class for managing batch async operations."""

    def __init__(self, work_func,
                 max_batch_size=DEFAULT_BATCH_SIZE,
                 batch_cushion=DEFAULT_BATCH_CUSHION,
                 interval=DEFAULT_INTERVAL,
                 **kwargs):
        """
        :param work_func: Work function with input params of list(item),
            items are added with add_item
        :type work_func: function
        :param max_batch_size: The max number of elements in a batch call
        :type max_batch_size: int
        :param batch_cushion: The batch cushion between items uploaded and
            the specified max
        :type batch_cushion: int
        :param interval: The interval between checking and uploading added
            items
        :type interval: int
        """
        super(BatchTaskQueue, self).__init__(**kwargs)
        self._max_batch_size = max_batch_size
        self._batch_cushion = batch_cushion
        self._batch_size = self._max_batch_size - self._batch_cushion
        if self._batch_size <= 0:
            self._logger.warning("Batch size - batch cushion is less than 1, "
                                 "defaulting to 1.")
            self._batch_size = MIN_BATCH_SIZE

        self._items = Queue()
        self._work_func = work_func

        self._daemon = Daemon(self._do_work,
                              interval,
                              _parent_logger=self._logger,
                              _ident="{}Daemon".format(self.identity))
        self._daemon.start()

    def add_item(self, item):
        """
        :param item: Item to be processed in a later batch
        :type item: object
        """
        self._items.put(item)

    def _handle_batch(self):
        batch = []
        for _ in range(self._batch_size):
            try:
                item = self._items.get_nowait()
                batch.append(item)
            except Empty:
                break

        self._logger.debug("Batch size {}.".format(len(batch)))
        if len(batch) > 0:
            self.add(self._work_func, batch)

    def _do_work(self):
        if not self._items.empty():
            queue_size = self._items.qsize()
            num_batches = 1 + int(queue_size / self._batch_size)
            with TaskQueue(_ident="BatchTaskQueueAdd_{}_Batches".format(num_batches)) as task_queue:
                for _ in range(num_batches):
                    task_queue.add(self._handle_batch)

    def __exit__(self, *args):
        super(BatchTaskQueue, self).__exit__(*args)
        self._daemon.stop()

    def flush(self, *args, **kwargs):
        self._do_work()
        super(BatchTaskQueue, self).flush(*args, **kwargs)
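The core drain-up-to-N idiom from `_handle_batch` also works standalone; a minimal sketch:

    from queue import Queue, Empty

    def drain_batch(q, max_items):
        """Pop at most max_items from q without blocking."""
        batch = []
        for _ in range(max_items):
            try:
                batch.append(q.get_nowait())
            except Empty:
                break
        return batch

    q = Queue()
    for i in range(7):
        q.put(i)
    print(drain_batch(q, 3))   # [0, 1, 2]
    print(drain_batch(q, 3))   # [3, 4, 5]
    print(drain_batch(q, 3))   # [6]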
class SocketServer(object):
    """ZMQ-based socket server for sending and receiving messages from the
    host PC.

    Because of the weird way in which PyEPL handles events, we can't run
    this as its own thread, but instead have to poll for events in the
    general PyEPL machinery. In the future, we should clean up PyEPL
    entirely so that it does not block other threads (amongst other
    reasons).

    :param zmq.Context ctx:

    """
    def __init__(self, ctx=None):
        self.ctx = ctx or zmq.Context()

        self._handlers = []

        self.sock = self.ctx.socket(zmq.PAIR)
        self._bound = False

        self.poller = zmq.Poller()
        self.poller.register(self.sock, zmq.POLLIN)

        # Outgoing message queue
        self._out_queue = Queue()

        # time of last sent heartbeat message
        self._last_heartbeat = 0.

        # Logging of sent and received messages.
        self.logger = create_logger("network")

    def join(self):
        """Block until all outgoing messages have been processed."""
        self.logger.warning("Joining doesn't work yet; doing nothing...")
        # self._out_queue.join()

    def bind(self, address="tcp://*:8889"):
        """Bind the socket to start listening for connections.

        :param str address: ZMQ address string

        """
        self.sock.bind(address)
        self._bound = True

    def register_handler(self, func):
        """Register a message handler.

        :param callable func: Handler function which takes the message as
            its only argument.

        """
        self.logger.debug("Adding handler: %s", func.__name__)
        self._handlers.append(func)

    def enqueue_message(self, msg):
        """Submit a new outgoing message to the queue."""
        self._out_queue.put_nowait(msg)

    def send(self, msg):
        """Immediately transmit a message to the host PC. It is advisable
        to not call this method directly in most cases, but rather enqueue
        a message to be sent via :meth:`enqueue_message`.

        :param RAMMessage msg: Message to send.

        """
        out = msg.jsonize()
        try:
            self.log_message(msg, incoming=False)
            self.sock.send(out, zmq.NOBLOCK)
        except Exception:
            self.logger.error("Sending failed!")

    def send_heartbeat(self):
        """Convenience method to send a heartbeat message to the host PC."""
        if time.time() - self._last_heartbeat >= 1.0:
            self.send(HeartbeatMessage())
            self._last_heartbeat = time.time()

    def log_message(self, message, incoming=True):
        """Log a message to the log file."""
        if not incoming:
            message = message.to_dict()

        message["in_or_out"] = "in" if incoming else "out"
        self.logger.info("%s", json.dumps(message))

    def handle_incoming(self):
        events = self.poller.poll(1)
        if self.sock in dict(events):
            try:
                msg = self.sock.recv_json()
                self.log_message(msg, incoming=True)
            except Exception:
                self.logger.error("Unable to decode JSON.", exc_info=True)
                return

            for handler in self._handlers:
                try:
                    handler(msg)
                except Exception:
                    self.logger.error("Error handling message",
                                      exc_info=True)
                    continue

    def handle_outgoing(self):
        try:
            while not self._out_queue.empty():
                msg = self._out_queue.get_nowait()
                self.send(msg)
                self._out_queue.task_done()  # so we can join the queue elsewhere
        except Exception:
            self.logger.error("Error in outgoing message processing",
                              exc_info=True)

    def update(self):
        """Call periodically to check for incoming messages and/or send
        messages in the outgoing queue.
        """
        self.handle_incoming()
        self.handle_outgoing()
class CoqtailHandler(StreamRequestHandler):
    """Forward messages between Vim and Coqtail."""

    # Redraw rate in seconds
    refresh_rate = 0.05

    # How often to check for a closed channel
    check_close_rate = 1

    # Is the channel open
    closed = False

    # Is a request currently being handled
    working = False

    # Is the client synchronous
    sync = False

    def parse_msgs(self):
        # type: () -> None
        """Parse messages sent over a Vim channel."""
        while not self.closed:
            try:
                msg = self.rfile.readline().decode("utf-8")
                msg_id, data = json.loads(msg)
            except ValueError:
                # Check if channel closed
                self.closed = True
                break

            if msg_id >= 0:
                bnum, func, args = data
                if func == "interrupt":
                    self.interrupt()
                else:
                    self.reqs.put((msg_id, bnum, func, args))
            else:
                # N.B. Accessing self.resps concurrently creates a race
                # condition where defaultdict could construct a Queue twice
                with self.resp_lk:
                    self.resps[-msg_id].put((msg_id, data))

    def get_msg(self, msg_id=None):
        # type: (Optional[int]) -> Sequence[Any]
        """Check for any pending messages from Vim."""
        if msg_id is None:
            queue = self.reqs  # type: Queue[Any]
        else:
            with self.resp_lk:
                queue = self.resps[msg_id]

        while not self.closed:
            try:
                return queue.get(timeout=self.check_close_rate)  # type: ignore
            except Empty:
                pass
        raise EOFError

    def handle(self):
        # type: () -> None
        """Forward requests from Vim to the appropriate Coqtail function."""
        self.coq = Coqtail(self)
        self.closed = False
        self.reqs = Queue()  # type: Queue[Tuple[int, int, str, Mapping[str, Any]]]
        self.resps = ddict(Queue)  # type: DefaultDict[int, Queue[Tuple[int, Any]]]
        self.resp_lk = threading.Lock()

        read_thread = threading.Thread(target=self.parse_msgs)
        read_thread.daemon = True
        read_thread.start()

        while not self.closed:
            try:
                self.working = False
                self.msg_id, self.bnum, func, args = self.get_msg()
                self.refresh_time = 0.0
                self.working = True
            except EOFError:
                break

            handler = {
                "start": self.coq.start,
                "stop": self.coq.stop,
                "step": self.coq.step,
                "rewind": self.coq.rewind,
                "to_line": self.coq.to_line,
                "to_top": self.coq.to_top,
                "query": self.coq.query,
                "endpoint": self.coq.endpoint,
                "toggle_debug": self.coq.toggle_debug,
                "splash": self.coq.splash,
                "sync": self.coq.sync,
                "find_def": self.coq.find_def,
                "find_lib": self.coq.find_lib,
                "refresh": self.coq.refresh,
            }.get(func, None)

            try:
                ret = handler(**args) if handler is not None else None  # type: ignore
                msg = [self.msg_id, {"buf": self.bnum, "ret": ret}]
                self.wfile.write(json.dumps(msg).encode("utf-8") + b"\n")
            # Python 2 doesn't have BrokenPipeError
            except (EOFError, OSError):
                break

            try:
                del self.resps[self.msg_id]
            except KeyError:
                pass

            if func == "stop":
                break

    def vimeval(self, expr, wait=True):
        # type: (List[Any], bool) -> Any
        """Send Vim a request."""
        if wait:
            expr += [-self.msg_id]
        self.wfile.write(json.dumps(expr).encode("utf-8") + b"\n")

        if wait:
            msg_id, res = self.get_msg(self.msg_id)
            assert msg_id == -self.msg_id
            return res
        return None

    def vimcall(self, expr, wait, *args):
        # type: (str, bool, *Any) -> Any
        """Request Vim to evaluate a function call."""
        return self.vimeval(["call", expr, args], wait=wait)

    def vimvar(self, var, val=None):
        # type: (str, Optional[Any]) -> Any
        """Get or set the value of a Vim variable."""
        if val is None:
            return self.vimcall("getbufvar", True, self.bnum, var)
        else:
            return self.vimcall("setbufvar", True, self.bnum, var, val)

    def refresh(self, goals=True, force=True, scroll=False):
        # type: (bool, bool, bool) -> None
        """Refresh the highlighting and auxiliary panels."""
        if not force:
            cur_time = time.time()
            force = cur_time - self.refresh_time > self.refresh_rate
            self.refresh_time = cur_time
        if force:
            self.vimcall(
                "coqtail#panels#refresh",
                self.sync,
                self.bnum,
                self.coq.highlights,
                self.coq.panels(goals),
                scroll,
            )

    def interrupt(self):
        # type: () -> None
        """Interrupt Coqtop and clear the request queue."""
        if self.coq.coqtop is not None and self.working:
            self.working = False
            while not self.reqs.empty():
                try:
                    self.reqs.get_nowait()
                except Empty:
                    break
            self.coq.coqtop.interrupt()
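The poll-with-timeout loop in `get_msg` is a reusable idiom for a blocking read that can still notice a shutdown flag; a standalone sketch:

    from queue import Queue, Empty

    def get_or_eof(queue, is_closed, poll_secs=1):
        """Block on `queue`, rechecking `is_closed()` every `poll_secs`."""
        while not is_closed():
            try:
                return queue.get(timeout=poll_secs)
            except Empty:
                pass    # timed out: recheck the closed flag and retry
        raise EOFError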
class Schedule(object):
    """Schedule - the scheduler component."""

    def __init__(self):
        self.queue = Queue()
        self.__filter_set = Set()
        self.total_request_num = 0
        self.total_repeat_num = 0

    def add_to_queue(self, request):
        """Add request to queue if request not in __filter_set."""
        fp = self.__get_fingerprint(request)
        if self._filter_request(fp, request):
            self.__filter_set.add_fp(fp)
            self.queue.put(request)
            self.total_request_num += 1
        else:
            self.total_repeat_num += 1

    def get(self):
        """Get request from Schedule.queue.

        :return request
        """
        try:
            request = self.queue.get_nowait()
        except Empty:
            return None
        else:
            return request

    def _filter_request(self, fp, request):
        """Use the set to filter requests."""
        if fp in self.__filter_set:
            logger.info("Filter Request [{}] <{}>".format(
                request.method, request.url))
            return False
        else:
            # Not a duplicate request: allow it into the request queue
            return True

    def __get_fingerprint(self, request):
        """Fingerprint for deduplication, derived from url, method,
        params and data."""
        import w3lib.url
        url = w3lib.url.canonicalize_url(request.url)
        method = request.method.upper()
        params = request.params if request.params else {}
        params = str(sorted(params.items(), key=lambda x: x[0]))
        data = request.data if request.data else {}
        data = str(sorted(data.items(), key=lambda x: x[0]))

        from hashlib import sha1
        sha1_data = sha1()
        sha1_data.update(self.get_utf8_str(url))
        sha1_data.update(self.get_utf8_str(method))
        sha1_data.update(self.get_utf8_str(params))
        sha1_data.update(self.get_utf8_str(data))
        fp = sha1_data.hexdigest()
        return fp

    def get_utf8_str(self, string):
        """Check the string type and encode unicode strings as utf-8."""
        if six.PY2:
            if isinstance(string, str):
                return string
            else:
                return string.encode("utf-8")
        else:
            if isinstance(string, bytes):
                return string
            else:
                return string.encode("utf-8")
class RabbitMQCrashStorage(CrashStorageBase):
    """This class is an implementation of a Socorro Crash Storage system.
    It is used as a crash queuing mechanism for raw crashes. It implements
    the save_raw_crash method as a queue submission function, and the
    new_crashes generator as a queue consumption function. Please note: as
    it only queues the crash_id and not the whole raw crash, it is not
    suitable to actually save a crash. It is a very lossy container. This
    class should be used in conjunction with a more persistent storage
    mechanism.

    The CrashStorage class implementations can use arbitrarily high or low
    level semantics to talk to their underlying resource. In the RabbitMQ
    implementation, queuing through the 'save_raw_crash' method is given
    full transactional semantics using the TransactionExecutor classes.
    The 'new_crashes' generator has a lower level relationship with the
    underlying connection object."""

    required_config = Namespace()
    required_config.add_option(
        'rabbitmq_class',
        default=ConnectionContextPooled,  # we choose a pooled connection
                                          # because we need thread safe
                                          # connection behaviors
        doc='the class responsible for connecting to RabbitMQ',
        reference_value_from='resource.rabbitmq',
    )
    required_config.add_option(
        'transaction_executor_class',
        default='socorro.lib.transaction.TransactionExecutorWithInfiniteBackoff',
        doc='a class that will manage transactions',
        from_string_converter=class_converter,
        reference_value_from='resource.rabbitmq',
    )
    required_config.add_option(
        'routing_key',
        default='socorro.normal',
        doc='the name of the queue to receive crashes',
        reference_value_from='resource.rabbitmq',
    )
    required_config.add_option(
        'filter_on_legacy_processing',
        default=True,
        doc='toggle for using or ignoring the throttling flag',
        reference_value_from='resource.rabbitmq',
    )
    required_config.add_option(
        'throttle',
        default=100,
        doc='percentage of the time that rabbit will try to queue',
        reference_value_from='resource.rabbitmq',
    )

    def __init__(self, config, namespace='', quit_check_callback=None):
        super(RabbitMQCrashStorage, self).__init__(
            config,
            namespace=namespace,
            quit_check_callback=quit_check_callback)
        self.config = config

        # Note: this may continue to grow if we aren't acking certain
        # UUIDs. We should find a way to time out UUIDs after a certain
        # time.
        self.acknowledgement_token_cache = {}
        self.acknowledgment_queue = Queue()

        self.rabbitmq = config.rabbitmq_class(config)
        self.transaction = config.transaction_executor_class(
            config, self.rabbitmq, quit_check_callback=quit_check_callback)

        # cache this object so we don't have to remake it for every
        # transaction
        self._basic_properties = pika.BasicProperties(
            delivery_mode=2,  # make message persistent
        )

        if config.throttle == 100:
            self.dont_queue_this_crash = lambda: False
        else:
            self.dont_queue_this_crash = (
                lambda: randint(1, 100) > config.throttle)

    def save_raw_crash(self, raw_crash, dumps, crash_id):
        if self.dont_queue_this_crash():
            self.config.logger.info(
                'Crash %s filtered out of RabbitMQ queue %s',
                crash_id, self.config.routing_key)
            return
        try:
            this_crash_should_be_queued = (
                not self.config.filter_on_legacy_processing
                or raw_crash.legacy_processing == 0)
        except KeyError:
            self.config.logger.debug(
                'RabbitMQCrashStorage legacy_processing key absent in '
                'crash %s', crash_id)
            return

        if this_crash_should_be_queued:
            self.config.logger.debug(
                'RabbitMQCrashStorage saving crash %s', crash_id)
            self.transaction(self._save_raw_crash_transaction, crash_id)
            return True
        else:
            self.config.logger.debug(
                'RabbitMQCrashStorage not saving crash %s, legacy '
                'processing flag is %s',
                crash_id, raw_crash.legacy_processing)

    def _save_raw_crash_transaction(self, connection, crash_id):
        connection.channel.basic_publish(
            exchange='',
            routing_key=self.config.routing_key,
            body=crash_id,
            properties=self._basic_properties)

    def _basic_get_transaction(self, conn, queue):
        """reorganize the call to rabbitmq basic_get so that it can be
        used by the transaction retry wrapper."""
        things = conn.channel.basic_get(queue=queue)
        return things

    def new_crashes(self):
        """This generator fetches crash_ids from RabbitMQ."""
        # We've set up RabbitMQ to require acknowledgement of processing
        # of a crash_id from this generator. It is the responsibility of
        # the consumer of the crash_id to tell this instance of the class
        # when it has completed its work on the crash_id. That is done
        # with the call to 'ack_crash' below. Because RabbitMQ connections
        # are not thread safe, only the thread that read the crash may
        # acknowledge it. 'ack_crash' queues the crash_id. The
        # '_consume_acknowledgement_queue' function is run to send
        # acknowledgments back to RabbitMQ.
        self._consume_acknowledgement_queue()
        queues = [
            self.rabbitmq.config.priority_queue_name,
            self.rabbitmq.config.standard_queue_name,
            self.rabbitmq.config.reprocessing_queue_name,
            self.rabbitmq.config.priority_queue_name,
        ]
        while True:
            for queue in queues:
                method_frame, header_frame, body = self.transaction(
                    self._basic_get_transaction, queue=queue)
                if method_frame and self._suppress_duplicate_jobs(
                        body, method_frame):
                    continue
                if method_frame:
                    break
            # must consume ack queue before testing for end of iterator
            # or the last job won't get ack'd
            self._consume_acknowledgement_queue()
            if not method_frame:
                # there was nothing in the queue - leave the iterator
                return
            self.acknowledgement_token_cache[body] = method_frame
            yield body
            queues.reverse()

    def ack_crash(self, crash_id):
        self.acknowledgment_queue.put(crash_id)

    def _suppress_duplicate_jobs(self, crash_id, acknowledgement_token):
        """if this crash is in the cache, then it is already in progress
        and this is a duplicate. Acknowledge it, then return True to let
        the caller know to skip on to the next crash."""
        if crash_id in self.acknowledgement_token_cache:
            # reject this crash - it's already being processed
            self.config.logger.info(
                'duplicate job: %s is already in progress', crash_id)
            # ack this
            self.transaction(self._transaction_ack_crash, crash_id,
                             acknowledgement_token)
            return True
        return False

    def _consume_acknowledgement_queue(self):
        """The acknowledgement of the processing of each crash_id yielded
        from the 'new_crashes' method must take place on the same
        connection that the crash_id came from. The crash_ids are queued
        in the 'acknowledgment_queue'. That queue is consumed by the
        QueuingThread"""
        try:
            while True:
                crash_id_to_be_acknowledged = \
                    self.acknowledgment_queue.get_nowait()
                # self.config.logger.debug(
                #     'RabbitMQCrashStorage set to acknowledge %s',
                #     crash_id_to_be_acknowledged
                # )
                try:
                    acknowledgement_token = \
                        self.acknowledgement_token_cache[
                            crash_id_to_be_acknowledged]
                    self.transaction(self._transaction_ack_crash,
                                     crash_id_to_be_acknowledged,
                                     acknowledgement_token)
                    del self.acknowledgement_token_cache[
                        crash_id_to_be_acknowledged]
                except KeyError:
                    self.config.logger.warning(
                        'RabbitMQCrashStorage tried to acknowledge crash '
                        '%s, which was not in the cache',
                        crash_id_to_be_acknowledged,
                        exc_info=True)
                except Exception:
                    self.config.logger.error(
                        'RabbitMQCrashStorage unexpected failure on %s',
                        crash_id_to_be_acknowledged,
                        exc_info=True)
        except Empty:
            pass  # nothing to do with an empty queue

    def _transaction_ack_crash(self, connection, crash_id,
                               acknowledgement_token):
        connection.channel.basic_ack(
            delivery_tag=acknowledgement_token.delivery_tag)
        self.config.logger.debug(
            'RabbitMQCrashStorage acking %s with delivery_tag %s',
            crash_id, acknowledgement_token.delivery_tag)
class Coqtop(object):
    """Provide an interface to the background Coqtop process."""

    def __init__(self, version, done_callback):
        # type: (Text, Callable[[], None]) -> None
        """Initialize Coqtop state.

        coqtop - The Coqtop process
        done_callback - A function to call when finished waiting for Coqtop
        states - A stack of previous state_ids (grows to the right)
        state_id - The current state_id
        root_state - The starting state_id
        out_q - A thread-safe queue of data read from Coqtop
        xml - The XML interface for the given version
        """
        self.coqtop = None  # type: Optional[subprocess.Popen[bytes]]
        self.done_callback = done_callback
        self.states = []  # type: List[int]
        self.state_id = -1
        self.root_state = -1
        self.out_q = Queue()  # type: Queue[bytes]
        self.xml = XMLInterface(version)
        self.stopping = False

        # Debugging
        self.log = None  # type: Optional[IO[Text]]
        self.handler = logging.NullHandler()  # type: logging.Handler
        self.logger = logging.getLogger(str(id(self)))
        self.logger.addHandler(self.handler)
        self.logger.setLevel(logging.INFO)

    # Coqtop Interface #
    # These are expressed as generators that spawn a thread to interact with
    # Coqtop, yield and wait to be told whether the user interrupted with
    # CTRL-C, then yield the final result. This is done because Vim cannot
    # capture signals while running Python plugins, so we have to busy wait
    # in Vim instead.
    # Ideally the type would be Generator[None, bool, bool] and the final
    # 'yield's would be 'return's, but Python 2 doesn't support returning
    # values from generators.

    def start(self, coq_path, *args, **kwargs):
        # type: (str, *str, **int) -> Generator[Optional[bool], bool, None]
        """Launch the Coqtop process."""
        assert self.coqtop is None

        self.logger.debug("start")

        timeout = kwargs.get("timeout", None)

        for launch in self.xml.launch(coq_path):
            try:
                self.coqtop = subprocess.Popen(
                    launch + args,
                    stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    bufsize=0,
                )

                # Spawn threads to monitor Coqtop's stdout and stderr
                for f in (self.capture_out, self.capture_err,
                          self.capture_dead):
                    read_thread = threading.Thread(target=f)
                    read_thread.daemon = True
                    read_thread.start()

                # Initialize Coqtop
                call = self.call(self.xml.init(), timeout=timeout)
                next(call)
                stopped = yield  # type: ignore[misc] # (see comment above start())
                response = call.send(stopped)

                if isinstance(response, Err):
                    yield False
                    return

                self.root_state = response.val
                self.state_id = response.val

                yield True
                return
            except OSError:
                continue

        # Failed to launch Coqtop
        self.coqtop = None
        self.done_callback()
        yield  # type: ignore[misc] # (see comment above start())
        yield False

    def stop(self):
        # type: () -> None
        """End the Coqtop process."""
        if self.coqtop is not None:
            self.logger.debug("stop")
            self.stopping = True

            # Close debugging log
            self.handler.flush()
            self.handler.close()
            if self.log is not None:
                self.log.close()

            try:
                # Try to terminate Coqtop cleanly
                # TODO: use Quit call
                self.coqtop.terminate()
                self.coqtop.communicate()
            except (OSError, ValueError, AttributeError):
                try:
                    # Force Coqtop to stop
                    self.coqtop.kill()
                except (OSError, AttributeError):
                    pass

            self.coqtop = None

    def advance(
        self,
        cmd,  # type: Text
        encoding="utf-8",  # type: str
        timeout=None,  # type: Optional[int]
    ):
        # type: (...) -> Generator[Tuple[bool, Text, Optional[Tuple[int, int]]], bool, None]
        """Advance Coqtop by sending 'cmd'."""
        self.logger.debug("advance: %s", cmd)
        call = self.call(self.xml.add(cmd, self.state_id, encoding=encoding),
                         timeout=timeout)
        next(call)
        stopped = yield  # type: ignore[misc] # (see comment above start())
        response = call.send(stopped)

        if isinstance(response, Err):
            yield False, response.msg, response.loc
            return

        # In addition to sending 'cmd', also check status in order to force
        # it to be evaluated
        call = self.call(self.xml.status(encoding=encoding), timeout=timeout)
        next(call)
        stopped = yield  # type: ignore[misc] # (see comment above start())
        status = call.send(stopped)

        # Combine messages
        msgs = "\n\n".join(
            msg
            for msg in (response.msg, response.val["res_msg"], status.msg)
            if msg != "")

        if isinstance(status, Err):
            # Reset state id to before the error
            call = self.call(self.xml.edit_at(self.state_id, 1))
            next(call)
            yield  # type: ignore[misc] # (see comment above start())
            call.send(False)
            yield False, msgs, status.loc
            return

        self.states.append(self.state_id)
        self.state_id = response.val["state_id"]

        yield True, msgs, None

    def rewind(self, steps=1):
        # type: (int) -> Generator[Tuple[bool, int], bool, None]
        """Go back 'steps' states."""
        self.logger.debug("rewind: %d", steps)
        if steps > len(self.states):
            self.state_id = self.root_state
            self.states = []
            steps = len(self.states)
        else:
            # In 8.4 query and option commands will be recorded with
            # state_id = -1. Need to count them and reduce number of steps
            # to rewind so Coqtop doesn't go too far back
            fake_steps = sum(s == -1 for s in self.states[-steps:])
            if self.states[-steps] != -1:
                self.state_id = self.states[-steps]
            else:
                self.state_id = 0
            self.states = self.states[:-steps]
            steps -= fake_steps

        call = self.call(self.xml.edit_at(self.state_id, steps))
        next(call)
        stopped = yield  # type: ignore[misc] # (see comment above start())
        response = call.send(stopped)

        if isinstance(response, Ok):
            yield True, response.val
        else:
            yield False, 0

    def query(
        self,
        cmd,  # type: Text
        in_script,  # type: bool
        encoding="utf-8",  # type: str
        timeout=None,  # type: Optional[int]
    ):
        # type: (...) -> Generator[Tuple[bool, Text, Optional[Tuple[int, int]]], bool, None]
        """Query Coqtop with 'cmd'."""
        self.logger.debug("query: %s", cmd)
        call = self.call(self.xml.query(cmd, self.state_id, encoding=encoding),
                         timeout=timeout)
        next(call)
        stopped = yield  # type: ignore[misc] # (see comment above start())
        response = call.send(stopped)

        if isinstance(response, Ok):
            # If the query was called from within the script we need to
            # record the state id so rewinding will work properly. Since
            # 8.4 uses number of steps rather than state ids, record '-1'
            # to indicate that no rewind should actually be done
            if in_script:
                if self.xml.versions >= (8, 5, 0):
                    self.states.append(self.state_id)
                else:
                    self.states.append(-1)
            yield True, response.msg, None
        else:
            yield False, response.msg, response.loc

    def goals(self, timeout=None):
        # type: (Optional[int]) -> Generator[Tuple[bool, Text, Optional[Tuple[List[Any], List[Any], List[Any], List[Any]]]], bool, None]
        """Get the current set of hypotheses and goals."""
        self.logger.debug("goals")
        call = self.call(self.xml.goal(), timeout=timeout)
        next(call)
        stopped = yield  # type: ignore[misc] # (see comment above start())
        response = call.send(stopped)

        if isinstance(response, Ok):
            yield True, response.msg, response.val
        else:
            yield False, "", None

    def mk_cases(self, ty, encoding="utf-8", timeout=None):
        # type: (Text, str, Optional[int]) -> Generator[Tuple[bool, Text], bool, None]
        """Return cases for each constructor of 'ty'."""
        self.logger.debug("mk_cases: %s", ty)
        call = self.call(self.xml.mk_cases(ty, encoding=encoding),
                         timeout=timeout)
        next(call)
        stopped = yield  # type: ignore[misc] # (see comment above start())
        response = call.send(stopped)

        if isinstance(response, Ok):
            yield True, response.val
        else:
            yield False, response.msg

    def do_option(
        self,
        cmd,  # type: Text
        in_script,  # type: bool
        encoding="utf-8",  # type: str
        timeout=None,  # type: Optional[int]
    ):
        # type: (...) -> Generator[Tuple[bool, Text, Optional[Tuple[int, int]]], bool, None]
        """Set or get an option."""
        self.logger.debug("do_option: %s", cmd)
        vals, opt = self.xml.parse_option(cmd)

        if vals is None:
            call = self.call(self.xml.get_options(encoding=encoding),
                             timeout=timeout)
            next(call)
            stopped = yield  # type: ignore[misc] # (see comment above start())
            response = call.send(stopped)

            if isinstance(response, Ok):
                optval = [(val, desc) for name, desc, val in response.val
                          if name == opt]

                if optval != []:
                    ret = "{}: {}".format(optval[0][1], optval[0][0])  # type: Text
                else:
                    ret = "Invalid option name"
        else:
            for val in vals:
                call = self.call(
                    self.xml.set_options(opt, val, encoding=encoding),
                    timeout=timeout)
                next(call)
                stopped = yield  # type: ignore[misc] # (see comment above start())
                response = call.send(stopped)
                ret = response.msg
                if isinstance(response, Ok):
                    break

        if isinstance(response, Ok):
            # Hack to associate setting an option with a new state id by
            # executing a noop so it works correctly with rewinding
            if in_script:
                noop_call = self.advance(self.xml.noop, encoding)
                next(noop_call)
                while True:
                    yield  # type: ignore[misc] # (see comment above start())
                    noop_ret = noop_call.send(False)
                    if noop_ret is not None:
                        success, _, _ = noop_ret
                        assert success
                        break
            yield True, ret, None
        else:
            yield False, response.msg, response.loc

    def dispatch(
        self,
        cmd,  # type: Text
        in_script=True,  # type: bool
        encoding="utf-8",  # type: str
        timeout=None,  # type: Optional[int]
    ):
        # type: (...) -> Generator[Tuple[bool, Text, Optional[Tuple[int, int]]], bool, None]
        """Decide whether 'cmd' is setting/getting an option, a query, or a
        regular command.
        """
        # Make sure 'cmd' is a string format that supports unicode
        cmd = ensure_text(cmd, encoding)  # type: ignore[no-untyped-call]

        if self.xml.is_option(cmd):
            call = self.do_option(cmd, in_script, encoding, timeout)
        elif self.xml.is_query(cmd):
            call = self.query(cmd, in_script, encoding, timeout)
        elif in_script:
            call = self.advance(cmd, encoding, timeout)
        else:
            self.done_callback()
            yield  # type: ignore[misc] # (see comment above start())
            yield True, "Command only allowed in script.", None
            return

        next(call)
        while True:
            stopped = yield  # type: ignore[misc] # (see comment above start())
            ret = call.send(stopped)
            if ret is not None:
                yield ret
                break

    # Interacting with Coqtop #

    def call(
        self,
        cmdtype_msg,  # type: Tuple[Text, Optional[bytes]]
        timeout=None,  # type: Optional[int]
    ):
        # type: (...) -> Generator[Union[Ok, Err], bool, None]
        """Send 'msg' to the Coqtop process and wait for the response."""
        # Check if Coqtop has stopped
        if not self.running():
            raise CoqtopError("Coqtop is not running.")

        # Throw away any unread messages
        self.empty_out()

        cmd, msg = cmdtype_msg

        # 'msg' can be None if a command does not exist for a particular
        # version and is being faked.
        # N.B. It is important that the '_standardize' function being called
        # does not depend on the value it is passed since it is None
        if msg is None:
            self.done_callback()
            yield  # type: ignore[misc] # (see comment above start())
            yield self.xml.standardize(cmd, Ok(None))
            return

        # Don't bother doing prettyxml if debugging isn't on
        if self.logger.isEnabledFor(logging.DEBUG):
            self.logger.debug(prettyxml(msg))
        self.send_cmd(msg)

        if timeout == 0:
            timeout = None

        # The got_response event tells the timeout_thread that get_answer()
        # returned normally, while timed_out will be set by timeout_thread
        # if time runs out without receiving a response
        got_response = threading.Event()
        timed_out = threading.Event()
        timeout_thread = threading.Thread(
            target=self.timeout_thread,
            args=(timeout, got_response, timed_out))
        timeout_thread.daemon = True

        # Start a thread to get Coqtop's response
        res_ref = Ref()
        answer_thread = threading.Thread(
            target=self.get_answer,
            args=(res_ref,))
        answer_thread.daemon = True

        # Start threads and yield back to caller to wait for Coqtop to
        # finish
        timeout_thread.start()
        answer_thread.start()
        stopped = yield  # type: ignore[misc] # (see comment above start())

        # Notify timeout_thread that a response is received and wait for
        # threads to finish
        got_response.set()
        timeout_thread.join()
        answer_thread.join()

        response = res_ref.val

        # Check for user interrupt or timeout
        if isinstance(response, Err):
            if stopped:
                response = STOPPED_ERR
            elif timed_out.is_set():
                response = TIMEOUT_ERR

        yield self.xml.standardize(cmd, response)

    def timeout_thread(self, timeout, got_response, timed_out):
        # type: (int, threading.Event, threading.Event) -> None
        """Wait on the 'got_response' Event for timeout seconds and set
        'timed_out' and interrupt the Coqtop process if it is not set in
        time.
        """
        if self.coqtop is None:
            raise CoqtopError("coqtop must not be None in timeout_thread()")

        if not got_response.wait(timeout):
            self.interrupt()
            timed_out.set()

    def get_answer(self, res_ref):
        # type: (Ref) -> None
        """Read from 'out_q' and wait until a full response is received."""
        data = []
        while True:
            data.append(self.out_q.get())
            xml = b"".join(data)
            if not self.xml.worth_parsing(xml):
                continue
            response = self.xml.raw_response(xml)

            if response is None:
                continue

            # Don't bother doing prettyxml if debugging isn't on
            if self.logger.isEnabledFor(logging.DEBUG):
                self.logger.debug(
                    prettyxml(b"<response>" + xml + b"</response>"))
            res_ref.val = response
            # Notify the caller that Coqtop is done
            self.done_callback()
            break

    def empty_out(self):
        # type: () -> None
        """Pop data until 'out_q' is empty."""
        while not self.out_q.empty():
            try:
                self.out_q.get_nowait()
            except Empty:
                return

    def capture_out(self):
        # type: () -> None
        """Continually read data from Coqtop's stdout into 'out_q'."""
        if self.coqtop is None:
            raise CoqtopError("coqtop must not be None in capture_out()")
        if self.coqtop.stdout is None:
            raise CoqtopError(
                "coqtop stdout must not be None in capture_out()")
        fd = self.coqtop.stdout.fileno()

        while not self.stopping:
            try:
                self.out_q.put(os.read(fd, 0x10000))
            except (AttributeError, OSError, ValueError):
                # Coqtop died
                return

    def capture_err(self):
        # type: () -> None
        """Continually read data from Coqtop's stderr and print it."""
        if self.coqtop is None:
            raise CoqtopError("coqtop must not be None in capture_err()")
        if self.coqtop.stderr is None:
            raise CoqtopError(
                "coqtop stderr must not be None in capture_err()")
        fd = self.coqtop.stderr.fileno()

        while not self.stopping:
            try:
                print(os.read(fd, 0x10000).decode())
            except (AttributeError, OSError, ValueError):
                # Coqtop died
                return

    def capture_dead(self):
        # type: () -> None
        """Continually check if Coqtop has died."""
        while self.running():
            time.sleep(1)
        self.stop()

    def send_cmd(self, cmd):
        # type: (bytes) -> None
        """Write to Coqtop's stdin."""
        if self.coqtop is None:
            raise CoqtopError("coqtop must not be None in send_cmd()")
        if self.coqtop.stdin is None:
            raise CoqtopError("coqtop stdin must not be None in send_cmd()")

        self.coqtop.stdin.write(cmd)
        self.coqtop.stdin.flush()

    def interrupt(self):
        # type: () -> None
        """Send a SIGINT signal to Coqtop."""
        if self.coqtop is None:
            raise CoqtopError("Coqtop is not running.")
        self.coqtop.send_signal(signal.SIGINT)

    # Current State #

    def running(self):
        # type: () -> bool
        """Check if Coqtop has already been started."""
        return self.coqtop is not None and self.coqtop.poll() is None

    # Debugging #

    def toggle_debug(self):
        # type: () -> Optional[str]
        """Enable or disable logging of debug messages."""
        self.logger.removeHandler(self.handler)
        self.handler.flush()
        self.handler.close()

        if self.log is None:
            # Create unique log file
            pre = "coqtop_{}_".format(
                datetime.datetime.now().strftime("%y%m%d_%H%M%S"))
            fmt = logging.Formatter("%(asctime)s: %(message)s")
            # Python 2 says _TemporaryFileWrapper is incompatible with
            # IO[Text]
            self.log = NamedTemporaryFile(
                mode="w", prefix=pre, delete=False)  # type: ignore[assignment]
            self.handler = logging.StreamHandler(self.log)
            self.handler.setFormatter(fmt)
            self.logger.addHandler(self.handler)
            self.logger.setLevel(logging.DEBUG)
            return self.log.name  # type: ignore[no-any-return, attr-defined] # (see above)
        else:
            # Clean up old logging
            self.log.close()

            # Set to null logging
            self.log = None
            self.handler = logging.NullHandler()
            self.logger.addHandler(self.handler)
            self.logger.setLevel(logging.CRITICAL)
            return None
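The generator protocol described in the comment above `start()` is driven the same way for every call; a hypothetical driver-loop sketch (an assumption about the consumer side, not part of the original; assumes a started `Coqtop` instance `coq`):

    def drive(gen, was_interrupted=lambda: False):
        """Drive one of Coqtop's generator-based calls to completion.

        Each call yields while its worker threads run; send() it a bool
        saying whether the user interrupted, and keep pumping until the
        generator yields a real (non-None) result.
        """
        next(gen)                          # start the call's worker threads
        result = gen.send(was_interrupted())
        while result is None:
            result = gen.send(was_interrupted())
        return result

    # e.g. ok, msg, loc = drive(coq.dispatch("Lemma one : 1 = 1."))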
class JobHandlerQueue(Monitors):
    """Job Handler's Internal Queue, this is what actually implements
    waiting for jobs to be runnable and dispatching to a JobRunner.
    """
    STOP_SIGNAL = object()

    def __init__(self, app, dispatcher):
        """Initializes the Job Handler Queue, creates (unstarted)
        monitoring thread."""
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context
        self.track_jobs_in_database = self.app.config.track_jobs_in_database

        # Initialize structures for handling job limits
        self.__clear_job_count()

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting_jobs = []
        # Contains wrappers of jobs that are limited or ready (so they
        # aren't created unnecessarily/multiple times)
        self.job_wrappers = {}

        name = "JobHandlerQueue.monitor_thread"
        self._init_monitor_thread(name, target=self.__monitor, config=app.config)

    def start(self):
        """Starts the JobHandler's thread after checking for any unhandled
        jobs."""
        log.debug('Handler queue starting for jobs assigned to handler: %s',
                  self.app.config.server_name)
        # Recover jobs at startup
        self.__check_jobs_at_startup()
        # Start the queue
        self.monitor_thread.start()
        # The stack code is initialized in the application
        JobHandlerMessage().bind_default_handler(self, '_handle_message')
        self.app.application_stack.register_message_handler(
            self._handle_message, name=JobHandlerMessage.target)
        log.info("job handler queue started")

    def job_wrapper(self, job, use_persisted_destination=False):
        return JobWrapper(job, self,
                          use_persisted_destination=use_persisted_destination)

    def job_pair_for_id(self, id):
        job = self.sa_session.query(model.Job).get(id)
        return job, self.job_wrapper(job, use_persisted_destination=True)

    def __write_registry_file_if_absent(self, job):
        # TODO: remove this and the one place it is called in late 2018,
        # this hack attempts to minimize the job failures due to upgrades
        # from 17.05 Galaxies.
        job_wrapper = self.job_wrapper(job)
        cwd = job_wrapper.working_directory
        datatypes_config = os.path.join(cwd, "registry.xml")
        if not os.path.exists(datatypes_config):
            try:
                self.app.datatypes_registry.to_xml_file(path=datatypes_config)
            except OSError:
                pass

    def __check_jobs_at_startup(self):
        """Checks all jobs that are in the 'new', 'queued' or 'running'
        state in the database and requeues or cleans up as necessary.
        Only run as the job handler starts.

        In case the activation is enforced it will filter out the jobs of
        inactive users.
        """
        jobs_at_startup = []
        if self.track_jobs_in_database:
            in_list = (model.Job.states.QUEUED,
                       model.Job.states.RUNNING)
        else:
            in_list = (model.Job.states.NEW,
                       model.Job.states.QUEUED,
                       model.Job.states.RUNNING)
        if self.app.config.user_activation_on:
            jobs_at_startup = self.sa_session.query(model.Job).enable_eagerloads(False) \
                .outerjoin(model.User) \
                .filter(model.Job.state.in_(in_list)
                        & (model.Job.handler == self.app.config.server_name)
                        & or_((model.Job.user_id == null()), (model.User.active == true()))).all()
        else:
            jobs_at_startup = self.sa_session.query(model.Job).enable_eagerloads(False) \
                .filter(model.Job.state.in_(in_list)
                        & (model.Job.handler == self.app.config.server_name)).all()

        for job in jobs_at_startup:
            self.__write_registry_file_if_absent(job)
            if not self.app.toolbox.has_tool(job.tool_id, job.tool_version, exact=True):
                log.warning("(%s) Tool '%s' removed from tool config, unable to recover job" % (job.id, job.tool_id))
                self.job_wrapper(job).fail('This tool was disabled before the job completed. Please contact your Galaxy administrator.')
            elif job.job_runner_name is not None and job.job_runner_external_id is None:
                # This could happen during certain revisions of Galaxy where
                # a runner URL was persisted before the job was dispatched
                # to a runner.
                log.debug("(%s) Job runner assigned but no external ID recorded, adding to the job handler queue" % job.id)
                job.job_runner_name = None
                if self.track_jobs_in_database:
                    job.set_state(model.Job.states.NEW)
                else:
                    self.queue.put((job.id, job.tool_id))
            elif job.job_runner_name is not None and job.job_runner_external_id is not None and job.destination_id is None:
                # This is the first start after upgrading from URLs to
                # destinations, convert the URL to a destination and persist
                job_wrapper = self.job_wrapper(job)
                job_destination = self.dispatcher.url_to_destination(job.job_runner_name)
                if job_destination.id is None:
                    job_destination.id = 'legacy_url'
                job_wrapper.set_job_destination(job_destination, job.job_runner_external_id)
                self.dispatcher.recover(job, job_wrapper)
                log.info('(%s) Converted job from a URL to a destination and recovered' % (job.id))
            elif job.job_runner_name is None:
                # Never (fully) dispatched
                log.debug("(%s) No job runner assigned and job still in '%s' state, adding to the job handler queue" % (job.id, job.state))
                if self.track_jobs_in_database:
                    job.set_state(model.Job.states.NEW)
                else:
                    self.queue.put((job.id, job.tool_id))
            else:
                # Already dispatched and running
                job_wrapper = self.__recover_job_wrapper(job)
                self.dispatcher.recover(job, job_wrapper)
        if self.sa_session.dirty:
            self.sa_session.flush()

    def __recover_job_wrapper(self, job):
        # Already dispatched and running
        job_wrapper = self.job_wrapper(job)
        # Use the persisted destination as its params may differ from
        # what's in the job_conf xml
        job_destination = JobDestination(id=job.destination_id,
                                         runner=job.job_runner_name,
                                         params=job.destination_params)
        # resubmits are not persisted (it's a good thing) so they
        # should be added back to the in-memory destination on startup
        try:
            config_job_destination = self.app.job_config.get_destination(job.destination_id)
            job_destination.resubmit = config_job_destination.resubmit
        except KeyError:
            log.debug('(%s) Recovered destination id (%s) does not exist in job config (but this may be normal in the case of a dynamically generated destination)',
                      job.id, job.destination_id)
        job_wrapper.job_runner_mapper.cached_job_destination = job_destination
        return job_wrapper

    def __monitor(self):
        """Continually iterate the waiting jobs, checking if each is ready
        to run and dispatching if so.
        """
        while self.monitor_running:
            try:
                # If jobs are locked, there's nothing to monitor and we
                # skip to the sleep.
                if not self.app.job_manager.job_lock:
                    self.__monitor_step()
            except Exception:
                log.exception("Exception in monitor_step")
            self._monitor_sleep(1)

    def __monitor_step(self):
        """Called repeatedly by `monitor` to process waiting jobs. Gets any
        new jobs (either from the database or from its own queue), then
        iterates over all new and waiting jobs to check the state of the
        jobs each depends on. If the job has dependencies that have not
        finished, it goes to the waiting queue. If the job has dependencies
        with errors, it is marked as having errors and removed from the
        queue. If the job belongs to an inactive user it is ignored.
        Otherwise, the job is dispatched.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        resubmit_jobs = []
        if self.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all
            # datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            hda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputDatasetAssociation) \
                .join(model.HistoryDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_((model.Job.state == model.Job.states.NEW),
                             or_((model.HistoryDatasetAssociation._state == model.HistoryDatasetAssociation.states.FAILED_METADATA),
                                 (model.HistoryDatasetAssociation.deleted == true()),
                                 (model.Dataset.state != model.Dataset.states.OK),
                                 (model.Dataset.deleted == true())))).subquery()
            ldda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputLibraryDatasetAssociation) \
                .join(model.LibraryDatasetDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_((model.Job.state == model.Job.states.NEW),
                             or_((model.LibraryDatasetDatasetAssociation._state != null()),
                                 (model.LibraryDatasetDatasetAssociation.deleted == true()),
                                 (model.Dataset.state != model.Dataset.states.OK),
                                 (model.Dataset.deleted == true())))).subquery()
            if self.app.config.user_activation_on:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .outerjoin(model.User) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 or_((model.Job.user_id == null()), (model.User.active == true())),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            else:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            # Fetch all "resubmit" jobs
            resubmit_jobs = self.sa_session.query(model.Job).enable_eagerloads(False) \
                .filter(and_((model.Job.state == model.Job.states.RESUBMITTED),
                             (model.Job.handler == self.app.config.server_name))) \
                .order_by(model.Job.id).all()
        else:
            # Get job objects and append to watch queue for any which were
            # previously waiting
            for job_id in self.waiting_jobs:
                jobs_to_check.append(self.sa_session.query(model.Job).get(job_id))
            try:
                while 1:
                    message = self.queue.get_nowait()
                    if message is self.STOP_SIGNAL:
                        return
                    # Unpack the message
                    job_id, tool_id = message
                    # Get the job object and append to watch queue
                    jobs_to_check.append(self.sa_session.query(model.Job).get(job_id))
            except Empty:
                pass
        # Ensure that we get new job counts on each iteration
        self.__clear_job_count()
Check resubmit jobs first so that limits of new jobs will still be enforced for job in resubmit_jobs: log.debug('(%s) Job was resubmitted and is being dispatched immediately', job.id) # Reassemble resubmit job destination from persisted value jw = self.__recover_job_wrapper(job) if jw.is_ready_for_resubmission(job): self.increase_running_job_count(job.user_id, jw.job_destination.id) self.dispatcher.put(jw) # Iterate over new and waiting jobs and look for any that are # ready to run new_waiting_jobs = [] for job in jobs_to_check: try: # Check the job's dependencies, requeue if they're not done. # Some of these states will only happen when using the in-memory job queue if job.copied_from_job_id: copied_from_job = self.sa_session.query(model.Job).get(job.copied_from_job_id) job.numeric_metrics = copied_from_job.numeric_metrics job.text_metrics = copied_from_job.text_metrics job.dependencies = copied_from_job.dependencies job.state = copied_from_job.state job.stderr = copied_from_job.stderr job.stdout = copied_from_job.stdout job.command_line = copied_from_job.command_line job.traceback = copied_from_job.traceback job.tool_version = copied_from_job.tool_version job.exit_code = copied_from_job.exit_code job.job_runner_name = copied_from_job.job_runner_name job.job_runner_external_id = copied_from_job.job_runner_external_id continue job_state = self.__check_job_state(job) if job_state == JOB_WAIT: new_waiting_jobs.append(job.id) elif job_state == JOB_INPUT_ERROR: log.info("(%d) Job unable to run: one or more inputs in error state" % job.id) elif job_state == JOB_INPUT_DELETED: log.info("(%d) Job unable to run: one or more inputs deleted" % job.id) elif job_state == JOB_READY: self.dispatcher.put(self.job_wrappers.pop(job.id)) log.info("(%d) Job dispatched" % job.id) elif job_state == JOB_DELETED: log.info("(%d) Job deleted by user while still queued" % job.id) elif job_state == JOB_ADMIN_DELETED: log.info("(%d) Job deleted by admin while still queued" % job.id) elif job_state in (JOB_USER_OVER_QUOTA, JOB_USER_OVER_TOTAL_WALLTIME): if job_state == JOB_USER_OVER_QUOTA: log.info("(%d) User (%s) is over quota: job paused" % (job.id, job.user_id)) else: log.info("(%d) User (%s) is over total walltime limit: job paused" % (job.id, job.user_id)) job.set_state(model.Job.states.PAUSED) for dataset_assoc in job.output_datasets + job.output_library_datasets: dataset_assoc.dataset.dataset.state = model.Dataset.states.PAUSED dataset_assoc.dataset.info = "Execution of this dataset's job is paused because you were over your disk quota at the time it was ready to run" self.sa_session.add(dataset_assoc.dataset.dataset) self.sa_session.add(job) elif job_state == JOB_ERROR: log.error("(%d) Error checking job readiness" % job.id) else: log.error("(%d) Job in unknown state '%s'" % (job.id, job_state)) new_waiting_jobs.append(job.id) except Exception: log.exception("failure running job %d", job.id) # Update the waiting list if not self.track_jobs_in_database: self.waiting_jobs = new_waiting_jobs # Remove cached wrappers for any jobs that are no longer being tracked for id in list(self.job_wrappers.keys()): if id not in new_waiting_jobs: del self.job_wrappers[id] # Flush, if we updated the state self.sa_session.flush() # Done with the session self.sa_session.remove() def __check_job_state(self, job): """ Check if a job is ready to run by verifying that each of its input datasets is ready (specifically in the OK state). If any input dataset has an error, fail the job and return JOB_INPUT_ERROR. 
If any input dataset is deleted, fail the job and return JOB_INPUT_DELETED. If all input datasets are in OK state, return JOB_READY indicating that the job can be dispatched. Otherwise, return JOB_WAIT indicating that input datasets are still being prepared. """ if not self.track_jobs_in_database: in_memory_not_ready_state = self.__verify_in_memory_job_inputs(job) if in_memory_not_ready_state: return in_memory_not_ready_state # Else, if tracking in the database, job.state is guaranteed to be NEW and # the inputs are guaranteed to be OK. # Create the job wrapper so that the destination can be set job_id = job.id job_wrapper = self.job_wrappers.get(job_id, None) if not job_wrapper: job_wrapper = self.job_wrapper(job) self.job_wrappers[job_id] = job_wrapper # If state == JOB_READY, assume job_destination also set - otherwise # in case of various error or cancelled states do not assume # destination has been set. state, job_destination = self.__verify_job_ready(job, job_wrapper) if state == JOB_READY: # PASS. increase usage by one job (if caching) so that multiple jobs aren't dispatched on this queue iteration self.increase_running_job_count(job.user_id, job_destination.id) return state def __verify_job_ready(self, job, job_wrapper): """ Compute job destination and verify job is ready at that destination by checking job limits and quota. If this method returns a job state of JOB_READY - it MUST also return a job destination. """ job_destination = None try: assert job_wrapper.tool is not None, 'This tool was disabled before the job completed. Please contact your Galaxy administrator.' # Cause the job_destination to be set and cached by the mapper job_destination = job_wrapper.job_destination except AssertionError as e: log.warning("(%s) Tool '%s' removed from tool config, unable to run job" % (job.id, job.tool_id)) job_wrapper.fail(e) return JOB_ERROR, job_destination except JobNotReadyException as e: job_state = e.job_state or JOB_WAIT return job_state, None except Exception as e: failure_message = getattr(e, 'failure_message', DEFAULT_JOB_PUT_FAILURE_MESSAGE) if failure_message == DEFAULT_JOB_PUT_FAILURE_MESSAGE: log.exception('Failed to generate job destination') else: log.debug("Intentionally failing job with message (%s)" % failure_message) job_wrapper.fail(failure_message) return JOB_ERROR, job_destination # job is ready to run, check limits # TODO: these checks should be refactored to minimize duplication and made more modular/pluggable state = self.__check_destination_jobs(job, job_wrapper) if state == JOB_READY: state = self.__check_user_jobs(job, job_wrapper) if state == JOB_READY and self.app.config.enable_quotas: quota = self.app.quota_agent.get_quota(job.user) if quota is not None: try: usage = self.app.quota_agent.get_usage(user=job.user, history=job.history) if usage > quota: return JOB_USER_OVER_QUOTA, job_destination except AssertionError: pass # No history, should not happen with an anon user # Check total walltime limits if (state == JOB_READY and "delta" in self.app.job_config.limits.total_walltime): jobs_to_check = self.sa_session.query(model.Job).filter( model.Job.user_id == job.user.id, model.Job.update_time >= datetime.datetime.now() - datetime.timedelta( self.app.job_config.limits.total_walltime["window"] ), model.Job.state == 'ok' ).all() time_spent = datetime.timedelta(0) for job in jobs_to_check: # History is job.state_history started = None finished = None for history in sorted( job.state_history, key=lambda history: history.update_time): if history.state ==
"running": started = history.create_time elif history.state == "ok": finished = history.create_time time_spent += finished - started if time_spent > self.app.job_config.limits.total_walltime["delta"]: return JOB_USER_OVER_TOTAL_WALLTIME, job_destination return state, job_destination def __verify_in_memory_job_inputs(self, job): """ Perform the same checks that happen via SQL for in-memory managed jobs. """ if job.state == model.Job.states.DELETED: return JOB_DELETED elif job.state == model.Job.states.ERROR: return JOB_ADMIN_DELETED for dataset_assoc in job.input_datasets + job.input_library_datasets: idata = dataset_assoc.dataset if not idata: continue # don't run jobs for which the input dataset was deleted if idata.deleted: self.job_wrappers.pop(job.id, self.job_wrapper(job)).fail("input data %s (file: %s) was deleted before the job started" % (idata.hid, idata.file_name)) return JOB_INPUT_DELETED # an error in the input data causes us to bail immediately elif idata.state == idata.states.ERROR: self.job_wrappers.pop(job.id, self.job_wrapper(job)).fail("input data %s is in error state" % (idata.hid)) return JOB_INPUT_ERROR elif idata.state == idata.states.FAILED_METADATA: self.job_wrappers.pop(job.id, self.job_wrapper(job)).fail("input data %s failed to properly set metadata" % (idata.hid)) return JOB_INPUT_ERROR elif idata.state != idata.states.OK and not (idata.state == idata.states.SETTING_METADATA and job.tool_id is not None and job.tool_id == self.app.datatypes_registry.set_external_metadata_tool.id): # need to requeue return JOB_WAIT # All inputs ready to go. return None def __clear_job_count(self): self.user_job_count = None self.user_job_count_per_destination = None self.total_job_count_per_destination = None def get_user_job_count(self, user_id): self.__cache_user_job_count() # This could have been incremented by a previous job dispatched on this iteration, even if we're not caching rval = self.user_job_count.get(user_id, 0) if not self.app.config.cache_user_job_count: result = self.sa_session.execute(select([func.count(model.Job.table.c.id)]) .where(and_(model.Job.table.c.state.in_((model.Job.states.QUEUED, model.Job.states.RUNNING, model.Job.states.RESUBMITTED)), (model.Job.table.c.user_id == user_id)))) for row in result: # there should only be one row rval += row[0] return rval def __cache_user_job_count(self): # Cache the job count if necessary if self.user_job_count is None and self.app.config.cache_user_job_count: self.user_job_count = {} query = self.sa_session.execute(select([model.Job.table.c.user_id, func.count(model.Job.table.c.user_id)]) .where(and_(model.Job.table.c.state.in_((model.Job.states.QUEUED, model.Job.states.RUNNING, model.Job.states.RESUBMITTED)), (model.Job.table.c.user_id != null()))) .group_by(model.Job.table.c.user_id)) for row in query: self.user_job_count[row[0]] = row[1] elif self.user_job_count is None: self.user_job_count = {} def get_user_job_count_per_destination(self, user_id): self.__cache_user_job_count_per_destination() cached = self.user_job_count_per_destination.get(user_id, {}) if self.app.config.cache_user_job_count: rval = cached else: # The cached count is still used even when we're not caching, it is # incremented when a job is run by this handler to ensure that # multiple jobs can't get past the limits in one iteration of the # queue. 
rval = {} rval.update(cached) result = self.sa_session.execute(select([model.Job.table.c.destination_id, func.count(model.Job.table.c.destination_id).label('job_count')]) .where(and_(model.Job.table.c.state.in_((model.Job.states.QUEUED, model.Job.states.RUNNING)), (model.Job.table.c.user_id == user_id))) .group_by(model.Job.table.c.destination_id)) for row in result: # Add the count from the database to the cached count rval[row['destination_id']] = rval.get(row['destination_id'], 0) + row['job_count'] return rval def __cache_user_job_count_per_destination(self): # Cache the job count if necessary if self.user_job_count_per_destination is None and self.app.config.cache_user_job_count: self.user_job_count_per_destination = {} result = self.sa_session.execute(select([model.Job.table.c.user_id, model.Job.table.c.destination_id, func.count(model.Job.table.c.user_id).label('job_count')]) .where(and_(model.Job.table.c.state.in_((model.Job.states.QUEUED, model.Job.states.RUNNING)))) .group_by(model.Job.table.c.user_id, model.Job.table.c.destination_id)) for row in result: if row['user_id'] not in self.user_job_count_per_destination: self.user_job_count_per_destination[row['user_id']] = {} self.user_job_count_per_destination[row['user_id']][row['destination_id']] = row['job_count'] elif self.user_job_count_per_destination is None: self.user_job_count_per_destination = {} def increase_running_job_count(self, user_id, destination_id): if self.app.job_config.limits.registered_user_concurrent_jobs or \ self.app.job_config.limits.anonymous_user_concurrent_jobs or \ self.app.job_config.limits.destination_user_concurrent_jobs: if self.user_job_count is None: self.user_job_count = {} if self.user_job_count_per_destination is None: self.user_job_count_per_destination = {} self.user_job_count[user_id] = self.user_job_count.get(user_id, 0) + 1 if user_id not in self.user_job_count_per_destination: self.user_job_count_per_destination[user_id] = {} self.user_job_count_per_destination[user_id][destination_id] = self.user_job_count_per_destination[user_id].get(destination_id, 0) + 1 if self.app.job_config.limits.destination_total_concurrent_jobs: if self.total_job_count_per_destination is None: self.total_job_count_per_destination = {} self.total_job_count_per_destination[destination_id] = self.total_job_count_per_destination.get(destination_id, 0) + 1 def __check_user_jobs(self, job, job_wrapper): # TODO: Update output datasets' _state = LIMITED or some such new # state, so the UI can reflect what jobs are waiting due to concurrency # limits if job.user: # Check the hard limit first if self.app.job_config.limits.registered_user_concurrent_jobs: count = self.get_user_job_count(job.user_id) # Check the user's number of dispatched jobs against the overall limit if count >= self.app.job_config.limits.registered_user_concurrent_jobs: return JOB_WAIT # If we pass the hard limit, also check the per-destination count id = job_wrapper.job_destination.id count_per_id = self.get_user_job_count_per_destination(job.user_id) if id in self.app.job_config.limits.destination_user_concurrent_jobs: count = count_per_id.get(id, 0) # Check the user's number of dispatched jobs in the assigned destination id against the limit for that id if count >= self.app.job_config.limits.destination_user_concurrent_jobs[id]: return JOB_WAIT # If we pass the destination limit (if there is one), also check limits on any tags (if any) if job_wrapper.job_destination.tags: for tag in job_wrapper.job_destination.tags: # Check each tag for this job's 
destination if tag in self.app.job_config.limits.destination_user_concurrent_jobs: # Only if there's a limit defined for this tag count = 0 for id in [d.id for d in self.app.job_config.get_destinations(tag)]: # Add up the aggregate job total for this tag count += count_per_id.get(id, 0) if count >= self.app.job_config.limits.destination_user_concurrent_jobs[tag]: return JOB_WAIT elif job.galaxy_session: # Anonymous users only get the hard limit if self.app.job_config.limits.anonymous_user_concurrent_jobs: count = self.sa_session.query(model.Job).enable_eagerloads(False) \ .filter(and_(model.Job.session_id == job.galaxy_session.id, or_(model.Job.state == model.Job.states.RUNNING, model.Job.state == model.Job.states.QUEUED))).count() if count >= self.app.job_config.limits.anonymous_user_concurrent_jobs: return JOB_WAIT else: log.warning('Job %s is not associated with a user or session so job concurrency limit cannot be checked.' % job.id) return JOB_READY def __cache_total_job_count_per_destination(self): # Cache the job count if necessary if self.total_job_count_per_destination is None: self.total_job_count_per_destination = {} result = self.sa_session.execute(select([model.Job.table.c.destination_id, func.count(model.Job.table.c.destination_id).label('job_count')]) .where(and_(model.Job.table.c.state.in_((model.Job.states.QUEUED, model.Job.states.RUNNING)))) .group_by(model.Job.table.c.destination_id)) for row in result: self.total_job_count_per_destination[row['destination_id']] = row['job_count'] def get_total_job_count_per_destination(self): self.__cache_total_job_count_per_destination() # Always use caching (at worst a job will have to wait one iteration, # and this would be more fair anyway as it ensures FIFO scheduling, # insofar as FIFO would be fair...) 
return self.total_job_count_per_destination def __check_destination_jobs(self, job, job_wrapper): if self.app.job_config.limits.destination_total_concurrent_jobs: id = job_wrapper.job_destination.id count_per_id = self.get_total_job_count_per_destination() if id in self.app.job_config.limits.destination_total_concurrent_jobs: count = count_per_id.get(id, 0) # Check the number of dispatched jobs in the assigned destination id against the limit for that id if count >= self.app.job_config.limits.destination_total_concurrent_jobs[id]: return JOB_WAIT # If we pass the destination limit (if there is one), also check limits on any tags (if any) if job_wrapper.job_destination.tags: for tag in job_wrapper.job_destination.tags: # Check each tag for this job's destination if tag in self.app.job_config.limits.destination_total_concurrent_jobs: # Only if there's a limit defined for this tag count = 0 for id in [d.id for d in self.app.job_config.get_destinations(tag)]: # Add up the aggregate job total for this tag count += count_per_id.get(id, 0) if count >= self.app.job_config.limits.destination_total_concurrent_jobs[tag]: return JOB_WAIT return JOB_READY def _handle_setup_msg(self, job_id=None): job = self.sa_session.query(model.Job).get(job_id) if job.handler is None: job.handler = self.app.config.server_name self.sa_session.add(job) self.sa_session.flush() # If not tracking jobs in the database self.put(job.id, job.tool_id) else: log.warning("(%s) Handler '%s' received setup message but handler '%s' is already assigned, ignoring", job.id, self.app.config.server_name, job.handler) def put(self, job_id, tool_id): """Add a job to the queue (by job identifier)""" if not self.track_jobs_in_database: self.queue.put((job_id, tool_id)) self.sleeper.wake() else: # Workflow invocations farmed out to workers will submit jobs through here. If a handler is unassigned, we # will submit for one, or else claim it ourself. TODO: This should be moved to a higher level as it's now # implemented here and in MessageJobQueue job = self.sa_session.query(model.Job).get(job_id) if job.handler is None and self.app.application_stack.has_pool(self.app.application_stack.pools.JOB_HANDLERS): msg = JobHandlerMessage(task='setup', job_id=job_id) self.app.application_stack.send_message(self.app.application_stack.pools.JOB_HANDLERS, msg) def shutdown(self): """Attempts to gracefully shut down the worker thread""" if self.parent_pid != os.getpid(): # We're not the real job queue, do nothing return else: log.info("sending stop signal to worker thread") self.stop_monitoring() if not self.app.config.track_jobs_in_database: self.queue.put(self.STOP_SIGNAL) # A message could still be received while shutting down, should be ok since they will be picked up on next startup. self.app.application_stack.deregister_message_handler(name=JobHandlerMessage.target) self.sleeper.wake() self.shutdown_monitor() log.info("job handler queue stopped") self.dispatcher.shutdown()
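# --- Illustrative aside (not Galaxy code) ---
# A minimal, runnable sketch of the two queue idioms JobHandlerQueue relies on
# above: draining the queue with get_nowait() until Empty on each monitor pass,
# and shutting down by enqueueing a unique sentinel object that is compared by
# identity ("is"), so it can never collide with a real message. All names here
# are illustrative assumptions, not Galaxy's API.
import time
from queue import Empty, Queue
from threading import Thread

STOP_SIGNAL = object()

def monitor(q):
    while True:
        try:
            while True:  # pull everything currently queued in one pass
                message = q.get_nowait()
                if message is STOP_SIGNAL:
                    return  # clean shutdown, mirrors the STOP_SIGNAL check above
                job_id, tool_id = message
                print("would dispatch job %s (tool %s)" % (job_id, tool_id))
        except Empty:
            pass
        time.sleep(0.1)  # stand-in for _monitor_sleep(1)

q = Queue()
t = Thread(target=monitor, args=(q,), daemon=True)
t.start()
q.put((1, "cat1"))
q.put((2, "upload1"))
q.put(STOP_SIGNAL)
t.join()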
class ReadableProcess(object): def __init__(self, command): super(ReadableProcess, self).__init__() self._command = command self._out_pipe = StringIO() self._err_pipe = StringIO() self._stdout_t = None self._stderr_t = None self._process = None self._out_queue = None self._err_queue = None self._out_t = None self._err_t = None @property def returncode(self): if self._process: return self._process.returncode return 1 def poll(self): if self._process: return self._process.poll() return 1 def enqueue_output(self, out, queue): for line in iter(out.readline, b''): queue.put(line) out.close() def kill(self): if self._process: os.killpg(os.getpgid(self.pid()), signal.SIGTERM) self._process.terminate() def pid(self): if self._process: return self._process.pid def run(self): if self._process: raise ValueError("Already Ran") self._out_queue = Queue() self._err_queue = Queue() try: self._process = subprocess.Popen( self._command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, preexec_fn=os.setsid, ) self._stdout_t = Thread(target=self.enqueue_output, args=(self._process.stdout, self._out_queue)) self._stdout_t.daemon = True self._stderr_t = Thread(target=self.enqueue_output, args=(self._process.stderr, self._err_queue)) self._stderr_t.daemon = True self._stdout_t.start() self._stderr_t.start() except Exception: import traceback fmt = traceback.format_exc() for line in fmt.split("\n"): self._err_queue.put(line) def read_stdout_line(self): try: return self._out_queue.get_nowait() except Empty: return None def read_stdout_all(self): buf = [] line = self.read_stdout_line() while line and line is not None: buf.append(text_type(line)) line = self.read_stdout_line() time.sleep(0.1) return "\n".join(buf) def read_stderr_line(self): try: return self._err_queue.get_nowait() except Empty: return None def read_stderr_all(self): buf = [] line = self.read_stderr_line() while line and line is not None: buf.append(text_type(line)) line = self.read_stderr_line() time.sleep(0.1) return "\n".join(buf) def read_all(self): return self.read_stdout_all(), self.read_stderr_all()
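# --- Illustrative aside ---
# A hedged usage sketch for ReadableProcess above. It assumes the class's own
# imports (subprocess, os, signal, time, Thread, Queue/Empty, StringIO,
# text_type) are in scope and that we are on a POSIX system, since run() uses
# os.setsid and kill() uses os.killpg. The command is an arbitrary example.
import time

proc = ReadableProcess(["echo", "hello from the child"])
proc.run()
while proc.poll() is None:      # Popen.poll() returns None while running
    line = proc.read_stdout_line()
    if line:
        print("streamed:", line)
    time.sleep(0.05)
time.sleep(0.1)                 # give the reader threads a moment to flush
out, err = proc.read_all()      # drain whatever is left on both queues
print("stdout:", out)
print("stderr:", err)
print("exit code:", proc.returncode)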
class RabbitMQCrashStorage(CrashStorageBase): """This class is an implementation of a Socorro Crash Storage system. It is used as a crash queuing mechanism for raw crashes. It implements the save_raw_crash method as a queue submission function, and the new_crashes generator as a queue consumption function. Please note: as it only queues the crash_id and not the whole raw crash, it is not suitable for actually saving a crash. It is a very lossy container. This class should be used in conjunction with a more persistent storage mechanism. Implementations of the CrashStorage classes can use arbitrarily high or low level semantics to talk to their underlying resource. In the RabbitMQ implementation, queuing through the 'save_raw_crash' method is given full transactional semantics using the TransactionExecutor classes. The 'new_crashes' generator has a lower-level relationship with the underlying connection object.""" required_config = Namespace() required_config.add_option( 'rabbitmq_class', default=ConnectionContextPooled, # we choose a pooled connection # because we need thread safe # connection behaviors doc='the class responsible for connecting to RabbitMQ', reference_value_from='resource.rabbitmq', ) required_config.add_option( 'transaction_executor_class', default="socorro.database.transaction_executor." "TransactionExecutorWithInfiniteBackoff", doc='a class that will manage transactions', from_string_converter=class_converter, reference_value_from='resource.rabbitmq', ) required_config.add_option( 'routing_key', default='socorro.normal', doc='the name of the queue to receive crashes', reference_value_from='resource.rabbitmq', ) required_config.add_option( 'filter_on_legacy_processing', default=True, doc='toggle for using or ignoring the throttling flag', reference_value_from='resource.rabbitmq', ) required_config.add_option( 'throttle', default=100, doc='percentage of the time that rabbit will try to queue', reference_value_from='resource.rabbitmq', ) def __init__(self, config, namespace='', quit_check_callback=None): super(RabbitMQCrashStorage, self).__init__( config, namespace=namespace, quit_check_callback=quit_check_callback ) self.config = config # Note: this may continue to grow if we aren't acking certain UUIDs. # We should find a way to time out UUIDs after a certain time.
self.acknowledgement_token_cache = {} self.acknowledgment_queue = Queue() self.rabbitmq = config.rabbitmq_class(config) self.transaction = config.transaction_executor_class( config, self.rabbitmq, quit_check_callback=quit_check_callback ) # cache this object so we don't have to remake it for every transaction self._basic_properties = pika.BasicProperties( delivery_mode=2, # make message persistent ) if config.throttle == 100: self.dont_queue_this_crash = lambda: False else: self.dont_queue_this_crash = ( lambda: randint(1, 100) > config.throttle ) def save_raw_crash(self, raw_crash, dumps, crash_id): if self.dont_queue_this_crash(): self.config.logger.info( 'Crash %s filtered out of RabbitMQ queue %s', crash_id, self.config.routing_key ) return try: this_crash_should_be_queued = ( not self.config.filter_on_legacy_processing or raw_crash.legacy_processing == 0 ) except KeyError: self.config.logger.debug( 'RabbitMQCrashStorage legacy_processing key absent in crash ' '%s', crash_id ) return if this_crash_should_be_queued: self.config.logger.debug( 'RabbitMQCrashStorage saving crash %s', crash_id ) self.transaction(self._save_raw_crash_transaction, crash_id) return True else: self.config.logger.debug( 'RabbitMQCrashStorage not saving crash %s, legacy processing ' 'flag is %s', crash_id, raw_crash.legacy_processing ) def _save_raw_crash_transaction(self, connection, crash_id): connection.channel.basic_publish( exchange='', routing_key=self.config.routing_key, body=crash_id, properties=self._basic_properties ) def _basic_get_transaction(self, conn, queue): """reorganize the call to RabbitMQ basic_get so that it can be used by the transaction retry wrapper.""" things = conn.channel.basic_get(queue=queue) return things def new_crashes(self): """This generator fetches crash_ids from RabbitMQ.""" # We've set up RabbitMQ to require acknowledgement of processing of a # crash_id from this generator. It is the responsibility of the # consumer of the crash_id to tell this instance of the class when it # has completed its work on the crash_id. That is done with the call to # 'ack_crash' below. Because RabbitMQ connections are not thread safe, # only the thread that read the crash may acknowledge it. 'ack_crash' # queues the crash_id. The '_consume_acknowledgement_queue' function # is run to send acknowledgments back to RabbitMQ self._consume_acknowledgement_queue() queues = [ self.rabbitmq.config.priority_queue_name, self.rabbitmq.config.standard_queue_name, self.rabbitmq.config.reprocessing_queue_name, self.rabbitmq.config.priority_queue_name, ] while True: for queue in queues: method_frame, header_frame, body = self.transaction( self._basic_get_transaction, queue=queue ) if method_frame and self._suppress_duplicate_jobs( body, method_frame ): continue if method_frame: break # must consume ack queue before testing for end of iterator # or the last job won't get ack'd self._consume_acknowledgement_queue() if not method_frame: # there was nothing in the queue - leave the iterator return self.acknowledgement_token_cache[body] = method_frame yield body queues.reverse() def ack_crash(self, crash_id): self.acknowledgment_queue.put(crash_id) def _suppress_duplicate_jobs(self, crash_id, acknowledgement_token): """if this crash is in the cache, then it is already in progress and this is a duplicate.
Acknowledge it, then return True to let the caller know to skip on to the next crash.""" if crash_id in self.acknowledgement_token_cache: # reject this crash - it's already being processed self.config.logger.info( 'duplicate job: %s is already in progress', crash_id ) # ack this self.transaction( self._transaction_ack_crash, crash_id, acknowledgement_token ) return True return False def _consume_acknowledgement_queue(self): """The acknowledgement of the processing of each crash_id yielded from the 'new_crashes' method must take place on the same connection that the crash_id came from. The crash_ids are queued in the 'acknowledgment_queue'. That queue is consumed by the QueuingThread""" try: while True: crash_id_to_be_acknowledged = \ self.acknowledgment_queue.get_nowait() # self.config.logger.debug( # 'RabbitMQCrashStorage set to acknowledge %s', # crash_id_to_be_acknowledged # ) try: acknowledgement_token = \ self.acknowledgement_token_cache[ crash_id_to_be_acknowledged ] self.transaction( self._transaction_ack_crash, crash_id_to_be_acknowledged, acknowledgement_token ) del self.acknowledgement_token_cache[ crash_id_to_be_acknowledged ] except KeyError: self.config.logger.warning( 'RabbitMQCrashStorage tried to acknowledge crash %s' ', which was not in the cache', crash_id_to_be_acknowledged, exc_info=True ) except Exception: self.config.logger.error( 'RabbitMQCrashStorage unexpected failure on %s', crash_id_to_be_acknowledged, exc_info=True ) except Empty: pass # nothing to do with an empty queue def _transaction_ack_crash( self, connection, crash_id, acknowledgement_token ): connection.channel.basic_ack( delivery_tag=acknowledgement_token.delivery_tag ) self.config.logger.debug( 'RabbitMQCrashStorage acking %s with delivery_tag %s', crash_id, acknowledgement_token.delivery_tag )
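# --- Illustrative aside (independent of Socorro/pika) ---
# A minimal sketch of the pattern new_crashes()/ack_crash() implement above:
# RabbitMQ deliveries may only be acknowledged on the connection (thread) they
# were read from, so other threads push finished ids onto a thread-safe Queue
# and the reading thread drains it and performs the actual acks. The channel
# object here is a stand-in assumption, not pika's API.
from queue import Empty, Queue

class SameThreadAcker(object):
    def __init__(self, channel):
        self.channel = channel    # must only be touched by the owning thread
        self.tokens = {}          # crash_id -> delivery token
        self.ack_queue = Queue()  # safe to fill from any thread

    def ack_later(self, crash_id):
        self.ack_queue.put(crash_id)

    def consume_acks(self):
        while True:
            try:
                crash_id = self.ack_queue.get_nowait()
            except Empty:
                return
            token = self.tokens.pop(crash_id, None)
            if token is not None:
                self.channel.basic_ack(delivery_tag=token)

class FakeChannel(object):
    def basic_ack(self, delivery_tag):
        print("acked delivery_tag", delivery_tag)

acker = SameThreadAcker(FakeChannel())
acker.tokens["crash-1"] = 42  # recorded when the delivery was read
acker.ack_later("crash-1")    # e.g. called from a worker thread
acker.consume_acks()          # run on the connection's own thread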
class TaskQueue(ChainedIdentity): """ A class for managing async tasks. """ def __init__(self, worker_pool=None, error_handler=None, flush_timeout_seconds=None, **kwargs): """ :param worker_pool: Thread pool for executing tasks :type worker_pool: concurrent.futures.ThreadPoolExecutor :param error_handler: Extension point for processing error queue items :type error_handler: function(error, logging.Logger) :param flush_timeout_seconds: Task flush timeout in seconds :type flush_timeout_seconds: int """ super(TaskQueue, self).__init__(**kwargs) self._tasks = Queue() self._results = [] # For right now, we don't need a queue for errors, but it's # probable that we'll want the error handler looping on the queue thread self._errors = [] self._err_handler = error_handler self._worker_pool = worker_pool if worker_pool is not None else WorkerPool(_parent_logger=self._logger) self._task_number = 0 self._flush_timeout_seconds = DEFAULT_FLUSH_TIMEOUT_SECONDS if flush_timeout_seconds: self._flush_timeout_seconds = flush_timeout_seconds self._logger.debug('Overriding default timeout to {}'.format(flush_timeout_seconds)) def __enter__(self): self._logger.debug("[Start]") return self def __exit__(self, *args): self._logger.debug("[Stop] - waiting default timeout") self.flush(self.identity) # TODO: Adding functions with this method needs to be more configurable def add(self, func, *args, **kwargs): """ :param func: Function to be executed asynchronously :type func: builtin.function """ future = self.create_future(func, *args, **kwargs) ident = "{}_{}".format(self._tasks.qsize(), func.__name__) task = AsyncTask(future, _ident=ident, _parent_logger=self._logger) self.add_task(task) return task def add_task(self, async_task): """Blocking (no timeout) add of a task to the queue. :param async_task: asynchronous task to be added to the queue and possibly processed :type async_task: azureml._async.AsyncTask """ if not isinstance(async_task, AsyncTask): raise ValueError("Can only add AsyncTask, got {0}".format(type(async_task))) self._logger.debug("Adding task {0} to queue of approximate size: {1}".format(async_task.ident, self._tasks.qsize())) self._tasks.put(async_task) def create_future(self, func, *args, **kwargs): return self._worker_pool.submit(func, *args, **kwargs) def flush(self, source, timeout_seconds=None): with self._log_context("WaitFlushSource:{}".format(source)) as log_context: if timeout_seconds is None: log_context.debug("Overriding default flush timeout from None to {}". format(self._flush_timeout_seconds)) timeout_seconds = self._flush_timeout_seconds else: log_context.debug("flush timeout {} is different from task queue timeout {}, using flush timeout". format(timeout_seconds, self._flush_timeout_seconds)) start_time = time.time() # Take tasks off of the queue tasks_to_wait = [] while True: try: tasks_to_wait.append(self._tasks.get_nowait()) except Empty: break message = "" timeout_time = start_time + timeout_seconds log_context.debug("Waiting {} seconds on tasks: {}.".format(timeout_seconds, tasks_to_wait)) not_done = True while not_done and time.time() <= timeout_time: completed_tasks = [task for task in tasks_to_wait if task.done()] tasks_to_wait = [task for task in tasks_to_wait if not task.done()] not_done = len(tasks_to_wait) != 0 self._results.extend((task.wait(awaiter_name=self.identity) for task in completed_tasks)) if not_done: for task in tasks_to_wait: message += "Waiting on task: {}.\n".format(task.ident) message += "{} tasks left.
Current duration of flush {} seconds.\n".format( len(tasks_to_wait), time.time() - start_time) time.sleep(.25) self._logger.debug(message) # Reach this case on timeout if not_done: azureml_error = AzureMLError.create( FlushTaskTimeout, timeout_seconds=timeout_seconds ) raise AzureMLException._with_error(azureml_error) @property def results(self): for result in self._results: yield result def errors(self): for error in self._errors: yield error
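# --- Illustrative aside ---
# A condensed, library-agnostic sketch of what TaskQueue.flush() above does:
# empty the queue with get_nowait(), then poll the outstanding futures until
# either all complete or a deadline passes. This uses concurrent.futures
# directly; the names are illustrative, not azureml's.
import time
from concurrent.futures import ThreadPoolExecutor
from queue import Empty, Queue

def flush(task_queue, timeout_seconds):
    pending = []
    while True:  # take every queued future off the queue first
        try:
            pending.append(task_queue.get_nowait())
        except Empty:
            break
    deadline = time.time() + timeout_seconds
    while pending and time.time() <= deadline:
        pending = [f for f in pending if not f.done()]
        time.sleep(0.05)
    return not pending  # True if everything finished before the deadline

pool = ThreadPoolExecutor(max_workers=2)
tasks = Queue()
for delay in (0.1, 0.2):
    tasks.put(pool.submit(time.sleep, delay))
print("flushed cleanly:", flush(tasks, timeout_seconds=2.0))
pool.shutdown()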
class JobHandlerStopQueue(Monitors): """ A queue for jobs which need to be terminated prematurely. """ STOP_SIGNAL = object() def __init__(self, app, dispatcher): self.app = app self.dispatcher = dispatcher self.sa_session = app.model.context # Keep track of the pid that started the job manager, only it # has valid threads self.parent_pid = os.getpid() # Contains new jobs. Note this is not used if track_jobs_in_database is True self.queue = Queue() # Contains jobs that are waiting (only use from monitor thread) self.waiting = [] name = "JobHandlerStopQueue.monitor_thread" self._init_monitor_thread(name, config=app.config) def start(self): # Start the queue self.monitor_thread.start() log.info("job handler stop queue started") def monitor(self): """ Continually iterate the waiting jobs, stopping any that are found. """ # HACK: Delay until after forking, we need a way to do post fork notification!!! time.sleep(10) while self.monitor_running: try: self.monitor_step() except Exception: log.exception("Exception in monitor_step") # Sleep self._monitor_sleep(1) def monitor_step(self): """ Called repeatedly by `monitor` to stop jobs. """ # Pull all new jobs from the queue at once jobs_to_check = [] if self.app.config.track_jobs_in_database: # Clear the session so we get fresh states for job and all datasets self.sa_session.expunge_all() # Fetch all new jobs newly_deleted_jobs = self.sa_session.query(model.Job).enable_eagerloads(False) \ .filter((model.Job.state == model.Job.states.DELETED_NEW) & (model.Job.handler == self.app.config.server_name)).all() for job in newly_deleted_jobs: jobs_to_check.append((job, job.stderr)) # Also pull from the queue (in the case of administratively stopped jobs) try: while 1: message = self.queue.get_nowait() if message is self.STOP_SIGNAL: return # Unpack the message job_id, error_msg = message # Get the job object and append to watch queue jobs_to_check.append( (self.sa_session.query(model.Job).get(job_id), error_msg)) except Empty: pass for job, error_msg in jobs_to_check: if (job.state not in (job.states.DELETED_NEW, job.states.DELETED) and job.finished): # terminated before it got here log.debug('Job %s already finished, not deleting or stopping', job.id) continue final_state = job.states.DELETED if error_msg is not None: final_state = job.states.ERROR job.info = error_msg job.set_final_state(final_state) self.sa_session.add(job) self.sa_session.flush() if job.job_runner_name is not None: # tell the dispatcher to stop the job self.dispatcher.stop(job) def put(self, job_id, error_msg=None): if not self.app.config.track_jobs_in_database: self.queue.put((job_id, error_msg)) def shutdown(self): """Attempts to gracefully shut down the worker thread""" if self.parent_pid != os.getpid(): # We're not the real job queue, do nothing return else: log.info("sending stop signal to worker thread") self.stop_monitoring() if not self.app.config.track_jobs_in_database: self.queue.put(self.STOP_SIGNAL) self.shutdown_monitor() log.info("job handler stop queue stopped")
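# --- Illustrative aside ---
# The Monitors base class that both handler queues inherit from is not shown
# in this collection; based only on how it is used above (_init_monitor_thread,
# monitor_running, _monitor_sleep, stop_monitoring, shutdown_monitor), a
# plausible minimal shape might look like this. This is an assumption-laden
# sketch, not Galaxy's actual implementation.
import threading

class MinimalMonitors(object):
    def _init_monitor_thread(self, name, target=None, config=None):
        self._stop_event = threading.Event()
        self.monitor_thread = threading.Thread(
            name=name, target=target or self.monitor, daemon=True)

    @property
    def monitor_running(self):
        return not self._stop_event.is_set()

    def _monitor_sleep(self, seconds):
        # Event.wait() returns early once stop_monitoring() sets the event,
        # so shutdown is not delayed by a full sleep interval.
        self._stop_event.wait(seconds)

    def stop_monitoring(self):
        self._stop_event.set()

    def shutdown_monitor(self):
        self.monitor_thread.join(timeout=5)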
for this job's destination if tag in self.app.job_config.limits.destination_user_concurrent_jobs: # Only if there's a limit defined for this tag count = 0 for id in [ d.id for d in self.app.job_config.get_destinations(tag) ]: # Add up the aggregate job total for this tag count += count_per_id.get(id, 0) if count >= self.app.job_config.limits.destination_user_concurrent_jobs[ tag]: return JOB_WAIT elif job.galaxy_session: # Anonymous users only get the hard limit if self.app.job_config.limits.anonymous_user_concurrent_jobs: count = self.sa_session.query(model.Job).enable_eagerloads(False) \ .filter(and_(model.Job.session_id == job.galaxy_session.id, or_(model.Job.state == model.Job.states.RUNNING, model.Job.state == model.Job.states.QUEUED))).count() if count >= self.app.job_config.limits.anonymous_user_concurrent_jobs: return JOB_WAIT else: log.warning( 'Job %s is not associated with a user or session so job concurrency limit cannot be checked.' % job.id) return JOB_READY def __cache_total_job_count_per_destination(self): # Cache the job count if necessary if self.total_job_count_per_destination is None: self.total_job_count_per_destination = {} result = self.sa_session.execute( select([ model.Job.table.c.destination_id, func.count( model.Job.table.c.destination_id).label('job_count') ]).where( and_( model.Job.table.c.state.in_( (model.Job.states.QUEUED, model.Job.states.RUNNING)))).group_by( model.Job.table.c.destination_id)) for row in result: self.total_job_count_per_destination[ row['destination_id']] = row['job_count'] def get_total_job_count_per_destination(self): self.__cache_total_job_count_per_destination() # Always use caching (at worst a job will have to wait one iteration, # and this would be more fair anyway as it ensures FIFO scheduling, # insofar as FIFO would be fair...) 
return self.total_job_count_per_destination def __check_destination_jobs(self, job, job_wrapper): if self.app.job_config.limits.destination_total_concurrent_jobs: id = job_wrapper.job_destination.id count_per_id = self.get_total_job_count_per_destination() if id in self.app.job_config.limits.destination_total_concurrent_jobs: count = count_per_id.get(id, 0) # Check the number of dispatched jobs in the assigned destination id against the limit for that id if count >= self.app.job_config.limits.destination_total_concurrent_jobs[ id]: return JOB_WAIT # If we pass the destination limit (if there is one), also check limits on any tags (if any) if job_wrapper.job_destination.tags: for tag in job_wrapper.job_destination.tags: # Check each tag for this job's destination if tag in self.app.job_config.limits.destination_total_concurrent_jobs: # Only if there's a limit defined for this tag count = 0 for id in [ d.id for d in self.app.job_config.get_destinations(tag) ]: # Add up the aggregate job total for this tag count += count_per_id.get(id, 0) if count >= self.app.job_config.limits.destination_total_concurrent_jobs[ tag]: return JOB_WAIT return JOB_READY def _handle_setup_msg(self, job_id=None): job = self.sa_session.query(model.Job).get(job_id) if job.handler is None: job.handler = self.app.config.server_name self.sa_session.add(job) self.sa_session.flush() # If not tracking jobs in the database self.put(job.id, job.tool_id) else: log.warning( "(%s) Handler '%s' received setup message but handler '%s' is already assigned, ignoring", job.id, self.app.config.server_name, job.handler) def put(self, job_id, tool_id): """Add a job to the queue (by job identifier)""" if not self.track_jobs_in_database: self.queue.put((job_id, tool_id)) self.sleeper.wake() else: # Workflow invocations farmed out to workers will submit jobs through here. If a handler is unassigned, we # will submit for one, or else claim it ourself. TODO: This should be moved to a higher level as it's now # implemented here and in MessageJobQueue job = self.sa_session.query(model.Job).get(job_id) if job.handler is None and self.app.application_stack.has_pool( self.app.application_stack.pools.JOB_HANDLERS): msg = JobHandlerMessage(task='setup', job_id=job_id) self.app.application_stack.send_message( self.app.application_stack.pools.JOB_HANDLERS, msg) def shutdown(self): """Attempts to gracefully shut down the worker thread""" if self.parent_pid != os.getpid(): # We're not the real job queue, do nothing return else: log.info("sending stop signal to worker thread") self.stop_monitoring() if not self.app.config.track_jobs_in_database: self.queue.put(self.STOP_SIGNAL) # A message could still be received while shutting down, should be ok since they will be picked up on next startup. self.app.application_stack.deregister_message_handler( name=JobHandlerMessage.target) self.sleeper.wake() self.shutdown_monitor() log.info("job handler queue stopped") self.dispatcher.shutdown()
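Stripped of the Galaxy-specific context, the limit-checking pattern above is: load the per-user job counts once per scheduling pass, and bump the cached count on every dispatch so that later jobs in the same pass already see it and cannot slip past the limit together. A minimal sketch of that pattern; all names here (LimitChecker, load_counts, try_dispatch) are invented for illustration, not Galaxy's API:

class LimitChecker:
    def __init__(self, per_user_limit):
        self.per_user_limit = per_user_limit
        self._counts = None  # cached for the current scheduling pass

    def begin_pass(self, load_counts):
        # load_counts() would be a single grouped COUNT(*) query per pass
        self._counts = load_counts()

    def try_dispatch(self, user_id):
        count = self._counts.get(user_id, 0)
        if count >= self.per_user_limit:
            return False  # the JOB_WAIT case above
        # mirror increase_running_job_count(): later jobs in this pass see it
        self._counts[user_id] = count + 1
        return True

checker = LimitChecker(per_user_limit=2)
checker.begin_pass(lambda: {"alice": 1})
assert checker.try_dispatch("alice")      # 1 -> 2, allowed
assert not checker.try_dispatch("alice")  # at the limit, must wait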
class JobHandlerStopQueue(Monitors): """ A queue for jobs which need to be terminated prematurely. """ STOP_SIGNAL = object() def __init__(self, app, dispatcher): self.app = app self.dispatcher = dispatcher self.sa_session = app.model.context # Keep track of the pid that started the job manager, only it # has valid threads self.parent_pid = os.getpid() # Contains new jobs. Note this is not used if track_jobs_in_database is True self.queue = Queue() # Contains jobs that are waiting (only use from monitor thread) self.waiting = [] name = "JobHandlerStopQueue.monitor_thread" self._init_monitor_thread(name, config=app.config) log.info("job handler stop queue started") def start(self): # Start the queue self.monitor_thread.start() log.info("job handler stop queue started") def monitor(self): """ Continually iterate the waiting jobs, stop any that are found. """ # HACK: Delay until after forking, we need a way to do post fork notification!!! time.sleep(10) while self.monitor_running: try: self.monitor_step() except Exception: log.exception("Exception in monitor_step") # Sleep self._monitor_sleep(1) def monitor_step(self): """ Called repeatedly by `monitor` to stop jobs. """ # Pull all new jobs from the queue at once jobs_to_check = [] if self.app.config.track_jobs_in_database: # Clear the session so we get fresh states for job and all datasets self.sa_session.expunge_all() # Fetch all new jobs newly_deleted_jobs = self.sa_session.query(model.Job).enable_eagerloads(False) \ .filter((model.Job.state == model.Job.states.DELETED_NEW) & (model.Job.handler == self.app.config.server_name)).all() for job in newly_deleted_jobs: jobs_to_check.append((job, job.stderr)) # Also pull from the queue (in the case of Administrative stopped jobs) try: while 1: message = self.queue.get_nowait() if message is self.STOP_SIGNAL: return # Unpack the message job_id, error_msg = message # Get the job object and append to watch queue jobs_to_check.append((self.sa_session.query(model.Job).get(job_id), error_msg)) except Empty: pass for job, error_msg in jobs_to_check: if (job.state not in (job.states.DELETED_NEW, job.states.DELETED) and job.finished): # terminated before it got here log.debug('Job %s already finished, not deleting or stopping', job.id) continue final_state = job.states.DELETED if error_msg is not None: final_state = job.states.ERROR job.info = error_msg job.set_final_state(final_state) self.sa_session.add(job) self.sa_session.flush() if job.job_runner_name is not None: # tell the dispatcher to stop the job self.dispatcher.stop(job) def put(self, job_id, error_msg=None): if not self.app.config.track_jobs_in_database: self.queue.put((job_id, error_msg)) def shutdown(self): """Attempts to gracefully shut down the worker thread""" if self.parent_pid != os.getpid(): # We're not the real job queue, do nothing return else: log.info("sending stop signal to worker thread") self.stop_monitoring() if not self.app.config.track_jobs_in_database: self.queue.put(self.STOP_SIGNAL) self.shutdown_monitor() log.info("job handler stop queue stopped")
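The drain loop in monitor_step above is a common Queue idiom: pull every pending message with get_nowait(), treat a shared sentinel object as the shutdown signal, and stop on Empty once the queue is drained. A self-contained sketch, assuming the same (job_id, error_msg) message shape as put() above:

from queue import Queue, Empty

STOP_SIGNAL = object()

def drain(q):
    """Return (pending messages, whether a stop was requested)."""
    items = []
    try:
        while True:
            message = q.get_nowait()
            if message is STOP_SIGNAL:
                return items, True
            items.append(message)
    except Empty:
        return items, False

q = Queue()
q.put((42, None))        # (job_id, error_msg)
q.put(STOP_SIGNAL)
print(drain(q))          # ([(42, None)], True)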
class ThreadedFifoBuffer(FifoBuffer): """ FIFO-in-memory connection inside a dedicated thread. This is external-IO usable for Moler since it has its own runner (thread) that can work in the background and pull data from the FIFO-mem connection. Usable for integration tests. """ def __init__(self, moler_connection, echo=True, name=None, logger_name=""): """Initialization of FIFO-mem-threaded connection.""" super(ThreadedFifoBuffer, self).__init__(moler_connection=moler_connection, echo=echo, name=name, logger_name=logger_name) self.pulling_thread = None self.injections = Queue() def open(self): """Start thread pulling data from FIFO buffer.""" ret = super(ThreadedFifoBuffer, self).open() done = threading.Event() self.pulling_thread = TillDoneThread(target=self.pull_data, done_event=done, kwargs={'pulling_done': done}) self.pulling_thread.start() self._log(msg="open {}".format(self), level=logging.INFO) self._notify_on_connect() return ret def close(self): """Stop pulling thread.""" if self.pulling_thread: self.pulling_thread.join() self.pulling_thread = None super(ThreadedFifoBuffer, self).close() self._log(msg="closed {}".format(self), level=logging.INFO) self._notify_on_disconnect() def inject(self, input_bytes, delay=0.0): """ Add bytes to the end of the buffer :param input_bytes: iterable of bytes to inject :param delay: delay before each inject :return: None """ for data in input_bytes: self.injections.put((data, delay)) if not delay: time.sleep(0.05) # give subsequent read() a chance to get data def _inject_deferred(self): if self.deferred_injections: for data, delay in self.deferred_injections: self.injections.put((data, delay)) self.deferred_injections = [] time.sleep(0.05) # give subsequent read() a chance to get data def pull_data(self, pulling_done): """Pull data from FIFO buffer.""" while not pulling_done.is_set(): self.read() # internally forwards to embedded Moler connection try: data, delay = self.injections.get_nowait() if delay: time.sleep(delay) self._inject(data) self.injections.task_done() except Empty: time.sleep(0.01) # give FIFO a chance to get data
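A stand-alone sketch of the pull_data() loop above: a worker thread keeps draining the injection queue until a done Event is set. TillDoneThread is Moler-specific, so a plain Thread plus Event stands in for it here, and a list stands in for self._inject():

import threading
import time
from queue import Queue, Empty

def pull_data(injections, pulling_done, sink):
    while not pulling_done.is_set():
        try:
            data, delay = injections.get_nowait()
            if delay:
                time.sleep(delay)
            sink.append(data)   # stands in for self._inject(data)
        except Empty:
            time.sleep(0.01)    # give the producer a chance

injections = Queue()
done = threading.Event()
sink = []
thread = threading.Thread(target=pull_data, args=(injections, done, sink))
thread.start()
injections.put((b"hello", 0.0))
time.sleep(0.1)
done.set()
thread.join()
print(sink)                     # [b'hello']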
class AsynchronousJobRunner(BaseJobRunner, Monitors): """Parent class for any job runner that runs jobs asynchronously (e.g. via a distributed resource manager). Provides general methods for having a thread to monitor the state of asynchronous jobs and submitting those jobs to the correct methods (queue, finish, cleanup) at appropriate times. """ def __init__(self, app, nworkers, **kwargs): super(AsynchronousJobRunner, self).__init__(app, nworkers, **kwargs) # 'watched' and 'queue' are both used to keep track of jobs to watch. # 'queue' is used to add new watched jobs, and can be called from # any thread (usually by the 'queue_job' method). 'watched' must only # be modified by the monitor thread, which will move items from 'queue' # to 'watched' and then manage the watched jobs. self.watched = [] self.monitor_queue = Queue() def _init_monitor_thread(self): name = "%s.monitor_thread" % self.runner_name super(AsynchronousJobRunner, self)._init_monitor_thread(name=name, target=self.monitor, start=True, config=self.app.config) def handle_stop(self): # DRMAA and SGE runners should override this and disconnect. pass def monitor(self): """ Watches jobs currently in the monitor queue and deals with state changes (queued to running) and job completion. """ while True: # Take any new watched jobs and put them on the monitor list try: while True: async_job_state = self.monitor_queue.get_nowait() if async_job_state is STOP_SIGNAL: # TODO: This is where any cleanup would occur self.handle_stop() return self.watched.append(async_job_state) except Empty: pass # Iterate over the list of watched jobs and check state try: self.check_watched_items() except Exception: log.exception('Unhandled exception checking active jobs') # Sleep a bit before the next state check time.sleep(1) def monitor_job(self, job_state): self.monitor_queue.put(job_state) def shutdown(self): """Attempts to gracefully shut down the monitor thread""" log.info("%s: Sending stop signal to monitor thread" % self.runner_name) self.monitor_queue.put(STOP_SIGNAL) # Call the parent's shutdown method to stop workers self.shutdown_monitor() super(AsynchronousJobRunner, self).shutdown() def check_watched_items(self): """ This method is responsible for iterating over self.watched and handling state changes and updating self.watched with a new list of watched job states. Subclasses can opt to override this directly (as older job runners will initially) or just override check_watched_item and allow the list processing to reuse the logic here. """ new_watched = [] for async_job_state in self.watched: new_async_job_state = self.check_watched_item(async_job_state) if new_async_job_state: new_watched.append(new_async_job_state) self.watched = new_watched # Subclasses should implement this unless they override check_watched_items altogether. def check_watched_item(self, job_state): raise NotImplementedError() def finish_job(self, job_state): """ Get the output/error for a finished job, pass to `job_wrapper.finish` and clean up all the job's temporary files.
""" galaxy_id_tag = job_state.job_wrapper.get_id_tag() external_job_id = job_state.job_id # To ensure that files below are readable, ownership must be reclaimed first job_state.job_wrapper.reclaim_ownership() # wait for the files to appear which_try = 0 collect_output_success = True while which_try < self.app.config.retry_job_output_collection + 1: try: with open(job_state.output_file, "rb") as stdout_file, open(job_state.error_file, 'rb') as stderr_file: stdout = shrink_stream_by_size( stdout_file, DATABASE_MAX_STRING_SIZE, join_by="\n..\n", left_larger=True, beginning_on_size_error=True) stderr = shrink_stream_by_size( stderr_file, DATABASE_MAX_STRING_SIZE, join_by="\n..\n", left_larger=True, beginning_on_size_error=True) break except Exception as e: if which_try == self.app.config.retry_job_output_collection: stdout = '' stderr = job_state.runner_states.JOB_OUTPUT_NOT_RETURNED_FROM_CLUSTER log.error('(%s/%s) %s: %s' % (galaxy_id_tag, external_job_id, stderr, str(e))) collect_output_success = False else: time.sleep(1) which_try += 1 if not collect_output_success: job_state.fail_message = stderr job_state.runner_state = job_state.runner_states.JOB_OUTPUT_NOT_RETURNED_FROM_CLUSTER self.mark_as_failed(job_state) return try: # This should be an 8-bit exit code, but read ahead anyway: exit_code_str = open(job_state.exit_code_file, "r").read(32) except Exception: # By default, the exit code is 0, which typically indicates success. exit_code_str = "0" try: # Decode the exit code. If it's bogus, then just use 0. exit_code = int(exit_code_str) except ValueError: log.warning("(%s/%s) Exit code '%s' invalid. Using 0." % (galaxy_id_tag, external_job_id, exit_code_str)) exit_code = 0 # clean up the job files cleanup_job = job_state.job_wrapper.cleanup_job if cleanup_job == "always" or (not stderr and cleanup_job == "onsuccess"): job_state.cleanup() try: self._finish_or_resubmit_job(job_state, stdout, stderr, exit_code) except Exception: log.exception("(%s/%s) Job wrapper finish method failed" % (galaxy_id_tag, external_job_id)) job_state.job_wrapper.fail("Unable to finish job", exception=True) def mark_as_finished(self, job_state): self.work_queue.put((self.finish_job, job_state)) def mark_as_failed(self, job_state): self.work_queue.put((self.fail_job, job_state))
class SingleMachineBatchSystem(BatchSystemSupport): """ The interface for running jobs on a single machine, runs all the jobs you give it as they come in, but in parallel. Uses a single "daddy" thread to manage a fleet of child processes. Communication with the daddy thread happens via two queues: one queue of jobs waiting to be run (the input queue), and one queue of jobs that are finished/stopped and need to be returned by getUpdatedBatchJob (the output queue). When the batch system is shut down, the daddy thread is stopped. If running in debug-worker mode, jobs are run immediately as they are sent to the batch system, in the sending thread, and the daddy thread is not run. But the queues are still used. """ @classmethod def supportsAutoDeployment(cls): return False @classmethod def supportsWorkerCleanup(cls): return True numCores = cpu_count() minCores = 0.1 """ The minimal fractional CPU. Tasks with a smaller core requirement will be rounded up to this value. """ physicalMemory = toil.physicalMemory() def __init__(self, config, maxCores, maxMemory, maxDisk): # Limit to the smaller of the user-imposed limit and what we actually # have on this machine for each resource. # # If we don't have up to the limit of the resource (and the resource # isn't the unlimited sentinel), warn. if maxCores > self.numCores: if maxCores != sys.maxsize: # We have an actually specified limit and not the default log.warning( 'Not enough cores! User limited to %i but we only have %i.', maxCores, self.numCores) maxCores = self.numCores if maxMemory > self.physicalMemory: if maxMemory != sys.maxsize: # We have an actually specified limit and not the default log.warning( 'Not enough memory! User limited to %i bytes but we only have %i bytes.', maxMemory, self.physicalMemory) maxMemory = self.physicalMemory self.physicalDisk = toil.physicalDisk(config) if maxDisk > self.physicalDisk: if maxDisk != sys.maxsize: # We have an actually specified limit and not the default log.warning( 'Not enough disk space! User limited to %i bytes but we only have %i bytes.', maxDisk, self.physicalDisk) maxDisk = self.physicalDisk super(SingleMachineBatchSystem, self).__init__(config, maxCores, maxMemory, maxDisk) assert self.maxCores >= self.minCores assert self.maxMemory >= 1 # The scale allows the user to apply a factor to each task's cores requirement, thereby # squeezing more tasks onto each core (scale < 1) or stretching tasks over more cores # (scale > 1). self.scale = config.scale if config.badWorker > 0 and config.debugWorker: # We can't throw SIGUSR1 at the worker because it is also going to # be the leader and/or test harness. raise RuntimeError( "Cannot use badWorker and debugWorker together; " "worker would have to kill the leader") self.debugWorker = config.debugWorker # A counter to generate job IDs and a lock to guard it self.jobIndex = 0 self.jobIndexLock = Lock() # A dictionary mapping IDs of submitted jobs to the command line self.jobs = {} """ :type: dict[str,toil.job.JobNode] """ # A queue of jobs waiting to be executed. Consumed by the daddy thread. self.inputQueue = Queue() # A queue of finished jobs. Produced by the daddy thread. self.outputQueue = Queue() # A dictionary mapping IDs of currently running jobs to their Info objects self.runningJobs = {} """ :type: dict[str,Info] """ # These next two are only used outside debug-worker mode # A dict mapping PIDs to Popen objects for running jobs. # Jobs that don't fork are executed one at a time in the main thread.
self.children = {} """ :type: dict[int,subprocess.Popen] """ # A dict mapping child PIDs to the Job IDs they are supposed to be running. self.childToJob = {} """ :type: dict[int,str] """ # A pool representing available CPU in units of minCores self.coreFractions = ResourcePool( int(old_div(self.maxCores, self.minCores)), 'cores') # A pool representing available memory in bytes self.memory = ResourcePool(self.maxMemory, 'memory') # A pool representing the available space in bytes self.disk = ResourcePool(self.maxDisk, 'disk') # If we can't schedule something, we fill this in with a reason why self.schedulingStatusMessage = None # We use this event to signal shutdown self.shuttingDown = Event() # A thread in charge of managing all our child processes. # Also takes care of resource accounting. self.daddyThread = None # If it breaks it will fill this in self.daddyException = None if self.debugWorker: log.debug('Started in worker debug mode.') else: self.daddyThread = Thread(target=self.daddy, daemon=True) self.daddyThread.start() log.debug('Started in normal mode.') def daddy(self): """ Be the "daddy" thread. Our job is to look at jobs from the input queue. If a job fits in the available resources, we allocate resources for it and kick off a child process. We also check on our children. When a child finishes, we reap it, release its resources, and put its information in the output queue. """ try: log.debug('Started daddy thread.') while not self.shuttingDown.is_set(): # Main loop while not self.shuttingDown.is_set(): # Try to start as many jobs as we can try to start try: # Grab something from the input queue if available. args = self.inputQueue.get_nowait() jobCommand, jobID, jobCores, jobMemory, jobDisk, environment = args coreFractions = int(old_div(jobCores, self.minCores)) # Try to start the child result = self._startChild(jobCommand, jobID, coreFractions, jobMemory, jobDisk, environment) if result is None: # We did not get the resources to run this job. # Requeue last, so we can look at the next job. # TODO: Have some kind of condition the job can wait on, # but without threads (queues for jobs needing # cores/memory/disk individually)? self.inputQueue.put(args) break # Otherwise it's a PID if it succeeded, or False if it couldn't # start. But we don't care either way here. except Empty: # Nothing to run. Stop looking in the queue. break # Now check on our children. for done_pid in self._pollForDoneChildrenIn(self.children): # A child has actually finished. # Clean up after it. self._handleChild(done_pid) # Then loop again: start and collect more jobs. # TODO: It would be good to be able to wait on a new job or a finished child, whichever comes first. # For now we just sleep and loop. time.sleep(0.01) # When we get here, we are shutting down. for popen in self.children.values(): # Kill all the children, going through popen to avoid signaling re-used PIDs. popen.kill() for popen in self.children.values(): # Reap all the children popen.wait() # Then exit the thread. return except Exception as e: log.critical('Unhandled exception in daddy thread: %s', traceback.format_exc()) # Pass the exception back to the main thread so it can stop the next person who calls into us. 
self.daddyException = e raise def _checkOnDaddy(self): if self.daddyException is not None: # The daddy thread broke and we cannot do our job log.critical( 'Propagating unhandled exception in daddy thread to main thread' ) exc = self.daddyException self.daddyException = None raise exc def _pollForDoneChildrenIn(self, pid_to_popen): """ See if any children represented in the given dict from PID to Popen object have finished. Return a collection of their PIDs. Guarantees that each child's exit code will be gettable via wait() on the child's Popen object (i.e. does not reap the child, unless via Popen). """ # We keep our found PIDs in a set so we can work around waitid showing # us the same one repeatedly. ready = set() # Find the waitid function waitid = getattr(os, 'waitid', None) if callable(waitid): # waitid exists (not Mac) while True: # Poll for any child to have exit, but don't reap it. Leave reaping # to the Popen. # TODO: What if someone else in Toil wants to do this syscall? # TODO: Is this one-notification-per-done-child with WNOHANG? Or # can we miss some? Or do we see the same one repeatedly until it # is reaped? try: siginfo = waitid(os.P_ALL, -1, os.WEXITED | os.WNOWAIT | os.WNOHANG) except ChildProcessError: # This happens when there is nothing to wait on right now, # instead of the weird C behavior of overwriting a field in # a pointed-to struct. siginfo = None if siginfo is not None and siginfo.si_pid in pid_to_popen and siginfo.si_pid not in ready: # Something new finished ready.add(siginfo.si_pid) else: # Nothing we own that we haven't seen before has finished. return ready else: # On Mac there's no waitid and no way to wait and not reap. # Fall back on polling all the Popen objects. # To make this vaguely efficient we have to return done children in # batches. for pid, popen in pid_to_popen.items(): if popen.poll() is not None: # Process is done ready.add(pid) log.debug('Child %d has stopped', pid) # Return all the done processes we found return ready def _runDebugJob(self, jobCommand, jobID, environment): """ Run the jobCommand right now, in the current thread. May only be called in debug-worker mode. Assumes resources are available. """ assert self.debugWorker # TODO: It is not possible to kill running jobs in forkless mode, # because they are run immediately in the main thread. info = Info(time.time(), None, None, killIntended=False) self.runningJobs[jobID] = info if jobCommand.startswith("_toil_worker "): # We can actually run in this thread jobName, jobStoreLocator, jobStoreID = jobCommand.split()[ 1:] # Parse command jobStore = Toil.resumeJobStore(jobStoreLocator) toil_worker.workerScript( jobStore, jobStore.config, jobName, jobStoreID, redirectOutputToLogFile=not self.debugWorker ) # Call the worker else: # Run synchronously. If starting or running the command fails, let the exception stop us. subprocess.check_call(jobCommand, shell=True, env=dict(os.environ, **environment)) self.runningJobs.pop(jobID) if not info.killIntended: self.outputQueue.put( UpdatedBatchJobInfo(jobID=jobID, exitStatus=0, wallTime=time.time() - info.time, exitReason=None)) def getSchedulingStatusMessage(self): # Implement the abstractBatchSystem's scheduling status message API return self.schedulingStatusMessage def _setSchedulingStatusMessage(self, message): """ If we can't run a job, we record a short message about why not. If the leader wants to know what is up with us (for example, to diagnose a deadlock), it can ask us for the message. 
""" self.schedulingStatusMessage = message def _startChild(self, jobCommand, jobID, coreFractions, jobMemory, jobDisk, environment): """ Start a child process for the given job. Allocate its required resources and save it and save it in our bookkeeping structures. If the job is started, returns its PID. If the job fails to start, reports it as failed and returns False. If the job cannot get the resources it needs to start, returns None. """ # We fill this in if we manage to actually start the child. popen = None # This is when we started working on the job. startTime = time.time() # See if we can fit the job in our resource pools right now. if self.coreFractions.acquireNow(coreFractions): # We got some cores if self.memory.acquireNow(jobMemory): # We got some memory if self.disk.acquireNow(jobDisk): # We got the final resource, disk. # Actually run the job. # When it finishes we will release what it was using. # So it is important to not lose track of the child process. try: # Launch the job popen = subprocess.Popen(jobCommand, shell=True, env=dict( os.environ, **environment)) except Exception: # If the job can't start, make sure we release resources now self.coreFractions.release(coreFractions) self.memory.release(jobMemory) self.disk.release(jobDisk) log.error('Could not start job %s: %s', jobID, traceback.format_exc()) # Report as failed. self.outputQueue.put( UpdatedBatchJobInfo( jobID=jobID, exitStatus=EXIT_STATUS_UNAVAILABLE_VALUE, wallTime=0, exitReason=None)) # Free resources self.coreFractions.release(coreFractions) self.memory.release(jobMemory) self.disk.release(jobDisk) # Complain it broke. return False else: # If the job did start, record it self.children[popen.pid] = popen # Make sure we can look it up by PID later self.childToJob[popen.pid] = jobID # Record that the job is running, and the resources it is using info = Info(startTime, popen, (coreFractions, jobMemory, jobDisk), killIntended=False) self.runningJobs[jobID] = info log.debug('Launched job %s as child %d', jobID, popen.pid) # Report success starting the job # Note that if a PID were somehow 0 it would look like False assert popen.pid != 0 return popen.pid else: # We can't get disk, so free cores and memory self.coreFractions.release(coreFractions) self.memory.release(jobMemory) self._setSchedulingStatusMessage( 'Not enough disk to run job %s' % jobID) else: # Free cores, since we can't get memory self.coreFractions.release(coreFractions) self._setSchedulingStatusMessage( 'Not enough memory to run job %s' % jobID) else: self._setSchedulingStatusMessage('Not enough cores to run job %s' % jobID) # If we get here, we didn't succeed or fail starting the job. # We didn't manage to get the resources. # Report that. return None def _handleChild(self, pid): """ Handle a child process PID that has finished. The PID must be for a child job we started. Not thread safe to run at the same time as we are making more children. Remove the child from our bookkeeping structures and free its resources. """ # Look up the child popen = self.children[pid] jobID = self.childToJob[pid] info = self.runningJobs[jobID] # Unpack the job resources (coreFractions, jobMemory, jobDisk) = info.resources # Clean up our records of the job. self.runningJobs.pop(jobID) self.childToJob.pop(pid) self.children.pop(pid) # See how the child did, and reap it. 
statusCode = popen.wait() if statusCode != 0 and not info.killIntended: log.error("Got exit code %i (indicating failure) " "from job %s.", statusCode, self.jobs[jobID]) if not info.killIntended: # Report if the job failed and we didn't kill it. # If we killed it then it shouldn't show up in the queue. self.outputQueue.put( UpdatedBatchJobInfo(jobID=jobID, exitStatus=statusCode, wallTime=time.time() - info.time, exitReason=None)) # Free up the job's resources. self.coreFractions.release(coreFractions) self.memory.release(jobMemory) self.disk.release(jobDisk) log.debug('Child %d for job %s succeeded', pid, jobID) def issueBatchJob(self, jobNode): """Adds the command and resources to a queue to be run.""" self._checkOnDaddy() # Round cores to minCores and apply scale. # Make sure to give minCores even if asked for 0 cores, or negative or something. cores = max( math.ceil(jobNode.cores * self.scale / self.minCores) * self.minCores, self.minCores) # Don't do our own assertions about job size vs. our configured size. # The abstract batch system can handle it. self.checkResourceRequest(jobNode.memory, cores, jobNode.disk, name=jobNode.jobName, detail='Scale is set to {}.'.format( self.scale)) self.checkResourceRequest(jobNode.memory, cores, jobNode.disk) log.debug( "Issuing the command: %s with memory: %i, cores: %i, disk: %i" % (jobNode.command, jobNode.memory, cores, jobNode.disk)) with self.jobIndexLock: jobID = self.jobIndex self.jobIndex += 1 self.jobs[jobID] = jobNode.command if self.debugWorker: # Run immediately, blocking for return. # Ignore resource requirements; we run one job at a time self._runDebugJob(jobNode.command, jobID, self.environment.copy()) else: # Queue the job for later self.inputQueue.put( (jobNode.command, jobID, cores, jobNode.memory, jobNode.disk, self.environment.copy())) return jobID def killBatchJobs(self, jobIDs): """Kills jobs by ID.""" self._checkOnDaddy() log.debug('Killing jobs: {}'.format(jobIDs)) for jobID in jobIDs: if jobID in self.runningJobs: info = self.runningJobs[jobID] info.killIntended = True if info.popen is not None: log.debug('Send kill to PID %s', info.popen.pid) info.popen.kill() log.debug('Sent kill to PID %s', info.popen.pid) else: # No popen if running in forkless mode currently assert self.debugWorker log.critical("Can't kill job: %s in debug mode" % jobID) while jobID in self.runningJobs: pass def getIssuedBatchJobIDs(self): """Just returns all the jobs that have been run, but not yet returned as updated.""" self._checkOnDaddy() return list(self.jobs.keys()) def getRunningBatchJobIDs(self): self._checkOnDaddy() now = time.time() return { jobID: now - info.time for jobID, info in list(self.runningJobs.items()) } def shutdown(self): """ Cleanly terminate and join daddy thread. """ if self.daddyThread is not None: # Tell the daddy thread to stop. self.shuttingDown.set() # Wait for it to stop. self.daddyThread.join() BatchSystemSupport.workerCleanup(self.workerCleanupInfo) def getUpdatedBatchJob(self, maxWait): """Returns a tuple of a no-longer-running job, the return value of its process, and its runtime, or None.""" self._checkOnDaddy() try: item = self.outputQueue.get(timeout=maxWait) except Empty: return None self.jobs.pop(item.jobID) log.debug("Ran jobID: %s with exit value: %i", item.jobID, item.exitStatus) return item @classmethod def setOptions(cls, setOption): setOption("scale", default=1)
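ResourcePool is Toil-internal; what _startChild and _handleChild rely on above is a non-blocking acquireNow() that either reserves the whole amount or fails without side effects, plus a matching release(). A guess at that contract, written as a minimal thread-safe counting pool:

import threading

class MiniResourcePool:
    def __init__(self, initial):
        self.available = initial
        self._lock = threading.Lock()

    def acquireNow(self, amount):
        # all-or-nothing and never blocks: on failure the pool is untouched,
        # which the nested acquire/release dance in _startChild depends on
        with self._lock:
            if amount > self.available:
                return False
            self.available -= amount
            return True

    def release(self, amount):
        with self._lock:
            self.available += amount

cores = MiniResourcePool(4)
assert cores.acquireNow(3)
assert not cores.acquireNow(2)  # only 1 left: reject, no partial grab
cores.release(3)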
class CachePipeline(object): def __init__(self, spider, cache): self.spider = spider self.cache = cache self.queue_size = 100 self.input_queue = Queue() self.result_queue = Queue() self.is_working = Event() self.is_paused = Event() self.thread = Thread(target=self.thread_worker) self.thread.daemon = True self.thread.start() def has_free_resources(self): return (self.input_queue.qsize() < self.queue_size and self.result_queue.qsize() < self.queue_size) def is_idle(self): return (not self.is_working.is_set() and not self.input_queue.qsize() and not self.result_queue.qsize()) def thread_worker(self): while True: while self.is_paused.is_set(): time.sleep(0.01) try: action, data = self.input_queue.get(True, 0.1) except Empty: if self.spider.shutdown_event.is_set(): #print('!CACHE: EXITING CACHE PIPELINE') return self.shutdown() #else: # print('no shutdown event') else: self.is_working.set() #print('!CACHE:got new task from input: %s:%s' # % (action, data)) assert action in ('load', 'save', 'pause') if action == 'load': task, grab = data result = None if self.is_cache_loading_allowed(task, grab): #print('!CACHE: query cache storage') result = self.load_from_cache(task, grab) if result: #print('!! PUT RESULT INTO CACHE PIPE ' # 'RESULT QUEUE (cache)') self.result_queue.put(('network_result', result)) else: self.result_queue.put(('task', task)) elif action == 'save': task, grab = data if self.is_cache_saving_allowed(task, grab): with self.spider.timer.log_time('cache'): with self.spider.timer.log_time('cache.write'): self.cache.save_response(task.url, grab) elif action == 'pause': self.is_paused.set() self.is_working.clear() def is_cache_loading_allowed(self, task, grab): # Loading from cache is not allowed when: # 1) cache data should be refreshed # 2) cache is disabled for that task # 3) request type is not cacheable return (not task.get('refresh_cache', False) and not task.get('disable_cache', False) and grab.detect_request_method() == 'GET') def is_cache_saving_allowed(self, task, grab): """ Check if network transport result could be saved to cache layer. res: {ok, grab, grab_config_backup, task, emsg} """ if grab.request_method == 'GET': if not task.get('disable_cache'): if self.spider.is_valid_network_response_code( grab.doc.code, task): return True return False def load_from_cache(self, task, grab): with self.spider.timer.log_time('cache'): with self.spider.timer.log_time('cache.read'): cache_item = self.cache.get_item(grab.config['url'], timeout=task.cache_timeout) if cache_item is None: return None else: with self.spider.timer.log_time( 'cache.read.prepare_request'): grab.prepare_request() with self.spider.timer.log_time( 'cache.read.load_response'): self.cache.load_response(grab, cache_item) grab.log_request('CACHED') self.spider.stat.inc('spider:request-cache') return { 'ok': True, 'task': task, 'grab': grab, 'grab_config_backup': grab.dump_config(), 'emsg': None } def shutdown(self): try: self.cache.close() except AttributeError: print('Cache %s does not support close method' % self.cache) def pause(self): self.add_task(('pause', None)) self.is_paused.wait() def resume(self): self.is_paused.clear() def get_ready_results(self): res = [] while True: try: action, result = self.result_queue.get_nowait() except Empty: break else: assert action in ('network_result', 'task') res.append((action, result)) return res def add_task(self, task): self.input_queue.put(task)
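The pause() handshake above is worth seeing in isolation: the caller enqueues a 'pause' action and then blocks on an Event that the worker sets only once it has actually stopped taking work, so pause() cannot return while a task is mid-flight. The names below are illustrative stand-ins for the pipeline's internals:

import threading
import time
from queue import Queue, Empty

inq = Queue()
paused = threading.Event()

def worker():
    while True:
        while paused.is_set():
            time.sleep(0.01)        # parked until resume
        try:
            action, data = inq.get(True, 0.1)
        except Empty:
            continue
        if action == 'pause':
            paused.set()            # the caller's wait() unblocks here

threading.Thread(target=worker, daemon=True).start()
inq.put(('pause', None))
paused.wait()                       # returns once the worker acknowledged
print('worker paused')
paused.clear()                      # resume, as in resume() above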
class Scheduler(): def __init__(self): self.q = Queue() self.fp_set = set() self.total_repeat_nums = 0 def add_request(self, request): # Enqueue the request; self._filter_request(request) returns True when the fingerprint is not yet in the set if self._filter_request(request): self.q.put(request) def get_request(self): # Take a request off the queue and return it (None if the queue is empty) try: request = self.q.get_nowait() except Empty: request = None return request def _filter_request(self, request): '''Request deduplication: if the request's fingerprint is not in the set, add it and return True''' fp = self._gen_fp(request) if fp not in self.fp_set: self.fp_set.add(fp) return True self.total_repeat_nums += 1 logger.info("Duplicate request detected: <{} {}>".format(request.method, request.url)) return False def _gen_fp(self, request): """Return the fingerprint of the request""" url = canonicalize_url(request.url) method = request.method.upper() data = request.data if request.data else {} data = sorted(data.items(), key=lambda x: x[0]) # data.items() returns an iterable view, e.g. dict_items([('b', 2), ('a', 1)]) # sorted(data.items()) gives [('a', 1), ('b', 2)] # the key parameter takes a lambda that picks what to sort by # x is each (k, v) pair yielded while iterating data.items() # x[0] is the dict key # so data = sorted(data.items(), key=lambda x: x[0]) sorts the items by dict key sha1 = hashlib.sha1() sha1.update(self._to_bytes(url)) sha1.update(self._to_bytes(method)) sha1.update(self._to_bytes(str(data))) fp = sha1.hexdigest() return fp def _to_bytes(self, string): """str means bytes on Python 2 but text on Python 3, so the two cases are exactly reversed""" if six.PY2: # are we running on Python 2? if isinstance(string, str): return string else: return string.encode() elif six.PY3: if isinstance(string, str): return string.encode() else: return string
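The reason _gen_fp canonicalizes and sorts before hashing: two requests that differ only in query-parameter order or dict iteration order should deduplicate to the same SHA-1. A condensed, runnable version (the standalone gen_fp helper is ours; canonicalize_url comes from w3lib, as in the class above):

import hashlib
from w3lib.url import canonicalize_url

def gen_fp(url, method='GET', data=None):
    data = sorted((data or {}).items(), key=lambda x: x[0])
    sha1 = hashlib.sha1()
    for part in (canonicalize_url(url), method.upper(), str(data)):
        sha1.update(part.encode())
    return sha1.hexdigest()

fp1 = gen_fp('http://example.com/?a=1&b=2', data={'x': 1, 'y': 2})
fp2 = gen_fp('http://example.com/?b=2&a=1', data={'y': 2, 'x': 1})
assert fp1 == fp2  # same logical request, same fingerprint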
class ParasolBatchSystem(BatchSystemSupport): """ The interface for Parasol. """ @classmethod def supportsWorkerCleanup(cls): return False @classmethod def supportsHotDeployment(cls): return False def __init__(self, config, maxCores, maxMemory, maxDisk): super(ParasolBatchSystem, self).__init__(config, maxCores, maxMemory, maxDisk) if maxMemory != sys.maxsize: logger.warn('The Parasol batch system does not support maxMemory.') # Keep the name of the results file for the pstat2 command. command = config.parasolCommand if os.path.sep not in command: try: command = next(which(command)) except StopIteration: raise RuntimeError("Can't find %s on PATH." % command) logger.debug('Using Parasol at %s', command) self.parasolCommand = command self.parasolResultsDir = tempfile.mkdtemp(dir=config.jobStore) # In Parasol, each results file corresponds to a separate batch, and all jobs in a batch # have the same cpu and memory requirements. The keys to this dictionary are the (cpu, # memory) tuples for each batch. A new batch is created whenever a job has a new unique # combination of cpu and memory requirements. self.resultsFiles = dict() self.maxBatches = config.parasolMaxBatches # Allows the worker process to send back the IDs of jobs that have finished, so the batch # system can decrease its used cpus counter self.cpuUsageQueue = Queue() # Also stores finished job IDs, but is read by getUpdatedJobIDs(). self.updatedJobsQueue = Queue() # Use this to stop the worker when shutting down self.running = True self.worker = Thread(target=self.updatedJobWorker, args=()) self.worker.start() self.usedCpus = 0 self.jobIDsToCpu = {} # Set of jobs that have been issued but aren't known to have finished or been killed yet. # Jobs that end by themselves are removed in getUpdatedJob, and jobs that are killed are # removed in killBatchJobs. self.runningJobs = set() def _runParasol(self, command, autoRetry=True): """ Issues a parasol command using popen to capture the output. If the command fails then it will try pinging parasol until it gets a response. When it gets a response it will recursively call the issue parasol command, repeating this pattern for a maximum of N times. The final exit value will reflect this. """ command = list(concat(self.parasolCommand, command)) while True: logger.debug('Running %r', command) process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=-1) stdout, stderr = process.communicate() status = process.wait() for line in stderr.split('\n'): if line: logger.warn(line) if status == 0: return 0, stdout.split('\n') message = 'Command %r failed with exit status %i' % (command, status) if autoRetry: logger.warn(message) else: logger.error(message) return status, None logger.warn('Waiting for 10s before trying again') time.sleep(10) parasolOutputPattern = re.compile("your job ([0-9]+).*") def issueBatchJob(self, jobNode): """ Issues parasol with job commands. """ self.checkResourceRequest(jobNode.memory, jobNode.cores, jobNode.disk) MiB = 1 << 20 truncatedMemory = (old_div(jobNode.memory, MiB)) * MiB # Look for a batch for jobs with these resource requirements, with # the memory rounded down to the nearest megabyte. Rounding down # means the new job can't ever decrease the memory requirements # of jobs already in the batch.
if len(self.resultsFiles) >= self.maxBatches: raise RuntimeError( 'Number of batches reached limit of %i' % self.maxBatches) try: results = self.resultsFiles[(truncatedMemory, jobNode.cores)] except KeyError: results = getTempFile(rootDir=self.parasolResultsDir) self.resultsFiles[(truncatedMemory, jobNode.cores)] = results # Prefix the command with environment overrides, optionally looking them up from the # current environment if the value is None command = ' '.join(concat('env', self.__environment(), jobNode.command)) parasolCommand = ['-verbose', '-ram=%i' % jobNode.memory, '-cpu=%i' % jobNode.cores, '-results=' + results, 'add', 'job', command] # Deal with the cpus self.usedCpus += jobNode.cores while True: # Process finished results with no wait try: jobID = self.cpuUsageQueue.get_nowait() except Empty: break if jobID in list(self.jobIDsToCpu.keys()): self.usedCpus -= self.jobIDsToCpu.pop(jobID) assert self.usedCpus >= 0 while self.usedCpus > self.maxCores: # If we are still waiting jobID = self.cpuUsageQueue.get() if jobID in list(self.jobIDsToCpu.keys()): self.usedCpus -= self.jobIDsToCpu.pop(jobID) assert self.usedCpus >= 0 # Now keep going while True: line = self._runParasol(parasolCommand)[1][0] match = self.parasolOutputPattern.match(line) if match is None: # This is because parasol add job will return success, even if the job was not # properly issued! logger.debug('We failed to properly add the job, we will try again after 5s.') time.sleep(5) else: jobID = int(match.group(1)) self.jobIDsToCpu[jobID] = jobNode.cores self.runningJobs.add(jobID) logger.debug("Got the parasol job id: %s from line: %s" % (jobID, line)) return jobID def setEnv(self, name, value=None): if value and ' ' in value: raise ValueError('Parasol does not support spaces in environment variable values.') return super(ParasolBatchSystem, self).setEnv(name, value) def __environment(self): return (k + '=' + (os.environ[k] if v is None else v) for k, v in list(self.environment.items())) def killBatchJobs(self, jobIDs): """Kills the given jobs, represented as Job ids, then checks they are dead by checking they are not in the list of issued jobs. """ while True: for jobID in jobIDs: if jobID in self.runningJobs: self.runningJobs.remove(jobID) exitValue = self._runParasol(['remove', 'job', str(jobID)], autoRetry=False)[0] logger.debug("Tried to remove jobID: %i, with exit value: %i" % (jobID, exitValue)) runningJobs = self.getIssuedBatchJobIDs() if set(jobIDs).difference(set(runningJobs)) == set(jobIDs): break logger.warn( 'Tried to kill some jobs, but something happened and they are still ' 'going, will try again in 5s.') time.sleep(5) # Update the CPU usage, because killed jobs aren't written to the results file. for jobID in jobIDs: if jobID in list(self.jobIDsToCpu.keys()): self.usedCpus -= self.jobIDsToCpu.pop(jobID) queuePattern = re.compile(r'q\s+([0-9]+)') runningPattern = re.compile(r'r\s+([0-9]+)\s+[\S]+\s+[\S]+\s+([0-9]+)\s+[\S]+') def getJobIDsForResultsFile(self, resultsFile): """ Get all queued and running jobs for a results file.
""" jobIDs = [] for line in self._runParasol(['-results=' + resultsFile, 'pstat2'])[1]: runningJobMatch = self.runningPattern.match(line) queuedJobMatch = self.queuePattern.match(line) if runningJobMatch: jobID = runningJobMatch.group(1) elif queuedJobMatch: jobID = queuedJobMatch.group(1) else: continue jobIDs.append(int(jobID)) return set(jobIDs) def getIssuedBatchJobIDs(self): """ Gets the list of jobs issued to parasol in all results files, but not including jobs created by other users. """ issuedJobs = set() for resultsFile in itervalues(self.resultsFiles): issuedJobs.update(self.getJobIDsForResultsFile(resultsFile)) return list(issuedJobs) def getRunningBatchJobIDs(self): """ Returns map of running jobIDs and the time they have been running. """ # Example lines.. # r 5410186 benedictpaten worker 1247029663 localhost # r 5410324 benedictpaten worker 1247030076 localhost runningJobs = {} issuedJobs = self.getIssuedBatchJobIDs() for line in self._runParasol(['pstat2'])[1]: if line != '': match = self.runningPattern.match(line) if match is not None: jobID = int(match.group(1)) startTime = int(match.group(2)) if jobID in issuedJobs: # It's one of our jobs runningJobs[jobID] = time.time() - startTime return runningJobs def getUpdatedBatchJob(self, maxWait): while True: try: jobID, status, wallTime = self.updatedJobsQueue.get(timeout=maxWait) except Empty: return None try: self.runningJobs.remove(jobID) except KeyError: # We tried to kill this job, but it ended by itself instead, so skip it. pass else: return jobID, status, wallTime @classmethod def getRescueBatchJobFrequency(cls): """ Parasol leaks jobs, but rescuing jobs involves calls to parasol list jobs and pstat2, making it expensive. """ return 5400 # Once every 90 minutes def updatedJobWorker(self): """ We use the parasol results to update the status of jobs, adding them to the list of updated jobs. Results have the following structure.. (thanks Mark D!) int status; /* Job status - wait() return format. 0 is good. */ char *host; /* Machine job ran on. */ char *jobId; /* Job queuing system job ID */ char *exe; /* Job executable file (no path) */ int usrTicks; /* 'User' CPU time in ticks. */ int sysTicks; /* 'System' CPU time in ticks. */ unsigned submitTime; /* Job submission time in seconds since 1/1/1970 */ unsigned startTime; /* Job start time in seconds since 1/1/1970 */ unsigned endTime; /* Job end time in seconds since 1/1/1970 */ char *user; /* User who ran job */ char *errFile; /* Location of stderr file on host */ Plus you finally have the command name. 
""" resultsFiles = set() resultsFileHandles = [] try: while self.running: # Look for any new results files that have been created, and open them newResultsFiles = set(os.listdir(self.parasolResultsDir)).difference(resultsFiles) for newFile in newResultsFiles: newFilePath = os.path.join(self.parasolResultsDir, newFile) resultsFileHandles.append(open(newFilePath, 'r')) resultsFiles.add(newFile) for fileHandle in resultsFileHandles: while self.running: line = fileHandle.readline() if not line: break assert line[-1] == '\n' (status, host, jobId, exe, usrTicks, sysTicks, submitTime, startTime, endTime, user, errFile, command) = line[:-1].split(None, 11) status = int(status) jobId = int(jobId) if os.WIFEXITED(status): status = os.WEXITSTATUS(status) else: status = -status self.cpuUsageQueue.put(jobId) startTime = int(startTime) endTime = int(endTime) if endTime == startTime: # Both, start and end time is an integer so to get sub-second # accuracy we use the ticks reported by Parasol as an approximation. # This isn't documented but what Parasol calls "ticks" is actually a # hundredth of a second. Parasol does the unit conversion early on # after a job finished. Search paraNode.c for ticksToHundreths. We # also cheat a little by always reporting at least one hundredth of a # second. usrTicks = int(usrTicks) sysTicks = int(sysTicks) wallTime = float( max( 1, usrTicks + sysTicks) ) * 0.01 else: wallTime = float(endTime - startTime) self.updatedJobsQueue.put((jobId, status, wallTime)) time.sleep(1) except: logger.warn("Error occurred while parsing parasol results files.") raise finally: for fileHandle in resultsFileHandles: fileHandle.close() def shutdown(self): self.killBatchJobs(self.getIssuedBatchJobIDs()) # cleanup jobs for results in itervalues(self.resultsFiles): exitValue = self._runParasol(['-results=' + results, 'clear', 'sick'], autoRetry=False)[0] if exitValue is not None: logger.warn("Could not clear sick status of the parasol batch %s" % results) exitValue = self._runParasol(['-results=' + results, 'flushResults'], autoRetry=False)[0] if exitValue is not None: logger.warn("Could not flush the parasol batch %s" % results) self.running = False logger.debug('Joining worker thread...') self.worker.join() logger.debug('... joined worker thread.') for results in list(self.resultsFiles.values()): os.remove(results) os.rmdir(self.parasolResultsDir) @classmethod def setOptions(cls, setOption): from toil.common import iC setOption("parasolCommand", None, None, 'parasol') setOption("parasolMaxBatches", int, iC(1), 10000)
class ParasolBatchSystem(BatchSystemSupport):
    """
    The interface for Parasol.
    """

    @classmethod
    def supportsWorkerCleanup(cls):
        return False

    @classmethod
    def supportsAutoDeployment(cls):
        return False

    def __init__(self, config, maxCores, maxMemory, maxDisk):
        super(ParasolBatchSystem, self).__init__(config, maxCores, maxMemory, maxDisk)
        if maxMemory != sys.maxsize:
            logger.warning('The Parasol batch system does not support maxMemory.')
        # Keep the name of the results file for the pstat2 command..
        command = config.parasolCommand
        if os.path.sep not in command:
            try:
                command = which(command)
            except StopIteration:
                raise RuntimeError("Can't find %s on PATH." % command)
        logger.debug('Using Parasol at %s', command)
        self.parasolCommand = command
        jobStoreType, path = Toil.parseLocator(config.jobStore)
        if jobStoreType != 'file':
            raise RuntimeError("The parasol batch system doesn't currently work with any "
                               "jobStore type except file jobStores.")
        self.parasolResultsDir = tempfile.mkdtemp(dir=os.path.abspath(path))
        logger.debug("Using parasol results dir: %s", self.parasolResultsDir)
        # In Parasol, each results file corresponds to a separate batch, and all jobs in a batch
        # have the same cpu and memory requirements. The keys to this dictionary are the (cpu,
        # memory) tuples for each batch. A new batch is created whenever a job has a new unique
        # combination of cpu and memory requirements.
        self.resultsFiles = dict()
        self.maxBatches = config.parasolMaxBatches
        # Allows the worker process to send back the IDs of jobs that have finished, so the batch
        # system can decrease its used cpus counter
        self.cpuUsageQueue = Queue()
        # Also stores finished job IDs, but is read by getUpdatedJobIDs().
        self.updatedJobsQueue = Queue()
        # Use this to stop the worker when shutting down
        self.running = True
        self.worker = Thread(target=self.updatedJobWorker, args=())
        self.worker.start()
        self.usedCpus = 0
        self.jobIDsToCpu = {}
        # Set of jobs that have been issued but aren't known to have finished or been killed yet.
        # Jobs that end by themselves are removed in getUpdatedJob, and jobs that are killed are
        # removed in killBatchJobs.
        self.runningJobs = set()

    def _runParasol(self, command, autoRetry=True):
        """
        Issues a parasol command using popen to capture the output. If the command fails, it
        retries after pinging parasol until it gets a response, repeating this pattern for a
        maximum of N times. The final exit value reflects this.
        """
        command = list(concat(self.parasolCommand, command))
        while True:
            logger.debug('Running %r', command)
            process = subprocess.Popen(command,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       bufsize=-1)
            stdout, stderr = process.communicate()
            status = process.wait()
            for line in stderr.decode('utf-8').split('\n'):
                if line:
                    logger.warning(line)
            if status == 0:
                return 0, stdout.decode('utf-8').split('\n')
            message = 'Command %r failed with exit status %i' % (command, status)
            if autoRetry:
                logger.warning(message)
            else:
                logger.error(message)
                return status, None
            logger.warning('Waiting 10s before trying again')
            time.sleep(10)

    parasolOutputPattern = re.compile("your job ([0-9]+).*")

    def issueBatchJob(self, jobNode):
        """
        Issues parasol with job commands.
        """
        self.checkResourceRequest(jobNode.memory, jobNode.cores, jobNode.disk)
        MiB = 1 << 20
        truncatedMemory = (old_div(jobNode.memory, MiB)) * MiB
        # Look for a batch for jobs with these resource requirements, with
        # the memory rounded down to the nearest megabyte.
        # Rounding down means the new job can't ever decrease the memory
        # requirements of jobs already in the batch.
        if len(self.resultsFiles) >= self.maxBatches:
            raise RuntimeError('Number of batches reached limit of %i' % self.maxBatches)
        try:
            results = self.resultsFiles[(truncatedMemory, jobNode.cores)]
        except KeyError:
            results = getTempFile(rootDir=self.parasolResultsDir)
            self.resultsFiles[(truncatedMemory, jobNode.cores)] = results
        # Prefix the command with environment overrides, optionally looking them up from the
        # current environment if the value is None
        command = ' '.join(concat('env', self.__environment(), jobNode.command))
        parasolCommand = ['-verbose',
                          '-ram=%i' % jobNode.memory,
                          '-cpu=%i' % jobNode.cores,
                          '-results=' + results,
                          'add', 'job', command]
        # Deal with the cpus
        self.usedCpus += jobNode.cores
        while True:
            # Process finished results without waiting
            try:
                jobID = self.cpuUsageQueue.get_nowait()
            except Empty:
                break
            if jobID in list(self.jobIDsToCpu.keys()):
                self.usedCpus -= self.jobIDsToCpu.pop(jobID)
            assert self.usedCpus >= 0
        while self.usedCpus > self.maxCores:
            # If we are still over the limit, wait for jobs to finish
            jobID = self.cpuUsageQueue.get()
            if jobID in list(self.jobIDsToCpu.keys()):
                self.usedCpus -= self.jobIDsToCpu.pop(jobID)
            assert self.usedCpus >= 0
        # Now keep going
        while True:
            line = self._runParasol(parasolCommand)[1][0]
            match = self.parasolOutputPattern.match(line)
            if match is None:
                # This is because parasol add job will return success, even if the job was not
                # properly issued!
                logger.debug('We failed to properly add the job, we will try again after 5s.')
                time.sleep(5)
            else:
                jobID = int(match.group(1))
                self.jobIDsToCpu[jobID] = jobNode.cores
                self.runningJobs.add(jobID)
                logger.debug("Got the parasol job id: %s from line: %s" % (jobID, line))
                return jobID

    def setEnv(self, name, value=None):
        if value and ' ' in value:
            raise ValueError('Parasol does not support spaces in environment variable values.')
        return super(ParasolBatchSystem, self).setEnv(name, value)

    def __environment(self):
        return (k + '=' + (os.environ[k] if v is None else v)
                for k, v in listitems(self.environment))

    def killBatchJobs(self, jobIDs):
        """Kills the given jobs, represented as Job ids, then checks they are dead by verifying
        they are no longer in the list of issued jobs.
        """
        while True:
            for jobID in jobIDs:
                if jobID in self.runningJobs:
                    self.runningJobs.remove(jobID)
                exitValue = self._runParasol(['remove', 'job', str(jobID)],
                                             autoRetry=False)[0]
                logger.debug("Tried to remove jobID: %i, with exit value: %i" % (jobID, exitValue))
            runningJobs = self.getIssuedBatchJobIDs()
            if set(jobIDs).difference(set(runningJobs)) == set(jobIDs):
                break
            logger.warning('Tried to kill some jobs, but something happened and they are still '
                           'going, will try again in 5s.')
            time.sleep(5)
        # Update the CPU usage, because killed jobs aren't written to the results file.
        for jobID in jobIDs:
            if jobID in list(self.jobIDsToCpu.keys()):
                self.usedCpus -= self.jobIDsToCpu.pop(jobID)

    runningPattern = re.compile(r'r\s+([0-9]+)\s+[\S]+\s+[\S]+\s+([0-9]+)\s+[\S]+')

    def getJobIDsForResultsFile(self, resultsFile):
        """
        Get all queued and running jobs for a results file.
        """
        jobIDs = []
        for line in self._runParasol(['-extended', 'list', 'jobs'])[1]:
            fields = line.strip().split()
            if len(fields) == 0 or fields[-1] != resultsFile:
                continue
            jobID = fields[0]
            jobIDs.append(int(jobID))
        return set(jobIDs)

    def getIssuedBatchJobIDs(self):
        """
        Gets the list of jobs issued to parasol in all results files, but not including jobs
        created by other users.
""" issuedJobs = set() for resultsFile in itervalues(self.resultsFiles): issuedJobs.update(self.getJobIDsForResultsFile(resultsFile)) return list(issuedJobs) def getRunningBatchJobIDs(self): """ Returns map of running jobIDs and the time they have been running. """ # Example lines.. # r 5410186 benedictpaten worker 1247029663 localhost # r 5410324 benedictpaten worker 1247030076 localhost runningJobs = {} issuedJobs = self.getIssuedBatchJobIDs() for line in self._runParasol(['pstat2'])[1]: if line != '': match = self.runningPattern.match(line) if match is not None: jobID = int(match.group(1)) startTime = int(match.group(2)) if jobID in issuedJobs: # It's one of our jobs runningJobs[jobID] = time.time() - startTime return runningJobs def getUpdatedBatchJob(self, maxWait): while True: try: item = self.updatedJobsQueue.get(timeout=maxWait) except Empty: return None try: self.runningJobs.remove(item.jobID) except KeyError: # We tried to kill this job, but it ended by itself instead, so skip it. pass else: return item def updatedJobWorker(self): """ We use the parasol results to update the status of jobs, adding them to the list of updated jobs. Results have the following structure.. (thanks Mark D!) int status; /* Job status - wait() return format. 0 is good. */ char *host; /* Machine job ran on. */ char *jobId; /* Job queuing system job ID */ char *exe; /* Job executable file (no path) */ int usrTicks; /* 'User' CPU time in ticks. */ int sysTicks; /* 'System' CPU time in ticks. */ unsigned submitTime; /* Job submission time in seconds since 1/1/1970 */ unsigned startTime; /* Job start time in seconds since 1/1/1970 */ unsigned endTime; /* Job end time in seconds since 1/1/1970 */ char *user; /* User who ran job */ char *errFile; /* Location of stderr file on host */ Plus you finally have the command name. """ resultsFiles = set() resultsFileHandles = [] try: while self.running: # Look for any new results files that have been created, and open them newResultsFiles = set(os.listdir( self.parasolResultsDir)).difference(resultsFiles) for newFile in newResultsFiles: newFilePath = os.path.join(self.parasolResultsDir, newFile) resultsFileHandles.append(open(newFilePath, 'r')) resultsFiles.add(newFile) for fileHandle in resultsFileHandles: while self.running: line = fileHandle.readline() if not line: break assert line[-1] == '\n' (status, host, jobId, exe, usrTicks, sysTicks, submitTime, startTime, endTime, user, errFile, command) = line[:-1].split(None, 11) status = int(status) jobId = int(jobId) if os.WIFEXITED(status): status = os.WEXITSTATUS(status) else: status = -status self.cpuUsageQueue.put(jobId) startTime = int(startTime) endTime = int(endTime) if endTime == startTime: # Both, start and end time is an integer so to get sub-second # accuracy we use the ticks reported by Parasol as an approximation. # This isn't documented but what Parasol calls "ticks" is actually a # hundredth of a second. Parasol does the unit conversion early on # after a job finished. Search paraNode.c for ticksToHundreths. We # also cheat a little by always reporting at least one hundredth of a # second. 
                            usrTicks = int(usrTicks)
                            sysTicks = int(sysTicks)
                            wallTime = float(max(1, usrTicks + sysTicks)) * 0.01
                        else:
                            wallTime = float(endTime - startTime)
                        self.updatedJobsQueue.put(
                            UpdatedBatchJobInfo(jobID=jobId, exitStatus=status,
                                                wallTime=wallTime, exitReason=None))
                time.sleep(1)
        except:
            logger.warning("Error occurred while parsing parasol results files.")
            raise
        finally:
            for fileHandle in resultsFileHandles:
                fileHandle.close()

    def shutdown(self):
        self.killBatchJobs(self.getIssuedBatchJobIDs())  # clean up jobs
        for results in itervalues(self.resultsFiles):
            exitValue = self._runParasol(['-results=' + results, 'clear', 'sick'],
                                         autoRetry=False)[0]
            if exitValue is not None:
                logger.warning("Could not clear sick status of the parasol batch %s" % results)
            exitValue = self._runParasol(['-results=' + results, 'flushResults'],
                                         autoRetry=False)[0]
            if exitValue is not None:
                logger.warning("Could not flush the parasol batch %s" % results)
        self.running = False
        logger.debug('Joining worker thread...')
        self.worker.join()
        logger.debug('... joined worker thread.')
        for results in list(self.resultsFiles.values()):
            os.remove(results)
        os.rmdir(self.parasolResultsDir)

    @classmethod
    def setOptions(cls, setOption):
        from toil.common import iC
        setOption("parasolCommand", None, None, 'parasol')
        setOption("parasolMaxBatches", int, iC(1), 10000)
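# A small standalone sketch (a hypothetical helper, not part of the Toil API) of the
# wall-time calculation used in updatedJobWorker above: when a job's start and end
# timestamps are equal (whole-second resolution), fall back to Parasol's CPU "ticks",
# each worth a hundredth of a second, and never report less than 0.01s.
def wall_time_from_result(startTime, endTime, usrTicks, sysTicks):
    """Approximate a job's wall time in seconds from one Parasol results line."""
    if endTime == startTime:
        # Sub-second job: approximate with CPU ticks (1 tick == 0.01s),
        # clamped to at least one tick.
        return max(1, usrTicks + sysTicks) * 0.01
    return float(endTime - startTime)


print(wall_time_from_result(100, 100, 0, 0))  # 0.01 (clamped to one tick)
print(wall_time_from_result(100, 100, 3, 2))  # 0.05 (5 ticks)
print(wall_time_from_result(100, 103, 3, 2))  # 3.0  (whole seconds take precedence)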
class Scheduler(object):
    def __init__(self, collector):
        if SCHEDULER_PERSIST:
            self.q = RedisQueue()
            self.fp_container = RedisFilterContainer()
        else:
            self.q = Queue()
            self.fp_container = NoramlFilterContainer()
        # self.fp_set = set()
        # self.total_repeat_nums = 0
        self.collector = collector  # stats counter object, passed in from the engine

    def add_request(self, request):
        # Put the request on the request queue.
        # Enqueue only if its fingerprint is not already in the set.
        request.fp = self._gen_fp(request)
        if not request.filter:
            # The request was built with dedup filtering disabled
            self.fp_container.add_fp(request.fp)  # still record the fingerprint
            self.q.put(request)
            logger.info("Added request without dedup <{} {}>".format(request.method, request.url))
            return
        # Avoid enqueueing duplicate requests
        if self._filter_request(request):
            self.q.put(request)

    def get_request(self):
        # Pop one request; return None if none is available.
        try:
            request = self.q.get_nowait()
        except Exception:
            request = None
        return request

    def _filter_request(self, request):
        """Request dedup: if the fingerprint is not in the set yet, add it and return True."""
        # request.fp = self._gen_fp(request)
        # if fp not in self.fp_set:
        if not self.fp_container.exists(request.fp):
            self.fp_container.add_fp(request.fp)  # record the fingerprint
            return True
        # self.total_repeat_nums += 1  # duplicate request count +1
        self.collector.incr(self.collector.repeat_request_nums_key)
        logger.info("Found duplicate request: <{} {}>".format(request.method, request.url))
        return False

    def _gen_fp(self, request):
        # Return the request's fingerprint string.
        url = canonicalize_url(request.url)
        method = request.method.upper()
        data = request.data if request.data else {}
        data = sorted(data.items(), key=lambda x: x[0])
        # Iterate the data dict as (k, v) pairs, sorted by the key k (which is also
        # the default sort key); in key=lambda x: x[0], x is each (k, v) pair and
        # x[0] is the sort key. The result looks like [('a', 1), ('b', 2)].
        sha1 = hashlib.sha1()
        sha1.update(self._to_bytes(url))
        sha1.update(self._to_bytes(method))
        sha1.update(self._to_bytes(str(data)))
        fp = sha1.hexdigest()
        return fp

    def _to_bytes(self, string):
        """py2 and py3 need exactly opposite handling here!"""
        if six.PY2:  # running on Python 2
            if isinstance(string, str):
                return string
            else:
                return string.encode()
        elif six.PY3:  # running on Python 3
            if isinstance(string, str):
                return string.encode()
            else:
                return string
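# A standalone sketch of the fingerprinting scheme used by _gen_fp above: SHA-1 over the
# canonical URL, the upper-cased method, and the sorted form data, so two logically
# identical requests map to the same hex digest. Assumptions: canonicalize_url is taken
# from the w3lib package (a guess about where the snippet's import comes from), and the
# RequestLike tuple is an illustrative stand-in for the framework's Request class.
import hashlib
from collections import namedtuple

from w3lib.url import canonicalize_url

RequestLike = namedtuple('RequestLike', 'url method data')


def gen_fp(request):
    """Return a stable SHA-1 fingerprint for a request."""
    url = canonicalize_url(request.url)
    method = request.method.upper()
    data = sorted((request.data or {}).items())  # order-independent form data
    sha1 = hashlib.sha1()
    for part in (url, method, str(data)):
        sha1.update(part.encode())
    return sha1.hexdigest()


# Query-string order, method case, and dict ordering don't change the fingerprint:
a = RequestLike('http://example.com/?a=1&b=2', 'get', {'x': 1, 'y': 2})
b = RequestLike('http://example.com/?b=2&a=1', 'GET', {'y': 2, 'x': 1})
assert gen_fp(a) == gen_fp(b)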