Code Example #1
def run_in_thread(p):
    q = Queue()
    t = Thread(target=enqueue_output, args=(p, q))
    t.daemon = True  # thread dies with the program
    t.start()

    lines = ["\n"]
    line = ''
    alive = True
    app = wx.GetApp().TopWindow
    if sx_print_to_consol:
        write_cmd = app.proj_tree_viewer.consol.log.AppendText
    else:
        write_cmd = app.shell.WriteTextAndPrompt
    while True:
        time.sleep(0.01)
        try:
            line = q.get_nowait()  # or q.get(timeout=.1)
        except Empty:
            if len(lines) != 0:
                wx.CallAfter(write_cmd, ''.join(lines))
            lines = []
        except:
            import traceback
            traceback.print_exc()
            break
        else:  # got line
            lines.append(line)
        if line.startswith('process terminated'):
            if len(lines) > 1:
                wx.CallAfter(write_cmd, ''.join(lines[:-1]))
            break
    return
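The reader loop above relies on an `enqueue_output` helper and a `process terminated` sentinel that are not shown in this excerpt. A minimal sketch of what such a helper might look like (the sentinel text and the decoding are assumptions, not taken from the original project):

def enqueue_output(p, q):
    # Drain the subprocess's stdout line by line into the queue.
    for raw_line in iter(p.stdout.readline, b''):
        q.put(raw_line.decode('utf-8', errors='replace'))
    # Assumed sentinel that run_in_thread() watches for to end its loop.
    q.put('process terminated\n')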
Code Example #2
File: utils.py Project: ProstoKSI/distributed-queue
class StoppableThread(threading.Thread):
    """This is thread can be stopped.

    Note: Thread by default does not return function result in any case,
    which is why I've implemented this workaroung with built-in Queue.
    """
    def __init__(self, **kwargs):
        super(StoppableThread, self).__init__(**kwargs)
        self.__target = kwargs.get('target')
        self.__args = kwargs.get('args')
        if self.__args is None:
            self.__args = ()
        self.__kwargs = kwargs.get('kwargs')
        if self.__kwargs is None:
            self.__kwargs = {}
        self.__result_queue = Queue()
        self.__stopped = threading.Event()

    def stop(self):
        """Stop the thread. It will not terminate code, but set the flag that
        should be handled in executed function.
        """
        self.__stopped.set()

    def is_stopped(self):
        """Check the status of the thread. It only monitors the flag state. If
        task is stopped you have to pay attention to `.is_alive()`.
        """
        return self.__stopped.is_set()

    def run(self):
        """Run the target function, check expected result and propagate
        exceptions.
        """
        try:
            self.__kwargs['_is_stopped'] = self.__stopped.is_set
            func_result = None  # ensure it is defined even when no target was given
            try:
                if self.__target:
                    func_result = self.__target(*self.__args, **self.__kwargs)
            finally:
                # Avoid a refcycle if the thread is running a function with
                # an argument that has a member that points to the thread.
                del self.__target, self.__args, self.__kwargs
            if func_result is None:
                func_result = {}
            elif not isinstance(func_result, dict):
                raise TypeError("Task has to return a dict or None.")
        except Exception: # pylint: disable=W0703
            self.__result_queue.put(traceback.format_exc())
        else:
            self.__result_queue.put(func_result)

    def get_result(self):
        """Return results of target function execution.
        """
        self.join()
        try:
            return self.__result_queue.get_nowait()
        except Empty:
            return None
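A hedged usage sketch for the class above (the `crawl` function and its arguments are illustrative, not from the original project): `run()` injects an `_is_stopped` callable into the target's keyword arguments, the target polls it and returns a dict, and `get_result()` joins the thread and hands that dict back.

import time

def crawl(pages, _is_stopped=None):
    done = 0
    for _ in range(pages):
        if _is_stopped():        # cooperative stop flag injected by run()
            break
        time.sleep(0.1)          # stand-in for real work
        done += 1
    return {'pages_done': done}

worker = StoppableThread(target=crawl, kwargs={'pages': 50})
worker.start()
time.sleep(1)
worker.stop()                    # sets the Event; crawl() notices and exits
print(worker.get_result())       # e.g. {'pages_done': 9}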
Code Example #3
File: common.py Project: theBenForce/localstack
def run_for_max_seconds(max_secs, _function, *args, **kwargs):
    """ Run the given function for a maximum of `max_secs` seconds - continue running
        in a background thread if the function does not finish in time. """
    def _worker(*_args):
        result = None
        try:
            result = _function(*args, **kwargs)
        except Exception as e:
            result = e
        result = True if result is None else result
        q.put(result)
        return result
    start = now()
    q = Queue()
    start_worker_thread(_worker)
    for i in range(max_secs * 2):
        result = None
        try:
            result = q.get_nowait()
        except Exception:
            pass
        if result is not None:
            if isinstance(result, Exception):
                raise result
            return result
        if now() - start >= max_secs:
            return
        time.sleep(0.5)
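The function above depends on `now()` and `start_worker_thread()` helpers defined elsewhere in the module. Rough sketches of what they presumably do, with signatures inferred from the call sites (assumptions, not the project's actual code):

import time
from threading import Thread

def now():
    # Current time in seconds since the epoch.
    return time.time()

def start_worker_thread(fn, *args, **kwargs):
    # Fire-and-forget: run fn in a daemon thread so it never blocks shutdown.
    thread = Thread(target=fn, args=args, kwargs=kwargs)
    thread.daemon = True
    thread.start()
    return thread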
Code Example #4
File: azurecli.py Project: proteus-cpi/iotedgedev
    def _handle_monitor_event_process(self, process, error_message=None):
        stdout_queue = Queue()
        stderr_queue = Queue()

        stream_thread_map = {
            'stdout':
            Thread(target=self._enqueue_stream,
                   args=(process.stdout, stdout_queue),
                   daemon=True),
            'stderr':
            Thread(target=self._enqueue_stream,
                   args=(process.stderr, stderr_queue),
                   daemon=True)
        }

        stream_thread_map['stdout'].start()
        stream_thread_map['stderr'].start()

        try:
            while not self._proc_terminated:
                if not process.poll():
                    try:
                        self.output.echo(stdout_queue.get_nowait())
                    except Empty:
                        pass
                else:
                    err = None
                    try:
                        err = stderr_queue.get_nowait()
                    except Empty:
                        pass
                    # Avoid empty sys.excepthook errors from underlying future
                    # There is already a uAMQP issue in work for this
                    # https://github.com/Azure/azure-uamqp-python/issues/30
                    if err and "sys.excepthook" not in err:
                        err = err.lstrip()
                        err = err.lstrip('ERROR:')
                        if error_message:
                            err = "{}: {}".format(error_message, err)
                        self.output.error(err)
                    return False
        except KeyboardInterrupt:
            self.output.info('Terminating process...')
            self._terminate_process_tree()

        return True
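The stream threads above point at a `self._enqueue_stream` method that is not shown here. A plausible sketch (line handling and text-mode pipes are assumptions) simply pumps a pipe into the given queue:

    def _enqueue_stream(self, stream, queue):
        # Read the subprocess pipe until EOF, pushing each line onto the queue.
        for line in iter(stream.readline, ''):  # assumes text-mode pipes
            queue.put(line)
        stream.close()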
Code Example #5
    def test_threading(self):
        result_queue = Queue()

        with OverrideDB(CommCareCase, self.other_db_1):
            obj = _override_db.class_to_db

        def run():
            with OverrideDB(CommCareCase, self.other_db_2):
                result_queue.put(_override_db.class_to_db)

        t = threading.Thread(target=run)
        t.start()
        t.join()
        result = result_queue.get_nowait()
        self.assertNotEqual(id(obj), id(result))
Code Example #7
File: threaded.py Project: target-v/grab
class ThreadedTransport(object):
    def __init__(self, spider, thread_number):
        self.spider = spider
        self.thread_number = thread_number
        self.task_queue = Queue()
        self.result_queue = Queue()

        self.workers = []
        self.freelist = []
        for _ in six.moves.range(self.thread_number):
            thread = Thread(target=worker_thread, args=[
                self.task_queue,
                self.result_queue,
                self.freelist,
                self.spider.shutdown_event
            ])
            thread.daemon = True
            self.workers.append(thread)
            self.freelist.append(1)
            thread.start()

    def ready_for_task(self):
        return len(self.freelist)

    def get_free_threads_number(self):
        return len(self.freelist)

    def get_active_threads_number(self):
        return self.thread_number - len(self.freelist)

    def start_task_processing(self, task, grab, grab_config_backup):
        self.task_queue.put((task, grab, grab_config_backup))

    def process_handlers(self):
        pass

    def iterate_results(self):
        while True:
            try:
                result = self.result_queue.get_nowait()
            except Empty:
                break
            else:
                # FORMAT: {ok, grab, grab_config_backup, task,
                #          emsg, error_abbr}
                #grab.doc.error_code = None
                #grab.doc.error_msg = None
                yield result
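The constructor above hands each thread a `worker_thread` function that is not part of this excerpt. A hedged sketch, reconstructed from the arguments passed in and from the result FORMAT comment (the `grab.request()` call is a placeholder for the real fetching logic):

from queue import Empty

def worker_thread(task_queue, result_queue, freelist, shutdown_event):
    while not shutdown_event.is_set():
        try:
            task, grab, grab_config_backup = task_queue.get(timeout=0.1)
        except Empty:
            continue
        freelist.pop()                    # mark this thread as busy
        try:
            grab.request()                # placeholder for the actual fetch
            result_queue.put({'ok': True, 'grab': grab,
                              'grab_config_backup': grab_config_backup,
                              'task': task, 'emsg': None, 'error_abbr': None})
        except Exception as exc:
            result_queue.put({'ok': False, 'grab': grab,
                              'grab_config_backup': grab_config_backup,
                              'task': task, 'emsg': str(exc),
                              'error_abbr': type(exc).__name__})
        finally:
            freelist.append(1)            # mark this thread as free again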
Code Example #8
class FluentdEvent(object):
    def __init__(self, app=None):
        self.app = app
        if app is not None:
            self.init_app(app)
            # Send events after every request finishes
            app.after_request(self.send_events)

        # Unbounded queue for sent events
        self.queue = Queue()

    def init_app(self, app):
        tag_prefix = app.config.get("FLUENTD_EVENT_TAG_PREFIX",
                                    "flask.fluentd_event")
        host = app.config.get("FLUENTD_EVENT_HOST", "localhost")
        port = int(app.config.get("FLUENTD_EVENT_PORT", 24224))
        self._sender = sender.FluentSender(tag_prefix, host=host, port=port)

        # Use the newstyle teardown_appcontext if it's available,
        # otherwise fall back to the request context
        if hasattr(app, "teardown_appcontext"):
            app.teardown_appcontext(self.send_events)
        else:
            app.teardown_request(self.send_events)

    def event(self, tag, event):
        self.queue.put((tag, event))

    def send_events(self, exception):
        """
        Make a best-effort attempt to send all the events queued during the
        request; some may be missed.
        """
        pumping = True
        while pumping:
            try:
                tag, event = self.queue.get_nowait()
                self._sender.emit(tag, event)
                self.queue.task_done()
            except Empty:
                pumping = False
            except Exception as e:
                # This is bad but it's worse to foul the request because
                # of a logging issue
                logging.exception(e)
                self.queue.task_done()

        return exception
Code Example #9
File: flask_fluentd.py Project: thread/flask-fluentd
class Fluentd(object):
    def __init__(self, app=None):
        self.app = app
        if app is not None:
            self.init_app(app)
            # Send events after every request finishes
            app.after_request(self.send_events)

        # Unbounded queue for sent events
        self.queue = Queue()
        tag_label = app.config.get('EVENT_TAG_PREFIX', 'flask.fluentd')
        self._sender = sender.FluentSender(tag_label)

    def init_app(self, app):
        # Use the newstyle teardown_appcontext if it's available,
        # otherwise fall back to the request context
        if hasattr(app, 'teardown_appcontext'):
            app.teardown_appcontext(self.send_events)
        else:
            app.teardown_request(self.send_events)

    def event(self, pair):
        tag, evt = pair
        self.queue.put((tag, evt))

    def send_events(self, exception):
        """
        Make a best-effort attempt to send all the events queued during the
        request; some may be missed.
        """
        pumping = True
        while pumping:
            try:
                tag, evt = self.queue.get_nowait()
                self._sender.emit(tag, evt)
                self.queue.task_done()
            except Empty:
                pumping = False
            except Exception as e:
                # This is bad but it's worse to foul the request because
                # of a logging issue
                logging.exception(e)
                self.queue.task_done()

        return exception
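A hedged usage sketch for the extension above (route, tag, and payload are made up for illustration): events queued with `event()` during a request are drained and emitted by `send_events()` when the request finishes.

from flask import Flask

app = Flask(__name__)
app.config['EVENT_TAG_PREFIX'] = 'myapp'      # optional; defaults to 'flask.fluentd'
fluentd = Fluentd(app)

@app.route('/signup')
def signup():
    # Queued now, emitted to fluentd after the request tears down.
    fluentd.event(('user.signup', {'plan': 'free'}))
    return 'ok'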
Code Example #10
File: scheduler.py Project: zhaoduoyu/demo
class Scheduler():
    def __init__(self):
        self.q = Queue()

    def add_request(self, request):
        # Put a request into the queue
        self.q.put(request)

    def get_request(self):
        # Take a request out of the queue and return it
        try:
            request = self.q.get_nowait()
        except:
            request = None
        return request

    def _filter_request(self):
        '''Request de-duplication'''
        # Not implemented for now
        pass
Code Example #11
File: __init__.py Project: andrewlukoshko/vdsm
class NLSocketPool(object):
    """Pool of netlink sockets."""
    def __init__(self, size):
        if size <= 0:
            raise ValueError('Invalid socket pool size %r. Must be positive' % size)
        self._semaphore = BoundedSemaphore(size)
        self._sockets = Queue(maxsize=size)

    @contextmanager
    def socket(self):
        """Returns a socket from the pool (creating it when needed)."""
        with self._semaphore:
            try:
                sock = self._sockets.get_nowait()
            except Empty:
                sock = _open_socket()
            try:
                yield sock
            finally:
                self._sockets.put_nowait(sock)
Code Example #12
File: __init__.py Project: igoihman/vdsm
class NLSocketPool(object):
    """Pool of netlink sockets."""
    def __init__(self, size):
        if size <= 0:
            raise ValueError('Invalid socket pool size %r. Must be positive' % size)
        self._semaphore = BoundedSemaphore(size)
        self._sockets = Queue(maxsize=size)

    @contextmanager
    def socket(self):
        """Returns a socket from the pool (creating it when needed)."""
        with self._semaphore:
            try:
                sock = self._sockets.get_nowait()
            except Empty:
                sock = _open_socket()
            try:
                yield sock
            finally:
                self._sockets.put_nowait(sock)
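A hedged usage sketch (the pool size and what the caller does with the socket are assumptions): the context manager hands out a pooled socket, opening a new one only when the pool is empty, and always puts it back afterwards.

pool = NLSocketPool(size=4)

with pool.socket() as sock:
    read_netlink_events(sock)   # hypothetical caller-side function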
Code Example #13
File: fluentd.py Project: voidabhi/flask
class Fluentd(object):
    def __init__(self, app=None):
        self.app = app
        if app is not None:
            self.init_app(app)
            # Send events after every request finishes
            app.after_request(self.send_events)

        # Unbounded queue for sent events
        self.queue = Queue()
        tag_label = app.config.get('EVENT_TAG_PREFIX', 'flask.fluentd')
        self._sender = sender.FluentSender(tag_label)

    def init_app(self, app):
        # Use the newstyle teardown_appcontext if it's available,
        # otherwise fall back to the request context
        if hasattr(app, 'teardown_appcontext'):
            app.teardown_appcontext(self.send_events)
        else:
            app.teardown_request(self.send_events)

    def event(self, pair):
        tag, evt = pair
        self.queue.put((tag, evt))

    def send_events(self, exception):
        """
        Make a best-effort attempt to send all the events queued during the
        request; some may be missed.
        """
        pumping = True
        while pumping:
            try:
                tag, evt = self.queue.get_nowait()
                self._sender.emit(tag, evt)
                self.queue.task_done()
            except Empty:
                pumping = False

        return exception
Code Example #15
File: amqp.py Project: kuldat/anypubsub
class AmqpSubscriber(Subscriber):
    def __init__(self, amqp_chan, exchanges):
        self.channel = amqp_chan
        self.messages = Queue(maxsize=0)
        qname, _, _ = self.channel.queue_declare()
        for exchange in exchanges:
            self.channel.queue_bind(qname, exchange)
        self.channel.basic_consume(queue=qname, callback=self.callback)

    def callback(self, msg):
        self.channel.basic_ack(msg.delivery_tag)
        self.messages.put_nowait(msg.body)

    def __iter__(self):
        return self

    def next(self):
        while self.messages.empty():
            self.channel.wait()
        return self.messages.get_nowait()

    __next__ = next   # PY3
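A hedged usage sketch with the py-amqp client the class appears to be written against (broker address and exchange name are assumptions): because the class implements the iterator protocol, messages can be consumed with a plain for loop.

import amqp

conn = amqp.Connection(host='localhost:5672')
conn.connect()                              # needed with py-amqp 2.x
channel = conn.channel()

sub = AmqpSubscriber(channel, exchanges=['events'])
for body in sub:                            # blocks in channel.wait() between messages
    print(body)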
Code Example #16
File: scheduler.py Project: feel-easy/myspider
class Scheduler():
    def __init__(self):
        self.q = Queue()
        self.fp_set = set()
        self.total_repeat_nums = 0

    def add_request(self, request):
        # Put the request into the request queue;
        # only enqueue it if its fingerprint is not already in the set
        if self._filter_request(request):
            self.q.put(request)

    def get_request(self):
        # Take out a request; return None if the queue is empty
        try:
            request = self.q.get_nowait()
        except:
            request = None
        return request

    def _filter_request(self, request):
        '''Request de-duplication: if the fingerprint is not already in the set, add it and return True'''
        fp = self._gen_fp(request)
        if fp not in self.fp_set:
            self.fp_set.add(fp)
            return True
        self.total_repeat_nums += 1  # increment the duplicate-request counter
        logger.info("Duplicate request found: <{} {}>".format(request.method, request.url))
        return False

    def _gen_fp(self, request):
        # Return the request's fingerprint string

        url = canonicalize_url(request.url)
        method = request.method.upper()
        data = request.data if request.data else {}
        data = sorted(data.items(), key=lambda x: x[0])
        # Iterate over the data dict as (k, v) pairs, sorting by k.
        # Sorting by the first element is the default behaviour anyway;
        # key=lambda x: x[0] just makes it explicit (x is each (k, v) pair).
        # The result looks like [('a', 1), ('b', 2)].

        sha1 = hashlib.sha1()
        sha1.update(self._to_bytes(url))
        sha1.update(self._to_bytes(method))
        sha1.update(self._to_bytes(str(data)))
        fp = sha1.hexdigest()
        return fp

    def _to_bytes(self, string):
        """py2 py3 正好相反!"""
        if six.PY2:  # 判断当前是否是python2
            if isinstance(string, str):
                return string
            else:
                return string.encode()
        elif six.PY3:  # running under Python 3
            if isinstance(string, str):
                return string.encode()
            else:
                return string
Code Example #17
class YoutubeDLDownloader(object):
    """Python class for downloading videos using youtube-dl & subprocess.

    Attributes:
        OK, ERROR, STOPPED, ALREADY, FILESIZE_ABORT, WARNING (int): Integers
            that describe the return code from the download() method. The
            larger the number the higher is the hierarchy of the code.
            Codes with smaller hierarchy cannot overwrite codes with higher
            hierarchy.

    Args:
        youtubedl_path (string): Absolute path to youtube-dl binary.

        data_hook (function): Optional callback function to retrieve download
            process data.

        log_data (function): Optional callback function to write data to
            the log file.

    Warnings:
        The caller is responsible for calling the close() method after
        finishing with the object so that it can shut itself down properly.

    Example:
        How to use YoutubeDLDownloader from a python script.

            from downloaders import YoutubeDLDownloader

            def data_hook(data):
                print(data)

            downloader = YoutubeDLDownloader('/usr/bin/youtube-dl', data_hook)

            downloader.download(<URL STRING>, ['-f', 'flv'])

    """

    OK = 0
    WARNING = 1
    ERROR = 2
    FILESIZE_ABORT = 3
    ALREADY = 4
    STOPPED = 5

    def __init__(self, youtubedl_path, data_hook=None, log_data=None):
        self.youtubedl_path = youtubedl_path
        self.data_hook = data_hook
        self.log_data = log_data

        self._return_code = self.OK
        self._proc = None

        self._stderr_queue = Queue()
        self._stderr_reader = PipeReader(self._stderr_queue)

    def download(self, url, options):
        """Download url using given options.

        Args:
            url (string): URL string to download.
            options (list): Python list that contains youtube-dl options.

        Returns:
            An integer that shows the status of the download process.
            There are 6 different return codes.

            OK (0): The download process completed successfully.
            WARNING (1): A warning occurred during the download process.
            ERROR (2): An error occurred during the download process.
            FILESIZE_ABORT (3): The corresponding url video file was larger or
                smaller than the given filesize limit.
            ALREADY (4): The given url is already downloaded.
            STOPPED (5): The download process was stopped by the user.

        """
        self._return_code = self.OK

        cmd = self._get_cmd(url, options)
        self._create_process(cmd)

        if self._proc is not None:
            self._stderr_reader.attach_filedescriptor(self._proc.stderr)

        while self._proc_is_alive():
            stdout = self._proc.stdout.readline().rstrip()
            stdout = convert_item(stdout, to_unicode=True)

            if stdout:
                data_dict = extract_data(stdout)
                self._extract_info(data_dict)
                self._hook_data(data_dict)

        # Read stderr after download process has been completed
        # We don't need to read stderr in real time
        while not self._stderr_queue.empty():
            stderr = self._stderr_queue.get_nowait()
            if len(stderr) == 0:
                break
            stderr = convert_item(stderr.rstrip(), to_unicode=True)

            self._log(stderr)

            if self._is_warning(stderr):
                self._set_returncode(self.WARNING)
            else:
                self._set_returncode(self.ERROR)

        # Set return code to ERROR if we could not start the download process
        # or the child's return code is greater than zero
        # NOTE: In Linux if the called script is just empty Python exits
        # normally (ret=0), so we can't detect this or similar cases
        # using the code below
        # NOTE: In Unix a negative return code (-N) indicates that the child
        # was terminated by signal N (e.g. -9 = SIGKILL)
        if self._proc is None or self._proc.returncode > 0:
            self._return_code = self.ERROR

        if self._proc is not None and self._proc.returncode > 0:
            self._log('Child process exited with non-zero code: {}'.format(
                self._proc.returncode))

        self._last_data_hook()

        return self._return_code

    def stop(self):
        """Stop the download process and set return code to STOPPED. """
        if self._proc_is_alive():

            if os.name == 'nt':
                # os.killpg is not available on Windows
                # See: https://bugs.python.org/issue5115
                self._proc.kill()

                # When we kill the child process on Windows the return code
                # gets set to 1, so we want to reset the return code back to 0
                # in order to avoid creating logging output in the download(...)
                # method
                self._proc.returncode = 0
            else:
                os.killpg(self._proc.pid, signal.SIGKILL)

            self._set_returncode(self.STOPPED)

    def close(self):
        """Destructor like function for the object. """
        self._stderr_reader.join()

    def _set_returncode(self, code):
        """Set self._return_code only if the hierarchy of the given code is
        higher than the current self._return_code. """
        if code >= self._return_code:
            self._return_code = code

    def _is_warning(self, stderr):
        return stderr.split(':')[0] == 'WARNING'

    def _last_data_hook(self):
        """Set the last data information based on the return code. """
        data_dictionary = {}

        if self._return_code == self.OK:
            data_dictionary['status'] = 'Finished'
        elif self._return_code == self.ERROR:
            data_dictionary['status'] = 'Error'
            data_dictionary['speed'] = ''
            data_dictionary['eta'] = ''
        elif self._return_code == self.WARNING:
            data_dictionary['status'] = 'Warning'
            data_dictionary['speed'] = ''
            data_dictionary['eta'] = ''
        elif self._return_code == self.STOPPED:
            data_dictionary['status'] = 'Stopped'
            data_dictionary['speed'] = ''
            data_dictionary['eta'] = ''
        elif self._return_code == self.ALREADY:
            data_dictionary['status'] = 'Already Downloaded'
        else:
            data_dictionary['status'] = 'Filesize Abort'

        self._hook_data(data_dictionary)

    def _extract_info(self, data):
        """Extract informations about the download process from the given data.

        Args:
            data (dict): Python dictionary that contains different
                keys. The keys are not standardized; the dictionary can also be
                empty when there is no data to extract. See extract_data().

        """
        if 'status' in data:
            if data['status'] == 'Already Downloaded':
                # Set self._return_code to already downloaded
                # and trash that key
                self._set_returncode(self.ALREADY)
                data['status'] = None

            if data['status'] == 'Filesize Abort':
                # Set self._return_code to filesize abort
                # and trash that key
                self._set_returncode(self.FILESIZE_ABORT)
                data['status'] = None

    def _log(self, data):
        """Log data using the callback function. """
        if self.log_data is not None:
            self.log_data(data)

    def _hook_data(self, data):
        """Pass data back to the caller. """
        if self.data_hook is not None:
            self.data_hook(data)

    def _proc_is_alive(self):
        """Returns True if self._proc is alive else False. """
        if self._proc is None:
            return False

        return self._proc.poll() is None

    def _get_cmd(self, url, options):
        """Build the subprocess command.

        Args:
            url (string): URL string to download.
            options (list): Python list that contains youtube-dl options.

        Returns:
            Python list that contains the command to execute.

        """
        if os.name == 'nt':
            cmd = [self.youtubedl_path] + options + [url]
        else:
            cmd = ['python', self.youtubedl_path] + options + [url]

        return cmd

    def _create_process(self, cmd):
        """Create new subprocess.

        Args:
            cmd (list): Python list that contains the command to execute.

        """
        info = preexec = None

        # Keep a unicode copy of cmd for the log
        ucmd = cmd

        if os.name == 'nt':
            # Hide subprocess window
            info = subprocess.STARTUPINFO()
            info.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        else:
            # Make subprocess the process group leader
            # in order to kill the whole process group with os.killpg
            preexec = os.setsid

        # Encode command for subprocess
        # Refer to http://stackoverflow.com/a/9951851/35070
        if sys.version_info < (3, 0):
            cmd = convert_item(cmd, to_unicode=False)

        try:
            self._proc = subprocess.Popen(cmd,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE,
                                          preexec_fn=preexec,
                                          startupinfo=info)
        except (ValueError, OSError) as error:
            self._log('Failed to start process: {}'.format(ucmd))
            self._log(convert_item(str(error), to_unicode=True))
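The downloader relies on a `PipeReader` helper that is not included in this excerpt. A minimal thread-based sketch that matches the `attach_filedescriptor()` and `join()` calls used above (every detail here is an assumption, not the project's actual implementation):

from threading import Thread
import time


class PipeReader(Thread):

    """Assumed helper: continuously drain an attached file descriptor into a queue."""

    def __init__(self, queue):
        super(PipeReader, self).__init__()
        self.daemon = True
        self._queue = queue
        self._filedescriptor = None
        self._running = True
        self.start()

    def attach_filedescriptor(self, filedesc):
        # Give the reader thread a pipe (e.g. proc.stderr) to drain.
        self._filedescriptor = filedesc

    def run(self):
        while self._running:
            if self._filedescriptor is not None:
                for line in iter(self._filedescriptor.readline, b''):
                    self._queue.put(line)
                self._filedescriptor = None
            time.sleep(0.1)

    def join(self, timeout=None):
        # Ask the loop to stop, then wait for the thread to exit.
        self._running = False
        super(PipeReader, self).join(timeout)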
Code Example #18
class BatchTaskQueue(TaskQueue):
    """
    A class for managing batch async operations.
    """

    def __init__(self, work_func,
                 max_batch_size=DEFAULT_BATCH_SIZE, batch_cushion=DEFAULT_BATCH_CUSHION,
                 interval=DEFAULT_INTERVAL,
                 **kwargs):
        """
        :param work_func: Work function input params of list(item), items are added with add_item
        :type work_func: function
        :param max_batch_size: The max number of elements in a batch call
        :type max_batch_size: int
        :param batch_cushion: The batch cushion between items uploaded and the specified max
        :type batch_cushion: int
        :param interval: The interval between checking and uploading added items
        :type interval: int
        """

        super(BatchTaskQueue, self).__init__(**kwargs)
        self._max_batch_size = max_batch_size
        self._batch_cushion = batch_cushion

        self._batch_size = self._max_batch_size - self._batch_cushion
        if self._batch_size <= 0:
            self._logger.warning("Batch size - batch cushion is less than 1, defaulting to 1.")
            self._batch_size = MIN_BATCH_SIZE

        self._items = Queue()
        self._work_func = work_func

        self._daemon = Daemon(self._do_work,
                              interval,
                              _parent_logger=self._logger,
                              _ident="{}Daemon".format(self.identity))
        self._daemon.start()

    def add_item(self, item):
        """
        :param item: Item to add to the next batch
        :type item: object
        """
        self._items.put(item)

    def _handle_batch(self):
        batch = []
        for _ in range(self._batch_size):
            try:
                item = self._items.get_nowait()
                batch.append(item)
            except Empty:
                break

        self._logger.debug("Batch size {}.".format(len(batch)))
        if len(batch) > 0:
            self.add(self._work_func, batch)

    def _do_work(self):
        if not self._items.empty():
            queue_size = self._items.qsize()
            num_batches = 1 + int(queue_size / self._batch_size)
            with TaskQueue(_ident="BatchTaskQueueAdd_{}_Batches".format(num_batches)) as task_queue:
                for _ in range(num_batches):
                    task_queue.add(self._handle_batch)

    def __exit__(self, *args):
        super(BatchTaskQueue, self).__exit__(*args)
        self._daemon.stop()

    def flush(self, *args, **kwargs):
        self._do_work()
        super(BatchTaskQueue, self).flush(*args, **kwargs)
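A hedged usage sketch (the work function, item shape, and constructor arguments are illustrative): items added with `add_item()` accumulate in the internal queue, and the daemon periodically hands them to the work function in batches of at most `max_batch_size - batch_cushion`.

def upload_metrics(batch):
    # Assumed work function: receives a list of items queued via add_item().
    print('uploading {} metrics'.format(len(batch)))

task_queue = BatchTaskQueue(upload_metrics, max_batch_size=50,
                            batch_cushion=5, interval=1)
for value in range(200):
    task_queue.add_item({'metric': 'latency', 'value': value})
task_queue.flush()   # force any remaining items to be batched immediately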
Code Example #19
File: zmqsocket.py Project: pennmem/RAMControl
class SocketServer(object):
    """ZMQ-based socket server for sending and receiving messages from the host
    PC.

    Because of the weird way in which PyEPL handles events, we can't run this as
    its own thread, but instead have to poll for events in the general PyEPL
    machinery. In the future, we should clean up PyEPL entirely so that it does
    not block other threads (amongst other reasons).

    :param zmq.Context ctx:

    """
    def __init__(self, ctx=None):
        self.ctx = ctx or zmq.Context()

        self._handlers = []

        self.sock = self.ctx.socket(zmq.PAIR)
        self._bound = False

        self.poller = zmq.Poller()
        self.poller.register(self.sock, zmq.POLLIN)

        # Outgoing message queue
        self._out_queue = Queue()

        # time of last sent heartbeat message
        self._last_heartbeat = 0.

        # Logging of sent and received messages.
        self.logger = create_logger("network")

    def join(self):
        """Block until all outgoing messages have been processed."""
        self.logger.warning("Joining doesn't work yet; doing nothing...")
        # self._out_queue.join()

    def bind(self, address="tcp://*:8889"):
        """Bind the socket to start listening for connections.

        :param str address: ZMQ address string

        """
        self.sock.bind(address)
        self._bound = True

    def register_handler(self, func):
        """Register a message handler.

        :param callable func: Handler function which takes the message as its
            only argument.

        """
        self.logger.debug("Adding handler: %s", func.__name__)
        self._handlers.append(func)

    def enqueue_message(self, msg):
        """Submit a new outgoing message to the queue."""
        self._out_queue.put_nowait(msg)

    def send(self, msg):
        """Immediately transmit a message to the host PC. It is advisable to not
        call this method directly in most cases, but rather enqueue a message to
        be sent via :meth:`enqueue_message`.

        :param RAMMessage msg: Message to send.

        """
        out = msg.jsonize()
        try:
            self.log_message(msg, incoming=False)
            self.sock.send(out, zmq.NOBLOCK)
        except:
            self.logger.error("Sending failed!")

    def send_heartbeat(self):
        """Convenience method to send a heartbeat message to the host PC."""
        if time.time() - self._last_heartbeat >= 1.0:
            self.send(HeartbeatMessage())
            self._last_heartbeat = time.time()

    def log_message(self, message, incoming=True):
        """Log a message to the log file."""
        if not incoming:
            message = message.to_dict()

        message["in_or_out"] = "in" if incoming else "out"
        self.logger.info("%s", json.dumps(message))

    def handle_incoming(self):
        events = self.poller.poll(1)
        if self.sock in dict(events):
            try:
                msg = self.sock.recv_json()
                self.log_message(msg, incoming=True)
            except:
                self.logger.error("Unable to decode JSON.", exc_info=True)
                return

            for handler in self._handlers:
                try:
                    handler(msg)
                except:
                    self.logger.error("Error handling message", exc_info=True)
                    continue

    def handle_outgoing(self):
        try:
            while not self._out_queue.empty():
                msg = self._out_queue.get_nowait()
                self.send(msg)
                self._out_queue.task_done()  # so we can join the queue elsewhere
        except:
            self.logger.error("Error in outgoing message processing",
                              exc_info=True)

    def update(self):
        """Call periodically to check for incoming messages and/or send messages
        in the outgoing queue.

        """
        self.handle_incoming()
        self.handle_outgoing()
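A hedged sketch of how the server above might be driven (address, handler, and polling cadence are assumptions): because it cannot own a thread, the caller polls `update()` from its own loop to pump both directions.

import time

server = SocketServer()
server.bind("tcp://*:8889")
server.register_handler(lambda msg: print("received:", msg))

while True:                  # inside the experiment's existing polling loop
    server.send_heartbeat()  # throttled internally to once per second
    server.update()          # check for incoming messages, flush outgoing queue
    time.sleep(0.05)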
Code Example #20
File: coqtail.py Project: j-hui/Coqtail
class CoqtailHandler(StreamRequestHandler):
    """Forward messages between Vim and Coqtail."""

    # Redraw rate in seconds
    refresh_rate = 0.05

    # How often to check for a closed channel
    check_close_rate = 1

    # Is the channel open
    closed = False

    # Is a request currently being handled
    working = False

    # Is the client synchronous
    sync = False

    def parse_msgs(self):
        # type: () -> None
        """Parse messages sent over a Vim channel."""
        while not self.closed:
            try:
                msg = self.rfile.readline().decode("utf-8")
                msg_id, data = json.loads(msg)
            except ValueError:
                # Check if channel closed
                self.closed = True
                break

            if msg_id >= 0:
                bnum, func, args = data
                if func == "interrupt":
                    self.interrupt()
                else:
                    self.reqs.put((msg_id, bnum, func, args))
            else:
                # N.B. Accessing self.resps concurrently creates a race condition
                # where defaultdict could construct a Queue twice
                with self.resp_lk:
                    self.resps[-msg_id].put((msg_id, data))

    def get_msg(self, msg_id=None):
        # type: (Optional[int]) -> Sequence[Any]
        """Check for any pending messages from Vim."""
        if msg_id is None:
            queue = self.reqs  # type: Queue[Any]
        else:
            with self.resp_lk:
                queue = self.resps[msg_id]
        while not self.closed:
            try:
                return queue.get(timeout=self.check_close_rate)  # type: ignore
            except Empty:
                pass
        raise EOFError

    def handle(self):
        # type: () -> None
        """Forward requests from Vim to the appropriate Coqtail function."""
        self.coq = Coqtail(self)
        self.closed = False
        self.reqs = Queue(
        )  # type: Queue[Tuple[int, int, str, Mapping[str, Any]]]
        self.resps = ddict(
            Queue)  # type: DefaultDict[int, Queue[Tuple[int, Any]]]
        self.resp_lk = threading.Lock()

        read_thread = threading.Thread(target=self.parse_msgs)
        read_thread.daemon = True
        read_thread.start()

        while not self.closed:
            try:
                self.working = False
                self.msg_id, self.bnum, func, args = self.get_msg()
                self.refresh_time = 0.0
                self.working = True
            except EOFError:
                break

            handler = {
                "start": self.coq.start,
                "stop": self.coq.stop,
                "step": self.coq.step,
                "rewind": self.coq.rewind,
                "to_line": self.coq.to_line,
                "to_top": self.coq.to_top,
                "query": self.coq.query,
                "endpoint": self.coq.endpoint,
                "toggle_debug": self.coq.toggle_debug,
                "splash": self.coq.splash,
                "sync": self.coq.sync,
                "find_def": self.coq.find_def,
                "find_lib": self.coq.find_lib,
                "refresh": self.coq.refresh,
            }.get(func, None)

            try:
                ret = handler(
                    **args) if handler is not None else None  # type: ignore
                msg = [self.msg_id, {"buf": self.bnum, "ret": ret}]
                self.wfile.write(json.dumps(msg).encode("utf-8") + b"\n")
            # Python 2 doesn't have BrokenPipeError
            except (EOFError, OSError):
                break

            try:
                del self.resps[self.msg_id]
            except KeyError:
                pass

            if func == "stop":
                break

    def vimeval(self, expr, wait=True):
        # type: (List[Any], bool) -> Any
        """Send Vim a request."""
        if wait:
            expr += [-self.msg_id]
        self.wfile.write(json.dumps(expr).encode("utf-8") + b"\n")

        if wait:
            msg_id, res = self.get_msg(self.msg_id)
            assert msg_id == -self.msg_id
            return res
        return None

    def vimcall(self, expr, wait, *args):
        # type: (str, bool, *Any) -> Any
        """Request Vim to evaluate a function call."""
        return self.vimeval(["call", expr, args], wait=wait)

    def vimvar(self, var, val=None):
        # type: (str, Optional[Any]) -> Any
        """Get or set the value of a Vim variable."""
        if val is None:
            return self.vimcall("getbufvar", True, self.bnum, var)
        else:
            return self.vimcall("setbufvar", True, self.bnum, var, val)

    def refresh(self, goals=True, force=True, scroll=False):
        # type: (bool, bool, bool) -> None
        """Refresh the highlighting and auxiliary panels."""
        if not force:
            cur_time = time.time()
            force = cur_time - self.refresh_time > self.refresh_rate
            self.refresh_time = cur_time
        if force:
            self.vimcall(
                "coqtail#panels#refresh",
                self.sync,
                self.bnum,
                self.coq.highlights,
                self.coq.panels(goals),
                scroll,
            )

    def interrupt(self):
        # type: () -> None
        """Interrupt Coqtop and clear the request queue."""
        if self.coq.coqtop is not None and self.working:
            self.working = False
            while not self.reqs.empty():
                try:
                    self.reqs.get_nowait()
                except Empty:
                    break
            self.coq.coqtop.interrupt()
Code Example #21
class Schedule(object):
    """
    Schedule: the scheduler component
    """
    def __init__(self):
        self.queue = Queue()
        self.__filter_set = Set()
        self.total_request_num = 0
        self.total_repeat_num = 0

    def add_to_queue(self, request):
        """
        add request to queue if request not in __filter_set
        """
        fp = self.__get_fingerprint(request)

        if self._filter_request(fp, request):
            self.__filter_set.add_fp(fp)
            self.queue.put(request)
            self.total_request_num += 1
        else:
            self.total_repeat_num += 1

    def get(self):
        """
        get request from Schedule.queue
        :return request
        """
        try:
            request = self.queue.get_nowait()
        except:
            return None
        else:
            return request

    def _filter_request(self, fp, request):
        """
        Use set filter request
        """
        if fp in self.__filter_set:
            logger.info("Filter Request [{}] <{}>".format(
                request.method, request.url))
            return False
        else:
            # Not a duplicate request, so allow adding it to the request queue
            return True

    def __get_fingerprint(self, request):
        """
        Fingerprint-based de-duplication:
        compute a fingerprint from the url, method, params and data
        """
        import w3lib.url
        url = w3lib.url.canonicalize_url(request.url)

        method = request.method.upper()
        params = request.params if request.params else {}
        params = str(sorted(params.items(), key=lambda x: x[0]))

        data = request.data if request.data else {}
        data = str(sorted(data.items(), key=lambda x: x[0]))

        from hashlib import sha1

        sha1_data = sha1()
        sha1_data.update(self.get_utf8_str(url))
        sha1_data.update(self.get_utf8_str(method))
        sha1_data.update(self.get_utf8_str(params))
        sha1_data.update(self.get_utf8_str(data))

        fp = sha1_data.hexdigest()
        return fp

    def get_utf8_str(self, string):
        """
            Check the string type and encode Unicode strings as UTF-8
        """
        if six.PY2:
            if isinstance(string, str):
                return string
            else:
                return string.encode("utf-8")
        else:
            if isinstance(string, bytes):
                return string
            else:
                return string.encode("utf-8")
Code Example #22
class RabbitMQCrashStorage(CrashStorageBase):
    """This class is an implementation of a Socorro Crash Storage system.
    It is used as a crash queing methanism for raw crashes.  It implements
    the save_raw_crash method as a queue submission function, and the
    new_crashes generator as a queue consumption function.  Please note: as
    it only queues the crash_id and not the whole raw crash, it is not suitable
    to actually save a crash.  It is a very lossy container.  This class
    should be used in conjuction with a more persistant storage mechanism.

    The implementations CrashStorage classes can use arbitrarly high or low
    level semantics to talk to their underlying resource.  In the RabbitMQ,
    implementation, queing through the 'save_raw_crash' method is given full
    transactional semantics using the TransactorExecutor classes.  The
    'new_crashes' generator has a lower level relationship with the
    underlying connection object"""

    required_config = Namespace()
    required_config.add_option(
        'rabbitmq_class',
        default=ConnectionContextPooled,  # we choose a pooled connection
        # because we need thread safe
        # connection behaviors
        doc='the class responsible for connecting to RabbitMQ',
        reference_value_from='resource.rabbitmq',
    )
    required_config.add_option(
        'transaction_executor_class',
        default=
        'socorro.lib.transaction.TransactionExecutorWithInfiniteBackoff',
        doc='a class that will manage transactions',
        from_string_converter=class_converter,
        reference_value_from='resource.rabbitmq',
    )
    required_config.add_option(
        'routing_key',
        default='socorro.normal',
        doc='the name of the queue to receive crashes',
        reference_value_from='resource.rabbitmq',
    )
    required_config.add_option(
        'filter_on_legacy_processing',
        default=True,
        doc='toggle for using or ignoring the throttling flag',
        reference_value_from='resource.rabbitmq',
    )
    required_config.add_option(
        'throttle',
        default=100,
        doc='percentage of the time that rabbit will try to queue',
        reference_value_from='resource.rabbitmq',
    )

    def __init__(self, config, namespace='', quit_check_callback=None):
        super(RabbitMQCrashStorage,
              self).__init__(config,
                             namespace=namespace,
                             quit_check_callback=quit_check_callback)

        self.config = config

        # Note: this may continue to grow if we aren't acking certain UUIDs.
        # We should find a way to time out UUIDs after a certain time.
        self.acknowledgement_token_cache = {}
        self.acknowledgment_queue = Queue()

        self.rabbitmq = config.rabbitmq_class(config)
        self.transaction = config.transaction_executor_class(
            config, self.rabbitmq, quit_check_callback=quit_check_callback)

        # cache this object so we don't have to remake it for every transaction
        self._basic_properties = pika.BasicProperties(
            delivery_mode=2,  # make message persistent
        )

        if config.throttle == 100:
            self.dont_queue_this_crash = lambda: False
        else:
            self.dont_queue_this_crash = (
                lambda: randint(1, 100) > config.throttle)

    def save_raw_crash(self, raw_crash, dumps, crash_id):
        if self.dont_queue_this_crash():
            self.config.logger.info(
                'Crash %s filtered out of RabbitMQ queue %s', crash_id,
                self.config.routing_key)
            return
        try:
            this_crash_should_be_queued = (
                not self.config.filter_on_legacy_processing
                or raw_crash.legacy_processing == 0)
        except KeyError:
            self.config.logger.debug(
                'RabbitMQCrashStorage legacy_processing key absent in crash '
                '%s', crash_id)
            return

        if this_crash_should_be_queued:
            self.config.logger.debug('RabbitMQCrashStorage saving crash %s',
                                     crash_id)
            self.transaction(self._save_raw_crash_transaction, crash_id)
            return True
        else:
            self.config.logger.debug(
                'RabbitMQCrashStorage not saving crash %s, legacy processing '
                'flag is %s', crash_id, raw_crash.legacy_processing)

    def _save_raw_crash_transaction(self, connection, crash_id):
        connection.channel.basic_publish(exchange='',
                                         routing_key=self.config.routing_key,
                                         body=crash_id,
                                         properties=self._basic_properties)

    def _basic_get_transaction(self, conn, queue):
        """reorganize the the call to rabbitmq basic_get so that it can be
        used by the transaction retry wrapper."""
        things = conn.channel.basic_get(queue=queue)
        return things

    def new_crashes(self):
        """This generator fetches crash_ids from RabbitMQ."""

        # We've set up RabbitMQ to require acknowledgement of processing of a
        # crash_id from this generator.  It is the responsibility of the
        # consumer of the crash_id to tell this instance of the class when it
        # has completed its work on the crash_id.  That is done with the call to
        # 'ack_crash' below.  Because RabbitMQ connections are not thread safe,
        # only the thread that read the crash may acknowledge it.  'ack_crash'
        # queues the crash_id. The '_consume_acknowledgement_queue' function
        # is run to send acknowledgments back to RabbitMQ
        self._consume_acknowledgement_queue()
        queues = [
            self.rabbitmq.config.priority_queue_name,
            self.rabbitmq.config.standard_queue_name,
            self.rabbitmq.config.reprocessing_queue_name,
            self.rabbitmq.config.priority_queue_name,
        ]
        while True:
            for queue in queues:
                method_frame, header_frame, body = self.transaction(
                    self._basic_get_transaction, queue=queue)
                if method_frame and self._suppress_duplicate_jobs(
                        body, method_frame):
                    continue
                if method_frame:
                    break
            # must consume ack queue before testing for end of iterator
            # or the last job won't get ack'd
            self._consume_acknowledgement_queue()
            if not method_frame:
                # there was nothing in the queue - leave the iterator
                return
            self.acknowledgement_token_cache[body] = method_frame
            yield body
            queues.reverse()

    def ack_crash(self, crash_id):
        self.acknowledgment_queue.put(crash_id)

    def _suppress_duplicate_jobs(self, crash_id, acknowledgement_token):
        """if this crash is in the cache, then it is already in progress
        and this is a duplicate.  Acknowledge it, then return to True
        to let the caller know to skip on to the next crash."""
        if crash_id in self.acknowledgement_token_cache:
            # reject this crash - it's already being processed
            self.config.logger.info('duplicate job: %s is already in progress',
                                    crash_id)
            # ack this
            self.transaction(self._transaction_ack_crash, crash_id,
                             acknowledgement_token)
            return True
        return False

    def _consume_acknowledgement_queue(self):
        """The acknowledgement of the processing of each crash_id yielded
        from the 'new_crashes' method must take place on the same connection
        that the crash_id came from.  The crash_ids are queued in the
        'acknowledgment_queue'.  That queue is consumed by the QueuingThread"""
        try:
            while True:
                crash_id_to_be_acknowledged = \
                    self.acknowledgment_queue.get_nowait()
                # self.config.logger.debug(
                #     'RabbitMQCrashStorage set to acknowledge %s',
                #     crash_id_to_be_acknowledged
                # )
                try:
                    acknowledgement_token = \
                        self.acknowledgement_token_cache[
                            crash_id_to_be_acknowledged
                        ]
                    self.transaction(self._transaction_ack_crash,
                                     crash_id_to_be_acknowledged,
                                     acknowledgement_token)
                    del self.acknowledgement_token_cache[
                        crash_id_to_be_acknowledged]
                except KeyError:
                    self.config.logger.warning(
                        'RabbitMQCrashStorage tried to acknowledge crash %s'
                        ', which was not in the cache',
                        crash_id_to_be_acknowledged,
                        exc_info=True)
                except Exception:
                    self.config.logger.error(
                        'RabbitMQCrashStorage unexpected failure on %s',
                        crash_id_to_be_acknowledged,
                        exc_info=True)

        except Empty:
            pass  # nothing to do with an empty queue

    def _transaction_ack_crash(self, connection, crash_id,
                               acknowledgement_token):
        connection.channel.basic_ack(
            delivery_tag=acknowledgement_token.delivery_tag)
        self.config.logger.debug(
            'RabbitMQCrashStorage acking %s with delivery_tag %s', crash_id,
            acknowledgement_token.delivery_tag)
Code Example #23
File: coqtop.py Project: yichaolemon/Coqtail
class Coqtop(object):
    """Provide an interface to the background Coqtop process."""
    def __init__(self, version, done_callback):
        # type: (Text, Callable[[], None]) -> None
        """Initialize Coqtop state.

        coqtop - The Coqtop process
        done_callback - A function to call when finished waiting for Coqtop
        states - A stack of previous state_ids (grows to the right)
        state_id - The current state_id
        root_state - The starting state_id
        out_q - A thread-safe queue of data read from Coqtop
        xml - The XML interface for the given version
        """
        self.coqtop = None  # type: Optional[subprocess.Popen[bytes]]
        self.done_callback = done_callback
        self.states = []  # type: List[int]
        self.state_id = -1
        self.root_state = -1
        self.out_q = Queue()  # type: Queue[bytes]
        self.xml = XMLInterface(version)
        self.stopping = False

        # Debugging
        self.log = None  # type: Optional[IO[Text]]
        self.handler = logging.NullHandler()  # type: logging.Handler
        self.logger = logging.getLogger(str(id(self)))
        self.logger.addHandler(self.handler)
        self.logger.setLevel(logging.INFO)

    # Coqtop Interface #
    # These are expressed as generators that spawn a thread to interact with
    # Coqtop, yield and wait to be told whether the user interrupted with
    # CTRL-C, then yield the final result. This is done because Vim cannot
    # capture signals while running Python plugins, so we have to busy wait in
    # Vim instead.

    # Ideally the type would be Generator[None, bool, bool] and the final
    # 'yield's would be 'return's, but Python 2 doesn't support returning
    # values from generators.
    def start(self, coq_path, *args, **kwargs):
        # type: (str, *str, **int) -> Generator[Optional[bool], bool, None]
        """Launch the Coqtop process."""
        assert self.coqtop is None

        self.logger.debug("start")
        timeout = kwargs.get("timeout", None)

        for launch in self.xml.launch(coq_path):
            try:
                self.coqtop = subprocess.Popen(
                    launch + args,
                    stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    bufsize=0,
                )

                # Spawn threads to monitor Coqtop's stdout and stderr
                for f in (self.capture_out, self.capture_err,
                          self.capture_dead):
                    read_thread = threading.Thread(target=f)
                    read_thread.daemon = True
                    read_thread.start()

                # Initialize Coqtop
                call = self.call(self.xml.init(), timeout=timeout)
                next(call)
                stopped = yield  # type: ignore[misc] # (see comment above start())
                response = call.send(stopped)

                if isinstance(response, Err):
                    yield False
                    return

                self.root_state = response.val
                self.state_id = response.val

                yield True
                return
            except OSError:
                continue

        # Failed to launch Coqtop
        self.coqtop = None
        self.done_callback()
        yield  # type: ignore[misc] # (see comment above start())
        yield False

    def stop(self):
        # type: () -> None
        """End the Coqtop process."""
        if self.coqtop is not None:
            self.logger.debug("stop")
            self.stopping = True

            # Close debugging log
            self.handler.flush()
            self.handler.close()
            if self.log is not None:
                self.log.close()

            try:
                # Try to terminate Coqtop cleanly
                # TODO: use Quit call
                self.coqtop.terminate()
                self.coqtop.communicate()
            except (OSError, ValueError, AttributeError):
                try:
                    # Force Coqtop to stop
                    self.coqtop.kill()
                except (OSError, AttributeError):
                    pass

            self.coqtop = None

    def advance(
            self,
            cmd,  # type: Text
            encoding="utf-8",  # type: str
            timeout=None,  # type: Optional[int]
    ):
        # type: (...) -> Generator[Tuple[bool, Text, Optional[Tuple[int, int]]], bool, None]
        """Advance Coqtop by sending 'cmd'."""
        self.logger.debug("advance: %s", cmd)
        call = self.call(self.xml.add(cmd, self.state_id, encoding=encoding),
                         timeout=timeout)
        next(call)
        stopped = yield  # type: ignore[misc] # (see comment above start())
        response = call.send(stopped)

        if isinstance(response, Err):
            yield False, response.msg, response.loc
            return

        # In addition to sending 'cmd', also check status in order to force it
        # to be evaluated
        call = self.call(self.xml.status(encoding=encoding), timeout=timeout)
        next(call)
        stopped = yield  # type: ignore[misc] # (see comment above start())
        status = call.send(stopped)

        # Combine messages
        msgs = "\n\n".join(msg for msg in (response.msg,
                                           response.val["res_msg"], status.msg)
                           if msg != "")

        if isinstance(status, Err):
            # Reset state id to before the error
            call = self.call(self.xml.edit_at(self.state_id, 1))
            next(call)
            yield  # type: ignore[misc] # (see comment above start())
            call.send(False)
            yield False, msgs, status.loc
            return

        self.states.append(self.state_id)
        self.state_id = response.val["state_id"]

        yield True, msgs, None

    def rewind(self, steps=1):
        # type: (int) -> Generator[Tuple[bool, int], bool, None]
        """Go back 'steps' states."""
        self.logger.debug("rewind: %d", steps)
        if steps > len(self.states):
            # Clamp 'steps' to the number of recorded states *before* clearing
            # them (otherwise the 8.4 'rewind' call would rewind 0 steps)
            steps = len(self.states)
            self.state_id = self.root_state
            self.states = []
        else:
            # In 8.4 query and option commands will be recorded with
            # state_id = -1. Need to count them and reduce number of steps to
            # rewind so Coqtop doesn't go too far back
            fake_steps = sum(s == -1 for s in self.states[-steps:])
            if self.states[-steps] != -1:
                self.state_id = self.states[-steps]
            else:
                self.state_id = 0
            self.states = self.states[:-steps]
            steps -= fake_steps

        call = self.call(self.xml.edit_at(self.state_id, steps))
        next(call)
        stopped = yield  # type: ignore[misc] # (see comment above start())
        response = call.send(stopped)

        if isinstance(response, Ok):
            yield True, response.val
        else:
            yield False, 0

    def query(
            self,
            cmd,  # type: Text
            in_script,  # type: bool
            encoding="utf-8",  # type: str
            timeout=None,  # type: Optional[int]
    ):
        # type: (...) -> Generator[Tuple[bool, Text, Optional[Tuple[int, int]]], bool, None]
        """Query Coqtop with 'cmd'."""
        self.logger.debug("query: %s", cmd)
        call = self.call(self.xml.query(cmd, self.state_id, encoding=encoding),
                         timeout=timeout)
        next(call)
        stopped = yield  # type: ignore[misc] # (see comment above start())
        response = call.send(stopped)

        if isinstance(response, Ok):
            # If the query was called from within the script we need to record
            # the state id so rewinding will work properly. Since 8.4 uses
            # number of steps rather than state ids, record '-1' to indicate
            # that no rewind should actually be done
            if in_script:
                if self.xml.versions >= (8, 5, 0):
                    self.states.append(self.state_id)
                else:
                    self.states.append(-1)
            yield True, response.msg, None
        else:
            yield False, response.msg, response.loc

    def goals(self, timeout=None):
        # type: (Optional[int]) -> Generator[Tuple[bool, Text, Optional[Tuple[List[Any], List[Any], List[Any], List[Any]]]], bool, None]
        """Get the current set of hypotheses and goals."""
        self.logger.debug("goals")
        call = self.call(self.xml.goal(), timeout=timeout)
        next(call)
        stopped = yield  # type: ignore[misc] # (see comment above start())
        response = call.send(stopped)

        if isinstance(response, Ok):
            yield True, response.msg, response.val
        else:
            yield False, "", None

    def mk_cases(self, ty, encoding="utf-8", timeout=None):
        # type: (Text, str, Optional[int]) -> Generator[Tuple[bool, Text], bool, None]
        """Return cases for each constructor of 'ty'."""
        self.logger.debug("mk_cases: %s", ty)
        call = self.call(self.xml.mk_cases(ty, encoding=encoding),
                         timeout=timeout)
        next(call)
        stopped = yield  # type: ignore[misc] # (see comment above start())
        response = call.send(stopped)

        if isinstance(response, Ok):
            yield True, response.val
        else:
            yield False, response.msg

    def do_option(
            self,
            cmd,  # type: Text
            in_script,  # type: bool
            encoding="utf-8",  # type: str
            timeout=None,  # type: Optional[int]
    ):
        # type: (...) -> Generator[Tuple[bool, Text, Optional[Tuple[int, int]]], bool, None]
        """Set or get an option."""
        self.logger.debug("do_option: %s", cmd)
        vals, opt = self.xml.parse_option(cmd)

        if vals is None:
            call = self.call(self.xml.get_options(encoding=encoding),
                             timeout=timeout)
            next(call)
            stopped = yield  # type: ignore[misc] # (see comment above start())
            response = call.send(stopped)

            if isinstance(response, Ok):
                optval = [(val, desc) for name, desc, val in response.val
                          if name == opt]

                if optval != []:
                    ret = "{}: {}".format(optval[0][1],
                                          optval[0][0])  # type: Text
                else:
                    ret = "Invalid option name"
        else:
            for val in vals:
                call = self.call(self.xml.set_options(opt,
                                                      val,
                                                      encoding=encoding),
                                 timeout=timeout)
                next(call)
                stopped = yield  # type: ignore[misc] # (see comment above start())
                response = call.send(stopped)
                ret = response.msg
                if isinstance(response, Ok):
                    break

        if isinstance(response, Ok):
            # Hack to associate setting an option with a new state id by
            # executing a noop so it works correctly with rewinding
            if in_script:
                noop_call = self.advance(self.xml.noop, encoding)
                next(noop_call)
                while True:
                    yield  # type: ignore[misc] # (see comment above start())
                    noop_ret = noop_call.send(False)
                    if noop_ret is not None:
                        success, _, _ = noop_ret
                        assert success
                        break
            yield True, ret, None
        else:
            yield False, response.msg, response.loc

    def dispatch(
            self,
            cmd,  # type: Text
            in_script=True,  # type: bool
            encoding="utf-8",  # type: str
            timeout=None,  # type: Optional[int]
    ):
        # type: (...) -> Generator[Tuple[bool, Text, Optional[Tuple[int, int]]], bool, None]
        """Decide whether 'cmd' is setting/getting an option, a query, or a
        regular command.
        """
        # Make sure 'cmd' is a string format that supports unicode
        cmd = ensure_text(cmd, encoding)  # type: ignore[no-untyped-call]

        if self.xml.is_option(cmd):
            call = self.do_option(cmd, in_script, encoding, timeout)
        elif self.xml.is_query(cmd):
            call = self.query(cmd, in_script, encoding, timeout)
        elif in_script:
            call = self.advance(cmd, encoding, timeout)
        else:
            self.done_callback()
            yield  # type: ignore[misc] # (see comment above start())
            yield True, "Command only allowed in script.", None
            return

        next(call)
        while True:
            stopped = yield  # type: ignore[misc] # (see comment above start())
            ret = call.send(stopped)
            if ret is not None:
                yield ret
                break

    # Interacting with Coqtop #
    def call(
            self,
            cmdtype_msg,  # type: Tuple[Text, Optional[bytes]]
            timeout=None,  # type: Optional[int]
    ):
        # type: (...) -> Generator[Union[Ok, Err], bool, None]
        """Send 'msg' to the Coqtop process and wait for the response."""
        # Check if Coqtop has stopped
        if not self.running():
            raise CoqtopError("Coqtop is not running.")

        # Throw away any unread messages
        self.empty_out()

        cmd, msg = cmdtype_msg

        # 'msg' can be None if a command does not exist for a particular
        # version and is being faked.
        # N.B. It is important that the '_standardize' function being called
        # does not depend on the value it is passed since it is None
        if msg is None:
            self.done_callback()
            yield  # type: ignore[misc] # (see comment above start())
            yield self.xml.standardize(cmd, Ok(None))
            return

        # Don't bother doing prettyxml if debugging isn't on
        if self.logger.isEnabledFor(logging.DEBUG):
            self.logger.debug(prettyxml(msg))
        self.send_cmd(msg)

        if timeout == 0:
            timeout = None

        # The got_response event tells the timeout_thread that get_answer()
        # returned normally, while timed_out will be set by timeout_thread if
        # time runs out without receiving a response
        got_response = threading.Event()
        timed_out = threading.Event()
        timeout_thread = threading.Thread(target=self.timeout_thread,
                                          args=(timeout, got_response,
                                                timed_out))
        timeout_thread.daemon = True

        # Start a thread to get Coqtop's response
        res_ref = Ref()
        answer_thread = threading.Thread(target=self.get_answer,
                                         args=(res_ref, ))
        answer_thread.daemon = True

        # Start threads and yield back to caller to wait for Coqtop to finish
        timeout_thread.start()
        answer_thread.start()
        stopped = yield  # type: ignore[misc] # (see comment above start())

        # Notify timeout_thread that a response is received and wait for
        # threads to finish
        got_response.set()
        timeout_thread.join()
        answer_thread.join()

        response = res_ref.val

        # Check for user interrupt or timeout
        if isinstance(response, Err):
            if stopped:
                response = STOPPED_ERR
            elif timed_out.is_set():
                response = TIMEOUT_ERR

        yield self.xml.standardize(cmd, response)

    def timeout_thread(self, timeout, got_response, timed_out):
        # type: (Optional[int], threading.Event, threading.Event) -> None
        """Wait on the 'got_response' Event for timeout seconds and set
        'timed_out' and interrupt the Coqtop process if it is not set in
        time.
        """
        if self.coqtop is None:
            raise CoqtopError("coqtop must not be None in timeout_thread()")

        if not got_response.wait(timeout):
            self.interrupt()
            timed_out.set()

    def get_answer(self, res_ref):
        # type: (Ref) -> None
        """Read from 'out_q' and wait until a full response is received."""
        data = []

        while True:
            data.append(self.out_q.get())
            xml = b"".join(data)
            if not self.xml.worth_parsing(xml):
                continue
            response = self.xml.raw_response(xml)

            if response is None:
                continue

            # Don't bother doing prettyxml if debugging isn't on
            if self.logger.isEnabledFor(logging.DEBUG):
                self.logger.debug(
                    prettyxml(b"<response>" + xml + b"</response>"))
            res_ref.val = response
            # Notify the caller that Coqtop is done
            self.done_callback()
            break

    def empty_out(self):
        # type: () -> None
        """Pop data until 'out_q' is empty."""
        while not self.out_q.empty():
            try:
                self.out_q.get_nowait()
            except Empty:
                return

    def capture_out(self):
        # type: () -> None
        """Continually read data from Coqtop's stdout into 'out_q'."""
        if self.coqtop is None:
            raise CoqtopError("coqtop must not be None in capture_out()")
        if self.coqtop.stdout is None:
            raise CoqtopError(
                "coqtop stdout must not be None in capture_out()")
        fd = self.coqtop.stdout.fileno()

        while not self.stopping:
            try:
                self.out_q.put(os.read(fd, 0x10000))
            except (AttributeError, OSError, ValueError):
                # Coqtop died
                return

    def capture_err(self):
        # type: () -> None
        """Continually read data from Coqtop's stderr and print it."""
        if self.coqtop is None:
            raise CoqtopError("coqtop must not be None in capture_err()")
        if self.coqtop.stderr is None:
            raise CoqtopError(
                "coqtop stderr must not be None in capture_err()")
        fd = self.coqtop.stderr.fileno()

        while not self.stopping:
            try:
                print(os.read(fd, 0x10000).decode())
            except (AttributeError, OSError, ValueError):
                # Coqtop died
                return

    def capture_dead(self):
        # type: () -> None
        """Continually check if Coqtop has died."""
        while self.running():
            time.sleep(1)
        self.stop()

    def send_cmd(self, cmd):
        # type: (bytes) -> None
        """Write to Coqtop's stdin."""
        if self.coqtop is None:
            raise CoqtopError("coqtop must not be None in send_cmd()")
        if self.coqtop.stdin is None:
            raise CoqtopError("coqtop stdin must not be None in send_cmd()")

        self.coqtop.stdin.write(cmd)
        self.coqtop.stdin.flush()

    def interrupt(self):
        # type: () -> None
        """Send a SIGINT signal to Coqtop."""
        if self.coqtop is None:
            raise CoqtopError("Coqtop is not running.")
        self.coqtop.send_signal(signal.SIGINT)

    # Current State #
    def running(self):
        # type: () -> bool
        """Check if Coqtop has already been started."""
        return self.coqtop is not None and self.coqtop.poll() is None

    # Debugging #
    def toggle_debug(self):
        # type: () -> Optional[str]
        """Enable or disable logging of debug messages."""
        self.logger.removeHandler(self.handler)
        self.handler.flush()
        self.handler.close()

        if self.log is None:
            # Create unique log file
            pre = "coqtop_{}_".format(
                datetime.datetime.now().strftime("%y%m%d_%H%M%S"))
            fmt = logging.Formatter("%(asctime)s: %(message)s")
            # Python 2 says _TemporaryFileWrapper is incompatible with IO[Text]
            self.log = NamedTemporaryFile(
                mode="w", prefix=pre, delete=False)  # type: ignore[assignment]
            self.handler = logging.StreamHandler(self.log)
            self.handler.setFormatter(fmt)
            self.logger.addHandler(self.handler)
            self.logger.setLevel(logging.DEBUG)
            return self.log.name  # type: ignore[no-any-return, attr-defined] # (see above)
        else:
            # Clean up old logging
            self.log.close()

            # Set to null logging
            self.log = None
            self.handler = logging.NullHandler()
            self.logger.addHandler(self.handler)
            self.logger.setLevel(logging.CRITICAL)
            return None
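The Ref object used by call() and get_answer() above is not defined in this excerpt; it is presumably just a minimal mutable holder that lets the answer thread hand its result back to the waiting generator. A sketch of such a class (the name comes from the code above, but the shape shown here is assumed):

class Ref(object):
    """Holder for a single mutable value shared between threads (sketch)."""
    def __init__(self, val=None):
        self.val = val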
Code example #24
0
File: handler.py  Project: ImmPortDB/immport-galaxy
class JobHandlerQueue(Monitors):
    """
    Job Handler's internal queue; this is what actually implements waiting for
    jobs to become runnable and dispatching them to a JobRunner.
    """
    STOP_SIGNAL = object()

    def __init__(self, app, dispatcher):
        """Initializes the Job Handler Queue, creates (unstarted) monitoring thread"""
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context
        self.track_jobs_in_database = self.app.config.track_jobs_in_database

        # Initialize structures for handling job limits
        self.__clear_job_count()

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting_jobs = []
        # Contains wrappers of jobs that are limited or ready (so they aren't created unnecessarily/multiple times)
        self.job_wrappers = {}
        name = "JobHandlerQueue.monitor_thread"
        self._init_monitor_thread(name, target=self.__monitor, config=app.config)

    def start(self):
        """
        Starts the JobHandler's thread after checking for any unhandled jobs.
        """
        log.debug('Handler queue starting for jobs assigned to handler: %s', self.app.config.server_name)
        # Recover jobs at startup
        self.__check_jobs_at_startup()
        # Start the queue
        self.monitor_thread.start()
        # The stack code is initialized in the application
        JobHandlerMessage().bind_default_handler(self, '_handle_message')
        self.app.application_stack.register_message_handler(self._handle_message, name=JobHandlerMessage.target)
        log.info("job handler queue started")

    def job_wrapper(self, job, use_persisted_destination=False):
        return JobWrapper(job, self, use_persisted_destination=use_persisted_destination)

    def job_pair_for_id(self, id):
        job = self.sa_session.query(model.Job).get(id)
        return job, self.job_wrapper(job, use_persisted_destination=True)

    def __write_registry_file_if_absent(self, job):
        # TODO: remove this and the one place it is called in late 2018, this
        # hack attempts to minimize the job failures due to upgrades from 17.05
        # Galaxies.
        job_wrapper = self.job_wrapper(job)
        cwd = job_wrapper.working_directory
        datatypes_config = os.path.join(cwd, "registry.xml")
        if not os.path.exists(datatypes_config):
            try:
                self.app.datatypes_registry.to_xml_file(path=datatypes_config)
            except OSError:
                pass

    def __check_jobs_at_startup(self):
        """
        Checks all jobs that are in the 'new', 'queued' or 'running' state in
        the database and requeues or cleans up as necessary.  Only run as the
        job handler starts.
        If user activation is enforced, jobs belonging to inactive users are filtered out.
        """
        jobs_at_startup = []
        if self.track_jobs_in_database:
            in_list = (model.Job.states.QUEUED,
                       model.Job.states.RUNNING)
        else:
            in_list = (model.Job.states.NEW,
                       model.Job.states.QUEUED,
                       model.Job.states.RUNNING)
        if self.app.config.user_activation_on:
            jobs_at_startup = self.sa_session.query(model.Job).enable_eagerloads(False) \
                .outerjoin(model.User) \
                .filter(model.Job.state.in_(in_list) &
                        (model.Job.handler == self.app.config.server_name) &
                        or_((model.Job.user_id == null()), (model.User.active == true()))).all()
        else:
            jobs_at_startup = self.sa_session.query(model.Job).enable_eagerloads(False) \
                .filter(model.Job.state.in_(in_list) &
                        (model.Job.handler == self.app.config.server_name)).all()

        for job in jobs_at_startup:
            self.__write_registry_file_if_absent(job)
            if not self.app.toolbox.has_tool(job.tool_id, job.tool_version, exact=True):
                log.warning("(%s) Tool '%s' removed from tool config, unable to recover job" % (job.id, job.tool_id))
                self.job_wrapper(job).fail('This tool was disabled before the job completed.  Please contact your Galaxy administrator.')
            elif job.job_runner_name is not None and job.job_runner_external_id is None:
                # This could happen during certain revisions of Galaxy where a runner URL was persisted before the job was dispatched to a runner.
                log.debug("(%s) Job runner assigned but no external ID recorded, adding to the job handler queue" % job.id)
                job.job_runner_name = None
                if self.track_jobs_in_database:
                    job.set_state(model.Job.states.NEW)
                else:
                    self.queue.put((job.id, job.tool_id))
            elif job.job_runner_name is not None and job.job_runner_external_id is not None and job.destination_id is None:
                # This is the first start after upgrading from URLs to destinations, convert the URL to a destination and persist
                job_wrapper = self.job_wrapper(job)
                job_destination = self.dispatcher.url_to_destination(job.job_runner_name)
                if job_destination.id is None:
                    job_destination.id = 'legacy_url'
                job_wrapper.set_job_destination(job_destination, job.job_runner_external_id)
                self.dispatcher.recover(job, job_wrapper)
                log.info('(%s) Converted job from a URL to a destination and recovered' % (job.id))
            elif job.job_runner_name is None:
                # Never (fully) dispatched
                log.debug("(%s) No job runner assigned and job still in '%s' state, adding to the job handler queue" % (job.id, job.state))
                if self.track_jobs_in_database:
                    job.set_state(model.Job.states.NEW)
                else:
                    self.queue.put((job.id, job.tool_id))
            else:
                # Already dispatched and running
                job_wrapper = self.__recover_job_wrapper(job)
                self.dispatcher.recover(job, job_wrapper)
        if self.sa_session.dirty:
            self.sa_session.flush()

    def __recover_job_wrapper(self, job):
        # Already dispatched and running
        job_wrapper = self.job_wrapper(job)
        # Use the persisted destination as its params may differ from
        # what's in the job_conf xml
        job_destination = JobDestination(id=job.destination_id, runner=job.job_runner_name, params=job.destination_params)
        # resubmits are not persisted (it's a good thing) so they
        # should be added back to the in-memory destination on startup
        try:
            config_job_destination = self.app.job_config.get_destination(job.destination_id)
            job_destination.resubmit = config_job_destination.resubmit
        except KeyError:
            log.debug('(%s) Recovered destination id (%s) does not exist in job config (but this may be normal in the case of a dynamically generated destination)', job.id, job.destination_id)
        job_wrapper.job_runner_mapper.cached_job_destination = job_destination
        return job_wrapper

    def __monitor(self):
        """
        Continually iterate over the waiting jobs, checking if each is ready to
        run and dispatching if so.
        """
        while self.monitor_running:
            try:
                # If jobs are locked, there's nothing to monitor and we skip
                # to the sleep.
                if not self.app.job_manager.job_lock:
                    self.__monitor_step()
            except Exception:
                log.exception("Exception in monitor_step")
            self._monitor_sleep(1)

    def __monitor_step(self):
        """
        Called repeatedly by `monitor` to process waiting jobs. Gets any new
        jobs (either from the database or from its own queue), then iterates
        over all new and waiting jobs to check the state of the jobs each
        depends on. If the job has dependencies that have not finished, it
        goes to the waiting queue. If the job has dependencies with errors,
        it is marked as having errors and removed from the queue. If the job
        belongs to an inactive user it is ignored.
        Otherwise, the job is dispatched.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        resubmit_jobs = []
        if self.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            hda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputDatasetAssociation) \
                .join(model.HistoryDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_((model.Job.state == model.Job.states.NEW),
                             or_((model.HistoryDatasetAssociation._state == model.HistoryDatasetAssociation.states.FAILED_METADATA),
                                 (model.HistoryDatasetAssociation.deleted == true()),
                                 (model.Dataset.state != model.Dataset.states.OK),
                                 (model.Dataset.deleted == true())))).subquery()
            ldda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputLibraryDatasetAssociation) \
                .join(model.LibraryDatasetDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_((model.Job.state == model.Job.states.NEW),
                        or_((model.LibraryDatasetDatasetAssociation._state != null()),
                            (model.LibraryDatasetDatasetAssociation.deleted == true()),
                            (model.Dataset.state != model.Dataset.states.OK),
                            (model.Dataset.deleted == true())))).subquery()
            if self.app.config.user_activation_on:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .outerjoin(model.User) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 or_((model.Job.user_id == null()), (model.User.active == true())),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            else:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            # Fetch all "resubmit" jobs
            resubmit_jobs = self.sa_session.query(model.Job).enable_eagerloads(False) \
                .filter(and_((model.Job.state == model.Job.states.RESUBMITTED),
                             (model.Job.handler == self.app.config.server_name))) \
                .order_by(model.Job.id).all()
        else:
            # Get job objects and append to watch queue for any which were
            # previously waiting
            for job_id in self.waiting_jobs:
                jobs_to_check.append(self.sa_session.query(model.Job).get(job_id))
            try:
                while 1:
                    message = self.queue.get_nowait()
                    if message is self.STOP_SIGNAL:
                        return
                    # Unpack the message
                    job_id, tool_id = message
                    # Get the job object and append to watch queue
                    jobs_to_check.append(self.sa_session.query(model.Job).get(job_id))
            except Empty:
                pass
        # Ensure that we get new job counts on each iteration
        self.__clear_job_count()
        # Check resubmit jobs first so that limits of new jobs will still be enforced
        for job in resubmit_jobs:
            log.debug('(%s) Job was resubmitted and is being dispatched immediately', job.id)
            # Reassemble resubmit job destination from persisted value
            jw = self.__recover_job_wrapper(job)
            if jw.is_ready_for_resubmission(job):
                self.increase_running_job_count(job.user_id, jw.job_destination.id)
                self.dispatcher.put(jw)
        # Iterate over new and waiting jobs and look for any that are
        # ready to run
        new_waiting_jobs = []
        for job in jobs_to_check:
            try:
                # Check the job's dependencies, requeue if they're not done.
                # Some of these states will only happen when using the in-memory job queue
                if job.copied_from_job_id:
                    copied_from_job = self.sa_session.query(model.Job).get(job.copied_from_job_id)
                    job.numeric_metrics = copied_from_job.numeric_metrics
                    job.text_metrics = copied_from_job.text_metrics
                    job.dependencies = copied_from_job.dependencies
                    job.state = copied_from_job.state
                    job.stderr = copied_from_job.stderr
                    job.stdout = copied_from_job.stdout
                    job.command_line = copied_from_job.command_line
                    job.traceback = copied_from_job.traceback
                    job.tool_version = copied_from_job.tool_version
                    job.exit_code = copied_from_job.exit_code
                    job.job_runner_name = copied_from_job.job_runner_name
                    job.job_runner_external_id = copied_from_job.job_runner_external_id
                    continue
                job_state = self.__check_job_state(job)
                if job_state == JOB_WAIT:
                    new_waiting_jobs.append(job.id)
                elif job_state == JOB_INPUT_ERROR:
                    log.info("(%d) Job unable to run: one or more inputs in error state" % job.id)
                elif job_state == JOB_INPUT_DELETED:
                    log.info("(%d) Job unable to run: one or more inputs deleted" % job.id)
                elif job_state == JOB_READY:
                    self.dispatcher.put(self.job_wrappers.pop(job.id))
                    log.info("(%d) Job dispatched" % job.id)
                elif job_state == JOB_DELETED:
                    log.info("(%d) Job deleted by user while still queued" % job.id)
                elif job_state == JOB_ADMIN_DELETED:
                    log.info("(%d) Job deleted by admin while still queued" % job.id)
                elif job_state in (JOB_USER_OVER_QUOTA,
                                   JOB_USER_OVER_TOTAL_WALLTIME):
                    if job_state == JOB_USER_OVER_QUOTA:
                        log.info("(%d) User (%s) is over quota: job paused" % (job.id, job.user_id))
                    else:
                        log.info("(%d) User (%s) is over total walltime limit: job paused" % (job.id, job.user_id))

                    job.set_state(model.Job.states.PAUSED)
                    for dataset_assoc in job.output_datasets + job.output_library_datasets:
                        dataset_assoc.dataset.dataset.state = model.Dataset.states.PAUSED
                        dataset_assoc.dataset.info = "Execution of this dataset's job is paused because you were over your disk quota at the time it was ready to run"
                        self.sa_session.add(dataset_assoc.dataset.dataset)
                    self.sa_session.add(job)
                elif job_state == JOB_ERROR:
                    log.error("(%d) Error checking job readiness" % job.id)
                else:
                    log.error("(%d) Job in unknown state '%s'" % (job.id, job_state))
                    new_waiting_jobs.append(job.id)
            except Exception:
                log.exception("failure running job %d", job.id)
        # Update the waiting list
        if not self.track_jobs_in_database:
            self.waiting_jobs = new_waiting_jobs
        # Remove cached wrappers for any jobs that are no longer being tracked
        for id in list(self.job_wrappers.keys()):
            if id not in new_waiting_jobs:
                del self.job_wrappers[id]
        # Flush, if we updated the state
        self.sa_session.flush()
        # Done with the session
        self.sa_session.remove()

    def __check_job_state(self, job):
        """
        Check if a job is ready to run by verifying that each of its input
        datasets is ready (specifically in the OK state). If any input dataset
        has an error, fail the job and return JOB_INPUT_ERROR. If any input
        dataset is deleted, fail the job and return JOB_INPUT_DELETED.  If all
        input datasets are in OK state, return JOB_READY indicating that the
        job can be dispatched. Otherwise, return JOB_WAIT indicating that input
        datasets are still being prepared.
        """
        if not self.track_jobs_in_database:
            in_memory_not_ready_state = self.__verify_in_memory_job_inputs(job)
            if in_memory_not_ready_state:
                return in_memory_not_ready_state

        # Else, if tracking in the database, job.state is guaranteed to be NEW and
        # the inputs are guaranteed to be OK.

        # Create the job wrapper so that the destination can be set
        job_id = job.id
        job_wrapper = self.job_wrappers.get(job_id, None)
        if not job_wrapper:
            job_wrapper = self.job_wrapper(job)
            self.job_wrappers[job_id] = job_wrapper

        # If state == JOB_READY, assume job_destination also set - otherwise
        # in case of various error or cancelled states do not assume
        # destination has been set.
        state, job_destination = self.__verify_job_ready(job, job_wrapper)

        if state == JOB_READY:
            # PASS.  increase usage by one job (if caching) so that multiple jobs aren't dispatched on this queue iteration
            self.increase_running_job_count(job.user_id, job_destination.id)
        return state

    def __verify_job_ready(self, job, job_wrapper):
        """ Compute job destination and verify job is ready at that
        destination by checking job limits and quota. If this method
        returns a job state of JOB_READY - it MUST also return a job
        destination.
        """
        job_destination = None
        try:
            assert job_wrapper.tool is not None, 'This tool was disabled before the job completed.  Please contact your Galaxy administrator.'
            # Cause the job_destination to be set and cached by the mapper
            job_destination = job_wrapper.job_destination
        except AssertionError as e:
            log.warning("(%s) Tool '%s' removed from tool config, unable to run job" % (job.id, job.tool_id))
            job_wrapper.fail(e)
            return JOB_ERROR, job_destination
        except JobNotReadyException as e:
            job_state = e.job_state or JOB_WAIT
            return job_state, None
        except Exception as e:
            failure_message = getattr(e, 'failure_message', DEFAULT_JOB_PUT_FAILURE_MESSAGE)
            if failure_message == DEFAULT_JOB_PUT_FAILURE_MESSAGE:
                log.exception('Failed to generate job destination')
            else:
                log.debug("Intentionally failing job with message (%s)" % failure_message)
            job_wrapper.fail(failure_message)
            return JOB_ERROR, job_destination
        # job is ready to run, check limits
        # TODO: these checks should be refactored to minimize duplication and made more modular/pluggable
        state = self.__check_destination_jobs(job, job_wrapper)

        if state == JOB_READY:
            state = self.__check_user_jobs(job, job_wrapper)
        if state == JOB_READY and self.app.config.enable_quotas:
            quota = self.app.quota_agent.get_quota(job.user)
            if quota is not None:
                try:
                    usage = self.app.quota_agent.get_usage(user=job.user, history=job.history)
                    if usage > quota:
                        return JOB_USER_OVER_QUOTA, job_destination
                except AssertionError:
                    pass  # No history, should not happen with an anon user
        # Check total walltime limits
        if (state == JOB_READY and
                "delta" in self.app.job_config.limits.total_walltime):
            jobs_to_check = self.sa_session.query(model.Job).filter(
                model.Job.user_id == job.user.id,
                model.Job.update_time >= datetime.datetime.now() -
                datetime.timedelta(
                    self.app.job_config.limits.total_walltime["window"]
                ),
                model.Job.state == 'ok'
            ).all()
            time_spent = datetime.timedelta(0)
            for job in jobs_to_check:
                # History is job.state_history
                started = None
                finished = None
                for history in sorted(
                        job.state_history,
                        key=lambda history: history.update_time):
                    if history.state == "running":
                        started = history.create_time
                    elif history.state == "ok":
                        finished = history.create_time

                time_spent += finished - started

            if time_spent > self.app.job_config.limits.total_walltime["delta"]:
                return JOB_USER_OVER_TOTAL_WALLTIME, job_destination

        return state, job_destination

    def __verify_in_memory_job_inputs(self, job):
        """ Perform the same checks that happen via SQL for in-memory managed
        jobs.
        """
        if job.state == model.Job.states.DELETED:
            return JOB_DELETED
        elif job.state == model.Job.states.ERROR:
            return JOB_ADMIN_DELETED
        for dataset_assoc in job.input_datasets + job.input_library_datasets:
            idata = dataset_assoc.dataset
            if not idata:
                continue
            # don't run jobs for which the input dataset was deleted
            if idata.deleted:
                self.job_wrappers.pop(job.id, self.job_wrapper(job)).fail("input data %s (file: %s) was deleted before the job started" % (idata.hid, idata.file_name))
                return JOB_INPUT_DELETED
            # an error in the input data causes us to bail immediately
            elif idata.state == idata.states.ERROR:
                self.job_wrappers.pop(job.id, self.job_wrapper(job)).fail("input data %s is in error state" % (idata.hid))
                return JOB_INPUT_ERROR
            elif idata.state == idata.states.FAILED_METADATA:
                self.job_wrappers.pop(job.id, self.job_wrapper(job)).fail("input data %s failed to properly set metadata" % (idata.hid))
                return JOB_INPUT_ERROR
            elif idata.state != idata.states.OK and not (idata.state == idata.states.SETTING_METADATA and job.tool_id is not None and job.tool_id == self.app.datatypes_registry.set_external_metadata_tool.id):
                # need to requeue
                return JOB_WAIT

        # All inputs ready to go.
        return None

    def __clear_job_count(self):
        self.user_job_count = None
        self.user_job_count_per_destination = None
        self.total_job_count_per_destination = None

    def get_user_job_count(self, user_id):
        self.__cache_user_job_count()
        # This could have been incremented by a previous job dispatched on this iteration, even if we're not caching
        rval = self.user_job_count.get(user_id, 0)
        if not self.app.config.cache_user_job_count:
            result = self.sa_session.execute(select([func.count(model.Job.table.c.id)])
                                             .where(and_(model.Job.table.c.state.in_((model.Job.states.QUEUED,
                                                         model.Job.states.RUNNING,
                                                         model.Job.states.RESUBMITTED)),
                                                         (model.Job.table.c.user_id == user_id))))
            for row in result:
                # there should only be one row
                rval += row[0]
        return rval

    def __cache_user_job_count(self):
        # Cache the job count if necessary
        if self.user_job_count is None and self.app.config.cache_user_job_count:
            self.user_job_count = {}
            query = self.sa_session.execute(select([model.Job.table.c.user_id, func.count(model.Job.table.c.user_id)])
                                            .where(and_(model.Job.table.c.state.in_((model.Job.states.QUEUED,
                                                                                     model.Job.states.RUNNING,
                                                                                     model.Job.states.RESUBMITTED)),
                                                        (model.Job.table.c.user_id != null())))
                                            .group_by(model.Job.table.c.user_id))
            for row in query:
                self.user_job_count[row[0]] = row[1]
        elif self.user_job_count is None:
            self.user_job_count = {}

    def get_user_job_count_per_destination(self, user_id):
        self.__cache_user_job_count_per_destination()
        cached = self.user_job_count_per_destination.get(user_id, {})
        if self.app.config.cache_user_job_count:
            rval = cached
        else:
            # The cached count is still used even when we're not caching, it is
            # incremented when a job is run by this handler to ensure that
            # multiple jobs can't get past the limits in one iteration of the
            # queue.
            rval = {}
            rval.update(cached)
            result = self.sa_session.execute(select([model.Job.table.c.destination_id, func.count(model.Job.table.c.destination_id).label('job_count')])
                                             .where(and_(model.Job.table.c.state.in_((model.Job.states.QUEUED, model.Job.states.RUNNING)), (model.Job.table.c.user_id == user_id)))
                                             .group_by(model.Job.table.c.destination_id))
            for row in result:
                # Add the count from the database to the cached count
                rval[row['destination_id']] = rval.get(row['destination_id'], 0) + row['job_count']
        return rval

    def __cache_user_job_count_per_destination(self):
        # Cache the job count if necessary
        if self.user_job_count_per_destination is None and self.app.config.cache_user_job_count:
            self.user_job_count_per_destination = {}
            result = self.sa_session.execute(select([model.Job.table.c.user_id, model.Job.table.c.destination_id, func.count(model.Job.table.c.user_id).label('job_count')])
                                             .where(and_(model.Job.table.c.state.in_((model.Job.states.QUEUED, model.Job.states.RUNNING))))
                                             .group_by(model.Job.table.c.user_id, model.Job.table.c.destination_id))
            for row in result:
                if row['user_id'] not in self.user_job_count_per_destination:
                    self.user_job_count_per_destination[row['user_id']] = {}
                self.user_job_count_per_destination[row['user_id']][row['destination_id']] = row['job_count']
        elif self.user_job_count_per_destination is None:
            self.user_job_count_per_destination = {}

    def increase_running_job_count(self, user_id, destination_id):
        if self.app.job_config.limits.registered_user_concurrent_jobs or \
           self.app.job_config.limits.anonymous_user_concurrent_jobs or \
           self.app.job_config.limits.destination_user_concurrent_jobs:
            if self.user_job_count is None:
                self.user_job_count = {}
            if self.user_job_count_per_destination is None:
                self.user_job_count_per_destination = {}
            self.user_job_count[user_id] = self.user_job_count.get(user_id, 0) + 1
            if user_id not in self.user_job_count_per_destination:
                self.user_job_count_per_destination[user_id] = {}
            self.user_job_count_per_destination[user_id][destination_id] = self.user_job_count_per_destination[user_id].get(destination_id, 0) + 1
        if self.app.job_config.limits.destination_total_concurrent_jobs:
            if self.total_job_count_per_destination is None:
                self.total_job_count_per_destination = {}
            self.total_job_count_per_destination[destination_id] = self.total_job_count_per_destination.get(destination_id, 0) + 1

    def __check_user_jobs(self, job, job_wrapper):
        # TODO: Update output datasets' _state = LIMITED or some such new
        # state, so the UI can reflect what jobs are waiting due to concurrency
        # limits
        if job.user:
            # Check the hard limit first
            if self.app.job_config.limits.registered_user_concurrent_jobs:
                count = self.get_user_job_count(job.user_id)
                # Check the user's number of dispatched jobs against the overall limit
                if count >= self.app.job_config.limits.registered_user_concurrent_jobs:
                    return JOB_WAIT
            # If we pass the hard limit, also check the per-destination count
            id = job_wrapper.job_destination.id
            count_per_id = self.get_user_job_count_per_destination(job.user_id)
            if id in self.app.job_config.limits.destination_user_concurrent_jobs:
                count = count_per_id.get(id, 0)
                # Check the user's number of dispatched jobs in the assigned destination id against the limit for that id
                if count >= self.app.job_config.limits.destination_user_concurrent_jobs[id]:
                    return JOB_WAIT
            # If we pass the destination limit (if there is one), also check limits on any tags (if any)
            if job_wrapper.job_destination.tags:
                for tag in job_wrapper.job_destination.tags:
                    # Check each tag for this job's destination
                    if tag in self.app.job_config.limits.destination_user_concurrent_jobs:
                        # Only if there's a limit defined for this tag
                        count = 0
                        for id in [d.id for d in self.app.job_config.get_destinations(tag)]:
                            # Add up the aggregate job total for this tag
                            count += count_per_id.get(id, 0)
                        if count >= self.app.job_config.limits.destination_user_concurrent_jobs[tag]:
                            return JOB_WAIT
        elif job.galaxy_session:
            # Anonymous users only get the hard limit
            if self.app.job_config.limits.anonymous_user_concurrent_jobs:
                count = self.sa_session.query(model.Job).enable_eagerloads(False) \
                            .filter(and_(model.Job.session_id == job.galaxy_session.id,
                                         or_(model.Job.state == model.Job.states.RUNNING,
                                             model.Job.state == model.Job.states.QUEUED))).count()
                if count >= self.app.job_config.limits.anonymous_user_concurrent_jobs:
                    return JOB_WAIT
        else:
            log.warning('Job %s is not associated with a user or session so job concurrency limit cannot be checked.' % job.id)
        return JOB_READY

    def __cache_total_job_count_per_destination(self):
        # Cache the job count if necessary
        if self.total_job_count_per_destination is None:
            self.total_job_count_per_destination = {}
            result = self.sa_session.execute(select([model.Job.table.c.destination_id, func.count(model.Job.table.c.destination_id).label('job_count')])
                                             .where(and_(model.Job.table.c.state.in_((model.Job.states.QUEUED, model.Job.states.RUNNING))))
                                             .group_by(model.Job.table.c.destination_id))
            for row in result:
                self.total_job_count_per_destination[row['destination_id']] = row['job_count']

    def get_total_job_count_per_destination(self):
        self.__cache_total_job_count_per_destination()
        # Always use caching (at worst a job will have to wait one iteration,
        # and this would be more fair anyway as it ensures FIFO scheduling,
        # insofar as FIFO would be fair...)
        return self.total_job_count_per_destination

    def __check_destination_jobs(self, job, job_wrapper):
        if self.app.job_config.limits.destination_total_concurrent_jobs:
            id = job_wrapper.job_destination.id
            count_per_id = self.get_total_job_count_per_destination()
            if id in self.app.job_config.limits.destination_total_concurrent_jobs:
                count = count_per_id.get(id, 0)
                # Check the number of dispatched jobs in the assigned destination id against the limit for that id
                if count >= self.app.job_config.limits.destination_total_concurrent_jobs[id]:
                    return JOB_WAIT
            # If we pass the destination limit (if there is one), also check limits on any tags (if any)
            if job_wrapper.job_destination.tags:
                for tag in job_wrapper.job_destination.tags:
                    # Check each tag for this job's destination
                    if tag in self.app.job_config.limits.destination_total_concurrent_jobs:
                        # Only if there's a limit defined for this tag
                        count = 0
                        for id in [d.id for d in self.app.job_config.get_destinations(tag)]:
                            # Add up the aggregate job total for this tag
                            count += count_per_id.get(id, 0)
                        if count >= self.app.job_config.limits.destination_total_concurrent_jobs[tag]:
                            return JOB_WAIT
        return JOB_READY

    def _handle_setup_msg(self, job_id=None):
        job = self.sa_session.query(model.Job).get(job_id)
        if job.handler is None:
            job.handler = self.app.config.server_name
            self.sa_session.add(job)
            self.sa_session.flush()
            # If not tracking jobs in the database
            self.put(job.id, job.tool_id)
        else:
            log.warning("(%s) Handler '%s' received setup message but handler '%s' is already assigned, ignoring", job.id, self.app.config.server_name, job.handler)

    def put(self, job_id, tool_id):
        """Add a job to the queue (by job identifier)"""
        if not self.track_jobs_in_database:
            self.queue.put((job_id, tool_id))
            self.sleeper.wake()
        else:
            # Workflow invocations farmed out to workers will submit jobs through here. If a handler is unassigned, we
            # will submit for one, or else claim it ourself. TODO: This should be moved to a higher level as it's now
            # implemented here and in MessageJobQueue
            job = self.sa_session.query(model.Job).get(job_id)
            if job.handler is None and self.app.application_stack.has_pool(self.app.application_stack.pools.JOB_HANDLERS):
                msg = JobHandlerMessage(task='setup', job_id=job_id)
                self.app.application_stack.send_message(self.app.application_stack.pools.JOB_HANDLERS, msg)

    def shutdown(self):
        """Attempts to gracefully shut down the worker thread"""
        if self.parent_pid != os.getpid():
            # We're not the real job queue, do nothing
            return
        else:
            log.info("sending stop signal to worker thread")
            self.stop_monitoring()
            if not self.app.config.track_jobs_in_database:
                self.queue.put(self.STOP_SIGNAL)
            # A message could still be received while shutting down, should be ok since they will be picked up on next startup.
            self.app.application_stack.deregister_message_handler(name=JobHandlerMessage.target)
            self.sleeper.wake()
            self.shutdown_monitor()
            log.info("job handler queue stopped")
            self.dispatcher.shutdown()
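The STOP_SIGNAL = object() sentinel used by JobHandlerQueue above is a common way to shut down a queue consumer: because the sentinel is compared by identity, it can never collide with a real message. A minimal standalone sketch of the same pattern (independent of Galaxy; Python 3 queue module assumed):

import threading
from queue import Queue

STOP_SIGNAL = object()

def consumer(q):
    while True:
        message = q.get()
        if message is STOP_SIGNAL:   # identity check, never collides with data
            break
        job_id, tool_id = message    # unpack, as __monitor_step does
        print("would dispatch job", job_id, "for tool", tool_id)

q = Queue()
worker = threading.Thread(target=consumer, args=(q,))
worker.start()
q.put((1, "cat1"))                   # a (job_id, tool_id) pair, as in put()
q.put(STOP_SIGNAL)                   # ask the consumer to exit
worker.join()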
Code example #25
0
File: process.py  Project: Ahuge/readable_process
class ReadableProcess(object):
    def __init__(self, command):
        super(ReadableProcess, self).__init__()
        self._command = command
        self._out_pipe = StringIO()
        self._err_pipe = StringIO()
        self._stdout_t = None
        self._stderr_t = None
        self._process = None
        self._out_queue = None
        self._err_queue = None
        self._out_t = None
        self._err_t = None

    @property
    def returncode(self):
        if self._process:
            return self._process.returncode
        return 1

    def poll(self):
        if self._process:
            return self._process.poll()
        return 1

    def enqueue_output(self, out, queue):
        for line in iter(out.readline, b''):
            queue.put(line)
        out.close()

    def kill(self):
        if self._process:
            os.killpg(os.getpgid(self.pid()), signal.SIGTERM)
            self._process.terminate()

    def pid(self):
        if self._process:
            return self._process.pid

    def run(self):
        if self._process:
            raise ValueError("Already Ran")

        self._out_queue = Queue()
        self._err_queue = Queue()

        try:
            self._process = subprocess.Popen(
                self._command,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                preexec_fn=os.setsid,
            )
            self._stdout_t = Thread(target=self.enqueue_output,
                                    args=(self._process.stdout,
                                          self._out_queue))
            self._stdout_t.daemon = True
            self._stderr_t = Thread(target=self.enqueue_output,
                                    args=(self._process.stderr,
                                          self._err_queue))
            self._stderr_t.daemon = True

            self._stdout_t.start()
            self._stderr_t.start()
        except Exception:
            import traceback
            fmt = traceback.format_exc()
            for line in fmt.split("\n"):
                self._err_queue.put(line)

    def read_stdout_line(self):
        try:
            return self._out_queue.get_nowait()
        except Empty:
            return None

    def read_stdout_all(self):
        buf = []
        line = self.read_stdout_line()
        while line and line is not None:
            buf.append(text_type(line))
            line = self.read_stdout_line()
            time.sleep(0.1)
        return "\n".join(buf)

    def read_stderr_line(self):
        try:
            return self._err_queue.get_nowait()
        except Empty:
            return None

    def read_stderr_all(self):
        buf = []
        line = self.read_stderr_line()
        while line and line is not None:
            buf.append(text_type(line))
            line = self.read_stderr_line()
            time.sleep(0.1)
        return "\n".join(buf)

    def read_all(self):
        return self.read_stdout_all(), self.read_stderr_all()
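A hedged usage sketch for the ReadableProcess class above; the command and polling loop are illustrative, and it assumes the module-level imports the original file relies on (subprocess, Thread, Queue/Empty, StringIO, text_type) are in place:

import time

proc = ReadableProcess(["ping", "-c", "3", "localhost"])  # any long-running command
proc.run()

# Poll the non-blocking reader while the child process is still alive.
while proc.poll() is None:
    line = proc.read_stdout_line()  # returns None when nothing is buffered yet
    if line:
        print(line.rstrip())
    time.sleep(0.1)

# Drain whatever is still queued once the process has exited.
out, err = proc.read_all()
print("exit code:", proc.returncode)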
Code example #26
0
File: crashstorage.py Project: stephendonner/socorro
class RabbitMQCrashStorage(CrashStorageBase):
    """This class is an implementation of a Socorro Crash Storage system.
    It is used as a crash queing methanism for raw crashes.  It implements
    the save_raw_crash method as a queue submission function, and the
    new_crashes generator as a queue consumption function.  Please note: as
    it only queues the crash_id and not the whole raw crash, it is not suitable
    to actually save a crash.  It is a very lossy container.  This class
    should be used in conjuction with a more persistant storage mechanism.

    The implementations CrashStorage classes can use arbitrarly high or low
    level semantics to talk to their underlying resource.  In the RabbitMQ,
    implementation, queing through the 'save_raw_crash' method is given full
    transactional semantics using the TransactorExecutor classes.  The
    'new_crashes' generator has a lower level relationship with the
    underlying connection object"""

    required_config = Namespace()
    required_config.add_option(
        'rabbitmq_class',
        default=ConnectionContextPooled,  # we choose a pooled connection
                                          # because we need thread safe
                                          # connection behaviors
        doc='the class responsible for connecting to RabbitMQ',
        reference_value_from='resource.rabbitmq',
    )
    required_config.add_option(
        'transaction_executor_class',
        default="socorro.database.transaction_executor."
                "TransactionExecutorWithInfiniteBackoff",
        doc='a class that will manage transactions',
        from_string_converter=class_converter,
        reference_value_from='resource.rabbitmq',
    )
    required_config.add_option(
        'routing_key',
        default='socorro.normal',
        doc='the name of the queue to receive crashes',
        reference_value_from='resource.rabbitmq',
    )
    required_config.add_option(
        'filter_on_legacy_processing',
        default=True,
        doc='toggle for using or ignoring the throttling flag',
        reference_value_from='resource.rabbitmq',
    )
    required_config.add_option(
        'throttle',
        default=100,
        doc='percentage of the time that rabbit will try to queue',
        reference_value_from='resource.rabbitmq',
    )

    def __init__(self, config, namespace='', quit_check_callback=None):
        super(RabbitMQCrashStorage, self).__init__(
            config,
            namespace=namespace,
            quit_check_callback=quit_check_callback
        )

        self.config = config

        # Note: this may continue to grow if we aren't acking certain UUIDs.
        # We should find a way to time out UUIDs after a certain time.
        self.acknowledgement_token_cache = {}
        self.acknowledgment_queue = Queue()

        self.rabbitmq = config.rabbitmq_class(config)
        self.transaction = config.transaction_executor_class(
            config,
            self.rabbitmq,
            quit_check_callback=quit_check_callback
        )

        # cache this object so we don't have to remake it for every transaction
        self._basic_properties = pika.BasicProperties(
            delivery_mode=2,  # make message persistent
        )

        if config.throttle == 100:
            self.dont_queue_this_crash = lambda: False
        else:
            self.dont_queue_this_crash = (
                lambda: randint(1, 100) > config.throttle
            )

    def save_raw_crash(self, raw_crash, dumps, crash_id):
        if self.dont_queue_this_crash():
            self.config.logger.info(
                'Crash %s filtered out of RabbitMQ queue %s',
                crash_id,
                self.config.routing_key
            )
            return
        try:
            this_crash_should_be_queued = (
                not self.config.filter_on_legacy_processing or
                raw_crash.legacy_processing == 0
            )
        except KeyError:
            self.config.logger.debug(
                'RabbitMQCrashStorage legacy_processing key absent in crash '
                '%s', crash_id
            )
            return

        if this_crash_should_be_queued:
            self.config.logger.debug(
                'RabbitMQCrashStorage saving crash %s', crash_id
            )
            self.transaction(self._save_raw_crash_transaction, crash_id)
            return True
        else:
            self.config.logger.debug(
                'RabbitMQCrashStorage not saving crash %s, legacy processing '
                'flag is %s', crash_id, raw_crash.legacy_processing
            )

    def _save_raw_crash_transaction(self, connection, crash_id):
        connection.channel.basic_publish(
            exchange='',
            routing_key=self.config.routing_key,
            body=crash_id,
            properties=self._basic_properties
        )

    def _basic_get_transaction(self, conn, queue):
        """reorganize the the call to rabbitmq basic_get so that it can be
        used by the transaction retry wrapper."""
        things = conn.channel.basic_get(queue=queue)
        return things

    def new_crashes(self):
        """This generator fetches crash_ids from RabbitMQ."""

        # We've set up RabbitMQ to require acknowledgement of processing of a
        # crash_id from this generator.  It is the responsibility of the
        # consumer of the crash_id to tell this instance of the class when it has
        # completed its work on the crash_id.  That is done with the call to
        # 'ack_crash' below.  Because RabbitMQ connections are not thread safe,
        # only the thread that read the crash may acknowledge it.  'ack_crash'
        # queues the crash_id. The '_consume_acknowledgement_queue' function
        # is run to send acknowledgments back to RabbitMQ
        self._consume_acknowledgement_queue()
        queues = [
            self.rabbitmq.config.priority_queue_name,
            self.rabbitmq.config.standard_queue_name,
            self.rabbitmq.config.reprocessing_queue_name,
            self.rabbitmq.config.priority_queue_name,
        ]
        while True:
            for queue in queues:
                method_frame, header_frame, body = self.transaction(
                    self._basic_get_transaction,
                    queue=queue
                )
                if method_frame and self._suppress_duplicate_jobs(
                    body,
                    method_frame
                ):
                    continue
                if method_frame:
                    break
            # must consume ack queue before testing for end of iterator
            # or the last job won't get ack'd
            self._consume_acknowledgement_queue()
            if not method_frame:
                # there was nothing in the queue - leave the iterator
                return
            self.acknowledgement_token_cache[body] = method_frame
            yield body
            queues.reverse()

    def ack_crash(self, crash_id):
        self.acknowledgment_queue.put(crash_id)

    def _suppress_duplicate_jobs(self, crash_id, acknowledgement_token):
        """if this crash is in the cache, then it is already in progress
        and this is a duplicate.  Acknowledge it, then return True
        to let the caller know to skip on to the next crash."""
        if crash_id in self.acknowledgement_token_cache:
            # reject this crash - it's already being processed
            self.config.logger.info(
                'duplicate job: %s is already in progress',
                crash_id
            )
            # ack this
            self.transaction(
                self._transaction_ack_crash,
                crash_id,
                acknowledgement_token
            )
            return True
        return False

    def _consume_acknowledgement_queue(self):
        """The acknowledgement of the processing of each crash_id yielded
        from the 'new_crashes' method must take place on the same connection
        that the crash_id came from.  The crash_ids are queued in the
        'acknowledgment_queue'.  That queue is consumed by the QueuingThread"""
        try:
            while True:
                crash_id_to_be_acknowledged = \
                    self.acknowledgment_queue.get_nowait()
                # self.config.logger.debug(
                #     'RabbitMQCrashStorage set to acknowledge %s',
                #     crash_id_to_be_acknowledged
                # )
                try:
                    acknowledgement_token = \
                        self.acknowledgement_token_cache[
                            crash_id_to_be_acknowledged
                        ]
                    self.transaction(
                        self._transaction_ack_crash,
                        crash_id_to_be_acknowledged,
                        acknowledgement_token
                    )
                    del self.acknowledgement_token_cache[
                        crash_id_to_be_acknowledged
                    ]
                except KeyError:
                    self.config.logger.warning(
                        'RabbitMQCrashStorage tried to acknowledge crash %s'
                        ', which was not in the cache',
                        crash_id_to_be_acknowledged,
                        exc_info=True
                    )
                except Exception:
                    self.config.logger.error(
                        'RabbitMQCrashStorage unexpected failure on %s',
                        crash_id_to_be_acknowledged,
                        exc_info=True
                    )

        except Empty:
            pass  # nothing to do with an empty queue

    def _transaction_ack_crash(
        self,
        connection,
        crash_id,
        acknowledgement_token
    ):
        connection.channel.basic_ack(
            delivery_tag=acknowledgement_token.delivery_tag
        )
        self.config.logger.debug(
            'RabbitMQCrashStorage acking %s with delivery_tag %s',
            crash_id,
            acknowledgement_token.delivery_tag
        )
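Because the RabbitMQ connection is not thread safe, acknowledgements are funnelled through an in-process Queue and only drained on the thread that owns the connection. A minimal sketch of that hand-off, with illustrative names rather than Socorro's actual API:

from queue import Queue, Empty

ack_queue = Queue()
in_flight = {}  # crash_id -> delivery token; only touched by the consumer thread

def ack_crash(crash_id):
    """Called from any worker thread once it has finished with crash_id."""
    ack_queue.put(crash_id)

def consume_acks(send_ack):
    """Called on the consumer thread; drains the queue and acks on its own connection."""
    try:
        while True:
            crash_id = ack_queue.get_nowait()
            token = in_flight.pop(crash_id, None)
            if token is not None:
                send_ack(token)  # e.g. channel.basic_ack(delivery_tag=token)
    except Empty:
        pass  # nothing left to acknowledge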
Code example #27
0
class TaskQueue(ChainedIdentity):
    """
    A class for managing async tasks.
    """

    def __init__(self, worker_pool=None, error_handler=None, flush_timeout_seconds=None, **kwargs):
        """
        :param worker_pool: Thread pool for executing tasks
        :type worker_pool: concurrent.futures.ThreadPoolExecutor
        :param error_handler: Extension point for processing error queue items
        :type error_handler: function(error, logging.Logger)
        :param flush_timeout_seconds: Task flush timeout in seconds
        :type flush_timeout_seconds: int
        """
        super(TaskQueue, self).__init__(**kwargs)
        self._tasks = Queue()
        self._results = []
        # For now we don't need a queue for errors, but it's likely
        # we'll want the error handler looping on the queue thread later.
        self._errors = []
        self._err_handler = error_handler
        self._worker_pool = worker_pool if worker_pool is not None else WorkerPool(_parent_logger=self._logger)
        self._task_number = 0
        self._flush_timeout_seconds = DEFAULT_FLUSH_TIMEOUT_SECONDS
        if flush_timeout_seconds:
            self._flush_timeout_seconds = flush_timeout_seconds
            self._logger.debug('Overriding default timeout to {}'.format(flush_timeout_seconds))

    def __enter__(self):
        self._logger.debug("[Start]")
        return self

    def __exit__(self, *args):
        self._logger.debug("[Stop] - waiting default timeout")
        self.flush(self.identity)

    # TODO: Adding functions with this method needs to be more configurable
    def add(self, func, *args, **kwargs):
        """
        :param func: Function to be executed asynchronously
        :type func: builtin.function
        """
        future = self.create_future(func, *args, **kwargs)
        ident = "{}_{}".format(self._tasks.qsize(), func.__name__)
        task = AsyncTask(future, _ident=ident, _parent_logger=self._logger)
        self.add_task(task)
        return task

    def add_task(self, async_task):
        """
        :param async_task: asynchronous task to be added to the queue and possibly processed
        :type async_task: azureml._async.AsyncTask
        """
        # Blocking add to the queue, with no timeout
        if not isinstance(async_task, AsyncTask):
            raise ValueError("Can only add AsyncTask, got {0}".format(type(async_task)))

        self._logger.debug("Adding task {0} to queue of approximate size: {1}".format(async_task.ident,
                                                                                      self._tasks.qsize()))
        self._tasks.put(async_task)

    def create_future(self, func, *args, **kwargs):
        return self._worker_pool.submit(func, *args, **kwargs)

    def flush(self, source, timeout_seconds=None):
        with self._log_context("WaitFlushSource:{}".format(source)) as log_context:

            if timeout_seconds is None:
                log_context.debug("No flush timeout given, using task queue default of {}".
                                  format(self._flush_timeout_seconds))
                timeout_seconds = self._flush_timeout_seconds
            else:
                log_context.debug("Using flush timeout {} instead of task queue default {}".
                                  format(timeout_seconds, self._flush_timeout_seconds))

            start_time = time.time()

            #  Take tasks off of the queue
            tasks_to_wait = []
            while True:
                try:
                    tasks_to_wait.append(self._tasks.get_nowait())
                except Empty:
                    break

            message = ""
            timeout_time = start_time + timeout_seconds

            log_context.debug("Waiting {} seconds on tasks: {}.".format(timeout_seconds, tasks_to_wait))

            not_done = True

            while not_done and time.time() <= timeout_time:
                completed_tasks = [task for task in tasks_to_wait if task.done()]
                tasks_to_wait = [task for task in tasks_to_wait if not task.done()]
                not_done = len(tasks_to_wait) != 0

                self._results.extend((task.wait(awaiter_name=self.identity) for task in completed_tasks))

                if not_done:
                    for task in tasks_to_wait:
                        message += "Waiting on task: {}.\n".format(task.ident)
                    message += "{} tasks left. Current duration of flush {} seconds.\n".format(
                        len(tasks_to_wait), time.time() - start_time)

                    time.sleep(.25)

            self._logger.debug(message)

            # Reach this case on timeout
            if not_done:
                azureml_error = AzureMLError.create(
                    FlushTaskTimeout, timeout_seconds=timeout_seconds
                )
                raise AzureMLException._with_error(azureml_error)

    @property
    def results(self):
        for result in self._results:
            yield result

    def errors(self):
        for error in self._errors:
            yield error
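An illustrative sketch of how the TaskQueue above can be used as a context manager; the slow_add function is made up for this example, the _ident keyword is assumed from the ChainedIdentity pattern used by these classes, and flush() is invoked by __exit__:

import time

def slow_add(a, b):
    time.sleep(0.5)
    return a + b

with TaskQueue(_ident="example") as tq:  # _ident assumed from the ChainedIdentity pattern
    tq.add(slow_add, 1, 2)
    tq.add(slow_add, 3, 4)
# __exit__ called flush(), so completed results have been collected
for result in tq.results:
    print(result)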
Code example #28
0
class JobHandlerStopQueue(Monitors):
    """
    A queue for jobs which need to be terminated prematurely.
    """
    STOP_SIGNAL = object()

    def __init__(self, app, dispatcher):
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()

        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting = []

        name = "JobHandlerStopQueue.monitor_thread"
        self._init_monitor_thread(name, config=app.config)
        log.info("job handler stop queue started")

    def start(self):
        # Start the queue
        self.monitor_thread.start()
        log.info("job handler stop queue started")

    def monitor(self):
        """
        Continually iterate the waiting jobs, stop any that are found.
        """
        # HACK: Delay until after forking, we need a way to do post fork notification!!!
        time.sleep(10)
        while self.monitor_running:
            try:
                self.monitor_step()
            except Exception:
                log.exception("Exception in monitor_step")
            # Sleep
            self._monitor_sleep(1)

    def monitor_step(self):
        """
        Called repeatedly by `monitor` to stop jobs.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        if self.app.config.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all newly deleted jobs
            newly_deleted_jobs = self.sa_session.query(model.Job).enable_eagerloads(False) \
                                     .filter((model.Job.state == model.Job.states.DELETED_NEW) &
                                             (model.Job.handler == self.app.config.server_name)).all()
            for job in newly_deleted_jobs:
                jobs_to_check.append((job, job.stderr))
        # Also pull from the queue (in the case of Administrative stopped jobs)
        try:
            while 1:
                message = self.queue.get_nowait()
                if message is self.STOP_SIGNAL:
                    return
                # Unpack the message
                job_id, error_msg = message
                # Get the job object and append to watch queue
                jobs_to_check.append(
                    (self.sa_session.query(model.Job).get(job_id), error_msg))
        except Empty:
            pass
        for job, error_msg in jobs_to_check:
            if (job.state not in (job.states.DELETED_NEW, job.states.DELETED)
                    and job.finished):
                # terminated before it got here
                log.debug('Job %s already finished, not deleting or stopping',
                          job.id)
                continue
            final_state = job.states.DELETED
            if error_msg is not None:
                final_state = job.states.ERROR
                job.info = error_msg
            job.set_final_state(final_state)
            self.sa_session.add(job)
            self.sa_session.flush()
            if job.job_runner_name is not None:
                # tell the dispatcher to stop the job
                self.dispatcher.stop(job)

    def put(self, job_id, error_msg=None):
        if not self.app.config.track_jobs_in_database:
            self.queue.put((job_id, error_msg))

    def shutdown(self):
        """Attempts to gracefully shut down the worker thread"""
        if self.parent_pid != os.getpid():
            # We're not the real job queue, do nothing
            return
        else:
            log.info("sending stop signal to worker thread")
            self.stop_monitoring()
            if not self.app.config.track_jobs_in_database:
                self.queue.put(self.STOP_SIGNAL)
            self.shutdown_monitor()
            log.info("job handler stop queue stopped")
Code example #29
0
class JobHandlerQueue(Monitors):
    """
    Job handler's internal queue; this is what actually implements waiting for
    jobs to become runnable and dispatching them to a JobRunner.
    """
    STOP_SIGNAL = object()

    def __init__(self, app, dispatcher):
        """Initializes the Job Handler Queue, creates (unstarted) monitoring thread"""
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context
        self.track_jobs_in_database = self.app.config.track_jobs_in_database

        # Initialize structures for handling job limits
        self.__clear_job_count()

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting_jobs = []
        # Contains wrappers of jobs that are limited or ready (so they aren't created unnecessarily/multiple times)
        self.job_wrappers = {}
        name = "JobHandlerQueue.monitor_thread"
        self._init_monitor_thread(name,
                                  target=self.__monitor,
                                  config=app.config)

    def start(self):
        """
        Starts the JobHandler's thread after checking for any unhandled jobs.
        """
        log.debug('Handler queue starting for jobs assigned to handler: %s',
                  self.app.config.server_name)
        # Recover jobs at startup
        self.__check_jobs_at_startup()
        # Start the queue
        self.monitor_thread.start()
        # The stack code is initialized in the application
        JobHandlerMessage().bind_default_handler(self, '_handle_message')
        self.app.application_stack.register_message_handler(
            self._handle_message, name=JobHandlerMessage.target)
        log.info("job handler queue started")

    def job_wrapper(self, job, use_persisted_destination=False):
        return JobWrapper(job,
                          self,
                          use_persisted_destination=use_persisted_destination)

    def job_pair_for_id(self, id):
        job = self.sa_session.query(model.Job).get(id)
        return job, self.job_wrapper(job, use_persisted_destination=True)

    def __write_registry_file_if_absent(self, job):
        # TODO: remove this and the one place it is called in late 2018, this
        # hack attempts to minimize the job failures due to upgrades from 17.05
        # Galaxies.
        job_wrapper = self.job_wrapper(job)
        cwd = job_wrapper.working_directory
        datatypes_config = os.path.join(cwd, "registry.xml")
        if not os.path.exists(datatypes_config):
            try:
                self.app.datatypes_registry.to_xml_file(path=datatypes_config)
            except OSError:
                pass

    def __check_jobs_at_startup(self):
        """
        Checks all jobs that are in the 'new', 'queued' or 'running' state in
        the database and requeues or cleans up as necessary.  Only run when the
        job handler starts.
        If user activation is enforced, jobs belonging to inactive users are filtered out.
        """
        jobs_at_startup = []
        if self.track_jobs_in_database:
            in_list = (model.Job.states.QUEUED, model.Job.states.RUNNING)
        else:
            in_list = (model.Job.states.NEW, model.Job.states.QUEUED,
                       model.Job.states.RUNNING)
        if self.app.config.user_activation_on:
            jobs_at_startup = self.sa_session.query(model.Job).enable_eagerloads(False) \
                .outerjoin(model.User) \
                .filter(model.Job.state.in_(in_list) &
                        (model.Job.handler == self.app.config.server_name) &
                        or_((model.Job.user_id == null()), (model.User.active == true()))).all()
        else:
            jobs_at_startup = self.sa_session.query(model.Job).enable_eagerloads(False) \
                .filter(model.Job.state.in_(in_list) &
                        (model.Job.handler == self.app.config.server_name)).all()

        for job in jobs_at_startup:
            self.__write_registry_file_if_absent(job)
            if not self.app.toolbox.has_tool(
                    job.tool_id, job.tool_version, exact=True):
                log.warning(
                    "(%s) Tool '%s' removed from tool config, unable to recover job"
                    % (job.id, job.tool_id))
                self.job_wrapper(job).fail(
                    'This tool was disabled before the job completed.  Please contact your Galaxy administrator.'
                )
            elif job.job_runner_name is not None and job.job_runner_external_id is None:
                # This could happen during certain revisions of Galaxy where a runner URL was persisted before the job was dispatched to a runner.
                log.debug(
                    "(%s) Job runner assigned but no external ID recorded, adding to the job handler queue"
                    % job.id)
                job.job_runner_name = None
                if self.track_jobs_in_database:
                    job.set_state(model.Job.states.NEW)
                else:
                    self.queue.put((job.id, job.tool_id))
            elif job.job_runner_name is not None and job.job_runner_external_id is not None and job.destination_id is None:
                # This is the first start after upgrading from URLs to destinations, convert the URL to a destination and persist
                job_wrapper = self.job_wrapper(job)
                job_destination = self.dispatcher.url_to_destination(
                    job.job_runner_name)
                if job_destination.id is None:
                    job_destination.id = 'legacy_url'
                job_wrapper.set_job_destination(job_destination,
                                                job.job_runner_external_id)
                self.dispatcher.recover(job, job_wrapper)
                log.info(
                    '(%s) Converted job from a URL to a destination and recovered'
                    % (job.id))
            elif job.job_runner_name is None:
                # Never (fully) dispatched
                log.debug(
                    "(%s) No job runner assigned and job still in '%s' state, adding to the job handler queue"
                    % (job.id, job.state))
                if self.track_jobs_in_database:
                    job.set_state(model.Job.states.NEW)
                else:
                    self.queue.put((job.id, job.tool_id))
            else:
                # Already dispatched and running
                job_wrapper = self.__recover_job_wrapper(job)
                self.dispatcher.recover(job, job_wrapper)
        if self.sa_session.dirty:
            self.sa_session.flush()

    def __recover_job_wrapper(self, job):
        # Already dispatched and running
        job_wrapper = self.job_wrapper(job)
        # Use the persisted destination as its params may differ from
        # what's in the job_conf xml
        job_destination = JobDestination(id=job.destination_id,
                                         runner=job.job_runner_name,
                                         params=job.destination_params)
        # resubmits are not persisted (it's a good thing) so they
        # should be added back to the in-memory destination on startup
        try:
            config_job_destination = self.app.job_config.get_destination(
                job.destination_id)
            job_destination.resubmit = config_job_destination.resubmit
        except KeyError:
            log.debug(
                '(%s) Recovered destination id (%s) does not exist in job config (but this may be normal in the case of a dynamically generated destination)',
                job.id, job.destination_id)
        job_wrapper.job_runner_mapper.cached_job_destination = job_destination
        return job_wrapper

    def __monitor(self):
        """
        Continually iterate the waiting jobs, checking if each is ready to
        run and dispatching if so.
        """
        while self.monitor_running:
            try:
                # If jobs are locked, there's nothing to monitor and we skip
                # to the sleep.
                if not self.app.job_manager.job_lock:
                    self.__monitor_step()
            except Exception:
                log.exception("Exception in monitor_step")
            self._monitor_sleep(1)

    def __monitor_step(self):
        """
        Called repeatedly by `monitor` to process waiting jobs. Gets any new
        jobs (either from the database or from its own queue), then iterates
        over all new and waiting jobs to check the state of the jobs each
        depends on. If the job has dependencies that have not finished, it
        goes to the waiting queue. If the job has dependencies with errors,
        it is marked as having errors and removed from the queue. If the job
        belongs to an inactive user it is ignored.
        Otherwise, the job is dispatched.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        resubmit_jobs = []
        if self.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            hda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputDatasetAssociation) \
                .join(model.HistoryDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_((model.Job.state == model.Job.states.NEW),
                             or_((model.HistoryDatasetAssociation._state == model.HistoryDatasetAssociation.states.FAILED_METADATA),
                                 (model.HistoryDatasetAssociation.deleted == true()),
                                 (model.Dataset.state != model.Dataset.states.OK),
                                 (model.Dataset.deleted == true())))).subquery()
            ldda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputLibraryDatasetAssociation) \
                .join(model.LibraryDatasetDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_((model.Job.state == model.Job.states.NEW),
                        or_((model.LibraryDatasetDatasetAssociation._state != null()),
                            (model.LibraryDatasetDatasetAssociation.deleted == true()),
                            (model.Dataset.state != model.Dataset.states.OK),
                            (model.Dataset.deleted == true())))).subquery()
            if self.app.config.user_activation_on:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .outerjoin(model.User) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 or_((model.Job.user_id == null()), (model.User.active == true())),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            else:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            # Fetch all "resubmit" jobs
            resubmit_jobs = self.sa_session.query(model.Job).enable_eagerloads(False) \
                .filter(and_((model.Job.state == model.Job.states.RESUBMITTED),
                             (model.Job.handler == self.app.config.server_name))) \
                .order_by(model.Job.id).all()
        else:
            # Get job objects and append to watch queue for any which were
            # previously waiting
            for job_id in self.waiting_jobs:
                jobs_to_check.append(
                    self.sa_session.query(model.Job).get(job_id))
            try:
                while 1:
                    message = self.queue.get_nowait()
                    if message is self.STOP_SIGNAL:
                        return
                    # Unpack the message
                    job_id, tool_id = message
                    # Get the job object and append to watch queue
                    jobs_to_check.append(
                        self.sa_session.query(model.Job).get(job_id))
            except Empty:
                pass
        # Ensure that we get new job counts on each iteration
        self.__clear_job_count()
        # Check resubmit jobs first so that limits of new jobs will still be enforced
        for job in resubmit_jobs:
            log.debug(
                '(%s) Job was resubmitted and is being dispatched immediately',
                job.id)
            # Reassemble resubmit job destination from persisted value
            jw = self.__recover_job_wrapper(job)
            if jw.is_ready_for_resubmission(job):
                self.increase_running_job_count(job.user_id,
                                                jw.job_destination.id)
                self.dispatcher.put(jw)
        # Iterate over new and waiting jobs and look for any that are
        # ready to run
        new_waiting_jobs = []
        for job in jobs_to_check:
            try:
                # Check the job's dependencies, requeue if they're not done.
                # Some of these states will only happen when using the in-memory job queue
                if job.copied_from_job_id:
                    copied_from_job = self.sa_session.query(model.Job).get(
                        job.copied_from_job_id)
                    job.numeric_metrics = copied_from_job.numeric_metrics
                    job.text_metrics = copied_from_job.text_metrics
                    job.dependencies = copied_from_job.dependencies
                    job.state = copied_from_job.state
                    job.stderr = copied_from_job.stderr
                    job.stdout = copied_from_job.stdout
                    job.command_line = copied_from_job.command_line
                    job.traceback = copied_from_job.traceback
                    job.tool_version = copied_from_job.tool_version
                    job.exit_code = copied_from_job.exit_code
                    job.job_runner_name = copied_from_job.job_runner_name
                    job.job_runner_external_id = copied_from_job.job_runner_external_id
                    continue
                job_state = self.__check_job_state(job)
                if job_state == JOB_WAIT:
                    new_waiting_jobs.append(job.id)
                elif job_state == JOB_INPUT_ERROR:
                    log.info(
                        "(%d) Job unable to run: one or more inputs in error state"
                        % job.id)
                elif job_state == JOB_INPUT_DELETED:
                    log.info(
                        "(%d) Job unable to run: one or more inputs deleted" %
                        job.id)
                elif job_state == JOB_READY:
                    self.dispatcher.put(self.job_wrappers.pop(job.id))
                    log.info("(%d) Job dispatched" % job.id)
                elif job_state == JOB_DELETED:
                    log.info("(%d) Job deleted by user while still queued" %
                             job.id)
                elif job_state == JOB_ADMIN_DELETED:
                    log.info("(%d) Job deleted by admin while still queued" %
                             job.id)
                elif job_state in (JOB_USER_OVER_QUOTA,
                                   JOB_USER_OVER_TOTAL_WALLTIME):
                    if job_state == JOB_USER_OVER_QUOTA:
                        log.info("(%d) User (%s) is over quota: job paused" %
                                 (job.id, job.user_id))
                    else:
                        log.info(
                            "(%d) User (%s) is over total walltime limit: job paused"
                            % (job.id, job.user_id))

                    job.set_state(model.Job.states.PAUSED)
                    for dataset_assoc in job.output_datasets + job.output_library_datasets:
                        dataset_assoc.dataset.dataset.state = model.Dataset.states.PAUSED
                        dataset_assoc.dataset.info = "Execution of this dataset's job is paused because you were over your disk quota at the time it was ready to run"
                        self.sa_session.add(dataset_assoc.dataset.dataset)
                    self.sa_session.add(job)
                elif job_state == JOB_ERROR:
                    log.error("(%d) Error checking job readiness" % job.id)
                else:
                    log.error("(%d) Job in unknown state '%s'" %
                              (job.id, job_state))
                    new_waiting_jobs.append(job.id)
            except Exception:
                log.exception("failure running job %d", job.id)
        # Update the waiting list
        if not self.track_jobs_in_database:
            self.waiting_jobs = new_waiting_jobs
        # Remove cached wrappers for any jobs that are no longer being tracked
        for id in list(self.job_wrappers.keys()):
            if id not in new_waiting_jobs:
                del self.job_wrappers[id]
        # Flush, if we updated the state
        self.sa_session.flush()
        # Done with the session
        self.sa_session.remove()

    def __check_job_state(self, job):
        """
        Check if a job is ready to run by verifying that each of its input
        datasets is ready (specifically in the OK state). If any input dataset
        has an error, fail the job and return JOB_INPUT_ERROR. If any input
        dataset is deleted, fail the job and return JOB_INPUT_DELETED.  If all
        input datasets are in OK state, return JOB_READY indicating that the
        job can be dispatched. Otherwise, return JOB_WAIT indicating that input
        datasets are still being prepared.
        """
        if not self.track_jobs_in_database:
            in_memory_not_ready_state = self.__verify_in_memory_job_inputs(job)
            if in_memory_not_ready_state:
                return in_memory_not_ready_state

        # Else, if tracking in the database, job.state is guaranteed to be NEW and
        # the inputs are guaranteed to be OK.

        # Create the job wrapper so that the destination can be set
        job_id = job.id
        job_wrapper = self.job_wrappers.get(job_id, None)
        if not job_wrapper:
            job_wrapper = self.job_wrapper(job)
            self.job_wrappers[job_id] = job_wrapper

        # If state == JOB_READY, assume job_destination also set - otherwise
        # in case of various error or cancelled states do not assume
        # destination has been set.
        state, job_destination = self.__verify_job_ready(job, job_wrapper)

        if state == JOB_READY:
            # PASS.  increase usage by one job (if caching) so that multiple jobs aren't dispatched on this queue iteration
            self.increase_running_job_count(job.user_id, job_destination.id)
        return state

    def __verify_job_ready(self, job, job_wrapper):
        """ Compute job destination and verify job is ready at that
        destination by checking job limits and quota. If this method
        returns a job state of JOB_READY - it MUST also return a job
        destination.
        """
        job_destination = None
        try:
            assert job_wrapper.tool is not None, 'This tool was disabled before the job completed.  Please contact your Galaxy administrator.'
            # Cause the job_destination to be set and cached by the mapper
            job_destination = job_wrapper.job_destination
        except AssertionError as e:
            log.warning(
                "(%s) Tool '%s' removed from tool config, unable to run job" %
                (job.id, job.tool_id))
            job_wrapper.fail(e)
            return JOB_ERROR, job_destination
        except JobNotReadyException as e:
            job_state = e.job_state or JOB_WAIT
            return job_state, None
        except Exception as e:
            failure_message = getattr(e, 'failure_message',
                                      DEFAULT_JOB_PUT_FAILURE_MESSAGE)
            if failure_message == DEFAULT_JOB_PUT_FAILURE_MESSAGE:
                log.exception('Failed to generate job destination')
            else:
                log.debug("Intentionally failing job with message (%s)" %
                          failure_message)
            job_wrapper.fail(failure_message)
            return JOB_ERROR, job_destination
        # job is ready to run, check limits
        # TODO: these checks should be refactored to minimize duplication and made more modular/pluggable
        state = self.__check_destination_jobs(job, job_wrapper)

        if state == JOB_READY:
            state = self.__check_user_jobs(job, job_wrapper)
        if state == JOB_READY and self.app.config.enable_quotas:
            quota = self.app.quota_agent.get_quota(job.user)
            if quota is not None:
                try:
                    usage = self.app.quota_agent.get_usage(user=job.user,
                                                           history=job.history)
                    if usage > quota:
                        return JOB_USER_OVER_QUOTA, job_destination
                except AssertionError as e:
                    pass  # No history, should not happen with an anon user
        # Check total walltime limits
        if (state == JOB_READY
                and "delta" in self.app.job_config.limits.total_walltime):
            jobs_to_check = self.sa_session.query(model.Job).filter(
                model.Job.user_id == job.user.id, model.Job.update_time >=
                datetime.datetime.now() - datetime.timedelta(
                    self.app.job_config.limits.total_walltime["window"]),
                model.Job.state == 'ok').all()
            time_spent = datetime.timedelta(0)
            for job in jobs_to_check:
                # History is job.state_history
                started = None
                finished = None
                for history in sorted(job.state_history,
                                      key=lambda history: history.update_time):
                    if history.state == "running":
                        started = history.create_time
                    elif history.state == "ok":
                        finished = history.create_time

                time_spent += finished - started

            if time_spent > self.app.job_config.limits.total_walltime["delta"]:
                return JOB_USER_OVER_TOTAL_WALLTIME, job_destination

        return state, job_destination

    def __verify_in_memory_job_inputs(self, job):
        """ Perform the same checks that happen via SQL for in-memory managed
        jobs.
        """
        if job.state == model.Job.states.DELETED:
            return JOB_DELETED
        elif job.state == model.Job.states.ERROR:
            return JOB_ADMIN_DELETED
        for dataset_assoc in job.input_datasets + job.input_library_datasets:
            idata = dataset_assoc.dataset
            if not idata:
                continue
            # don't run jobs for which the input dataset was deleted
            if idata.deleted:
                self.job_wrappers.pop(job.id, self.job_wrapper(job)).fail(
                    "input data %s (file: %s) was deleted before the job started"
                    % (idata.hid, idata.file_name))
                return JOB_INPUT_DELETED
            # an error in the input data causes us to bail immediately
            elif idata.state == idata.states.ERROR:
                self.job_wrappers.pop(job.id, self.job_wrapper(job)).fail(
                    "input data %s is in error state" % (idata.hid))
                return JOB_INPUT_ERROR
            elif idata.state == idata.states.FAILED_METADATA:
                self.job_wrappers.pop(job.id, self.job_wrapper(job)).fail(
                    "input data %s failed to properly set metadata" %
                    (idata.hid))
                return JOB_INPUT_ERROR
            elif idata.state != idata.states.OK and not (
                    idata.state == idata.states.SETTING_METADATA
                    and job.tool_id is not None and job.tool_id ==
                    self.app.datatypes_registry.set_external_metadata_tool.id):
                # need to requeue
                return JOB_WAIT

        # All inputs ready to go.
        return None

    def __clear_job_count(self):
        self.user_job_count = None
        self.user_job_count_per_destination = None
        self.total_job_count_per_destination = None

    def get_user_job_count(self, user_id):
        self.__cache_user_job_count()
        # This could have been incremented by a previous job dispatched on this iteration, even if we're not caching
        rval = self.user_job_count.get(user_id, 0)
        if not self.app.config.cache_user_job_count:
            result = self.sa_session.execute(
                select([func.count(model.Job.table.c.id)]).where(
                    and_(
                        model.Job.table.c.state.in_(
                            (model.Job.states.QUEUED, model.Job.states.RUNNING,
                             model.Job.states.RESUBMITTED)),
                        (model.Job.table.c.user_id == user_id))))
            for row in result:
                # there should only be one row
                rval += row[0]
        return rval

    def __cache_user_job_count(self):
        # Cache the job count if necessary
        if self.user_job_count is None and self.app.config.cache_user_job_count:
            self.user_job_count = {}
            query = self.sa_session.execute(
                select([
                    model.Job.table.c.user_id,
                    func.count(model.Job.table.c.user_id)
                ]).where(
                    and_(
                        model.Job.table.c.state.in_(
                            (model.Job.states.QUEUED, model.Job.states.RUNNING,
                             model.Job.states.RESUBMITTED)),
                        (model.Job.table.c.user_id != null()))).group_by(
                            model.Job.table.c.user_id))
            for row in query:
                self.user_job_count[row[0]] = row[1]
        elif self.user_job_count is None:
            self.user_job_count = {}

    def get_user_job_count_per_destination(self, user_id):
        self.__cache_user_job_count_per_destination()
        cached = self.user_job_count_per_destination.get(user_id, {})
        if self.app.config.cache_user_job_count:
            rval = cached
        else:
            # The cached count is still used even when we're not caching, it is
            # incremented when a job is run by this handler to ensure that
            # multiple jobs can't get past the limits in one iteration of the
            # queue.
            rval = {}
            rval.update(cached)
            result = self.sa_session.execute(
                select([
                    model.Job.table.c.destination_id,
                    func.count(
                        model.Job.table.c.destination_id).label('job_count')
                ]).where(
                    and_(
                        model.Job.table.c.state.in_(
                            (model.Job.states.QUEUED,
                             model.Job.states.RUNNING)),
                        (model.Job.table.c.user_id == user_id))).group_by(
                            model.Job.table.c.destination_id))
            for row in result:
                # Add the count from the database to the cached count
                rval[row['destination_id']] = rval.get(row['destination_id'],
                                                       0) + row['job_count']
        return rval

    def __cache_user_job_count_per_destination(self):
        # Cache the job count if necessary
        if self.user_job_count_per_destination is None and self.app.config.cache_user_job_count:
            self.user_job_count_per_destination = {}
            result = self.sa_session.execute(
                select([
                    model.Job.table.c.user_id,
                    model.Job.table.c.destination_id,
                    func.count(model.Job.table.c.user_id).label('job_count')
                ]).where(
                    and_(
                        model.Job.table.c.state.in_(
                            (model.Job.states.QUEUED,
                             model.Job.states.RUNNING)))).group_by(
                                 model.Job.table.c.user_id,
                                 model.Job.table.c.destination_id))
            for row in result:
                if row['user_id'] not in self.user_job_count_per_destination:
                    self.user_job_count_per_destination[row['user_id']] = {}
                self.user_job_count_per_destination[row['user_id']][
                    row['destination_id']] = row['job_count']
        elif self.user_job_count_per_destination is None:
            self.user_job_count_per_destination = {}

    def increase_running_job_count(self, user_id, destination_id):
        if self.app.job_config.limits.registered_user_concurrent_jobs or \
           self.app.job_config.limits.anonymous_user_concurrent_jobs or \
           self.app.job_config.limits.destination_user_concurrent_jobs:
            if self.user_job_count is None:
                self.user_job_count = {}
            if self.user_job_count_per_destination is None:
                self.user_job_count_per_destination = {}
            self.user_job_count[user_id] = self.user_job_count.get(user_id,
                                                                   0) + 1
            if user_id not in self.user_job_count_per_destination:
                self.user_job_count_per_destination[user_id] = {}
            self.user_job_count_per_destination[user_id][
                destination_id] = self.user_job_count_per_destination[
                    user_id].get(destination_id, 0) + 1
        if self.app.job_config.limits.destination_total_concurrent_jobs:
            if self.total_job_count_per_destination is None:
                self.total_job_count_per_destination = {}
            self.total_job_count_per_destination[
                destination_id] = self.total_job_count_per_destination.get(
                    destination_id, 0) + 1

    def __check_user_jobs(self, job, job_wrapper):
        # TODO: Update output datasets' _state = LIMITED or some such new
        # state, so the UI can reflect what jobs are waiting due to concurrency
        # limits
        if job.user:
            # Check the hard limit first
            if self.app.job_config.limits.registered_user_concurrent_jobs:
                count = self.get_user_job_count(job.user_id)
                # Check the user's number of dispatched jobs against the overall limit
                if count >= self.app.job_config.limits.registered_user_concurrent_jobs:
                    return JOB_WAIT
            # If we pass the hard limit, also check the per-destination count
            id = job_wrapper.job_destination.id
            count_per_id = self.get_user_job_count_per_destination(job.user_id)
            if id in self.app.job_config.limits.destination_user_concurrent_jobs:
                count = count_per_id.get(id, 0)
                # Check the user's number of dispatched jobs in the assigned destination id against the limit for that id
                if count >= self.app.job_config.limits.destination_user_concurrent_jobs[
                        id]:
                    return JOB_WAIT
            # If we pass the destination limit (if there is one), also check limits on any tags (if any)
            if job_wrapper.job_destination.tags:
                for tag in job_wrapper.job_destination.tags:
                    # Check each tag for this job's destination
                    if tag in self.app.job_config.limits.destination_user_concurrent_jobs:
                        # Only if there's a limit defined for this tag
                        count = 0
                        for id in [
                                d.id for d in
                                self.app.job_config.get_destinations(tag)
                        ]:
                            # Add up the aggregate job total for this tag
                            count += count_per_id.get(id, 0)
                        if count >= self.app.job_config.limits.destination_user_concurrent_jobs[
                                tag]:
                            return JOB_WAIT
        elif job.galaxy_session:
            # Anonymous users only get the hard limit
            if self.app.job_config.limits.anonymous_user_concurrent_jobs:
                count = self.sa_session.query(model.Job).enable_eagerloads(False) \
                            .filter(and_(model.Job.session_id == job.galaxy_session.id,
                                         or_(model.Job.state == model.Job.states.RUNNING,
                                             model.Job.state == model.Job.states.QUEUED))).count()
                if count >= self.app.job_config.limits.anonymous_user_concurrent_jobs:
                    return JOB_WAIT
        else:
            log.warning(
                'Job %s is not associated with a user or session so job concurrency limit cannot be checked.'
                % job.id)
        return JOB_READY

    def __cache_total_job_count_per_destination(self):
        # Cache the job count if necessary
        if self.total_job_count_per_destination is None:
            self.total_job_count_per_destination = {}
            result = self.sa_session.execute(
                select([
                    model.Job.table.c.destination_id,
                    func.count(
                        model.Job.table.c.destination_id).label('job_count')
                ]).where(
                    and_(
                        model.Job.table.c.state.in_(
                            (model.Job.states.QUEUED,
                             model.Job.states.RUNNING)))).group_by(
                                 model.Job.table.c.destination_id))
            for row in result:
                self.total_job_count_per_destination[
                    row['destination_id']] = row['job_count']

    def get_total_job_count_per_destination(self):
        self.__cache_total_job_count_per_destination()
        # Always use caching (at worst a job will have to wait one iteration,
        # and this would be more fair anyway as it ensures FIFO scheduling,
        # insofar as FIFO would be fair...)
        return self.total_job_count_per_destination

    def __check_destination_jobs(self, job, job_wrapper):
        if self.app.job_config.limits.destination_total_concurrent_jobs:
            id = job_wrapper.job_destination.id
            count_per_id = self.get_total_job_count_per_destination()
            if id in self.app.job_config.limits.destination_total_concurrent_jobs:
                count = count_per_id.get(id, 0)
                # Check the number of dispatched jobs in the assigned destination id against the limit for that id
                if count >= self.app.job_config.limits.destination_total_concurrent_jobs[
                        id]:
                    return JOB_WAIT
            # If we pass the destination limit (if there is one), also check limits on any tags (if any)
            if job_wrapper.job_destination.tags:
                for tag in job_wrapper.job_destination.tags:
                    # Check each tag for this job's destination
                    if tag in self.app.job_config.limits.destination_total_concurrent_jobs:
                        # Only if there's a limit defined for this tag
                        count = 0
                        for id in [
                                d.id for d in
                                self.app.job_config.get_destinations(tag)
                        ]:
                            # Add up the aggregate job total for this tag
                            count += count_per_id.get(id, 0)
                        if count >= self.app.job_config.limits.destination_total_concurrent_jobs[
                                tag]:
                            return JOB_WAIT
        return JOB_READY

    def _handle_setup_msg(self, job_id=None):
        job = self.sa_session.query(model.Job).get(job_id)
        if job.handler is None:
            job.handler = self.app.config.server_name
            self.sa_session.add(job)
            self.sa_session.flush()
            # If not tracking jobs in the database
            self.put(job.id, job.tool_id)
        else:
            log.warning(
                "(%s) Handler '%s' received setup message but handler '%s' is already assigned, ignoring",
                job.id, self.app.config.server_name, job.handler)

    def put(self, job_id, tool_id):
        """Add a job to the queue (by job identifier)"""
        if not self.track_jobs_in_database:
            self.queue.put((job_id, tool_id))
            self.sleeper.wake()
        else:
            # Workflow invocations farmed out to workers will submit jobs through here. If a handler is unassigned, we
            # will submit for one, or else claim it ourself. TODO: This should be moved to a higher level as it's now
            # implemented here and in MessageJobQueue
            job = self.sa_session.query(model.Job).get(job_id)
            if job.handler is None and self.app.application_stack.has_pool(
                    self.app.application_stack.pools.JOB_HANDLERS):
                msg = JobHandlerMessage(task='setup', job_id=job_id)
                self.app.application_stack.send_message(
                    self.app.application_stack.pools.JOB_HANDLERS, msg)

    def shutdown(self):
        """Attempts to gracefully shut down the worker thread"""
        if self.parent_pid != os.getpid():
            # We're not the real job queue, do nothing
            return
        else:
            log.info("sending stop signal to worker thread")
            self.stop_monitoring()
            if not self.app.config.track_jobs_in_database:
                self.queue.put(self.STOP_SIGNAL)
            # A message could still be received while shutting down, should be ok since they will be picked up on next startup.
            self.app.application_stack.deregister_message_handler(
                name=JobHandlerMessage.target)
            self.sleeper.wake()
            self.shutdown_monitor()
            log.info("job handler queue stopped")
            self.dispatcher.shutdown()
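
The put()/shutdown() pair above relies on a unique sentinel object (STOP_SIGNAL) being pushed through the same queue the worker consumes, so the worker exits cleanly once it has drained everything queued before it. Below is a minimal, self-contained sketch of that pattern using only the Python 3 standard library; the class and names are illustrative, not Galaxy's:

import threading
from queue import Queue

class SentinelWorker(object):
    STOP_SIGNAL = object()  # unique sentinel, compared by identity

    def __init__(self):
        self.queue = Queue()
        self.thread = threading.Thread(target=self._run, daemon=True)
        self.thread.start()

    def _run(self):
        while True:
            item = self.queue.get()
            if item is self.STOP_SIGNAL:
                return  # graceful exit once the sentinel is reached
            print('processing', item)

    def put(self, item):
        self.queue.put(item)

    def shutdown(self):
        # Everything queued before the sentinel is still processed.
        self.queue.put(self.STOP_SIGNAL)
        self.thread.join()

w = SentinelWorker()
w.put('job-1')
w.shutdown()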
Code example #30
0
File: handler.py  Project: ImmPortDB/immport-galaxy
class JobHandlerStopQueue(Monitors):
    """
    A queue for jobs which need to be terminated prematurely.
    """
    STOP_SIGNAL = object()

    def __init__(self, app, dispatcher):
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()

        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting = []

        name = "JobHandlerStopQueue.monitor_thread"
        self._init_monitor_thread(name, config=app.config)
        log.info("job handler stop queue started")

    def start(self):
        # Start the queue
        self.monitor_thread.start()
        log.info("job handler stop queue started")

    def monitor(self):
        """
        Continually iterate the waiting jobs, stop any that are found.
        """
        # HACK: Delay until after forking, we need a way to do post fork notification!!!
        time.sleep(10)
        while self.monitor_running:
            try:
                self.monitor_step()
            except Exception:
                log.exception("Exception in monitor_step")
            # Sleep
            self._monitor_sleep(1)

    def monitor_step(self):
        """
        Called repeatedly by `monitor` to stop jobs.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        if self.app.config.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            newly_deleted_jobs = self.sa_session.query(model.Job).enable_eagerloads(False) \
                                     .filter((model.Job.state == model.Job.states.DELETED_NEW) &
                                             (model.Job.handler == self.app.config.server_name)).all()
            for job in newly_deleted_jobs:
                jobs_to_check.append((job, job.stderr))
        # Also pull from the queue (in the case of Administrative stopped jobs)
        try:
            while 1:
                message = self.queue.get_nowait()
                if message is self.STOP_SIGNAL:
                    return
                # Unpack the message
                job_id, error_msg = message
                # Get the job object and append to watch queue
                jobs_to_check.append((self.sa_session.query(model.Job).get(job_id), error_msg))
        except Empty:
            pass
        for job, error_msg in jobs_to_check:
            if (job.state not in
                    (job.states.DELETED_NEW,
                     job.states.DELETED) and
                    job.finished):
                # terminated before it got here
                log.debug('Job %s already finished, not deleting or stopping', job.id)
                continue
            final_state = job.states.DELETED
            if error_msg is not None:
                final_state = job.states.ERROR
                job.info = error_msg
            job.set_final_state(final_state)
            self.sa_session.add(job)
            self.sa_session.flush()
            if job.job_runner_name is not None:
                # tell the dispatcher to stop the job
                self.dispatcher.stop(job)

    def put(self, job_id, error_msg=None):
        if not self.app.config.track_jobs_in_database:
            self.queue.put((job_id, error_msg))

    def shutdown(self):
        """Attempts to gracefully shut down the worker thread"""
        if self.parent_pid != os.getpid():
            # We're not the real job queue, do nothing
            return
        else:
            log.info("sending stop signal to worker thread")
            self.stop_monitoring()
            if not self.app.config.track_jobs_in_database:
                self.queue.put(self.STOP_SIGNAL)
            self.shutdown_monitor()
            log.info("job handler stop queue stopped")
Code example #31
0
class ThreadedFifoBuffer(FifoBuffer):
    """
    FIFO-in-memory connection inside dedicated thread.

    This is external-IO usable for Moler since it has its own runner
    (thread) that can work in the background and pull data from the FIFO-mem connection.
    Usable for integration tests.
    """

    def __init__(self, moler_connection, echo=True, name=None, logger_name=""):
        """Initialization of FIFO-mem-threaded connection."""
        super(ThreadedFifoBuffer, self).__init__(moler_connection=moler_connection,
                                                 echo=echo,
                                                 name=name,
                                                 logger_name=logger_name)
        self.pulling_thread = None
        self.injections = Queue()

    def open(self):
        """Start thread pulling data from FIFO buffer."""
        ret = super(ThreadedFifoBuffer, self).open()
        done = threading.Event()
        self.pulling_thread = TillDoneThread(target=self.pull_data,
                                             done_event=done,
                                             kwargs={'pulling_done': done})
        self.pulling_thread.start()
        self._log(msg="open {}".format(self), level=logging.INFO)
        self._notify_on_connect()
        return ret

    def close(self):
        """Stop pulling thread."""
        if self.pulling_thread:
            self.pulling_thread.join()
            self.pulling_thread = None
        super(ThreadedFifoBuffer, self).close()
        self._log(msg="closed {}".format(self), level=logging.INFO)
        self._notify_on_disconnect()

    def inject(self, input_bytes, delay=0.0):
        """
        Add bytes to end of buffer

        :param input_bytes: iterable of bytes to inject
        :param delay: delay before each inject
        :return: None
        """
        for data in input_bytes:
            self.injections.put((data, delay))
        if not delay:
            time.sleep(0.05)  # give subsequent read() a chance to get data

    def _inject_deferred(self):
        if self.deferred_injections:
            for data, delay in self.deferred_injections:
                self.injections.put((data, delay))
            self.deferred_injections = []
            time.sleep(0.05)  # give subsequent read() a chance to get data

    def pull_data(self, pulling_done):
        """Pull data from FIFO buffer."""
        while not pulling_done.is_set():
            self.read()  # internally forwards to embedded Moler connection
            try:
                data, delay = self.injections.get_nowait()
                if delay:
                    time.sleep(delay)
                self._inject(data)
                self.injections.task_done()
            except Empty:
                time.sleep(0.01)  # give FIFO chance to get data
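
open() and pull_data() above run the pull loop in a thread that keeps going until a threading.Event is set. Moler's TillDoneThread adds more (e.g. exception forwarding) and is not reproduced here; the sketch below only shows the Event-controlled loop with plain standard-library primitives, and all names are illustrative:

import threading
import time
from queue import Queue, Empty

def pull_data(injections, pulling_done):
    # Loop until the caller signals completion via the Event.
    while not pulling_done.is_set():
        try:
            data, delay = injections.get_nowait()
            if delay:
                time.sleep(delay)
            print('injected', data)
        except Empty:
            time.sleep(0.01)  # give the queue a chance to fill

injections = Queue()
done = threading.Event()
worker = threading.Thread(target=pull_data, args=(injections, done), daemon=True)
worker.start()
injections.put((b'hello', 0.0))
time.sleep(0.1)
done.set()    # ask the pulling thread to finish
worker.join()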
Code example #32
0
class AsynchronousJobRunner(BaseJobRunner, Monitors):
    """Parent class for any job runner that runs jobs asynchronously (e.g. via
    a distributed resource manager).  Provides general methods for having a
    thread to monitor the state of asynchronous jobs and submitting those jobs
    to the correct methods (queue, finish, cleanup) at appropriate times.
    """
    def __init__(self, app, nworkers, **kwargs):
        super(AsynchronousJobRunner, self).__init__(app, nworkers, **kwargs)
        # 'watched' and 'queue' are both used to keep track of jobs to watch.
        # 'queue' is used to add new watched jobs, and can be called from
        # any thread (usually by the 'queue_job' method). 'watched' must only
        # be modified by the monitor thread, which will move items from 'queue'
        # to 'watched' and then manage the watched jobs.
        self.watched = []
        self.monitor_queue = Queue()

    def _init_monitor_thread(self):
        name = "%s.monitor_thread" % self.runner_name
        super(AsynchronousJobRunner,
              self)._init_monitor_thread(name=name,
                                         target=self.monitor,
                                         start=True,
                                         config=self.app.config)

    def handle_stop(self):
        # DRMAA and SGE runners should override this and disconnect.
        pass

    def monitor(self):
        """
        Watches jobs currently in the monitor queue and deals with state
        changes (queued to running) and job completion.
        """
        while True:
            # Take any new watched jobs and put them on the monitor list
            try:
                while True:
                    async_job_state = self.monitor_queue.get_nowait()
                    if async_job_state is STOP_SIGNAL:
                        # TODO: This is where any cleanup would occur
                        self.handle_stop()
                        return
                    self.watched.append(async_job_state)
            except Empty:
                pass
            # Iterate over the list of watched jobs and check state
            try:
                self.check_watched_items()
            except Exception:
                log.exception('Unhandled exception checking active jobs')
            # Sleep a bit before the next state check
            time.sleep(1)

    def monitor_job(self, job_state):
        self.monitor_queue.put(job_state)

    def shutdown(self):
        """Attempts to gracefully shut down the monitor thread"""
        log.info("%s: Sending stop signal to monitor thread" %
                 self.runner_name)
        self.monitor_queue.put(STOP_SIGNAL)
        # Call the parent's shutdown method to stop workers
        self.shutdown_monitor()
        super(AsynchronousJobRunner, self).shutdown()

    def check_watched_items(self):
        """
        This method is responsible for iterating over self.watched and handling
        state changes and updating self.watched with a new list of watched job
        states. Subclasses can opt to override this directly (as older job runners will
        initially) or just override check_watched_item and allow the list processing to
        reuse the logic here.
        """
        new_watched = []
        for async_job_state in self.watched:
            new_async_job_state = self.check_watched_item(async_job_state)
            if new_async_job_state:
                new_watched.append(new_async_job_state)
        self.watched = new_watched

    # Subclasses should implement this unless they override check_watched_items all together.
    def check_watched_item(self, job_state):
        raise NotImplementedError()

    def finish_job(self, job_state):
        """
        Get the output/error for a finished job, pass to `job_wrapper.finish`
        and cleanup all the job's temporary files.
        """
        galaxy_id_tag = job_state.job_wrapper.get_id_tag()
        external_job_id = job_state.job_id

        # To ensure that files below are readable, ownership must be reclaimed first
        job_state.job_wrapper.reclaim_ownership()

        # wait for the files to appear
        which_try = 0
        collect_output_success = True
        while which_try < self.app.config.retry_job_output_collection + 1:
            try:
                with open(job_state.output_file,
                          "rb") as stdout_file, open(job_state.error_file,
                                                     'rb') as stderr_file:
                    stdout = shrink_stream_by_size(
                        stdout_file,
                        DATABASE_MAX_STRING_SIZE,
                        join_by="\n..\n",
                        left_larger=True,
                        beginning_on_size_error=True)
                    stderr = shrink_stream_by_size(
                        stderr_file,
                        DATABASE_MAX_STRING_SIZE,
                        join_by="\n..\n",
                        left_larger=True,
                        beginning_on_size_error=True)
                break
            except Exception as e:
                if which_try == self.app.config.retry_job_output_collection:
                    stdout = ''
                    stderr = job_state.runner_states.JOB_OUTPUT_NOT_RETURNED_FROM_CLUSTER
                    log.error('(%s/%s) %s: %s' %
                              (galaxy_id_tag, external_job_id, stderr, str(e)))
                    collect_output_success = False
                else:
                    time.sleep(1)
                which_try += 1

        if not collect_output_success:
            job_state.fail_message = stderr
            job_state.runner_state = job_state.runner_states.JOB_OUTPUT_NOT_RETURNED_FROM_CLUSTER
            self.mark_as_failed(job_state)
            return

        try:
            # This should be an 8-bit exit code, but read ahead anyway:
            exit_code_str = open(job_state.exit_code_file, "r").read(32)
        except Exception:
            # By default, the exit code is 0, which typically indicates success.
            exit_code_str = "0"

        try:
            # Decode the exit code. If it's bogus, then just use 0.
            exit_code = int(exit_code_str)
        except ValueError:
            log.warning("(%s/%s) Exit code '%s' invalid. Using 0." %
                        (galaxy_id_tag, external_job_id, exit_code_str))
            exit_code = 0

        # clean up the job files
        cleanup_job = job_state.job_wrapper.cleanup_job
        if cleanup_job == "always" or (not stderr
                                       and cleanup_job == "onsuccess"):
            job_state.cleanup()

        try:
            self._finish_or_resubmit_job(job_state, stdout, stderr, exit_code)
        except Exception:
            log.exception("(%s/%s) Job wrapper finish method failed" %
                          (galaxy_id_tag, external_job_id))
            job_state.job_wrapper.fail("Unable to finish job", exception=True)

    def mark_as_finished(self, job_state):
        self.work_queue.put((self.finish_job, job_state))

    def mark_as_failed(self, job_state):
        self.work_queue.put((self.fail_job, job_state))
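
check_watched_items() above rebuilds self.watched from whatever check_watched_item() returns: returning a (possibly updated) state keeps the job watched, returning None drops it. A tiny illustration of that contract with a hypothetical per-item check:

def check_watched_item(state):
    # Hypothetical check: keep anything still marked as running.
    return state if state.get('running') else None

watched = [{'id': 1, 'running': True}, {'id': 2, 'running': False}]
new_watched = []
for state in watched:
    new_state = check_watched_item(state)
    if new_state:
        new_watched.append(new_state)
print(new_watched)  # [{'id': 1, 'running': True}]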
Code example #33
0
class SingleMachineBatchSystem(BatchSystemSupport):
    """
    The interface for running jobs on a single machine, runs all the jobs you
    give it as they come in, but in parallel.

    Uses a single "daddy" thread to manage a fleet of child processes.
    
    Communication with the daddy thread happens via two queues: one queue of
    jobs waiting to be run (the input queue), and one queue of jobs that are
    finished/stopped and need to be returned by getUpdatedBatchJob (the output
    queue).

    When the batch system is shut down, the daddy thread is stopped.

    If running in debug-worker mode, jobs are run immediately as they are sent
    to the batch system, in the sending thread, and the daddy thread is not
    run. But the queues are still used.
    """
    @classmethod
    def supportsAutoDeployment(cls):
        return False

    @classmethod
    def supportsWorkerCleanup(cls):
        return True

    numCores = cpu_count()

    minCores = 0.1
    """
    The minimal fractional CPU. Tasks with a smaller core requirement will be rounded up to this
    value. 
    """
    physicalMemory = toil.physicalMemory()

    def __init__(self, config, maxCores, maxMemory, maxDisk):

        # Limit to the smaller of the user-imposed limit and what we actually
        # have on this machine for each resource.
        #
        # If we don't have up to the limit of the resource (and the resource
        # isn't the unlimited sentinel), warn.
        if maxCores > self.numCores:
            if maxCores != sys.maxsize:
                # We have an actually specified limit and not the default
                log.warning(
                    'Not enough cores! User limited to %i but we only have %i.',
                    maxCores, self.numCores)
            maxCores = self.numCores
        if maxMemory > self.physicalMemory:
            if maxMemory != sys.maxsize:
                # We have an actually specified limit and not the default
                log.warning(
                    'Not enough memory! User limited to %i bytes but we only have %i bytes.',
                    maxMemory, self.physicalMemory)
            maxMemory = self.physicalMemory
        self.physicalDisk = toil.physicalDisk(config)
        if maxDisk > self.physicalDisk:
            if maxDisk != sys.maxsize:
                # We have an actually specified limit and not the default
                log.warning(
                    'Not enough disk space! User limited to %i bytes but we only have %i bytes.',
                    maxDisk, self.physicalDisk)
            maxDisk = self.physicalDisk

        super(SingleMachineBatchSystem, self).__init__(config, maxCores,
                                                       maxMemory, maxDisk)
        assert self.maxCores >= self.minCores
        assert self.maxMemory >= 1

        # The scale allows the user to apply a factor to each task's cores requirement, thereby
        # squeezing more tasks onto each core (scale < 1) or stretching tasks over more cores
        # (scale > 1).
        self.scale = config.scale

        if config.badWorker > 0 and config.debugWorker:
            # We can't throw SIGUSR1 at the worker because it is also going to
            # be the leader and/or test harness.
            raise RuntimeError(
                "Cannot use badWorker and debugWorker together; "
                "worker would have to kill the leader")

        self.debugWorker = config.debugWorker

        # A counter to generate job IDs and a lock to guard it
        self.jobIndex = 0
        self.jobIndexLock = Lock()

        # A dictionary mapping IDs of submitted jobs to the command line
        self.jobs = {}
        """
        :type: dict[str,toil.job.JobNode]
        """

        # A queue of jobs waiting to be executed. Consumed by the daddy thread.
        self.inputQueue = Queue()

        # A queue of finished jobs. Produced by the daddy thread.
        self.outputQueue = Queue()

        # A dictionary mapping IDs of currently running jobs to their Info objects
        self.runningJobs = {}
        """
        :type: dict[str,Info]
        """

        # These next two are only used outside debug-worker mode

        # A dict mapping PIDs to Popen objects for running jobs.
        # Jobs that don't fork are executed one at a time in the main thread.
        self.children = {}
        """
        :type: dict[int,subprocess.Popen]
        """
        # A dict mapping child PIDs to the Job IDs they are supposed to be running.
        self.childToJob = {}
        """
        :type: dict[int,str]
        """

        # A pool representing available CPU in units of minCores
        self.coreFractions = ResourcePool(
            int(old_div(self.maxCores, self.minCores)), 'cores')
        # A pool representing available memory in bytes
        self.memory = ResourcePool(self.maxMemory, 'memory')
        # A pool representing the available space in bytes
        self.disk = ResourcePool(self.maxDisk, 'disk')

        # If we can't schedule something, we fill this in with a reason why
        self.schedulingStatusMessage = None

        # We use this event to signal shutdown
        self.shuttingDown = Event()

        # A thread in charge of managing all our child processes.
        # Also takes care of resource accounting.
        self.daddyThread = None
        # If it breaks it will fill this in
        self.daddyException = None

        if self.debugWorker:
            log.debug('Started in worker debug mode.')
        else:
            self.daddyThread = Thread(target=self.daddy, daemon=True)
            self.daddyThread.start()
            log.debug('Started in normal mode.')

    def daddy(self):
        """
        Be the "daddy" thread.

        Our job is to look at jobs from the input queue.
        
        If a job fits in the available resources, we allocate resources for it
        and kick off a child process.

        We also check on our children.

        When a child finishes, we reap it, release its resources, and put its
        information in the output queue.
        """

        try:
            log.debug('Started daddy thread.')

            while not self.shuttingDown.is_set():
                # Main loop

                while not self.shuttingDown.is_set():
                    # Try to start as many jobs as we can try to start
                    try:
                        # Grab something from the input queue if available.
                        args = self.inputQueue.get_nowait()
                        jobCommand, jobID, jobCores, jobMemory, jobDisk, environment = args

                        coreFractions = int(old_div(jobCores, self.minCores))

                        # Try to start the child
                        result = self._startChild(jobCommand, jobID,
                                                  coreFractions, jobMemory,
                                                  jobDisk, environment)

                        if result is None:
                            # We did not get the resources to run this job.
                            # Requeue last, so we can look at the next job.
                            # TODO: Have some kind of condition the job can wait on,
                            # but without threads (queues for jobs needing
                            # cores/memory/disk individually)?
                            self.inputQueue.put(args)
                            break

                        # Otherwise it's a PID if it succeeded, or False if it couldn't
                        # start. But we don't care either way here.

                    except Empty:
                        # Nothing to run. Stop looking in the queue.
                        break

                # Now check on our children.
                for done_pid in self._pollForDoneChildrenIn(self.children):
                    # A child has actually finished.
                    # Clean up after it.
                    self._handleChild(done_pid)

                # Then loop again: start and collect more jobs.
                # TODO: It would be good to be able to wait on a new job or a finished child, whichever comes first.
                # For now we just sleep and loop.
                time.sleep(0.01)

            # When we get here, we are shutting down.

            for popen in self.children.values():
                # Kill all the children, going through popen to avoid signaling re-used PIDs.
                popen.kill()
            for popen in self.children.values():
                # Reap all the children
                popen.wait()

            # Then exit the thread.
            return
        except Exception as e:
            log.critical('Unhandled exception in daddy thread: %s',
                         traceback.format_exc())
            # Pass the exception back to the main thread so it can stop the next person who calls into us.
            self.daddyException = e
            raise

    def _checkOnDaddy(self):
        if self.daddyException is not None:
            # The daddy thread broke and we cannot do our job
            log.critical(
                'Propagating unhandled exception in daddy thread to main thread'
            )
            exc = self.daddyException
            self.daddyException = None
            raise exc

    def _pollForDoneChildrenIn(self, pid_to_popen):
        """
        See if any children represented in the given dict from PID to Popen
        object have finished.
        
        Return a collection of their PIDs.
        
        Guarantees that each child's exit code will be gettable via wait() on
        the child's Popen object (i.e. does not reap the child, unless via
        Popen).
        """

        # We keep our found PIDs in a set so we can work around waitid showing
        # us the same one repeatedly.
        ready = set()

        # Find the waitid function
        waitid = getattr(os, 'waitid', None)

        if callable(waitid):
            # waitid exists (not Mac)

            while True:
                # Poll for any child to have exit, but don't reap it. Leave reaping
                # to the Popen.
                # TODO: What if someone else in Toil wants to do this syscall?
                # TODO: Is this one-notification-per-done-child with WNOHANG? Or
                # can we miss some? Or do we see the same one repeatedly until it
                # is reaped?
                try:
                    siginfo = waitid(os.P_ALL, -1,
                                     os.WEXITED | os.WNOWAIT | os.WNOHANG)
                except ChildProcessError:
                    # This happens when there is nothing to wait on right now,
                    # instead of the weird C behavior of overwriting a field in
                    # a pointed-to struct.
                    siginfo = None
                if siginfo is not None and siginfo.si_pid in pid_to_popen and siginfo.si_pid not in ready:
                    # Something new finished
                    ready.add(siginfo.si_pid)
                else:
                    # Nothing we own that we haven't seen before has finished.
                    return ready
        else:
            # On Mac there's no waitid and no way to wait and not reap.
            # Fall back on polling all the Popen objects.
            # To make this vaguely efficient we have to return done children in
            # batches.
            for pid, popen in pid_to_popen.items():
                if popen.poll() is not None:
                    # Process is done
                    ready.add(pid)
                    log.debug('Child %d has stopped', pid)

            # Return all the done processes we found
            return ready

    def _runDebugJob(self, jobCommand, jobID, environment):
        """
        Run the jobCommand right now, in the current thread.
        May only be called in debug-worker mode.
        Assumes resources are available.
        """

        assert self.debugWorker

        # TODO: It is not possible to kill running jobs in forkless mode,
        # because they are run immediately in the main thread.
        info = Info(time.time(), None, None, killIntended=False)
        self.runningJobs[jobID] = info

        if jobCommand.startswith("_toil_worker "):
            # We can actually run in this thread
            jobName, jobStoreLocator, jobStoreID = jobCommand.split()[
                1:]  # Parse command
            jobStore = Toil.resumeJobStore(jobStoreLocator)
            toil_worker.workerScript(
                jobStore,
                jobStore.config,
                jobName,
                jobStoreID,
                redirectOutputToLogFile=not self.debugWorker
            )  # Call the worker
        else:
            # Run synchronously. If starting or running the command fails, let the exception stop us.
            subprocess.check_call(jobCommand,
                                  shell=True,
                                  env=dict(os.environ, **environment))

        self.runningJobs.pop(jobID)
        if not info.killIntended:
            self.outputQueue.put(
                UpdatedBatchJobInfo(jobID=jobID,
                                    exitStatus=0,
                                    wallTime=time.time() - info.time,
                                    exitReason=None))

    def getSchedulingStatusMessage(self):
        # Implement the abstractBatchSystem's scheduling status message API
        return self.schedulingStatusMessage

    def _setSchedulingStatusMessage(self, message):
        """
        If we can't run a job, we record a short message about why not. If the
        leader wants to know what is up with us (for example, to diagnose a
        deadlock), it can ask us for the message.
        """

        self.schedulingStatusMessage = message

    def _startChild(self, jobCommand, jobID, coreFractions, jobMemory, jobDisk,
                    environment):
        """
        Start a child process for the given job.
        
        Allocate its required resources and save it in our bookkeeping structures.

        If the job is started, returns its PID.
        If the job fails to start, reports it as failed and returns False.
        If the job cannot get the resources it needs to start, returns None.
        """

        # We fill this in if we manage to actually start the child.
        popen = None

        # This is when we started working on the job.
        startTime = time.time()

        # See if we can fit the job in our resource pools right now.
        if self.coreFractions.acquireNow(coreFractions):
            # We got some cores
            if self.memory.acquireNow(jobMemory):
                # We got some memory
                if self.disk.acquireNow(jobDisk):
                    # We got the final resource, disk.
                    # Actually run the job.
                    # When it finishes we will release what it was using.
                    # So it is important to not lose track of the child process.

                    try:
                        # Launch the job
                        popen = subprocess.Popen(jobCommand,
                                                 shell=True,
                                                 env=dict(
                                                     os.environ,
                                                     **environment))
                    except Exception:
                        # If the job can't start, make sure we release resources now
                        self.coreFractions.release(coreFractions)
                        self.memory.release(jobMemory)
                        self.disk.release(jobDisk)

                        log.error('Could not start job %s: %s', jobID,
                                  traceback.format_exc())

                        # Report as failed.
                        self.outputQueue.put(
                            UpdatedBatchJobInfo(
                                jobID=jobID,
                                exitStatus=EXIT_STATUS_UNAVAILABLE_VALUE,
                                wallTime=0,
                                exitReason=None))

                        # Complain it broke.
                        return False
                    else:
                        # If the job did start, record it
                        self.children[popen.pid] = popen
                        # Make sure we can look it up by PID later
                        self.childToJob[popen.pid] = jobID
                        # Record that the job is running, and the resources it is using
                        info = Info(startTime,
                                    popen, (coreFractions, jobMemory, jobDisk),
                                    killIntended=False)
                        self.runningJobs[jobID] = info

                        log.debug('Launched job %s as child %d', jobID,
                                  popen.pid)

                        # Report success starting the job
                        # Note that if a PID were somehow 0 it would look like False
                        assert popen.pid != 0
                        return popen.pid
                else:
                    # We can't get disk, so free cores and memory
                    self.coreFractions.release(coreFractions)
                    self.memory.release(jobMemory)
                    self._setSchedulingStatusMessage(
                        'Not enough disk to run job %s' % jobID)
            else:
                # Free cores, since we can't get memory
                self.coreFractions.release(coreFractions)
                self._setSchedulingStatusMessage(
                    'Not enough memory to run job %s' % jobID)
        else:
            self._setSchedulingStatusMessage('Not enough cores to run job %s' %
                                             jobID)

        # If we get here, we didn't succeed or fail starting the job.
        # We didn't manage to get the resources.
        # Report that.
        return None

    def _handleChild(self, pid):
        """
        Handle a child process PID that has finished.
        The PID must be for a child job we started.
        Not thread safe to run at the same time as we are making more children.

        Remove the child from our bookkeeping structures and free its resources.
        """

        # Look up the child
        popen = self.children[pid]
        jobID = self.childToJob[pid]
        info = self.runningJobs[jobID]

        # Unpack the job resources
        (coreFractions, jobMemory, jobDisk) = info.resources

        # Clean up our records of the job.
        self.runningJobs.pop(jobID)
        self.childToJob.pop(pid)
        self.children.pop(pid)

        # See how the child did, and reap it.
        statusCode = popen.wait()
        if statusCode != 0 and not info.killIntended:
            log.error("Got exit code %i (indicating failure) "
                      "from job %s.", statusCode, self.jobs[jobID])
        if not info.killIntended:
            # Report if the job failed and we didn't kill it.
            # If we killed it then it shouldn't show up in the queue.
            self.outputQueue.put(
                UpdatedBatchJobInfo(jobID=jobID,
                                    exitStatus=statusCode,
                                    wallTime=time.time() - info.time,
                                    exitReason=None))

        # Free up the job's resources.
        self.coreFractions.release(coreFractions)
        self.memory.release(jobMemory)
        self.disk.release(jobDisk)

        log.debug('Child %d for job %s succeeded', pid, jobID)

    def issueBatchJob(self, jobNode):
        """Adds the command and resources to a queue to be run."""

        self._checkOnDaddy()

        # Round cores to minCores and apply scale.
        # Make sure to give minCores even if asked for 0 cores, or negative or something.
        cores = max(
            math.ceil(jobNode.cores * self.scale / self.minCores) *
            self.minCores, self.minCores)

        # Don't do our own assertions about job size vs. our configured size.
        # The abstract batch system can handle it.
        self.checkResourceRequest(jobNode.memory,
                                  cores,
                                  jobNode.disk,
                                  name=jobNode.jobName,
                                  detail='Scale is set to {}.'.format(
                                      self.scale))

        self.checkResourceRequest(jobNode.memory, cores, jobNode.disk)
        log.debug(
            "Issuing the command: %s with memory: %i, cores: %i, disk: %i" %
            (jobNode.command, jobNode.memory, cores, jobNode.disk))
        with self.jobIndexLock:
            jobID = self.jobIndex
            self.jobIndex += 1
        self.jobs[jobID] = jobNode.command

        if self.debugWorker:
            # Run immediately, blocking for return.
            # Ignore resource requirements; we run one job at a time
            self._runDebugJob(jobNode.command, jobID, self.environment.copy())
        else:
            # Queue the job for later
            self.inputQueue.put(
                (jobNode.command, jobID, cores, jobNode.memory, jobNode.disk,
                 self.environment.copy()))

        return jobID

    def killBatchJobs(self, jobIDs):
        """Kills jobs by ID."""

        self._checkOnDaddy()

        log.debug('Killing jobs: {}'.format(jobIDs))
        for jobID in jobIDs:
            if jobID in self.runningJobs:
                info = self.runningJobs[jobID]
                info.killIntended = True
                if info.popen is not None:
                    log.debug('Send kill to PID %s', info.popen.pid)
                    info.popen.kill()
                    log.debug('Sent kill to PID %s', info.popen.pid)
                else:
                    # No popen if running in forkless mode currently
                    assert self.debugWorker
                    log.critical("Can't kill job: %s in debug mode" % jobID)
                while jobID in self.runningJobs:
                    pass

    def getIssuedBatchJobIDs(self):
        """Just returns all the jobs that have been run, but not yet returned as updated."""

        self._checkOnDaddy()

        return list(self.jobs.keys())

    def getRunningBatchJobIDs(self):

        self._checkOnDaddy()

        now = time.time()
        return {
            jobID: now - info.time
            for jobID, info in list(self.runningJobs.items())
        }

    def shutdown(self):
        """
        Cleanly terminate and join daddy thread.
        """

        if self.daddyThread is not None:
            # Tell the daddy thread to stop.
            self.shuttingDown.set()
            # Wait for it to stop.
            self.daddyThread.join()

        BatchSystemSupport.workerCleanup(self.workerCleanupInfo)

    def getUpdatedBatchJob(self, maxWait):
        """Returns a tuple of a no-longer-running job, the return value of its process, and its runtime, or None."""

        self._checkOnDaddy()

        try:
            item = self.outputQueue.get(timeout=maxWait)
        except Empty:
            return None
        self.jobs.pop(item.jobID)
        log.debug("Ran jobID: %s with exit value: %i", item.jobID,
                  item.exitStatus)
        return item

    @classmethod
    def setOptions(cls, setOption):
        setOption("scale", default=1)
Code example #34
0
class AsynchronousJobRunner(BaseJobRunner, Monitors):
    """Parent class for any job runner that runs jobs asynchronously (e.g. via
    a distributed resource manager).  Provides general methods for having a
    thread to monitor the state of asynchronous jobs and submitting those jobs
    to the correct methods (queue, finish, cleanup) at appropriate times.
    """

    def __init__(self, app, nworkers, **kwargs):
        super(AsynchronousJobRunner, self).__init__(app, nworkers, **kwargs)
        # 'watched' and 'queue' are both used to keep track of jobs to watch.
        # 'queue' is used to add new watched jobs, and can be called from
        # any thread (usually by the 'queue_job' method). 'watched' must only
        # be modified by the monitor thread, which will move items from 'queue'
        # to 'watched' and then manage the watched jobs.
        self.watched = []
        self.monitor_queue = Queue()

    def _init_monitor_thread(self):
        name = "%s.monitor_thread" % self.runner_name
        super(AsynchronousJobRunner, self)._init_monitor_thread(name=name, target=self.monitor, start=True, config=self.app.config)

    def handle_stop(self):
        # DRMAA and SGE runners should override this and disconnect.
        pass

    def monitor(self):
        """
        Watches jobs currently in the monitor queue and deals with state
        changes (queued to running) and job completion.
        """
        while True:
            # Take any new watched jobs and put them on the monitor list
            try:
                while True:
                    async_job_state = self.monitor_queue.get_nowait()
                    if async_job_state is STOP_SIGNAL:
                        # TODO: This is where any cleanup would occur
                        self.handle_stop()
                        return
                    self.watched.append(async_job_state)
            except Empty:
                pass
            # Iterate over the list of watched jobs and check state
            try:
                self.check_watched_items()
            except Exception:
                log.exception('Unhandled exception checking active jobs')
            # Sleep a bit before the next state check
            time.sleep(1)

    def monitor_job(self, job_state):
        self.monitor_queue.put(job_state)

    def shutdown(self):
        """Attempts to gracefully shut down the monitor thread"""
        log.info("%s: Sending stop signal to monitor thread" % self.runner_name)
        self.monitor_queue.put(STOP_SIGNAL)
        # Call the parent's shutdown method to stop workers
        self.shutdown_monitor()
        super(AsynchronousJobRunner, self).shutdown()

    def check_watched_items(self):
        """
        This method is responsible for iterating over self.watched and handling
        state changes and updating self.watched with a new list of watched job
        states. Subclasses can opt to override this directly (as older job runners will
        initially) or just override check_watched_item and allow the list processing to
        reuse the logic here.
        """
        new_watched = []
        for async_job_state in self.watched:
            new_async_job_state = self.check_watched_item(async_job_state)
            if new_async_job_state:
                new_watched.append(new_async_job_state)
        self.watched = new_watched

    # Subclasses should implement this unless they override check_watched_items all together.
    def check_watched_item(self, job_state):
        raise NotImplementedError()

    def finish_job(self, job_state):
        """
        Get the output/error for a finished job, pass to `job_wrapper.finish`
        and cleanup all the job's temporary files.
        """
        galaxy_id_tag = job_state.job_wrapper.get_id_tag()
        external_job_id = job_state.job_id

        # To ensure that files below are readable, ownership must be reclaimed first
        job_state.job_wrapper.reclaim_ownership()

        # wait for the files to appear
        which_try = 0
        collect_output_success = True
        while which_try < self.app.config.retry_job_output_collection + 1:
            try:
                with open(job_state.output_file, "rb") as stdout_file, open(job_state.error_file, 'rb') as stderr_file:
                    stdout = shrink_stream_by_size(stdout_file, DATABASE_MAX_STRING_SIZE, join_by="\n..\n", left_larger=True, beginning_on_size_error=True)
                    stderr = shrink_stream_by_size(stderr_file, DATABASE_MAX_STRING_SIZE, join_by="\n..\n", left_larger=True, beginning_on_size_error=True)
                break
            except Exception as e:
                if which_try == self.app.config.retry_job_output_collection:
                    stdout = ''
                    stderr = job_state.runner_states.JOB_OUTPUT_NOT_RETURNED_FROM_CLUSTER
                    log.error('(%s/%s) %s: %s' % (galaxy_id_tag, external_job_id, stderr, str(e)))
                    collect_output_success = False
                else:
                    time.sleep(1)
                which_try += 1

        if not collect_output_success:
            job_state.fail_message = stderr
            job_state.runner_state = job_state.runner_states.JOB_OUTPUT_NOT_RETURNED_FROM_CLUSTER
            self.mark_as_failed(job_state)
            return

        try:
            # This should be an 8-bit exit code, but read ahead anyway:
            exit_code_str = open(job_state.exit_code_file, "r").read(32)
        except Exception:
            # By default, the exit code is 0, which typically indicates success.
            exit_code_str = "0"

        try:
            # Decode the exit code. If it's bogus, then just use 0.
            exit_code = int(exit_code_str)
        except ValueError:
            log.warning("(%s/%s) Exit code '%s' invalid. Using 0." % (galaxy_id_tag, external_job_id, exit_code_str))
            exit_code = 0

        # clean up the job files
        cleanup_job = job_state.job_wrapper.cleanup_job
        if cleanup_job == "always" or (not stderr and cleanup_job == "onsuccess"):
            job_state.cleanup()

        try:
            self._finish_or_resubmit_job(job_state, stdout, stderr, exit_code)
        except Exception:
            log.exception("(%s/%s) Job wrapper finish method failed" % (galaxy_id_tag, external_job_id))
            job_state.job_wrapper.fail("Unable to finish job", exception=True)

    def mark_as_finished(self, job_state):
        self.work_queue.put((self.finish_job, job_state))

    def mark_as_failed(self, job_state):
        self.work_queue.put((self.fail_job, job_state))
Code example #35
0
class CachePipeline(object):
    def __init__(self, spider, cache):
        self.spider = spider
        self.cache = cache
        self.queue_size = 100
        self.input_queue = Queue()
        self.result_queue = Queue()
        self.is_working = Event()
        self.is_paused = Event()

        self.thread = Thread(target=self.thread_worker)
        self.thread.daemon = True
        self.thread.start()

    def has_free_resources(self):
        return (self.input_queue.qsize() < self.queue_size
                and self.result_queue.qsize() < self.queue_size)

    def is_idle(self):
        return (not self.is_working.is_set() and not self.input_queue.qsize()
                and not self.result_queue.qsize())

    def thread_worker(self):
        while True:
            while self.is_paused.is_set():
                time.sleep(0.01)
            try:
                action, data = self.input_queue.get(True, 0.1)
            except Empty:
                if self.spider.shutdown_event.is_set():
                    #print('!CACHE: EXITING CACHE PIPELINE')
                    return self.shutdown()
                #else:
                #    print('no shutdown event')
            else:
                self.is_working.set()
                #print('!CACHE:got new task from input: %s:%s'
                #      % (action, data))
                assert action in ('load', 'save', 'pause')
                if action == 'load':
                    task, grab = data
                    result = None
                    if self.is_cache_loading_allowed(task, grab):
                        #print('!CACHE: query cache storage')
                        result = self.load_from_cache(task, grab)
                    if result:
                        #print('!CACHE: got a cached result')
                        #print('!! PUT RESULT INTO CACHE PIPE '
                        #      'RESULT QUEUE (cache)')
                        self.result_queue.put(('network_result', result))
                    else:
                        self.result_queue.put(('task', task))
                elif action == 'save':
                    task, grab = data
                    if self.is_cache_saving_allowed(task, grab):
                        with self.spider.timer.log_time('cache'):
                            with self.spider.timer.log_time('cache.write'):
                                self.cache.save_response(task.url, grab)
                elif action == 'pause':
                    self.is_paused.set()
                self.is_working.clear()

    def is_cache_loading_allowed(self, task, grab):
        # 1) cache data should be refreshed
        # 2) cache is disabled for that task
        # 3) request type is not cacheable
        return (not task.get('refresh_cache', False)
                and not task.get('disable_cache', False)
                and grab.detect_request_method() == 'GET')

    def is_cache_saving_allowed(self, task, grab):
        """
        Check if network transport result could
        be saved to cache layer.

        res: {ok, grab, grab_config_backup, task, emsg}
        """

        if grab.request_method == 'GET':
            if not task.get('disable_cache'):
                if self.spider.is_valid_network_response_code(
                        grab.doc.code, task):
                    return True
        return False

    def load_from_cache(self, task, grab):
        with self.spider.timer.log_time('cache'):
            with self.spider.timer.log_time('cache.read'):
                cache_item = self.cache.get_item(grab.config['url'],
                                                 timeout=task.cache_timeout)
                if cache_item is None:
                    return None
                else:
                    with self.spider.timer.log_time(
                            'cache.read.prepare_request'):
                        grab.prepare_request()
                    with self.spider.timer.log_time(
                            'cache.read.load_response'):
                        self.cache.load_response(grab, cache_item)
                    grab.log_request('CACHED')
                    self.spider.stat.inc('spider:request-cache')

                    return {
                        'ok': True,
                        'task': task,
                        'grab': grab,
                        'grab_config_backup': grab.dump_config(),
                        'emsg': None
                    }

    def shutdown(self):
        try:
            self.cache.close()
        except AttributeError:
            print('Cache %s does not support close method' % self.cache)

    def pause(self):
        self.add_task(('pause', None))
        self.is_paused.wait()

    def resume(self):
        self.is_paused.clear()

    def get_ready_results(self):
        res = []
        while True:
            try:
                action, result = self.result_queue.get_nowait()
            except Empty:
                break
            else:
                assert action in ('network_result', 'task')
                res.append((action, result))
        return res

    def add_task(self, task):
        self.input_queue.put(task)
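
pause() above enqueues a 'pause' action and then blocks on is_paused.wait() until the worker thread acknowledges by setting the Event; resume() simply clears it. A stripped-down sketch of that handshake (illustrative only, not Grab's actual pipeline):

import threading
import time
from queue import Queue, Empty

tasks = Queue()
is_paused = threading.Event()

def worker():
    while True:
        while is_paused.is_set():
            time.sleep(0.01)          # spin quietly while paused
        try:
            action, data = tasks.get(True, 0.1)
        except Empty:
            continue
        if action == 'pause':
            is_paused.set()           # acknowledge; the caller's wait() returns
        elif action == 'stop':
            return

threading.Thread(target=worker, daemon=True).start()
tasks.put(('pause', None))
is_paused.wait()                      # returns once the worker has paused
is_paused.clear()                     # resume
tasks.put(('stop', None))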
Code example #36
0
class Scheduler():
    def __init__(self):
        self.q = Queue()
        self.fp_set = set()
        self.total_repeat_nums = 0

    def add_request(self, request):
        # Enqueue the request only if its fingerprint is not already in the set (self._filter_request(request) returns True)
        if self._filter_request(request):
            self.q.put(request)

    def get_request(self):
        # Pop a request from the queue and return it (None if the queue is empty)
        try:
            request = self.q.get_nowait()
        except:
            request = None
        return request

    def _filter_request(self, request):
        '''Request dedup: if the request's fingerprint is not in the set, add it and return True.'''
        fp = self._gen_fp(request)
        if fp not in self.fp_set:
            self.fp_set.add(fp)
            return True
        self.total_repeat_nums += 1
        logger.info("Duplicate request found: <{} {}>".format(request.method, request.url))
        return False

    def _gen_fp(self, request):
        """返回request的fp"""

        url = canonicalize_url(request.url)
        method = request.method.upper()
        data = request.data if request.data else {}
        data = sorted(data.items(), key=lambda x: x[0])
        # data.items() returns an iterable like dict_items([('b', 2), ('a', 1)])
        # sorted(dict.items()) -> [('a', 1), ('b', 2)]
        # The key argument takes a lambda; items are sorted by the value it returns
        # x is each (k, v) pair yielded by dict.items(), so x[0] is the dict key
        # i.e. data = sorted(data.items(), key=lambda x: x[0]) sorts data.items() by key

        sha1 = hashlib.sha1()
        sha1.update(self._to_bytes(url))
        sha1.update(self._to_bytes(method))
        sha1.update(self._to_bytes(str(data)))
        fp = sha1.hexdigest()
        return fp

    def _to_bytes(self, string):
        """py2和py3字符串类型正好相反"""
        if six.PY2:  # 判断当前是不是python2
            if isinstance(string, str):
                return string
            else:
                return string.encode()
        elif six.PY3:
            if isinstance(string, str):
                return string.encode()
            else:
                return string
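
_filter_request()/_gen_fp() above deduplicate by hashing the canonicalized URL, upper-cased method and sorted request body into a SHA-1 fingerprint and remembering fingerprints in a set. A minimal standalone version of that check, skipping URL canonicalization (all names below are hypothetical):

import hashlib

seen = set()

def fingerprint(method, url, data=None):
    sha1 = hashlib.sha1()
    sha1.update(method.upper().encode())
    sha1.update(url.encode())
    sha1.update(str(sorted((data or {}).items())).encode())
    return sha1.hexdigest()

def is_new(method, url, data=None):
    """Return True the first time a request is seen, False for repeats."""
    fp = fingerprint(method, url, data)
    if fp in seen:
        return False
    seen.add(fp)
    return True

print(is_new('GET', 'http://example.com/?a=1'))  # True
print(is_new('get', 'http://example.com/?a=1'))  # False (same fingerprint)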
Code example #37
0
File: parasol.py  Project: chapmanb/toil
class ParasolBatchSystem(BatchSystemSupport):
    """
    The interface for Parasol.
    """

    @classmethod
    def supportsWorkerCleanup(cls):
        return False

    @classmethod
    def supportsHotDeployment(cls):
        return False

    def __init__(self, config, maxCores, maxMemory, maxDisk):
        super(ParasolBatchSystem, self).__init__(config, maxCores, maxMemory, maxDisk)
        if maxMemory != sys.maxsize:
            logger.warn('The Parasol batch system does not support maxMemory.')
        # Keep the name of the results file for the pstat2 command.
        command = config.parasolCommand
        if os.path.sep not in command:
            try:
                command = next(which(command))
            except StopIteration:
                raise RuntimeError("Can't find %s on PATH." % command)
        logger.debug('Using Parasol at %s', command)
        self.parasolCommand = command
        self.parasolResultsDir = tempfile.mkdtemp(dir=config.jobStore)

        # In Parasol, each results file corresponds to a separate batch, and all jobs in a batch
        # have the same cpu and memory requirements. The keys to this dictionary are the (cpu,
        # memory) tuples for each batch. A new batch is created whenever a job has a new unique
        # combination of cpu and memory requirements.
        self.resultsFiles = dict()
        self.maxBatches = config.parasolMaxBatches

        # Allows the worker process to send back the IDs of jobs that have finished, so the batch
        #  system can decrease its used cpus counter
        self.cpuUsageQueue = Queue()

        # Also stores finished job IDs, but is read by getUpdatedJobIDs().
        self.updatedJobsQueue = Queue()

        # Use this to stop the worker when shutting down
        self.running = True

        self.worker = Thread(target=self.updatedJobWorker, args=())
        self.worker.start()
        self.usedCpus = 0
        self.jobIDsToCpu = {}

        # Set of jobs that have been issued but aren't known to have finished or been killed yet.
        #  Jobs that end by themselves are removed in getUpdatedJob, and jobs that are killed are
        #  removed in killBatchJobs.
        self.runningJobs = set()

    def _runParasol(self, command, autoRetry=True):
        """
        Issues a parasol command using popen to capture the output. If the command fails then it
        will try pinging parasol until it gets a response. When it gets a response it will
        recursively call the issue parasol command, repeating this pattern for a maximum of N
        times. The final exit value will reflect this.
        """
        command = list(concat(self.parasolCommand, command))
        while True:
            logger.debug('Running %r', command)
            process = subprocess.Popen(command,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       bufsize=-1)
            stdout, stderr = process.communicate()
            status = process.wait()
            for line in stderr.split('\n'):
                if line: logger.warn(line)
            if status == 0:
                return 0, stdout.split('\n')
            message = 'Command %r failed with exit status %i' % (command, status)
            if autoRetry:
                logger.warn(message)
            else:
                logger.error(message)
                return status, None
            logger.warn('Waiting 10s before trying again')
            time.sleep(10)

    parasolOutputPattern = re.compile("your job ([0-9]+).*")

    def issueBatchJob(self, jobNode):
        """
        Issues parasol with job commands.
        """
        self.checkResourceRequest(jobNode.memory, jobNode.cores, jobNode.disk)

        MiB = 1 << 20
        truncatedMemory = (old_div(jobNode.memory, MiB)) * MiB
        # Look for a batch for jobs with these resource requirements, with
        # the memory rounded down to the nearest megabyte. Rounding down
        # means the new job can't ever decrease the memory requirements
        # of jobs already in the batch.
        if len(self.resultsFiles) >= self.maxBatches:
            raise RuntimeError( 'Number of batches reached limit of %i' % self.maxBatches)
        try:
            results = self.resultsFiles[(truncatedMemory, jobNode.cores)]
        except KeyError:
            results = getTempFile(rootDir=self.parasolResultsDir)
            self.resultsFiles[(truncatedMemory, jobNode.cores)] = results

        # Prefix the command with environment overrides, optionally looking them up from the
        # current environment if the value is None
        command = ' '.join(concat('env', self.__environment(), jobNode.command))
        parasolCommand = ['-verbose',
                          '-ram=%i' % jobNode.memory,
                          '-cpu=%i' % jobNode.cores,
                          '-results=' + results,
                          'add', 'job', command]
        # Deal with the cpus
        self.usedCpus += jobNode.cores
        while True:  # Process finished results with no wait
            try:
                jobID = self.cpuUsageQueue.get_nowait()
            except Empty:
                break
            if jobID in list(self.jobIDsToCpu.keys()):
                self.usedCpus -= self.jobIDsToCpu.pop(jobID)
            assert self.usedCpus >= 0
        while self.usedCpus > self.maxCores:  # If we are still waiting
            jobID = self.cpuUsageQueue.get()
            if jobID in list(self.jobIDsToCpu.keys()):
                self.usedCpus -= self.jobIDsToCpu.pop(jobID)
            assert self.usedCpus >= 0
        # Now keep going
        while True:
            line = self._runParasol(parasolCommand)[1][0]
            match = self.parasolOutputPattern.match(line)
            if match is None:
                # This is because parasol add job will return success, even if the job was not
                # properly issued!
                logger.debug('We failed to properly add the job, we will try again after 5s.')
                time.sleep(5)
            else:
                jobID = int(match.group(1))
                self.jobIDsToCpu[jobID] = jobNode.cores
                self.runningJobs.add(jobID)
                logger.debug("Got the parasol job id: %s from line: %s" % (jobID, line))
                return jobID

    def setEnv(self, name, value=None):
        if value and ' ' in value:
            raise ValueError('Parasol does not support spaces in environment variable values.')
        return super(ParasolBatchSystem, self).setEnv(name, value)

    def __environment(self):
        return (k + '=' + (os.environ[k] if v is None else v) for k, v in list(self.environment.items()))

    def killBatchJobs(self, jobIDs):
        """Kills the given jobs, represented as Job ids, then checks they are dead by checking
        they are not in the list of issued jobs.
        """
        while True:
            for jobID in jobIDs:
                if jobID in self.runningJobs:
                    self.runningJobs.remove(jobID)
                exitValue = self._runParasol(['remove', 'job', str(jobID)],
                                             autoRetry=False)[0]
                logger.debug("Tried to remove jobID: %i, with exit value: %i" % (jobID, exitValue))
            runningJobs = self.getIssuedBatchJobIDs()
            if set(jobIDs).difference(set(runningJobs)) == set(jobIDs):
                break
            logger.warn('Tried to kill some jobs, but something happened and they are still '
                        'going, will try again in 5s.')
            time.sleep(5)
        # Update the CPU usage, because killed jobs aren't written to the results file.
        for jobID in jobIDs:
            if jobID in list(self.jobIDsToCpu.keys()):
                self.usedCpus -= self.jobIDsToCpu.pop(jobID)

    queuePattern = re.compile(r'q\s+([0-9]+)')
    runningPattern = re.compile(r'r\s+([0-9]+)\s+[\S]+\s+[\S]+\s+([0-9]+)\s+[\S]+')

    def getJobIDsForResultsFile(self, resultsFile):
        """
        Get all queued and running jobs for a results file.
        """
        jobIDs = []
        for line in self._runParasol(['-results=' + resultsFile, 'pstat2'])[1]:
            runningJobMatch = self.runningPattern.match(line)
            queuedJobMatch = self.queuePattern.match(line)
            if runningJobMatch:
                jobID = runningJobMatch.group(1)
            elif queuedJobMatch:
                jobID = queuedJobMatch.group(1)
            else:
                continue
            jobIDs.append(int(jobID))
        return set(jobIDs)

    def getIssuedBatchJobIDs(self):
        """
        Gets the list of jobs issued to parasol in all results files, but not including jobs
        created by other users.
        """
        issuedJobs = set()
        for resultsFile in itervalues(self.resultsFiles):
            issuedJobs.update(self.getJobIDsForResultsFile(resultsFile))

        return list(issuedJobs)

    def getRunningBatchJobIDs(self):
        """
        Returns map of running jobIDs and the time they have been running.
        """
        # Example lines..
        # r 5410186 benedictpaten worker 1247029663 localhost
        # r 5410324 benedictpaten worker 1247030076 localhost
        runningJobs = {}
        issuedJobs = self.getIssuedBatchJobIDs()
        for line in self._runParasol(['pstat2'])[1]:
            if line != '':
                match = self.runningPattern.match(line)
                if match is not None:
                    jobID = int(match.group(1))
                    startTime = int(match.group(2))
                    if jobID in issuedJobs:  # It's one of our jobs
                        runningJobs[jobID] = time.time() - startTime
        return runningJobs

    def getUpdatedBatchJob(self, maxWait):
        while True:
            try:
                jobID, status, wallTime = self.updatedJobsQueue.get(timeout=maxWait)
            except Empty:
                return None
            try:
                self.runningJobs.remove(jobID)
            except KeyError:
                # We tried to kill this job, but it ended by itself instead, so skip it.
                pass
            else:
                return jobID, status, wallTime

    @classmethod
    def getRescueBatchJobFrequency(cls):
        """
        Parasol leaks jobs, but rescuing jobs involves calls to parasol list jobs and pstat2,
        making it expensive.
        """
        return 5400  # Once every 90 minutes

    def updatedJobWorker(self):
        """
        We use the parasol results to update the status of jobs, adding them
        to the list of updated jobs.

        Results have the following structure.. (thanks Mark D!)

        int status;    /* Job status - wait() return format. 0 is good. */
        char *host;    /* Machine job ran on. */
        char *jobId;    /* Job queuing system job ID */
        char *exe;    /* Job executable file (no path) */
        int usrTicks;    /* 'User' CPU time in ticks. */
        int sysTicks;    /* 'System' CPU time in ticks. */
        unsigned submitTime;    /* Job submission time in seconds since 1/1/1970 */
        unsigned startTime;    /* Job start time in seconds since 1/1/1970 */
        unsigned endTime;    /* Job end time in seconds since 1/1/1970 */
        char *user;    /* User who ran job */
        char *errFile;    /* Location of stderr file on host */

        Plus you finally have the command name.
        """
        resultsFiles = set()
        resultsFileHandles = []
        try:
            while self.running:
                # Look for any new results files that have been created, and open them
                newResultsFiles = set(os.listdir(self.parasolResultsDir)).difference(resultsFiles)
                for newFile in newResultsFiles:
                    newFilePath = os.path.join(self.parasolResultsDir, newFile)
                    resultsFileHandles.append(open(newFilePath, 'r'))
                    resultsFiles.add(newFile)
                for fileHandle in resultsFileHandles:
                    while self.running:
                        line = fileHandle.readline()
                        if not line:
                            break
                        assert line[-1] == '\n'
                        (status, host, jobId, exe, usrTicks, sysTicks, submitTime, startTime,
                         endTime, user, errFile, command) = line[:-1].split(None, 11)
                        status = int(status)
                        jobId = int(jobId)
                        if os.WIFEXITED(status):
                            status = os.WEXITSTATUS(status)
                        else:
                            status = -status
                        self.cpuUsageQueue.put(jobId)
                        startTime = int(startTime)
                        endTime = int(endTime)
                        if endTime == startTime:
                            # Both start and end times are integers, so to get sub-second
                            # accuracy we use the ticks reported by Parasol as an approximation.
                            # This isn't documented but what Parasol calls "ticks" is actually a
                            # hundredth of a second. Parasol does the unit conversion early on
                            # after a job finished. Search paraNode.c for ticksToHundreths. We
                            # also cheat a little by always reporting at least one hundredth of a
                            # second.
                            usrTicks = int(usrTicks)
                            sysTicks = int(sysTicks)
                            wallTime = float( max( 1, usrTicks + sysTicks) ) * 0.01
                        else:
                            wallTime = float(endTime - startTime)
                        self.updatedJobsQueue.put((jobId, status, wallTime))
                time.sleep(1)
        except:
            logger.warn("Error occurred while parsing parasol results files.")
            raise
        finally:
            for fileHandle in resultsFileHandles:
                fileHandle.close()

    def shutdown(self):
        self.killBatchJobs(self.getIssuedBatchJobIDs())  # cleanup jobs
        for results in itervalues(self.resultsFiles):
            exitValue = self._runParasol(['-results=' + results, 'clear', 'sick'],
                                         autoRetry=False)[0]
            if exitValue != 0:  # _runParasol returns the exit status; non-zero means failure
                logger.warn("Could not clear sick status of the parasol batch %s" % results)
            exitValue = self._runParasol(['-results=' + results, 'flushResults'],
                                         autoRetry=False)[0]
            if exitValue != 0:
                logger.warn("Could not flush the parasol batch %s" % results)
        self.running = False
        logger.debug('Joining worker thread...')
        self.worker.join()
        logger.debug('... joined worker thread.')
        for results in list(self.resultsFiles.values()):
            os.remove(results)
        os.rmdir(self.parasolResultsDir)


    @classmethod
    def setOptions(cls, setOption):
        from toil.common import iC
        setOption("parasolCommand", None, None, 'parasol')
        setOption("parasolMaxBatches", int, iC(1), 10000)
コード例 #38
0
class ParasolBatchSystem(BatchSystemSupport):
    """
    The interface for Parasol.
    """
    @classmethod
    def supportsWorkerCleanup(cls):
        return False

    @classmethod
    def supportsAutoDeployment(cls):
        return False

    def __init__(self, config, maxCores, maxMemory, maxDisk):
        super(ParasolBatchSystem, self).__init__(config, maxCores, maxMemory,
                                                 maxDisk)
        if maxMemory != sys.maxsize:
            logger.warning(
                'The Parasol batch system does not support maxMemory.')
        # Keep the name of the results file for the pstat2 command..
        command = config.parasolCommand
        if os.path.sep not in command:
            try:
                command = which(command)
            except StopIteration:
                raise RuntimeError("Can't find %s on PATH." % command)
        logger.debug('Using Parasol at %s', command)
        self.parasolCommand = command
        jobStoreType, path = Toil.parseLocator(config.jobStore)
        if jobStoreType != 'file':
            raise RuntimeError(
                "The parasol batch system doesn't currently work with any "
                "jobStore type except file jobStores.")
        self.parasolResultsDir = tempfile.mkdtemp(dir=os.path.abspath(path))
        logger.debug("Using parasol results dir: %s", self.parasolResultsDir)

        # In Parasol, each results file corresponds to a separate batch, and all jobs in a batch
        # have the same cpu and memory requirements. The keys to this dictionary are the (cpu,
        # memory) tuples for each batch. A new batch is created whenever a job has a new unique
        # combination of cpu and memory requirements.
        self.resultsFiles = dict()
        self.maxBatches = config.parasolMaxBatches

        # Allows the worker process to send back the IDs of jobs that have finished, so the batch
        #  system can decrease its used cpus counter
        self.cpuUsageQueue = Queue()

        # Also stores finished job IDs, but is read by getUpdatedJobIDs().
        self.updatedJobsQueue = Queue()

        # Use this to stop the worker when shutting down
        self.running = True

        self.worker = Thread(target=self.updatedJobWorker, args=())
        self.worker.start()
        self.usedCpus = 0
        self.jobIDsToCpu = {}

        # Set of jobs that have been issued but aren't known to have finished or been killed yet.
        #  Jobs that end by themselves are removed in getUpdatedJob, and jobs that are killed are
        #  removed in killBatchJobs.
        self.runningJobs = set()

    def _runParasol(self, command, autoRetry=True):
        """
        Issues a parasol command using popen to capture the output. If the command fails then it
        will try pinging parasol until it gets a response. When it gets a response it will
        recursively call the issue parasol command, repeating this pattern for a maximum of N
        times. The final exit value will reflect this.
        """
        command = list(concat(self.parasolCommand, command))
        while True:
            logger.debug('Running %r', command)
            process = subprocess.Popen(command,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       bufsize=-1)
            stdout, stderr = process.communicate()
            status = process.wait()
            for line in stderr.decode('utf-8').split('\n'):
                if line: logger.warning(line)
            if status == 0:
                return 0, stdout.decode('utf-8').split('\n')
            message = 'Command %r failed with exit status %i' % (command,
                                                                 status)
            if autoRetry:
                logger.warning(message)
            else:
                logger.error(message)
                return status, None
            logger.warning('Waiting 10s before trying again')
            time.sleep(10)

    parasolOutputPattern = re.compile("your job ([0-9]+).*")

    def issueBatchJob(self, jobNode):
        """
        Issues parasol with job commands.
        """
        self.checkResourceRequest(jobNode.memory, jobNode.cores, jobNode.disk)

        MiB = 1 << 20
        truncatedMemory = (old_div(jobNode.memory, MiB)) * MiB
        # Look for a batch for jobs with these resource requirements, with
        # the memory rounded down to the nearest megabyte. Rounding down
        # means the new job can't ever decrease the memory requirements
        # of jobs already in the batch.
        if len(self.resultsFiles) >= self.maxBatches:
            raise RuntimeError('Number of batches reached limit of %i' %
                               self.maxBatches)
        try:
            results = self.resultsFiles[(truncatedMemory, jobNode.cores)]
        except KeyError:
            results = getTempFile(rootDir=self.parasolResultsDir)
            self.resultsFiles[(truncatedMemory, jobNode.cores)] = results

        # Prefix the command with environment overrides, optionally looking them up from the
        # current environment if the value is None
        command = ' '.join(concat('env', self.__environment(),
                                  jobNode.command))
        parasolCommand = [
            '-verbose',
            '-ram=%i' % jobNode.memory,
            '-cpu=%i' % jobNode.cores, '-results=' + results, 'add', 'job',
            command
        ]
        # Deal with the cpus
        self.usedCpus += jobNode.cores
        while True:  # Process finished results with no wait
            try:
                jobID = self.cpuUsageQueue.get_nowait()
            except Empty:
                break
            if jobID in list(self.jobIDsToCpu.keys()):
                self.usedCpus -= self.jobIDsToCpu.pop(jobID)
            assert self.usedCpus >= 0
        while self.usedCpus > self.maxCores:  # If we are still waiting
            jobID = self.cpuUsageQueue.get()
            if jobID in list(self.jobIDsToCpu.keys()):
                self.usedCpus -= self.jobIDsToCpu.pop(jobID)
            assert self.usedCpus >= 0
        # Now keep going
        while True:
            line = self._runParasol(parasolCommand)[1][0]
            match = self.parasolOutputPattern.match(line)
            if match is None:
                # This is because parasol add job will return success, even if the job was not
                # properly issued!
                logger.debug(
                    'We failed to properly add the job, we will try again after 5s.'
                )
                time.sleep(5)
            else:
                jobID = int(match.group(1))
                self.jobIDsToCpu[jobID] = jobNode.cores
                self.runningJobs.add(jobID)
                logger.debug("Got the parasol job id: %s from line: %s" %
                             (jobID, line))
                return jobID

    def setEnv(self, name, value=None):
        if value and ' ' in value:
            raise ValueError(
                'Parasol does not support spaces in environment variable values.'
            )
        return super(ParasolBatchSystem, self).setEnv(name, value)

    def __environment(self):
        return (k + '=' + (os.environ[k] if v is None else v)
                for k, v in listitems(self.environment))

    def killBatchJobs(self, jobIDs):
        """Kills the given jobs, represented as Job ids, then checks they are dead by checking
        they are not in the list of issued jobs.
        """
        while True:
            for jobID in jobIDs:
                if jobID in self.runningJobs:
                    self.runningJobs.remove(jobID)
                exitValue = self._runParasol(
                    ['remove', 'job', str(jobID)], autoRetry=False)[0]
                logger.debug("Tried to remove jobID: %i, with exit value: %i" %
                             (jobID, exitValue))
            runningJobs = self.getIssuedBatchJobIDs()
            if set(jobIDs).difference(set(runningJobs)) == set(jobIDs):
                break
            logger.warning(
                'Tried to kill some jobs, but something happened and they are still '
                'going, will try again in 5s.')
            time.sleep(5)
        # Update the CPU usage, because killed jobs aren't written to the results file.
        for jobID in jobIDs:
            if jobID in list(self.jobIDsToCpu.keys()):
                self.usedCpus -= self.jobIDsToCpu.pop(jobID)

    runningPattern = re.compile(
        r'r\s+([0-9]+)\s+[\S]+\s+[\S]+\s+([0-9]+)\s+[\S]+')

    def getJobIDsForResultsFile(self, resultsFile):
        """
        Get all queued and running jobs for a results file.
        """
        jobIDs = []
        for line in self._runParasol(['-extended', 'list', 'jobs'])[1]:
            fields = line.strip().split()
            if len(fields) == 0 or fields[-1] != resultsFile:
                continue
            jobID = fields[0]
            jobIDs.append(int(jobID))
        return set(jobIDs)

    def getIssuedBatchJobIDs(self):
        """
        Gets the list of jobs issued to parasol in all results files, but not including jobs
        created by other users.
        """
        issuedJobs = set()
        for resultsFile in itervalues(self.resultsFiles):
            issuedJobs.update(self.getJobIDsForResultsFile(resultsFile))

        return list(issuedJobs)

    def getRunningBatchJobIDs(self):
        """
        Returns map of running jobIDs and the time they have been running.
        """
        # Example lines..
        # r 5410186 benedictpaten worker 1247029663 localhost
        # r 5410324 benedictpaten worker 1247030076 localhost
        runningJobs = {}
        issuedJobs = self.getIssuedBatchJobIDs()
        for line in self._runParasol(['pstat2'])[1]:
            if line != '':
                match = self.runningPattern.match(line)
                if match is not None:
                    jobID = int(match.group(1))
                    startTime = int(match.group(2))
                    if jobID in issuedJobs:  # It's one of our jobs
                        runningJobs[jobID] = time.time() - startTime
        return runningJobs

    def getUpdatedBatchJob(self, maxWait):
        while True:
            try:
                item = self.updatedJobsQueue.get(timeout=maxWait)
            except Empty:
                return None
            try:
                self.runningJobs.remove(item.jobID)
            except KeyError:
                # We tried to kill this job, but it ended by itself instead, so skip it.
                pass
            else:
                return item

    def updatedJobWorker(self):
        """
        We use the parasol results to update the status of jobs, adding them
        to the list of updated jobs.

        Results have the following structure.. (thanks Mark D!)

        int status;    /* Job status - wait() return format. 0 is good. */
        char *host;    /* Machine job ran on. */
        char *jobId;    /* Job queuing system job ID */
        char *exe;    /* Job executable file (no path) */
        int usrTicks;    /* 'User' CPU time in ticks. */
        int sysTicks;    /* 'System' CPU time in ticks. */
        unsigned submitTime;    /* Job submission time in seconds since 1/1/1970 */
        unsigned startTime;    /* Job start time in seconds since 1/1/1970 */
        unsigned endTime;    /* Job end time in seconds since 1/1/1970 */
        char *user;    /* User who ran job */
        char *errFile;    /* Location of stderr file on host */

        Plus you finally have the command name.
        """
        resultsFiles = set()
        resultsFileHandles = []
        try:
            while self.running:
                # Look for any new results files that have been created, and open them
                newResultsFiles = set(os.listdir(
                    self.parasolResultsDir)).difference(resultsFiles)
                for newFile in newResultsFiles:
                    newFilePath = os.path.join(self.parasolResultsDir, newFile)
                    resultsFileHandles.append(open(newFilePath, 'r'))
                    resultsFiles.add(newFile)
                for fileHandle in resultsFileHandles:
                    while self.running:
                        line = fileHandle.readline()
                        if not line:
                            break
                        assert line[-1] == '\n'
                        (status, host, jobId, exe, usrTicks, sysTicks,
                         submitTime, startTime, endTime, user, errFile,
                         command) = line[:-1].split(None, 11)
                        status = int(status)
                        jobId = int(jobId)
                        if os.WIFEXITED(status):
                            status = os.WEXITSTATUS(status)
                        else:
                            status = -status
                        self.cpuUsageQueue.put(jobId)
                        startTime = int(startTime)
                        endTime = int(endTime)
                        if endTime == startTime:
                            # Both start and end times are integers, so to get sub-second
                            # accuracy we use the ticks reported by Parasol as an approximation.
                            # This isn't documented but what Parasol calls "ticks" is actually a
                            # hundredth of a second. Parasol does the unit conversion early on
                            # after a job finished. Search paraNode.c for ticksToHundreths. We
                            # also cheat a little by always reporting at least one hundredth of a
                            # second.
                            usrTicks = int(usrTicks)
                            sysTicks = int(sysTicks)
                            wallTime = float(max(1,
                                                 usrTicks + sysTicks)) * 0.01
                        else:
                            wallTime = float(endTime - startTime)
                        self.updatedJobsQueue.put(
                            UpdatedBatchJobInfo(jobID=jobId,
                                                exitStatus=status,
                                                wallTime=wallTime,
                                                exitReason=None))
                time.sleep(1)
        except:
            logger.warning(
                "Error occurred while parsing parasol results files.")
            raise
        finally:
            for fileHandle in resultsFileHandles:
                fileHandle.close()

    def shutdown(self):
        self.killBatchJobs(self.getIssuedBatchJobIDs())  # cleanup jobs
        for results in itervalues(self.resultsFiles):
            exitValue = self._runParasol(
                ['-results=' + results, 'clear', 'sick'], autoRetry=False)[0]
            if exitValue != 0:  # _runParasol returns the exit status; non-zero means failure
                logger.warning(
                    "Could not clear sick status of the parasol batch %s" %
                    results)
            exitValue = self._runParasol(
                ['-results=' + results, 'flushResults'], autoRetry=False)[0]
            if exitValue != 0:
                logger.warning("Could not flush the parasol batch %s" %
                               results)
        self.running = False
        logger.debug('Joining worker thread...')
        self.worker.join()
        logger.debug('... joined worker thread.')
        for results in list(self.resultsFiles.values()):
            os.remove(results)
        os.rmdir(self.parasolResultsDir)

    @classmethod
    def setOptions(cls, setOption):
        from toil.common import iC
        setOption("parasolCommand", None, None, 'parasol')
        setOption("parasolMaxBatches", int, iC(1), 10000)
コード例 #39
0
ファイル: scheduler.py プロジェクト: feel-easy/myspider
class Scheduler():
    def __init__(self, collector):
        if SCHEDULER_PERSIST:
            self.q = RedisQueue()
            self.fp_container = RedisFilterContainer()
        else:
            self.q = Queue()
            self.fp_container = NoramlFilterContainer()
        # self.fp_set = set()
        # self.total_repeat_nums = 0
        self.collector = collector  # stats counter object, passed in from the engine

    def add_request(self, request):
        # Put the request on the queue:
        # enqueue it only if its fingerprint is not already in the container.
        request.fp = self._gen_fp(request)
        if not request.filter:  # the request was built with dedup filtering disabled
            self.fp_container.add_fp(request.fp)  # still record the fingerprint
            self.q.put(request)
            logger.info("Added a request exempt from dedup <{} {}>".format(request.method, request.url))
            return  # avoid enqueuing the same request twice in this function
        if self._filter_request(request):
            self.q.put(request)

    def get_request(self):
        # Pop a request; return None when nothing is available.
        try:
            request = self.q.get_nowait()
        except Exception:  # queue is empty (or the backend raised)
            request = None
        return request

    def _filter_request(self, request):
        '''Request deduplication: if the fingerprint is not yet in the container,
        add it and return True; otherwise count and log the duplicate.'''
        # request.fp = self._gen_fp(request)
        # if fp not in self.fp_set:
        if not self.fp_container.exists(request.fp):
            self.fp_container.add_fp(request.fp)  # record the fingerprint
            return True
        # self.total_repeat_nums += 1  # in-memory duplicate counter (replaced by the collector)
        self.collector.incr(self.collector.repeat_request_nums_key)
        logger.info("Duplicate request found: <{} {}>".format(request.method, request.url))
        return False

    def _gen_fp(self, request):
        # Return the request's fingerprint (fp) as a string.

        url = canonicalize_url(request.url)
        method = request.method.upper()
        data = request.data if request.data else {}
        data = sorted(data.items(), key=lambda x: x[0])
        # Iterate over data as (k, v) pairs and sort them by the key k
        # (key=lambda x: x[0] picks the dict key from each pair),
        # ending up with e.g. [('a', 1), ('b', 2)] regardless of insertion order.

        sha1 = hashlib.sha1()
        sha1.update(self._to_bytes(url))
        sha1.update(self._to_bytes(method))
        sha1.update(self._to_bytes(str(data)))
        fp = sha1.hexdigest()
        return fp

    def _to_bytes(self, string):
        """The str/bytes relationship is reversed between Python 2 and Python 3!"""
        if six.PY2:  # running under Python 2
            if isinstance(string, str):
                return string
            else:
                return string.encode()
        elif six.PY3:  # running under Python 3
            if isinstance(string, str):
                return string.encode()
            else:
                return string
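
A small sketch of how the filter flag handled in add_request() above plays out. The Request and DummyCollector classes below are hypothetical stand-ins (the real request class and collector live elsewhere in this project), and the scheduler is assumed to fall back to the in-memory queue and filter container when SCHEDULER_PERSIST is off.

# Hypothetical illustration of the filter flag handled in add_request() above.
class Request:
    def __init__(self, url, method='GET', data=None, filter=True):
        self.url = url
        self.method = method
        self.data = data
        self.filter = filter   # False means "never drop this request as a duplicate"
        self.fp = None         # filled in by the scheduler

class DummyCollector:
    repeat_request_nums_key = 'repeat_request_nums'
    def incr(self, key):
        pass  # a real collector would increment the named counter here

scheduler = Scheduler(DummyCollector())

scheduler.add_request(Request('http://example.com/item?id=1'))                # enqueued, fp recorded
scheduler.add_request(Request('http://example.com/item?id=1'))                # duplicate: counted, not enqueued
scheduler.add_request(Request('http://example.com/item?id=1', filter=False))  # always enqueued; fp still recorded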