def run():
    """
    Start consuming runs and executing them.
    """
    context = zmq.Context(1)

    greenlets = []
    reset_incomplete_runs()
    concurrent_jobs = config.getint('job_runner_worker', 'concurrent_jobs')

    run_queue = Queue()
    kill_queue = Queue()
    event_queue = Queue()
    exit_queue = JoinableQueue()
    event_exit_queue = Queue()

    greenlets.append(
        gevent.spawn(
            enqueue_actions,
            context,
            run_queue,
            kill_queue,
            event_queue,
            exit_queue,
        )
    )

    for x in range(concurrent_jobs):
        greenlets.append(gevent.spawn(
            execute_run,
            run_queue,
            event_queue,
            exit_queue,
        ))

    greenlets.append(gevent.spawn(
        kill_run, kill_queue, event_queue, exit_queue))
    greenlets.append(gevent.spawn(
        publish, context, event_queue, event_exit_queue))

    def terminate_callback(*args, **kwargs):
        logger.warning('Worker is going to terminate!')
        for i in range(len(greenlets) - 1):
            # we don't want to kill the event greenlet, since we want to
            # publish events of already running jobs
            exit_queue.put(None)

    signal.signal(signal.SIGTERM, terminate_callback)

    for greenlet in greenlets[:-1]:
        greenlet.join()

    # now terminate the event queue
    event_exit_queue.put(None)
    greenlets[-1].join()
    sys.exit('Worker terminated')
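# A minimal standalone sketch of the shutdown fan-out used in run() above:
# one sentinel is put on the shared exit queue per consumer greenlet, and
# each consumer polls the queue non-blockingly. All names below are
# illustrative and not part of job_runner_worker.
import gevent
from gevent.queue import Queue, Empty

def _consumer(name, exit_queue):
    while True:
        try:
            exit_queue.get(block=False)
            print('{0}: exiting'.format(name))
            return
        except Empty:
            gevent.sleep(0.1)  # stand-in for real work

def _demo_shutdown():
    exit_queue = Queue()
    consumers = [
        gevent.spawn(_consumer, 'consumer-{0}'.format(i), exit_queue)
        for i in range(3)
    ]
    for _ in consumers:
        exit_queue.put(None)  # one sentinel per consumer
    gevent.joinall(consumers)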
def _handle_ping_action(message):
    """
    Handle the ``'ping'`` action.
    """
    worker_list = Worker.get_list(
        config.get('job_runner_worker', 'worker_resource_uri'))

    if len(worker_list) == 1:
        worker_list[0].patch({
            'ping_response_dts': datetime.now(utc).isoformat(' '),
            'worker_version': job_runner_worker.__version__,
            'concurrent_jobs': config.getint(
                'job_runner_worker', 'concurrent_jobs'),
        })
    else:
        logger.warning('API returned multiple workers, expected one')
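# For reference: with a pytz-style utc tzinfo, datetime.now(utc).isoformat(' ')
# produces a space-separated timestamp such as
# '2013-04-02 14:21:09.123456+00:00', which is the shape of the
# ping_response_dts value patched above.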
Example #4
def _truncate_log(log_txt):
    """
    Truncate the ``log_txt`` in case it exceeds the maximum log size.

    :param log_txt:
        A ``str``.

    """
    max_log_bytes = config.getint('job_runner_worker', 'max_log_bytes')

    if len(log_txt) > max_log_bytes:
        top_length = int(max_log_bytes * 0.2)
        bottom_length = int(max_log_bytes * 0.8)

        log_txt = '{0}\n\n[truncated]\n\n{1}'.format(
            log_txt[:top_length],
            log_txt[len(log_txt) - bottom_length:]
        )

    return log_txt
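# Standalone worked example of the truncation split above; the hard-coded
# limit stands in for the max_log_bytes config value:
def _demo_truncate():
    log_txt = 'A' * 60 + 'B' * 90  # 150 chars
    max_log_bytes = 100
    top = log_txt[:int(max_log_bytes * 0.2)]  # first 20 chars, all 'A'
    bottom = log_txt[len(log_txt) - int(max_log_bytes * 0.8):]  # last 80, all 'B'
    # the tail of the log (usually the interesting part) survives intact
    return '{0}\n\n[truncated]\n\n{1}'.format(top, bottom)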
def enqueue_actions(
        zmq_context, run_queue, kill_queue, event_queue, exit_queue):
    """
    Handle incoming actions sent by the broadcaster.

    :param zmq_context:
        An instance of ``zmq.Context``.

    :param run_queue:
        An instance of ``Queue`` for pushing the runs to.

    :param kill_queue:
        An instance of ``Queue`` for pushing the kill-requests to.

    :param event_queue:
        An instance of ``Queue`` for pushing events to.

    :param exit_queue:
        An instance of ``Queue`` to consume from. If this queue is not empty,
        the function needs to terminate.

    """
    logger.info('Starting enqueue loop')
    subscriber = _get_subscriber(zmq_context)

    expected_address = 'master.broadcast.{0}'.format(
        config.get('job_runner_worker', 'api_key'))

    last_activity_dts = datetime.utcnow()
    reconnect_after_inactivity = config.getint(
        'job_runner_worker', 'reconnect_after_inactivity')

    while True:
        try:
            exit_queue.get(block=False)
            logger.info('Terminating enqueue loop')
            # break (rather than return) so the subscriber below gets closed
            break
        except Empty:
            pass

        try:
            address, content = subscriber.recv_multipart(zmq.NOBLOCK)
            last_activity_dts = datetime.utcnow()
        except zmq.ZMQError:
            # this is needed in case the ZMQ publisher is load-balanced and the
            # loadbalancer dropped the connection to the backend, but not the
            # connection to our side. without this work-around, zmq will think
            # that all is well, and we won't receive anything anymore
            delta = datetime.utcnow() - last_activity_dts
            if delta > timedelta(seconds=reconnect_after_inactivity):
                logger.warning(
                    'There was no activity for {0}, reconnecting'
                    ' to publisher'.format(delta)
                )
                subscriber.close()
                time.sleep(random.randint(1, 10))
                subscriber = _get_subscriber(zmq_context)
                last_activity_dts = datetime.utcnow()
                continue
            else:
                time.sleep(0.5)
                continue

        # since zmq is subscribed to everything that starts with the given
        # prefix, we have to do a double check to make sure this is an exact
        # match.
        if address != expected_address:
            continue

        logger.debug('Received [{0}]: {1}'.format(address, content))
        message = json.loads(content)

        if message['action'] == 'enqueue':
            _handle_enqueue_action(message, run_queue, event_queue)

        elif message['action'] == 'kill':
            _handle_kill_action(message, kill_queue, event_queue)

        elif message['action'] == 'ping':
            _handle_ping_action(message)

    subscriber.close()
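# _get_subscriber is not shown in this listing. Based on how it is used above
# (SUB socket, prefix subscription, plus the exact-match check in the loop),
# a plausible minimal sketch could look like the following; the broadcaster
# host/port config keys are assumptions for illustration, not confirmed
# settings:
def _get_subscriber(zmq_context):
    subscriber = zmq_context.socket(zmq.SUB)
    subscriber.connect('tcp://{0}:{1}'.format(
        config.get('job_runner_worker', 'broadcaster_host'),  # assumed key
        config.get('job_runner_worker', 'broadcaster_port'),  # assumed key
    ))
    # ZMQ subscriptions are prefix matches, hence the exact-match test above
    subscriber.setsockopt(zmq.SUBSCRIBE, 'master.broadcast.{0}'.format(
        config.get('job_runner_worker', 'api_key')))
    return subscriber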
Example #6
def run():
    """
    Start consuming runs and executing them.
    """
    context = zmq.Context(1)

    gevent_pool = gevent.pool.Group()
    reset_incomplete_runs()
    concurrent_jobs = config.getint('job_runner_worker', 'concurrent_jobs')

    run_queue = Queue()
    kill_queue = Queue()
    event_queue = Queue()
    exit_queue = JoinableQueue()
    event_exit_queue = Queue()

    # callback for SIGTERM
    def terminate_callback(*args, **kwargs):
        logger.warning('Worker is going to terminate!')
        for i in range(concurrent_jobs + 2):
            # we don't want to kill the event greenlet, since we want to
            # publish events of already running jobs
            exit_queue.put(None)

    # callback for when an exception is raised in an execute_run greenlet
    def recover_run(greenlet):
        logger.warning(
            'Recovering execute_run greenlet which raised: {0}'.format(
                greenlet.exception))
        gevent_pool.spawn(
            execute_run,
            run_queue,
            event_queue,
            exit_queue,
        ).link_exception(recover_run)

    # callback for when an exception is raised in enqueue_actions greenlet
    def recover_enqueue_actions(greenlet):
        logger.warning(
            'Recovering enqueue_actions greenlet which raised: {0}'.format(
                greenlet.exception))
        gevent_pool.spawn(
            enqueue_actions,
            context,
            run_queue,
            kill_queue,
            event_queue,
            exit_queue,
        ).link_exception(recover_enqueue_actions)

    # callback for when an exception is raised in kill_run greenlet
    def recover_kill_run(greenlet):
        logger.warning(
            'Recovering kill_run greenlet which raised: {0}'.format(
                greenlet.exception))
        gevent_pool.spawn(
            kill_run,
            kill_queue,
            event_queue,
            exit_queue,
        ).link_exception(recover_kill_run)

    # start the enqueue_actions greenlet
    gevent_pool.spawn(
        enqueue_actions,
        context,
        run_queue,
        kill_queue,
        event_queue,
        exit_queue,
    ).link_exception(recover_enqueue_actions)

    # start the execute_run greenlets
    for x in range(concurrent_jobs):
        gevent_pool.spawn(
            execute_run,
            run_queue,
            event_queue,
            exit_queue,
        ).link_exception(recover_run)

    # start the kill_run greenlet
    gevent_pool.spawn(
        kill_run,
        kill_queue,
        event_queue,
        exit_queue
    ).link_exception(recover_kill_run)

    # start the publish (event publisher) greenlet
    publisher_loop = gevent.spawn(
        publish, context, event_queue, event_exit_queue)

    # catch SIGTERM signal
    signal.signal(signal.SIGTERM, terminate_callback)

    # wait for all the greenlets to complete in this group
    gevent_pool.join()

    # now terminate the event queue. this one should be terminated at the
    # end, since we want all events to be published.
    event_exit_queue.put(None)
    publisher_loop.join()
    sys.exit('Worker terminated')
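# Minimal standalone illustration of the link_exception self-healing pattern
# used by the recover_* callbacks above (names are illustrative only):
import gevent

def _flaky():
    raise RuntimeError('boom')

def _respawn(greenlet):
    # in the worker above, the callback spawns a replacement greenlet into
    # gevent_pool and re-links itself, so the replacement is covered too
    print('greenlet raised: {0!r}'.format(greenlet.exception))

def _demo_link_exception():
    g = gevent.spawn(_flaky)
    g.link_exception(_respawn)
    g.join()
    gevent.sleep(0)  # give the hub a turn so the linked callback runs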