def run():
    """
    Start consuming runs and executing them.

    One ``enqueue_actions`` greenlet, ``concurrent_jobs`` ``execute_run``
    greenlets, one ``kill_run`` greenlet and one ``publish`` greenlet are
    spawned. On ``SIGTERM`` a ``None`` marker is put on the exit queue for
    every greenlet except the publisher. Once those greenlets have been
    joined, the publisher is told to stop through its own exit queue and
    the process exits with ``'Worker terminated'``.
    """
    context = zmq.Context(1)

    reset_incomplete_runs()

    concurrent_jobs = config.getint('job_runner_worker', 'concurrent_jobs')

    run_queue = Queue()
    kill_queue = Queue()
    event_queue = Queue()
    exit_queue = JoinableQueue()
    event_exit_queue = Queue()

    # every greenlet that is stopped through ``exit_queue``
    workers = [
        gevent.spawn(
            enqueue_actions,
            context,
            run_queue,
            kill_queue,
            event_queue,
            exit_queue,
        ),
    ]
    workers.extend(
        gevent.spawn(execute_run, run_queue, event_queue, exit_queue)
        for _ in range(concurrent_jobs)
    )
    workers.append(
        gevent.spawn(kill_run, kill_queue, event_queue, exit_queue))

    # the event publisher is kept apart, since it is stopped last
    publisher = gevent.spawn(
        publish, context, event_queue, event_exit_queue)

    def terminate_callback(*args, **kwargs):
        logger.warning('Worker is going to terminate!')
        for _ in workers:
            # we don't want to kill the event greenlet, since we want to
            # publish events of already running jobs
            exit_queue.put(None)

    signal.signal(signal.SIGTERM, terminate_callback)

    for worker in workers:
        worker.join()

    # now terminate the event queue
    event_exit_queue.put(None)
    publisher.join()

    sys.exit('Worker terminated')
def _handle_ping_action(message):
    """
    Handle the ``'ping'`` action.

    The worker list is fetched from the configured ``worker_resource_uri``.
    When exactly one worker is returned, it is patched with the current
    timestamp, the worker version and the configured number of concurrent
    jobs. In any other case (including an empty list) a warning is logged
    and nothing is patched.

    :param message:
        The received message. It is not inspected by this handler.

    """
    resource_uri = config.get("job_runner_worker", "worker_resource_uri")
    workers = Worker.get_list(resource_uri)

    if len(workers) != 1:
        logger.warning("API returned multiple workers, expected one")
        return

    worker = workers[0]
    ping_response = {
        "ping_response_dts": datetime.now(utc).isoformat(" "),
        "worker_version": job_runner_worker.__version__,
        "concurrent_jobs": config.getint("job_runner_worker", "concurrent_jobs"),
    }
    worker.patch(ping_response)
def _handle_ping_action(message):
    """
    Handle the ``'ping'`` action.

    Looks up the worker(s) for the configured ``worker_resource_uri`` and,
    when exactly one is returned, patches it with the ping response
    timestamp, the worker version and the configured number of concurrent
    jobs. For any other number of workers (zero included) only a warning
    is logged.

    :param message:
        The received message. It is not inspected by this handler.

    """
    workers = Worker.get_list(
        config.get('job_runner_worker', 'worker_resource_uri'))

    if len(workers) != 1:
        logger.warning('API returned multiple workers, expected one')
        return

    worker = workers[0]
    ping_response = {
        'ping_response_dts': datetime.now(utc).isoformat(' '),
        'worker_version': job_runner_worker.__version__,
        'concurrent_jobs': config.getint(
            'job_runner_worker', 'concurrent_jobs'),
    }
    worker.patch(ping_response)
def _truncate_log(log_txt):
    """
    Truncate the ``log_txt`` in case it exceeds the max. log size.

    The limit is read from the ``max_log_bytes`` setting and compared with
    ``len(log_txt)``. When the text is too long, the first 20% and the last
    80% of the limit are kept, joined by a ``[truncated]`` marker.

    :param log_txt:
        A ``str``.

    :return:
        ``log_txt`` itself when it fits, else the shortened text.

    """
    limit = config.getint('job_runner_worker', 'max_log_bytes')
    total_length = len(log_txt)

    if total_length <= limit:
        return log_txt

    head_length = int(limit * 0.2)
    tail_length = int(limit * 0.8)

    head = log_txt[:head_length]
    # slice from an explicit start index, so that a tail length of zero
    # results in an empty tail
    tail = log_txt[total_length - tail_length:]

    return '{0}\n\n[truncated]\n\n{1}'.format(head, tail)
def enqueue_actions(
        zmq_context, run_queue, kill_queue, event_queue, exit_queue):
    """
    Handle incoming actions sent by the broadcaster.

    The loop polls the subscriber without blocking and dispatches every
    message addressed to this worker to the handler matching its
    ``'action'`` (``'enqueue'``, ``'kill'`` or ``'ping'``). Messages with
    any other action are ignored.

    The subscriber socket is always closed when this function is left,
    either because termination was requested or because an exception was
    raised.

    :param zmq_context:
        An instance of ``zmq.Context``.

    :param run_queue:
        An instance of ``Queue`` for pushing the runs to.

    :param kill_queue:
        An instance of ``Queue`` for pushing the kill-requests to.

    :param event_queue:
        An instance of ``Queue`` for pushing events to.

    :param exit_queue:
        An instance of ``Queue`` to consume from. If this queue is not
        empty, the function needs to terminate.

    """
    logger.info('Starting enqueue loop')

    subscriber = _get_subscriber(zmq_context)

    # the loop below only ends by returning or raising, so the socket has to
    # be closed in a ``finally`` (code placed after the loop is never reached)
    try:
        expected_address = 'master.broadcast.{0}'.format(
            config.get('job_runner_worker', 'api_key'))

        last_activity_dts = datetime.utcnow()
        reconnect_after_inactivity = config.getint(
            'job_runner_worker', 'reconnect_after_inactivity')

        while True:
            try:
                exit_queue.get(block=False)
                logger.info('Termintating enqueue loop')
                return
            except Empty:
                pass

            try:
                address, content = subscriber.recv_multipart(zmq.NOBLOCK)
                last_activity_dts = datetime.utcnow()
            except zmq.ZMQError:
                # this is needed in case the ZMQ publisher is load-balanced
                # and the loadbalancer dropped the connection to the backend,
                # but not the connection to our side. without this
                # work-around, zmq will think that all is well, and we won't
                # receive anything anymore
                delta = datetime.utcnow() - last_activity_dts
                if delta > timedelta(seconds=reconnect_after_inactivity):
                    logger.warning(
                        'There was not activity for {0}, reconnecting'
                        ' to publisher'.format(delta)
                    )
                    subscriber.close()
                    # random delay before the new subscriber is created
                    time.sleep(random.randint(1, 10))
                    subscriber = _get_subscriber(zmq_context)
                    last_activity_dts = datetime.utcnow()
                else:
                    time.sleep(0.5)
                continue

            # since zmq is subscribed to everything that starts with the
            # given prefix, we have to do a double check to make sure this is
            # an exact match.
            if not address == expected_address:
                continue

            logger.debug('Received [{0}]: {1}'.format(address, content))
            message = json.loads(content)

            if message['action'] == 'enqueue':
                _handle_enqueue_action(message, run_queue, event_queue)
            elif message['action'] == 'kill':
                _handle_kill_action(message, kill_queue, event_queue)
            elif message['action'] == 'ping':
                _handle_ping_action(message)
    finally:
        subscriber.close()
def run():
    """
    Start consuming runs and executing them.

    One ``enqueue_actions`` greenlet, ``concurrent_jobs`` ``execute_run``
    greenlets and one ``kill_run`` greenlet are spawned in a gevent group.
    Each of them is linked to a callback which logs the raised exception
    and spawns a replacement greenlet. The ``publish`` greenlet is spawned
    outside of the group, since it is stopped last.

    On ``SIGTERM``, ``concurrent_jobs + 2`` ``None`` markers are put on the
    exit queue. Once the group has been joined, the publisher is told to
    stop through its own exit queue and the process exits with
    ``'Worker terminated'``.
    """
    context = zmq.Context(1)
    gevent_pool = gevent.pool.Group()

    reset_incomplete_runs()

    concurrent_jobs = config.getint('job_runner_worker', 'concurrent_jobs')

    run_queue = Queue()
    kill_queue = Queue()
    event_queue = Queue()
    exit_queue = JoinableQueue()
    event_exit_queue = Queue()

    # callback for SIGTERM
    def terminate_callback(*args, **kwargs):
        logger.warning('Worker is going to terminate!')
        # one marker per greenlet in the group: the execute_run greenlets
        # plus the enqueue_actions and kill_run greenlets
        for _ in range(concurrent_jobs + 2):
            # we don't want to kill the event greenlet, since we want to
            # publish events of already running jobs
            exit_queue.put(None)

    # callback for when an exception is raised in a execute_run greenlet
    def recover_run(greenlet):
        logger.warning(
            'Recovering execute_run greenlet which raised: {0}'.format(
                greenlet.exception))
        gevent_pool.spawn(
            execute_run,
            run_queue,
            event_queue,
            exit_queue,
        ).link_exception(recover_run)

    # callback for when an exception is raised in enqueue_actions greenlet
    def recover_enqueue_actions(greenlet):
        logger.warning(
            'Recovering enqueue_actions greenlet which raised: {0}'.format(
                greenlet.exception))
        gevent_pool.spawn(
            enqueue_actions,
            context,
            run_queue,
            kill_queue,
            event_queue,
            exit_queue,
        ).link_exception(recover_enqueue_actions)

    # callback for when an exception is raised in kill_run greenlet
    def recover_kill_run(greenlet):
        # log the raised exception (not the greenlet object), like the
        # other recover callbacks do
        logger.warning(
            'Recovering kill_run greenlet which raised: {0}'.format(
                greenlet.exception))
        gevent_pool.spawn(
            kill_run,
            kill_queue,
            event_queue,
            exit_queue,
        ).link_exception(recover_kill_run)

    # start the enqueue_actions greenlet
    gevent_pool.spawn(
        enqueue_actions,
        context,
        run_queue,
        kill_queue,
        event_queue,
        exit_queue,
    ).link_exception(recover_enqueue_actions)

    # start the execute_run greenlets
    for _ in range(concurrent_jobs):
        gevent_pool.spawn(
            execute_run,
            run_queue,
            event_queue,
            exit_queue,
        ).link_exception(recover_run)

    # start the kill_run greenlet
    gevent_pool.spawn(
        kill_run, kill_queue, event_queue, exit_queue
    ).link_exception(recover_kill_run)

    # start the publish (event publisher) greenlet
    publisher_loop = gevent.spawn(
        publish, context, event_queue, event_exit_queue)

    # catch SIGTERM signal
    signal.signal(signal.SIGTERM, terminate_callback)

    # wait for all the greenlets to complete in this group
    gevent_pool.join()

    # now terminate the event queue. this one should be terminated at the
    # end, since we want all events to be published.
    event_exit_queue.put(None)
    publisher_loop.join()

    sys.exit('Worker terminated')
def enqueue_actions(zmq_context, run_queue, kill_queue, event_queue,
                    exit_queue):
    """
    Handle incoming actions sent by the broadcaster.

    Messages are received without blocking. Those addressed exactly to
    this worker are decoded as JSON and handed to the handler for their
    ``'action'`` (``'enqueue'``, ``'kill'`` or ``'ping'``); other actions
    are ignored. When nothing was received for longer than the configured
    ``reconnect_after_inactivity`` seconds, the subscriber is replaced by a
    new one.

    The current subscriber socket is closed whenever this function is left,
    both on requested termination and when an exception is raised.

    :param zmq_context:
        An instance of ``zmq.Context``.

    :param run_queue:
        An instance of ``Queue`` for pushing the runs to.

    :param kill_queue:
        An instance of ``Queue`` for pushing the kill-requests to.

    :param event_queue:
        An instance of ``Queue`` for pushing events to.

    :param exit_queue:
        An instance of ``Queue`` to consume from. If this queue is not
        empty, the function needs to terminate.

    """
    logger.info('Starting enqueue loop')

    subscriber = _get_subscriber(zmq_context)

    # ``while True`` is only left through ``return`` or an exception, so a
    # close() placed after the loop would never run. the ``finally`` makes
    # sure that the socket is released on every exit path.
    try:
        expected_address = 'master.broadcast.{0}'.format(
            config.get('job_runner_worker', 'api_key'))

        last_activity_dts = datetime.utcnow()
        reconnect_after_inactivity = config.getint(
            'job_runner_worker', 'reconnect_after_inactivity')

        while True:
            try:
                exit_queue.get(block=False)
                logger.info('Termintating enqueue loop')
                return
            except Empty:
                pass

            try:
                address, content = subscriber.recv_multipart(zmq.NOBLOCK)
                last_activity_dts = datetime.utcnow()
            except zmq.ZMQError:
                # this is needed in case the ZMQ publisher is load-balanced
                # and the loadbalancer dropped the connection to the backend,
                # but not the connection to our side. without this
                # work-around, zmq will think that all is well, and we won't
                # receive anything anymore
                delta = datetime.utcnow() - last_activity_dts
                if delta > timedelta(seconds=reconnect_after_inactivity):
                    logger.warning('There was not activity for {0}, reconnecting'
                                   ' to publisher'.format(delta))
                    subscriber.close()
                    # random delay before the new subscriber is created
                    time.sleep(random.randint(1, 10))
                    subscriber = _get_subscriber(zmq_context)
                    last_activity_dts = datetime.utcnow()
                else:
                    time.sleep(0.5)
                continue

            # since zmq is subscribed to everything that starts with the
            # given prefix, we have to do a double check to make sure this is
            # an exact match.
            if not address == expected_address:
                continue

            logger.debug('Received [{0}]: {1}'.format(address, content))
            message = json.loads(content)

            if message['action'] == 'enqueue':
                _handle_enqueue_action(message, run_queue, event_queue)
            elif message['action'] == 'kill':
                _handle_kill_action(message, kill_queue, event_queue)
            elif message['action'] == 'ping':
                _handle_ping_action(message)
    finally:
        subscriber.close()
def run():
    """
    Start consuming runs and executing them.

    A gevent group holds the ``enqueue_actions`` greenlet, the
    ``concurrent_jobs`` ``execute_run`` greenlets and the ``kill_run``
    greenlet. When one of them raises, its linked callback logs the
    exception and spawns a replacement in the same group. The ``publish``
    greenlet lives outside of the group, because it has to be stopped
    after all the others.

    On ``SIGTERM``, ``concurrent_jobs + 2`` ``None`` markers are put on the
    exit queue. After the group has been joined, ``None`` is put on the
    event exit queue, the publisher is joined and the process exits with
    ``'Worker terminated'``.
    """
    context = zmq.Context(1)
    gevent_pool = gevent.pool.Group()

    reset_incomplete_runs()

    concurrent_jobs = config.getint('job_runner_worker', 'concurrent_jobs')

    run_queue = Queue()
    kill_queue = Queue()
    event_queue = Queue()
    exit_queue = JoinableQueue()
    event_exit_queue = Queue()

    # callback for SIGTERM
    def terminate_callback(*args, **kwargs):
        logger.warning('Worker is going to terminate!')
        # as many markers as there are greenlets in the group
        # (execute_run greenlets + enqueue_actions + kill_run)
        for _ in range(concurrent_jobs + 2):
            # we don't want to kill the event greenlet, since we want to
            # publish events of already running jobs
            exit_queue.put(None)

    # callback for when an exception is raised in a execute_run greenlet
    def recover_run(greenlet):
        logger.warning(
            'Recovering execute_run greenlet which raised: {0}'.format(
                greenlet.exception))
        gevent_pool.spawn(
            execute_run,
            run_queue,
            event_queue,
            exit_queue,
        ).link_exception(recover_run)

    # callback for when an exception is raised in enqueue_actions greenlet
    def recover_enqueue_actions(greenlet):
        logger.warning(
            'Recovering enqueue_actions greenlet which raised: {0}'.format(
                greenlet.exception))
        gevent_pool.spawn(
            enqueue_actions,
            context,
            run_queue,
            kill_queue,
            event_queue,
            exit_queue,
        ).link_exception(recover_enqueue_actions)

    # callback for when an exception is raised in kill_run greenlet
    def recover_kill_run(greenlet):
        # report the exception which was raised, consistent with the
        # callbacks for the other greenlets (instead of the greenlet itself)
        logger.warning(
            'Recovering kill_run greenlet which raised: {0}'.format(
                greenlet.exception))
        gevent_pool.spawn(
            kill_run,
            kill_queue,
            event_queue,
            exit_queue,
        ).link_exception(recover_kill_run)

    # start the enqueue_actions greenlet
    gevent_pool.spawn(
        enqueue_actions,
        context,
        run_queue,
        kill_queue,
        event_queue,
        exit_queue,
    ).link_exception(recover_enqueue_actions)

    # start the execute_run greenlets
    for _ in range(concurrent_jobs):
        gevent_pool.spawn(
            execute_run,
            run_queue,
            event_queue,
            exit_queue,
        ).link_exception(recover_run)

    # start the kill_run greenlet
    gevent_pool.spawn(kill_run, kill_queue, event_queue,
                      exit_queue).link_exception(recover_kill_run)

    # start the publish (event publisher) greenlet
    publisher_loop = gevent.spawn(publish, context, event_queue,
                                  event_exit_queue)

    # catch SIGTERM signal
    signal.signal(signal.SIGTERM, terminate_callback)

    # wait for all the greenlets to complete in this group
    gevent_pool.join()

    # now terminate the event queue. this one should be terminated at the
    # end, since we want all events to be published.
    event_exit_queue.put(None)
    publisher_loop.join()

    sys.exit('Worker terminated')