def reset_incomplete_runs():
    """
    Clean up incomplete runs.

    A run is left incomplete when a worker dies before the run has finished
    (or while it was still marked as enqueued). These runs need to be
    restarted and are therefore reset to the scheduled state.

    """
    logger.info('Cleaning up incomplete runs')
    incomplete_runs = []

    for state in ['in_queue', 'started']:
        incomplete_runs.extend(Run.get_list(
            config.get('job_runner_worker', 'run_resource_uri'),
            params={
                'state': state,
                'worker__api_key': config.get('job_runner_worker', 'api_key'),
            }
        ))

    for run in incomplete_runs:
        logger.warning('Run {0} was left incomplete'.format(run.resource_uri))
        # clearing both timestamps puts the run back into scheduled state
        run.patch({
            'enqueue_dts': None,
            'start_dts': None,
        })
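
# A minimal sketch of how this cleanup could be wired into worker start-up;
# _setup_queues_and_threads() is a hypothetical placeholder, not part of this
# module. The point is only that reset_incomplete_runs() runs once, before
# any new work is accepted.
def _example_worker_startup():
    reset_incomplete_runs()  # re-schedule runs orphaned by a dead worker
    _setup_queues_and_threads()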
def _get_json_data(self):
    """
    Return JSON data.

    :raises: :exc:`!RequestException` on ``requests`` error.
    :raises: :exc:`.RequestServerError` on 5xx response.
    :raises: :exc:`.RequestClientError` on errors caused client-side.

    """
    response = requests.get(
        urlparse.urljoin(
            config.get('job_runner_worker', 'api_base_url'),
            self._resource_path
        ),
        auth=HmacAuth(
            config.get('job_runner_worker', 'api_key'),
            config.get('job_runner_worker', 'secret')
        ),
        headers={'content-type': 'application/json'},
        verify=False,
    )

    if response.status_code != 200:
        if 500 <= response.status_code <= 599:
            raise RequestServerError('Server returned {0} - {1}'.format(
                response.status_code, response.content))
        else:
            raise RequestClientError('Server returned {0} - {1}'.format(
                response.status_code, response.content))

    # note: on requests < 1.0, ``.json`` is a property; on newer versions
    # this would have to be ``response.json()``
    return response.json
def post(self, attributes=None):
    """
    POST resource with the given ``attributes``.

    :raises: :exc:`!RequestException` on ``requests`` error.
    :raises: :exc:`.RequestServerError` on 5xx response.
    :raises: :exc:`.RequestClientError` on errors caused client-side.

    """
    response = requests.post(
        urlparse.urljoin(
            config.get('job_runner_worker', 'api_base_url'),
            self._resource_path
        ),
        auth=HmacAuth(
            config.get('job_runner_worker', 'api_key'),
            config.get('job_runner_worker', 'secret')
        ),
        headers={'content-type': 'application/json'},
        data=json.dumps(attributes or {}),
        verify=False,
    )

    if response.status_code != 201:
        if 500 <= response.status_code <= 599:
            raise RequestServerError('Server returned {0} - {1}'.format(
                response.status_code, response.content))
        else:
            raise RequestClientError('Server returned {0} - {1}'.format(
                response.status_code, response.content))
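
# Usage sketch for ``post``, mirroring what execute_run() does further down:
# create a new run-log resource by POST-ing its attributes. The run URI used
# here is a hypothetical value.
def _example_post_run_log():
    run_log = RunLog(config.get('job_runner_worker', 'run_log_resource_uri'))
    run_log.post({
        'run': '/api/v1/run/123/',
        'content': 'Hello world!\n',
    })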
def _handle_enqueue_action(message, run_queue, event_queue):
    """
    Handle the ``'enqueue'`` action.
    """
    run = Run('{0}{1}/'.format(
        config.get('job_runner_worker', 'run_resource_uri'),
        message['run_id']))
    worker_list = Worker.get_list(
        config.get('job_runner_worker', 'worker_resource_uri'))

    if run.enqueue_dts:
        logger.warning(
            'Expected run {0} not to be enqueued yet'.format(run.id))
    elif len(worker_list) != 1:
        logger.warning('Expected exactly one worker, the API returned '
                       '{0}'.format(len(worker_list)))
    else:
        run.patch({
            'enqueue_dts': datetime.now(utc).isoformat(' '),
            # set the worker so we know which worker of the pool claimed
            # the run
            'worker': worker_list[0].resource_uri,
        })
        run_queue.put(run)
        event_queue.put(
            json.dumps({
                'event': 'enqueued',
                'run_id': run.id,
                'kind': 'run',
            }))
def _get_subscriber(zmq_context):
    """
    Return a new subscriber connection for the given ``zmq_context``.
    """
    subscriber = zmq_context.socket(zmq.SUB)
    subscriber.connect('tcp://{0}:{1}'.format(
        config.get('job_runner_worker', 'broadcaster_server_hostname'),
        config.get('job_runner_worker', 'broadcaster_server_port'),
    ))
    subscriber.setsockopt(zmq.SUBSCRIBE, 'master.broadcast.{0}'.format(
        config.get('job_runner_worker', 'api_key')))
    return subscriber
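
# Note: ZMQ subscriptions are prefix filters, so this socket receives every
# message whose address merely starts with 'master.broadcast.<api_key>'.
# enqueue_actions() below therefore re-checks for an exact address match
# before acting on a message.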
def get_list(cls, resource_path, params=None):
    """
    Return a list of models for ``resource_path``.

    :param resource_path:
        The path of the resource.

    :param params:
        A ``dict`` containing optional request params. Optional.

    :return:
        A ``list`` of class instances.

    :raises: :exc:`!RequestException` on ``requests`` error.
    :raises: :exc:`.RequestServerError` on 5xx response.
    :raises: :exc:`.RequestClientError` on errors caused client-side.

    """
    response = requests.get(
        urlparse.urljoin(
            config.get('job_runner_worker', 'api_base_url'),
            resource_path
        ),
        auth=HmacAuth(
            config.get('job_runner_worker', 'api_key'),
            config.get('job_runner_worker', 'secret')
        ),
        params=params or {},
        headers={'content-type': 'application/json'},
        verify=False,
    )

    if response.status_code != 200:
        if 500 <= response.status_code <= 599:
            raise RequestServerError('Server returned {0} - {1}'.format(
                response.status_code, response.content))
        else:
            raise RequestClientError('Server returned {0} - {1}'.format(
                response.status_code, response.content))

    output = []

    for obj_dict in response.json['objects']:
        output.append(cls(obj_dict['resource_uri'], obj_dict))

    # follow pagination; the 'next' URI already carries the query params
    if response.json['meta'].get('next'):
        output.extend(cls.get_list(response.json['meta']['next']))

    return output
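
# Usage sketch for ``get_list``: fetch all runs in 'started' state claimed by
# this worker (the same query pattern reset_incomplete_runs() uses above).
def _example_list_started_runs():
    return Run.get_list(
        config.get('job_runner_worker', 'run_resource_uri'),
        params={
            'state': 'started',
            'worker__api_key': config.get('job_runner_worker', 'api_key'),
        }
    )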
def publish(zmq_context, event_queue, exit_queue):
    """
    Publish enqueued events to the WebSocket server.

    :param zmq_context:
        An instance of ``zmq.Context``.

    :param event_queue:
        A ``Queue`` instance for events to broadcast.

    :param exit_queue:
        An instance of ``Queue`` to consume from. If this queue is not
        empty, the function needs to terminate.

    """
    logger.info('Starting event publisher')

    publisher = zmq_context.socket(zmq.PUB)
    publisher.connect('tcp://{0}:{1}'.format(
        config.get('job_runner_worker', 'ws_server_hostname'),
        config.get('job_runner_worker', 'ws_server_port'),
    ))

    while True:
        try:
            event = event_queue.get(block=False)
            logger.debug('Sending event: {0}'.format(event))
            publisher.send_multipart(['worker.event', event])
            continue
        except Empty:
            pass

        try:
            exit_queue.get(block=False)
            logger.info('Terminating event publisher')
            # close the socket before terminating
            publisher.close()
            return
        except Empty:
            pass

        time.sleep(0.5)
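
# Illustrative example of an event as it travels through ``event_queue``: the
# action handlers below enqueue JSON strings like the following, and
# publish() forwards each of them under the 'worker.event' address:
#
#   {"event": "enqueued", "run_id": 123, "kind": "run"}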
def _handle_kill_action(message, kill_queue, event_queue):
    """
    Handle the ``'kill'`` action.
    """
    kill_request = KillRequest('{0}{1}/'.format(
        config.get('job_runner_worker', 'kill_request_resource_uri'),
        message['kill_request_id']))

    if kill_request.enqueue_dts:
        logger.warning(
            'Expected kill-request {0} not to be enqueued yet'.format(
                message['kill_request_id']))
    else:
        kill_request.patch({'enqueue_dts': datetime.now(utc).isoformat(' ')})
        kill_queue.put(kill_request)
        event_queue.put(
            json.dumps({
                'event': 'enqueued',
                'kill_request_id': kill_request.id,
                'kind': 'kill_request',
            }))
def _handle_ping_action(message):
    """
    Handle the ``'ping'`` action.
    """
    worker_list = Worker.get_list(
        config.get('job_runner_worker', 'worker_resource_uri'))

    if len(worker_list) == 1:
        worker_list[0].patch({
            'ping_response_dts': datetime.now(utc).isoformat(' '),
            'worker_version': job_runner_worker.__version__,
            'concurrent_jobs': config.getint(
                'job_runner_worker', 'concurrent_jobs'),
        })
    else:
        logger.warning('Expected exactly one worker, the API returned '
                       '{0}'.format(len(worker_list)))
def enqueue_actions(
        zmq_context, run_queue, kill_queue, event_queue, exit_queue):
    """
    Handle incoming actions sent by the broadcaster.

    :param zmq_context:
        An instance of ``zmq.Context``.

    :param run_queue:
        An instance of ``Queue`` for pushing the runs to.

    :param kill_queue:
        An instance of ``Queue`` for pushing the kill-requests to.

    :param event_queue:
        An instance of ``Queue`` for pushing events to.

    :param exit_queue:
        An instance of ``Queue`` to consume from. If this queue is not
        empty, the function needs to terminate.

    """
    logger.info('Starting enqueue loop')

    subscriber = _get_subscriber(zmq_context)
    expected_address = 'master.broadcast.{0}'.format(
        config.get('job_runner_worker', 'api_key'))
    last_activity_dts = datetime.utcnow()
    reconnect_after_inactivity = config.getint(
        'job_runner_worker', 'reconnect_after_inactivity')

    while True:
        try:
            exit_queue.get(block=False)
            logger.info('Terminating enqueue loop')
            subscriber.close()
            return
        except Empty:
            pass

        try:
            address, content = subscriber.recv_multipart(zmq.NOBLOCK)
            last_activity_dts = datetime.utcnow()
        except zmq.ZMQError:
            # this is needed in case the ZMQ publisher is load-balanced and
            # the load-balancer dropped the connection to the backend, but
            # not the connection to our side. without this work-around, zmq
            # will think that all is well, and we won't receive anything
            # anymore
            delta = datetime.utcnow() - last_activity_dts

            if delta > timedelta(seconds=reconnect_after_inactivity):
                logger.warning(
                    'There was no activity for {0}, reconnecting'
                    ' to publisher'.format(delta))
                subscriber.close()
                time.sleep(random.randint(1, 10))
                subscriber = _get_subscriber(zmq_context)
                last_activity_dts = datetime.utcnow()
                continue
            else:
                time.sleep(0.5)
                continue

        # since zmq subscriptions match everything that starts with the
        # given prefix, we have to double-check that this is an exact match
        if address != expected_address:
            continue

        logger.debug('Received [{0}]: {1}'.format(address, content))
        message = json.loads(content)

        if message['action'] == 'enqueue':
            _handle_enqueue_action(message, run_queue, event_queue)
        elif message['action'] == 'kill':
            _handle_kill_action(message, kill_queue, event_queue)
        elif message['action'] == 'ping':
            _handle_ping_action(message)
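
# Illustrative examples of the broadcast payloads the loop above dispatches
# on (inferred from the keys each handler reads; the exact wire format is
# defined by the master):
#
#   {"action": "enqueue", "run_id": 123}
#   {"action": "kill", "kill_request_id": 456}
#   {"action": "ping"}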
def execute_run(run_queue, event_queue, exit_queue):
    """
    Execute runs from the ``run_queue``.

    :param run_queue:
        An instance of ``Queue`` to consume run instances from.

    :param event_queue:
        An instance of ``Queue`` to push events to.

    :param exit_queue:
        An instance of ``Queue`` to consume from. If this queue is not
        empty, the function needs to terminate.

    """
    logger.info('Starting run executor')

    while True:
        try:
            exit_queue.get(block=False)
            logger.info('Terminating run executor')
            return
        except Empty:
            pass

        try:
            run = run_queue.get(block=False)
        except Empty:
            time.sleep(0.5)
            continue

        # If *anything* goes wrong, we want feedback bubbling up to the
        # master server, including the email being sent and the dashboard
        # updated. From a user's point of view, a job that did not run is a
        # failure. Hence the catch-all try below.
        did_run = False
        file_path = None
        logger.info('Starting run {0}'.format(run.resource_uri))
        run.patch({'start_dts': datetime.now(utc).isoformat(' ')})
        event_queue.put(json.dumps(
            {'event': 'started', 'run_id': run.id, 'kind': 'run'}))

        try:
            file_desc, file_path = tempfile.mkstemp(
                dir=config.get('job_runner_worker', 'script_temp_path')
            )
            # there doesn't seem to be support for opening file descriptors
            # directly with utf-8 encoding
            os.fdopen(file_desc).close()
            file_obj = codecs.open(file_path, 'w', 'utf-8')
            file_obj.write(run.job.script_content.replace('\r', ''))
            file_obj.close()

            # get the shebang from the content of the script
            shebang = run.job.script_content.split('\n', 1)[0]
            if not shebang.startswith('#!'):
                raise Exception(
                    'The first line of the job to run needs to '
                    'start with a shebang (#!). The current first line is: '
                    '"{0}"'.format(shebang))

            executable = '{0} {1}'.format(
                shebang.replace('#!', ''), file_path)
            sub_proc = subprocess.Popen(
                shlex.split(executable),
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT
            )
            run.patch({'pid': sub_proc.pid})
            did_run = True
            out, err = sub_proc.communicate()
        except Exception:
            logger.exception('The run failed to complete because of an error')
            out = ('[job runner worker] Could not execute job: ' +
                   traceback.format_exc())

        log_output = _truncate_log(out)
        logger.info('Run {0} ended'.format(run.resource_uri))

        run.reload()
        run_log = run.run_log

        if run_log:
            # handles the rare case when a job already has a log, but was
            # restarted (because the return_dts was never set)
            run_log.patch({
                'content': log_output,
            })
        else:
            run_log = RunLog(
                config.get('job_runner_worker', 'run_log_resource_uri'))
            run_log.post({
                'run': '{0}{1}/'.format(
                    config.get('job_runner_worker', 'run_resource_uri'),
                    run.id
                ),
                'content': log_output,
            })

        run.patch({
            'return_dts': datetime.now(utc).isoformat(' '),
            'return_success':
                False if did_run is False or sub_proc.returncode else True,
        })
        event_queue.put(json.dumps(
            {'event': 'returned', 'run_id': run.id, 'kind': 'run'}))

        if file_path:
            os.remove(file_path)
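
# Illustrative example of a valid job script: execute_run() requires the
# first line to be a shebang, strips the '#!' prefix, and uses the remainder
# as the interpreter command for the temp file it writes.
EXAMPLE_SCRIPT = (
    '#!/usr/bin/env bash\n'
    'echo "Hello world!"\n'
)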