def remove_proxy_route(app_logger, uuidcode, app_hub_url_proxy_route,
                       jhubtoken, username, server_name):
    """Ask JupyterHub to delete the proxy route of a user's server.

    A DELETE request is sent to ``<app_hub_url_proxy_route>/<username>``
    (plus ``/<server_name>`` if a server name is given). While JupyterHub
    answers with 503 the request is repeated, up to 10 times in total.

    Args:
        app_logger: Logger used for trace/debug/info/warning/exception output.
        uuidcode: Identifier that prefixes every log line and is sent as header.
        app_hub_url_proxy_route: Base URL of the proxy route endpoint.
        jhubtoken: Token sent in the ``Authorization`` header.
        username: User name appended to the URL.
        server_name: Server name. If it contains ``:``, only the second
            ``:``-separated field is used.

    Returns:
        True if JupyterHub answered with 200, otherwise None. Errors are
        logged and never raised to the caller.
    """
    app_logger.debug(
        "uuidcode={} - Remove proxys from server_name, because the original host is not accessable any longer"
        .format(uuidcode))
    hub_header = {
        "Authorization": "token {}".format(jhubtoken),
        "uuidcode": uuidcode,
        "Intern-Authorization": get_jhubtoken()
    }
    # Bound before the try block so that the handlers below can always log it.
    url = app_hub_url_proxy_route
    max_tries = 10
    try:
        app_logger.info("uuidcode={} - Remove Proxys for {}".format(
            uuidcode, server_name))
        if ':' in server_name:
            server_name = server_name.split(':')[1]
        url = url + '/' + username
        if server_name != '':
            url = url + '/' + server_name
        app_logger.trace("uuidcode={} - Delete Proxy Route: {} {}".format(
            uuidcode, url, hub_header))
        for i in range(max_tries):
            with closing(
                    requests.delete(url,
                                    headers=hub_header,
                                    verify=False,
                                    timeout=1800)) as r:
                if r.status_code == 200:
                    app_logger.info(
                        "uuidcode={} - Proxy route deletion successful".format(
                            uuidcode))
                    return True
                elif r.status_code == 503:
                    app_logger.info(
                        "uuidcode={} - Proxy route deletion status_code 503. Try again (Try {}/10)"
                        .format(uuidcode, i + 1))
                else:
                    raise Exception(
                        "uuidcode={} - Could not remove proxy route for server_name {}: {} {}"
                        .format(uuidcode, server_name, r.text, r.status_code))
        # Every attempt was answered with 503: make the give-up visible.
        app_logger.warning(
            "uuidcode={} - Proxy route deletion still answered with status_code 503 after {} tries. Give up."
            .format(uuidcode, max_tries))
    except requests.exceptions.Timeout:
        # Timeout is the common base of ConnectTimeout and ReadTimeout, so
        # both kinds of timeout end up with the timeout message.
        app_logger.exception(
            "uuidcode={} - Timeout reached (1800). Could not remove routes from proxy via JupyterHub. Url: {} Headers: {}"
            .format(uuidcode, url, remove_secret(hub_header)))
    except Exception:
        app_logger.exception(
            "uuidcode={} - Could not remove routes from proxy via JupyterHub. Url: {} Headers: {}"
            .format(uuidcode, url, remove_secret(hub_header)))
def abort_job(app_logger, uuidcode, kernelurl, unicore_header, cert):
    """Send an abort request for a UNICORE/X job (best effort).

    A POST with body ``{}`` is sent to ``<kernelurl>/actions/abort``. A
    status code outside 200-299 is logged as a warning. On success the
    ``X-UNICORE-SecuritySession`` value of the response, if present, is
    stored in ``unicore_header``.

    Args:
        app_logger: Logger used for debug/info/warning/exception output.
        uuidcode: Identifier that prefixes every log line.
        kernelurl: URL of the UNICORE/X job.
        unicore_header: Request headers; updated in place on success.
        cert: Passed to the request as ``certificate``.

    Returns:
        None. Errors are logged and never raised to the caller.
    """
    app_logger.debug(
        "uuidcode={} - Try to abort job with kernelurl: {}".format(
            uuidcode, kernelurl))
    try:
        # If the API of UNICORE changes, an additional GET call on kernelurl
        # might be necessary to look up the abort link
        # (json['_links']['action:abort']['href']). Since the action:abort
        # url is (right now) always kernelurl + /actions/abort we just use it.
        method = "POST"
        method_args = {
            "url": kernelurl + '/actions/abort',
            "headers": unicore_header,
            "data": "{}",
            "certificate": cert
        }
        app_logger.info("uuidcode={} - Abort UNICORE/X Job {}".format(
            uuidcode, kernelurl))
        text, status_code, response_header = unicore_communication.request(
            app_logger, uuidcode, method, method_args)
        if status_code < 200 or status_code > 299:
            app_logger.warning(
                "uuidcode={} - Could not abort Job. Response from UNICORE/X: {} {} {}"
                .format(uuidcode, text, status_code,
                        remove_secret(response_header)))
        elif 'X-UNICORE-SecuritySession' in response_header:
            # Only refresh the session if the server sent one. A missing
            # header must not turn a successful abort into a logged failure.
            unicore_header['X-UNICORE-SecuritySession'] = response_header[
                'X-UNICORE-SecuritySession']
    except Exception:
        app_logger.exception(
            "uuidcode={} - Could not abort Job.".format(uuidcode))
def destroy_job(app_logger, uuidcode, kernelurl, unicore_header, cert):
    """Delete a UNICORE/X job (best effort).

    Sends a DELETE request to ``kernelurl``. A status code above 399 is
    logged as a warning; any exception is logged. Nothing is raised and
    nothing is returned.
    """
    app_logger.debug(
        "uuidcode={} - Try to destroy Job with kernelurl: {}".format(
            uuidcode, kernelurl))
    http_method = "DELETE"
    request_args = {
        "url": kernelurl,
        "headers": unicore_header,
        "certificate": cert
    }
    try:
        app_logger.info("uuidcode={} - Destroy UNICORE/X Job".format(uuidcode))
        body, code, headers = unicore_communication.request(
            app_logger, uuidcode, http_method, request_args)
        if code <= 399:
            # Deletion accepted, nothing to report.
            return
        app_logger.warning(
            "uuidcode={} - Could not destroy job. WorkDirectory may still exist. UNICORE/X Response: {} {} {}"
            .format(uuidcode, body, code, remove_secret(headers)))
    except:
        app_logger.exception(
            "uuidcode={} - Could not destroy job.".format(uuidcode))
def token(app_logger, uuidcode, app_hub_url_proxy_route, app_hub_url_token,
          jhubtoken, accesstoken, expire, username, server_name):
    """Send a renewed access token to JupyterHub.

    A POST with ``{"accesstoken": ..., "expire": ...}`` is sent to
    ``<app_hub_url_token>/<username>`` (plus ``/<server_name>`` if given).
    201 counts as success. On 503 the proxy route is removed via
    ``remove_proxy_route`` and the request is repeated once. Any other
    status code is logged as a warning.

    Args:
        app_logger: Logger used for trace/debug/info/warning/exception output.
        uuidcode: Identifier that prefixes every log line and is sent as header.
        app_hub_url_proxy_route: Base URL handed to ``remove_proxy_route``.
        app_hub_url_token: Base URL of the token endpoint.
        jhubtoken: Token sent in the ``Authorization`` header.
        accesstoken: The new access token.
        expire: Expiration value of the token; sent as string.
        username: User name appended to the URL.
        server_name: Server name. If it contains ``:``, only the second
            ``:``-separated field is used.

    Returns:
        None. Errors are logged and never raised to the caller.
    """
    app_logger.debug(
        "uuidcode={} - Send new token to JupyterHub".format(uuidcode))
    app_logger.trace("uuidcode={} - Access_token: {} , expire: {}".format(
        uuidcode, accesstoken, expire))
    hub_header = {
        "Authorization": "token {}".format(jhubtoken),
        "uuidcode": uuidcode,
        "Intern-Authorization": get_jhubtoken()
    }
    hub_json = {"accesstoken": accesstoken, "expire": str(expire)}
    # Bound before the try block so that the handlers below can always log it.
    url = app_hub_url_token
    try:
        app_logger.info(
            "uuidcode={} - Update JupyterHub Token".format(uuidcode))
        if ':' in server_name:
            server_name = server_name.split(':')[1]
        url = url + '/' + username
        if server_name != '':
            url = url + '/' + server_name
        app_logger.trace(
            "uuidcode={} - Update JupyterHub Token: {} {} {}".format(
                uuidcode, url, hub_header, hub_json))
        with closing(
                requests.post(url,
                              headers=hub_header,
                              json=hub_json,
                              verify=False,
                              timeout=1800)) as r:
            if r.status_code == 201:
                app_logger.trace(
                    "uuidcode={} - Token Update successful: {} {} {}".format(
                        uuidcode, r.text, r.status_code, r.headers))
                return
            elif r.status_code == 503:
                remove_proxy_route(app_logger, uuidcode,
                                   app_hub_url_proxy_route, jhubtoken,
                                   username, server_name)
                # try again
                with closing(
                        requests.post(url,
                                      headers=hub_header,
                                      json=hub_json,
                                      verify=False,
                                      timeout=1800)) as r2:
                    if r2.status_code == 201:
                        app_logger.trace(
                            "uuidcode={} - Token Update successful: {} {} {}".
                            format(uuidcode, r2.text, r2.status_code,
                                   r2.headers))
                        return
                    else:
                        app_logger.warning(
                            "uuidcode={} - Token Update sent wrong status_code: {} {} {}"
                            .format(uuidcode, r2.text, r2.status_code,
                                    remove_secret(r2.headers)))
            else:
                app_logger.warning(
                    "uuidcode={} - Token Update sent wrong status_code: {} {} {}"
                    .format(uuidcode, r.text, r.status_code,
                            remove_secret(r.headers)))
    except requests.exceptions.Timeout:
        # Timeout is the common base of ConnectTimeout and ReadTimeout, so
        # both kinds of timeout end up with the timeout message.
        app_logger.exception(
            "uuidcode={} - Timeout reached (1800). Could not send update token to JupyterHub. Url: {} Headers: {}"
            .format(uuidcode, url, remove_secret(hub_header)))
    except Exception:
        app_logger.exception(
            "uuidcode={} - Could not send update token to JupyterHub. Url: {} Headers: {}"
            .format(uuidcode, url, remove_secret(hub_header)))
def cancel(app_logger, uuidcode, app_hub_url_proxy_route, app_hub_url_cancel,
           jhubtoken, errormsg, username, server_name=''):
    """Tell JupyterHub to cancel a user's server.

    A DELETE request is sent to ``<app_hub_url_cancel>/<username>`` (plus
    ``/<server_name>`` if given); ``errormsg`` travels in the ``Error``
    header. 202 counts as success. On 503 the proxy route is removed via
    ``remove_proxy_route`` and the request is repeated once. Any other
    status code is logged as a warning.

    If the request fails with an exception other than a timeout and
    ``errormsg`` is not empty, the call is repeated once with an empty
    error message.

    Args:
        app_logger: Logger used for trace/debug/warning/exception output.
        uuidcode: Identifier that prefixes every log line and is sent as header.
        app_hub_url_proxy_route: Base URL handed to ``remove_proxy_route``.
        app_hub_url_cancel: Base URL of the cancel endpoint.
        jhubtoken: Token sent in the ``Authorization`` header.
        errormsg: Message sent in the ``Error`` header.
        username: User name appended to the URL.
        server_name: Server name. If it contains ``:``, only the second
            ``:``-separated field is used.

    Returns:
        None. Errors are logged and never raised to the caller.
    """
    app_logger.debug(
        "uuidcode={} - Send cancel to JupyterHub".format(uuidcode))
    hub_header = {
        "Authorization": "token {}".format(jhubtoken),
        "Intern-Authorization": get_jhubtoken(),
        "uuidcode": uuidcode,
        "Error": errormsg,
        "Stopped": "True"
    }
    # Bound before the try block so that the handlers below can always log it.
    url = app_hub_url_cancel
    try:
        if ':' in server_name:
            server_name = server_name.split(':')[1]
        url = url + '/' + username
        if server_name != '':
            url = url + '/' + server_name
        app_logger.trace("uuidcode={} - Cancel JupyterHub: {} {}".format(
            uuidcode, url, hub_header))
        with closing(
                requests.delete(url,
                                headers=hub_header,
                                verify=False,
                                timeout=1800)) as r:
            if r.status_code == 202:
                app_logger.trace(
                    "uuidcode={} - Cancel successful: {} {} {}".format(
                        uuidcode, r.text, r.status_code, r.headers))
                return
            elif r.status_code == 503:
                remove_proxy_route(app_logger, uuidcode,
                                   app_hub_url_proxy_route, jhubtoken,
                                   username, server_name)
                # try again
                with closing(
                        requests.delete(url,
                                        headers=hub_header,
                                        verify=False,
                                        timeout=1800)) as r2:
                    if r2.status_code == 202:
                        app_logger.trace(
                            "uuidcode={} - Cancel successful: {} {} {}".format(
                                uuidcode, r2.text, r2.status_code,
                                r2.headers))
                        return
                    else:
                        app_logger.warning(
                            "uuidcode={} - JupyterHub.cancel sent wrong status_code: {} {} {}"
                            .format(uuidcode, r2.text, r2.status_code,
                                    remove_secret(r2.headers)))
            else:
                app_logger.warning(
                    "uuidcode={} - JupyterHub.cancel sent wrong status_code: {} {} {}"
                    .format(uuidcode, r.text, r.status_code,
                            remove_secret(r.headers)))
    except requests.exceptions.Timeout:
        # Timeout is the common base of ConnectTimeout and ReadTimeout. A
        # read timeout is reported here instead of triggering the retry
        # below, which could block for another 1800 seconds.
        app_logger.exception(
            "uuidcode={} - Timeout reached (1800). Could not send cancel to JupyterHub. Url: {}, Headers: {}"
            .format(uuidcode, url, remove_secret(hub_header)))
    except Exception:
        if errormsg != "":
            # NOTE(review): presumably the error message can be rejected as
            # a header value - retry once without it. Confirm the intent.
            cancel(app_logger, uuidcode, app_hub_url_proxy_route,
                   app_hub_url_cancel, jhubtoken, "", username, server_name)
        else:
            app_logger.exception(
                "uuidcode={} - Could not send cancel to JupyterHub. Url: {}, Headers: {}"
                .format(uuidcode, url, remove_secret(hub_header)))
def status(app_logger, uuidcode, app_hub_url_proxy_route, app_hub_url_status,
           jhubtoken, status, username, server_name=''):
    """Send a status update for a user's server to JupyterHub.

    A POST with ``{"Status": status}`` is sent to
    ``<app_hub_url_status>/<username>`` (plus ``/<server_name>`` if given).
    201 counts as success. On 503 the proxy route is removed via
    ``remove_proxy_route`` and the request is repeated once. A 404 is
    logged at info level; other status codes are not logged.

    Args:
        app_logger: Logger used for trace/debug/info/exception output.
        uuidcode: Identifier that prefixes every log line and is sent as header.
        app_hub_url_proxy_route: Base URL handed to ``remove_proxy_route``.
        app_hub_url_status: Base URL of the status endpoint.
        jhubtoken: Token sent in the ``Authorization`` header.
        status: Status value to report.
        username: User name appended to the URL.
        server_name: Server name. If it contains ``:``, only the second
            ``:``-separated field is used.

    Returns:
        None. Errors are logged and never raised to the caller.
    """
    app_logger.debug("uuidcode={} - Send status to JupyterHub: {}".format(
        uuidcode, status))
    hub_header = {
        "Authorization": "token {}".format(jhubtoken),
        "uuidcode": uuidcode,
        "Intern-Authorization": get_jhubtoken()
    }
    hub_json = {"Status": status}
    # Bound before the try block so that the handlers below can always log it.
    url = app_hub_url_status
    try:
        if ':' in server_name:
            server_name = server_name.split(':')[1]
        url = url + '/' + username
        if server_name != '':
            url = url + '/' + server_name
        app_logger.trace(
            "uuidcode={} - Update JupyterHub Status: {} {} {}".format(
                uuidcode, url, hub_header, hub_json))
        with closing(
                requests.post(url,
                              headers=hub_header,
                              json=hub_json,
                              verify=False,
                              timeout=1800)) as r:
            if r.status_code == 201:
                app_logger.trace(
                    "uuidcode={} - Status Update successful: {} {} {}".format(
                        uuidcode, r.text, r.status_code, r.headers))
                return
            elif r.status_code == 503:
                remove_proxy_route(app_logger, uuidcode,
                                   app_hub_url_proxy_route, jhubtoken,
                                   username, server_name)
                # try again
                with closing(
                        requests.post(url,
                                      headers=hub_header,
                                      json=hub_json,
                                      verify=False,
                                      timeout=1800)) as r2:
                    if r2.status_code == 201:
                        app_logger.trace(
                            "uuidcode={} - Status Update successful: {} {} {}".
                            format(uuidcode, r2.text, r2.status_code,
                                   r2.headers))
                        return
                    elif r2.status_code == 404:
                        app_logger.info(
                            "uuidcode={} - JupyterHub doesn't know the spawner."
                            .format(uuidcode))
            elif r.status_code == 404:
                app_logger.info(
                    "uuidcode={} - JupyterHub doesn't know the spawner.".
                    format(uuidcode))
    except requests.exceptions.Timeout:
        # Timeout is the common base of ConnectTimeout and ReadTimeout, so
        # both kinds of timeout end up with the timeout message.
        app_logger.exception(
            "uuidcode={} - Timeout reached (1800). Could not send status update to JupyterHub. Url: {} Headers: {}"
            .format(uuidcode, url, remove_secret(hub_header)))
    except Exception:
        app_logger.exception(
            "uuidcode={} - Could not send status update to JupyterHub. Url: {} Headers: {}"
            .format(uuidcode, url, remove_secret(hub_header)))
def create(app_logger, uuidcode, app_hub_url_proxy_route, app_tunnel_url,
           app_hub_url_cancel, kernelurl, filedir, unicore_header, servername,
           system, port, cert, jhubtoken, username, servername_short,
           app_orchestrator_url_hostname):
    """Build the tunnel to the host a UNICORE/X job is running on.

    Steps:
      1. Read ``<filedir>/.host`` via UNICORE/X to get the hostname.
      2. Report the hostname to J4J_Orchestrator (failure is only logged).
      3. Start the tunnel through ``tunnel_communication.j4j_start_tunnel``.
      4. Write the port to ``<filedir>/.tunnel`` via UNICORE/X (expects 204).

    If step 1 or 4 fails, a cancel is sent to JupyterHub, the job is
    aborted and destroyed, and an Exception is raised.

    While this function runs, ``unicore_header['Accept']`` is set to
    ``application/octet-stream``. The previous value (or absence) is
    restored on every exit path, including failures.

    Args:
        app_logger: Logger used for trace/warning/exception output.
        uuidcode: Identifier that prefixes every log line.
        app_hub_url_proxy_route: Passed on to ``hub_communication.cancel``.
        app_tunnel_url: URL passed to ``j4j_start_tunnel``.
        app_hub_url_cancel: Passed on to ``hub_communication.cancel``.
        kernelurl: URL of the UNICORE/X job (used for abort/destroy).
        filedir: URL of the job's file directory.
        unicore_header: Request headers; the security session is updated in
            place after successful requests.
        servername: Server name stored as ``account`` in the tunnel data and
            reported to the orchestrator.
        system: System name; selects the hostname shortening rule.
        port: Port written to ``.tunnel`` and sent to the tunnel service.
        cert: Passed to UNICORE/X requests as ``certificate``.
        jhubtoken: Passed on to ``hub_communication.cancel``.
        username: Passed on to ``hub_communication.cancel``.
        servername_short: Server name passed to ``hub_communication.cancel``.
        app_orchestrator_url_hostname: URL passed to ``set_hostname``.

    Raises:
        Exception: If the hostname could not be read or the ``.tunnel`` file
            could not be created. Whatever ``j4j_start_tunnel`` raises
            propagates unchanged.
    """
    app_logger.trace("uuidcode={} - Try to create a tunnel".format(uuidcode))
    accept = unicore_header.get('Accept', False)
    unicore_header['Accept'] = 'application/octet-stream'
    try:
        hostname = ""
        try:
            method = "GET"
            method_args = {
                "url": filedir + '/.host',
                "headers": unicore_header,
                "certificate": cert,
                "return_content": True
            }
            content, status_code, response_header = unicore_communication.request(
                app_logger, uuidcode, method, method_args)
            if status_code != 200:
                app_logger.warning(
                    "uuidcode={} - Could not get hostname. UNICORE/X Response: {} {} {}"
                    .format(uuidcode, content, status_code,
                            remove_secret(response_header)))
                raise Exception(
                    "{} - Could not get hostname. Throw exception because of wrong status_code: {}"
                    .format(uuidcode, status_code))
            else:
                unicore_header['X-UNICORE-SecuritySession'] = response_header[
                    'X-UNICORE-SecuritySession']
                hostname = content.strip()
        except:
            # Deliberately broad: the cleanup below has to run for any failure.
            app_logger.exception(
                "uuidcode={} - Could not get hostname. {} {}".format(
                    uuidcode, method, remove_secret(method_args)))
            app_logger.warning(
                "uuidcode={} - Send cancel to JupyterHub.".format(uuidcode))
            hub_communication.cancel(
                app_logger, uuidcode, app_hub_url_proxy_route,
                app_hub_url_cancel, jhubtoken,
                "A mandatory backend service had a problem. An administrator is informed.",
                username, servername_short)
            unicore_utils.abort_job(app_logger, uuidcode, kernelurl,
                                    unicore_header, cert)
            unicore_utils.destroy_job(app_logger, uuidcode, kernelurl,
                                      unicore_header, cert)
            raise Exception("{} - Could not get hostname".format(uuidcode))
        app_logger.trace(
            'uuidcode={} - Inform J4J_Orchestrator about the hostname'.format(
                uuidcode))
        try:
            orchestrator_communication.set_hostname(
                app_logger, uuidcode, app_orchestrator_url_hostname,
                servername, hostname)
        except:
            app_logger.exception(
                "uuidcode={} - Could not set hostname to {} in J4J_Orchestrator database for {}"
                .format(uuidcode, hostname, servername))
        tunnel_header = {
            'Intern-Authorization': utils_file_loads.get_j4j_tunnel_token(),
            'uuidcode': uuidcode
        }
        # Compute nodes of these systems are addressed by the part of the
        # hostname before the first dot.
        if system == 'JUWELS' and hostname[:3] == 'jwc':
            hostname = hostname.split('.')[0]
        if system == 'JURON' and hostname[:6] == 'juronc':
            hostname = hostname.split('.')[0]
        tunnel_data = {
            'account': servername,  # for internal tunnel database
            'system': system,
            'hostname': hostname,
            'port': port
        }
        tunnel_communication.j4j_start_tunnel(app_logger, uuidcode,
                                              app_tunnel_url, tunnel_header,
                                              tunnel_data)
        try:
            method = "PUT"
            method_args = {
                "url": filedir + '/.tunnel',
                "headers": unicore_header,
                "data": '{}'.format(port),
                "certificate": cert
            }
            text, status_code, response_header = unicore_communication.request(
                app_logger, uuidcode, method, method_args)
            if status_code != 204:
                app_logger.warning(
                    "uuidcode={} - Could not create .tunnel file. UNICORE/X Response: {} {} {}"
                    .format(uuidcode, text, status_code,
                            remove_secret(response_header)))
                raise Exception(
                    "{} - Could not create .tunnel file. Throw Exception because of wrong status_code: {}"
                    .format(uuidcode, status_code))
            else:
                unicore_header['X-UNICORE-SecuritySession'] = response_header[
                    'X-UNICORE-SecuritySession']
        except:
            # Deliberately broad: the cleanup below has to run for any failure.
            app_logger.exception(
                "uuidcode={} - Could not create .tunnel file. {} {}".format(
                    uuidcode, method, remove_secret(method_args)))
            app_logger.warning(
                "uuidcode={} - Send cancel to JupyterHub.".format(uuidcode))
            hub_communication.cancel(
                app_logger, uuidcode, app_hub_url_proxy_route,
                app_hub_url_cancel, jhubtoken,
                "A mandatory backend service had a problem. An administrator is informed.",
                username, servername_short)
            unicore_utils.abort_job(app_logger, uuidcode, kernelurl,
                                    unicore_header, cert)
            unicore_utils.destroy_job(app_logger, uuidcode, kernelurl,
                                      unicore_header, cert)
            raise Exception(
                "{} - Could not create .tunnel file.".format(uuidcode))
    finally:
        # Give the caller its header dict back the way it was, also when
        # one of the steps above raised.
        if accept:
            unicore_header['Accept'] = accept
        else:
            unicore_header.pop('Accept', None)
def get(app_logger, uuidcode, request_headers, unicore_header, app_urls, cert):
    """Poll a UNICORE/X job until it is running or stopped and tell JupyterHub.

    Every polling round sleeps 3 seconds, renews the access token if
    necessary, reads the job properties (up to ``max_tries`` attempts) and
    lists the job's file directory:

    * ``.end`` in the directory: status ``stopped`` is sent to JupyterHub.
    * ``.host`` in the directory: the tunnel is created, then status
      ``running`` is sent.
    * neither: status ``waitforhostname`` is sent and polling continues.

    Args:
        app_logger: Logger used for trace/debug/info/warning/error/exception.
        uuidcode: Identifier that prefixes every log line.
        request_headers: Mapping with the request values read here
            (``servername``, ``accesstoken``, ``expire``, ``tokenurl``,
            ``authorizeurl``, ``refreshtoken``, ``jhubtoken``,
            ``escapedusername``, ``kernelurl``, ``filedir``, ``system``,
            ``port``, ``spawning``).
        unicore_header: Headers for UNICORE/X requests; ``Authorization`` and
            the security session are updated in place.
        app_urls: Nested mapping with the ``hub``, ``orchestrator`` and
            ``tunnel`` URLs.
        cert: Passed to UNICORE/X requests as ``certificate``.

    Returns:
        ``("", 539)`` if the job properties or the file directory could not
        be read, ``("", 530)`` if UNICORE/X reports the job as finished or
        failed, otherwise None (also after an unexpected exception, which
        is only logged).
    """
    # Attempts to read the job properties per polling round. The
    # "give up" branches below are tied to this value so that they are
    # actually reached on the last attempt.
    max_tries = 3
    try:
        servername = request_headers.get('servername')
        if ':' in servername:
            servername = servername.split(':')[1]
        else:
            servername = ''
        counter = 0
        children = []
        status = ''
        accesstoken = request_headers.get('accesstoken')
        expire = request_headers.get('expire')
        while True:
            # start with sleep, this function is only called, if .host was not in children
            time.sleep(3)
            # renew token. This may be run for a long time, so the accesstoken can expire
            accesstoken, expire = renew_token(
                app_logger, uuidcode, request_headers.get("tokenurl"),
                request_headers.get("authorizeurl"),
                request_headers.get("refreshtoken"), accesstoken, expire,
                request_headers.get('jhubtoken'),
                app_urls.get('hub', {}).get('url_proxy_route'),
                app_urls.get('hub', {}).get('url_token'),
                request_headers.get('escapedusername'),
                request_headers.get('servername'))
            unicore_header['Authorization'] = 'Bearer {}'.format(accesstoken)
            for i in range(max_tries):
                last_try = i == max_tries - 1
                properties_json = {}
                try:
                    method = "GET"
                    method_args = {
                        "url": request_headers.get('kernelurl'),
                        "headers": unicore_header,
                        "certificate": cert
                    }
                    app_logger.info(
                        "uuidcode={} - Get Properties of UNICORE/X Job {}".
                        format(uuidcode, request_headers.get('kernelurl')))
                    text, status_code, response_header = unicore_communication.request(
                        app_logger, uuidcode, method, method_args)
                    if status_code == 200:
                        unicore_header[
                            'X-UNICORE-SecuritySession'] = response_header[
                                'X-UNICORE-SecuritySession']
                        properties_json = json.loads(text)
                        if properties_json.get(
                                'status') == 'UNDEFINED' and not last_try:
                            app_logger.debug(
                                "uuidcode={} - Received status UNDEFINED. Try again in 2 seconds"
                                .format(uuidcode))
                            time.sleep(2)
                        else:
                            break
                    elif status_code == 404:
                        if not last_try:
                            app_logger.debug(
                                "uuidcode={} - Could not get properties. 404 Not found. Sleep for 2 seconds and try again"
                                .format(uuidcode))
                            time.sleep(2)
                        else:
                            orchestrator_communication.set_skip(
                                app_logger, uuidcode,
                                app_urls.get('orchestrator',
                                             {}).get('url_skip'),
                                request_headers.get('servername'), 'False')
                            app_logger.error(
                                "uuidcode={} - Could not get properties. 404 Not found. Do nothing and return. {} {} {}"
                                .format(uuidcode, text, status_code,
                                        remove_secret(response_header)))
                            return "", 539
                    elif status_code == 500:
                        if not last_try:
                            app_logger.debug(
                                "uuidcode={} - Could not get properties. Sleep for 2 seconds and try again"
                                .format(uuidcode))
                            time.sleep(2)
                        else:
                            app_logger.error(
                                "uuidcode={} - Could not get properties. system: {}"
                                .format(
                                    uuidcode,
                                    request_headers.get(
                                        'system', '<system_unknown>')))
                            app_logger.warning(
                                "uuidcode={} - Could not get properties. UNICORE/X Response: {} {} {}"
                                .format(uuidcode, text, status_code,
                                        remove_secret(response_header)))
                            app_logger.warning(
                                "uuidcode={} - Do not send update to JupyterHub."
                                .format(uuidcode))
                            # If JupyterHub don't receives an update for a long time it can stop the job itself.
                            orchestrator_communication.set_skip(
                                app_logger, uuidcode,
                                app_urls.get('orchestrator',
                                             {}).get('url_skip'),
                                request_headers.get('servername'), 'False')
                            return "", 539
                    else:
                        app_logger.error(
                            "uuidcode={} - Unknown status_code. Add case for this"
                            .format(uuidcode))
                        if not last_try:
                            app_logger.debug(
                                "uuidcode={} - Could not get properties. Sleep for 2 seconds and try again"
                                .format(uuidcode))
                            time.sleep(2)
                        else:
                            app_logger.warning(
                                "uuidcode={} - Could not get properties. UNICORE/X Response: {} {} {}"
                                .format(uuidcode, text, status_code,
                                        remove_secret(response_header)))
                            raise Exception(
                                "{} - Could not get properties. Throw exception because of wrong status_code: {}"
                                .format(uuidcode, status_code))
                except:
                    # Deliberately broad: any failure here ends with an
                    # attempt to stop the job.
                    orchestrator_communication.set_skip(
                        app_logger, uuidcode,
                        app_urls.get('orchestrator', {}).get('url_skip'),
                        request_headers.get('servername'), 'False')
                    app_logger.exception(
                        "uuidcode={} - Could not get properties. Try to stop it {} {}"
                        .format(uuidcode, method, remove_secret(method_args)))
                    app_logger.trace(
                        "uuidcode={} - Call stop_job".format(uuidcode))
                    try:
                        stop_job(
                            app_logger, uuidcode, servername,
                            request_headers.get('system'), request_headers,
                            app_urls, True,
                            "Jupyter@JSC backend error. An administrator is informed. Please try again in a few minutes."
                        )
                    except:
                        app_logger.exception(
                            "uuidcode={} - Could not stop Job. It may still run"
                            .format(uuidcode))
                    return "", 539
            if properties_json.get('status') in [
                    'SUCCESSFUL', 'ERROR', 'FAILED', 'NOT_SUCCESSFUL'
            ]:
                # Job is Finished for UNICORE, so it should be for JupyterHub
                orchestrator_communication.set_skip(
                    app_logger, uuidcode,
                    app_urls.get('orchestrator', {}).get('url_skip'),
                    request_headers.get('servername'), 'False')
                if not properties_json.get(
                        'statusMessage') == 'Job was aborted by the user.':
                    app_logger.error(
                        'uuidcode={} - Get: Job is finished or failed - JobStatus: {}. Send Information to JHub.\n{}'
                        .format(uuidcode, properties_json.get('status'),
                                properties_json))
                app_logger.trace(
                    "uuidcode={} - Call stop_job".format(uuidcode))
                error_msg = ""
                try:
                    mem = utils_file_loads.map_error_messages()
                    if properties_json.get('status') in [
                            'FAILED'
                    ] and properties_json.get('statusMessage') in mem.keys():
                        error_msg = mem.get(
                            properties_json.get('statusMessage', ''),
                            "Could not start your Job. Please check your configuration. An administrator is informed."
                        )
                    else:
                        app_logger.error(
                            "uuidcode={} - StatusMessage from Failed UNICORE Job not found in /etc/j4j/j4j_mount/j4j_unicore/map_error_messages.json. Please update to have a better user experience"
                            .format(uuidcode))
                        error_msg = "Could not start your Job. Please check your configuration. An administrator is informed."
                except:
                    error_msg = "Could not start your Job. Please check your configuration. An administrator is informed."
                try:
                    stop_job(app_logger, uuidcode, servername,
                             request_headers.get('system'), request_headers,
                             app_urls, True, error_msg)
                except:
                    app_logger.exception(
                        "uuidcode={} - Could not stop Job. It may still run".
                        format(uuidcode))
                return "", 530
            try:
                method = "GET"
                method_args = {
                    "url": request_headers.get('filedir'),
                    "headers": unicore_header,
                    "certificate": cert
                }
                text, status_code, response_header = unicore_communication.request(
                    app_logger, uuidcode, method, method_args)
                if status_code == 200:
                    unicore_header[
                        'X-UNICORE-SecuritySession'] = response_header[
                            'X-UNICORE-SecuritySession']
                    # in UNICORE 8 the answer is a bit different
                    children_json = json.loads(text)
                    if 'children' in children_json.keys():
                        children = json.loads(text).get('children', [])
                    elif 'content' in children_json.keys():
                        children = list(
                            json.loads(text).get('content', {}).keys())
                    else:
                        app_logger.warning(
                            "uuidcode={} - Could not find any childrens in {}".
                            format(uuidcode, text))
                        children = []
                elif status_code == 404:
                    orchestrator_communication.set_skip(
                        app_logger, uuidcode,
                        app_urls.get('orchestrator', {}).get('url_skip'),
                        request_headers.get('servername'), 'False')
                    app_logger.warning(
                        "uuidcode={} - Could not get properties. 404 Not found. Do nothing and return. {} {} {}"
                        .format(uuidcode, text, status_code,
                                remove_secret(response_header)))
                    return "", 539
                else:
                    app_logger.warning(
                        "uuidcode={} - Could not get information about filedirectory. UNICORE/X Response: {} {} {}"
                        .format(uuidcode, text, status_code,
                                remove_secret(response_header)))
                    raise Exception(
                        "{} - Could not get information about filedirectory. Throw Exception because of wrong status_code: {}"
                        .format(uuidcode, status_code))
            except:
                counter += 1
                if counter > 10:
                    app_logger.error(
                        "uuidcode={} - Get filelist ({}) failed 10 times over 30 seconds. {} {}"
                        .format(uuidcode, request_headers.get('filedir'),
                                method, remove_secret(method_args)))
                    app_logger.trace(
                        "uuidcode={} - Call stop_job".format(uuidcode))
                    try:
                        stop_job(app_logger, uuidcode, servername,
                                 request_headers.get('system'),
                                 request_headers, app_urls)
                    except:
                        app_logger.exception(
                            "uuidcode={} - Could not stop Job. It may still run"
                            .format(uuidcode))
                    # NOTE(review): there is no return after stop_job, so
                    # polling continues - confirm that this is intended.
                app_logger.info(
                    "uuidcode={} - Get filelist ({}) failed {} time(s)".format(
                        uuidcode, request_headers.get('filedir'), counter))
                hub_communication.status(
                    app_logger, uuidcode,
                    app_urls.get('hub', {}).get('url_proxy_route'),
                    app_urls.get('hub', {}).get('url_status'),
                    request_headers.get('jhubtoken'), 'waitforhostname',
                    request_headers.get('escapedusername'), servername)
                continue
            if '.end' in children or '/.end' in children:
                # It's not running anymore
                status = 'stopped'
            elif '.host' in children or '/.host' in children:
                # running, build up tunnel
                try:
                    tunnel_utils.create(
                        app_logger, uuidcode,
                        app_urls.get('hub', {}).get('url_proxy_route'),
                        app_urls.get('tunnel', {}).get('url_tunnel'),
                        app_urls.get('hub', {}).get('url_cancel'),
                        request_headers.get('kernelurl'),
                        request_headers.get('filedir'), unicore_header,
                        request_headers.get('servername'),
                        request_headers.get('system'),
                        request_headers.get('port'), cert,
                        request_headers.get('jhubtoken'),
                        request_headers.get('escapedusername'), servername,
                        app_urls.get('orchestrator', {}).get('url_hostname'))
                except:
                    orchestrator_communication.set_skip(
                        app_logger, uuidcode,
                        app_urls.get('orchestrator', {}).get('url_skip'),
                        request_headers.get('servername'), 'False')
                    app_logger.exception(
                        "uuidcode={} - Could not create tunnel".format(
                            uuidcode))
                    app_logger.trace(
                        "uuidcode={} - Call stop_job".format(uuidcode))
                    try:
                        stop_job(app_logger, uuidcode, servername,
                                 request_headers.get('system'),
                                 request_headers, app_urls)
                    except:
                        app_logger.exception(
                            "uuidcode={} - Could not stop Job. It may still run"
                            .format(uuidcode))
                    return
                status = "running"
            else:
                app_logger.info(
                    "uuidcode={} - Update JupyterHub status ({})".format(
                        uuidcode, "waitforhostname"))
                hub_communication.status(
                    app_logger, uuidcode,
                    app_urls.get('hub', {}).get('url_proxy_route'),
                    app_urls.get('hub', {}).get('url_status'),
                    request_headers.get('jhubtoken'), "waitforhostname",
                    request_headers.get('escapedusername'), servername)
                continue
            app_logger.info(
                "uuidcode={} - Update JupyterHub status ({})".format(
                    uuidcode, status))
            hub_communication.status(
                app_logger, uuidcode,
                app_urls.get('hub', {}).get('url_proxy_route'),
                app_urls.get('hub', {}).get('url_status'),
                request_headers.get('jhubtoken'), status,
                request_headers.get('escapedusername'), servername)
            if status in ['running', 'stopped'] and request_headers.get(
                    'spawning', 'true').lower() == 'true':
                # spawning is finished
                app_logger.trace(
                    'uuidcode={} - Tell J4J_Orchestrator that the spawning is done'
                    .format(uuidcode))
                try:
                    orchestrator_communication.set_spawning(
                        app_logger, uuidcode,
                        app_urls.get('orchestrator', {}).get('url_spawning'),
                        request_headers.get('servername'), 'False')
                except:
                    app_logger.exception(
                        "uuidcode={} - Could not set spawning to false in J4J_Orchestrator database for {}"
                        .format(uuidcode, request_headers.get('servername')))
            orchestrator_communication.set_skip(
                app_logger, uuidcode,
                app_urls.get('orchestrator', {}).get('url_skip'),
                request_headers.get('servername'), 'False')
            return
    except:
        app_logger.exception("uuidcode={} - Bugfix required".format(uuidcode))
def copy_log(app_logger, uuidcode, unicore_header, filedir, kernelurl, cert):
    """Copy all files of a UNICORE/X job directory to the local disk.

    The files are stored below
    ``/etc/j4j/j4j_mount/jobs/<last part of kernelurl>_<timestamp>``; if
    that directory already exists a random suffix is appended (up to 10
    attempts). A file that cannot be downloaded (status != 200) is skipped;
    an exception while handling a file stops the copy loop.

    While files are downloaded ``unicore_header['Accept']`` is set to
    ``application/octet-stream``; the previous value (or absence) is
    restored afterwards.

    Args:
        app_logger: Logger used for debug/info/warning/exception output.
        uuidcode: Identifier that prefixes every log line.
        unicore_header: Headers for UNICORE/X requests; the security session
            is updated in place.
        filedir: URL of the job's file directory.
        kernelurl: URL of the UNICORE/X job.
        cert: Passed to UNICORE/X requests as ``certificate``.

    Returns:
        The stripped content of the ``.host`` file, ``""`` if no such file
        was copied, or None if no directory could be created or the file
        list could not be read.
    """
    app_logger.debug("uuidcode={} - Copy Log from {}".format(
        uuidcode, kernelurl))
    # in this directory we will write the complete log from the started server.
    directory = '/etc/j4j/j4j_mount/jobs/{}_{}'.format(
        kernelurl.split('/')[-1],
        datetime.datetime.today().strftime('%Y_%m_%d-%H_%M_%S'))
    for i in range(10):
        if os.path.exists(directory):
            add_uuid = uuid.uuid4().hex
            directory = directory + '_' + add_uuid
        if not os.path.exists(directory):
            os.makedirs(directory)
            break
        if i == 9:
            app_logger.warning(
                "uuidcode={} - Could not find a directory to save files".
                format(uuidcode))
            return
    app_logger.debug("uuidcode={} - Copy Log to {}".format(
        uuidcode, directory))
    # Get children list
    try:
        app_logger.info(
            "uuidcode={} - Get list of files of UNICORE/X Job".format(
                uuidcode))
        text, status_code, response_header = unicore_communication.request(
            app_logger, uuidcode, "GET", {
                "url": filedir,
                "headers": unicore_header,
                "certificate": cert
            })
        if status_code != 200:
            app_logger.warning(
                "uuidcode={} - Could not save files from {}. Response from UNICORE: {} {} {}"
                .format(uuidcode, kernelurl, text, status_code,
                        remove_secret(response_header)))
            return
        # in UNICORE 8 the answer is a bit different
        children_json = json.loads(text)
        if 'children' in children_json:
            children = children_json.get('children', [])
        elif 'content' in children_json:
            children = list(children_json.get('content', {}).keys())
        else:
            app_logger.warning(
                "uuidcode={} - Could not find any childrens in {}".format(
                    uuidcode, text))
            children = []
        unicore_header['X-UNICORE-SecuritySession'] = response_header[
            'X-UNICORE-SecuritySession']
    except:
        app_logger.exception(
            "uuidcode={} - Could not save files from {}".format(
                uuidcode, kernelurl))
        return
    # For the file input we need another Accept in the header, save the old one
    hostname = ""
    accept = unicore_header.get('Accept', False)
    unicore_header['Accept'] = 'application/octet-stream'
    app_logger.info("uuidcode={} - Save files in directory {}".format(
        uuidcode, directory))
    for child in children:
        try:
            content, status_code, response_header = unicore_communication.request(
                app_logger, uuidcode, "GET", {
                    "url": filedir + '/' + child,
                    "headers": unicore_header,
                    "certificate": cert,
                    "return_content": True
                })
            if status_code != 200:
                app_logger.warning(
                    "uuidcode={} - Could not save file {} from {}. Try next. Response from UNICORE: {} {} {}"
                    .format(uuidcode, child, kernelurl, content, status_code,
                            remove_secret(response_header)))
                continue
            # Write the text itself as UTF-8. str(content.encode("utf-8"))
            # would store the b'...' representation of the bytes instead.
            with open(directory + '/' + child, 'w', encoding='utf-8') as f:
                f.write(content)
            if child == ".host" or child == "/.host":
                hostname = content.strip()
        except:
            app_logger.exception(
                "uuidcode={} - Could not save file {} from {}".format(
                    uuidcode, child, kernelurl))
            break
    if accept:
        unicore_header['Accept'] = accept
    else:
        del unicore_header['Accept']
    app_logger.debug("uuidcode={} - Log from {} to {} copied".format(
        uuidcode, kernelurl, directory))
    return hostname
def _response_json(response):
    """Return the decoded JSON body of *response*, or {} if it is no valid JSON."""
    try:
        return response.json()
    except ValueError:
        return {}


def renew_token(app_logger,
                uuidcode,
                token_url,
                authorize_url,
                refreshtoken,
                accesstoken,
                expire,
                jhubtoken,
                app_hub_url_proxy_route,
                app_hub_url_token,
                username,
                servername=''):
    """Renew the access token if it expires within the next 60 seconds.

    If the token is still valid for more than 60 seconds, nothing is done.
    Otherwise the refresh token is posted to ``token_url``, the new
    expiration time is read from the token info endpoint configured for
    that URL, and the new token is sent to JupyterHub via
    ``hub_communication.token``.

    Args:
        app_logger: Logger used for trace/info/warning/error/exception output.
        uuidcode: Identifier that prefixes every log line.
        token_url: Token endpoint; if empty or None the default from the
            configuration returned by ``get_unity()`` is used.
        authorize_url: Key of the configuration entry that holds the scope.
        refreshtoken: Refresh token sent to the token endpoint.
        accesstoken: Current access token.
        expire: Expiration time of the current token (seconds since epoch).
        jhubtoken: Passed on to ``hub_communication.token``.
        app_hub_url_proxy_route: Passed on to ``hub_communication.token``.
        app_hub_url_token: Passed on to ``hub_communication.token``.
        username: Passed on to ``hub_communication.token``.
        servername: Passed on to ``hub_communication.token``.

    Returns:
        Tuple ``(accesstoken, expire)`` - unchanged if no renewal was needed.

    Raises:
        SpawnException: If the token endpoint answered with status 400.
        Exception: If the token could not be updated for any other reason.
    """
    if int(expire) - int(time.time()) > 60:
        return accesstoken, expire
    app_logger.info(
        "uuidcode={} - Renew Token: Expire at {} , time: {}".format(
            uuidcode, int(expire), int(time.time())))
    unity = get_unity()
    # "not token_url" also covers None, which is what a missing header yields.
    if not token_url:
        app_logger.warning(
            "uuidcode={} - Use default token_url. Please send token_url in header"
            .format(uuidcode))
        token_url = unity.get('links').get('token')
    tokeninfo_url = unity[token_url].get('links', {}).get('tokeninfo')
    cert_path = unity[token_url].get('certificate', False)
    scope = ' '.join(unity[authorize_url].get('scope'))
    b64key = base64.b64encode(
        bytes(
            '{}:{}'.format(unity[token_url].get('client_id'),
                           unity[token_url].get('client_secret')),
            'utf-8')).decode('utf-8')
    data = {
        'refresh_token': refreshtoken,
        'grant_type': 'refresh_token',
        'scope': scope
    }
    headers = {
        'Authorization': 'Basic {}'.format(b64key),
        'Accept': 'application/json'
    }
    app_logger.info("uuidcode={} - Post to {}".format(uuidcode, token_url))
    app_logger.trace("uuidcode={} - Header: {}".format(uuidcode, headers))
    app_logger.trace("uuidcode={} - Data: {}".format(uuidcode, data))
    try:
        with closing(
                requests.post(token_url,
                              headers=headers,
                              data=data,
                              verify=cert_path,
                              timeout=1800)) as r:
            # Log the decoded body; a body that is no JSON must not make
            # the log statement itself fail.
            app_logger.trace(
                "uuidcode={} - Unity Response: {} {} {} {}".format(
                    uuidcode, r.text, r.status_code, remove_secret(r.headers),
                    remove_secret(_response_json(r))))
            if r.status_code == 400:
                # wrong refresh_token, send cancel
                error_msg = "Unknown Error. An Administrator is informed."
                try:
                    r_json = json.loads(r.text)
                    if r_json.get(
                            'error_description',
                            '') != "Invalid request; wrong refresh token":
                        app_logger.error(
                            "uuidcode={} - Received unknown answer from Unity: {}"
                            .format(uuidcode, r.text))
                    else:
                        error_msg = "Invalid token. Please logout and login again."
                except Exception:
                    try:
                        app_logger.exception(
                            "uuidcode={} - Could not check for Unity error description: {}"
                            .format(uuidcode, r.text))
                    except Exception:
                        app_logger.exception(
                            "uuidcode={} - Could not check for Unity error description"
                            .format(uuidcode))
                raise SpawnException(error_msg)
            accesstoken = r.json().get('access_token')
        with closing(
                requests.get(
                    tokeninfo_url,
                    headers={'Authorization': 'Bearer {}'.format(accesstoken)},
                    verify=cert_path,
                    timeout=1800)) as r:
            app_logger.trace(
                "uuidcode={} - Unity Response: {} {} {} {}".format(
                    uuidcode, r.text, r.status_code, remove_secret(r.headers),
                    remove_secret(_response_json(r))))
            expire = r.json().get('exp')
    except SpawnException:
        # Already carries the user-facing message; pass it on unchanged.
        raise
    except Exception:
        app_logger.exception(
            "uuidcode={} - Could not update token".format(uuidcode))
        raise Exception("{} - Could not update token".format(uuidcode))
    app_logger.info("uuidcode={} - Update JupyterHub Token".format(uuidcode))
    hub_communication.token(app_logger, uuidcode, app_hub_url_proxy_route,
                            app_hub_url_token, jhubtoken, accesstoken, expire,
                            username, servername)
    return accesstoken, expire
def post(self):
    """Submit a UNICORE/X job for the incoming spawn request.

    All input is read from the module-level ``request`` (headers and JSON
    body) and ``app`` (logger and URLs). The steps are: validate the intern
    token, build the UNICORE header, create the job input files and the job
    description, submit the job, fetch the job properties (up to five
    attempts, two seconds apart) and finally ask for the job's file
    directory. Whenever a step fails ``stop_job`` is called with a message
    for the user.

    Returns:
        ``("", 201, headers)`` on success, where ``headers`` holds
        ``kernelurl``, ``filedir`` and ``X-UNICORE-SecuritySession``.
        ``("", 200)`` if the UNICORE header could not be created.
        ``("", 534)`` if the input files could not be created.
        ``("", 539)`` if submitting the job, reading its properties or
        reading its file directory failed.
        ``None`` if an unexpected exception reached the outer handler (it is
        logged there).
    """

    def _user_message(exc, fallback):
        """Return the text of a SpawnException, ``fallback`` otherwise."""
        # Compared by class name, as this handler has always done.
        if type(exc).__name__ == "SpawnException":
            return str(exc)
        return fallback

    def _stop_after_failure(uuidcode, servername, err_msg, *stop_flags):
        """Call stop_job for a failed spawn; a failure is logged only."""
        app.log.trace("uuidcode={} - Call stop_job".format(uuidcode))
        try:
            stop_job(app.log, uuidcode, servername,
                     request.json.get('system'), request.headers, app.urls,
                     True, err_msg, *stop_flags)
        except Exception:
            app.log.exception(
                "uuidcode={} - Could not stop Job. It may still run".format(
                    uuidcode))

    try:
        # Track actions through different webservices.
        uuidcode = request.headers.get('uuidcode', '<no uuidcode>')
        app.log.info("uuidcode={} - Spawn Server".format(uuidcode))
        app.log.trace("uuidcode={} - Headers: {}".format(
            uuidcode, request.headers))
        app.log.trace("uuidcode={} - Json: {}".format(uuidcode, request.json))

        # Check for J4J intern token
        validate_auth(app.log, uuidcode,
                      request.headers.get('Intern-Authorization'))

        servername = request.headers.get('servername')

        # Create header for unicore job
        try:
            unicore_header, accesstoken, expire = unicore_utils.create_header(  # @UnusedVariable
                app.log, uuidcode, request.headers,
                app.urls.get('hub', {}).get('url_proxy_route'),
                app.urls.get('hub', {}).get('url_token'),
                request.headers.get('escapedusername'), servername)
        except (SpawnException, Exception) as e:
            err_msg = _user_message(
                e,
                "Unknown Error. An administrator is informed. Please try again in a few minutes"
            )
            app.log.exception(
                "uuidcode={} - Could not create header for UNICORE/X Job. {} {}"
                .format(uuidcode, remove_secret(request.json),
                        app.urls.get('tunnel', {}).get('url_remote')))
            _stop_after_failure(uuidcode, servername, err_msg, False)
            # Return positive status: Administrator is informed and there is
            # nothing we can do here otherwise.
            return "", 200

        # Create input files for the job. A working J4J_tunnel webservice is
        # required
        try:
            unicore_input = unicore_utils.create_inputs(
                app.log, uuidcode, request.json,
                request.headers.get('project'),
                app.urls.get('tunnel', {}).get('url_remote'),
                request.headers.get('account'))
        except (SpawnException, Exception) as e:
            err_msg = _user_message(
                e,
                "Unknown Error. An administrator is informed. Please try again in a few minutes."
            )
            app.log.exception(
                "uuidcode={} - Could not create input files for UNICORE/X Job. {} {}"
                .format(uuidcode, remove_secret(request.json),
                        app.urls.get('tunnel', {}).get('url_remote')))
            _stop_after_failure(uuidcode, servername, err_msg, False)
            return "", 534

        # Create Job description
        unicore_file = utils_file_loads.get_unicorex()
        if unicore_file.get(request.json.get('system').upper(),
                            {}).get("UNICORE8", False):
            unicore_json = unicore_utils.create_unicore8_job(
                app.log, uuidcode, request.json,
                request.headers.get('Project'), unicore_input,
                request.headers.get('escapedusername'))
        else:
            unicore_json = unicore_utils.create_job(
                app.log, uuidcode, request.json,
                request.headers.get('Project'), unicore_input)

        # Get URL and certificate to communicate with UNICORE/X
        app.log.trace("uuidcode={} - FileLoad: UNICORE/X url".format(uuidcode))
        unicorex = utils_file_loads.get_unicorex()
        url = unicorex.get(request.json.get('system', ''), {}).get(
            'link',
            '<no_url_found_for_{}>'.format(request.json.get('system')))
        app.log.trace(
            "uuidcode={} - FileLoad: UNICORE/X url Result: {}".format(
                uuidcode, url))
        cert = unicorex.get(request.json.get('system', ''),
                            {}).get('certificate', False)
        app.log.trace(
            "uuidcode={} - FileLoad: UNICORE/X certificate path Result: {}"
            .format(uuidcode, cert))

        # Submit Job. It will not be started, because of
        # unicore_json['haveClientStageIn']='true'
        kernelurl = ""
        # Bound before the try block: the handler below logs both names, and
        # hub_communication.status() may raise before the request is built.
        method = "POST"
        method_args = {}
        try:
            hub_communication.status(
                app.log, uuidcode,
                app.urls.get('hub', {}).get('url_proxy_route'),
                app.urls.get('hub', {}).get('url_status'),
                request.headers.get('jhubtoken'), 'submitunicorejob',
                request.headers.get('escapedusername'), servername)
            method_args = {
                "url": url + "/jobs",
                "headers": unicore_header,
                "data": json.dumps(unicore_json),
                "certificate": cert
            }
            app.log.info("uuidcode={} - Submit UNICORE/X Job to {}".format(
                uuidcode, url + "/jobs"))
            text, status_code, response_header = unicore_communication.request(
                app.log, uuidcode, method, method_args)
            if status_code != 201:
                app.log.warning(
                    "uuidcode={} - Could not submit Job. Response from UNICORE/X: {} {} {}."
                    .format(uuidcode, text, status_code,
                            remove_secret(response_header)))
                if status_code == 500:
                    app.log.error(
                        "uuidcode={} - UNICORE RESTART REQUIRED!! {}".format(
                            uuidcode,
                            request.json.get('system', '<system_unknown>')))
                elif status_code == 403 or status_code == 432:
                    raise SpawnException(
                        "Invalid token. Please logout and login again.")
                else:
                    app.log.error(
                        "uuidcode={} - Unexpected status_code. Add case for this status_code."
                        .format(uuidcode))
                raise SpawnException(
                    "A backend service has to be restarted. An administrator is informed. Please try again in a few minutes."
                )
            else:
                unicore_header['X-UNICORE-SecuritySession'] = response_header[
                    'X-UNICORE-SecuritySession']
                kernelurl = response_header['Location']
        except (SpawnException, Exception) as e:
            err_msg = _user_message(
                e,
                "Unknown Error. An administrator is informed. Please try again in a few minutes"
            )
            app.log.exception(
                "uuidcode={} - User message: {} - Could not submit Job. {} {}"
                .format(uuidcode, err_msg, method,
                        remove_secret(method_args)))
            _stop_after_failure(uuidcode, servername, err_msg, False)
            return "", 539

        # get properties of job
        for i in range(5):
            properties_json = {}
            try:
                method = "GET"
                method_args = {
                    "url": kernelurl,
                    "headers": unicore_header,
                    "certificate": cert
                }
                app.log.info(
                    "uuidcode={} - Get Properties of UNICORE/X Job {}".format(
                        uuidcode, kernelurl))
                text, status_code, response_header = unicore_communication.request(
                    app.log, uuidcode, method, method_args)
                if status_code != 200:
                    if status_code == 500:
                        app.log.error(
                            "uuidcode={} - UNICORE RESTART REQUIRED!! {}".
                            format(
                                uuidcode,
                                request.json.get('system',
                                                 '<system_unknown>')))
                        raise SpawnException(
                            "A backend service has to be restarted. An administrator is informed. Please try again in a few minutes."
                        )
                    else:
                        app.log.error(
                            "uuidcode={} - Unexpected status_code. Add case for this status_code."
                            .format(uuidcode))
                    if i < 4:
                        app.log.debug(
                            "uuidcode={} - Could not get properties of Job. Try again in 2 seconds"
                            .format(uuidcode))
                        sleep(2)
                    else:
                        app.log.warning(
                            "uuidcode={} - Could not get properties of Job. Response from UNICORE/X: {} {} {}."
                            .format(uuidcode, text, status_code,
                                    remove_secret(response_header)))
                        raise Exception(
                            "{} - Could not get properties of Job. Throw exception because of wrong status_code: {}"
                            .format(uuidcode, status_code))
                else:
                    unicore_header[
                        'X-UNICORE-SecuritySession'] = response_header[
                            'X-UNICORE-SecuritySession']
                    properties_json = json.loads(text)
                    if properties_json.get('status') == 'UNDEFINED' and i < 4:
                        app.log.debug(
                            "uuidcode={} - Received status UNDEFINED. Try again in 2 seconds"
                            .format(uuidcode))
                        sleep(2)
                    else:
                        break
            except (SpawnException, Exception) as e:
                err_msg = _user_message(
                    e,
                    "Unknown Error. An administrator is informed. Please try again in a few minutes"
                )
                app.log.exception(
                    "uuidcode={} - Could not get properties of Job. {} {}"
                    .format(uuidcode, method, remove_secret(method_args)))
                _stop_after_failure(uuidcode, servername, err_msg)
                return "", 539

        # get file directory
        # this will be used in get. Ask it here once and send it to get()
        # afterwards
        filedirectory = ""
        try:
            method = "GET"
            method_args = {
                "url": properties_json['_links']['workingDirectory']['href'],
                "headers": unicore_header,
                "certificate": cert
            }
            app.log.info(
                "uuidcode={} - Get path of file directory of UNICORE/X Job"
                .format(uuidcode))
            text, status_code, response_header = unicore_communication.request(
                app.log, uuidcode, method, method_args)
            if status_code != 200:
                app.log.error(
                    "uuidcode={} - Unknown status_code. Please add case for this status_code"
                    .format(uuidcode))
                app.log.warning(
                    "uuidcode={} - Could not get filedirectory. UNICORE/X Response: {} {} {}"
                    .format(uuidcode, text, status_code,
                            remove_secret(response_header)))
                raise Exception(
                    "{} - Could not get filedirectory. Throw exception because of wrong status_code: {}"
                    .format(uuidcode, status_code))
            else:
                unicore_header['X-UNICORE-SecuritySession'] = response_header[
                    'X-UNICORE-SecuritySession']
                filedirectory = json.loads(text)['_links']['files']['href']
        except (SpawnException, Exception) as e:
            err_msg = _user_message(
                e,
                "Unknown Error. An administrator is informed. Please try again in a few minutes"
            )
            app.log.exception(
                "uuidcode={} - Could not get filedirectory. {} {}".format(
                    uuidcode, method, remove_secret(method_args)))
            _stop_after_failure(uuidcode, servername, err_msg)
            return "", 539

        return "", 201, {
            'kernelurl': kernelurl,
            'filedir': filedirectory,
            'X-UNICORE-SecuritySession':
            unicore_header.get('X-UNICORE-SecuritySession')
        }
    except Exception:
        app.log.exception("Jobs.post failed. Bugfix required")
def get(self):
    """Check a UNICORE/X job and report its state to JupyterHub.

    All input is read from the module-level ``request`` headers and from
    ``app`` (logger and URLs). The job properties are fetched from the
    ``kernelurl`` header (up to five attempts, two seconds apart). A job that
    UNICORE reports as finished or failed is stopped. Otherwise the file list
    of the ``filedir`` header decides the state sent to JupyterHub:

    * ``.end`` present: ``stopped``
    * ``.tunnel`` present: ``running``
    * ``.host`` present: a tunnel is created, then ``running``
    * none of them: a ``jobs_threads.get`` thread is started and
      ``waitforhostname`` is sent

    Requests carrying ``pollspawner: true`` never create a tunnel or a thread;
    they report ``running`` in those two cases.

    Returns:
        ``("", 200)`` if the UNICORE header could not be created, or for the
        poll-spawner "no exit code file found" status message.
        ``("", 530)`` if the job is finished or failed.
        ``("", 539)`` if properties or file list could not be read, or the
        tunnel could not be created.
        ``None`` in every other case, including an unexpected exception that
        reached the outer handler (it is logged there).
    """

    def _release_skip(uuidcode):
        """Call the orchestrator's set_skip with 'False' for this server."""
        orchestrator_communication.set_skip(
            app.log, uuidcode,
            app.urls.get('orchestrator', {}).get('url_skip'),
            request.headers.get('servername'), 'False')

    def _try_stop_job(uuidcode, servername, err_msg, *stop_flags):
        """Call stop_job; a failure is logged and not raised."""
        try:
            stop_job(app.log, uuidcode, servername,
                     request.headers.get('system'), request.headers,
                     app.urls, True, err_msg, *stop_flags)
        except Exception:
            app.log.exception(
                "uuidcode={} - Could not stop Job. It may still run".format(
                    uuidcode))

    try:
        # Track actions through different webservices.
        uuidcode = request.headers.get('uuidcode', '<no uuidcode>')
        app.log.info("uuidcode={} - Get Server Status".format(uuidcode))
        app.log.trace("uuidcode={} - Headers: {}".format(
            uuidcode, request.headers))

        # Check for J4J intern token
        validate_auth(app.log, uuidcode,
                      request.headers.get('intern-authorization'))

        servername = request.headers.get('servername')

        # Create UNICORE header and get certificate
        try:
            unicore_header, accesstoken, expire = unicore_utils.create_header(  # @UnusedVariable
                app.log, uuidcode, request.headers,
                app.urls.get('hub', {}).get('url_proxy_route'),
                app.urls.get('hub', {}).get('url_token'),
                request.headers.get('escapedusername'), servername)
        except (SpawnException, Exception):
            app.log.exception(
                "uuidcode={} - Could not Create Header. Token from user {} might be revoked. Do nothing and return."
                .format(uuidcode, request.headers.get('escapedusername')))
            # Return positive status: Administrator is informed and there is
            # nothing we can do here otherwise.
            return "", 200

        app.log.trace(
            "uuidcode={} - FileLoad: UNICORE/X certificate path".format(
                uuidcode))
        unicorex = utils_file_loads.get_unicorex()
        cert = unicorex.get(request.headers.get('system', ''),
                            {}).get('certificate', False)
        app.log.trace(
            "uuidcode={} - FileLoad: UNICORE/X certificate path Result: {}"
            .format(uuidcode, cert))

        # Get Properties of kernelurl
        kernelurl = request.headers.get('kernelurl')
        for i in range(5):
            properties_json = {}
            try:
                method = "GET"
                method_args = {
                    "url": kernelurl,
                    "headers": unicore_header,
                    "certificate": cert
                }
                app.log.info(
                    "uuidcode={} - Get Properties of UNICORE/X Job {}".format(
                        uuidcode, kernelurl))
                text, status_code, response_header = unicore_communication.request(
                    app.log, uuidcode, method, method_args)
                if status_code == 200:
                    unicore_header[
                        'X-UNICORE-SecuritySession'] = response_header[
                            'X-UNICORE-SecuritySession']
                    properties_json = json.loads(text)
                    if properties_json.get('status') == 'UNDEFINED' and i < 4:
                        app.log.debug(
                            "uuidcode={} - Received status UNDEFINED. Try again in 2 seconds"
                            .format(uuidcode))
                        sleep(2)
                    else:
                        break
                elif status_code == 404:
                    if i < 4:
                        app.log.debug(
                            "uuidcode={} - Could not get properties. 404 Not found. Sleep for 2 seconds and try again"
                            .format(uuidcode))
                        sleep(2)
                    else:
                        app.log.warning(
                            "uuidcode={} - Could not get properties. 404 Not found. Stop Job and return. {} {} {}"
                            .format(uuidcode, text, status_code,
                                    remove_secret(response_header)))
                        _release_skip(uuidcode)
                        _try_stop_job(uuidcode, servername, '', False)
                        return "", 539
                elif status_code == 500:
                    if i < 4:
                        app.log.debug(
                            "uuidcode={} - Could not get properties. Sleep for 2 seconds and try again"
                            .format(uuidcode))
                        sleep(2)
                    else:
                        app.log.error(
                            "uuidcode={} - UNICORE RESTART REQUIRED!!. system: {}"
                            .format(
                                uuidcode,
                                request.headers.get('system',
                                                    '<system_unknown>')))
                        app.log.warning(
                            "uuidcode={} - Could not get properties. UNICORE/X Response: {} {} {}"
                            .format(uuidcode, text, status_code,
                                    remove_secret(response_header)))
                        app.log.warning(
                            "uuidcode={} - Do not send update to JupyterHub."
                            .format(uuidcode))
                        # If JupyterHub doesn't receive an update for a long
                        # time it can stop the job itself.
                        _release_skip(uuidcode)
                        return "", 539
                else:
                    app.log.error(
                        "uuidcode={} - Unknown status_code received. Add case for this: {} {}"
                        .format(uuidcode, status_code, text))
                    if i < 4:
                        app.log.debug(
                            "uuidcode={} - Could not get properties. Sleep for 2 seconds and try again"
                            .format(uuidcode))
                        sleep(2)
                    else:
                        app.log.warning(
                            "uuidcode={} - Could not get properties. UNICORE/X Response: {} {} {}"
                            .format(uuidcode, text, status_code,
                                    remove_secret(response_header)))
                        raise Exception(
                            "{} - Could not get properties. Throw exception because of wrong status_code: {}"
                            .format(uuidcode, status_code))
            except Exception:
                app.log.exception(
                    "uuidcode={} - Could not get properties. JupyterLab will be still running. {} {}"
                    .format(uuidcode, method, remove_secret(method_args)))
                app.log.warning(
                    "uuidcode={} - Do not send update to JupyterHub.".format(
                        uuidcode))
                # If JupyterHub doesn't receive an update for a long time it
                # can stop the job itself.
                _release_skip(uuidcode)
                return "", 539

        is_poll_spawner = request.headers.get('pollspawner',
                                              'false').lower() == 'true'

        if properties_json.get('status') in [
                'SUCCESSFUL', 'ERROR', 'FAILED', 'NOT_SUCCESSFUL'
        ]:
            # Job is Finished for UNICORE, so it should be for JupyterHub
            if is_poll_spawner:
                app.log.error(
                    'uuidcode={} - Get (poll spawner): Job is finished or failed - JobStatus: {}. Send Information to JHub. {}'
                    .format(uuidcode, properties_json.get('status'),
                            properties_json))
                if properties_json.get(
                        'statusMessage', ''
                ) == "Failed: Execution was not completed (no exit code file found), please check standard error file <stderr>":
                    app.log.error(
                        "uuidcode={} - UNICORE hotfix: do nothing because that's most likely a bug."
                        .format(uuidcode))
                    return "", 200
            else:
                if not properties_json.get(
                        'statusMessage') == 'Job was aborted by the user.':
                    app.log.error(
                        'uuidcode={} - At starting process: Job is finished or failed - JobStatus: {}. Send Information to JHub. {}'
                        .format(uuidcode, properties_json.get('status'),
                                properties_json))
            app.log.trace("uuidcode={} - Call stop_job".format(uuidcode))
            _release_skip(uuidcode)
            # Translate the UNICORE status message into a message for the
            # user: exact match first (FAILED only), then prefix match.
            error_msg = ""
            try:
                mem = utils_file_loads.map_error_messages()
                if properties_json.get('status') in [
                        'FAILED'
                ] and properties_json.get('statusMessage') in mem:
                    error_msg = mem.get(
                        properties_json.get('statusMessage', ''),
                        "Could not start your Job. Please check your configuration. An administrator is informed."
                    )
                else:
                    for key, value in mem.items():
                        if properties_json.get('statusMessage',
                                               '').startswith(key):
                            error_msg = value
                    if error_msg == "":
                        if is_poll_spawner:
                            app.log.error(
                                "uuidcode={} - StatusMessage from Failed UNICORE Job not found in /etc/j4j/j4j_mount/j4j_unicore/map_error_messages.json. Please update to have a better user experience"
                                .format(uuidcode))
                        error_msg = "Could not start your Job. Please check your configuration. An administrator is informed."
            except Exception:
                error_msg = "Could not start your Job. Please check your configuration. An administrator is informed."
            _try_stop_job(uuidcode, servername, error_msg)
            return "", 530

        # The Job is not finished yet (good)
        # Get Files in the filedir
        children = []
        for i in range(5):
            try:
                method = "GET"
                method_args = {
                    "url": request.headers.get('filedir'),
                    "headers": unicore_header,
                    "certificate": cert
                }
                app.log.info(
                    "uuidcode={} - Get list of files of UNICORE/X Job {}".
                    format(uuidcode, kernelurl))
                text, status_code, response_header = unicore_communication.request(
                    app.log, uuidcode, method, method_args)
                if status_code == 200:
                    unicore_header[
                        'X-UNICORE-SecuritySession'] = response_header[
                            'X-UNICORE-SecuritySession']
                    # in UNICORE 8 the answer is a bit different
                    children_json = json.loads(text)
                    if 'children' in children_json.keys():
                        children = children_json.get('children', [])
                    elif 'content' in children_json.keys():
                        children = list(
                            children_json.get('content', {}).keys())
                    else:
                        app.log.warning(
                            "uuidcode={} - Could not find any childrens in {}"
                            .format(uuidcode, text))
                        children = []
                    if len(children) == 0 and i < 4:
                        app.log.debug(
                            "uuidcode={} - Received empty children list. Try again in 2 seconds"
                            .format(uuidcode))
                        sleep(2)
                    else:
                        break
                elif status_code == 404:
                    if i < 4:
                        app.log.debug(
                            "uuidcode={} - Could not get children list. 404 Not found. Try again in 2 seconds."
                            .format(uuidcode))
                        sleep(2)
                    else:
                        app.log.error(
                            "uuidcode={} - Could not get children list. 404 Not found. Do nothing and return. {} {} {}"
                            .format(uuidcode, text, status_code,
                                    remove_secret(response_header)))
                        _release_skip(uuidcode)
                        return "", 539
                elif status_code == 500:
                    if i < 4:
                        app.log.debug(
                            "uuidcode={} - Could not get children list. Status Code 500. Try again in 2 seconds."
                            .format(uuidcode))
                        sleep(2)
                    else:
                        app.log.error(
                            "uuidcode={} - UNICORE/X RESTART REQUIRED".format(
                                uuidcode))
                        _release_skip(uuidcode)
                        return "", 539
                else:
                    if i < 4:
                        app.log.debug(
                            "uuidcode={} - Could not get children list. Try again in 2 seconds"
                            .format(uuidcode))
                        sleep(2)
                    else:
                        # uuidcode was missing from the arguments before, so
                        # formatting raised IndexError and hid the real cause.
                        app.log.error(
                            "uuidcode={} - Unknown status code. Add case for this: {} {}"
                            .format(uuidcode, status_code, text))
                        app.log.error(
                            "uuidcode={} - Could not get children list. Do nothing and return. UNICORE/X Response: {} {} {}"
                            .format(uuidcode, text, status_code,
                                    remove_secret(response_header)))
                        _release_skip(uuidcode)
                        return "", 539
            except Exception:
                app.log.error(
                    "uuidcode={} - UNICORE/X RESTART REQUIRED".format(
                        uuidcode))
                app.log.exception(
                    "uuidcode={} - Could not get children list. {} {}".format(
                        uuidcode, method, remove_secret(method_args)))
                _release_skip(uuidcode)
                return "", 539

        # get the 'real' status of the job from the files in the
        # working_directory
        # 'real' means: We don't care about Queued, ready, running or
        # something. We just want to know: Is it bad (failed or cancelled) or
        # good (running or spawning)
        status = ''
        if properties_json.get('status') in [
                'QUEUED', 'READY', 'RUNNING', 'STAGINGIN'
        ]:
            if '.end' in children or '/.end' in children:
                # It's not running anymore
                status = 'stopped'
            elif '.tunnel' in children or '/.tunnel' in children:
                # It's running and tunnel is up
                status = 'running'
            elif '.host' in children or '/.host' in children:
                if is_poll_spawner:
                    # If there's an error when collecting the children list it
                    # may happen, that we would try to create a tunnel for a
                    # server that's already running for a long time
                    app.log.error(
                        'uuidcode={} - Poll Spawner wants to create tunnel. Stop it. Children list: {}'
                        .format(uuidcode, children))
                    status = 'running'
                else:
                    # build up tunnel
                    try:
                        tunnel_utils.create(
                            app.log, uuidcode,
                            app.urls.get('hub', {}).get('url_proxy_route'),
                            app.urls.get('tunnel', {}).get('url_tunnel'),
                            app.urls.get('hub', {}).get('url_cancel'),
                            kernelurl, request.headers.get('filedir'),
                            unicore_header,
                            request.headers.get('servername'),
                            request.headers.get('system'),
                            request.headers.get('port'), cert,
                            request.headers.get('jhubtoken'),
                            request.headers.get('escapedusername'),
                            servername)
                    except Exception:
                        app.log.error(
                            "uuidcode={} - Could not create Tunnel. Used Parameters: {} {} {} {} {} {} {} {} {} {}"
                            .format(
                                uuidcode,
                                app.urls.get('tunnel', {}).get('url_tunnel'),
                                app.urls.get('hub', {}).get('url_cancel'),
                                kernelurl, request.headers.get('filedir'),
                                remove_secret(unicore_header),
                                request.headers.get('servername'),
                                request.headers.get('system'),
                                request.headers.get('port'), cert,
                                '<secret>'))
                        app.log.trace(
                            "uuidcode={} - Call stop_job".format(uuidcode))
                        _release_skip(uuidcode)
                        _try_stop_job(
                            uuidcode, servername,
                            "Jupyter@JSC internal error. An administrator is informed. Please try again in a few minutes."
                        )
                        return "", 539
                    status = 'running'
            else:
                if is_poll_spawner:
                    # If there's an error when collecting the children list it
                    # may happen, that we would create a thread to get better
                    # information. We just send running and hope for the next
                    # run
                    app.log.error(
                        'uuidcode={} - Poll Spawner wants to create get_status thread. Prevent it. Children list: {}'
                        .format(uuidcode, children))
                    status = 'running'
                else:
                    # Plain dict copy of the headers for the thread: keys are
                    # lower-cased, '-' becomes '_' in keys containing 'Token'.
                    request_headers = {}
                    for key, value in request.headers.items():
                        if 'Token' in key:
                            key = key.replace('-', '_')
                        request_headers[key.lower()] = value
                    app.log.trace(
                        "uuidcode={} - New Header for Thread: {}".format(
                            uuidcode, request_headers))
                    # no .host in children, let's start a thread which looks
                    # for it every second
                    t = Thread(target=jobs_threads.get,
                               args=(app.log, uuidcode, request_headers,
                                     unicore_header, app.urls, cert))
                    t.start()
                    status = 'waitforhostname'
            app.log.info("uuidcode={} - Update JupyterHub status ({})".format(
                uuidcode, status))
            hub_communication.status(
                app.log, uuidcode,
                app.urls.get('hub', {}).get('url_proxy_route'),
                app.urls.get('hub', {}).get('url_status'),
                request.headers.get('jhubtoken'), status,
                request.headers.get('escapedusername'), servername)
            if status in ['running', 'stopped'] and request.headers.get(
                    'spawning', 'true').lower() == 'true':
                # spawning is finished
                app.log.trace(
                    'uuidcode={} - Tell J4J_Orchestrator that the spawning is done'
                    .format(uuidcode))
                try:
                    orchestrator_communication.set_spawning(
                        app.log, uuidcode,
                        app.urls.get('orchestrator', {}).get('url_spawning'),
                        request.headers.get('servername'), 'False')
                except Exception:
                    # The local request_headers copy only exists in the
                    # 'waitforhostname' branch, so read the real headers here.
                    app.log.exception(
                        "uuidcode={} - Could not set spawning to false in J4J_Orchestrator database for {}"
                        .format(uuidcode, request.headers.get('servername')))
        else:
            app.log.error('uuidcode={} - Unknown JobStatus: {}'.format(
                uuidcode, properties_json.get('status')))
            app.log.trace("uuidcode={} - Call stop_job".format(uuidcode))
            _try_stop_job(
                uuidcode, servername,
                "A backend Service had a problem. An administrator is informed. Please try it again in a few minutes."
            )

        if status != 'waitforhostname':
            # no thread was started, so the check is finished
            _release_skip(uuidcode)
    except Exception:
        app.log.exception("Jobs.get failed. Bugfix required")
def quota_check(app_logger, uuidcode, app_urls, request_headers,
                unicore_header, cert, servername):
    """Read the job's ``.quota_check.out`` file and act on its content.

    The file is fetched from ``<filedir>/.quota_check.out`` with the
    ``Accept`` header temporarily set to ``application/octet-stream``. The
    previous ``Accept`` value is put back (or the key removed if there was
    none) before the function returns, also when fetching fails.

    Args:
        app_logger: Logger offering ``debug``/``info``/``warning``/``error``/
            ``exception``.
        uuidcode: Identifier prefixed to every log line.
        app_urls: URL configuration; ``hub.url_proxy_route`` and
            ``hub.url_cancel`` are read here.
        request_headers: Mapping providing ``filedir``, ``jhubtoken``,
            ``escapedusername`` and ``system``.
        unicore_header: Header dict for UNICORE/X; modified in place
            (``Accept`` temporarily, ``X-UNICORE-SecuritySession`` updated).
        cert: Certificate value passed to ``unicore_communication.request``.
        servername: Passed to ``hub_communication.cancel`` / ``stop_job``.

    Returns:
        ``False`` if the file could not be fetched (a cancel is sent to
        JupyterHub) or if it reports ``datausage`` or ``inode`` (the job is
        stopped). ``True`` if it reports ``ok`` or any unrecognised content.
    """
    method = "GET"
    # Bound up front so the failure handler can always log it, even when
    # building the real arguments fails (e.g. no 'filedir' header).
    method_args = {}
    accept = unicore_header.get('Accept', False)
    try:
        try:
            unicore_header['Accept'] = 'application/octet-stream'
            method_args = {
                "url": request_headers.get('filedir') + '/.quota_check.out',
                "headers": unicore_header,
                "certificate": cert,
                "return_content": True
            }
            content, status_code, response_header = unicore_communication.request(
                app_logger, uuidcode, method, method_args)
            if status_code != 200:
                app_logger.warning(
                    "uuidcode={} - Could not get quota check output. UNICORE/X Response: {} {} {}"
                    .format(uuidcode, content, status_code,
                            remove_secret(response_header)))
                raise Exception(
                    "{} - Could not get quota check output. Throw exception because of wrong status_code: {}"
                    .format(uuidcode, status_code))
            unicore_header['X-UNICORE-SecuritySession'] = response_header[
                'X-UNICORE-SecuritySession']
            # NOTE(review): with return_content=True the helper presumably
            # returns raw bytes - confirm. Decoding keeps the string
            # comparisons below meaningful either way.
            if isinstance(content, bytes):
                content = content.decode('utf-8', errors='replace')
            quota_result = content.strip()
        except Exception:
            app_logger.exception(
                "uuidcode={} - Could not get quota check output. {} {}".format(
                    uuidcode, method, remove_secret(method_args)))
            app_logger.warning(
                "uuidcode={} - Send cancel to JupyterHub.".format(uuidcode))
            hub_communication.cancel(
                app_logger, uuidcode,
                app_urls.get('hub', {}).get('url_proxy_route'),
                app_urls.get('hub', {}).get('url_cancel'),
                request_headers.get('jhubtoken'),
                "Something went wrong. An administrator is informed.",
                request_headers.get('escapedusername'), servername)
            return False
    finally:
        # Put the caller's Accept header back, even if the cancel call above
        # raised.
        if accept:
            unicore_header['Accept'] = accept
        else:
            unicore_header.pop('Accept', None)

    verdict = quota_result.lower()
    if verdict == "datausage":
        app_logger.info(
            "uuidcode={} - Quota Check for user: Quota exceeded {}".format(
                uuidcode, quota_result))
        stop_job(
            app_logger, uuidcode, servername, request_headers.get('system'),
            request_headers, app_urls, True,
            "Your disk quota in $HOME is exceeded. Please check it at https://judoor.fz-juelich.de or with this command: \"$ jutil user dataquota\".",
            True, False)
        return False
    elif verdict == "inode":
        app_logger.info(
            "uuidcode={} - Quota Check for user: Quota exceeded {}".format(
                uuidcode, quota_result))
        stop_job(
            app_logger, uuidcode, servername, request_headers.get('system'),
            request_headers, app_urls, True,
            "You've got too many inodes in $HOME. Please check it at https://judoor.fz-juelich.de or with this command: \"$ jutil user dataquota\".",
            True, False)
        return False
    elif verdict == "ok":
        app_logger.debug(
            "uuidcode={} - Quota Check for user ok".format(uuidcode))
        return True
    else:
        app_logger.error(
            "uuidcode={} - Could not understand the quota result: {}".format(
                uuidcode, quota_result))
        return True