def start_sh(app_logger, uuidcode, system, project, checkboxes, inputs, account):
    """Assemble the content of the ``.start.sh`` script for a job.

    The script is built from a fixed SIGTERM-trap preamble, the per-system
    snippets found under ``inputs[SYSTEM]['start']``, the content of every
    checkbox script file, and finally the command that starts JupyterLab.

    Args:
        app_logger: Logger offering ``debug`` and ``trace``.
        uuidcode: Request id, used as prefix of every log line.
        system: System name; looked up upper-cased in both configurations.
        project: Project name, used for the project link and JUPYTER_PATH.
        checkboxes: Mapping ``name -> info``; ``info['scriptpath']`` is read
            and appended to the script. ``None`` is treated as empty.
        inputs: Configuration mapping ``SYSTEM -> {'start': {...}}``.
        account: Account name used to pick account specific modules.

    Returns:
        The complete script as a single string.

    Raises:
        OSError: If a checkbox script file cannot be read.
    """
    app_logger.debug("uuidcode={} - Create start.sh file".format(uuidcode))
    system_key = system.upper()
    system_info = utils_file_loads.get_unicorex().get(system_key, {})
    # Resolve the per-system start configuration once. Using defaults for
    # every lookup keeps a missing system/'start' entry from raising later.
    start_cfg = inputs.get(system_key, {}).get('start', {})

    parts = [
        '#!/bin/bash\n_term() {\n echo \"Caught SIGTERM signal!\"\n kill -TERM \"$child\" 2>/dev/null\n}\ntrap _term SIGTERM\n'
    ]
    if 'sanity_checks' in start_cfg:
        parts.append(start_cfg['sanity_checks'] + '\n')
    parts.append('hostname>.host;\n')
    parts.append(start_cfg.get('precommands', '#precommands') + '\n')

    if project in system_info.get("projectLinks", []):
        parts.append("if ! [ -e ${{HOME}}/PROJECT_{} ]; then\n".format(project))
        parts.append(
            " ln -s ${{PROJECT_{project}}} ${{HOME}}/PROJECT_{project}\n".format(
                project=project))
        parts.append("fi\n")

    account_modules = start_cfg.get('accountmodules', {})
    if account in account_modules:
        parts.append(account_modules[account] + '\n')
    else:
        parts.append(start_cfg.get('defaultmodules', '#defaultmodules') + '\n')
    parts.append(start_cfg.get('postcommands', '#postcommands') + '\n')

    # Both variable names are exported from the same token file.
    parts.append('export JPY_API_TOKEN=`cat .jupyter.token`\n')
    parts.append('export JUPYTERHUB_API_TOKEN=`cat .jupyter.token`\n')

    for cbname, cbinfos in (checkboxes or {}).items():
        with open(cbinfos.get('scriptpath'), 'r') as f:
            parts.append("# {}\n".format(cbname) + f.read() + '\n')

    if project in system_info.get("project_path", []):
        parts.append(
            'export JUPYTER_PATH=$PROJECT_{}/.local/share/jupyter:$JUPYTER_PATH\n'.format(
                project))

    if 'executable' in start_cfg:
        parts.append(start_cfg['executable'])
    else:
        parts.append('jupyter labhub $@ --config .config.py &')
    # Wait for the started child, then write the '.end' marker file.
    parts.append('\nchild=$!\nwait "$child"')
    parts.append('\necho "end">.end\n')

    startjupyter = "".join(parts)
    app_logger.trace("uuidcode={} - start.sh file: {}".format(
        uuidcode, startjupyter.replace("\n", "/n")))
    return startjupyter
def create_inputs_dashboards(app_logger, uuidcode, request_json, project, tunnel_url_remote, account, dashboard_info, dashboard_name):
    """Create the inline input files of a dashboard job.

    Three files are produced: ``.start.sh``, ``.config.py`` and
    ``.jupyter.token``. Afterwards the API tokens are removed from
    ``request_json['Environment']`` (the dict is modified in place).

    Args:
        app_logger: Logger offering ``debug`` and ``trace``.
        uuidcode: Request id, used as prefix of every log line.
        request_json: Request body; ``system``, ``port``, ``service``,
            ``Checkboxes`` and ``Environment`` are read.
        project: Project name handed to ``dashboard_start_sh``.
        tunnel_url_remote: URL handed to ``get_remote_node``.
        account: Account name handed to ``dashboard_start_sh``.
        dashboard_info: Mapping ``system -> {'config_file': path}``.
        dashboard_name: Dashboard name handed to ``dashboard_start_sh``.

    Returns:
        List of ``{'To': filename, 'Data': content}`` dicts.
    """
    app_logger.debug(
        "uuidcode={} - Create Inputs for UNICORE/X.".format(uuidcode))
    ux = get_unicorex()
    system = request_json.get('system')
    nodes = ux.get(system.upper(), {}).get('nodes', [])

    # The base configuration is optional: any problem reading it (no path
    # configured, file missing, ...) results in an empty base config.
    try:
        with open(dashboard_info.get(system, {}).get("config_file")) as f:
            baseconf = f.read().rstrip()
    except Exception:
        baseconf = ""

    inps = get_inputs()
    node = get_remote_node(app_logger, uuidcode, tunnel_url_remote, nodes)
    environment = request_json.get('Environment', {})
    inp = [
        {
            'To': '.start.sh',
            'Data': dashboard_start_sh(app_logger, uuidcode, system, project,
                                       request_json.get('Checkboxes'), inps,
                                       account, dashboard_info,
                                       dashboard_name)
        },
        {
            'To': '.config.py',
            'Data': get_config(app_logger, uuidcode, baseconf,
                               request_json.get('port'), node,
                               environment.get('JUPYTERHUB_USER'),
                               request_json.get('service'),
                               environment.get('JUPYTERHUB_SERVER_NAME'))
        },
        {
            'To': '.jupyter.token',
            'Data': request_json.get('Environment').get('JUPYTERHUB_API_TOKEN')
        },
    ]

    # Remove each token independently. A missing JUPYTERHUB_API_TOKEN must
    # not prevent JPY_API_TOKEN from being removed as well.
    for secret_key in ('JUPYTERHUB_API_TOKEN', 'JPY_API_TOKEN'):
        request_json['Environment'].pop(secret_key, None)

    # NOTE(review): 'inp' still contains the token as data of
    # '.jupyter.token', so it ends up in the trace log - confirm intended.
    app_logger.trace("uuidcode={} - Inputs for UNICORE/X: {}".format(
        uuidcode, inp))
    return inp
def create_job(app_logger, uuidcode, request_json, project, unicore_input):
    """Build the UNICORE/X-7 job description.

    Args:
        app_logger: Logger offering ``debug`` and ``trace``.
        uuidcode: Request id, used as prefix of every log line.
        request_json: Request body; ``Environment``, ``partition``,
            ``system``, ``reservation`` and ``Resources`` are read.
        project: Not used by this function.
        unicore_input: Iterable of ``{'To': ..., 'Data': ...}`` dicts which
            become inline imports of the job.

    Returns:
        The job description as dict. For partition ``LoginNode`` it has no
        ``Resources`` entry and the executable is ``bash .start.sh``.
    """
    app_logger.debug("uuidcode={} - Create UNICORE/X-7 Job.".format(uuidcode))
    job = {
        'ApplicationName': 'Jupyter4JSC',
        # Copy the environment so that adding job specific variables below
        # does not modify the caller's request_json.
        'Environment': dict(request_json.get('Environment') or {}),
        'Imports': [{
            "From": "inline://dummy",
            "To": inp.get('To'),
            "Data": inp.get('Data'),
        } for inp in unicore_input]
    }

    if request_json.get('partition') == 'LoginNode':
        job['Environment']['UC_PREFER_INTERACTIVE_EXECUTION'] = 'true'
        job['Executable'] = 'bash .start.sh'
        app_logger.trace("uuidcode={} - UNICORE/X Job: {}".format(
            uuidcode, job))
        return job

    # The system configuration is only needed for batch jobs.
    unicorex_info = utils_file_loads.get_unicorex()
    if unicorex_info.get(request_json.get('system').upper(), {}).get('queues', False):
        job['Resources'] = {'Queue': request_json.get('partition')}
    else:
        job['Resources'] = {}

    # An empty reservation or the literal 'none' (any case) means none.
    reservation = request_json.get('reservation')
    if reservation and reservation.lower() != 'none':
        job['Resources']['Reservation'] = reservation

    job['Resources'].update(request_json.get('Resources') or {})
    job['Executable'] = '.start.sh'
    app_logger.debug("uuidcode={} - UNICORE/X-7 Job: {}".format(uuidcode, job))
    return job
def stop_job(app_logger, uuidcode, servername, system, request_headers, app_urls, send_cancel=True, errormsg="", stop_unicore_job=True):
    """Stop a server: inform JupyterHub, stop the UNICORE job, clean up.

    Steps, in this order:
      1. optionally send a cancel to JupyterHub (``send_cancel``),
      2. optionally copy the logs, abort and (if configured with
         ``destroyjobs``) destroy the UNICORE job (``stop_unicore_job``),
      3. close the ssh tunnel (errors are logged, not raised),
      4. remove the database entry in J4J_Orchestrator.

    Args:
        app_logger: Logger offering ``trace``, ``debug`` and ``exception``.
        uuidcode: Request id, used as prefix of every log line.
        servername: Server name; if it contains no ``:`` it is prefixed
            with ``<escapedusername>:``.
        system: Key of the system in the UNICORE/X configuration.
        request_headers: Mapping; ``escapedusername``, ``jhubtoken``,
            ``filedir`` and ``kernelurl`` are read.
        app_urls: Mapping with the ``hub``, ``tunnel`` and ``orchestrator``
            URL sections.
        send_cancel: Send the cancel (with ``errormsg``) to JupyterHub.
        errormsg: Message handed to JupyterHub with the cancel.
        stop_unicore_job: Talk to UNICORE/X at all.

    Returns:
        Tuple ``(accesstoken, expire, security_session)``. Without
        ``stop_unicore_job`` this is ``("", "", None)``.
    """
    app_logger.trace("uuidcode={} - Create UNICORE Header".format(uuidcode))
    if ':' not in servername:
        servername = "{}:{}".format(request_headers.get('escapedusername'), servername)

    hub_urls = app_urls.get('hub', {})
    if send_cancel:
        app_logger.debug("uuidcode={} - Send cancel to JupyterHub".format(uuidcode))
        hub_communication.cancel(app_logger,
                                 uuidcode,
                                 hub_urls.get('url_proxy_route'),
                                 hub_urls.get('url_cancel'),
                                 request_headers.get('jhubtoken'),
                                 errormsg,
                                 request_headers.get('escapedusername'),
                                 servername)

    unicore_header = {}
    accesstoken = ""
    expire = ""
    if stop_unicore_job:
        unicore_header, accesstoken, expire = unicore_utils.create_header(app_logger,
                                                                          uuidcode,
                                                                          request_headers,
                                                                          hub_urls.get('url_proxy_route'),
                                                                          hub_urls.get('url_token'),
                                                                          request_headers.get('escapedusername'),
                                                                          servername)

        # Get certificate path to communicate with UNICORE/X Server
        app_logger.trace("uuidcode={} - FileLoad: UNICORE/X certificate path".format(uuidcode))
        unicorex = utils_file_loads.get_unicorex()
        cert = unicorex.get(system, {}).get('certificate', False)
        app_logger.trace("uuidcode={} - FileLoad: UNICORE/X certificate path Result: {}".format(uuidcode, cert))

        # Get logs from the UNICORE workspace. Necessary for support.
        # Best effort: a failure here must not prevent stopping the job.
        app_logger.debug("uuidcode={} - Copy_log".format(uuidcode))
        try:
            unicore_utils.copy_log(app_logger,
                                   uuidcode,
                                   unicore_header,
                                   request_headers.get('filedir'),
                                   request_headers.get('kernelurl'),
                                   cert)
        except Exception:
            app_logger.exception("uuidcode={} - Could not copy log.".format(uuidcode))

        # Abort the Job via UNICORE
        app_logger.debug("uuidcode={} - Abort Job".format(uuidcode))
        unicore_utils.abort_job(app_logger,
                                uuidcode,
                                request_headers.get('kernelurl'),
                                unicore_header,
                                cert)

        if unicorex.get(system, {}).get('destroyjobs', 'false').lower() == 'true':
            # Destroy the Job via UNICORE
            app_logger.debug("uuidcode={} - Destroy Job".format(uuidcode))
            unicore_utils.destroy_job(app_logger,
                                      uuidcode,
                                      request_headers.get('kernelurl'),
                                      unicore_header,
                                      cert)

    # Kill the tunnel. Best effort: the database entry below has to be
    # removed even if the tunnel could not be closed.
    tunnel_info = {"servername": servername}
    tunnel_url = app_urls.get('tunnel', {}).get('url_tunnel')
    try:
        app_logger.debug("uuidcode={} - Close ssh tunnel".format(uuidcode))
        tunnel_communication.close(app_logger,
                                   uuidcode,
                                   tunnel_url,
                                   tunnel_info)
    except Exception:
        app_logger.exception("uuidcode={} - Could not stop tunnel. tunnel_info: {} {}".format(uuidcode, tunnel_info, tunnel_url))

    # Remove Database entry for J4J_Orchestrator
    app_logger.debug("uuidcode={} - Call J4J_Orchestrator to remove entry {} from database".format(uuidcode, servername))
    orchestrator_communication.delete_database_entry(app_logger,
                                                     uuidcode,
                                                     app_urls.get('orchestrator', {}).get('url_database'),
                                                     servername)

    return accesstoken, expire, unicore_header.get('X-UNICORE-SecuritySession')
def create_unicore8_job_dashboard(app_logger, uuidcode, request_json, project, unicore_input, escapedusername):
    """Build the UNICORE/X-8 job description for a dashboard.

    Args:
        app_logger: Logger offering ``debug`` and ``trace``.
        uuidcode: Request id, used as prefix of every log line and as part
            of the notification server name.
        request_json: Request body; ``Environment``, ``system``,
            ``partition``, ``reservation`` and ``Resources`` are read.
        project: Project name; used for ``job['Project']`` if the system
            is configured with ``set_project``.
        unicore_input: Iterable of ``{'To': ..., 'Data': ...}`` dicts which
            become inline imports of the job.
        escapedusername: Replaces ``<user>`` in the notification URL.

    Returns:
        The job description as dict. For the partitions ``LoginNode`` and
        ``LoginNodeVis`` it is an interactive job without ``Resources``.
    """
    app_logger.debug("uuidcode={} - Create UNICORE/X-8 Job.".format(uuidcode))
    environment = request_json.get('Environment', {})
    job = {
        'ApplicationName': 'Bash shell',
        # UNICORE 8 gets the environment as list of 'KEY=VALUE' strings.
        'Environment': ['{}={}'.format(key, value) for key, value in environment.items()],
        'Imports': []
    }

    unicorex_info = utils_file_loads.get_unicorex()
    system_info = unicorex_info.get(request_json.get('system').upper(), {})

    if system_info.get('set_project', False):
        # Precedence: mapping for 'ALL', mapping for this project,
        # truncated project name, plain project name.
        projects = system_info.get('projects', {})
        if projects.get('ALL', '') != '':
            job['Project'] = projects.get('ALL', '')
        elif projects.get(project.lower(), '') != '':
            job['Project'] = projects.get(project.lower(), '')
        elif system_info.get('projects_truncate', False):
            job['Project'] = project[1:]
        else:
            job['Project'] = project

    for inp in unicore_input:
        job['Imports'].append({
            "From": "inline://dummy",
            "To": inp.get('To'),
            "Data": inp.get('Data')
        })

    # URL UNICORE uses to send notifications about this job.
    urls = utils_file_loads.get_urls()
    ux_notify = urls.get('hub', {}).get(
        'url_ux', '<no_url_for_unicore_notification_configured>')
    ux_notify_server_name = "{}_{}_{}".format(
        len(uuidcode), uuidcode, environment.get('JUPYTERHUB_SERVER_NAME'))
    job['Notification'] = ux_notify.replace('<user>', escapedusername).replace(
        '<server>', ux_notify_server_name)

    partition = request_json.get('partition')
    if partition in ['LoginNode', 'LoginNodeVis']:
        job['Executable'] = '/bin/bash'
        job['Arguments'] = ['.start.sh']
        job['Job type'] = 'interactive'
        if partition == 'LoginNodeVis':
            nodes = system_info.get('LoginNodeVis', [])
            if nodes:
                # get system list ... choose one ... use it
                node = random.choice(nodes)
                app_logger.trace(
                    "uuidcode={} - Use random VIS Node: {}".format(
                        uuidcode, node))
                job['Login node'] = node
        elif 'LoginNodeVis' in system_info:
            # this system supports vis nodes. So we have to set the non vis
            # nodes explicitly
            nodes = system_info.get('LoginNode', [])
            if nodes:
                # get system list ... choose one ... use it
                node = random.choice(nodes)
                app_logger.trace(
                    "uuidcode={} - Use random non-VIS Node: {}".format(
                        uuidcode, node))
                job['Login node'] = node
        app_logger.trace("uuidcode={} - UNICORE/X Job: {}".format(
            uuidcode, job))
        return job

    if system_info.get('queues', False):
        job['Resources'] = {'Queue': partition}
    else:
        job['Resources'] = {}

    # An empty reservation or the literal 'none' (any case) means none.
    reservation = request_json.get('reservation')
    if reservation and reservation.lower() != 'none':
        job['Resources']['Reservation'] = reservation

    # A request without 'Resources' simply adds nothing.
    job['Resources'].update(request_json.get('Resources') or {})
    job['Executable'] = '/bin/bash'
    job['Arguments'] = ['.start.sh']
    app_logger.debug("uuidcode={} - UNICORE/X-8 Job: {}".format(uuidcode, job))
    return job
def post(self):
    """Spawn a server: submit a UNICORE/X job for the current request.

    Steps: validate the intern token, create the UNICORE header, create
    the input files and the job description, submit the job, then ask
    UNICORE for the job properties and the file directory of the job.
    Whenever a step fails, ``stop_job`` is called to clean up.

    Returns:
        ``("", 201, headers)`` on success; ``headers`` contains
        ``kernelurl``, ``filedir`` and ``X-UNICORE-SecuritySession``.
        ``("", 200)`` if the UNICORE header could not be created.
        ``("", 534)`` if the input files could not be created.
        ``("", 539)`` if submitting the job or reading its properties or
        file directory failed.
        ``None`` if an unexpected exception occurred (it is logged).
    """
    try:
        # Track actions through different webservices.
        uuidcode = request.headers.get('uuidcode', '<no uuidcode>')
        app.log.info("uuidcode={} - Spawn Server".format(uuidcode))
        app.log.trace("uuidcode={} - Headers: {}".format(uuidcode, request.headers))
        app.log.trace("uuidcode={} - Json: {}".format(uuidcode, request.json))

        # Check for J4J intern token
        validate_auth(app.log, uuidcode, request.headers.get('Intern-Authorization'))
        servername = request.headers.get('servername')

        def user_message(exc, fallback):
            # Only the text of a SpawnException is meant for the user.
            # NOTE(review): matched by class name as before, which also
            # accepts a SpawnException class of another module - confirm
            # before switching to isinstance().
            if type(exc).__name__ == "SpawnException":
                return str(exc)
            return fallback

        def cleanup(err_msg, stop_unicore_job=True):
            # Cancel the spawn; a failing cleanup is logged, not raised.
            app.log.trace("uuidcode={} - Call stop_job".format(uuidcode))
            try:
                stop_job(app.log, uuidcode, servername,
                         request.json.get('system'), request.headers,
                         app.urls, True, err_msg, stop_unicore_job)
            except Exception:
                app.log.exception("uuidcode={} - Could not stop Job. It may still run".format(uuidcode))

        # Create header for unicore job
        try:
            unicore_header, _, _ = unicore_utils.create_header(
                app.log,
                uuidcode,
                request.headers,
                app.urls.get('hub', {}).get('url_proxy_route'),
                app.urls.get('hub', {}).get('url_token'),
                request.headers.get('escapedusername'),
                servername)
        except Exception as e:
            err_msg = user_message(e, "Unknown Error. An administrator is informed. Please try again in a few minutes")
            app.log.exception("uuidcode={} - Could not create header for UNICORE/X Job. {} {}".format(uuidcode, remove_secret(request.json), app.urls.get('tunnel', {}).get('url_remote')))
            cleanup(err_msg, False)
            # Return positive status: Administrator is informed and there
            # is nothing we can do here otherwise.
            return "", 200

        # Create input files for the job. A working J4J_tunnel webservice
        # is required
        try:
            unicore_input = unicore_utils.create_inputs(
                app.log,
                uuidcode,
                request.json,
                request.headers.get('project'),
                app.urls.get('tunnel', {}).get('url_remote'),
                request.headers.get('account'))
        except Exception as e:
            err_msg = user_message(e, "Unknown Error. An administrator is informed. Please try again in a few minutes.")
            app.log.exception("uuidcode={} - Could not create input files for UNICORE/X Job. {} {}".format(uuidcode, remove_secret(request.json), app.urls.get('tunnel', {}).get('url_remote')))
            cleanup(err_msg, False)
            return "", 534

        # Create Job description
        unicore_file = utils_file_loads.get_unicorex()
        if unicore_file.get(request.json.get('system').upper(), {}).get("UNICORE8", False):
            unicore_json = unicore_utils.create_unicore8_job(
                app.log,
                uuidcode,
                request.json,
                request.headers.get('Project'),
                unicore_input,
                request.headers.get('escapedusername'))
        else:
            unicore_json = unicore_utils.create_job(
                app.log,
                uuidcode,
                request.json,
                request.headers.get('Project'),
                unicore_input)

        # Get URL and certificate to communicate with UNICORE/X
        app.log.trace("uuidcode={} - FileLoad: UNICORE/X url".format(uuidcode))
        unicorex = utils_file_loads.get_unicorex()
        url = unicorex.get(request.json.get('system', ''), {}).get(
            'link', '<no_url_found_for_{}>'.format(request.json.get('system')))
        app.log.trace("uuidcode={} - FileLoad: UNICORE/X url Result: {}".format(uuidcode, url))
        cert = unicorex.get(request.json.get('system', ''), {}).get('certificate', False)
        app.log.trace("uuidcode={} - FileLoad: UNICORE/X certificate path Result: {}".format(uuidcode, cert))

        # Submit Job.
        # NOTE(review): the previous comment said the job is not started
        # because of unicore_json['haveClientStageIn']='true'; this function
        # does not set that key - confirm whether that still holds.
        kernelurl = ""
        # Bound before the try block: the error handler below logs both
        # names, also if the very first call in the try block fails.
        method = "POST"
        method_args = {}
        try:
            hub_communication.status(
                app.log,
                uuidcode,
                app.urls.get('hub', {}).get('url_proxy_route'),
                app.urls.get('hub', {}).get('url_status'),
                request.headers.get('jhubtoken'),
                'submitunicorejob',
                request.headers.get('escapedusername'),
                servername)
            method_args = {
                "url": url + "/jobs",
                "headers": unicore_header,
                "data": json.dumps(unicore_json),
                "certificate": cert
            }
            app.log.info("uuidcode={} - Submit UNICORE/X Job to {}".format(uuidcode, url + "/jobs"))
            text, status_code, response_header = unicore_communication.request(
                app.log, uuidcode, method, method_args)
            if status_code != 201:
                app.log.warning("uuidcode={} - Could not submit Job. Response from UNICORE/X: {} {} {}.".format(uuidcode, text, status_code, remove_secret(response_header)))
                if status_code == 500:
                    app.log.error("uuidcode={} - UNICORE RESTART REQUIRED!! {}".format(uuidcode, request.json.get('system', '<system_unknown>')))
                elif status_code == 403 or status_code == 432:
                    raise SpawnException("Invalid token. Please logout and login again.")
                else:
                    app.log.error("uuidcode={} - Unexpected status_code. Add case for this status_code.".format(uuidcode))
                raise SpawnException("A backend service has to be restarted. An administrator is informed. Please try again in a few minutes.")
            unicore_header['X-UNICORE-SecuritySession'] = response_header['X-UNICORE-SecuritySession']
            kernelurl = response_header['Location']
        except Exception as e:
            err_msg = user_message(e, "Unknown Error. An administrator is informed. Please try again in a few minutes")
            app.log.exception("uuidcode={} - User message: {} - Could not submit Job. {} {}".format(uuidcode, err_msg, method, remove_secret(method_args)))
            cleanup(err_msg, False)
            return "", 539

        # get properties of job
        # Up to five attempts, two seconds apart, while UNICORE answers
        # with an unexpected status code or the job status is UNDEFINED.
        for i in range(5):
            properties_json = {}
            try:
                method = "GET"
                method_args = {
                    "url": kernelurl,
                    "headers": unicore_header,
                    "certificate": cert
                }
                app.log.info("uuidcode={} - Get Properties of UNICORE/X Job {}".format(uuidcode, kernelurl))
                text, status_code, response_header = unicore_communication.request(
                    app.log, uuidcode, method, method_args)
                if status_code != 200:
                    if status_code == 500:
                        app.log.error("uuidcode={} - UNICORE RESTART REQUIRED!! {}".format(uuidcode, request.json.get('system', '<system_unknown>')))
                        raise SpawnException("A backend service has to be restarted. An administrator is informed. Please try again in a few minutes.")
                    app.log.error("uuidcode={} - Unexpected status_code. Add case for this status_code.".format(uuidcode))
                    if i < 4:
                        app.log.debug("uuidcode={} - Could not get properties of Job. Try again in 2 seconds".format(uuidcode))
                        sleep(2)
                    else:
                        app.log.warning("uuidcode={} - Could not get properties of Job. Response from UNICORE/X: {} {} {}.".format(uuidcode, text, status_code, remove_secret(response_header)))
                        raise Exception("{} - Could not get properties of Job. Throw exception because of wrong status_code: {}".format(uuidcode, status_code))
                else:
                    unicore_header['X-UNICORE-SecuritySession'] = response_header['X-UNICORE-SecuritySession']
                    properties_json = json.loads(text)
                    if properties_json.get('status') == 'UNDEFINED' and i < 4:
                        app.log.debug("uuidcode={} - Received status UNDEFINED. Try again in 2 seconds".format(uuidcode))
                        sleep(2)
                    else:
                        break
            except Exception as e:
                err_msg = user_message(e, "Unknown Error. An administrator is informed. Please try again in a few minutes")
                app.log.exception("uuidcode={} - Could not get properties of Job. {} {}".format(uuidcode, method, remove_secret(method_args)))
                cleanup(err_msg)
                return "", 539

        # get file directory
        # this will be used in get. Ask it here once and send it to get()
        # afterwards
        filedirectory = ""
        try:
            method = "GET"
            method_args = {
                "url": properties_json['_links']['workingDirectory']['href'],
                "headers": unicore_header,
                "certificate": cert
            }
            app.log.info("uuidcode={} - Get path of file directory of UNICORE/X Job".format(uuidcode))
            text, status_code, response_header = unicore_communication.request(
                app.log, uuidcode, method, method_args)
            if status_code != 200:
                app.log.error("uuidcode={} - Unknown status_code. Please add case for this status_code".format(uuidcode))
                app.log.warning("uuidcode={} - Could not get filedirectory. UNICORE/X Response: {} {} {}".format(uuidcode, text, status_code, remove_secret(response_header)))
                raise Exception("{} - Could not get filedirectory. Throw exception because of wrong status_code: {}".format(uuidcode, status_code))
            unicore_header['X-UNICORE-SecuritySession'] = response_header['X-UNICORE-SecuritySession']
            filedirectory = json.loads(text)['_links']['files']['href']
        except Exception as e:
            err_msg = user_message(e, "Unknown Error. An administrator is informed. Please try again in a few minutes")
            app.log.exception("uuidcode={} - Could not get filedirectory. {} {}".format(uuidcode, method, remove_secret(method_args)))
            cleanup(err_msg)
            return "", 539

        return "", 201, {
            'kernelurl': kernelurl,
            'filedir': filedirectory,
            'X-UNICORE-SecuritySession': unicore_header.get('X-UNICORE-SecuritySession')
        }
    except Exception:
        app.log.exception("Jobs.post failed. Bugfix required")
def get(self):
    """Check the status of a server and report it to JupyterHub.

    The job properties and the list of files in the job directory are
    requested from UNICORE/X (each with up to five attempts, two seconds
    apart). The marker files ``.end``, ``.tunnel`` and ``.host`` decide
    about the status sent to JupyterHub: ``stopped``, ``running`` or
    ``waitforhostname``. If only ``.host`` exists the tunnel is created;
    if none exists a thread running ``jobs_threads.get`` is started.

    Returns:
        ``("", 200)`` if the UNICORE header could not be created or the
        "no exit code file found" hotfix applies.
        ``("", 530)`` if UNICORE reports the job as finished or failed.
        ``("", 539)`` if UNICORE could not be queried or the tunnel could
        not be created.
        ``None`` otherwise, also after an unexpected exception (logged).
    """
    try:
        # Track actions through different webservices.
        uuidcode = request.headers.get('uuidcode', '<no uuidcode>')
        app.log.info("uuidcode={} - Get Server Status".format(uuidcode))
        app.log.trace("uuidcode={} - Headers: {}".format(uuidcode, request.headers))

        # Check for J4J intern token
        validate_auth(app.log, uuidcode, request.headers.get('intern-authorization'))
        servername = request.headers.get('servername')
        poll_spawner = request.headers.get('pollspawner', 'false').lower() == 'true'

        def unset_skip():
            # Set the 'skip' flag of this server to 'False' in
            # J4J_Orchestrator.
            orchestrator_communication.set_skip(
                app.log,
                uuidcode,
                app.urls.get('orchestrator', {}).get('url_skip'),
                request.headers.get('servername'),
                'False')

        def try_stop_job(*extra_args):
            # Stop the job; a failing stop is logged, not raised.
            try:
                stop_job(app.log, uuidcode, servername,
                         request.headers.get('system'), request.headers,
                         app.urls, True, *extra_args)
            except Exception:
                app.log.exception("uuidcode={} - Could not stop Job. It may still run".format(uuidcode))

        # Create UNICORE header and get certificate
        try:
            unicore_header, _, _ = unicore_utils.create_header(
                app.log,
                uuidcode,
                request.headers,
                app.urls.get('hub', {}).get('url_proxy_route'),
                app.urls.get('hub', {}).get('url_token'),
                request.headers.get('escapedusername'),
                servername)
        except Exception:
            app.log.exception("uuidcode={} - Could not Create Header. Token from user {} might be revoked. Do nothing and return.".format(uuidcode, request.headers.get('escapedusername')))
            # Return positive status: Administrator is informed and there
            # is nothing we can do here otherwise.
            return "", 200
        app.log.trace("uuidcode={} - FileLoad: UNICORE/X certificate path".format(uuidcode))
        unicorex = utils_file_loads.get_unicorex()
        cert = unicorex.get(request.headers.get('system', ''), {}).get('certificate', False)
        app.log.trace("uuidcode={} - FileLoad: UNICORE/X certificate path Result: {}".format(uuidcode, cert))

        # Get Properties of kernelurl
        kernelurl = request.headers.get('kernelurl')
        for i in range(5):
            properties_json = {}
            try:
                method = "GET"
                method_args = {
                    "url": kernelurl,
                    "headers": unicore_header,
                    "certificate": cert
                }
                app.log.info("uuidcode={} - Get Properties of UNICORE/X Job {}".format(uuidcode, kernelurl))
                text, status_code, response_header = unicore_communication.request(
                    app.log, uuidcode, method, method_args)
                if status_code == 200:
                    unicore_header['X-UNICORE-SecuritySession'] = response_header['X-UNICORE-SecuritySession']
                    properties_json = json.loads(text)
                    if properties_json.get('status') == 'UNDEFINED' and i < 4:
                        app.log.debug("uuidcode={} - Received status UNDEFINED. Try again in 2 seconds".format(uuidcode))
                        sleep(2)
                    else:
                        break
                elif status_code == 404:
                    if i < 4:
                        app.log.debug("uuidcode={} - Could not get properties. 404 Not found. Sleep for 2 seconds and try again".format(uuidcode))
                        sleep(2)
                    else:
                        app.log.warning("uuidcode={} - Could not get properties. 404 Not found. Stop Job and return. {} {} {}".format(uuidcode, text, status_code, remove_secret(response_header)))
                        unset_skip()
                        try_stop_job('', False)
                        return "", 539
                elif status_code == 500:
                    if i < 4:
                        app.log.debug("uuidcode={} - Could not get properties. Sleep for 2 seconds and try again".format(uuidcode))
                        sleep(2)
                    else:
                        app.log.error("uuidcode={} - UNICORE RESTART REQUIRED!!. system: {}".format(uuidcode, request.headers.get('system', '<system_unknown>')))
                        app.log.warning("uuidcode={} - Could not get properties. UNICORE/X Response: {} {} {}".format(uuidcode, text, status_code, remove_secret(response_header)))
                        app.log.warning("uuidcode={} - Do not send update to JupyterHub.".format(uuidcode))
                        # If JupyterHub doesn't receive an update for a long
                        # time it can stop the job itself.
                        unset_skip()
                        return "", 539
                else:
                    app.log.error("uuidcode={} - Unknown status_code received. Add case for this: {} {}".format(uuidcode, status_code, text))
                    if i < 4:
                        app.log.debug("uuidcode={} - Could not get properties. Sleep for 2 seconds and try again".format(uuidcode))
                        sleep(2)
                    else:
                        app.log.warning("uuidcode={} - Could not get properties. UNICORE/X Response: {} {} {}".format(uuidcode, text, status_code, remove_secret(response_header)))
                        raise Exception("{} - Could not get properties. Throw exception because of wrong status_code: {}".format(uuidcode, status_code))
            except Exception:
                app.log.exception("uuidcode={} - Could not get properties. JupyterLab will be still running. {} {}".format(uuidcode, method, remove_secret(method_args)))
                app.log.warning("uuidcode={} - Do not send update to JupyterHub.".format(uuidcode))
                # If JupyterHub doesn't receive an update for a long time it
                # can stop the job itself.
                unset_skip()
                return "", 539

        if properties_json.get('status') in ['SUCCESSFUL', 'ERROR', 'FAILED', 'NOT_SUCCESSFUL']:
            # Job is Finished for UNICORE, so it should be for JupyterHub
            if poll_spawner:
                app.log.error('uuidcode={} - Get (poll spawner): Job is finished or failed - JobStatus: {}. Send Information to JHub. {}'.format(uuidcode, properties_json.get('status'), properties_json))
                if properties_json.get('statusMessage', '') == "Failed: Execution was not completed (no exit code file found), please check standard error file <stderr>":
                    app.log.error("uuidcode={} - UNICORE hotfix: do nothing because that's most likely a bug.".format(uuidcode))
                    return "", 200
            else:
                if not properties_json.get('statusMessage') == 'Job was aborted by the user.':
                    app.log.error('uuidcode={} - At starting process: Job is finished or failed - JobStatus: {}. Send Information to JHub. {}'.format(uuidcode, properties_json.get('status'), properties_json))
            app.log.trace("uuidcode={} - Call stop_job".format(uuidcode))
            unset_skip()

            # Translate the status message of UNICORE into a message for
            # the user: exact match for FAILED jobs, prefix match otherwise.
            error_msg = ""
            try:
                mem = utils_file_loads.map_error_messages()
                if properties_json.get('status') in ['FAILED'] and properties_json.get('statusMessage') in mem.keys():
                    error_msg = mem.get(
                        properties_json.get('statusMessage', ''),
                        "Could not start your Job. Please check your configuration. An administrator is informed.")
                else:
                    for key, value in mem.items():
                        if properties_json.get('statusMessage', '').startswith(key):
                            error_msg = value
                    if error_msg == "":
                        if poll_spawner:
                            app.log.error("uuidcode={} - StatusMessage from Failed UNICORE Job not found in /etc/j4j/j4j_mount/j4j_unicore/map_error_messages.json. Please update to have a better user experience".format(uuidcode))
                        error_msg = "Could not start your Job. Please check your configuration. An administrator is informed."
            except Exception:
                error_msg = "Could not start your Job. Please check your configuration. An administrator is informed."
            try_stop_job(error_msg)
            return "", 530

        # The Job is not finished yet (good)
        # Get Files in the filedir
        children = []
        for i in range(5):
            try:
                method = "GET"
                method_args = {
                    "url": request.headers.get('filedir'),
                    "headers": unicore_header,
                    "certificate": cert
                }
                app.log.info("uuidcode={} - Get list of files of UNICORE/X Job {}".format(uuidcode, kernelurl))
                text, status_code, response_header = unicore_communication.request(
                    app.log, uuidcode, method, method_args)
                if status_code == 200:
                    unicore_header['X-UNICORE-SecuritySession'] = response_header['X-UNICORE-SecuritySession']
                    # in UNICORE 8 the answer is a bit different
                    children_json = json.loads(text)
                    if 'children' in children_json:
                        children = children_json.get('children', [])
                    elif 'content' in children_json:
                        children = list(children_json.get('content', {}).keys())
                    else:
                        app.log.warning("uuidcode={} - Could not find any childrens in {}".format(uuidcode, text))
                        children = []
                    if len(children) == 0 and i < 4:
                        app.log.debug("uuidcode={} - Received empty children list. Try again in 2 seconds".format(uuidcode))
                        sleep(2)
                    else:
                        break
                elif status_code == 404:
                    if i < 4:
                        app.log.debug("uuidcode={} - Could not get children list. 404 Not found. Try again in 2 seconds.".format(uuidcode))
                        sleep(2)
                    else:
                        app.log.error("uuidcode={} - Could not get children list. 404 Not found. Do nothing and return. {} {} {}".format(uuidcode, text, status_code, remove_secret(response_header)))
                        unset_skip()
                        return "", 539
                elif status_code == 500:
                    if i < 4:
                        app.log.debug("uuidcode={} - Could not get children list. Status Code 500. Try again in 2 seconds.".format(uuidcode))
                        sleep(2)
                    else:
                        app.log.error("uuidcode={} - UNICORE/X RESTART REQUIRED".format(uuidcode))
                        unset_skip()
                        return "", 539
                else:
                    if i < 4:
                        app.log.debug("uuidcode={} - Could not get children list. Try again in 2 seconds".format(uuidcode))
                        sleep(2)
                    else:
                        # uuidcode is required here: the message has three
                        # placeholders.
                        app.log.error("uuidcode={} - Unknown status code. Add case for this: {} {}".format(uuidcode, status_code, text))
                        app.log.error("uuidcode={} - Could not get children list. Do nothing and return. UNICORE/X Response: {} {} {}".format(uuidcode, text, status_code, remove_secret(response_header)))
                        unset_skip()
                        return "", 539
            except Exception:
                app.log.error("uuidcode={} - UNICORE/X RESTART REQUIRED".format(uuidcode))
                app.log.exception("uuidcode={} - Could not get children list. {} {}".format(uuidcode, method, remove_secret(method_args)))
                unset_skip()
                return "", 539

        # get the 'real' status of the job from the files in the
        # working_directory
        # 'real' means: We don't care about Queued, ready, running or
        # something. We just want to know: Is it bad (failed or cancelled)
        # or good (running or spawning)
        status = ''
        if properties_json.get('status') in ['QUEUED', 'READY', 'RUNNING', 'STAGINGIN']:
            if '.end' in children or '/.end' in children:
                # It's not running anymore
                status = 'stopped'
            elif '.tunnel' in children or '/.tunnel' in children:
                # It's running and tunnel is up
                status = 'running'
            elif '.host' in children or '/.host' in children:
                if poll_spawner:
                    # If there's an error when collecting the children list
                    # it may happen, that we would try to create a tunnel
                    # for a server that's already running for a long time
                    app.log.error('uuidcode={} - Poll Spawner wants to create tunnel. Stop it. Children list: {}'.format(uuidcode, children))
                    status = 'running'
                else:
                    # build up tunnel
                    try:
                        tunnel_utils.create(
                            app.log,
                            uuidcode,
                            app.urls.get('hub', {}).get('url_proxy_route'),
                            app.urls.get('tunnel', {}).get('url_tunnel'),
                            app.urls.get('hub', {}).get('url_cancel'),
                            kernelurl,
                            request.headers.get('filedir'),
                            unicore_header,
                            request.headers.get('servername'),
                            request.headers.get('system'),
                            request.headers.get('port'),
                            cert,
                            request.headers.get('jhubtoken'),
                            request.headers.get('escapedusername'),
                            servername)
                    except Exception:
                        app.log.error(
                            "uuidcode={} - Could not create Tunnel. Used Parameters: {} {} {} {} {} {} {} {} {} {}".format(
                                uuidcode,
                                app.urls.get('tunnel', {}).get('url_tunnel'),
                                app.urls.get('hub', {}).get('url_cancel'),
                                kernelurl,
                                request.headers.get('filedir'),
                                remove_secret(unicore_header),
                                request.headers.get('servername'),
                                request.headers.get('system'),
                                request.headers.get('port'),
                                cert,
                                '<secret>'))
                        app.log.trace("uuidcode={} - Call stop_job".format(uuidcode))
                        unset_skip()
                        try_stop_job("Jupyter@JSC internal error. An administrator is informed. Please try again in a few minutes.")
                        return "", 539
                    status = 'running'
            else:
                if poll_spawner:
                    # If there's an error when collecting the children list
                    # it may happen, that we would create a thread to get
                    # better information. We just send running and hope for
                    # the next run
                    app.log.error('uuidcode={} - Poll Spawner wants to create get_status thread. Prevent it. Children list: {}'.format(uuidcode, children))
                    status = 'running'
                else:
                    # Copy of the headers for the thread: lower-case keys,
                    # and '-' replaced by '_' in keys containing 'Token'.
                    request_headers = {}
                    for key, value in request.headers.items():
                        if 'Token' in key:
                            key = key.replace('-', '_')
                        request_headers[key.lower()] = value
                    app.log.trace("uuidcode={} - New Header for Thread: {}".format(uuidcode, request_headers))
                    # no .host in children, let's start a thread which looks
                    # for it every second
                    t = Thread(target=jobs_threads.get,
                               args=(app.log, uuidcode, request_headers,
                                     unicore_header, app.urls, cert))
                    t.start()
                    status = 'waitforhostname'
            app.log.info("uuidcode={} - Update JupyterHub status ({})".format(uuidcode, status))
            hub_communication.status(
                app.log,
                uuidcode,
                app.urls.get('hub', {}).get('url_proxy_route'),
                app.urls.get('hub', {}).get('url_status'),
                request.headers.get('jhubtoken'),
                status,
                request.headers.get('escapedusername'),
                servername)
            if status in ['running', 'stopped'] and request.headers.get('spawning', 'true').lower() == 'true':
                # spawning is finished
                app.log.trace('uuidcode={} - Tell J4J_Orchestrator that the spawning is done'.format(uuidcode))
                try:
                    orchestrator_communication.set_spawning(
                        app.log,
                        uuidcode,
                        app.urls.get('orchestrator', {}).get('url_spawning'),
                        request.headers.get('servername'),
                        'False')
                except Exception:
                    # Read the name from the request: the header copy for
                    # the thread does not exist in this branch.
                    app.log.exception("uuidcode={} - Could not set spawning to false in J4J_Orchestrator database for {}".format(uuidcode, request.headers.get('servername')))
        else:
            app.log.error('uuidcode={} - Unknown JobStatus: {}'.format(uuidcode, properties_json.get('status')))
            app.log.trace("uuidcode={} - Call stop_job".format(uuidcode))
            try_stop_job("A backend Service had a problem. An administrator is informed. Please try it again in a few minutes.")

        if status != 'waitforhostname':
            # no thread was started, so the check is finished
            unset_skip()
    except Exception:
        app.log.exception("Jobs.get failed. Bugfix required")