Ejemplo n.º 1
0
def abort_job(app_logger, uuidcode, kernelurl, unicore_header, cert):
    """Ask UNICORE/X to abort the job behind ``kernelurl`` (best effort).

    Sends ``POST kernelurl + '/actions/abort'`` through
    ``unicore_communication.request``. Nothing is raised to the caller: a
    non-2xx answer is logged as a warning, any other error is logged with
    its traceback.

    Args:
        app_logger: logger used for debug/info/warning/exception messages.
        uuidcode: identifier that prefixes every log line.
        kernelurl: URL of the UNICORE/X job.
        unicore_header: request headers; on success its
            ``X-UNICORE-SecuritySession`` entry is updated in place from the
            response, if the response carries one.
        cert: passed to the request as ``certificate``.
    """
    app_logger.debug(
        "uuidcode={} - Try to abort job with kernelurl: {}".format(
            uuidcode, kernelurl))
    try:
        # The abort action URL is (right now) always kernelurl + '/actions/abort',
        # so it is built directly. If the UNICORE API changes, an additional
        # GET on kernelurl may be needed to read it from
        # ['_links']['action:abort']['href'] of the JSON answer.
        method = "POST"
        method_args = {
            "url": kernelurl + '/actions/abort',
            "headers": unicore_header,
            "data": "{}",
            "certificate": cert
        }

        app_logger.info("uuidcode={} - Abort UNICORE/X Job {}".format(
            uuidcode, kernelurl))
        text, status_code, response_header = unicore_communication.request(
            app_logger, uuidcode, method, method_args)

        if not 200 <= status_code <= 299:
            app_logger.warning(
                "uuidcode={} - Could not abort Job. Response from UNICORE/X: {} {} {}"
                .format(uuidcode, text, status_code,
                        remove_secret(response_header)))
        else:
            # A missing session header must not turn a successful abort into
            # a logged failure, so only take it over when it is present.
            session = response_header.get('X-UNICORE-SecuritySession')
            if session is not None:
                unicore_header['X-UNICORE-SecuritySession'] = session
    except Exception:
        # Best effort: log and carry on. KeyboardInterrupt/SystemExit are
        # deliberately not caught here.
        app_logger.exception(
            "uuidcode={} - Could not abort Job.".format(uuidcode))
Ejemplo n.º 2
0
def destroy_job(app_logger, uuidcode, kernelurl, unicore_header, cert):
    """Delete the UNICORE/X job behind ``kernelurl`` (best effort).

    Sends ``DELETE kernelurl`` through ``unicore_communication.request``.
    Nothing is raised to the caller: a status code above 399 is logged as a
    warning, any other error is logged with its traceback.

    Args:
        app_logger: logger used for debug/info/warning/exception messages.
        uuidcode: identifier that prefixes every log line.
        kernelurl: URL of the UNICORE/X job.
        unicore_header: headers sent with the request.
        cert: passed to the request as ``certificate``.
    """
    app_logger.debug(
        "uuidcode={} - Try to destroy Job with kernelurl: {}".format(
            uuidcode, kernelurl))
    method = "DELETE"
    method_args = {
        "url": kernelurl,
        "headers": unicore_header,
        "certificate": cert
    }
    try:
        app_logger.info("uuidcode={} - Destroy UNICORE/X Job".format(uuidcode))
        text, status_code, response_header = unicore_communication.request(
            app_logger, uuidcode, method, method_args)
        if status_code > 399:
            app_logger.warning(
                "uuidcode={} - Could not destroy job. WorkDirectory may still exist. UNICORE/X Response: {} {} {}"
                .format(uuidcode, text, status_code,
                        remove_secret(response_header)))
    except Exception:
        # Best effort: log and carry on. KeyboardInterrupt/SystemExit are
        # deliberately not caught here.
        app_logger.exception(
            "uuidcode={} - Could not destroy job.".format(uuidcode))
Ejemplo n.º 3
0
def create(app_logger, uuidcode, app_hub_url_proxy_route, app_tunnel_url,
           app_hub_url_cancel, kernelurl, filedir, unicore_header, servername,
           system, port, cert, jhubtoken, username, servername_short,
           app_orchestrator_url_hostname):
    """Read the job's ``.host`` file, start the tunnel and write ``.tunnel``.

    Steps:
      1. ``GET filedir + '/.host'`` to learn the hostname.
      2. Report the hostname via ``orchestrator_communication.set_hostname``
         (a failure here is only logged).
      3. Start the tunnel via ``tunnel_communication.j4j_start_tunnel``.
      4. ``PUT filedir + '/.tunnel'`` with the port as body (204 expected).

    If step 1 or 4 fails, a cancel is sent to JupyterHub, the job is aborted
    and destroyed, and an ``Exception`` is raised. An error raised by step 3
    propagates unchanged.

    ``unicore_header`` is modified in place: ``X-UNICORE-SecuritySession`` is
    updated from successful responses, and ``Accept`` is temporarily set to
    ``application/octet-stream``. The previous ``Accept`` state is restored
    when the function is left, on success and on every error path.

    Raises:
        Exception: when the hostname cannot be read or ``.tunnel`` cannot be
            written.
    """
    app_logger.trace("uuidcode={} - Try to create a tunnel".format(uuidcode))

    def cancel_and_cleanup():
        # Shared failure handling: inform JupyterHub, then abort and remove
        # the UNICORE/X job.
        app_logger.warning(
            "uuidcode={} - Send cancel to JupyterHub.".format(uuidcode))
        hub_communication.cancel(
            app_logger, uuidcode, app_hub_url_proxy_route, app_hub_url_cancel,
            jhubtoken,
            "A mandatory backend service had a problem. An administrator is informed.",
            username, servername_short)
        unicore_utils.abort_job(app_logger, uuidcode, kernelurl,
                                unicore_header, cert)
        unicore_utils.destroy_job(app_logger, uuidcode, kernelurl,
                                  unicore_header, cert)

    # Remember the caller's Accept header so it can be put back afterwards.
    had_accept = 'Accept' in unicore_header
    previous_accept = unicore_header.get('Accept')
    unicore_header['Accept'] = 'application/octet-stream'
    try:
        hostname = ""
        try:
            method = "GET"
            method_args = {
                "url": filedir + '/.host',
                "headers": unicore_header,
                "certificate": cert,
                "return_content": True
            }
            content, status_code, response_header = unicore_communication.request(
                app_logger, uuidcode, method, method_args)
            if status_code != 200:
                app_logger.warning(
                    "uuidcode={} - Could not get hostname. UNICORE/X Response: {} {} {}"
                    .format(uuidcode, content, status_code,
                            remove_secret(response_header)))
                # Handled by the except clause below, like any other failure.
                raise Exception(
                    "{} - Could not get hostname. Throw exception because of wrong status_code: {}"
                    .format(uuidcode, status_code))
            unicore_header['X-UNICORE-SecuritySession'] = response_header[
                'X-UNICORE-SecuritySession']
            hostname = content.strip()
        except Exception:
            app_logger.exception(
                "uuidcode={} - Could not get hostname. {} {}".format(
                    uuidcode, method, remove_secret(method_args)))
            cancel_and_cleanup()
            raise Exception("{} - Could not get hostname".format(uuidcode))

        app_logger.trace(
            'uuidcode={} - Inform J4J_Orchestrator about the hostname'.format(
                uuidcode))
        try:
            orchestrator_communication.set_hostname(
                app_logger, uuidcode, app_orchestrator_url_hostname,
                servername, hostname)
        except Exception:
            # Not fatal for the tunnel itself; log and continue.
            app_logger.exception(
                "uuidcode={} - Could not set hostname to {} in J4J_Orchestrator database for {}"
                .format(uuidcode, hostname, servername))

        tunnel_header = {
            'Intern-Authorization': utils_file_loads.get_j4j_tunnel_token(),
            'uuidcode': uuidcode
        }
        # For these system/prefix combinations only the part of the hostname
        # before the first dot is used.
        if system == 'JUWELS' and hostname[:3] == 'jwc':
            hostname = hostname.split('.')[0]
        if system == 'JURON' and hostname[:6] == 'juronc':
            hostname = hostname.split('.')[0]
        tunnel_data = {
            'account': servername,  # for internal tunnel database
            'system': system,
            'hostname': hostname,
            'port': port
        }

        tunnel_communication.j4j_start_tunnel(app_logger, uuidcode,
                                              app_tunnel_url, tunnel_header,
                                              tunnel_data)
        try:
            method = "PUT"
            method_args = {
                "url": filedir + '/.tunnel',
                "headers": unicore_header,
                "data": '{}'.format(port),
                "certificate": cert
            }
            text, status_code, response_header = unicore_communication.request(
                app_logger, uuidcode, method, method_args)
            if status_code != 204:
                app_logger.warning(
                    "uuidcode={} - Could not create .tunnel file. UNICORE/X Response: {} {} {}"
                    .format(uuidcode, text, status_code,
                            remove_secret(response_header)))
                # Handled by the except clause below, like any other failure.
                raise Exception(
                    "{} - Could not create .tunnel file. Throw Exception because of wrong status_code: {}"
                    .format(uuidcode, status_code))
            unicore_header['X-UNICORE-SecuritySession'] = response_header[
                'X-UNICORE-SecuritySession']
        except Exception:
            app_logger.exception(
                "uuidcode={} - Could not create .tunnel file. {} {}".format(
                    uuidcode, method, remove_secret(method_args)))
            cancel_and_cleanup()
            raise Exception(
                "{} - Could not create .tunnel file.".format(uuidcode))
    finally:
        # Undo the temporary Accept override on every exit path so the
        # caller's header dict is never left in the octet-stream state.
        if had_accept:
            unicore_header['Accept'] = previous_accept
        else:
            unicore_header.pop('Accept', None)
Ejemplo n.º 4
0
def get(app_logger, uuidcode, request_headers, unicore_header, app_urls, cert):
    """Poll a UNICORE/X job and report its state to JupyterHub.

    Every polling round sleeps three seconds, renews the access token, asks
    for the job properties (up to ``max_attempts`` tries, two seconds apart)
    and then lists the job directory ``filedir``:

    * job status SUCCESSFUL / ERROR / FAILED / NOT_SUCCESSFUL: ``stop_job``
      is called and ``("", 530)`` is returned;
    * ``.end`` in the listing: status ``stopped`` is sent to JupyterHub;
    * ``.host`` in the listing: the tunnel is created and status ``running``
      is sent to JupyterHub;
    * otherwise status ``waitforhostname`` is sent and polling continues.

    Args:
        app_logger: logger (``trace``/``debug``/``info``/``warning``/
            ``error``/``exception`` are used).
        uuidcode: identifier that prefixes every log line.
        request_headers: mapping read via ``.get``; keys used here are
            ``servername``, ``accesstoken``, ``expire``, ``tokenurl``,
            ``authorizeurl``, ``refreshtoken``, ``jhubtoken``,
            ``escapedusername``, ``kernelurl``, ``filedir``, ``system``,
            ``port`` and ``spawning``.
        unicore_header: headers for UNICORE/X requests; ``Authorization`` and
            ``X-UNICORE-SecuritySession`` are updated in place.
        app_urls: nested mapping with the ``hub``, ``orchestrator`` and
            ``tunnel`` URL sections.
        cert: passed to every request as ``certificate``.

    Returns:
        ``("", 539)`` when the job properties or the directory listing could
        not be retrieved, ``("", 530)`` when the job has finished or failed,
        ``None`` otherwise (status reported, tunnel creation failed, or an
        unexpected error was logged as "Bugfix required").
    """
    # Number of tries for the job properties request per polling round. The
    # "give up" branches below are taken on the last of these tries.
    max_attempts = 3
    default_error_msg = "Could not start your Job. Please check your configuration. An administrator is informed."
    try:
        full_servername = request_headers.get('servername')
        # Only the part after the first ':' is used as short server name.
        if ':' in full_servername:
            servername = full_servername.split(':')[1]
        else:
            servername = ''
        hub_urls = app_urls.get('hub', {})
        orchestrator_urls = app_urls.get('orchestrator', {})
        kernelurl = request_headers.get('kernelurl')
        filedir = request_headers.get('filedir')
        system = request_headers.get('system')
        jhubtoken = request_headers.get('jhubtoken')
        escapedusername = request_headers.get('escapedusername')

        def set_skip_false():
            # Set the orchestrator's "skip" value for this server to 'False'.
            orchestrator_communication.set_skip(
                app_logger, uuidcode, orchestrator_urls.get('url_skip'),
                full_servername, 'False')

        def send_status(hub_status):
            # Push a status string to JupyterHub.
            hub_communication.status(
                app_logger, uuidcode, hub_urls.get('url_proxy_route'),
                hub_urls.get('url_status'), jhubtoken, hub_status,
                escapedusername, servername)

        def try_stop_job(*extra_args):
            # Stop the job; a failure while stopping is only logged.
            try:
                stop_job(app_logger, uuidcode, servername, system,
                         request_headers, app_urls, *extra_args)
            except Exception:
                app_logger.exception(
                    "uuidcode={} - Could not stop Job. It may still run"
                    .format(uuidcode))

        counter = 0
        children = []
        accesstoken = request_headers.get('accesstoken')
        expire = request_headers.get('expire')
        while True:
            # start with sleep, this function is only called, if .host was not in children
            time.sleep(3)
            # renew token. This may be run for a long time, so the accesstoken can expire
            accesstoken, expire = renew_token(
                app_logger, uuidcode, request_headers.get("tokenurl"),
                request_headers.get("authorizeurl"),
                request_headers.get("refreshtoken"), accesstoken, expire,
                jhubtoken, hub_urls.get('url_proxy_route'),
                hub_urls.get('url_token'), escapedusername, full_servername)
            unicore_header['Authorization'] = 'Bearer {}'.format(accesstoken)

            properties_json = {}
            for attempt in range(max_attempts):
                last_attempt = attempt == max_attempts - 1
                properties_json = {}
                try:
                    method = "GET"
                    method_args = {
                        "url": kernelurl,
                        "headers": unicore_header,
                        "certificate": cert
                    }
                    app_logger.info(
                        "uuidcode={} - Get Properties of UNICORE/X Job {}".
                        format(uuidcode, kernelurl))
                    text, status_code, response_header = unicore_communication.request(
                        app_logger, uuidcode, method, method_args)
                    if status_code == 200:
                        unicore_header[
                            'X-UNICORE-SecuritySession'] = response_header[
                                'X-UNICORE-SecuritySession']
                        properties_json = json.loads(text)
                        if properties_json.get(
                                'status') == 'UNDEFINED' and not last_attempt:
                            app_logger.debug(
                                "uuidcode={} - Received status UNDEFINED. Try again in 2 seconds"
                                .format(uuidcode))
                            time.sleep(2)
                        else:
                            break
                    elif status_code == 404:
                        if not last_attempt:
                            app_logger.debug(
                                "uuidcode={} - Could not get properties. 404 Not found. Sleep for 2 seconds and try again"
                                .format(uuidcode))
                            time.sleep(2)
                        else:
                            set_skip_false()
                            app_logger.error(
                                "uuidcode={} - Could not get properties. 404 Not found. Do nothing and return. {} {} {}"
                                .format(uuidcode, text, status_code,
                                        remove_secret(response_header)))
                            return "", 539
                    elif status_code == 500:
                        if not last_attempt:
                            app_logger.debug(
                                "uuidcode={} - Could not get properties. Sleep for 2 seconds and try again"
                                .format(uuidcode))
                            time.sleep(2)
                        else:
                            app_logger.error(
                                "uuidcode={} - Could not get properties. system: {}"
                                .format(
                                    uuidcode,
                                    request_headers.get(
                                        'system', '<system_unknown>')))
                            app_logger.warning(
                                "uuidcode={} - Could not get properties. UNICORE/X Response: {} {} {}"
                                .format(uuidcode, text, status_code,
                                        remove_secret(response_header)))
                            app_logger.warning(
                                "uuidcode={} - Do not send update to JupyterHub."
                                .format(uuidcode))
                            # If JupyterHub doesn't receive an update for a long time it can stop the job itself.
                            set_skip_false()
                            return "", 539
                    else:
                        app_logger.error(
                            "uuidcode={} - Unknown status_code. Add case for this"
                            .format(uuidcode))
                        if not last_attempt:
                            app_logger.debug(
                                "uuidcode={} - Could not get properties. Sleep for 2 seconds and try again"
                                .format(uuidcode))
                            time.sleep(2)
                        else:
                            app_logger.warning(
                                "uuidcode={} - Could not get properties. UNICORE/X Response: {} {} {}"
                                .format(uuidcode, text, status_code,
                                        remove_secret(response_header)))
                            # Handled by the except clause right below.
                            raise Exception(
                                "{} - Could not get properties. Throw exception because of wrong status_code: {}"
                                .format(uuidcode, status_code))
                except Exception:
                    set_skip_false()
                    app_logger.exception(
                        "uuidcode={} - Could not get properties. Try to stop it {} {}"
                        .format(uuidcode, method, remove_secret(method_args)))
                    app_logger.trace(
                        "uuidcode={} - Call stop_job".format(uuidcode))
                    try_stop_job(
                        True,
                        "Jupyter@JSC backend error. An administrator is informed. Please try again in a few minutes."
                    )
                    return "", 539

            if properties_json.get('status') in [
                    'SUCCESSFUL', 'ERROR', 'FAILED', 'NOT_SUCCESSFUL'
            ]:
                # Job is Finished for UNICORE, so it should be for JupyterHub
                set_skip_false()
                if properties_json.get(
                        'statusMessage') != 'Job was aborted by the user.':
                    app_logger.error(
                        'uuidcode={} - Get: Job is finished or failed - JobStatus: {}. Send Information to JHub.\n{}'
                        .format(uuidcode, properties_json.get('status'),
                                properties_json))
                app_logger.trace(
                    "uuidcode={} - Call stop_job".format(uuidcode))
                # Map the UNICORE status message to a user-facing text where a
                # mapping exists; otherwise use the generic message.
                error_msg = default_error_msg
                try:
                    mem = utils_file_loads.map_error_messages()
                    status_message = properties_json.get('statusMessage')
                    if properties_json.get(
                            'status') == 'FAILED' and status_message in mem:
                        error_msg = mem[status_message]
                    else:
                        app_logger.error(
                            "uuidcode={} - StatusMessage from Failed UNICORE Job not found in /etc/j4j/j4j_mount/j4j_unicore/map_error_messages.json. Please update to have a better user experience"
                            .format(uuidcode))
                except Exception:
                    # The mapping is optional; fall back to the generic text.
                    error_msg = default_error_msg
                try_stop_job(True, error_msg)
                return "", 530

            try:
                method = "GET"
                method_args = {
                    "url": filedir,
                    "headers": unicore_header,
                    "certificate": cert
                }
                text, status_code, response_header = unicore_communication.request(
                    app_logger, uuidcode, method, method_args)
                if status_code == 200:
                    unicore_header[
                        'X-UNICORE-SecuritySession'] = response_header[
                            'X-UNICORE-SecuritySession']
                    # in UNICORE 8 the answer is a bit different
                    listing = json.loads(text)
                    if 'children' in listing:
                        children = listing.get('children', [])
                    elif 'content' in listing:
                        children = list(listing['content'].keys())
                    else:
                        app_logger.warning(
                            "uuidcode={} - Could not find any childrens in {}".
                            format(uuidcode, text))
                        children = []
                elif status_code == 404:
                    set_skip_false()
                    app_logger.warning(
                        "uuidcode={} - Could not get properties. 404 Not found. Do nothing and return. {} {} {}"
                        .format(uuidcode, text, status_code,
                                remove_secret(response_header)))
                    return "", 539
                else:
                    app_logger.warning(
                        "uuidcode={} - Could not get information about filedirectory. UNICORE/X Response: {} {} {}"
                        .format(uuidcode, text, status_code,
                                remove_secret(response_header)))
                    # Handled by the except clause right below.
                    raise Exception(
                        "{} - Could not get information about filedirectory. Throw Exception because of wrong status_code: {}"
                        .format(uuidcode, status_code))
            except Exception:
                counter += 1
                if counter > 10:
                    app_logger.error(
                        "uuidcode={} - Get filelist ({}) failed 10 times over 30 seconds. {} {}"
                        .format(uuidcode, filedir, method,
                                remove_secret(method_args)))
                    app_logger.trace(
                        "uuidcode={} - Call stop_job".format(uuidcode))
                    # NOTE(review): polling continues after this stop_job call
                    # (no return here) - confirm that this is intended.
                    try_stop_job()
                app_logger.info(
                    "uuidcode={} - Get filelist ({}) failed {} time(s)".format(
                        uuidcode, filedir, counter))
                send_status('waitforhostname')
                continue

            if '.end' in children or '/.end' in children:
                # It's not running anymore
                status = 'stopped'
            elif '.host' in children or '/.host' in children:
                # running, build up tunnel
                try:
                    tunnel_utils.create(
                        app_logger, uuidcode,
                        hub_urls.get('url_proxy_route'),
                        app_urls.get('tunnel', {}).get('url_tunnel'),
                        hub_urls.get('url_cancel'), kernelurl, filedir,
                        unicore_header, full_servername, system,
                        request_headers.get('port'), cert, jhubtoken,
                        escapedusername, servername,
                        orchestrator_urls.get('url_hostname'))
                except Exception:
                    set_skip_false()
                    app_logger.exception(
                        "uuidcode={} - Could not create tunnel".format(
                            uuidcode))
                    app_logger.trace(
                        "uuidcode={} - Call stop_job".format(uuidcode))
                    try_stop_job()
                    return
                status = "running"
            else:
                app_logger.info(
                    "uuidcode={} - Update JupyterHub status ({})".format(
                        uuidcode, "waitforhostname"))
                send_status("waitforhostname")
                continue

            app_logger.info(
                "uuidcode={} - Update JupyterHub status ({})".format(
                    uuidcode, status))
            send_status(status)
            if status in ['running', 'stopped'] and request_headers.get(
                    'spawning',
                    'true').lower() == 'true':  # spawning is finished
                app_logger.trace(
                    'uuidcode={} - Tell J4J_Orchestrator that the spawning is done'
                    .format(uuidcode))
                try:
                    orchestrator_communication.set_spawning(
                        app_logger, uuidcode,
                        orchestrator_urls.get('url_spawning'),
                        full_servername, 'False')
                except Exception:
                    app_logger.exception(
                        "uuidcode={} - Could not set spawning to false in J4J_Orchestrator database for {}"
                        .format(uuidcode, full_servername))
            set_skip_false()
            return
    except Exception:
        # Last line of defence: anything unexpected is logged, not raised.
        app_logger.exception("uuidcode={} - Bugfix required".format(uuidcode))
Ejemplo n.º 5
0
def copy_log(app_logger, uuidcode, unicore_header, filedir, kernelurl, cert):
    """Download every file of a UNICORE/X job directory to local disk.

    A directory ``/etc/j4j/j4j_mount/jobs/<last part of kernelurl>_<timestamp>``
    is created (with a random suffix if it already exists), the listing of
    ``filedir`` is requested, and each listed file is fetched and written
    into that directory.

    Args:
        app_logger: logger used for debug/info/warning/exception messages.
        uuidcode: identifier that prefixes every log line.
        unicore_header: headers for UNICORE/X requests;
            ``X-UNICORE-SecuritySession`` is updated in place and ``Accept``
            is temporarily set to ``application/octet-stream``.
        filedir: URL of the job's file directory.
        kernelurl: URL of the job; its last path segment names the local
            directory.
        cert: passed to every request as ``certificate``.

    Returns:
        The stripped content of the ``.host`` file, or ``""`` if that file
        was not downloaded. ``None`` if no local directory could be chosen or
        the file listing could not be retrieved.
    """
    app_logger.debug("uuidcode={} - Copy Log from {}".format(
        uuidcode, kernelurl))
    # in this directory we will write the complete log from the started server.
    directory = '/etc/j4j/j4j_mount/jobs/{}_{}'.format(
        kernelurl.split('/')[-1],
        datetime.datetime.today().strftime('%Y_%m_%d-%H_%M_%S'))
    for _ in range(10):
        if os.path.exists(directory):
            directory = directory + '_' + uuid.uuid4().hex
        if not os.path.exists(directory):
            os.makedirs(directory)
            break
    else:
        # Ten tries without finding an unused name.
        app_logger.warning(
            "uuidcode={} - Could not find a directory to save files".
            format(uuidcode))
        return
    app_logger.debug("uuidcode={} - Copy Log to {}".format(
        uuidcode, directory))
    # Get children list
    try:
        app_logger.info(
            "uuidcode={} - Get list of files of UNICORE/X Job".format(
                uuidcode))
        text, status_code, response_header = unicore_communication.request(
            app_logger, uuidcode, "GET", {
                "url": filedir,
                "headers": unicore_header,
                "certificate": cert
            })
        if status_code != 200:
            app_logger.warning(
                "uuidcode={} - Could not save files from {}. Response from UNICORE: {} {} {}"
                .format(uuidcode, kernelurl, text, status_code,
                        remove_secret(response_header)))
            return
        # in UNICORE 8 the answer is a bit different
        listing = json.loads(text)
        if 'children' in listing:
            children = listing.get('children', [])
        elif 'content' in listing:
            children = list(listing['content'].keys())
        else:
            app_logger.warning(
                "uuidcode={} - Could not find any childrens in {}".format(
                    uuidcode, text))
            children = []
        unicore_header['X-UNICORE-SecuritySession'] = response_header[
            'X-UNICORE-SecuritySession']
    except Exception:
        app_logger.exception(
            "uuidcode={} - Could not save files from {}".format(
                uuidcode, kernelurl))
        return

    # For the file input we need another Accept in the header, save the old one
    hostname = ""
    had_accept = 'Accept' in unicore_header
    previous_accept = unicore_header.get('Accept')
    unicore_header['Accept'] = 'application/octet-stream'
    app_logger.info("uuidcode={} - Save files in directory {}".format(
        uuidcode, directory))
    try:
        for child in children:
            try:
                content, status_code, response_header = unicore_communication.request(
                    app_logger, uuidcode, "GET", {
                        "url": filedir + '/' + child,
                        "headers": unicore_header,
                        "certificate": cert,
                        "return_content": True
                    })
                if status_code != 200:
                    app_logger.warning(
                        "uuidcode={} - Could not save file {} from {}. Try next. Response from UNICORE: {} {} {}"
                        .format(uuidcode, child, kernelurl, content,
                                status_code, remove_secret(response_header)))
                    continue
                # Write the raw bytes. str(bytes) would store the "b'...'"
                # representation with escaped newlines instead of the content.
                if isinstance(content, bytes):
                    raw = content
                else:
                    raw = content.encode("utf-8")
                with open(directory + '/' + child, 'wb') as f:
                    f.write(raw)
                if child == ".host" or child == "/.host":
                    hostname = content.strip()
            except Exception:
                app_logger.exception(
                    "uuidcode={} - Could not save file {} from {}".format(
                        uuidcode, child, kernelurl))
                # Stop at the first unexpected error; remaining files are
                # not fetched.
                break
    finally:
        # Put the caller's Accept header back, whatever happened above.
        if had_accept:
            unicore_header['Accept'] = previous_accept
        else:
            unicore_header.pop('Accept', None)
    app_logger.debug("uuidcode={} - Log from {} to {} copied".format(
        uuidcode, kernelurl, directory))
    return hostname
Ejemplo n.º 6
0
    def post(self):
        """Submit a new UNICORE/X job for a spawn request.

        Reads the tracking id, authentication and user information from the
        request headers and the job configuration from the JSON body, then:

        1. validates the internal token,
        2. creates the UNICORE header,
        3. creates the job input files,
        4. builds the job description (UNICORE 8 or legacy),
        5. submits the job to ``<url>/jobs``,
        6. polls the job properties (up to 5 attempts, 2 seconds apart),
        7. asks for the URL of the job's file directory.

        Whenever one of the steps 2-7 fails, ``stop_job`` is called
        (best effort, failures are only logged) before returning.

        Returns:
            ``("", 201, headers)`` on success, where ``headers`` carries
            ``kernelurl``, ``filedir`` and ``X-UNICORE-SecuritySession``.
            ``("", 200)`` if the UNICORE header could not be created.
            ``("", 534)`` if the input files could not be created.
            ``("", 539)`` if submitting the job, reading its properties or
            reading its file directory failed.
            ``None`` (implicitly) if an unexpected exception reached the
            outermost handler; the exception is logged there.
        """
        try:
            # Track actions through different webservices.
            uuidcode = request.headers.get('uuidcode', '<no uuidcode>')
            app.log.info("uuidcode={} - Spawn Server".format(uuidcode))
            app.log.trace("uuidcode={} - Headers: {}".format(
                uuidcode, request.headers))
            app.log.trace("uuidcode={} - Json: {}".format(
                uuidcode, request.json))

            # Check for J4J intern token
            validate_auth(app.log, uuidcode,
                          request.headers.get('Intern-Authorization'))

            servername = request.headers.get('servername')
            # Create header for unicore job
            try:
                unicore_header, accesstoken, expire = unicore_utils.create_header(
                    app.log,  # @UnusedVariable
                    uuidcode,
                    request.headers,
                    app.urls.get('hub', {}).get('url_proxy_route'),
                    app.urls.get('hub', {}).get('url_token'),
                    request.headers.get('escapedusername'),
                    servername)
            except (SpawnException, Exception) as e:
                # Only SpawnException messages are meant to be shown to the user.
                if isinstance(e, SpawnException):
                    err_msg = str(e)
                else:
                    err_msg = "Unknown Error. An administrator is informed. Please try again in a few minutes"
                app.log.exception(
                    "uuidcode={} - Could not create header for UNICORE/X Job. {} {}"
                    .format(uuidcode, remove_secret(request.json),
                            app.urls.get('tunnel', {}).get('url_remote')))
                app.log.trace("uuidcode={} - Call stop_job".format(uuidcode))
                try:
                    stop_job(app.log, uuidcode, servername,
                             request.json.get('system'), request.headers,
                             app.urls, True, err_msg, False)
                except:
                    # Best effort: a failing cleanup must not hide the original error.
                    app.log.exception(
                        "uuidcode={} - Could not stop Job. It may still run".
                        format(uuidcode))
                # Return positive status: Administrator is informed and there is nothing we can do here otherwise.
                return "", 200

            # Create input files for the job. A working J4J_tunnel webservice is required
            try:
                unicore_input = unicore_utils.create_inputs(
                    app.log, uuidcode, request.json,
                    request.headers.get('project'),
                    app.urls.get('tunnel', {}).get('url_remote'),
                    request.headers.get('account'))
            except (SpawnException, Exception) as e:
                if isinstance(e, SpawnException):
                    err_msg = str(e)
                else:
                    err_msg = "Unknown Error. An administrator is informed. Please try again in a few minutes."
                app.log.exception(
                    "uuidcode={} - Could not create input files for UNICORE/X Job. {} {}"
                    .format(uuidcode, remove_secret(request.json),
                            app.urls.get('tunnel', {}).get('url_remote')))
                app.log.trace("uuidcode={} - Call stop_job".format(uuidcode))
                try:
                    stop_job(app.log, uuidcode, servername,
                             request.json.get('system'), request.headers,
                             app.urls, True, err_msg, False)
                except:
                    app.log.exception(
                        "uuidcode={} - Could not stop Job. It may still run".
                        format(uuidcode))
                return "", 534

            # Create Job description
            unicore_file = utils_file_loads.get_unicorex()
            if unicore_file.get(request.json.get('system').upper(),
                                {}).get("UNICORE8", False):
                unicore_json = unicore_utils.create_unicore8_job(
                    app.log, uuidcode, request.json,
                    request.headers.get('Project'), unicore_input,
                    request.headers.get('escapedusername'))
            else:
                unicore_json = unicore_utils.create_job(
                    app.log, uuidcode, request.json,
                    request.headers.get('Project'), unicore_input)

            # Get URL and certificate to communicate with UNICORE/X
            app.log.trace(
                "uuidcode={} - FileLoad: UNICORE/X url".format(uuidcode))
            unicorex = utils_file_loads.get_unicorex()
            url = unicorex.get(request.json.get('system', ''), {}).get(
                'link',
                '<no_url_found_for_{}>'.format(request.json.get('system')))
            app.log.trace(
                "uuidcode={} - FileLoad: UNICORE/X url Result: {}".format(
                    uuidcode, url))
            cert = unicorex.get(request.json.get('system', ''),
                                {}).get('certificate', False)
            app.log.trace(
                "uuidcode={} - FileLoad: UNICORE/X certificate path Result: {}"
                .format(uuidcode, cert))

            # Submit Job. It will not be started, because of unicore_json['haveClientStageIn']='true'
            kernelurl = ""
            # Bind both names before entering the try block: the handler below
            # logs them, and hub_communication.status() may raise before the
            # real request arguments have been assembled.
            method = "POST"
            method_args = {}
            try:
                hub_communication.status(
                    app.log, uuidcode,
                    app.urls.get('hub', {}).get('url_proxy_route'),
                    app.urls.get('hub', {}).get('url_status'),
                    request.headers.get('jhubtoken'), 'submitunicorejob',
                    request.headers.get('escapedusername'), servername)
                method_args = {
                    "url": url + "/jobs",
                    "headers": unicore_header,
                    "data": json.dumps(unicore_json),
                    "certificate": cert
                }
                app.log.info("uuidcode={} - Submit UNICORE/X Job to {}".format(
                    uuidcode, url + "/jobs"))
                text, status_code, response_header = unicore_communication.request(
                    app.log, uuidcode, method, method_args)
                if status_code != 201:
                    app.log.warning(
                        "uuidcode={} - Could not submit Job. Response from UNICORE/X: {} {} {}."
                        .format(uuidcode, text, status_code,
                                remove_secret(response_header)))
                    if status_code == 500:
                        app.log.error(
                            "uuidcode={} - UNICORE RESTART REQUIRED!! {}".
                            format(
                                uuidcode,
                                request.json.get('system',
                                                 '<system_unknown>')))
                    elif status_code == 403 or status_code == 432:
                        raise SpawnException(
                            "Invalid token. Please logout and login again.")
                    else:
                        app.log.error(
                            "uuidcode={} - Unexpected status_code. Add case for this status_code."
                            .format(uuidcode))
                    # Reached for 500 and for every unexpected status code.
                    raise SpawnException(
                        "A backend service has to be restarted. An administrator is informed. Please try again in a few minutes."
                    )
                else:
                    # Keep the session header returned by UNICORE/X for the follow-up requests.
                    unicore_header[
                        'X-UNICORE-SecuritySession'] = response_header[
                            'X-UNICORE-SecuritySession']
                    kernelurl = response_header['Location']
            except (SpawnException, Exception) as e:
                if isinstance(e, SpawnException):
                    err_msg = str(e)
                else:
                    err_msg = "Unknown Error. An administrator is informed. Please try again in a few minutes"
                    app.log.exception(
                        "uuidcode={} - User message: {} - Could not submit Job. {} {}"
                        .format(uuidcode, err_msg, method,
                                remove_secret(method_args)))
                app.log.trace("uuidcode={} - Call stop_job".format(uuidcode))
                try:
                    stop_job(app.log, uuidcode, servername,
                             request.json.get('system'), request.headers,
                             app.urls, True, err_msg, False)
                except:
                    app.log.exception(
                        "uuidcode={} - Could not stop Job. It may still run".
                        format(uuidcode))
                return "", 539

            # get properties of job
            # Up to 5 attempts; the loop index decides between retry and giving up.
            for i in range(5):
                properties_json = {}
                try:
                    method = "GET"
                    method_args = {
                        "url": kernelurl,
                        "headers": unicore_header,
                        "certificate": cert
                    }
                    app.log.info(
                        "uuidcode={} - Get Properties of UNICORE/X Job {}".
                        format(uuidcode, kernelurl))
                    text, status_code, response_header = unicore_communication.request(
                        app.log, uuidcode, method, method_args)
                    if status_code != 200:
                        if status_code == 500:
                            app.log.error(
                                "uuidcode={} - UNICORE RESTART REQUIRED!! {}".
                                format(
                                    uuidcode,
                                    request.json.get('system',
                                                     '<system_unknown>')))
                            # A 500 is not retried.
                            raise SpawnException(
                                "A backend service has to be restarted. An administrator is informed. Please try again in a few minutes."
                            )
                        else:
                            app.log.error(
                                "uuidcode={} - Unexpected status_code. Add case for this status_code."
                                .format(uuidcode))
                        if i < 4:
                            app.log.debug(
                                "uuidcode={} - Could not get properties of Job. Try again in 2 seconds"
                                .format(uuidcode))
                            sleep(2)
                        else:
                            # Last attempt failed as well: give up.
                            app.log.warning(
                                "uuidcode={} - Could not get properties of Job. Response from UNICORE/X: {} {} {}."
                                .format(uuidcode, text, status_code,
                                        remove_secret(response_header)))
                            raise Exception(
                                "{} - Could not get properties of Job. Throw exception because of wrong status_code: {}"
                                .format(uuidcode, status_code))
                    else:
                        unicore_header[
                            'X-UNICORE-SecuritySession'] = response_header[
                                'X-UNICORE-SecuritySession']
                        properties_json = json.loads(text)
                        if properties_json.get(
                                'status') == 'UNDEFINED' and i < 4:
                            app.log.debug(
                                "uuidcode={} - Received status UNDEFINED. Try again in 2 seconds"
                                .format(uuidcode))
                            sleep(2)
                        else:
                            # Either a defined status or the last attempt: continue with what we have.
                            break
                except (SpawnException, Exception) as e:
                    if isinstance(e, SpawnException):
                        err_msg = str(e)
                    else:
                        err_msg = "Unknown Error. An administrator is informed. Please try again in a few minutes"
                        app.log.exception(
                            "uuidcode={} - Could not get properties of Job. {} {}"
                            .format(uuidcode, method,
                                    remove_secret(method_args)))
                    app.log.trace(
                        "uuidcode={} - Call stop_job".format(uuidcode))
                    try:
                        stop_job(app.log, uuidcode, servername,
                                 request.json.get('system'), request.headers,
                                 app.urls, True, err_msg)
                    except:
                        app.log.exception(
                            "uuidcode={} - Could not stop Job. It may still run"
                            .format(uuidcode))
                    return "", 539

            # get file directory
            # this will be used in get. Ask it here once and send it to get() afterwards
            filedirectory = ""
            try:
                method = "GET"
                method_args = {
                    "url":
                    properties_json['_links']['workingDirectory']['href'],
                    "headers": unicore_header,
                    "certificate": cert
                }
                app.log.info(
                    "uuidcode={} - Get path of file directory of UNICORE/X Job"
                    .format(uuidcode))
                text, status_code, response_header = unicore_communication.request(
                    app.log, uuidcode, method, method_args)
                if status_code != 200:
                    app.log.error(
                        "uuidcode={} - Unknown status_code. Please add case for this status_code"
                        .format(uuidcode))
                    app.log.warning(
                        "uuidcode={} - Could not get filedirectory. UNICORE/X Response: {} {} {}"
                        .format(uuidcode, text, status_code,
                                remove_secret(response_header)))
                    raise Exception(
                        "{} - Could not get filedirectory. Throw exception because of wrong status_code: {}"
                        .format(uuidcode, status_code))
                else:
                    unicore_header[
                        'X-UNICORE-SecuritySession'] = response_header[
                            'X-UNICORE-SecuritySession']
                    filedirectory = json.loads(text)['_links']['files']['href']
            except (SpawnException, Exception) as e:
                if isinstance(e, SpawnException):
                    err_msg = str(e)
                else:
                    err_msg = "Unknown Error. An administrator is informed. Please try again in a few minutes"
                app.log.exception(
                    "uuidcode={} - Could not get filedirectory. {} {}".format(
                        uuidcode, method, remove_secret(method_args)))
                app.log.trace("uuidcode={} - Call stop_job".format(uuidcode))
                try:
                    stop_job(app.log, uuidcode, servername,
                             request.json.get('system'), request.headers,
                             app.urls, True, err_msg)
                except:
                    app.log.exception(
                        "uuidcode={} - Could not stop Job. It may still run".
                        format(uuidcode))
                return "", 539

            # Success: hand the job URL, file directory URL and session id back to the caller.
            return "", 201, {
                'kernelurl':
                kernelurl,
                'filedir':
                filedirectory,
                'X-UNICORE-SecuritySession':
                unicore_header.get('X-UNICORE-SecuritySession')
            }
        except:
            # Last line of defence: anything not handled above is logged here.
            app.log.exception("Jobs.post failed. Bugfix required")
Ejemplo n.º 7
0
    def get(self):
        try:
            # Track actions through different webservices.
            uuidcode = request.headers.get('uuidcode', '<no uuidcode>')
            app.log.info("uuidcode={} - Get Server Status".format(uuidcode))
            app.log.trace("uuidcode={} - Headers: {}".format(
                uuidcode, request.headers))

            # Check for J4J intern token
            validate_auth(app.log, uuidcode,
                          request.headers.get('intern-authorization'))
            servername = request.headers.get('servername')

            # Create UNICORE header and get certificate
            try:
                unicore_header, accesstoken, expire = unicore_utils.create_header(
                    app.log,  # @UnusedVariable
                    uuidcode,
                    request.headers,
                    app.urls.get('hub', {}).get('url_proxy_route'),
                    app.urls.get('hub', {}).get('url_token'),
                    request.headers.get('escapedusername'),
                    servername)
            except (SpawnException, Exception):
                app.log.exception(
                    "uuidcode={} - Could not Create Header. Token from user {} might be revoked. Do nothing and return."
                    .format(uuidcode, request.headers.get('escapedusername')))
                # Return positive status: Administrator is informed and there is nothing we can do here otherwise.
                return "", 200
            app.log.trace(
                "uuidcode={} - FileLoad: UNICORE/X certificate path".format(
                    uuidcode))
            unicorex = utils_file_loads.get_unicorex()
            cert = unicorex.get(request.headers.get('system', ''),
                                {}).get('certificate', False)
            app.log.trace(
                "uuidcode={} - FileLoad: UNICORE/X certificate path Result: {}"
                .format(uuidcode, cert))

            # Get Properties of kernelurl
            kernelurl = request.headers.get('kernelurl')
            for i in range(5):  # @UnusedVariable
                properties_json = {}
                try:
                    method = "GET"
                    method_args = {
                        "url": kernelurl,
                        "headers": unicore_header,
                        "certificate": cert
                    }
                    app.log.info(
                        "uuidcode={} - Get Properties of UNICORE/X Job {}".
                        format(uuidcode, kernelurl))
                    text, status_code, response_header = unicore_communication.request(
                        app.log, uuidcode, method, method_args)
                    if status_code == 200:
                        unicore_header[
                            'X-UNICORE-SecuritySession'] = response_header[
                                'X-UNICORE-SecuritySession']
                        properties_json = json.loads(text)
                        if properties_json.get(
                                'status') == 'UNDEFINED' and i < 4:
                            app.log.debug(
                                "uuidcode={} - Received status UNDEFINED. Try again in 2 seconds"
                                .format(uuidcode))
                            sleep(2)
                        else:
                            break
                    elif status_code == 404:
                        if i < 4:
                            app.log.debug(
                                "uuidcode={} - Could not get properties. 404 Not found. Sleep for 2 seconds and try again"
                                .format(uuidcode))
                            sleep(2)
                        else:
                            app.log.warning(
                                "uuidcode={} - Could not get properties. 404 Not found. Stop Job and return. {} {} {}"
                                .format(uuidcode, text, status_code,
                                        remove_secret(response_header)))
                            orchestrator_communication.set_skip(
                                app.log, uuidcode,
                                app.urls.get('orchestrator',
                                             {}).get('url_skip'),
                                request.headers.get('servername'), 'False')
                            try:
                                stop_job(app.log, uuidcode, servername,
                                         request.headers.get('system'),
                                         request.headers, app.urls, True, '',
                                         False)
                            except:
                                app.log.exception(
                                    "uuidcode={} - Could not stop Job. It may still run"
                                    .format(uuidcode))
                            return "", 539
                    elif status_code == 500:
                        if i < 4:
                            app.log.debug(
                                "uuidcode={} - Could not get properties. Sleep for 2 seconds and try again"
                                .format(uuidcode))
                            sleep(2)
                        else:
                            app.log.error(
                                "uuidcode={} - UNICORE RESTART REQUIRED!!. system: {}"
                                .format(
                                    uuidcode,
                                    request.headers.get(
                                        'system', '<system_unknown>')))
                            app.log.warning(
                                "uuidcode={} - Could not get properties. UNICORE/X Response: {} {} {}"
                                .format(uuidcode, text, status_code,
                                        remove_secret(response_header)))
                            app.log.warning(
                                "uuidcode={} - Do not send update to JupyterHub."
                                .format(uuidcode))
                            # If JupyterHub don't receives an update for a long time it can stop the job itself.
                            orchestrator_communication.set_skip(
                                app.log, uuidcode,
                                app.urls.get('orchestrator',
                                             {}).get('url_skip'),
                                request.headers.get('servername'), 'False')
                            return "", 539
                    else:
                        app.log.error(
                            "uuidcode={} - Unknown status_code received. Add case for this: {} {}"
                            .format(uuidcode, status_code, text))
                        if i < 4:
                            app.log.debug(
                                "uuidcode={} - Could not get properties. Sleep for 2 seconds and try again"
                                .format(uuidcode))
                            sleep(2)
                        else:
                            app.log.warning(
                                "uuidcode={} - Could not get properties. UNICORE/X Response: {} {} {}"
                                .format(uuidcode, text, status_code,
                                        remove_secret(response_header)))
                            raise Exception(
                                "{} - Could not get properties. Throw exception because of wrong status_code: {}"
                                .format(uuidcode, status_code))
                except:
                    app.log.exception(
                        "uuidcode={} - Could not get properties. JupyterLab will be still running. {} {}"
                        .format(uuidcode, method, remove_secret(method_args)))
                    app.log.warning(
                        "uuidcode={} - Do not send update to JupyterHub.".
                        format(uuidcode))
                    # If JupyterHub don't receives an update for a long time it can stop the job itself.
                    orchestrator_communication.set_skip(
                        app.log, uuidcode,
                        app.urls.get('orchestrator', {}).get('url_skip'),
                        request.headers.get('servername'), 'False')
                    return "", 539

            if properties_json.get('status') in [
                    'SUCCESSFUL', 'ERROR', 'FAILED', 'NOT_SUCCESSFUL'
            ]:
                # Job is Finished for UNICORE, so it should be for JupyterHub
                if request.headers.get('pollspawner',
                                       'false').lower() == 'true':
                    app.log.error(
                        'uuidcode={} - Get (poll spawner): Job is finished or failed - JobStatus: {}. Send Information to JHub. {}'
                        .format(uuidcode, properties_json.get('status'),
                                properties_json))
                    if properties_json.get(
                            'statusMessage', ''
                    ) == "Failed: Execution was not completed (no exit code file found), please check standard error file <stderr>":
                        app.log.error(
                            "uuidcode={} - UNICORE hotfix: do nothing because that's most likely a bug."
                            .format(uuidcode))
                        return "", 200
                else:
                    if not properties_json.get(
                            'statusMessage') == 'Job was aborted by the user.':
                        app.log.error(
                            'uuidcode={} - At starting process: Job is finished or failed - JobStatus: {}. Send Information to JHub. {}'
                            .format(uuidcode, properties_json.get('status'),
                                    properties_json))
                app.log.trace("uuidcode={} - Call stop_job".format(uuidcode))
                orchestrator_communication.set_skip(
                    app.log, uuidcode,
                    app.urls.get('orchestrator', {}).get('url_skip'),
                    request.headers.get('servername'), 'False')
                error_msg = ""
                try:
                    mem = utils_file_loads.map_error_messages()
                    if properties_json.get('status') in [
                            'FAILED'
                    ] and properties_json.get('statusMessage') in mem.keys():
                        error_msg = mem.get(
                            properties_json.get('statusMessage', ''),
                            "Could not start your Job. Please check your configuration. An administrator is informed."
                        )
                    else:
                        for key, value in mem.items():
                            if properties_json.get('statusMessage',
                                                   '').startswith(key):
                                error_msg = value
                        if error_msg == "":
                            if request.headers.get('pollspawner',
                                                   'false').lower() == 'true':
                                app.log.error(
                                    "uuidcode={} - StatusMessage from Failed UNICORE Job not found in /etc/j4j/j4j_mount/j4j_unicore/map_error_messages.json. Please update to have a better user experience"
                                    .format(uuidcode))
                            error_msg = "Could not start your Job. Please check your configuration. An administrator is informed."
                except:
                    error_msg = "Could not start your Job. Please check your configuration. An administrator is informed."
                try:
                    stop_job(app.log, uuidcode, servername,
                             request.headers.get('system'), request.headers,
                             app.urls, True, error_msg)
                except:
                    app.log.exception(
                        "uuidcode={} - Could not stop Job. It may still run".
                        format(uuidcode))
                return "", 530

            # The Job is not finished yet (good)
            # Get Files in the filedir
            children = []
            for i in range(5):  # @UnusedVariable
                try:
                    method = "GET"
                    method_args = {
                        "url": request.headers.get('filedir'),
                        "headers": unicore_header,
                        "certificate": cert
                    }
                    app.log.info(
                        "uuidcode={} - Get list of files of UNICORE/X Job {}".
                        format(uuidcode, kernelurl))
                    text, status_code, response_header = unicore_communication.request(
                        app.log, uuidcode, method, method_args)
                    if status_code == 200:
                        unicore_header[
                            'X-UNICORE-SecuritySession'] = response_header[
                                'X-UNICORE-SecuritySession']
                        # in UNICORE 8 the answer is a bit different
                        children_json = json.loads(text)
                        if 'children' in children_json.keys():
                            children = json.loads(text).get('children', [])
                        elif 'content' in children_json.keys():
                            children = list(
                                json.loads(text).get('content', {}).keys())
                        else:
                            app.log.warning(
                                "uuidcode={} - Could not find any childrens in {}"
                                .format(uuidcode, text))
                            children = []
                        if len(children) == 0 and i < 4:
                            app.log.debug(
                                "uuidcode={} - Received empty children list. Try again in 2 seconds"
                                .format(uuidcode))
                            sleep(2)
                        else:
                            break
                    elif status_code == 404:
                        if i < 4:
                            app.log.debug(
                                "uuidcode={} - Could not get children list. 404 Not found. Try again in 2 seconds."
                                .format(uuidcode))
                            sleep(2)
                        else:
                            app.log.error(
                                "uuidcode={} - Could not get children list. 404 Not found. Do nothing and return. {} {} {}"
                                .format(uuidcode, text, status_code,
                                        remove_secret(response_header)))
                            orchestrator_communication.set_skip(
                                app.log, uuidcode,
                                app.urls.get('orchestrator',
                                             {}).get('url_skip'),
                                request.headers.get('servername'), 'False')
                            return "", 539
                    elif status_code == 500:
                        if i < 4:
                            app.log.debug(
                                "uuidcode={} - Could not get children list. Status Code 500. Try again in 2 seconds."
                                .format(uuidcode))
                            sleep(2)
                        else:
                            app.log.error(
                                "uuidcode={} - UNICORE/X RESTART REQUIRED".
                                format(uuidcode))
                            orchestrator_communication.set_skip(
                                app.log, uuidcode,
                                app.urls.get('orchestrator',
                                             {}).get('url_skip'),
                                request.headers.get('servername'), 'False')
                            return "", 539
                    else:
                        if i < 4:
                            app.log.debug(
                                "uuidcode={} - Could not get children list. Try again in 2 seconds"
                                .format(uuidcode))
                            sleep(2)
                        else:
                            app.log.error(
                                "uuidcode={} - Unknown status code. Add case for this: {} {}"
                                .format(status_code, text))
                            app.log.error(
                                "uuidcode={} - Could not get children list. Do nothing and return. UNICORE/X Response: {} {} {}"
                                .format(uuidcode, text, status_code,
                                        remove_secret(response_header)))
                            orchestrator_communication.set_skip(
                                app.log, uuidcode,
                                app.urls.get('orchestrator',
                                             {}).get('url_skip'),
                                request.headers.get('servername'), 'False')
                            return "", 539
                except:
                    app.log.error(
                        "uuidcode={} - UNICORE/X RESTART REQUIRED".format(
                            uuidcode))
                    app.log.exception(
                        "uuidcode={} - Could not get children list. {} {}".
                        format(uuidcode, method, remove_secret(method_args)))
                    orchestrator_communication.set_skip(
                        app.log, uuidcode,
                        app.urls.get('orchestrator', {}).get('url_skip'),
                        request.headers.get('servername'), 'False')
                    return "", 539

            # get the 'real' status of the job from the files in the working_directory
            # 'real' means: We don't care about Queued, ready, running or something. We just want to know: Is it bad (failed or cancelled) or good (running or spawning)
            status = ''
            if properties_json.get('status') in [
                    'QUEUED', 'READY', 'RUNNING', 'STAGINGIN'
            ]:
                if '.end' in children or '/.end' in children:
                    # It's not running anymore
                    status = 'stopped'
                elif '.tunnel' in children or '/.tunnel' in children:
                    # It's running and tunnel is up
                    status = 'running'
                elif '.host' in children or '/.host' in children:
                    if request.headers.get('pollspawner',
                                           'false').lower() == 'true':
                        # If there's an error when collecting the children list it may happen, that we would try to create a tunnel for a server that's already running for a long time
                        app.log.error(
                            'uuidcode={} - Poll Spawner wants to create tunnel. Stop it. Children list: {}'
                            .format(uuidcode, children))
                        status = 'running'
                    else:
                        # build up tunnel
                        try:
                            tunnel_utils.create(
                                app.log, uuidcode,
                                app.urls.get('hub', {}).get('url_proxy_route'),
                                app.urls.get('tunnel', {}).get('url_tunnel'),
                                app.urls.get('hub',
                                             {}).get('url_cancel'), kernelurl,
                                request.headers.get('filedir'), unicore_header,
                                request.headers.get('servername'),
                                request.headers.get('system'),
                                request.headers.get('port'), cert,
                                request.headers.get('jhubtoken'),
                                request.headers.get('escapedusername'),
                                servername)
                        except:
                            app.log.error(
                                "uuidcode={} - Could not create Tunnel. Used Parameters: {} {} {} {} {} {} {} {} {} {}"
                                .format(
                                    uuidcode,
                                    app.urls.get('tunnel',
                                                 {}).get('url_tunnel'),
                                    app.urls.get('hub', {}).get('url_cancel'),
                                    kernelurl, request.headers.get('filedir'),
                                    remove_secret(unicore_header),
                                    request.headers.get('servername'),
                                    request.headers.get('system'),
                                    request.headers.get('port'), cert,
                                    '<secret>'))
                            app.log.trace(
                                "uuidcode={} - Call stop_job".format(uuidcode))
                            orchestrator_communication.set_skip(
                                app.log, uuidcode,
                                app.urls.get('orchestrator',
                                             {}).get('url_skip'),
                                request.headers.get('servername'), 'False')
                            try:
                                stop_job(
                                    app.log, uuidcode, servername,
                                    request.headers.get('system'),
                                    request.headers, app.urls, True,
                                    "Jupyter@JSC internal error. An administrator is informed. Please try again in a few minutes."
                                )
                            except:
                                app.log.exception(
                                    "uuidcode={} - Could not stop Job. It may still run"
                                    .format(uuidcode))
                            return "", 539
                    status = 'running'
                else:
                    if request.headers.get('pollspawner',
                                           'false').lower() == 'true':
                        # If there's an error when collecting the children list it may happen, that we would create a thread to get better information. We just send running and hope for the next run
                        app.log.error(
                            'uuidcode={} - Poll Spawner wants to create get_status thread. Prevent it. Children list: {}'
                            .format(uuidcode, children))
                        status = 'running'
                    else:
                        request_headers = {}
                        for key, value in request.headers.items():
                            if 'Token' in key:
                                key = key.replace('-', '_')
                            request_headers[key.lower()] = value
                        app.log.trace(
                            "uuidcode={} - New Header for Thread: {}".format(
                                uuidcode, request_headers))
                        # no .host in children, let's start a thread which looks for it every second
                        t = Thread(target=jobs_threads.get,
                                   args=(app.log, uuidcode, request_headers,
                                         unicore_header, app.urls, cert))
                        t.start()
                        status = 'waitforhostname'
                app.log.info(
                    "uuidcode={} - Update JupyterHub status ({})".format(
                        uuidcode, status))
                hub_communication.status(
                    app.log, uuidcode,
                    app.urls.get('hub', {}).get('url_proxy_route'),
                    app.urls.get('hub', {}).get('url_status'),
                    request.headers.get('jhubtoken'), status,
                    request.headers.get('escapedusername'), servername)
                if status in ['running', 'stopped'] and request.headers.get(
                        'spawning',
                        'true').lower() == 'true':  # spawning is finished
                    app.log.trace(
                        'uuidcode={} - Tell J4J_Orchestrator that the spawning is done'
                        .format(uuidcode))
                    try:
                        orchestrator_communication.set_spawning(
                            app.log, uuidcode,
                            app.urls.get('orchestrator',
                                         {}).get('url_spawning'),
                            request.headers.get('servername'), 'False')
                    except:
                        app.log.exception(
                            "uuidcode={} - Could not set spawning to false in J4J_Orchestrator database for {}"
                            .format(uuidcode,
                                    request_headers.get('servername')))

            else:
                app.log.error('uuidcode={} - Unknown JobStatus: {}'.format(
                    uuidcode, properties_json.get('status')))
                app.log.trace("uuidcode={} - Call stop_job".format(uuidcode))
                try:
                    stop_job(
                        app.log, uuidcode, servername,
                        request.headers.get('system'), request.headers,
                        app.urls, True,
                        "A backend Service had a problem. An administrator is informed. Please try it again in a few minutes."
                    )
                except:
                    app.log.exception(
                        "uuidcode={} - Could not stop Job. It may still run".
                        format(uuidcode))
            if status != 'waitforhostname':  # no thread was started, so the check is finished
                orchestrator_communication.set_skip(
                    app.log, uuidcode,
                    app.urls.get('orchestrator', {}).get('url_skip'),
                    request.headers.get('servername'), 'False')
        except:
            app.log.exception("Jobs.get failed. Bugfix required")
Ejemplo n.º 8
0
def quota_check(app_logger, uuidcode, app_urls, request_headers, unicore_header, cert, servername):
    """Fetch the job's ``.quota_check.out`` file via UNICORE/X and act on it.

    The file is requested from ``request_headers['filedir'] + '/.quota_check.out'``
    with ``Accept: application/octet-stream``. The ``Accept`` value that was in
    ``unicore_header`` on entry is put back (or the key is removed if there was
    none) on every exit path.

    Args:
        app_logger: Logger used for all messages.
        uuidcode: Identifier that prefixes every log line.
        app_urls: Nested dict of service URLs; ``app_urls['hub']`` supplies
            ``url_proxy_route`` and ``url_cancel`` for the cancel call.
        request_headers: Mapping read for ``filedir``, ``jhubtoken``,
            ``escapedusername`` and ``system``; also forwarded to ``stop_job``.
        unicore_header: Header dict sent to UNICORE/X. Mutated in place:
            ``Accept`` is temporarily overridden and, on a 200 response,
            ``X-UNICORE-SecuritySession`` is refreshed from the response.
        cert: Passed to the request as ``certificate``.
        servername: Forwarded to ``hub_communication.cancel`` and ``stop_job``.

    Returns:
        ``True`` when the file content is ``ok`` or is not recognised.
        ``False`` when the file could not be retrieved (a cancel is sent to
        JupyterHub) or when the content is ``datausage`` / ``inode``
        (``stop_job`` is called with a user-facing message).
    """
    method = "GET"
    # Bound before the try block so the except handler can always log it, even
    # if building the real arguments fails (e.g. no 'filedir' in the headers).
    method_args = {}
    accept = unicore_header.get('Accept', False)
    unicore_header['Accept'] = 'application/octet-stream'
    try:
        method_args = {"url": request_headers.get('filedir')+'/.quota_check.out',
                       "headers": unicore_header,
                       "certificate": cert,
                       "return_content": True}
        content, status_code, response_header = unicore_communication.request(app_logger,
                                                                              uuidcode,
                                                                              method,
                                                                              method_args)
        if status_code != 200:
            app_logger.warning("uuidcode={} - Could not get quota check output. UNICORE/X Response: {} {} {}".format(uuidcode, content, status_code, remove_secret(response_header)))
            raise Exception("{} - Could not get quota check output. Throw exception because of wrong status_code: {}".format(uuidcode, status_code))
        unicore_header['X-UNICORE-SecuritySession'] = response_header['X-UNICORE-SecuritySession']
        quota_result = content.strip()
    except Exception:
        # Any failure while retrieving the file ends the spawn: log it and
        # tell JupyterHub to cancel.
        app_logger.exception("uuidcode={} - Could not get quota check output. {} {}".format(uuidcode, method, remove_secret(method_args)))
        app_logger.warning("uuidcode={} - Send cancel to JupyterHub.".format(uuidcode))
        hub_communication.cancel(app_logger,
                                 uuidcode,
                                 app_urls.get('hub', {}).get('url_proxy_route'),
                                 app_urls.get('hub', {}).get('url_cancel'),
                                 request_headers.get('jhubtoken'),
                                 "Something went wrong. An administrator is informed.",
                                 request_headers.get('escapedusername'),
                                 servername)
        return False
    finally:
        # Undo the temporary Accept override no matter how we leave the block,
        # including when the cancel call above raises.
        if accept:
            unicore_header['Accept'] = accept
        else:
            unicore_header.pop('Accept', None)

    # Verdicts that mean "quota exceeded", mapped to the message shown to the user.
    exceeded_messages = {
        "datausage": "Your disk quota in $HOME is exceeded. Please check it at https://judoor.fz-juelich.de or with this command: \"$ jutil user dataquota\".",
        "inode": "You've got too many inodes in $HOME. Please check it at https://judoor.fz-juelich.de or with this command: \"$ jutil user dataquota\".",
    }
    verdict = quota_result.lower()
    if verdict in exceeded_messages:
        app_logger.info("uuidcode={} - Quota Check for user: Quota exceeded {}".format(uuidcode, quota_result))
        stop_job(app_logger,
                 uuidcode,
                 servername,
                 request_headers.get('system'),
                 request_headers,
                 app_urls,
                 True,
                 exceeded_messages[verdict],
                 True,
                 False)
        return False
    if verdict == "ok":
        app_logger.debug("uuidcode={} - Quota Check for user ok".format(uuidcode))
        return True
    # An unrecognised verdict is logged as an error but still treated as a pass.
    app_logger.error("uuidcode={} - Could not understand the quota result: {}".format(uuidcode, quota_result))
    return True