Example #1
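A submit_job Celery task from the HySDS orchestrator; judging from the
identifiers, the excerpt appears to come from an earlier revision of
hysds/orchestrator.py, though the module path is inferred rather than shown.
The task validates an incoming job spec, short-circuits duplicates by payload
hash, resolves job-creator functions from the configuration pointed to by
HYSDS_ORCHESTRATOR_CFG, and queues the generated job JSON onto one or more
Celery queues via run_job.apply_async, returning the list of queued task ids.
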
def submit_job(j):
    """Submit HySDS job."""

    # get task_id and orchestrator queue
    task_id = submit_job.request.id
    orch_queue = submit_job.request.delivery_info.get('exchange', 'unknown')

    # get container image name and url
    image_name = j.get('container_image_name', None)
    image_url = j.get('container_image_url', None)
    image_mapping = j.get('container_mappings', None)

    # get hard/soft time limits
    time_limit = j.get('time_limit', None)
    soft_time_limit = j.get('soft_time_limit', None)

    # job dedup enabled?
    dedup = j.get('enable_dedup', True)

    # get priority
    priority = j.get('priority', None)
    if priority is None:
        priority = submit_job.request.delivery_info.get('priority')
        if priority is None:
            priority = 0

    # get tag
    tag = j.get('tag', None)

    # get username
    username = j.get('username', None)

    # default job json
    job = {
        'job_id': task_id,
        'name': task_id,
        'job_info': j,
    }

    # set job type
    if 'job_type' in j:
        match = JOB_TYPE_RE.search(j['job_type'])
        job['type'] = match.group(1) if match else j['job_type']

    # default context
    context = j.get('context', {})

    # get orchestrator configuration
    orch_cfg_file = os.environ.get('HYSDS_ORCHESTRATOR_CFG', None)
    if orch_cfg_file is None:
        error = "Environment variable HYSDS_ORCHESTRATOR_CFG is not set."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            'uuid': job['job_id'],
            'job_id': job['job_id'],
            'payload_id': task_id,
            'status': 'job-failed',
            'job': job,
            'context': context,
            'error': error_info,
            'short_error': get_short_error(error_info),
            'traceback': error_info
        }
        log_job_status(job_status_json)
        raise OrchestratorExecutionError(error, job_status_json)

    #logger.info("HYSDS_ORCHESTRATOR_CFG:%s" % orch_cfg_file)
    if not os.path.exists(orch_cfg_file):
        error = "Orchestrator configuration %s doesn't exist." % orch_cfg_file
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            'uuid': job['job_id'],
            'job_id': job['job_id'],
            'payload_id': task_id,
            'status': 'job-failed',
            'job': job,
            'context': context,
            'error': error_info,
            'short_error': get_short_error(error_info),
            'traceback': error_info
        }
        log_job_status(job_status_json)
        raise OrchestratorExecutionError(error, job_status_json)

    with open(orch_cfg_file) as f:
        orch_cfg = json.load(f)

    # get job creators directory
    job_creators_dir = os.environ.get('HYSDS_JOB_CREATORS_DIR', None)
    if job_creators_dir is None:
        error = "Environment variable HYSDS_JOB_CREATORS_DIR is not set."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            'uuid': job['job_id'],
            'job_id': job['job_id'],
            'payload_id': task_id,
            'status': 'job-failed',
            'job': job,
            'context': context,
            'error': error_info,
            'short_error': get_short_error(error_info),
            'traceback': error_info
        }
        log_job_status(job_status_json)
        raise OrchestratorExecutionError(error, job_status_json)
    #logger.info("HYSDS_JOB_CREATORS_DIR:%s" % job_creators_dir)

    # parse job configurations
    job_cfgs = {}
    for cfg in orch_cfg['configs']:
        job_cfgs[cfg['job_type']] = cfg['job_creators']

    # check that we have info to create jobs
    if 'job_type' not in j:
        error = "Invalid job spec. No 'job_type' specified."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            'uuid': job['job_id'],
            'job_id': job['job_id'],
            'payload_id': task_id,
            'status': 'job-failed',
            'job': job,
            'context': context,
            'error': error_info,
            'short_error': get_short_error(error_info),
            'traceback': error_info
        }
        log_job_status(job_status_json)
        raise OrchestratorExecutionError(error, job_status_json)
    job_type = j['job_type']
    job_queue = j.get('job_queue', None)

    if 'payload' not in j:
        error = "Invalid job spec. No 'payload' specified."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            'uuid': job['job_id'],
            'job_id': job['job_id'],
            'payload_id': task_id,
            'status': 'job-failed',
            'job': job,
            'context': context,
            'error': error_info,
            'short_error': get_short_error(error_info),
            'traceback': error_info
        }
        log_job_status(job_status_json)
        raise OrchestratorExecutionError(error, job_status_json)
    payload = j['payload']
    #logger.info("got job_type: %s" % job_type)
    #logger.info("payload: %s" % payload)

    # set payload hash
    if j.get('payload_hash', None) is None:
        j['payload_hash'] = get_payload_hash(payload)
    payload_hash = j['payload_hash']

    # do dedup
    if dedup is True:
        dj = query_dedup_job(payload_hash)
        if isinstance(dj, dict):
            dedup_msg = "orchestrator found duplicate job %s with status %s" % (
                dj['_id'], dj['status'])
            job_status_json = {
                'uuid': job['job_id'],
                'job_id': job['job_id'],
                'payload_id': task_id,
                'payload_hash': payload_hash,
                'dedup': dedup,
                'dedup_job': dj['_id'],
                'status': 'job-deduped',
                'job': job,
                'context': context,
                'dedup_msg': dedup_msg
            }
            log_job_status(job_status_json)
            return [task_id]

    # if no explicit job or data type defined in orchestrator, add catch-all
    if job_type not in job_cfgs:
        # first check if data product type; if not then assume job type
        match = DATA_TYPE_RE.search(job_type)
        if match:
            return queue_dataset_evaluation(payload)
        else:
            match = JOB_TYPE_RE.search(job_type)
            jt = match.group(1) if match else job_type
            job_cfgs[job_type] = [{
                "job_name": j.get('job_name', jt).replace(":", "__"),
                "function": "utils.get_job_json",
                "job_queues": [jt if job_queue is None else job_queue]
            }]

    # get job json and queue jobs
    results = []
    for jc in job_cfgs[job_type]:
        func = get_function(jc['function'], add_to_sys_path=job_creators_dir)
        argspec = getargspec(func)
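        # call the job creator with (payload, job_type) when its signature
        # accepts a job_type argument; otherwise pass the payload alone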
        try:
            if len(argspec.args) > 1 and 'job_type' in argspec.args:
                match = JOB_TYPE_RE.search(job_type)
                jt = match.group(1) if match else job_type
                job = func(payload, jt)
            else:
                job = func(payload)
        except Exception as e:
            error = "Job creator function %s failed to generate job JSON." % jc[
                'function']
            error_info = ERROR_TMPL.substitute(orch_queue=orch_queue,
                                               error=error)
            job_status_json = {
                'uuid': job['job_id'],
                'job_id': job['job_id'],
                'payload_id': task_id,
                'payload_hash': payload_hash,
                'dedup': dedup,
                'status': 'job-failed',
                'job': {
                    'job_id': task_id,
                    'name': task_id,
                    'job_info': j
                },
                'context': context,
                'error': error_info,
                'short_error': get_short_error(error_info),
                'traceback': traceback.format_exc()
            }
            log_job_status(job_status_json)
            raise OrchestratorExecutionError(error, job_status_json)
        #logger.info("job: %s" % job)

        # set context
        job.setdefault('context', {}).update(context)

        # override hard/soft time limits
        time_limit = jc.get('time_limit', time_limit)
        soft_time_limit = jc.get('soft_time_limit', soft_time_limit)

        # queue jobs
        for queue in jc['job_queues']:
            # copy job
            job_json = copy.deepcopy(job)

            # set job id
            if 'name' in job:
                job_json['job_id'] = get_job_id(job['name'])
            else:
                job_json['job_id'] = get_job_id(jc['job_name'])
                job_json['name'] = job_json['job_id']

            # set container image name and url
            if image_name is not None:
                job_json['container_image_name'] = image_name
            if image_url is not None:
                job_json['container_image_url'] = image_url
            if image_mapping is not None:
                job_json['container_mappings'] = image_mapping

            # set priority
            job_json['priority'] = priority

            # set tag
            if 'tag' not in job_json and tag is not None:
                job_json['tag'] = tag

            # set username
            if 'username' not in job_json and username is not None:
                job_json['username'] = username

            # set job_info
            time_queued = datetime.utcnow()
            job_json['job_info'] = {
                'id': job_json['job_id'],
                'job_queue': queue,
                'time_queued': time_queued.isoformat() + 'Z',
                'time_limit': time_limit,
                'soft_time_limit': soft_time_limit,
                'payload_hash': payload_hash,
                'dedup': dedup,
                'job_payload': {
                    'job_type': job_type,
                    'payload_task_id': task_id,
                }
            }

            # generate celery task id
            job_json['task_id'] = uuid()

            # log queued status
            job_status_json = {
                'uuid': job_json['task_id'],
                'job_id': job_json['job_id'],
                'payload_id': task_id,
                'payload_hash': payload_hash,
                'dedup': dedup,
                'status': 'job-queued',
                'job': job_json
            }
            log_job_status(job_status_json)

            # submit job
            res = run_job.apply_async((job_json, ),
                                      queue=queue,
                                      time_limit=time_limit,
                                      soft_time_limit=soft_time_limit,
                                      priority=priority,
                                      task_id=job_json['task_id'])

            # append result
            results.append(job_json['task_id'])

    return results
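
For reference, a minimal sketch of how a job spec might be handed to this
task. The field values, the job type format, and the jobs_processed queue name
are illustrative assumptions, not taken from the excerpt.

job_spec = {
    "job_type": "job:hello_world",     # parsed with JOB_TYPE_RE (format assumed)
    "job_queue": "hello_world_queue",  # optional; defaults to the parsed job type
    "payload": {"message": "hello"},   # required; hashed for deduplication
    "priority": 5,                     # optional; else delivery_info, else 0
    "enable_dedup": True,              # optional; defaults to True
    "tag": "demo",                     # optional
    "username": "ops",                 # optional
}

# submit_job reads submit_job.request, so it must run as a Celery task rather
# than be called directly:
submit_job.apply_async((job_spec,), queue="jobs_processed")
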
Example #2
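A later revision of the same task. Beyond mechanical reformatting (double
quotes, raise statements without wrapping parentheses), it passes through
container runtime_options, treats a missing dedup record as a normal case by
catching NoDedupJobFoundException, and routes the soft/hard time limits
through ensure_hard_time_limit_gap before queuing.
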
def submit_job(j):
    """Submit HySDS job."""

    # get task_id and orchestrator queue
    task_id = submit_job.request.id
    orch_queue = submit_job.request.delivery_info.get("exchange", "unknown")

    # get container image name and url
    image_name = j.get("container_image_name", None)
    image_url = j.get("container_image_url", None)
    image_mapping = j.get("container_mappings", None)

    # get container runtime options
    runtime_options = j.get("runtime_options", None)

    # get hard/soft time limits
    time_limit = j.get("time_limit", None)
    soft_time_limit = j.get("soft_time_limit", None)

    # job dedup enabled?
    dedup = j.get("enable_dedup", True)

    # get priority
    priority = j.get("priority", None)
    if priority is None:
        priority = submit_job.request.delivery_info.get("priority")
        if priority is None:
            priority = 0

    # get tag
    tag = j.get("tag", None)

    # get username
    username = j.get("username", None)

    # default job json
    job = {
        "job_id": task_id,
        "name": task_id,
        "job_info": j,
    }

    # set job type
    if "job_type" in j:
        match = JOB_TYPE_RE.search(j["job_type"])
        job["type"] = match.group(1) if match else j["job_type"]

    # default context
    context = j.get("context", {})

    # get orchestrator configuration
    orch_cfg_file = os.environ.get("HYSDS_ORCHESTRATOR_CFG", None)
    if orch_cfg_file is None:
        error = "Environment variable HYSDS_ORCHESTRATOR_CFG is not set."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            "uuid": job["job_id"],
            "job_id": job["job_id"],
            "payload_id": task_id,
            "status": "job-failed",
            "job": job,
            "context": context,
            "error": error_info,
            "short_error": get_short_error(error_info),
            "traceback": error_info,
        }
        log_job_status(job_status_json)
        raise OrchestratorExecutionError(error, job_status_json)

    # logger.info("HYSDS_ORCHESTRATOR_CFG:%s" % orch_cfg_file)
    if not os.path.exists(orch_cfg_file):
        error = "Orchestrator configuration %s doesn't exist." % orch_cfg_file
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            "uuid": job["job_id"],
            "job_id": job["job_id"],
            "payload_id": task_id,
            "status": "job-failed",
            "job": job,
            "context": context,
            "error": error_info,
            "short_error": get_short_error(error_info),
            "traceback": error_info,
        }
        log_job_status(job_status_json)
        raise OrchestratorExecutionError(error, job_status_json)

    with open(orch_cfg_file) as f:
        orch_cfg = json.load(f)

    # get job creators directory
    job_creators_dir = os.environ.get("HYSDS_JOB_CREATORS_DIR", None)
    if job_creators_dir is None:
        error = "Environment variable HYSDS_JOB_CREATORS_DIR is not set."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            "uuid": job["job_id"],
            "job_id": job["job_id"],
            "payload_id": task_id,
            "status": "job-failed",
            "job": job,
            "context": context,
            "error": error_info,
            "short_error": get_short_error(error_info),
            "traceback": error_info,
        }
        log_job_status(job_status_json)
        raise OrchestratorExecutionError(error, job_status_json)
    # logger.info("HYSDS_JOB_CREATORS_DIR:%s" % job_creators_dir)

    # parse job configurations
    job_cfgs = {}
    for cfg in orch_cfg["configs"]:
        job_cfgs[cfg["job_type"]] = cfg["job_creators"]

    # check that we have info to create jobs
    if "job_type" not in j:
        error = "Invalid job spec. No 'job_type' specified."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            "uuid": job["job_id"],
            "job_id": job["job_id"],
            "payload_id": task_id,
            "status": "job-failed",
            "job": job,
            "context": context,
            "error": error_info,
            "short_error": get_short_error(error_info),
            "traceback": error_info,
        }
        log_job_status(job_status_json)
        raise OrchestratorExecutionError(error, job_status_json)
    job_type = j["job_type"]
    job_queue = j.get("job_queue", None)

    if "payload" not in j:
        error = "Invalid job spec. No 'payload' specified."
        error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
        job_status_json = {
            "uuid": job["job_id"],
            "job_id": job["job_id"],
            "payload_id": task_id,
            "status": "job-failed",
            "job": job,
            "context": context,
            "error": error_info,
            "short_error": get_short_error(error_info),
            "traceback": error_info,
        }
        log_job_status(job_status_json)
        raise OrchestratorExecutionError(error, job_status_json)
    payload = j["payload"]
    # logger.info("got job_type: %s" % job_type)
    # logger.info("payload: %s" % payload)

    # set payload hash
    if j.get("payload_hash", None) is None:
        j["payload_hash"] = get_payload_hash(payload)
    payload_hash = j["payload_hash"]

    # do dedup
    if dedup is True:
        try:
            dj = query_dedup_job(payload_hash)
        except NoDedupJobFoundException as e:
            logger.info(str(e))
            dj = None
        if isinstance(dj, dict):
            dedup_msg = "orchestrator found duplicate job %s with status %s" % (
                dj["_id"],
                dj["status"],
            )
            job_status_json = {
                "uuid": job["job_id"],
                "job_id": job["job_id"],
                "payload_id": task_id,
                "payload_hash": payload_hash,
                "dedup": dedup,
                "dedup_job": dj["_id"],
                "status": "job-deduped",
                "job": job,
                "context": context,
                "dedup_msg": dedup_msg,
            }
            log_job_status(job_status_json)
            return [task_id]

    # if no explicit job or data type defined in orchestrator, add catch-all
    if job_type not in job_cfgs:
        # first check if data product type; if not then assume job type
        match = DATA_TYPE_RE.search(job_type)
        if match:
            return queue_dataset_evaluation(payload)
        else:
            match = JOB_TYPE_RE.search(job_type)
            jt = match.group(1) if match else job_type
            job_cfgs[job_type] = [
                {
                    "job_name": j.get("job_name", jt).replace(":", "__"),
                    "function": "utils.get_job_json",
                    "job_queues": [jt if job_queue is None else job_queue],
                }
            ]

    # get job json and queue jobs
    results = []
    for jc in job_cfgs[job_type]:
        func = get_function(jc["function"], add_to_sys_path=job_creators_dir)
        argspec = getargspec(func)
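        # call the job creator with (payload, job_type) when its signature
        # accepts a job_type argument; otherwise pass the payload alone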
        try:
            if len(argspec.args) > 1 and "job_type" in argspec.args:
                match = JOB_TYPE_RE.search(job_type)
                jt = match.group(1) if match else job_type
                job = func(payload, jt)
            else:
                job = func(payload)
        except Exception as e:
            error = (
                "Job creator function %s failed to generate job JSON." % jc["function"]
            )
            error_info = ERROR_TMPL.substitute(orch_queue=orch_queue, error=error)
            job_status_json = {
                "uuid": job["job_id"],
                "job_id": job["job_id"],
                "payload_id": task_id,
                "payload_hash": payload_hash,
                "dedup": dedup,
                "status": "job-failed",
                "job": {"job_id": task_id, "name": task_id, "job_info": j},
                "context": context,
                "error": error_info,
                "short_error": get_short_error(error_info),
                "traceback": traceback.format_exc(),
            }
            log_job_status(job_status_json)
            raise OrchestratorExecutionError(error, job_status_json)
        # logger.info("job: %s" % job)

        # set context
        job.setdefault("context", {}).update(context)

        # override hard/soft time limits and ensure gap
        soft_time_limit, time_limit = ensure_hard_time_limit_gap(
            jc.get("soft_time_limit", soft_time_limit), jc.get("time_limit", time_limit)
        )

        # queue jobs
        for queue in jc["job_queues"]:
            # copy job
            job_json = copy.deepcopy(job)

            # set job id
            if "name" in job:
                job_json["job_id"] = get_job_id(job["name"])
            else:
                job_json["job_id"] = get_job_id(jc["job_name"])
                job_json["name"] = job_json["job_id"]

            # set container image name, url, mappings, and runtime options
            if image_name is not None:
                job_json["container_image_name"] = image_name
            if image_url is not None:
                job_json["container_image_url"] = image_url
            if image_mapping is not None:
                job_json["container_mappings"] = image_mapping
            if runtime_options is not None:
                job_json["runtime_options"] = runtime_options

            # set priority
            job_json["priority"] = priority

            # set tag
            if "tag" not in job_json and tag is not None:
                job_json["tag"] = tag

            # set username
            if "username" not in job_json and username is not None:
                job_json["username"] = username

            # set job_info
            time_queued = datetime.utcnow()
            job_json["job_info"] = {
                "id": job_json["job_id"],
                "job_queue": queue,
                "time_queued": time_queued.isoformat() + "Z",
                "time_limit": time_limit,
                "soft_time_limit": soft_time_limit,
                "payload_hash": payload_hash,
                "dedup": dedup,
                "job_payload": {
                    "job_type": job_type,
                    "payload_task_id": task_id,
                },
            }

            # generate celery task id
            job_json["task_id"] = uuid()

            # log queued status
            job_status_json = {
                "uuid": job_json["task_id"],
                "job_id": job_json["job_id"],
                "payload_id": task_id,
                "payload_hash": payload_hash,
                "dedup": dedup,
                "status": "job-queued",
                "job": job_json,
            }
            log_job_status(job_status_json)

            # submit job
            res = run_job.apply_async(
                (job_json,),
                queue=queue,
                time_limit=time_limit,
                soft_time_limit=soft_time_limit,
                priority=priority,
                task_id=job_json["task_id"],
            )

            # append result
            results.append(job_json["task_id"])

    return results
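
Two notes on the second revision. Both versions still use getargspec, which
was deprecated and ultimately removed in Python 3.11; inspect.getfullargspec
is the drop-in replacement. And ensure_hard_time_limit_gap is not shown in the
excerpt; judging from the call site, it takes (soft_time_limit, time_limit)
and returns them adjusted so the hard limit keeps some headroom above the soft
limit. A purely illustrative sketch of such a helper, with the padding value
assumed:

def ensure_hard_time_limit_gap(soft_time_limit, time_limit, gap=300):
    """Illustrative sketch only; the real HySDS helper may differ.
    Guarantee that time_limit exceeds soft_time_limit by at least
    `gap` seconds (the 300-second pad is an assumption)."""
    if soft_time_limit is not None and (
        time_limit is None or time_limit - soft_time_limit < gap
    ):
        time_limit = soft_time_limit + gap
    return soft_time_limit, time_limit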