Example #1
0
def _mapped_targets(image_mappings):
    """Return the container-side target of every image mapping.

    A mapping value is either the target itself or a list whose first
    element is the target (optionally followed by the mount mode). Empty
    lists are returned untouched; get_docker_params() rejects them later.
    """
    targets = []
    for value in image_mappings.values():
        if isinstance(value, list) and value:
            targets.append(value[0])
        else:
            targets.append(value)
    return targets


def get_docker_params(image_name, image_url, image_mappings, root_work_dir, job_dir, runtime_options=None):
    """Build docker params.

    Args:
        image_name: stored under "image_name" in the result.
        image_url: stored under "image_url" in the result.
        image_mappings: dict of host path -> target, where target is either a
            string or a list of [target] / [target, mode]. Relative targets
            are resolved against job_dir; the mode defaults to "ro".
            NOTE: this dict is updated in place with the default mappings
            (celery config, datasets config and, on K8S, /etc/hosts and
            /etc/resolv.conf) when they are not already present.
        root_work_dir: directory whose jobs/, tasks/, workers/ and cache/
            sub-directories are mounted (cache with the ":ro" suffix).
        job_dir: used as the container working directory and as the base for
            relative mount targets.
        runtime_options: optional dict of extra runtime options. A "gpus"
            entry is dropped with a warning when HYSDS_GPU_AVAILABLE is
            unset or 0.

    Returns:
        dict with keys image_name, image_url, uid, gid, working_dir, volumes
        (list of (host_path, container_spec) tuples) and runtime_options.

    Raises:
        RuntimeError: if a mapping value is an empty list.
    """

    # get dirs to mount
    root_jobs_dir = os.path.join(root_work_dir, 'jobs')
    root_tasks_dir = os.path.join(root_work_dir, 'tasks')
    root_workers_dir = os.path.join(root_work_dir, 'workers')
    root_cache_dir = os.path.join(root_work_dir, 'cache')

    # docker params dict
    params = {
        "image_name": image_name,
        "image_url": image_url,
        "uid": os.getuid(),
        "gid": os.getgid(),
        "working_dir": job_dir,
        "volumes": [
            ("/var/run/docker.sock", "/var/run/docker.sock"),
            (root_jobs_dir, root_jobs_dir),
            (root_tasks_dir, root_tasks_dir),
            (root_workers_dir, root_workers_dir),
            (root_cache_dir, "{}:ro".format(root_cache_dir)),
        ]
    }

    # add default image mappings; targets given in list form ([target, mode])
    # count as already mapped so the same container path is not mounted twice
    celery_cfg_file = os.environ.get('HYSDS_CELERY_CFG', app.conf.__file__)
    if celery_cfg_file not in image_mappings and "celeryconfig.py" not in _mapped_targets(image_mappings):
        image_mappings[celery_cfg_file] = "celeryconfig.py"
    dsets_cfg_file = os.environ.get('HYSDS_DATASETS_CFG',
                                    os.path.normpath(os.path.join(os.path.dirname(sys.executable),
                                                                  '..', 'etc', 'datasets.json')))
    if dsets_cfg_file not in image_mappings and "datasets.json" not in _mapped_targets(image_mappings):
        image_mappings[dsets_cfg_file] = "datasets.json"

    # if running on k8s add hosts and resolv.conf; create mount directory
    blacklist = app.conf.WORKER_MOUNT_BLACKLIST
    mnt_dir = None
    on_k8s = int(app.conf.get('K8S', 0))
    if on_k8s:
        for f in ("/etc/hosts", "/etc/resolv.conf"):
            if f not in image_mappings and f not in _mapped_targets(image_mappings):
                image_mappings[f] = f
        # the two files above live under /etc, so it must not be blacklisted
        blacklist = [i for i in blacklist if i != "/etc"]
        mnt_dir = mkdtemp(prefix=".container_mounts-", dir=job_dir)

    # add user-defined image mappings
    for host_path, target in list(image_mappings.items()):
        host_path = os.path.expandvars(host_path)
        verify_docker_mount(host_path, blacklist)
        mode = "ro"
        if isinstance(target, list):
            if len(target) > 1:
                target, mode = target[0:2]
            elif len(target) == 1:
                target = target[0]
            else:
                raise RuntimeError("Invalid image mapping: %s:%s" % (host_path, target))
        # relative targets are placed under the job directory
        if target.startswith('/'):
            mnt = target
        else:
            mnt = os.path.join(job_dir, target)
        # on k8s mount the path returned by copy_mount() instead of the original
        if mnt_dir is not None:
            host_path = copy_mount(host_path, mnt_dir)
        params['volumes'].append((host_path, "%s:%s" % (mnt, mode)))

    # add runtime resources
    params['runtime_options'] = dict()
    if runtime_options is None:
        runtime_options = dict()
    for k, v in list(runtime_options.items()):
        # validate we have GPUs
        if k == "gpus" and int(os.environ.get("HYSDS_GPU_AVAILABLE", 0)) == 0:
            logger.warning("Job specified runtime option 'gpus' but no GPUs were detected. Skipping this option.")
            continue
        params['runtime_options'][k] = v

    return params
Example #2
0
File: utils.py Project: hysds/hysds
def _copy_into_triage(src, dst):
    """Copy src to dst, recursing when src is a directory."""
    if os.path.isdir(src):
        shutil.copytree(src, dst)
    else:
        shutil.copy(src, dst)


def _holds_triage_dir(path, triage_dir):
    """Return True if path is the triage directory or one of its parents."""
    path = os.path.normpath(path)
    triage_dir = os.path.normpath(triage_dir)
    return triage_dir == path or triage_dir.startswith(path + os.sep)


def triage(job, ctx):
    """Triage failed job's context and job json as well as _run.sh.

    Creates a triage dataset directory inside the job directory, writes its
    dataset/met json, copies the job's "_*" files, "*.log" files and anything
    matching ctx["_triage_additional_globs"] into it, publishes it with
    publish_dataset() and records the result in "_triaged.json".

    Args:
        job: job dict; reads job["job_info"] ("status", "job_dir", "id" and
            optionally "time_start", "cmd_start", "cmd_end"). "time_start" is
            set in place when missing. The default id format also reads
            job["task_id"].
        ctx: job context dict; honours "_triage_disabled",
            "_triage_id_format" and "_triage_additional_globs".

    Returns:
        True in all non-raising cases (signals run_job() to continue).
    """

    job_info = job["job_info"]

    # set time_start if not defined (job failed prior to setting it)
    if "time_start" not in job_info:
        job_info["time_start"] = "{}Z".format(
            datetime.utcnow().isoformat("T"))

    # default triage id
    default_triage_id_format = "triaged_job-{job_id}_task-{job[task_id]}"
    default_triage_id_regex = "triaged_job-(?P<job_id>.+)_task-(?P<task_id>[-\\w]+)"

    # if exit code of job command is zero, don't triage anything
    exit_code = job_info["status"]
    if exit_code == 0:
        logger.info("Job exited with exit code %s. No need to triage." %
                    exit_code)
        return True

    # disable triage
    if ctx.get("_triage_disabled", False):
        logger.info(
            "Flag _triage_disabled set to True. Not performing triage.")
        return True

    # Check if custom triage id format was provided
    triage_id_format = ctx.get("_triage_id_format", default_triage_id_format)

    # get job info
    job_dir = job_info["job_dir"]
    job_id = job_info["id"]
    logger.info("job id: {}".format(job_id))

    # Check if the job_id is a triaged dataset. If so, let's parse out the job_id
    logger.info("Checking to see if the job_id matches the regex: {}".format(
        default_triage_id_regex))
    match = re.search(default_triage_id_regex, job_id)
    if match:
        logger.info(
            "job_id matches the triage dataset regex. Parsing out job_id")
        parsed_job_id = match.groupdict()["job_id"]
        logger.info("extracted job_id: {}".format(parsed_job_id))
    else:
        logger.info(
            "job_id does not match the triage dataset regex: {}".format(
                default_triage_id_regex))
        parsed_job_id = job_id

    # create triage dataset
    # Attempt to first use triage id format from user, but if there is any problem use the default id format instead
    try:
        triage_id = triage_id_format.format(job_id=parsed_job_id,
                                            job=job,
                                            job_context=ctx)
    except Exception as e:
        logger.warning(
            "Failed to apply custom triage id format because of {}: {}. Falling back to default triage id"
            .format(e.__class__.__name__, e))
        triage_id = default_triage_id_format.format(job_id=parsed_job_id,
                                                    job=job,
                                                    job_context=ctx)
    triage_dir = os.path.join(job_dir, triage_id)
    makedirs(triage_dir)

    # create dataset json
    ds_file = os.path.join(triage_dir, "{}.dataset.json".format(triage_id))
    ds = {
        "version": "v{}".format(hysds.__version__),
        "label": "triage for job {}".format(parsed_job_id),
    }
    if "cmd_start" in job_info:
        ds["starttime"] = job_info["cmd_start"]
    if "cmd_end" in job_info:
        ds["endtime"] = job_info["cmd_end"]
    with open(ds_file, "w") as f:
        json.dump(ds, f, sort_keys=True, indent=2)

    # create met json
    met_file = os.path.join(triage_dir, "{}.met.json".format(triage_id))
    with open(met_file, "w") as f:
        json.dump(job_info, f, sort_keys=True, indent=2)

    # triage job-related files, then log files
    copied = set()
    for pattern in ("_*", "*.log"):
        for f in glob(os.path.join(job_dir, pattern)):
            # never copy the triage directory (or a parent of it) into itself
            if _holds_triage_dir(f, triage_dir):
                continue
            # a path matching both patterns is copied once; a second
            # copytree() onto the existing destination would raise
            if f in copied:
                continue
            copied.add(f)
            _copy_into_triage(f, os.path.join(triage_dir, os.path.basename(f)))

    # triage additional globs
    for g in ctx.get("_triage_additional_globs", []):
        for f in glob(os.path.join(job_dir, g)):
            f = os.path.normpath(f)
            # never copy the triage directory (or a parent of it) into itself
            if _holds_triage_dir(f, triage_dir):
                continue
            dst = os.path.join(triage_dir, os.path.basename(f))
            # keep both copies when the name is already taken
            if os.path.exists(dst):
                dst = "{}.{}Z".format(dst, datetime.utcnow().isoformat("T"))
            try:
                _copy_into_triage(f, dst)
            except Exception as e:
                tb = traceback.format_exc()
                logger.error(
                    "Skipping copying of {}. Got exception: {}\n{}".format(
                        f, str(e), tb))

    # publish
    prod_json = publish_dataset(triage_dir, ds_file, job, ctx)

    # write published triage to file
    pub_triage_file = os.path.join(job_dir, "_triaged.json")
    with open(pub_triage_file, "w") as f:
        json.dump(prod_json, f, indent=2, sort_keys=True)

    # signal run_job() to continue
    return True
Example #3
0
def _copy_into_triage(src, dst):
    """Copy src to dst, recursing when src is a directory."""
    if os.path.isdir(src):
        shutil.copytree(src, dst)
    else:
        shutil.copy(src, dst)


def _holds_triage_dir(path, triage_dir):
    """Return True if path is the triage directory or one of its parents."""
    path = os.path.normpath(path)
    triage_dir = os.path.normpath(triage_dir)
    return triage_dir == path or triage_dir.startswith(path + os.sep)


def triage(job, ctx):
    """Triage failed job's context and job json as well as _run.sh.

    Creates a triage dataset directory inside the job directory, writes its
    dataset/met json, copies the job's '_*' files, '*.log' files and anything
    matching ctx['_triage_additional_globs'] into it, publishes it with
    publish_dataset() and records the result in '_triaged.json'.

    Args:
        job: job dict; reads job['job_info'] ('status', 'job_dir', 'id' and
            optionally 'cmd_start', 'cmd_end'). The default id format also
            reads job['task_id'].
        ctx: job context dict; honours '_triage_disabled',
            '_triage_id_format' and '_triage_additional_globs'.

    Returns:
        True in all non-raising cases (signals run_job() to continue).
    """

    ### default_triage_id_format = "triaged_job-{job[job_info][id]}"
    default_triage_id_format = "triaged_job-{job[job_info][id]}-{job[task_id]}"

    job_info = job['job_info']

    # if exit code of job command is zero, don't triage anything
    exit_code = job_info['status']
    if exit_code == 0:
        logger.info("Job exited with exit code %s. No need to triage." %
                    exit_code)
        return True

    # disable triage
    if ctx.get('_triage_disabled', False):
        logger.info(
            "Flag _triage_disabled set to True. Not performing triage.")
        return True

    # Check if custom triage id format was provided
    triage_id_format = ctx.get('_triage_id_format', default_triage_id_format)

    # get job info
    job_dir = job_info['job_dir']
    job_id = job_info['id']

    # create triage dataset
    # Attempt to first use triage id format from user, but if there is any problem use the default id format instead
    try:
        triage_id = triage_id_format.format(job=job, job_context=ctx)
    except Exception as e:
        logger.warning(
            "Failed to apply custom triage id format because of {}: {}. Falling back to default triage id"
            .format(e.__class__.__name__, e))
        triage_id = default_triage_id_format.format(job=job, job_context=ctx)
    triage_dir = os.path.join(job_dir, triage_id)
    makedirs(triage_dir)

    # create dataset json
    ds_file = os.path.join(triage_dir, '{}.dataset.json'.format(triage_id))
    ds = {
        'version': 'v{}'.format(hysds.__version__),
        'label': 'triage for job {}'.format(job_id),
    }
    if 'cmd_start' in job_info:
        ds['starttime'] = job_info['cmd_start']
    if 'cmd_end' in job_info:
        ds['endtime'] = job_info['cmd_end']
    with open(ds_file, 'w') as f:
        json.dump(ds, f, sort_keys=True, indent=2)

    # create met json
    met_file = os.path.join(triage_dir, '{}.met.json'.format(triage_id))
    with open(met_file, 'w') as f:
        json.dump(job_info, f, sort_keys=True, indent=2)

    # triage job-related files, log files, then additional globs
    patterns = ['_*', '*.log']
    patterns.extend(ctx.get('_triage_additional_globs', []))
    copied = set()
    for pattern in patterns:
        for f in glob(os.path.join(job_dir, pattern)):
            f = os.path.normpath(f)
            # never copy the triage directory (or a parent of it) into itself
            if _holds_triage_dir(f, triage_dir):
                continue
            # a path matched by several patterns is copied once; a second
            # copytree() onto the existing destination would raise
            if f in copied:
                continue
            copied.add(f)
            # directories need copytree(); shutil.copy() raises on them
            _copy_into_triage(f, os.path.join(triage_dir, os.path.basename(f)))

    # publish
    prod_json = publish_dataset(triage_dir, ds_file, job, ctx)

    # write published triage to file
    pub_triage_file = os.path.join(job_dir, '_triaged.json')
    with open(pub_triage_file, 'w') as f:
        json.dump(prod_json, f, indent=2, sort_keys=True)

    # signal run_job() to continue
    return True