Code example #1
File: backend.py Project: r78v10a07/cwl-airflow
 def __init__(self):
     get_dir(DAGS_FOLDER)  # make sure the DAGs folder exists
     self.include_examples = False
     # template placeholders: {0} - workflow location, {1} - dag_id, {2} - tmp_folder
     self.dag_template_with_tmp_folder = "#!/usr/bin/env python3\nfrom cwl_airflow import CWLDAG, CWLJobDispatcher, CWLJobGatherer\ndag = CWLDAG(cwl_workflow='{0}', dag_id='{1}', default_args={{'tmp_folder':'{2}'}})\ndag.create()\ndag.add(CWLJobDispatcher(dag=dag), to='top')\ndag.add(CWLJobGatherer(dag=dag), to='bottom')"
     self.wes_state_conversion = {  # map Airflow DagRun states to GA4GH WES run states
         "running": "RUNNING",
         "success": "COMPLETE",
         "failed": "EXECUTOR_ERROR"
     }
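To show how dag_template_with_tmp_folder is meant to be used, here is a minimal rendering sketch. The backend variable, the workflow path, and the output location are illustrative assumptions, not part of the original code:

# Hypothetical usage sketch: "backend" stands for an instance of the class
# whose __init__ is shown above; all paths are made up for the example
dag_content = backend.dag_template_with_tmp_folder.format(
    "/data/workflows/star-index.cwl",  # {0} workflow location
    "star_index",                      # {1} dag_id
    "/tmp/cwl_tmp"                     # {2} tmp_folder
)
with open("/path/to/dags/star_index.py", "w") as output_stream:
    output_stream.write(dag_content)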
Code example #2
File: backend.py Project: NSAPH/cwl-airflow
    def wes_collect_attachments(self, run_id):
        tempdir = tempfile.mkdtemp(
            dir=get_dir(
                path.abspath(
                    conf_get("cwl", "tmp_folder",
                             path.join(AIRFLOW_HOME, "cwl_tmp_folder")))),
            prefix="run_id_" + run_id + "_")
        logging.debug(f"Save all attached files to {tempdir}")
        for k, ls in iterlists(connexion.request.files):
            logging.debug(f"Process attachment parameter {k}")
            if k == "workflow_attachment":
                for v in ls:
                    try:
                        logging.debug(f"Process attached file {v}")
                        sp = v.filename.split("/")
                        fn = []
                        for p in sp:
                            if p not in ("", ".", ".."):  # drop empty and path traversal segments
                                fn.append(secure_filename(p))
                        dest = path.join(tempdir, *fn)
                        if not path.isdir(path.dirname(dest)):
                            get_dir(path.dirname(dest))
                        logging.debug(f"Save {v.filename} to {dest}")
                        v.save(dest)
                    except Exception as err:
                        raise ValueError(
                            f"Failed to process attached file {v}, {err}")
        body = {}
        for k, ls in iterlists(connexion.request.form):
            logging.debug(f"Process form parameter {k}")
            for v in ls:
                try:
                    if not v:
                        continue
                    if k == "workflow_params":
                        job_file = path.join(tempdir, "job.json")
                        with open(job_file, "w") as f:
                            json.dump(json.loads(v), f, indent=4)
                        logging.debug(f"Save job file to {job_file}")
                        loader = Loader(load.jobloaderctx.copy())
                        job_order_object, _ = loader.resolve_ref(
                            job_file, checklinks=False)
                        body[k] = job_order_object
                    else:
                        body[k] = v
                except Exception as err:
                    raise ValueError(
                        f"Failed to process form parameter {k}, {v}, {err}")

        if "workflow_params" not in body or "workflow_url" not in body:
            raise ValueError(
                "Missing 'workflow_params' or 'workflow_url' in submission")

        body["workflow_url"] = path.join(tempdir,
                                         secure_filename(body["workflow_url"]))

        return tempdir, body
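For context, here is a minimal client-side sketch of the multipart request this handler parses. The endpoint URL follows the GA4GH WES convention and is an assumption, as are the file names and parameters; only the form field names ("workflow_params", "workflow_url", "workflow_attachment") come from the code above.

import json
import requests  # assumed HTTP client; any multipart-capable client works

# Hypothetical WES submission; only the field names come from the code above
with open("workflow.cwl", "rb") as workflow_stream:
    response = requests.post(
        "http://localhost:8081/ga4gh/wes/v1/runs",  # assumed endpoint
        data={
            "workflow_params": json.dumps({"threads": 4}),  # saved as job.json
            "workflow_url": "workflow.cwl"  # resolved against tempdir
        },
        files=[("workflow_attachment", ("workflow.cwl", workflow_stream))]
    )
print(response.json())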
Code example #3
def overwrite_deprecated_dag(dag_location, deprecated_dags_folder=None):
    """
    Loads DAG content from "dag_location" file. Searches for "dag.create()" command.
    If not found, we don't need to upgrade this DAG (it's either not from CWL-Airflow,
    or already in a new format). If "deprecated_dags_folder" is not None, copies original
    DAG file there before DAG upgrading. After copying deprecated DAG to the
    "deprecated_dags_folder" updates ".airflowignore" with DAG file basename to exclude
    it from Airflow parsing. Upgraded DAG will always include base64 encoded gzip
    compressed workflow content. In case "workflow_location" is relative path, it will
    be resolved based on the dirname of "dag_location" (useful for tests only, because
    all our old DAGs always have absolute path to the CWL file). Function doesn't backup
    or update the original CWL file.
    TODO: in case more coplicated DAG files that include "default_args", etc, this function
    should be updated to the more complex one.
    """

    with open(dag_location, "r+") as io_stream:  # open for both reading and writing

        dag_content = io_stream.read()

        if not re.search("dag\\.create\\(\\)", dag_content):  # do nothing if it isn't an old-style DAG
            return

        workflow_location = get_absolute_path(  # resolve relative to dirname of "dag_location" (good for tests)
            re.search("(cwl_workflow\\s*=\\s*[\"|'])(.+?)([\"|'])",
                      dag_content).group(2), os.path.dirname(dag_location))

        dag_id = re.search("(dag_id\\s*=\\s*[\"|'])(.+?)([\"|'])",
                           dag_content).group(2)

        compressed_workflow_content = get_compressed(
            fast_cwl_load(
                workflow_location
            )  # no "run" embedding or conversion to Workflow. If the DAG worked, the CWL should be fine too
        )

        if deprecated_dags_folder is not None:  # copy old DAG to the folder with deprecated DAGs, add ".airflowignore"
            get_dir(deprecated_dags_folder)  # create "deprecated_dags_folder" if it doesn't exist
            shutil.copy(dag_location, deprecated_dags_folder)  # copy DAG file
            ignore = os.path.join(deprecated_dags_folder, ".airflowignore")
            with open(ignore, "a") as output_stream:  # add deprecated DAG to ".airflowignore"
                output_stream.write(os.path.basename(dag_location) + "\n")

        io_stream.seek(0)  # rewind "dag_location" file to the beginning
        io_stream.write(
            DAG_TEMPLATE.format(compressed_workflow_content, dag_id))
        io_stream.truncate()  # drop leftover old data if the new content is shorter than the original
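As a quick illustration of the two regular expressions used above, here they are run against a fabricated old-style DAG snippet (the snippet is made up for demonstration):

import re

old_dag_content = (
    "#!/usr/bin/env python3\n"
    "from cwl_airflow import CWLDAG, CWLJobDispatcher, CWLJobGatherer\n"
    "dag = CWLDAG(cwl_workflow='/abs/path/workflow.cwl', dag_id='my_dag')\n"
    "dag.create()\n"
)

workflow_location = re.search(
    "(cwl_workflow\\s*=\\s*[\"|'])(.+?)([\"|'])", old_dag_content).group(2)
dag_id = re.search(
    "(dag_id\\s*=\\s*[\"|'])(.+?)([\"|'])", old_dag_content).group(2)

print(workflow_location)  # /abs/path/workflow.cwl
print(dag_id)             # my_dag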
Code example #4
 def __init__(self, simulated_reports_location=None):
     get_dir(DAGS_FOLDER)
     self.simulation_mode = False  # when set to True, will bypass execution of post_dag_runs and post_dags_dag_runs functions
     self.include_examples = False
     self.dag_template_with_tmp_folder = "#!/usr/bin/env python3\nfrom cwl_airflow import CWLDAG, CWLJobDispatcher, CWLJobGatherer\ndag = CWLDAG(cwl_workflow='{0}', dag_id='{1}', default_args={{'tmp_folder':'{2}'}})\ndag.create()\ndag.add(CWLJobDispatcher(dag=dag), to='top')\ndag.add(CWLJobGatherer(dag=dag), to='bottom')"
     self.wes_state_conversion = {"running": "RUNNING", "success": "COMPLETE", "failed": "EXECUTOR_ERROR"}
     self.validated_dags = {}  # stores dags' content md5 checksums as keys and one of the statuses ["checking", "success", "error"] as values
     if simulated_reports_location is not None:
         try:
             self.suite_data = load_yaml(simulated_reports_location)
             self.simulation_mode = True
             logging.info(f"Running simulation mode from the {simulated_reports_location}")
         except Exception as err:
             logging.error(f"Failed to load simulation data from {simulated_reports_location} \n {err}")
Code example #5
def load_test_suite(args):
    """
    Loads tests from the provided --suite file.
    Selects tests based on the indices from --range.
    
    Updates tools locations to be absolute. Loads
    jobs and updates all inputs files locations to
    be absolute (unless --relative parameter was set).
    Adds "outputs_folder" to the job, as well as the
    "index" to indicate which test case was used.

    Adds run_id's as keys for easy access and proper
    test identification when receiving results.

    In case we failed to load test case, sets "finished"
    to True and writes reason to "error".
    """

    suite_data = load_yaml(args.suite)
    suite_dir = os.path.dirname(args.suite)
    suite_data_filtered = OrderedDict()                                       # use OrderedDict just to keep it similar to suite_data
    for i in args.range:
        test_data = suite_data[i]
        run_id = str(uuid.uuid4())
        tool_location = get_absolute_path(test_data["tool"], suite_dir)
        logging.info(f"Read test case {i+1} to run {tool_location}")

        job_location = None
        job_data = {}

        if "job" in test_data:
            job_location = get_absolute_path(test_data["job"], suite_dir)
            try:
                if args.relative:                       # skips relative path resolutions as well as adding values from the workflow default inputs
                    job_data = load_yaml(job_location)
                else:
                    job_data = load_job(
                        workflow=tool_location,
                        job=job_location
                    )
            except Exception as ex:
                logging.error(f"Failed to load test case {i+1} to run {tool_location} with {job_location}\n{ex}")
                test_data.update({
                    "error": "Failed to load test case",
                    "finished": True
                })

        job_data["outputs_folder"] = get_dir(os.path.join(args.tmp, run_id))

        test_data.update({
            "job":  job_data,                                                 # already parsed, includes "outputs_folder"
            "tool": tool_location,
            "dag_id": get_rootname(test_data["tool"]),
            "index": i+1,                                                     # to know test case number, 1-based to correspond to --range
            "finished": test_data.get("finished", False)                      # to indicate whether the test was finished or not
        })
        logging.info(f"Successfully loaded test case {i+1} to run {tool_location} with {job_location} as {run_id}")
        suite_data_filtered[run_id] = test_data                               # use "run_id" as a key for fast access when checking results
    return suite_data_filtered
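A minimal invocation sketch, assuming an argparse-style namespace; load_test_suite only relies on the attributes shown, and the paths are illustrative:

from argparse import Namespace

# Hypothetical arguments; paths are made up for the example
args = Namespace(
    suite="conformance/test_suite.yaml",
    range=[0, 1, 2],               # 0-based indices into the suite file
    tmp="/tmp/conformance_outputs",
    relative=False                 # resolve job input locations to absolute
)
suite_data = load_test_suite(args)
for run_id, test in suite_data.items():
    print(run_id, test["dag_id"], test["index"], test["finished"])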
Code example #6
def get_temp_folders(task_id, job_data):
    """
    Creates a set of folders required for workflow execution.
    Uses "tmp_folder" from "job_data" as a parent folder.
    """

    step_tmp_folder = get_dir(os.path.join(job_data["tmp_folder"], task_id))

    step_cache_folder = get_dir(
        os.path.join(step_tmp_folder, task_id + "_step_cache"))

    step_outputs_folder = get_dir(
        os.path.join(step_tmp_folder, task_id + "_step_outputs"))

    step_report = os.path.join(step_tmp_folder, task_id + "_step_report.json")

    return step_tmp_folder, step_cache_folder, step_outputs_folder, step_report
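For example, assuming job_data carries a "tmp_folder" (the values below are illustrative):

# Hypothetical input; get_temp_folders only reads "tmp_folder" from job_data
job_data = {"tmp_folder": "/tmp/cwl_tmp/my_dag_run_1"}
tmp, cache, outputs, report = get_temp_folders("CWLJobDispatcher", job_data)
# tmp     -> /tmp/cwl_tmp/my_dag_run_1/CWLJobDispatcher
# cache   -> .../CWLJobDispatcher/CWLJobDispatcher_step_cache
# outputs -> .../CWLJobDispatcher/CWLJobDispatcher_step_outputs
# report  -> .../CWLJobDispatcher/CWLJobDispatcher_step_report.json  (path only, not created)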
Code example #7
    def execute(self, context):
        """
        Loads job Object from the context. Sets "tmp_folder" and "output_folder"
        if they have not been set before in the job. In case "tmp_folder" and/or
        "output_folder" were read from the job and are relative, resolves paths
        relative to the "tmp_folder" and/or "outputs_folder" from "cwl_args".
        Dumps step outputs as a json file into "tmp_folder". Writes to X-Com report
        file location.
        """

        setup_cwl_logger(context["ti"])
        post_status(context)

        # for easy access
        dag_id = context["dag"].dag_id
        workflow = context["dag"].workflow
        run_id = context["run_id"].replace(":", "_").replace("+", "_")  # to make it dumpable by json
        cwl_args = context["dag"].default_args["cwl"]

        # Loads job from dag_run configuration. Sets defaults from "workflow". Fails on missing input files
        job_data = load_job(workflow=workflow,
                            job=context["dag_run"].conf["job"],
                            cwl_args=cwl_args)

        job_data["tmp_folder"] = get_dir(
            get_absolute_path(
                job_data.get(
                    "tmp_folder",
                    mkdtemp(dir=cwl_args["tmp_folder"],
                            prefix=dag_id + "_" + run_id + "_")),
                cwl_args["tmp_folder"]))

        job_data["outputs_folder"] = get_dir(
            get_absolute_path(
                job_data.get(
                    "outputs_folder",
                    os.path.join(cwl_args["outputs_folder"], dag_id, run_id)),
                cwl_args["outputs_folder"]))

        _, _, _, step_report = get_temp_folders(task_id=self.task_id,
                                                job_data=job_data)

        dump_json(job_data, step_report)

        return step_report
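To illustrate the run_id sanitization above: Airflow run_ids typically embed an ISO timestamp, so ":" and "+" are replaced to keep the value safe for file names (the run_id below is a fabricated example):

run_id = "manual__2021-01-01T00:00:00+00:00"  # fabricated Airflow run_id
print(run_id.replace(":", "_").replace("+", "_"))
# manual__2021-01-01T00_00_00_00_00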
Code example #8
def copy_dags(airflow_home, source_folder=None):
    """
    Copies *.py files (dags) from source_folder (default ../../extensions/dags)
    to dags_folder, which is always {airflow_home}/dags
    """

    if source_folder is None:
        source_folder = os.path.join(
            os.path.dirname(os.path.abspath(os.path.join(__file__, "../../"))),
            "extensions/dags",
        )

    target_folder = get_dir(os.path.join(airflow_home, "dags"))
    for root, dirs, files in os.walk(source_folder):
        for filename in files:
            if re.match(".*\\.py$", filename) and filename != "__init__.py":
                if not os.path.isfile(os.path.join(target_folder, filename)):
                    shutil.copy(os.path.join(root, filename), target_folder)
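A usage sketch; copy_dags never overwrites files already present in {airflow_home}/dags, so repeated calls are safe (paths are illustrative):

# Hypothetical paths; copy_dags skips files that already exist in the target
copy_dags("/home/user/airflow")                              # copy bundled DAGs
copy_dags("/home/user/airflow", source_folder="./my_dags")   # copy custom DAGs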
Code example #9
File: conformance.py Project: r78v10a07/cwl-airflow
def load_test_suite(args):
    """
    Loads tests from the provided --suite file.
    Selects tests based on the indices from --range.
    
    Updates tools locations to be absolute, loads
    jobs and updates all inputs files locations to
    be absolute too. Adds "outputs_folder" to the job,
    as well as the "index" to indicate which test case
    was used.

    Adds run_id's as keys for easy access and proper
    test identification when receiving results.
    """

    suite_data = load_yaml(args.suite)
    suite_dir = os.path.dirname(args.suite)
    suite_data_filtered = OrderedDict()                                       # use OrderedDict just to keep it similar to suite_data
    for i in args.range:
        test_data = suite_data[i]
        run_id = str(uuid.uuid4())
        tool_location = get_absolute_path(test_data["tool"], suite_dir)
        job_location = get_absolute_path(test_data["job"], suite_dir)
        if "job" in test_data:
            job_data = load_job(
                workflow=tool_location,
                job=job_location
            )
        else:
            job_data = {}
        job_data["outputs_folder"] = get_dir(os.path.join(args.tmp, run_id))

        test_data.update({
            "job":  job_data,                                                 # already parsed, includes "outputs_folder"
            "tool": tool_location,
            "dag_id": get_rootname(test_data["tool"]),
            "index": i+1,                                                     # to know test case number, 1-based to correspond to --range
            "finished": False                                                 # to indicate whether the test was finished or not
        })
        logging.info(f"Load test case {i+1} to run {tool_location} with {job_location} as {run_id}")
        suite_data_filtered[run_id] = test_data                               # use "run_id" as a key for fast access when checking results
    return suite_data_filtered
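For reference, the shape of a single entry in the returned OrderedDict, keyed by its run_id (all values fabricated for the example):

# Illustrative entry; values are made up
example_entry = {
    "job": {"outputs_folder": "/tmp/conformance/<run_id>"},  # parsed job
    "tool": "/abs/path/bam-bedgraph-bigwig.cwl",
    "dag_id": "bam-bedgraph-bigwig",
    "index": 1,        # 1-based, corresponds to --range
    "finished": False
}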
Code example #10
def get_default_cwl_args(preset_cwl_args=None):
    """
    Returns default arguments required by cwltool's functions with a few
    parameters added and overwritten (required by CWL-Airflow). Defaults
    can be preset through "preset_cwl_args" if provided. All new fields
    from "preset_cwl_args" will be added to the returned results.
    """

    preset_cwl_args = {} if preset_cwl_args is None else deepcopy(
        preset_cwl_args)

    # default arguments required by cwltool
    required_cwl_args = get_default_args()

    # update default arguments required by cwltool with those that were preset by user
    required_cwl_args.update(preset_cwl_args)

    # update default arguments required by cwltool with those that might
    # be overridden by the higher-priority airflow configuration file. If
    # the airflow configuration file doesn't include the corresponding
    # parameters, fall back to those preset by the user, or to the defaults
    required_cwl_args.update({
        # the folders below are created if they don't exist
        "tmp_folder": get_dir(
            conf_get("cwl", "tmp_folder",
                     preset_cwl_args.get("tmp_folder", CWL_TMP_FOLDER))),
        "outputs_folder": get_dir(  # where CWL-Airflow stores outputs if "outputs_folder" is not overwritten in the job
            conf_get("cwl", "outputs_folder",
                     preset_cwl_args.get("outputs_folder", CWL_OUTPUTS_FOLDER))),
        "inputs_folder": get_dir(  # where CWL-Airflow resolves relative locations for input files if the job was loaded from a parsed object
            conf_get("cwl", "inputs_folder",
                     preset_cwl_args.get("inputs_folder", CWL_INPUTS_FOLDER))),
        "pickle_folder": get_dir(  # where CWL-Airflow stores pickled workflows
            conf_get("cwl", "pickle_folder",
                     preset_cwl_args.get("pickle_folder", CWL_PICKLE_FOLDER))),
        "use_container": conf_get(  # execute jobs in docker containers
            "cwl", "use_container",
            preset_cwl_args.get("use_container", CWL_USE_CONTAINER)),
        "no_match_user": conf_get(  # disables passing the current uid to "docker run --user"
            "cwl", "no_match_user",
            preset_cwl_args.get("no_match_user", CWL_NO_MATCH_USER)),
        "skip_schemas": conf_get(  # it looks like this doesn't influence anything in the latest cwltool
            "cwl", "skip_schemas",
            preset_cwl_args.get("skip_schemas", CWL_SKIP_SCHEMAS)),
        "strict": conf_get("cwl", "strict",
                           preset_cwl_args.get("strict", CWL_STRICT)),
        "quiet": conf_get("cwl", "quiet",
                          preset_cwl_args.get("quiet", CWL_QUIET)),
        # even though these can be set through "preset_cwl_args", it's better not to change them
        "rm_tmpdir": preset_cwl_args.get("rm_tmpdir", CWL_RM_TMPDIR),
        "move_outputs": preset_cwl_args.get("move_outputs", CWL_MOVE_OUTPUTS),
        # fails to run without it when creating a workflow from a tool. TODO: ask Peter?
        "enable_dev": preset_cwl_args.get("enable_dev", CWL_ENABLE_DEV)
    })

    return required_cwl_args
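A short sketch of the precedence implemented above: values from the airflow configuration file win over "preset_cwl_args", which wins over the package defaults (the preset values are illustrative):

# Hypothetical presets; any extra keys would be carried through unchanged
cwl_args = get_default_cwl_args({
    "tmp_folder": "/data/cwl_tmp",  # used unless airflow.cfg [cwl] tmp_folder is set
    "quiet": False
})
print(cwl_args["tmp_folder"], cwl_args["quiet"])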