Example #1
def remove_outdated_dags(cwl_id):
    logging.info(f"""Searching for dags based on cwl_id: {cwl_id}""")
    dags = {}
    for location in list_py_file_paths(DAGS_FOLDER, include_examples=False):
        dag_id = get_rootname(location)
        if cwl_id not in dag_id:
            continue
        dags[dag_id] = {
            "location": location,
            "modified": datetime.fromtimestamp(os.path.getmtime(location))
        }
        logging.info(f"""Found dag_id: {dag_id}, modified: {dags[dag_id]["modified"]}""")
    for dag_id, dag_metadata in sorted(dags.items(), key=lambda i: i[1]["modified"])[:-1]:
        logging.info(f"""Cleaning dag_id: {dag_id}""")
        if len(DagRun.find(dag_id=dag_id, state=State.RUNNING)) == 0:
            try:
                delete_dag.delete_dag(dag_id)
            except Exception as ex:
                logging.error(f"""Failed to delete DAG\n {ex}""")
            for f in [
                dag_metadata["location"],
                os.path.splitext(dag_metadata["location"])[0]+".cwl"
            ]:
                try:
                    logging.info(f"""Deleting DAG file: {f}""")
                    os.remove(f)
                except Exception as ex:
                    logging.error(f"""Failed to delete file {f}\n {ex}""")
        else:
            logging.info("Skipping, DAG has running DagRuns")
Example #2
def load_test_suite(args):
    """
    Loads tests from the provided --suite file.
    Selects tests based on the indices from --range.
    
    Updates tools locations to be absolute. Loads
    jobs and updates all inputs files locations to
    be absolute (unless --relative parameter was set).
    Adds "outputs_folder" to the job, as well as the
    "index" to indicate which test case was used.

    Adds run_id's as keys for easy access and proper
    test identification when receiving results.

    In case we failed to load test case, sets "finished"
    to True and writes reason to "error".
    """

    suite_data = load_yaml(args.suite)
    suite_dir = os.path.dirname(args.suite)
    suite_data_filtered = OrderedDict()                                       # use OrderedDict just to keep it similar to suite_data
    for i in args.range:
        test_data = suite_data[i]
        run_id = str(uuid.uuid4())
        tool_location = get_absolute_path(test_data["tool"], suite_dir)
        logging.info(f"Read test case {i+1} to run {tool_location}")

        job_location = None
        job_data = {}

        if "job" in test_data:
            job_location = get_absolute_path(test_data["job"], suite_dir)
            try:
                if args.relative:                       # skips relative path resolution as well as adding values from the workflow's default inputs
                    job_data = load_yaml(job_location)
                else:
                    job_data = load_job(
                        workflow=tool_location,
                        job=job_location
                    )
            except Exception as ex:
                logging.error(f"Failed to load test case {i+1} to run {tool_location} with {job_location}")
                test_data.update({
                    "error": "Failed to load test case",
                    "finished": True
                })

        job_data["outputs_folder"] = get_dir(os.path.join(args.tmp, run_id))

        test_data.update({
            "job":  job_data,                                                 # already parsed, includes "outputs_folder"
            "tool": tool_location,
            "dag_id": get_rootname(test_data["tool"]),
            "index": i+1,                                                     # to know test case number, 1-based to correspond to --range
            "finished": test_data.get("finished", False)                      # to indicate whether the test was finished or not
        })
        logging.info(f"Successfully loaded test case {i+1} to run {tool_location} with {job_location} as {run_id}")
        suite_data_filtered[run_id] = test_data                               # use "run_id" as a key for fast access when checking results
    return suite_data_filtered
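
A minimal usage sketch, assuming an argparse-style namespace: the attribute names (suite, range, tmp, relative) come from the code above, while the paths and values are invented:

from argparse import Namespace

args = Namespace(
    suite="/data/tests/conformance_tests.yaml",   # hypothetical suite file
    range=[0, 1],                                 # 0-based indices into the suite list
    tmp="/tmp/cwl_outputs",                       # base dir for per-run "outputs_folder"
    relative=False                                # True skips path resolution for job inputs
)

suite_data = load_test_suite(args)
for run_id, test in suite_data.items():
    print(run_id, test["dag_id"], test["index"], test["finished"])
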
Example #3
def remove_outdated_dags(cwl_id, dags_folder):
    """
    Iterates over DAG files from the dags_folder (excluding Airflow examples). Assuming
    that dag_id written inside Python file is equal to its rootname and follows the naming
    rule "cwldid-commitsha", we check if there are any files that have target cwl_id in the
    rootname (aka in the dag_id). For all collected DAGs (based on cwl_id) we save modified
    timestamp and location, then sort them by timestamp excluding the newest one, thus
    forming a list of outdated DAGs for the same cwl_id (the same workflow). Then we iterate
    over the list of outdated DAGs and check whether we can safely remove it (both from DB
    and disk). The only condition when we don't delete outdated DAG is when there is at list
    one DagRun for it.
    """

    logging.info(
        f"Searching for dags based on cwl_id: {cwl_id} in order to remove the old ones"
    )
    dags = {}
    for location in list_py_file_paths(dags_folder, include_examples=False):
        dag_id = get_rootname(location)
        if cwl_id not in dag_id:
            continue
        dags[dag_id] = {
            "location": location,
            "modified": datetime.fromtimestamp(os.path.getmtime(location))
        }
        logging.info(
            f"Found dag_id: {dag_id}, modified: {dags[dag_id]['modified']}")
    for dag_id, dag_metadata in sorted(dags.items(),
                                       key=lambda i: i[1]["modified"])[:-1]:
        logging.info(f"Cleaning dag_id: {dag_id}")
        if len(DagRun.find(dag_id=dag_id, state=State.RUNNING)) == 0:
            try:
                delete_dag.delete_dag(dag_id)
            except Exception as ex:
                logging.error(f"Failed to delete DAG\n {ex}")
            for f in [
                    dag_metadata["location"],
                    os.path.splitext(dag_metadata["location"])[0] + ".cwl"
            ]:
                try:
                    logging.info(f"Deleting DAG file: {f}")
                    os.remove(f)
                except Exception as ex:
                    logging.error(f"Failed to delete file {f}\n {ex}")
        else:
            logging.info("Skipping, DAG has running DagRuns")
Example #4
def load_test_suite(args):
    """
    Loads tests from the provided --suite file.
    Selects tests based on the indices from --range.
    
    Updates tools locations to be absolute, loads
    jobs and updates all inputs files locations to
    be absolute too. Adds "outputs_folder" to the job,
    as well as the "index" to indicate which test case
    was used.

    Adds run_id's as keys for easy access and proper
    test identification when receiving results.
    """

    suite_data = load_yaml(args.suite)
    suite_dir = os.path.dirname(args.suite)
    suite_data_filtered = OrderedDict()                                       # use OrderedDict just to keep it similar to suite_data
    for i in args.range:
        test_data = suite_data[i]
        run_id = str(uuid.uuid4())
        tool_location = get_absolute_path(test_data["tool"], suite_dir)
        if "job" in test_data:
            job_location = get_absolute_path(test_data["job"], suite_dir)    # resolve only when a job is present to avoid KeyError
            job_data = load_job(
                workflow=tool_location,
                job=job_location
            )
        else:
            job_location = None
            job_data = {}
        job_data["outputs_folder"] = get_dir(os.path.join(args.tmp, run_id))

        test_data.update({
            "job":  job_data,                                                 # already parsed, includes "outputs_folder"
            "tool": tool_location,
            "dag_id": get_rootname(test_data["tool"]),
            "index": i+1,                                                     # to know test case number, 1-based to correspond to --range
            "finished": False                                                 # to indicate whether the test was finished or not
        })
        logging.info(f"Load test case {i+1} to run {tool_location} with {job_location} as {run_id}")
        suite_data_filtered[run_id] = test_data                               # use "run_id" as a key for fast access when checking results
    return suite_data_filtered
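
For reference, load_yaml(args.suite) is expected to return a list of test entries keyed by "tool" and, optionally, "job". The field names come from the code; the paths below are invented:

suite_data = [
    {"tool": "workflows/bam-bedgraph-bigwig.cwl",
     "job":  "jobs/bam-bedgraph-bigwig.yaml"},
    {"tool": "workflows/super-enhancer.cwl"}      # no "job": job_data stays {}
]
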
Example #5
def convert_to_workflow(command_line_tool, location=None):
    """
    Converts "command_line_tool" to Workflow trying to keep all
    important elements. If "command_line_tool" is already Workflow,
    doesn't apply any changes. If "location" is not None, dumps
    results to json file.
    """

    if command_line_tool["class"] == "Workflow":
        workflow_tool = command_line_tool
    else:
        workflow_tool = {
            "class": "Workflow",
            "cwlVersion": command_line_tool["cwlVersion"],
            "inputs": [],
            "outputs": []
        }

        for key in ["requirements"]:
            if key in command_line_tool:
                workflow_tool[key] = command_line_tool[key]

        for input_id, input_data in get_items(command_line_tool["inputs"]):
            workflow_input = {
                "id": input_id,
                "type": remove_field_from_dict(
                    input_data["type"], "inputBinding"
                )  # "type" in WorkflowInputParameter cannot have "inputBinding"
            }
            for key in ["secondaryFiles",
                        "default"]:  # TODO: Do I need to copy format?
                if key in input_data:
                    workflow_input[key] = input_data[key]
            workflow_tool["inputs"].append(workflow_input)

        for output_id, output_data in get_items(command_line_tool["outputs"]):
            workflow_output = {
                "id": output_id,
                "type": output_data["type"],
                "outputSource": get_rootname(command_line_tool["id"]) + "/" + output_id
            }
            # TODO: not sure if I need format here
            # for key in ["format"]:
            #     if key in output_data:
            #         workflow_output[key] = output_data[key]
            workflow_tool["outputs"].append(workflow_output)

        workflow_tool["steps"] = [{
            "id":
            get_rootname(command_line_tool["id"]),
            "run":
            command_line_tool,
            "in": [{
                "id": input_id,
                "source": input_id
            } for input_id, _ in get_items(workflow_tool["inputs"])],
            "out": [
                output_id
                for output_id, _ in get_items(workflow_tool["outputs"])
            ]
        }]

    if location is not None:
        dump_json(workflow_tool, location)

    return workflow_tool
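
A usage sketch with a hypothetical minimal CommandLineTool. It assumes the project helpers get_items and remove_field_from_dict tolerate plain string "type" fields, which is not shown in the excerpt:

command_line_tool = {
    "class": "CommandLineTool",
    "cwlVersion": "v1.0",
    "id": "echo.cwl",
    "inputs": [{"id": "message", "type": "string"}],
    "outputs": [{"id": "out", "type": "stdout"}]
}

workflow = convert_to_workflow(command_line_tool)
print(workflow["steps"][0]["id"])              # "echo" (rootname of the tool id)
print([i["id"] for i in workflow["inputs"]])   # ["message"]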