def remove_outdated_dags(cwl_id):
    """
    Removes all but the most recently modified DAG for the given cwl_id,
    both from the DB and from disk, skipping DAGs with running DagRuns.
    """

    logging.info(f"Searching for dags based on cwl_id: {cwl_id}")
    dags = {}
    for location in list_py_file_paths(DAGS_FOLDER, include_examples=False):
        dag_id = get_rootname(location)
        if cwl_id not in dag_id:
            continue
        dags[dag_id] = {
            "location": location,
            "modified": datetime.fromtimestamp(os.path.getmtime(location))
        }
        logging.info(f"Found dag_id: {dag_id}, modified: {dags[dag_id]['modified']}")
    for dag_id, dag_metadata in sorted(dags.items(), key=lambda i: i[1]["modified"])[:-1]:
        logging.info(f"Cleaning dag_id: {dag_id}")
        if len(DagRun.find(dag_id=dag_id, state=State.RUNNING)) == 0:
            try:
                delete_dag.delete_dag(dag_id)
            except Exception as ex:
                logging.error(f"Failed to delete DAG\n {ex}")
            for f in [
                dag_metadata["location"],
                os.path.splitext(dag_metadata["location"])[0] + ".cwl"
            ]:
                try:
                    logging.info(f"Deleting DAG file: {f}")
                    os.remove(f)
                except Exception as ex:
                    logging.error(f"Failed to delete file {f}\n {ex}")
        else:
            logging.info("Skipping, DAG has running DagRuns")
def load_test_suite(args):
    """
    Loads tests from the provided --suite file.
    Selects tests based on the indices from --range.

    Updates tool locations to be absolute. Loads jobs and updates all input
    file locations to be absolute (unless the --relative parameter was set).
    Adds "outputs_folder" to the job, as well as the "index" to indicate
    which test case was used. Adds run_id's as keys for easy access and
    proper test identification when receiving results. If we fail to load
    a test case, sets "finished" to True and writes the reason to "error".
    """

    suite_data = load_yaml(args.suite)
    suite_dir = os.path.dirname(args.suite)
    suite_data_filtered = OrderedDict()  # use OrderedDict just to keep it similar to suite_data
    for i in args.range:
        test_data = suite_data[i]
        run_id = str(uuid.uuid4())
        tool_location = get_absolute_path(test_data["tool"], suite_dir)
        logging.info(f"Read test case {i+1} to run {tool_location}")
        job_location = None
        job_data = {}
        if "job" in test_data:
            job_location = get_absolute_path(test_data["job"], suite_dir)
            try:
                if args.relative:  # skips relative path resolutions as well as adding values from the workflow default inputs
                    job_data = load_yaml(job_location)
                else:
                    job_data = load_job(
                        workflow=tool_location,
                        job=job_location
                    )
            except Exception as ex:
                logging.error(f"Failed to load test case {i+1} to run {tool_location} with {job_location}")
                test_data.update({
                    "error": "Failed to load test case",
                    "finished": True
                })
        job_data["outputs_folder"] = get_dir(os.path.join(args.tmp, run_id))
        test_data.update({
            "job": job_data,                              # already parsed, includes "outputs_folder"
            "tool": tool_location,
            "dag_id": get_rootname(test_data["tool"]),
            "index": i+1,                                 # to know test case number, 1-based to correspond to --range
            "finished": test_data.get("finished", False)  # to indicate whether the test was finished or not
        })
        logging.info(f"Successfully loaded test case {i+1} to run {tool_location} with {job_location} as {run_id}")
        suite_data_filtered[run_id] = test_data           # use "run_id" as a key for fast access when checking results
    return suite_data_filtered
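
# A minimal usage sketch for load_test_suite; the suite path and indices below
# are hypothetical. The function only reads the "suite", "range", "relative"
# and "tmp" attributes, so a plain argparse.Namespace is enough to drive it.
def _example_load_test_suite():
    from argparse import Namespace
    args = Namespace(
        suite="./tests/conformance_tests.yml",  # hypothetical suite file
        range=[0, 1],                           # indices of the test cases to select
        relative=False,                         # resolve job input locations to absolute paths
        tmp="/tmp/cwl_outputs"                  # per-run "outputs_folder" is created under this directory
    )
    for run_id, test_data in load_test_suite(args).items():
        print(run_id, test_data["dag_id"], test_data["index"])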
def remove_outdated_dags(cwl_id, dags_folder):
    """
    Iterates over DAG files from the dags_folder (excluding Airflow examples).
    Assuming that the dag_id written inside a Python file is equal to its
    rootname and follows the naming rule "cwldid-commitsha", we check whether
    any files have the target cwl_id in the rootname (aka in the dag_id).
    For all collected DAGs (based on cwl_id) we save the modified timestamp
    and location, then sort them by timestamp excluding the newest one, thus
    forming a list of outdated DAGs for the same cwl_id (the same workflow).
    Then we iterate over the list of outdated DAGs and check whether each of
    them can be safely removed (both from the DB and from disk). The only
    condition under which we don't delete an outdated DAG is when it has at
    least one running DagRun.
    """

    logging.info(
        f"Searching for dags based on cwl_id: {cwl_id} in order to remove the old ones"
    )
    dags = {}
    for location in list_py_file_paths(dags_folder, include_examples=False):
        dag_id = get_rootname(location)
        if cwl_id not in dag_id:
            continue
        dags[dag_id] = {
            "location": location,
            "modified": datetime.fromtimestamp(os.path.getmtime(location))
        }
        logging.info(
            f"Found dag_id: {dag_id}, modified: {dags[dag_id]['modified']}"
        )
    for dag_id, dag_metadata in sorted(dags.items(), key=lambda i: i[1]["modified"])[:-1]:
        logging.info(f"Cleaning dag_id: {dag_id}")
        if len(DagRun.find(dag_id=dag_id, state=State.RUNNING)) == 0:
            try:
                delete_dag.delete_dag(dag_id)
            except Exception as ex:
                logging.error(f"Failed to delete DAG\n {ex}")
            for f in [
                dag_metadata["location"],
                os.path.splitext(dag_metadata["location"])[0] + ".cwl"
            ]:
                try:
                    logging.info(f"Deleting DAG file: {f}")
                    os.remove(f)
                except Exception as ex:
                    logging.error(f"Failed to delete file {f}\n {ex}")
        else:
            logging.info("Skipping, DAG has running DagRuns")
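
# A minimal usage sketch for remove_outdated_dags; the cwl_id and dags folder
# below are hypothetical. Because DAG files are assumed to be named
# "cwldid-commitsha.py", passing the cwl_id alone matches every deployed
# version of that workflow, and all but the most recent one get removed.
def _example_remove_outdated_dags():
    remove_outdated_dags(
        cwl_id="bam-bedgraph-bigwig",                     # hypothetical workflow id
        dags_folder=os.path.expanduser("~/airflow/dags")  # hypothetical Airflow dags folder
    )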
def load_test_suite(args):
    """
    Loads tests from the provided --suite file.
    Selects tests based on the indices from --range.

    Updates tool locations to be absolute, loads jobs and updates all input
    file locations to be absolute too. Adds "outputs_folder" to the job, as
    well as the "index" to indicate which test case was used. Adds run_id's
    as keys for easy access and proper test identification when receiving
    results.
    """

    suite_data = load_yaml(args.suite)
    suite_dir = os.path.dirname(args.suite)
    suite_data_filtered = OrderedDict()  # use OrderedDict just to keep it similar to suite_data
    for i in args.range:
        test_data = suite_data[i]
        run_id = str(uuid.uuid4())
        tool_location = get_absolute_path(test_data["tool"], suite_dir)
        job_location = None
        job_data = {}
        if "job" in test_data:  # resolve "job" only when present to avoid a KeyError
            job_location = get_absolute_path(test_data["job"], suite_dir)
            job_data = load_job(
                workflow=tool_location,
                job=job_location
            )
        job_data["outputs_folder"] = get_dir(os.path.join(args.tmp, run_id))
        test_data.update({
            "job": job_data,                          # already parsed, includes "outputs_folder"
            "tool": tool_location,
            "dag_id": get_rootname(test_data["tool"]),
            "index": i+1,                             # to know test case number, 1-based to correspond to --range
            "finished": False                         # to indicate whether the test was finished or not
        })
        logging.info(f"Load test case {i+1} to run {tool_location} with {job_location} as {run_id}")
        suite_data_filtered[run_id] = test_data       # use "run_id" as a key for fast access when checking results
    return suite_data_filtered
def convert_to_workflow(command_line_tool, location=None):
    """
    Converts "command_line_tool" to a Workflow trying to keep all important
    elements. If "command_line_tool" is already a Workflow, doesn't apply
    any changes. If "location" is not None, dumps the result to a JSON file.
    """

    if command_line_tool["class"] == "Workflow":
        workflow_tool = command_line_tool
    else:
        workflow_tool = {
            "class": "Workflow",
            "cwlVersion": command_line_tool["cwlVersion"],
            "inputs": [],
            "outputs": []
        }
        for key in ["requirements"]:
            if key in command_line_tool:
                workflow_tool[key] = command_line_tool[key]
        for input_id, input_data in get_items(command_line_tool["inputs"]):
            workflow_input = {
                "id": input_id,
                "type": remove_field_from_dict(
                    input_data["type"],
                    "inputBinding"
                )  # "type" in WorkflowInputParameter cannot have "inputBinding"
            }
            for key in ["secondaryFiles", "default"]:  # TODO: Do I need to copy format?
                if key in input_data:
                    workflow_input[key] = input_data[key]
            workflow_tool["inputs"].append(workflow_input)
        for output_id, output_data in get_items(command_line_tool["outputs"]):
            workflow_output = {
                "id": output_id,
                "type": output_data["type"],
                "outputSource": get_rootname(command_line_tool["id"]) + "/" + output_id
            }
            # TODO: not sure if I need format here
            # for key in ["format"]:
            #     if key in output_data:
            #         workflow_output[key] = output_data[key]
            workflow_tool["outputs"].append(workflow_output)
        workflow_tool["steps"] = [{
            "id": get_rootname(command_line_tool["id"]),
            "run": command_line_tool,
            "in": [
                {"id": input_id, "source": input_id}
                for input_id, _ in get_items(workflow_tool["inputs"])
            ],
            "out": [
                output_id
                for output_id, _ in get_items(workflow_tool["outputs"])
            ]
        }]
    if location is not None:
        dump_json(workflow_tool, location)
    return workflow_tool
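
# A minimal usage sketch for convert_to_workflow on a made-up CommandLineTool
# (real tools would normally come from a parsed CWL document). It assumes the
# module's get_items/remove_field_from_dict helpers accept the list-of-dicts
# inputs/outputs notation shown here. The result is a single-step Workflow
# whose inputs and outputs mirror those of the wrapped tool.
def _example_convert_to_workflow():
    command_line_tool = {
        "class": "CommandLineTool",
        "cwlVersion": "v1.0",
        "id": "sort_tool.cwl",  # rootname "sort_tool" becomes the step id
        "baseCommand": "sort",
        "inputs": [
            {
                "id": "unsorted_file",
                "type": "File",
                "inputBinding": {"position": 1}
            }
        ],
        "outputs": [
            {
                "id": "sorted_file",
                "type": "File",
                "outputBinding": {"glob": "*.sorted"}
            }
        ]
    }
    workflow_tool = convert_to_workflow(command_line_tool)  # no "location", so nothing is dumped to disk
    assert workflow_tool["class"] == "Workflow"
    assert workflow_tool["outputs"][0]["outputSource"] == "sort_tool/sorted_file"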