def create_dags(suite_data, args, dags_folder=None):
    """
    Iterates over "suite_data" and creates new DAGs.

    Tries to include all tools into the workflow before sending it to the API
    server. If the loaded tool is not a Workflow, it is sent unchanged. It's
    safe to not process errors when we failed to add a new DAG: the Airflow
    Scheduler will parse all dags at the end of the next "dag_dir_list_interval"
    from airflow.cfg.

    If args.embed was True, send base64 encoded gzip compressed content of the
    workflow file instead of attaching it.

    In case we failed to load and parse the workflow, sets "finished" to True
    and writes the reason to "error".

    Parameters:
        suite_data:  mapping whose values are per-test dicts with at least
                     "dag_id", "tool" and "index" keys; updated in place on
                     load failure.
        args:        parsed CLI arguments providing "tmp", "api" and "embed".
        dags_folder: accepted for interface compatibility; currently unused
                     in this function.
    """
    # TODO: Do we need to force scheduler to reload DAGs after all DAGs added?
    for test_data in suite_data.values():
        params = {"dag_id": test_data["dag_id"]}
        workflow_path = os.path.join(
            args.tmp,
            os.path.basename(test_data["tool"])
        )
        try:
            embed_all_runs(                                      # will save results to "workflow_path"
                workflow_tool=fast_cwl_load(test_data["tool"]),
                location=workflow_path
            )
        except Exception:
            # Mark only this test case as failed; keep processing the rest.
            logging.error(f"Failed to load test case to run {test_data['tool']}")
            test_data.update({
                "error": "Failed to load test case",
                "finished": True
            })
            continue
        with open(workflow_path, "rb") as input_stream:
            logging.info(f"Add DAG {test_data['dag_id']} from test case {test_data['index']}")
            if args.embed:  # send base64 encoded gzip compressed workflow content that will be embedded into DAG python file
                logging.info(f"Sending base64 encoded gzip compressed content from {workflow_path}")
                r = requests.post(
                    url=urljoin(args.api, "/api/experimental/dags"),
                    params=params,
                    json={"workflow_content": get_compressed(input_stream)}
                )
            else:           # attach workflow as a file
                logging.info(f"Attaching workflow file {workflow_path}")
                r = requests.post(
                    url=urljoin(args.api, "/api/experimental/dags"),
                    params=params,
                    files={"workflow": input_stream}
                )
        # Check if we failed to add a new DAG. One reason to fail - DAG has
        # been already added. It's safe to ignore this error. In case of more
        # serious reasons, they will be caught on the "trigger_dags" step.
        if not r.ok:
            reason = get_api_failure_reason(r)
            logging.info(f"Failed to add DAG {test_data['dag_id']} from test case {test_data['index']} due to \n {reason}")
def export_dag(self, dag_id): """ Checks if DAG python file with the same name has been already exported. If not, checks if exaclty one of "workflow" and "workflow_content" parameters are present in the request. In case of "workflow_content" first we need to load a tool from it and try to convert it to Workflow (what if it was CommandLineTool), then compress it again and write to DAG python file. In case of "workflow", first we need to save attachment, then try to comvert it to Workflow (the same reason as above) and write it to DAG python file. """ dag_path = path.join(DAGS_FOLDER, dag_id + ".py") if path.isfile(dag_path): raise FileExistsError(f"File {dag_path} already exist") if "workflow_content" in (connexion.request.json or []) \ and "workflow" in connexion.request.files: raise ValueError("Only one of the 'workflow' or \ 'workflow_content' parameters can be set") if "workflow_content" in (connexion.request.json or []): # json field might be None, need to take [] as default workflow = get_compressed( convert_to_workflow( # to make sure we are not saving CommandLineTool instead of a Workflow command_line_tool=fast_cwl_load( # using fast_cwl_load is safe here because we deal with the content of a file connexion.request.json["workflow_content"] ) ) ) elif "workflow" in connexion.request.files: workflow = path.join(DAGS_FOLDER, dag_id + ".cwl") self.save_attachment("workflow", workflow) convert_to_workflow( command_line_tool=slow_cwl_load( # safer to use slow_cwl_load, because of the possible confusions with all these renaming. TODO: make it less complicate workflow=workflow, only_tool=True ), location=workflow ) else: raise ValueError("At least one of the 'workflow' or \ 'workflow_content' parameters should be set") with open(dag_path, "w") as output_stream: output_stream.write(DAG_TEMPLATE.format(workflow, dag_id)) return {"dag_id": dag_id, "cwl_path": workflow, "dag_path": dag_path}
def trigger_dags(suite_data, args):
    """
    Triggers all DAGs from "suite_data".

    If failed to trigger a DAG, updates "suite_data" with "error" and sets
    "finished" to True.

    In case --combine was set, we will call an API that will first create the
    new DAG, then clean all previous DAG runs based on the provided run_id and
    dag_id, then remove outdated DAGs for the same workflow (for that dag_id
    should follow the naming rule cwlid-commitsha) and only after that trigger
    the workflow execution. If not only --combine but also --embed was
    provided, send base64 encoded gzip compressed content of the workflow file
    instead of attaching it.

    Parameters:
        suite_data: mapping of run_id -> per-test dict with at least "dag_id",
                    "job", "tool" and "index" keys; updated in place on failure.
        args:       parsed CLI arguments providing "api", "combine", "embed"
                    and "tmp".
    """
    for run_id, test_data in suite_data.items():
        params = {
            "run_id": run_id,
            "dag_id": test_data["dag_id"],
            "conf": json.dumps({"job": test_data["job"]})
        }
        if args.combine:  # use API endpoint that combines both creating, cleaning and triggerring new DAGs
            logging.info(f"Add and trigger DAG {test_data['dag_id']} from test case {test_data['index']} as {run_id}")
            workflow_path = os.path.join(
                args.tmp,
                os.path.basename(test_data["tool"])
            )
            # Guard workflow loading the same way create_dags does: a broken
            # workflow should fail only this test case, not crash the run.
            try:
                embed_all_runs(                                      # will save results to "workflow_path"
                    workflow_tool=fast_cwl_load(test_data["tool"]),
                    location=workflow_path
                )
            except Exception:
                logging.error(f"Failed to load test case to run {test_data['tool']}")
                test_data.update({
                    "error": "Failed to load test case",
                    "finished": True
                })
                continue
            with open(workflow_path, "rb") as input_stream:
                if args.embed:  # send base64 encoded gzip compressed workflow content that will be embedded into DAG python file
                    logging.info(f"Sending base64 encoded gzip compressed content from {workflow_path}")
                    r = requests.post(
                        url=urljoin(args.api, "/api/experimental/dags/dag_runs"),
                        params=params,
                        json={"workflow_content": get_compressed(input_stream)}
                    )
                else:           # attach workflow as a file
                    logging.info(f"Attaching workflow file {workflow_path}")
                    r = requests.post(
                        url=urljoin(args.api, "/api/experimental/dags/dag_runs"),
                        params=params,
                        files={"workflow": input_stream}
                    )
        else:
            logging.info(f"Trigger DAG {test_data['dag_id']} from test case {test_data['index']} as {run_id}")
            r = requests.post(
                url=urljoin(args.api, "/api/experimental/dag_runs"),
                params=params
            )
        if not r.ok:
            reason = get_api_failure_reason(r)
            logging.error(f"Failed to trigger DAG {test_data['dag_id']} from test case {test_data['index']} as {run_id} due to {reason}")
            test_data["error"] = reason
            test_data["finished"] = True
def __init__( self, dag_id, # the id of the DAG workflow, # absolute path to the CWL workflow file or utf-8 string to include base64 encoded zlib compressed utf-8 workflow file content dispatcher=None, # custom job dispatcher. Will be assigned automatically to the same DAG. Default CWLJobDispatcher gatherer=None, # custom job gatherer. Will be assigned automatically to the same DAG. Default CWLJobGatherer *args, **kwargs # see DAG class for additional parameters ): """ Updates kwargs with the required defaults if they were not explicitely provided by user. dispatcher and gatherer are set to CWLJobDispatcher() and CWLJobGatherer() if those were not provided by user. If user sets his own operators for dispatcher and gatherer, "default_args" will not be inherited. User needs to set up proper agruments by himself. Also, dag results will not be posted from the custom dispatcher. """ self.workflow = workflow self.__setup_params(kwargs) super().__init__(dag_id=dag_id, *args, **kwargs) self.workflow_tool = fast_cwl_load( # keeps only the tool (CommentedMap object) workflow=self.workflow, cwl_args=kwargs["default_args"] ["cwl"] # in case user has overwritten some of the default parameters ) self.dispatcher = CWLJobDispatcher( dag= self, # need dag=self otherwise new operator will not get proper default_args task_id="CWLJobDispatcher") if dispatcher is None else dispatcher self.gatherer = CWLJobGatherer( dag= self, # need dag=self otherwise new operator will not get proper default_args task_id="CWLJobGatherer") if gatherer is None else gatherer self.__assemble()