def test_get_compressed_from_binary_stream(location, control_data, reset_position):
    with open(location, "rb") as input_stream:
        input_stream.read(20)  # change position while reading from file
        compressed_data = get_compressed(input_stream, reset_position)
    assert control_data == compressed_data, \
        "Failed to compress data"
def create_dags(suite_data, args, dags_folder=None):
    """
    Iterates over "suite_data" and creates new DAGs. Tries to include all tools
    into the workflow before sending it to the API server. If the loaded tool is
    not a Workflow, it is sent unchanged. It's safe not to process errors when we
    fail to add a new DAG: Airflow Scheduler will parse all DAGs at the end of the
    next "dag_dir_list_interval" from airflow.cfg. If args.embed is True, sends
    base64 encoded gzip compressed content of the workflow file instead of
    attaching it. In case we fail to load and parse the workflow, sets "finished"
    to True and writes the reason to "error".
    """

    # TODO: Do we need to force the scheduler to reload DAGs after all DAGs are added?

    for test_data in suite_data.values():
        params = {"dag_id": test_data["dag_id"]}
        workflow_path = os.path.join(
            args.tmp,
            os.path.basename(test_data["tool"])
        )
        try:
            embed_all_runs(  # will save results to "workflow_path"
                workflow_tool=fast_cwl_load(test_data["tool"]),
                location=workflow_path
            )
        except Exception as ex:
            logging.error(f"Failed to load test case to run {test_data['tool']}")
            test_data.update({
                "error": "Failed to load test case",
                "finished": True
            })
            continue

        with open(workflow_path, "rb") as input_stream:
            logging.info(f"Add DAG {test_data['dag_id']} from test case {test_data['index']}")
            if args.embed:  # send base64 encoded gzip compressed workflow content that will be embedded into the DAG python file
                logging.info(f"Sending base64 encoded gzip compressed content from {workflow_path}")
                r = requests.post(
                    url=urljoin(args.api, "/api/experimental/dags"),
                    params=params,
                    json={"workflow_content": get_compressed(input_stream)}
                )
            else:  # attach workflow as a file
                logging.info(f"Attaching workflow file {workflow_path}")
                r = requests.post(
                    url=urljoin(args.api, "/api/experimental/dags"),
                    params=params,
                    files={"workflow": input_stream}
                )

        # Check if we failed to add the new DAG. One possible reason is that the
        # DAG has already been added; it's safe to ignore that error. More serious
        # problems will be caught on the "trigger_dags" step.

        if not r.ok:
            reason = get_api_failure_reason(r)
            logging.info(f"Failed to add DAG {test_data['dag_id']} from test case {test_data['index']} due to \n {reason}")
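# The functions in this module rely on get_compressed to turn workflow content
# into the base64 encoded gzip compressed string expected by the
# "workflow_content" field. A minimal sketch of such a helper is shown below for
# illustration only (the name "get_compressed_sketch" and the handling of the
# optional position reset are assumptions; the real implementation may differ).

import gzip
import base64


def get_compressed_sketch(data, reset_position=True):
    """Hypothetical sketch: gzip-compress "data" and return it base64 encoded."""
    if hasattr(data, "read"):      # file-like object (text or binary stream)
        if reset_position:
            data.seek(0)           # rewind so the whole stream is compressed
        raw = data.read()
    else:
        raw = data                 # plain string or bytes
    if isinstance(raw, str):
        raw = raw.encode("utf-8")
    return base64.b64encode(gzip.compress(raw)).decode("utf-8")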
def test_get_compressed_from_text_stream(location, control_data, reset_position, monkeypatch):
    monkeypatch.setattr(time, "time", lambda: 1607974216.711175)  # freeze time so the gzip header mtime is deterministic
    with open(location, "r") as input_stream:
        input_stream.read(20)  # change position while reading from file
        compressed_data = get_compressed(input_stream, reset_position)
    assert control_data == compressed_data, \
        "Failed to compress data"
def export_dag(self, dag_id):
    """
    Checks if a DAG python file with the same name has already been exported.
    If not, checks that exactly one of the "workflow" and "workflow_content"
    parameters is present in the request. In case of "workflow_content" we first
    need to load a tool from it and try to convert it to a Workflow (in case it
    was a CommandLineTool), then compress it again and write it to the DAG python
    file. In case of "workflow", we first need to save the attachment, then try
    to convert it to a Workflow (for the same reason as above) and write it to
    the DAG python file.
    """

    dag_path = path.join(DAGS_FOLDER, dag_id + ".py")

    if path.isfile(dag_path):
        raise FileExistsError(f"File {dag_path} already exists")

    if "workflow_content" in (connexion.request.json or []) \
            and "workflow" in connexion.request.files:
        raise ValueError("Only one of the 'workflow' or 'workflow_content' parameters can be set")

    if "workflow_content" in (connexion.request.json or []):  # json field might be None, need to take [] as default
        workflow = get_compressed(
            convert_to_workflow(               # to make sure we are not saving a CommandLineTool instead of a Workflow
                command_line_tool=fast_cwl_load(  # using fast_cwl_load is safe here because we deal with the content of a file
                    connexion.request.json["workflow_content"]
                )
            )
        )
    elif "workflow" in connexion.request.files:
        workflow = path.join(DAGS_FOLDER, dag_id + ".cwl")
        self.save_attachment("workflow", workflow)
        convert_to_workflow(
            command_line_tool=slow_cwl_load(   # safer to use slow_cwl_load because of possible confusion with all this renaming. TODO: make it less complicated
                workflow=workflow,
                only_tool=True
            ),
            location=workflow
        )
    else:
        raise ValueError("At least one of the 'workflow' or 'workflow_content' parameters should be set")

    with open(dag_path, "w") as output_stream:
        output_stream.write(DAG_TEMPLATE.format(workflow, dag_id))

    return {"dag_id": dag_id, "cwl_path": workflow, "dag_path": dag_path}
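# export_dag fills DAG_TEMPLATE with two positional values: the workflow (either
# a compressed content string or the path to the saved CWL file) and the dag_id.
# Below is a hypothetical illustration of what such a two-placeholder template
# could look like, assuming a CWLDAG-style wrapper class; the actual template
# shipped with the project may differ.

DAG_TEMPLATE_EXAMPLE = """#!/usr/bin/env python3
from cwl_airflow.extensions.cwldag import CWLDAG
dag = CWLDAG(
    workflow="{0}",
    dag_id="{1}"
)
"""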
def overwrite_deprecated_dag(dag_location, deprecated_dags_folder=None):
    """
    Loads DAG content from the "dag_location" file. Searches for the
    "dag.create()" call. If not found, we don't need to upgrade this DAG
    (it's either not from CWL-Airflow, or already in the new format).
    If "deprecated_dags_folder" is not None, copies the original DAG file
    there before upgrading it. After copying the deprecated DAG to
    "deprecated_dags_folder", updates ".airflowignore" with the DAG file
    basename to exclude it from Airflow parsing. The upgraded DAG will always
    include base64 encoded gzip compressed workflow content. In case
    "workflow_location" is a relative path, it is resolved based on the dirname
    of "dag_location" (useful for tests only, because all our old DAGs always
    have an absolute path to the CWL file). The function doesn't back up or
    update the original CWL file.

    TODO: in case of more complicated DAG files that include "default_args",
    etc., this function should be updated to a more complex one.
    """

    with open(dag_location, "r+") as io_stream:  # open for both reading and writing
        dag_content = io_stream.read()
        if not re.search("dag\\.create\\(\\)", dag_content):  # do nothing if it wasn't an old-style DAG
            return
        workflow_location = get_absolute_path(  # resolve relative to dirname of "dag_location" (good for tests)
            re.search("(cwl_workflow\\s*=\\s*[\"|'])(.+?)([\"|'])", dag_content).group(2),
            os.path.dirname(dag_location)
        )
        dag_id = re.search("(dag_id\\s*=\\s*[\"|'])(.+?)([\"|'])", dag_content).group(2)
        compressed_workflow_content = get_compressed(
            fast_cwl_load(workflow_location)  # no "run" embedding or conversion to Workflow. If the DAG worked, the CWL should be ok too
        )
        if deprecated_dags_folder is not None:  # copy old DAG to the folder with deprecated DAGs, add ".airflowignore"
            get_dir(deprecated_dags_folder)     # try to create "deprecated_dags_folder" if it doesn't exist
            shutil.copy(dag_location, deprecated_dags_folder)  # copy DAG file
            ignore = os.path.join(deprecated_dags_folder, ".airflowignore")
            with open(ignore, "a") as output_stream:  # add deprecated DAG to ".airflowignore"
                output_stream.write(os.path.basename(dag_location) + "\n")
        io_stream.seek(0)  # rewind "dag_location" file to the beginning
        io_stream.write(DAG_TEMPLATE.format(compressed_workflow_content, dag_id))
        io_stream.truncate()  # remove old data at the end of the file if anything became shorter than the original
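# For clarity, a minimal hypothetical example of the deprecated DAG format that
# the regular expressions in overwrite_deprecated_dag are meant to recognize:
# it contains a "dag.create()" call plus quoted "cwl_workflow" and "dag_id"
# values from which the workflow location and the DAG id are extracted.
# Real old-style DAGs may carry additional arguments.

DEPRECATED_DAG_EXAMPLE = """
from cwl_airflow import CWLDAG, CWLJobDispatcher, CWLJobGatherer
dag = CWLDAG(
    cwl_workflow="/absolute/path/to/workflow.cwl",
    dag_id="my_workflow"
)
dag.create()
dag.add(CWLJobDispatcher(dag=dag), to='top')
dag.add(CWLJobGatherer(dag=dag), to='bottom')
"""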
def trigger_dags(suite_data, args):
    """
    Triggers all DAGs from "suite_data". If triggering a DAG fails, updates
    "suite_data" with "error" and sets "finished" to True. In case --combine
    was set, we call an API endpoint that first creates the new DAG, then cleans
    all previous DAG runs based on the provided run_id and dag_id, then removes
    outdated DAGs for the same workflow (for that, dag_id should follow the
    naming rule cwlid-commitsha) and only after that triggers the workflow
    execution. If not only --combine but also --embed was provided, sends
    base64 encoded gzip compressed content of the workflow file instead of
    attaching it.
    """

    for run_id, test_data in suite_data.items():
        params = {
            "run_id": run_id,
            "dag_id": test_data["dag_id"],
            "conf": json.dumps({"job": test_data["job"]})
        }
        if args.combine:  # use API endpoint that combines creating, cleaning and triggering new DAGs
            logging.info(f"Add and trigger DAG {test_data['dag_id']} from test case {test_data['index']} as {run_id}")
            workflow_path = os.path.join(
                args.tmp,
                os.path.basename(test_data["tool"])
            )
            embed_all_runs(  # will save results to "workflow_path"
                workflow_tool=fast_cwl_load(test_data["tool"]),
                location=workflow_path
            )
            with open(workflow_path, "rb") as input_stream:
                if args.embed:  # send base64 encoded gzip compressed workflow content that will be embedded into the DAG python file
                    logging.info(f"Sending base64 encoded gzip compressed content from {workflow_path}")
                    r = requests.post(
                        url=urljoin(args.api, "/api/experimental/dags/dag_runs"),
                        params=params,
                        json={"workflow_content": get_compressed(input_stream)}
                    )
                else:  # attach workflow as a file
                    logging.info(f"Attaching workflow file {workflow_path}")
                    r = requests.post(
                        url=urljoin(args.api, "/api/experimental/dags/dag_runs"),
                        params=params,
                        files={"workflow": input_stream}
                    )
        else:
            logging.info(f"Trigger DAG {test_data['dag_id']} from test case {test_data['index']} as {run_id}")
            r = requests.post(
                url=urljoin(args.api, "/api/experimental/dag_runs"),
                params=params
            )
        if not r.ok:
            reason = get_api_failure_reason(r)
            logging.error(f"Failed to trigger DAG {test_data['dag_id']} from test case {test_data['index']} as {run_id} due to {reason}")
            test_data["error"] = reason
            test_data["finished"] = True
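# The docstring of trigger_dags mentions the "cwlid-commitsha" naming rule that
# lets the combined API endpoint recognize outdated DAGs belonging to the same
# workflow. A minimal sketch of how such grouping could be done is shown below;
# the helper name and the exact server-side logic are assumptions.

def split_dag_id_sketch(dag_id):
    """Hypothetical helper: split "cwlid-commitsha" into (cwl_id, commit_sha)."""
    cwl_id, _, commit_sha = dag_id.rpartition("-")  # split on the last dash
    return cwl_id, commit_sha


# Usage example: DAGs that share the same cwl_id but carry a different commit_sha
# would be treated as outdated versions of the same workflow, e.g.
# split_dag_id_sketch("bam-bedgraph-bigwig-5c07ab5") -> ("bam-bedgraph-bigwig", "5c07ab5")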
def test_get_compressed(raw_data, control_data):
    compressed_data = get_compressed(raw_data)
    assert control_data == compressed_data, \
        "Failed to compress data"
def test_get_compressed(raw_data, control_data, monkeypatch):
    monkeypatch.setattr(time, "time", lambda: 1607974216.711175)  # freeze time so the gzip header mtime is deterministic
    compressed_data = get_compressed(raw_data)
    assert control_data == compressed_data, \
        "Failed to compress data"
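# The "raw_data", "control_data", "location" and "reset_position" arguments used
# by the tests above are presumably supplied through pytest.mark.parametrize.
# A hypothetical example of such a setup is shown below; the parametrized values
# (the input string and the expected base64 string) are placeholders for
# illustration only.

import pytest


@pytest.mark.parametrize(
    "raw_data, control_data",
    [
        ("hello world", "H4sIA...")  # expected base64 encoded gzip compressed string (placeholder)
    ]
)
def test_get_compressed_example(raw_data, control_data, monkeypatch):
    monkeypatch.setattr(time, "time", lambda: 1607974216.711175)  # freeze gzip mtime
    assert get_compressed(raw_data) == control_data, "Failed to compress data"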