Example #1
def test_get_compressed_from_binary_stream(location, control_data,
                                           reset_position):
    with open(location, "rb") as input_stream:
        input_stream.read(20)  # change position while reading from file
        compressed_data = get_compressed(input_stream, reset_position)
    assert control_data == compressed_data, \
        "Failed to compress data"
Example #2
def create_dags(suite_data, args, dags_folder=None):
    """
    Iterates over "suite_data" and creates new DAGs. Tries to include
    all tools into the worfklow before sending it to the API server.
    If loaded tool is not Workflow, send it unchanged. It's safe to
    not process errors when we failed to add new DAG. Airflow Scheduler
    will parse all dags at the end of the next "dag_dir_list_interval"
    from airflow.cfg. If args.embed was True, send base64 encoded gzip
    compressed content of the workflow file instead of attaching it.
    In case we failed to load and parse worklfow, sets "finished" to
    True and writes reason to "error".
    """

    # TODO: Do we need to force scheduler to reload DAGs after all DAG added?

    for test_data in suite_data.values():
        params = {"dag_id": test_data["dag_id"]}
        workflow_path = os.path.join(
            args.tmp,
            os.path.basename(test_data["tool"])
        )
        try:
            embed_all_runs(                                                                           # will save results to "workflow_path"
                workflow_tool=fast_cwl_load(test_data["tool"]),
                location=workflow_path
            )
        except Exception as ex:
            logging.error(f"Failed to load test case to run {test_data['tool']}")
            test_data.update({
                "error": "Failed to load test case",
                "finished": True
            })
            continue

        with open(workflow_path, "rb") as input_stream:
            logging.info(f"Add DAG {test_data['dag_id']} from test case {test_data['index']}")

            if args.embed:                                                                            # send base64 encoded gzip compressed workflow content that will be embedded into DAG python file
                logging.info(f"Sending base64 encoded gzip compressed content from {workflow_path}")
                r = requests.post(
                    url=urljoin(args.api, "/api/experimental/dags"),
                    params=params,
                    json={"workflow_content": get_compressed(input_stream)}
                )
            else:                                                                                     # attach workflow as a file
                logging.info(f"Attaching workflow file {workflow_path}")
                r = requests.post(
                    url=urljoin(args.api, "/api/experimental/dags"),
                    params=params,
                    files={"workflow": input_stream}
                )

            # Check if we failed to add the new DAG. One reason to fail is that the DAG
            # has already been added. It's safe to ignore this error. More serious
            # failures will be caught at the "trigger_dags" step.

            if not r.ok:
                reason = get_api_failure_reason(r)
                logging.info(f"Failed to add DAG {test_data['dag_id']} from test case {test_data['index']} due to \n {reason}")
Example #3
def test_get_compressed_from_text_stream(location, control_data,
                                         reset_position, monkeypatch):
    monkeypatch.setattr(time, "time", lambda: 1607974216.711175)
    with open(location, "r") as input_stream:
        input_stream.read(20)  # change position while reading from file
        compressed_data = get_compressed(input_stream, reset_position)
    assert control_data == compressed_data, \
        "Failed to compress data"
Example #4
    def export_dag(self, dag_id):
        """
        Checks if DAG python file with the same name has been already
        exported. If not, checks if exaclty one of "workflow" and
        "workflow_content" parameters are present in the request. In
        case of "workflow_content" first we need to load a tool from
        it and try to convert it to Workflow (what if it was
        CommandLineTool), then compress it again and write to DAG
        python file. In case of "workflow", first we need to save
        attachment, then try to comvert it to Workflow (the same reason
        as above) and write it to DAG python file.
        """

        dag_path = path.join(DAGS_FOLDER, dag_id + ".py")

        if path.isfile(dag_path):
            raise FileExistsError(f"File {dag_path} already exists")

        if "workflow_content" in (connexion.request.json or []) \
            and "workflow" in connexion.request.files:

            raise ValueError("Only one of the 'workflow' or \
                'workflow_content' parameters can be set")

        if "workflow_content" in (connexion.request.json or []):    # json field might be None, need to take [] as default

            workflow = get_compressed(
                convert_to_workflow(                                # to make sure we are not saving CommandLineTool instead of a Workflow
                    command_line_tool=fast_cwl_load(                # using fast_cwl_load is safe here because we deal with the content of a file
                        connexion.request.json["workflow_content"]
                    )
                )
            )

        elif "workflow" in connexion.request.files:

            workflow = path.join(DAGS_FOLDER, dag_id + ".cwl")
            self.save_attachment("workflow", workflow)
            convert_to_workflow(
                command_line_tool=slow_cwl_load(                    # safer to use slow_cwl_load because of possible confusion with all this renaming. TODO: make it less complicated
                    workflow=workflow,
                    only_tool=True
                ),
                location=workflow
            )

        else:

            raise ValueError("At least one of the 'workflow' or \
                'workflow_content' parameters should be set")

        with open(dag_path, "w") as output_stream:
            output_stream.write(DAG_TEMPLATE.format(workflow, dag_id))

        return {"dag_id": dag_id, "cwl_path": workflow, "dag_path": dag_path}
Example #5
def overwrite_deprecated_dag(dag_location, deprecated_dags_folder=None):
    """
    Loads DAG content from "dag_location" file. Searches for "dag.create()" command.
    If not found, we don't need to upgrade this DAG (it's either not from CWL-Airflow,
    or already in a new format). If "deprecated_dags_folder" is not None, copies original
    DAG file there before DAG upgrading. After copying deprecated DAG to the
    "deprecated_dags_folder" updates ".airflowignore" with DAG file basename to exclude
    it from Airflow parsing. Upgraded DAG will always include base64 encoded gzip
    compressed workflow content. In case "workflow_location" is relative path, it will
    be resolved based on the dirname of "dag_location" (useful for tests only, because
    all our old DAGs always have absolute path to the CWL file). Function doesn't backup
    or update the original CWL file.
    TODO: in case more coplicated DAG files that include "default_args", etc, this function
    should be updated to the more complex one.
    """

    with open(dag_location,
              "r+") as io_stream:  # open for both reading and writing

        dag_content = io_stream.read()

        if not re.search("dag\\.create\\(\\)",
                         dag_content):  # do nothing if it wasn't old-style DAG
            return

        workflow_location = get_absolute_path(  # resolve relative to dirname of "dag_location" (good for tests)
            re.search("(cwl_workflow\\s*=\\s*[\"|'])(.+?)([\"|'])",
                      dag_content).group(2), os.path.dirname(dag_location))

        dag_id = re.search("(dag_id\\s*=\\s*[\"|'])(.+?)([\"|'])",
                           dag_content).group(2)

        compressed_workflow_content = get_compressed(
            fast_cwl_load(
                workflow_location
            )  # no "run" embedding or convertion to Workflow. If DAG worked, cwl should be ok too
        )

        if deprecated_dags_folder is not None:  # copy old DAG to the folder with deprecated DAGs, add ".airflowignore"
            get_dir(
                deprecated_dags_folder
            )  # try to create "deprecated_dags_folder" if it doesn't exist
            shutil.copy(dag_location, deprecated_dags_folder)  # copy DAG file
            ignore = os.path.join(deprecated_dags_folder, ".airflowignore")
            with open(
                    ignore, "a"
            ) as output_stream:  # add deprecated DAG to ".airflowignore"
                output_stream.write(os.path.basename(dag_location) + "\n")

        io_stream.seek(0)  # rewind "dag_location" file to the beginning
        io_stream.write(
            DAG_TEMPLATE.format(compressed_workflow_content, dag_id))
        io_stream.truncate(
        )  # remove old data at the end of a file if anything became shorter than original
Example #6
def trigger_dags(suite_data, args):
    """
    Triggers all DAGs from "suite_data". If failed to trigger DAG, updates
    "suite_data" with "error" and sets "finished" to True. In case --combine
    was set, we will call API that will first create the new DAG, then clean
    all previous DAG runs based on the provided run_id and dag_id, then remove
    outdated DAGs for the same workflow (for that dag_id should follow naming
    rule cwlid-commitsha) and only after that trigger the workflow execution.
    If not only --combine but also --embed was provided, send base64 encoded
    gzip compressed content of the workflow file instead of attaching it.
    """

    for run_id, test_data in suite_data.items():
        params = {
            "run_id": run_id,
            "dag_id": test_data["dag_id"],
            "conf": json.dumps({"job": test_data["job"]})
        }
        if args.combine:  # use the API endpoint that combines creating, cleaning and triggering new DAGs
            logging.info(f"Add and trigger DAG {test_data['dag_id']} from test case {test_data['index']} as {run_id}")
            workflow_path = os.path.join(
                args.tmp,
                os.path.basename(test_data["tool"])
            )
            embed_all_runs(                                                                               # will save results to "workflow_path"
                workflow_tool=fast_cwl_load(test_data["tool"]),
                location=workflow_path
            )
            with open(workflow_path, "rb") as input_stream:
                if args.embed:                                                                            # send base64 encoded gzip compressed workflow content that will be embedded into DAG python file
                    logging.info(f"Sending base64 encoded gzip compressed content from {workflow_path}")
                    r = requests.post(
                        url=urljoin(args.api, "/api/experimental/dags/dag_runs"),
                        params=params,
                        json={"workflow_content": get_compressed(input_stream)}
                    )
                else:                                                                                     # attach workflow as a file
                    logging.info(f"Attaching workflow file {workflow_path}")
                    r = requests.post(
                        url=urljoin(args.api, "/api/experimental/dags/dag_runs"),
                        params=params,
                        files={"workflow": input_stream}
                    )
        else:
            logging.info(f"Trigger DAG {test_data['dag_id']} from test case {test_data['index']} as {run_id}")
            r = requests.post(
                url=urljoin(args.api, "/api/experimental/dag_runs"),
                params=params
            )
        if not r.ok:
            reason = get_api_failure_reason(r)
            logging.error(f"Failed to trigger DAG {test_data['dag_id']} from test case {test_data['index']} as {run_id} due to {reason}")
            test_data["error"] = reason
            test_data["finished"] = True
Example #7
def test_get_compressed(raw_data, control_data):
    compressed_data = get_compressed(raw_data)
    assert control_data == compressed_data, \
        "Failed to compress data"
Example #8
def test_get_compressed(raw_data, control_data, monkeypatch):
    monkeypatch.setattr(time, "time", lambda: 1607974216.711175)
    compressed_data = get_compressed(raw_data)
    assert control_data == compressed_data, \
        "Failed to compress data"