Example #1
# Imports shared by the test examples below; the BootscriptBuilder import
# path is assumed to be Elyra's Airflow processor module
import pytest

from elyra.pipeline.airflow.processor_airflow import BootscriptBuilder

def test_build_cmd_with_inputs_and_outputs():
    pipeline_inputs = ["test.txt", "test2.txt"]
    pipeline_outputs = ["test3.txt", "test4.txt"]

    boot_build = BootscriptBuilder(
        filename="test_notebook.ipynb",
        pipeline_name="test-pipeline",
        cos_endpoint="http://testserver:32525",
        cos_bucket="test_bucket",
        cos_directory="test_directory",
        cos_dependencies_archive="test_archive.tgz",
        inputs=pipeline_inputs,
        outputs=pipeline_outputs,
    )

    assert boot_build.inputs == pipeline_inputs
    assert boot_build.outputs == pipeline_outputs

    # container_cmd is a single command-line string; split it into
    # "<flag> <value>" chunks on "--" and check the inputs/outputs flags
    boot_arg_list = boot_build.container_cmd.split("--")
    for arg in boot_arg_list:
        arg_value = arg.split(" ")[1]
        if "outputs" in arg:
            assert arg_value == f"'{';'.join(pipeline_outputs)}'"
        if "inputs" in arg:
            assert arg_value == f"'{';'.join(pipeline_inputs)}'"
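As an aside, splitting on "--" is brittle if a flag value ever contains that substring. A more defensive way to recover flag values from a command string is to tokenize it first. The sketch below assumes only that arguments appear as "--name value" pairs; the exact shape of container_cmd is otherwise an implementation detail of BootscriptBuilder.

import shlex

def extract_flag_values(cmd):
    """Map each --flag in cmd to the token that follows it."""
    tokens = shlex.split(cmd)  # also strips the surrounding quotes from values
    values = {}
    for i, token in enumerate(tokens):
        if token.startswith("--") and i + 1 < len(tokens):
            values[token[2:]] = tokens[i + 1]
    return values

# Hypothetical usage against the builder above:
#   flags = extract_flag_values(boot_build.container_cmd)
#   assert flags["inputs"] == ";".join(pipeline_inputs)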
Example #2
def test_fail_without_filename():
    with pytest.raises(TypeError):
        BootscriptBuilder(
            cos_endpoint="http://testserver:32525",
            cos_bucket="test_bucket",
            cos_directory="test_directory",
            cos_dependencies_archive="test_archive.tgz",
        )
Example #3
def test_fail_without_cos_endpoint():
    with pytest.raises(TypeError):
        BootscriptBuilder(
            filename="test_notebook.ipynb",
            cos_bucket="test_bucket",
            cos_directory="test_directory",
            cos_dependencies_archive="test_archive.tgz",
        )
Example #4
def test_fail_with_empty_string_as_filename():
    with pytest.raises(ValueError) as error_info:
        BootscriptBuilder(
            filename="",
            cos_endpoint="http://testserver:32525",
            cos_bucket="test_bucket",
            cos_directory="test_directory",
            cos_dependencies_archive="test_archive.tgz",
        )
    assert "You need to provide a filename for the operation." == str(
        error_info.value)
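Taken together, Examples #2 through #4 pin down the constructor's contract: filename, cos_endpoint, and the other storage parameters are required keyword arguments, and an empty filename is rejected with a specific message. A minimal sketch that would satisfy these tests (illustrative only, not Elyra's actual implementation) could look like this:

from dataclasses import dataclass
from typing import List, Optional

@dataclass
class BootscriptBuilder:  # sketch; field names mirror the tests above
    filename: str
    cos_endpoint: str
    cos_bucket: str
    cos_directory: str
    cos_dependencies_archive: str
    inputs: Optional[List[str]] = None
    outputs: Optional[List[str]] = None

    def __post_init__(self):
        # Missing required fields raise TypeError automatically (dataclass
        # behavior, matching Examples #2 and #3); an empty filename is
        # rejected explicitly, matching Example #4.
        if not self.filename:
            raise ValueError("You need to provide a filename for the operation.")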
Example #5
    def _cc_pipeline(self, pipeline, pipeline_name):
        """Compile the pipeline into an ordered dict of operation descriptors
        that is consumed by the Airflow DAG template."""

        runtime_configuration = self._get_metadata_configuration(
            schemaspace=Runtimes.RUNTIMES_SCHEMASPACE_ID,
            name=pipeline.runtime_config)
        image_namespace = self._get_metadata_configuration(
            schemaspace=RuntimeImages.RUNTIME_IMAGES_SCHEMASPACE_ID)

        cos_endpoint = runtime_configuration.metadata.get("cos_endpoint")
        cos_username = runtime_configuration.metadata.get("cos_username")
        cos_password = runtime_configuration.metadata.get("cos_password")
        cos_secret = runtime_configuration.metadata.get("cos_secret")
        cos_directory = pipeline_name
        cos_bucket = runtime_configuration.metadata.get("cos_bucket")

        # Collect a descriptor dictionary for each operation; these are ordered
        # by dependency into an OrderedDict at the end of this method
        target_ops = []

        self.log_pipeline_info(
            pipeline_name,
            f"processing pipeline dependencies to: {cos_endpoint} "
            f"bucket: {cos_bucket} folder: {pipeline_name}",
        )

        t0_all = time.time()

        # Sort operations based on dependency graph (topological order)
        sorted_operations = PipelineProcessor._sort_operations(
            pipeline.operations)

        # Determine whether access to cloud storage is required and check connectivity
        for operation in sorted_operations:
            if isinstance(operation, GenericOperation):
                self._verify_cos_connectivity(runtime_configuration)
                break

        # All previous operation outputs should be propagated throughout the pipeline.
        # To process this recursively, the current operation's inputs are combined with
        # its parent's inputs (which are themselves derived from the outputs of their
        # own parents) and its parent's outputs.

        PipelineProcessor._propagate_operation_inputs_outputs(
            pipeline, sorted_operations)
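        # e.g. if node A produces a.csv and its child B produces b.csv, a node C
        # downstream of B will see both a.csv and b.csv among its inputs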

        # Scrub all node labels of invalid characters
        scrubbed_operations = self._scrub_invalid_characters_from_list(
            sorted_operations)
        # Generate unique names for all operations
        unique_operations = self._create_unique_node_names(scrubbed_operations)

        for operation in unique_operations:

            if isinstance(operation, GenericOperation):
                operation_artifact_archive = self._get_dependency_archive_name(
                    operation)

                self.log.debug(
                    f"Creating pipeline component:\n {operation} archive : {operation_artifact_archive}"
                )

                # Collect env variables
                pipeline_envs = self._collect_envs(operation,
                                                   cos_secret=cos_secret,
                                                   cos_username=cos_username,
                                                   cos_password=cos_password)

                # Generate a unique ELYRA_RUN_NAME value and expose it as an
                # environment variable in the container.
                # Notebook | script nodes are implemented using the kubernetes_pod_operator
                # (https://airflow.apache.org/docs/apache-airflow/1.10.12/_api/airflow/contrib/operators/kubernetes_pod_operator/index.html).
                # Environment variables passed to this operator are pre-processed by
                # Airflow at runtime, and placeholder values (expressed as '{{ xyz }}';
                # see https://airflow.apache.org/docs/apache-airflow/1.10.12/macros-ref#default-variables)
                # are replaced.
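                # For example, '{{ ts_nodash }}' renders as the run's execution
                # timestamp (e.g. "20230101T000000"), so every run gets a
                # distinct ELYRA_RUN_NAME.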
                if pipeline_envs is None:
                    pipeline_envs = {}
                pipeline_envs[
                    "ELYRA_RUN_NAME"] = f"{pipeline_name}-{{{{ ts_nodash }}}}"

                image_pull_policy = None
                runtime_image_pull_secret = None
                for image_instance in image_namespace:
                    if image_instance.metadata[
                            "image_name"] == operation.runtime_image:
                        if image_instance.metadata.get("pull_policy"):
                            image_pull_policy = image_instance.metadata[
                                "pull_policy"]
                        if image_instance.metadata.get("pull_secret"):
                            runtime_image_pull_secret = image_instance.metadata[
                                "pull_secret"]
                        break

                bootscript = BootscriptBuilder(
                    filename=operation.filename,
                    cos_endpoint=cos_endpoint,
                    cos_bucket=cos_bucket,
                    cos_directory=cos_directory,
                    cos_dependencies_archive=operation_artifact_archive,
                    inputs=operation.inputs,
                    outputs=operation.outputs,
                )

                target_op = {
                    "notebook": operation.name,
                    "id": operation.id,
                    "argument_list": bootscript.container_cmd,
                    "runtime_image": operation.runtime_image,
                    "pipeline_envs": pipeline_envs,
                    "parent_operation_ids": operation.parent_operation_ids,
                    "image_pull_policy": image_pull_policy,
                    "cpu_request": operation.cpu,
                    "mem_request": operation.memory,
                    "gpu_limit": operation.gpu,
                    "operator_source": operation.component_params["filename"],
                    "is_generic_operator": True,
                    "doc": operation.doc,
                }

                if runtime_image_pull_secret is not None:
                    target_op[
                        "runtime_image_pull_secret"] = runtime_image_pull_secret

                target_ops.append(target_op)

                self.log_pipeline_info(
                    pipeline_name,
                    f"processing operation dependencies for id: {operation.id}",
                    operation_name=operation.name,
                )

                self._upload_dependencies_to_object_store(
                    runtime_configuration, pipeline_name, operation)

            else:
                # Retrieve component from cache
                component = ComponentCache.instance().get_component(
                    self._type, operation.classifier)

                # Convert the user-entered value of certain properties according to their type
                for component_property in component.properties:
                    # Skip properties for which no value was given
                    if component_property.ref not in operation.component_params:
                        continue

                    # Get corresponding property's value from parsed pipeline
                    property_value_dict = operation.component_params.get(
                        component_property.ref)

                    # The type and value of this property can vary depending on what
                    # the user chooses in the pipeline editor, so get the currently
                    # active control (e.g. StringControl) from the activeControl value
                    active_property_name = property_value_dict["activeControl"]

                    # Once we have the active control name (e.g. StringControl), we
                    # can retrieve the value assigned to it
                    property_value = property_value_dict.get(
                        active_property_name, None)

                    # If the value is not found, assign it the default value assigned in parser
                    if property_value is None:
                        property_value = component_property.value

                    self.log.debug(
                        f"Active property name : {active_property_name}, value : {property_value}"
                    )
                    self.log.debug(
                        f"Processing component parameter '{component_property.name}' "
                        f"of type '{component_property.data_type}'")

                    # A value that round-trips through JSON to a dict of the shape
                    # {"value": ..., "option": ...} references the output of a parent node
                    normalized_value = (json.loads(json.dumps(property_value))
                                        if property_value
                                        and str(property_value)[0] == "{"
                                        and str(property_value)[-1] == "}" else None)
                    if (isinstance(normalized_value, dict) and
                            set(normalized_value.keys()) == {"value", "option"}):
                        # Render the reference as an Airflow XCom pull so the upstream
                        # task's return value is substituted when the DAG runs
                        parent_node_name = self._get_node_name(
                            target_ops, normalized_value["value"])
                        processed_value = "\"{{ ti.xcom_pull(task_ids='" + parent_node_name + "') }}\""
                        operation.component_params[
                            component_property.ref] = processed_value
                    elif component_property.data_type == "boolean":
                        operation.component_params[
                            component_property.ref] = property_value
                    elif component_property.data_type == "string":
                        # Add surrounding quotation marks to string value for correct rendering
                        # in jinja DAG template
                        operation.component_params[
                            component_property.ref] = json.dumps(
                                property_value)
                    elif component_property.data_type == "dictionary":
                        processed_value = self._process_dictionary_value(
                            property_value)
                        operation.component_params[
                            component_property.ref] = processed_value
                    elif component_property.data_type == "list":
                        processed_value = self._process_list_value(
                            property_value)
                        operation.component_params[
                            component_property.ref] = processed_value

                # Remove inputs and outputs from params dict until support for data exchange is provided
                operation.component_params_as_dict.pop("inputs")
                operation.component_params_as_dict.pop("outputs")

                # Locate the import statement. If not found raise...
                import_stmts = []
                # Check for import statement on Component object, otherwise get from class_import_map
                import_stmt = component.import_statement or self.class_import_map.get(
                    component.name)
                if import_stmt:
                    import_stmts.append(import_stmt)
                else:
                    # If we didn't find a mapping to the import statement, let's check if the component
                    # name includes a package prefix.  If it does, log a warning, but proceed, otherwise
                    # raise an exception.
                    if len(component.name.split(
                            ".")) > 1:  # We (presumably) have a package prefix
                        self.log.warning(
                            f"Operator '{component.name}' of node '{operation.name}' is not configured "
                            f"in the list of available Airflow operators but appears to include a "
                            f"package prefix and processing will proceed.")
                    else:
                        raise ValueError(
                            f"Operator '{component.name}' of node '{operation.name}' is not configured "
                            f"in the list of available operators.  Please add the fully-qualified "
                            f"package name for '{component.name}' to the "
                            f"AirflowPipelineProcessor.available_airflow_operators configuration."
                        )

                target_op = {
                    "notebook": operation.name,
                    "id": operation.id,
                    "imports": import_stmts,
                    "class_name": component.name,
                    "parent_operation_ids": operation.parent_operation_ids,
                    "component_params": operation.component_params_as_dict,
                    "operator_source": component.component_source,
                    "is_generic_operator": False,
                    "doc": operation.doc,
                }

                target_ops.append(target_op)

        # Order the operations so that every node appears after all of its
        # parents (a simple repeated-pass topological sort over target_ops)
        ordered_target_ops = OrderedDict()

        while target_ops:
            for _ in range(len(target_ops)):
                target_op = target_ops.pop(0)
                if not target_op["parent_operation_ids"]:
                    ordered_target_ops[target_op["id"]] = target_op
                    self.log.debug("Root Node added : %s",
                                   ordered_target_ops[target_op["id"]])
                elif all(dep in ordered_target_ops
                         for dep in target_op["parent_operation_ids"]):
                    ordered_target_ops[target_op["id"]] = target_op
                    self.log.debug("Dependent Node added : %s",
                                   ordered_target_ops[target_op["id"]])
                else:
                    # At least one parent not placed yet; retry on a later pass
                    target_ops.append(target_op)

        self.log_pipeline_info(pipeline_name,
                               "pipeline dependencies processed",
                               duration=(time.time() - t0_all))

        return ordered_target_ops
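The ordering pass above is easy to exercise in isolation. The following sketch reproduces the same repeated-pass algorithm on hypothetical operation descriptors; only the "id" and "parent_operation_ids" keys matter to it.

from collections import OrderedDict

def order_ops(target_ops):
    """Repeated-pass topological sort over {'id', 'parent_operation_ids'} dicts."""
    ordered = OrderedDict()
    while target_ops:
        for _ in range(len(target_ops)):
            op = target_ops.pop(0)
            if all(dep in ordered for dep in op["parent_operation_ids"]):
                ordered[op["id"]] = op
            else:
                target_ops.append(op)  # parents not placed yet; retry later
    return ordered

ops = [
    {"id": "c", "parent_operation_ids": ["a", "b"]},
    {"id": "a", "parent_operation_ids": []},
    {"id": "b", "parent_operation_ids": ["a"]},
]
assert list(order_ops(ops)) == ["a", "b", "c"]

Note that, like the loop in _cc_pipeline, this never terminates on a dependency cycle; upstream validation of the pipeline graph is assumed to rule that out.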