def test_replace_el(self):
        # Given
        params = {"var1": "value1", "var2": "value2", **self.default_params}
        # language=XML
        node_str = """
<pig>
    <archive>/path/with/el/${var1}.tar</archive>
    <archive>/path/with/el/${var2}.tar</archive>
    <archive>/path/with/two/els/${var1}/${var2}.tar</archive>
</pig>
        """
        oozie_node = ET.fromstring(node_str)
        archive_extractor = ArchiveExtractor(oozie_node=oozie_node,
                                             params=params)
        # When
        archive_extractor.parse_node()
        # Then
        self.assertEqual(
            [
                "hdfs:///path/with/el/value1.tar",
                "hdfs:///path/with/el/value2.tar",
                "hdfs:///path/with/two/els/value1/value2.tar",
            ],
            archive_extractor.hdfs_archives,
        )

    def test_replace_el(self):
        # Given
        self.props.job_properties["var1"] = "value1"
        self.props.job_properties["var2"] = "value2"
        # language=XML
        node_str = """
<pig>
    <archive>/path/with/el/${var1}.tar</archive>
    <archive>/path/with/el/${var2}.tar</archive>
    <archive>/path/with/two/els/${var1}/${var2}.tar</archive>
</pig>
        """
        oozie_node = ET.fromstring(node_str)
        archive_extractor = ArchiveExtractor(oozie_node=oozie_node,
                                             props=self.props)
        # When
        archive_extractor.parse_node()
        # Then
        self.assertEqual(
            [
                "hdfs:///path/with/el/{{var1}}.tar",
                "hdfs:///path/with/el/{{var2}}.tar",
                "hdfs:///path/with/two/els/{{var1}}/{{var2}}.tar",
            ],
            archive_extractor.hdfs_archives,
        )
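
The two tests above exercise the two EL-handling strategies side by side: the params-based extractor resolves ${var} eagerly to concrete values, while the props-based extractor rewrites EL into Airflow Jinja references such as {{var1}}. A minimal sketch of the latter rewrite, assuming a simple textual substitution (the helper name and regex are illustrative, not the project's el_utils API):

import re

def el_to_jinja(text: str) -> str:
    # Hypothetical helper: turn an Oozie EL reference ${var} into a Jinja one.
    return re.sub(r"\$\{(\w+)\}", r"{{\1}}", text)

assert el_to_jinja("/path/with/el/${var1}.tar") == "/path/with/el/{{var1}}.tar"
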
class MapReduceMapper(ActionMapper):
    """
    Converts a MapReduce Oozie node to an Airflow task.
    """

    def __init__(self, oozie_node: Element, name: str, dag_name: str, props: PropertySet, **kwargs):
        ActionMapper.__init__(
            self, oozie_node=oozie_node, name=name, dag_name=dag_name, props=props, **kwargs
        )
        self.params_dict: Dict[str, str] = {}
        self.file_extractor = FileExtractor(oozie_node=oozie_node, props=self.props)
        self.archive_extractor = ArchiveExtractor(oozie_node=oozie_node, props=self.props)
        self.name_node: Optional[str] = None
        self.hdfs_files: Optional[List[str]] = None
        self.hdfs_archives: Optional[List[str]] = None
        self.prepare_extension: PrepareMapperExtension = PrepareMapperExtension(self)

    def on_parse_node(self):
        super().on_parse_node()
        self.name_node = get_tag_el_text(self.oozie_node, "name-node", props=self.props)
        self.params_dict = extract_param_values_from_action_node(self.oozie_node, props=self.props)
        _, self.hdfs_files = self.file_extractor.parse_node()
        _, self.hdfs_archives = self.archive_extractor.parse_node()

    def to_tasks_and_relations(self):
        action_task = Task(
            task_id=self.name,
            template_name="mapreduce.tpl",
            template_params=dict(
                props=self.props,
                params_dict=self.params_dict,
                hdfs_files=self.hdfs_files,
                hdfs_archives=self.hdfs_archives,
                action_node_properties=self.props.action_node_properties,
            ),
        )
        tasks = [action_task]
        relations: List[Relation] = []
        prepare_task = self.prepare_extension.get_prepare_task()
        if prepare_task:
            tasks, relations = self.prepend_task(prepare_task, tasks, relations)
        return tasks, relations

    @staticmethod
    def _validate_paths(input_directory_path, output_directory_path):
        if not input_directory_path:
            raise Exception("The input_directory_path should be set but is {}".format(input_directory_path))
        if not output_directory_path:
            raise Exception("The output_directory_path should be set but is {}".format(output_directory_path))

    def required_imports(self) -> Set[str]:
        return {"from airflow.utils import dates", "from airflow.contrib.operators import dataproc_operator"}
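
on_parse_node above relies on two helpers: get_tag_el_text pulls the text of a child tag (rendering EL against props), and extract_param_values_from_action_node collects <param>key=value</param> pairs. A rough standard-library sketch of the values they produce, ignoring EL rendering (the sample XML is illustrative):

import xml.etree.ElementTree as ET

node = ET.fromstring(
    "<map-reduce>"
    "<name-node>hdfs://localhost:8020</name-node>"
    "<param>INPUT=/user/mapred/input</param>"
    "</map-reduce>"
)
name_node = node.findtext("name-node")  # roughly get_tag_el_text(node, "name-node", ...)
params_dict = dict(p.text.split("=", 1) for p in node.findall("param"))
assert name_node == "hdfs://localhost:8020"
assert params_dict == {"INPUT": "/user/mapred/input"}
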
Example #4

class SparkMapper(ActionMapper, PrepareMixin):
    """Maps Spark Action"""

    application_args: List[str]
    properties: Dict[str, str]
    hdfs_archives: List[str]
    hdfs_files: List[str]
    dataproc_jars: List[str]
    jars: List[str]

    def __init__(
        self,
        oozie_node: ET.Element,
        name: str,
        trigger_rule: str = TriggerRule.ALL_SUCCESS,
        params: Optional[Dict[str, str]] = None,
        **kwargs,
    ):
        ActionMapper.__init__(self, oozie_node, name, trigger_rule, **kwargs)
        self.params = params or {}
        self.trigger_rule = trigger_rule
        self.java_class = ""
        self.java_jar = ""
        self.job_name = None
        self.jars = []
        self.properties = {}
        self.application_args = []
        self.file_extractor = FileExtractor(oozie_node=oozie_node,
                                            params=self.params)
        self.archive_extractor = ArchiveExtractor(oozie_node=oozie_node,
                                                  params=self.params)
        self.prepare_command = None
        self.hdfs_files = []
        self.hdfs_archives = []
        self.dataproc_jars = []

    def on_parse_node(self):
        if self.has_prepare(self.oozie_node):
            self.prepare_command = self.get_prepare_command(
                oozie_node=self.oozie_node, params=self.params)

        _, self.hdfs_files = self.file_extractor.parse_node()
        _, self.hdfs_archives = self.archive_extractor.parse_node()

        self.java_jar = self._get_or_default(self.oozie_node,
                                             SPARK_TAG_JAR,
                                             None,
                                             params=self.params)
        self.java_class = self._get_or_default(self.oozie_node,
                                               SPARK_TAG_CLASS,
                                               None,
                                               params=self.params)
        if self.java_class and self.java_jar:
            self.dataproc_jars = [self.java_jar]
            self.java_jar = None
        self.job_name = self._get_or_default(self.oozie_node,
                                             SPARK_TAG_JOB_NAME,
                                             None,
                                             params=self.params)

        job_xml_nodes = xml_utils.find_nodes_by_tag(self.oozie_node,
                                                    SPARK_TAG_JOB_XML)

        for xml_file in job_xml_nodes:
            tree = ET.parse(xml_file.text)
            self.properties.update(self._parse_config_node(tree.getroot()))

        config_nodes = xml_utils.find_nodes_by_tag(self.oozie_node,
                                                   SPARK_TAG_CONFIGURATION)
        if config_nodes:
            self.properties.update(self._parse_config_node(config_nodes[0]))

        spark_opts = xml_utils.find_nodes_by_tag(self.oozie_node,
                                                 SPARK_TAG_OPTS)
        if spark_opts:
            self.properties.update(self._parse_spark_opts(spark_opts[0]))

        app_args = xml_utils.find_nodes_by_tag(self.oozie_node, SPARK_TAG_ARGS)
        for arg in app_args:
            self.application_args.append(
                el_utils.replace_el_with_var(arg.text,
                                             self.params,
                                             quote=False))

    @staticmethod
    def _get_or_default(root: ET.Element,
                        tag: str,
                        default: Optional[str] = None,
                        params: Optional[Dict[str, str]] = None):
        """
        If a node with the specified tag exists under root, attempt to
        replace any EL expression in its text with the corresponding
        variable; if no EL variable is found, the text is returned as-is.
        If the tag is not found under root, return default. If more than
        one node has the specified tag, the first one found is used.
        """
        var = xml_utils.find_nodes_by_tag(root, tag)

        if var:
            # Only check the first one
            return el_utils.replace_el_with_var(var[0].text,
                                                params=params,
                                                quote=False)
        return default

    @staticmethod
    def _parse_config_node(config_node: ET.Element) -> Dict[str, str]:
        conf_dict = {}
        for prop in config_node:
            name_node = prop.find(SPARK_TAG_NAME)
            value_node = prop.find(SPARK_TAG_VALUE)
            if name_node is not None and name_node.text and value_node is not None and value_node.text:
                conf_dict[name_node.text] = value_node.text
        return conf_dict

    @staticmethod
    def _parse_spark_opts(spark_opts_node: ET.Element):
        """
        Some examples of the spark-opts element:
        --conf key1=value
        --conf key2="value1 value2"
        """
        conf = {}
        if spark_opts_node.text:
            spark_opts = spark_opts_node.text.split("--")[1:]
        else:
            raise ParseException(
                "Spark opts node has no text: {}".format(spark_opts_node))
        clean_opts = [opt.strip() for opt in spark_opts]
        clean_opts_split = [opt.split(maxsplit=1) for opt in clean_opts]

        for spark_opt in clean_opts_split:
            # Can have multiple "--conf" in spark_opts
            if spark_opt[0] == "conf":
                key, _, value = spark_opt[1].partition("=")
                # Value is required
                if not value:
                    raise ParseException(
                        f"Incorrect parameter format. Expected format: key=value. Current value: {spark_opt}"
                    )
                # Delete matching surrounding quotes
                if len(value) > 2 and value[0] in ("'", '"') and value[-1] == value[0]:
                    value = value[1:-1]
                conf[key] = value

        return conf

    def _get_tasks(self):
        """
        Returns the list of Airflow tasks that are the result of mapping

        :return: list of Airflow tasks
        """
        action_task = Task(
            task_id=self.name,
            template_name="spark.tpl",
            trigger_rule=self.trigger_rule,
            template_params=dict(
                main_jar=self.java_jar,
                main_class=self.java_class,
                arguments=self.application_args,
                archives=self.hdfs_archives,
                files=self.hdfs_files,
                job_name=self.job_name,
                dataproc_spark_properties=self.properties,
                dataproc_spark_jars=self.dataproc_jars,
            ),
        )

        if not self.has_prepare(self.oozie_node):
            return [action_task]

        prepare_task = Task(
            task_id=self.name + "_prepare",
            template_name="prepare.tpl",
            template_params=dict(prepare_command=self.prepare_command),
        )
        return [prepare_task, action_task]

    def _get_relations(self):
        """
        Returns the list of Airflow relations that are the result of mapping

        :return: list of relations
        """
        return ([
            Relation(from_task_id=self.name + "_prepare", to_task_id=self.name)
        ] if self.has_prepare(self.oozie_node) else [])

    def to_tasks_and_relations(self):
        tasks = self._get_tasks()
        relations = self._get_relations()
        return tasks, relations

    def required_imports(self) -> Set[str]:
        # bash_operator is needed for the potential prepare statement
        return {
            "from airflow.contrib.operators import dataproc_operator",
            "from airflow.operators import bash_operator",
            "from airflow.operators import dummy_operator",
        }

    @property
    def first_task_id(self):
        return self._get_tasks()[0].task_id
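
_parse_spark_opts above only understands repeated --conf key=value pairs, including quoted values. A standalone sketch of the same splitting logic that runs without the mapper class (parse_spark_opts is a stand-in name):

import xml.etree.ElementTree as ET

def parse_spark_opts(text: str) -> dict:
    # Mirror of _parse_spark_opts: split on "--", keep "conf" options,
    # take everything after the first "=", strip matching surrounding quotes.
    conf = {}
    for opt in (o.strip() for o in text.split("--")[1:]):
        name, _, rest = opt.partition(" ")
        if name == "conf":
            key, _, value = rest.partition("=")
            if len(value) > 2 and value[0] in ("'", '"') and value[-1] == value[0]:
                value = value[1:-1]
            conf[key] = value
    return conf

node = ET.fromstring('<spark-opts>--conf key1=value --conf key2="value1 value2"</spark-opts>')
assert parse_spark_opts(node.text) == {"key1": "value", "key2": "value1 value2"}
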
class MapReduceMapper(ActionMapper, PrepareMixin):
    """
    Converts a MapReduce Oozie node to an Airflow task.
    """

    def __init__(
        self,
        oozie_node: Element,
        name: str,
        trigger_rule: str = TriggerRule.ALL_SUCCESS,
        params: Optional[Dict[str, str]] = None,
        **kwargs,
    ):
        ActionMapper.__init__(self, oozie_node=oozie_node, name=name, trigger_rule=trigger_rule, **kwargs)
        if params is None:
            params = dict()
        self.params = params
        self.properties: Dict[str, str] = {}
        self.params_dict: Dict[str, str] = {}
        self.file_extractor = FileExtractor(oozie_node=oozie_node, params=params)
        self.archive_extractor = ArchiveExtractor(oozie_node=oozie_node, params=params)
        self.name_node = None
        self.hdfs_files = None
        self.hdfs_archives = None

    def on_parse_node(self):
        name_node_text = self.oozie_node.find("name-node").text
        self.name_node = el_utils.replace_el_with_var(name_node_text, params=self.params, quote=False)
        self._parse_config()
        self._parse_params()
        _, self.hdfs_files = self.file_extractor.parse_node()
        _, self.hdfs_archives = self.archive_extractor.parse_node()

    def _parse_params(self):
        param_nodes = xml_utils.find_nodes_by_tag(self.oozie_node, "param")
        if param_nodes:
            self.params_dict = {}
            for node in param_nodes:
                param = el_utils.replace_el_with_var(node.text, params=self.params, quote=False)
                key, value = param.split("=", 1)
                self.params_dict[key] = value

    def to_tasks_and_relations(self):
        tasks = [
            Task(
                task_id=self.name,
                template_name="mapreduce.tpl",
                trigger_rule=self.trigger_rule,
                template_params=dict(
                    properties=self.properties,
                    params_dict=self.params_dict,
                    hdfs_files=self.hdfs_files,
                    hdfs_archives=self.hdfs_archives,
                ),
            )
        ]
        relations = []
        if self.has_prepare(self.oozie_node):
            prepare_command = self.get_prepare_command(self.oozie_node, self.params)
            tasks.insert(
                0,
                Task(
                    task_id=self.name + "_prepare",
                    template_name="prepare.tpl",
                    trigger_rule=self.trigger_rule,
                    template_params=dict(prepare_command=prepare_command),
                ),
            )
            relations = [Relation(from_task_id=self.name + "_prepare", to_task_id=self.name)]
        return tasks, relations

    @staticmethod
    def _validate_paths(input_directory_path, output_directory_path):
        if not input_directory_path:
            raise Exception("The input_directory_path should be set but is {}".format(input_directory_path))
        if not output_directory_path:
            raise Exception("The output_directory_path should be set but is {}".format(output_directory_path))

    def required_imports(self) -> Set[str]:
        return {"from airflow.utils import dates", "from airflow.contrib.operators import dataproc_operator"}
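
When the action carries a prepare block, the mapper above prepends a *_prepare task and links it to the action task. The resulting shape, sketched with plain tuples standing in for the project's Task and Relation types:

name = "mapreduce_action"  # hypothetical task name
tasks = [(name, "mapreduce.tpl")]
has_prepare = True
if has_prepare:
    tasks.insert(0, (name + "_prepare", "prepare.tpl"))
    relations = [(name + "_prepare", name)]  # from_task_id -> to_task_id
else:
    relations = []
assert [task_id for task_id, _ in tasks] == ["mapreduce_action_prepare", "mapreduce_action"]
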
Example #6

class JavaMapper(ActionMapper):
    """
    Converts a Java Oozie action node to an Airflow task.
    """
    def __init__(
        self,
        oozie_node: Element,
        name: str,
        dag_name: str,
        props: PropertySet,
        jar_files: List[str],
        **kwargs,
    ):
        ActionMapper.__init__(self,
                              oozie_node=oozie_node,
                              dag_name=dag_name,
                              name=name,
                              props=props,
                              **kwargs)
        self.file_extractor = FileExtractor(oozie_node=oozie_node,
                                            props=self.props)
        self.archive_extractor = ArchiveExtractor(oozie_node=oozie_node,
                                                  props=self.props)
        self.main_class: Optional[str] = None
        self.java_opts: List[str] = []
        self.args: Optional[List[str]] = None
        self.hdfs_files: Optional[List[str]] = None
        self.hdfs_archives: Optional[List[str]] = None
        self.prepare_extension: PrepareMapperExtension = PrepareMapperExtension(
            self)
        self.jar_files: List[str] = jar_files if jar_files else []
        self.jar_files_in_hdfs: List[str] = []
        self._get_jar_files_in_hdfs_full_paths()

    def on_parse_node(self):
        super().on_parse_node()
        _, self.hdfs_files = self.file_extractor.parse_node()
        _, self.hdfs_archives = self.archive_extractor.parse_node()
        self._extract_java_data()

    def to_tasks_and_relations(self):
        action_task = Task(
            task_id=self.name,
            template_name="java.tpl",
            template_params=dict(
                props=self.props,
                hdfs_files=self.hdfs_files,
                hdfs_archives=self.hdfs_archives,
                main_class=self.main_class,
                jar_files_in_hdfs=self.jar_files_in_hdfs,
                args=self.args,
            ),
        )
        tasks = [action_task]
        relations: List[Relation] = []
        prepare_task = self.prepare_extension.get_prepare_task()
        if prepare_task:
            tasks, relations = self.prepend_task(prepare_task, tasks,
                                                 relations)
        return tasks, relations

    def required_imports(self) -> Set[str]:
        return {
            "from airflow.utils import dates",
            "from airflow.contrib.operators import dataproc_operator"
        }

    def _get_jar_files_in_hdfs_full_paths(self):
        hdfs_app_prefix = self.props.job_properties[
            "oozie.wf.application.path"]
        for file in self.jar_files:
            self.jar_files_in_hdfs.append(hdfs_app_prefix + "/" + LIB_FOLDER +
                                          "/" + file)

    def _extract_java_data(self):
        """Extracts Java node data."""
        root = self.oozie_node
        props = self.props
        if "mapred.child.java.opts" in props.merged:
            self.java_opts.extend(
                props.merged["mapred.child.java.opts"].split(" "))
        if "mapreduce.map.java.opts" in props.merged:
            self.java_opts.extend(
                props.merged["mapreduce.map.java.opts"].split(" "))
        self.main_class = xml_utils.get_tag_el_text(root=root,
                                                    tag=TAG_MAIN_CLASS,
                                                    props=props)
        java_opts_string = xml_utils.get_tag_el_text(root=root,
                                                     tag=TAG_JAVA_OPTS,
                                                     props=props)
        if java_opts_string:
            self.java_opts.extend(java_opts_string.split(" "))
        else:
            self.java_opts.extend(
                get_tags_el_array_from_text(root=root,
                                            tag=TAG_JAVA_OPT,
                                            props=props))
        self.args = get_tags_el_array_from_text(root=root,
                                                tag=TAG_ARG,
                                                props=props)
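
_get_jar_files_in_hdfs_full_paths above joins the workflow's HDFS application path, the lib folder, and each jar name. A worked example (the property value and the LIB_FOLDER constant are assumed for illustration):

job_properties = {"oozie.wf.application.path": "hdfs://localhost:8020/user/demo/workflow"}
LIB_FOLDER = "lib"  # assumed value of the module-level constant
jar_files = ["my-udf.jar"]

jar_files_in_hdfs = [
    job_properties["oozie.wf.application.path"] + "/" + LIB_FOLDER + "/" + file
    for file in jar_files
]
assert jar_files_in_hdfs == ["hdfs://localhost:8020/user/demo/workflow/lib/my-udf.jar"]
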
class PigMapper(ActionMapper):
    """
    Converts a Pig Oozie node to an Airflow task.
    """
    def __init__(self, oozie_node: Element, name: str, props: PropertySet,
                 **kwargs):
        ActionMapper.__init__(self,
                              oozie_node=oozie_node,
                              name=name,
                              props=props,
                              **kwargs)
        self.params_dict: Dict[str, str] = {}
        self.file_extractor = FileExtractor(oozie_node=oozie_node,
                                            props=self.props)
        self.archive_extractor = ArchiveExtractor(oozie_node=oozie_node,
                                                  props=self.props)
        self._parse_oozie_node()
        self.prepare_extension: PrepareMapperExtension = PrepareMapperExtension(
            self)

    def _parse_oozie_node(self):
        self.resource_manager = get_tag_el_text(self.oozie_node, TAG_RESOURCE)
        self.name_node = get_tag_el_text(self.oozie_node, TAG_NAME)
        self.script_file_name = get_tag_el_text(self.oozie_node, TAG_SCRIPT)

        self.params_dict = extract_param_values_from_action_node(
            self.oozie_node)
        self.files, self.hdfs_files = self.file_extractor.parse_node()
        self.archives, self.hdfs_archives = self.archive_extractor.parse_node()

    def to_tasks_and_relations(self):
        action_task = Task(
            task_id=self.name,
            template_name="pig.tpl",
            template_params=dict(
                props=self.props,
                params_dict=self.params_dict,
                script_file_name=self.script_file_name,
                action_node_properties=self.props.action_node_properties,
            ),
        )
        tasks = [action_task]
        relations: List[Relation] = []
        prepare_task = self.prepare_extension.get_prepare_task()
        if prepare_task:
            tasks, relations = self.prepend_task(prepare_task, tasks,
                                                 relations)
        return tasks, relations

    def _add_symlinks(self, destination_pig_file):
        destination_pig_file.write("set mapred.create.symlink yes;\n")
        if self.files:
            destination_pig_file.write("set mapred.cache.file {};\n".format(
                ",".join(self.hdfs_files)))
        if self.archives:
            destination_pig_file.write(
                "set mapred.cache.archives {};\n".format(",".join(
                    self.hdfs_archives)))

    def copy_extra_assets(self, input_directory_path: str,
                          output_directory_path: str):
        self._validate_paths(input_directory_path, output_directory_path)
        source_pig_file_path = os.path.join(input_directory_path,
                                            self.script_file_name)
        destination_pig_file_path = os.path.join(output_directory_path,
                                                 self.script_file_name)
        self._copy_pig_script_with_path_injection(destination_pig_file_path,
                                                  source_pig_file_path)

    def _copy_pig_script_with_path_injection(self, destination_pig_file_path,
                                             source_pig_file_path):
        os.makedirs(os.path.dirname(destination_pig_file_path), exist_ok=True)
        with open(destination_pig_file_path, "w") as destination_pig_file:
            with open(source_pig_file_path, "r") as source_pig_file:
                pig_script = source_pig_file.read()
                if self.files or self.archives:
                    self._add_symlinks(destination_pig_file)
                destination_pig_file.write(pig_script)

    @staticmethod
    def _validate_paths(input_directory_path, output_directory_path):
        if not input_directory_path:
            raise Exception(
                f"The input_directory_path should be set but is {input_directory_path}"
            )
        if not output_directory_path:
            raise Exception(
                f"The output_directory_path should be set but is {output_directory_path}"
            )

    def required_imports(self) -> Set[str]:
        return {
            "from airflow.utils import dates",
            "from airflow.contrib.operators import dataproc_operator"
        }
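
_copy_pig_script_with_path_injection prepends the symlink directives ahead of the original script body whenever files or archives are attached. Sketched with io.StringIO instead of a real destination file:

import io

hdfs_files = ["hdfs:///user/demo/lib/udf.jar#udf.jar"]  # hypothetical cached file
destination = io.StringIO()
destination.write("set mapred.create.symlink yes;\n")
destination.write("set mapred.cache.file {};\n".format(",".join(hdfs_files)))
destination.write("A = LOAD 'input' AS (line:chararray);\n")  # original script body
print(destination.getvalue())
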
class HiveMapper(ActionMapper):
    """
    Converts a Hive Oozie node to an Airflow task.
    """
    def __init__(self, oozie_node: Element, name: str, props: PropertySet,
                 **kwargs):
        ActionMapper.__init__(self,
                              oozie_node=oozie_node,
                              name=name,
                              props=props,
                              **kwargs)
        self.variables: Optional[Dict[str, str]] = None
        self.query: Optional[str] = None
        self.script: Optional[str] = None
        self.hdfs_files: Optional[List[str]] = None
        self.hdfs_archives: Optional[List[str]] = None
        self.file_extractor = FileExtractor(oozie_node=oozie_node,
                                            props=self.props)
        self.archive_extractor = ArchiveExtractor(oozie_node=oozie_node,
                                                  props=self.props)
        self.prepare_extension: PrepareMapperExtension = PrepareMapperExtension(
            self)

    def on_parse_node(self):
        super().on_parse_node()
        self._parse_config()
        self.query = get_tag_el_text(self.oozie_node, TAG_QUERY)
        self.script = get_tag_el_text(self.oozie_node, TAG_SCRIPT)
        if not self.query and not self.script:
            raise ParseException(
                f"Action Configuration does not include a {TAG_SCRIPT} or {TAG_QUERY} element"
            )

        if self.query and self.script:
            raise ParseException(
                f"Action Configuration includes both {TAG_SCRIPT} and {TAG_QUERY} elements. "
                f"Only one can be set at a time.")

        self.variables = extract_param_values_from_action_node(self.oozie_node)
        _, self.hdfs_files = self.file_extractor.parse_node()
        _, self.hdfs_archives = self.archive_extractor.parse_node()

    def to_tasks_and_relations(self):
        action_task = Task(
            task_id=self.name,
            template_name="hive.tpl",
            template_params=dict(
                query=self.query,
                script=self.script,
                props=self.props,
                archives=self.hdfs_archives,
                files=self.hdfs_files,
                variables=self.variables,
            ),
        )
        tasks = [action_task]
        relations = []
        prepare_task = self.prepare_extension.get_prepare_task()
        if prepare_task:
            tasks, relations = self.prepend_task(prepare_task, tasks,
                                                 relations)

        return tasks, relations

    def copy_extra_assets(self, input_directory_path: str,
                          output_directory_path: str):
        if not self.script:
            return
        source_script_file_path = os.path.join(input_directory_path,
                                               self.script)
        destination_script_file_path = os.path.join(output_directory_path,
                                                    self.script)
        os.makedirs(os.path.dirname(destination_script_file_path),
                    exist_ok=True)
        shutil.copy(source_script_file_path, destination_script_file_path)

    def required_imports(self) -> Set[str]:
        return {
            "from airflow.utils import dates",
            "from airflow.contrib.operators import dataproc_operator"
        }
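
on_parse_node enforces that a Hive action carries exactly one of <query> or <script>. The same check, reduced to standard-library calls:

import xml.etree.ElementTree as ET

node = ET.fromstring("<hive><script>queries.q</script></hive>")
query, script = node.findtext("query"), node.findtext("script")
# Exactly one of the two may be set; otherwise a ParseException is raised above.
assert (query is None) != (script is None)
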
class SparkMapper(ActionMapper):
    """Maps Spark Action"""

    def __init__(self, oozie_node: ET.Element, name: str, props: PropertySet, **kwargs):
        ActionMapper.__init__(self, oozie_node=oozie_node, name=name, props=props, **kwargs)
        self.java_class: Optional[str] = None
        self.java_jar: Optional[str] = None
        self.job_name: Optional[str] = None
        self.jars: List[str] = []
        self.application_args: List[str] = []
        self.file_extractor = FileExtractor(oozie_node=oozie_node, props=self.props)
        self.archive_extractor = ArchiveExtractor(oozie_node=oozie_node, props=self.props)
        self.hdfs_files: List[str] = []
        self.hdfs_archives: List[str] = []
        self.dataproc_jars: List[str] = []
        self.spark_opts: Dict[str, str] = {}
        self.prepare_extension: PrepareMapperExtension = PrepareMapperExtension(self)

    def on_parse_node(self):
        super().on_parse_node()
        _, self.hdfs_files = self.file_extractor.parse_node()
        _, self.hdfs_archives = self.archive_extractor.parse_node()

        self.java_jar = get_tag_el_text(self.oozie_node, tag=SPARK_TAG_JAR)
        self.java_class = get_tag_el_text(self.oozie_node, tag=SPARK_TAG_CLASS)
        if self.java_class and self.java_jar:
            self.dataproc_jars = [self.java_jar]
            self.java_jar = None
        self.job_name = get_tag_el_text(self.oozie_node, tag=SPARK_TAG_JOB_NAME)

        spark_opts = xml_utils.find_nodes_by_tag(self.oozie_node, SPARK_TAG_OPTS)
        if spark_opts:
            self.spark_opts.update(self._parse_spark_opts(spark_opts[0]))

        self.application_args = xml_utils.get_tags_el_array_from_text(self.oozie_node, tag=SPARK_TAG_ARG)

    @staticmethod
    def _parse_spark_opts(spark_opts_node: ET.Element):
        """
        Some examples of the spark-opts element:
        --conf key1=value
        --conf key2="value1 value2"
        """
        conf: Dict[str, str] = {}
        if spark_opts_node.text:
            spark_opts = spark_opts_node.text.split("--")[1:]
        else:
            raise ParseException(f"Spark opts node has no text: {spark_opts_node}")
        clean_opts = [opt.strip() for opt in spark_opts]
        clean_opts_split = [opt.split(maxsplit=1) for opt in clean_opts]

        for spark_opt in clean_opts_split:
            # Can have multiple "--conf" in spark_opts
            if spark_opt[0] == "conf":
                key, _, value = spark_opt[1].partition("=")
                # Value is required
                if not value:
                    raise ParseException(
                        f"Incorrect parameter format. Expected format: key=value. Current value: {spark_opt}"
                    )
                # Delete matching surrounding quotes
                if len(value) > 2 and value[0] in ("'", '"') and value[-1] == value[0]:
                    value = value[1:-1]
                conf[key] = value
            # TODO: parse also other options (like --executor-memory 20G --num-executors 50 and many more)
            #  see: https://oozie.apache.org/docs/5.1.0/DG_SparkActionExtension.html#PySpark_with_Spark_Action

        return conf

    def to_tasks_and_relations(self):
        action_task = Task(
            task_id=self.name,
            template_name="spark.tpl",
            template_params=dict(
                main_jar=self.java_jar,
                main_class=self.java_class,
                arguments=self.application_args,
                hdfs_archives=self.hdfs_archives,
                hdfs_files=self.hdfs_files,
                job_name=self.job_name,
                dataproc_spark_jars=self.dataproc_jars,
                spark_opts=self.spark_opts,
            ),
        )
        tasks = [action_task]
        relations: List[Relation] = []
        prepare_task = self.prepare_extension.get_prepare_task()
        if prepare_task:
            tasks, relations = self.prepend_task(prepare_task, tasks, relations)
        return tasks, relations

    def required_imports(self) -> Set[str]:
        # bash_operator is needed for the potential prepare statement
        return {
            "from airflow.contrib.operators import dataproc_operator",
            "from airflow.operators import bash_operator",
            "from airflow.operators import dummy_operator",
        }
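
A detail worth noting in _parse_spark_opts: str.partition splits on the first "=" only, so configuration values may themselves contain "=":

key, _, value = "spark.driver.extraJavaOptions=-Dlevel=debug".partition("=")
assert (key, value) == ("spark.driver.extraJavaOptions", "-Dlevel=debug")
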
Example #10

class PigMapper(ActionMapper, PrepareMixin):
    """
    Converts a Pig Oozie node to an Airflow task.
    """

    properties: Dict[str, str]
    params_dict: Dict[str, str]

    def __init__(
        self,
        oozie_node: Element,
        name: str,
        trigger_rule: str = TriggerRule.ALL_SUCCESS,
        params=None,
        **kwargs,
    ):
        ActionMapper.__init__(self, oozie_node=oozie_node, name=name, trigger_rule=trigger_rule, **kwargs)
        if params is None:
            params = dict()
        self.params = params
        self.trigger_rule = trigger_rule
        self.properties = {}
        self.params_dict = {}
        self.file_extractor = FileExtractor(oozie_node=oozie_node, params=params)
        self.archive_extractor = ArchiveExtractor(oozie_node=oozie_node, params=params)
        self._parse_oozie_node()

    def _parse_oozie_node(self):
        res_man_text = self.oozie_node.find("resource-manager").text
        name_node_text = self.oozie_node.find("name-node").text
        script = self.oozie_node.find("script").text
        self.resource_manager = el_utils.replace_el_with_var(res_man_text, params=self.params, quote=False)
        self.name_node = el_utils.replace_el_with_var(name_node_text, params=self.params, quote=False)
        self.script_file_name = el_utils.replace_el_with_var(script, params=self.params, quote=False)
        self._parse_config()
        self._parse_params()
        self.files, self.hdfs_files = self.file_extractor.parse_node()
        self.archives, self.hdfs_archives = self.archive_extractor.parse_node()

    def _parse_params(self):
        param_nodes = xml_utils.find_nodes_by_tag(self.oozie_node, "param")
        if param_nodes:
            self.params_dict = {}
            for node in param_nodes:
                param = el_utils.replace_el_with_var(node.text, params=self.params, quote=False)
                key, value = param.split("=", 1)
                self.params_dict[key] = value

    def to_tasks_and_relations(self):
        prepare_command = self.get_prepare_command(self.oozie_node, self.params)
        tasks = [
            Task(
                task_id=self.name + "_prepare",
                template_name="prepare.tpl",
                trigger_rule=self.trigger_rule,
                template_params=dict(prepare_command=prepare_command),
            ),
            Task(
                task_id=self.name,
                template_name="pig.tpl",
                trigger_rule=self.trigger_rule,
                template_params=dict(
                    properties=self.properties,
                    params_dict=self.params_dict,
                    script_file_name=self.script_file_name,
                ),
            ),
        ]
        relations = [Relation(from_task_id=self.name + "_prepare", to_task_id=self.name)]
        return tasks, relations

    def _add_symlinks(self, destination_pig_file):
        destination_pig_file.write("set mapred.create.symlink yes;\n")
        if self.files:
            destination_pig_file.write("set mapred.cache.file {};\n".format(",".join(self.hdfs_files)))
        if self.archives:
            destination_pig_file.write("set mapred.cache.archives {};\n".format(",".join(self.hdfs_archives)))

    def copy_extra_assets(self, input_directory_path: str, output_directory_path: str):
        self._validate_paths(input_directory_path, output_directory_path)
        source_pig_file_path = os.path.join(input_directory_path, self.script_file_name)
        destination_pig_file_path = os.path.join(output_directory_path, self.script_file_name)
        self._copy_pig_script_with_path_injection(destination_pig_file_path, source_pig_file_path)

    def _copy_pig_script_with_path_injection(self, destination_pig_file_path, source_pig_file_path):
        os.makedirs(os.path.dirname(destination_pig_file_path), exist_ok=True)
        with open(destination_pig_file_path, "w") as destination_pig_file:
            with open(source_pig_file_path, "r") as source_pig_file:
                pig_script = source_pig_file.read()
                if self.files or self.archives:
                    self._add_symlinks(destination_pig_file)
                destination_pig_file.write(pig_script)

    @staticmethod
    def _validate_paths(input_directory_path, output_directory_path):
        if not input_directory_path:
            raise Exception("The input_directory_path should be set but is {}".format(input_directory_path))
        if not output_directory_path:
            raise Exception("The output_directory_path should be set but is {}".format(output_directory_path))

    def required_imports(self) -> Set[str]:
        return {"from airflow.utils import dates", "from airflow.contrib.operators import dataproc_operator"}

    @property
    def first_task_id(self):
        return "{task_id}_prepare".format(task_id=self.name)
class MapReduceMapper(ActionMapper):
    """
    Converts a MapReduce Oozie node to an Airflow task.
    """
    def __init__(self, oozie_node: Element, name: str, dag_name: str,
                 props: PropertySet, **kwargs):
        ActionMapper.__init__(self,
                              oozie_node=oozie_node,
                              name=name,
                              dag_name=dag_name,
                              props=props,
                              **kwargs)
        self.params_dict: Dict[str, str] = {}
        self.file_extractor = FileExtractor(oozie_node=oozie_node,
                                            props=self.props)
        self.archive_extractor = ArchiveExtractor(oozie_node=oozie_node,
                                                  props=self.props)
        self.name_node = None
        self.hdfs_files = None
        self.hdfs_archives = None
        self.prepare_extension: PrepareMapperExtension = PrepareMapperExtension(
            self)

    def on_parse_node(self):
        super().on_parse_node()
        name_node_text = self.oozie_node.find("name-node").text
        self.name_node = el_utils.replace_el_with_var(name_node_text,
                                                      props=self.props,
                                                      quote=False)
        self._parse_params()
        _, self.hdfs_files = self.file_extractor.parse_node()
        _, self.hdfs_archives = self.archive_extractor.parse_node()

    def _parse_params(self):
        param_nodes = xml_utils.find_nodes_by_tag(self.oozie_node, "param")
        if param_nodes:
            self.params_dict = {}
            for node in param_nodes:
                param = el_utils.replace_el_with_var(node.text,
                                                     props=self.props,
                                                     quote=False)
                key, value = param.split("=", 1)
                self.params_dict[key] = value

    def to_tasks_and_relations(self) -> Tuple[List[Task], List[Relation]]:
        action_task = Task(
            task_id=self.name,
            template_name="mapreduce.tpl",
            template_params=dict(
                props=self.props,
                params_dict=self.params_dict,
                hdfs_files=self.hdfs_files,
                hdfs_archives=self.hdfs_archives,
                action_node_properties=self.props.action_node_properties,
            ),
        )
        tasks = [action_task]
        relations: List[Relation] = []
        prepare_task = self.prepare_extension.get_prepare_task()
        if prepare_task:
            tasks, relations = self.prepend_task(prepare_task, tasks,
                                                 relations)
        return tasks, relations

    @staticmethod
    def _validate_paths(input_directory_path, output_directory_path):
        if not input_directory_path:
            raise Exception(
                "The input_directory_path should be set but is {}".format(
                    input_directory_path))
        if not output_directory_path:
            raise Exception(
                "The output_directory_path should be set but is {}".format(
                    output_directory_path))

    def required_imports(self) -> Set[str]:
        return {
            "from airflow.utils import dates",
            "from airflow.contrib.operators import dataproc_operator"
        }
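
All of the props-based mappers share the prepend_task pattern: if a prepare task exists, it becomes the new head task and a relation wires it to the action. A sketch with plain strings standing in for Task objects (the signature is assumed, not the project's exact API):

def prepend_task(prepare_id, tasks, relations):
    # Hypothetical reduction of ActionMapper.prepend_task: run prepare first,
    # then the previous head task.
    return [prepare_id] + tasks, [(prepare_id, tasks[0])] + relations

tasks, relations = prepend_task("mr_prepare", ["mr"], [])
assert tasks == ["mr_prepare", "mr"]
assert relations == [("mr_prepare", "mr")]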