def on_parse_node(self):
        """
        Populate this mapper's fields from the Oozie Spark action node.

        Reads, in order: the optional prepare command, the HDFS files and
        archives, the jar / main class / job name, the Spark properties and
        finally the application arguments.

        Properties are merged into ``self.properties`` from three sources,
        applied one after another with ``update``: every ``job-xml`` file,
        then the first inline configuration node, then the first spark-opts
        node.
        """
        node = self.oozie_node
        params = self.params

        if self.has_prepare:
            self.prepare_command = self.get_prepare_command(oozie_node=node, params=params)

        # parse_node() yields a pair; only the second element is kept here.
        _unused, self.hdfs_files = self.file_extractor.parse_node()
        _unused, self.hdfs_archives = self.archive_extractor.parse_node()

        jar_path = self.java_jar = self._get_or_default(node, SPARK_TAG_JAR, None, params=params)
        main_class = self.java_class = self._get_or_default(node, SPARK_TAG_CLASS, None, params=params)
        if main_class and jar_path:
            # With both a main class and a jar, the jar is moved to
            # dataproc_jars and java_jar is cleared.
            self.dataproc_jars = [jar_path]
            self.java_jar = None
        self.job_name = self._get_or_default(node, SPARK_TAG_JOB_NAME, None, params=params)

        # 1) Properties from the external job-xml files (node text is the path).
        for job_xml_node in xml_utils.find_nodes_by_tag(node, SPARK_TAG_JOB_XML):
            job_xml_root = ET.parse(source=job_xml_node.text).getroot()
            self.properties.update(self._parse_config_node(job_xml_root))

        # 2) Properties from the first inline configuration node.
        configuration_nodes = xml_utils.find_nodes_by_tag(node, SPARK_TAG_CONFIGURATION)
        if configuration_nodes:
            first_configuration = configuration_nodes[0]
            self.properties.update(self._parse_config_node(first_configuration))

        # 3) Properties from the first spark-opts node.
        opts_nodes = xml_utils.find_nodes_by_tag(node, SPARK_TAG_OPTS)
        if opts_nodes:
            first_opts = opts_nodes[0]
            self.properties.update(self._parse_spark_opts(first_opts))

        arg_nodes = xml_utils.find_nodes_by_tag(node, SPARK_TAG_ARGS)
        self.application_args.extend(
            el_utils.replace_el_with_var(arg_node.text, params, quote=False) for arg_node in arg_nodes
        )
Example #2
0
    def test_parse_job_xml_and_configuration_minimal_green_path(
            self, extract_properties_from_configuration_node_mock,
            extract_properties_from_job_xml_nodes_mock):
        # language=XML
        action_node_str = """
<action name="action">
    <job-xml>AAA.xml</job-xml>
    <configuration>
        <property>
            <name>KEY1</name>
            <value>VALUE1</value>
        </property>
    </configuration>
</action>
"""
        action_node: ET.Element = cast(ET.Element,
                                       ET.fromstring(action_node_str))
        mapper = self._get_action_mapper(action_node)

        mapper.on_parse_node()

        config_node = find_node_by_tag(action_node, TAG_CONFIGURATION)
        extract_properties_from_configuration_node_mock.assert_called_once_with(
            config_node=config_node)

        job_xml_nodes = find_nodes_by_tag(action_node, TAG_JOB_XML)
        extract_properties_from_job_xml_nodes_mock.assert_called_once_with(
            input_directory_path="/tmp/input_directory_path",
            job_xml_nodes=job_xml_nodes)

        self.assertEqual({
            "KEY1": "VALUE1",
            "KEY2": "VALUE2"
        }, mapper.props.action_node_properties)
    def test_extract_properties_from_job_xml_nodes_minimal_green_path(self, parse_mock):
        # language=XML
        action = ET.ElementTree(
            ET.fromstring(
                """
    <action>
        <job-xml>aaa.xml</job-xml>
    </action>
"""
            )
        )
        # language=XML
        parse_mock.return_value = ET.ElementTree(
            ET.fromstring(
                """
    <configuration>
        <property>
            <name>KEY1</name>
            <value>VALUE1</value>
        </property>
        <property>
            <name>KEY2</name>
            <value>VALUE2</value>
        </property>
    </configuration>
"""
            )
        )
        job_xml_nodes = find_nodes_by_tag(action, TAG_JOB_XML)
        result = extract_properties_from_job_xml_nodes(
            job_xml_nodes, input_directory_path="/tmp/no-error-path", props=PropertySet()
        )

        parse_mock.assert_called_once_with("/tmp/no-error-path/hdfs/aaa.xml")
        self.assertEqual(result, {"KEY1": "VALUE1", "KEY2": "VALUE2"})
 def _parse_params(self):
     """
     Collect the ``<param>`` children of the action node into
     ``self.params_dict``.

     Each param is expected to look like ``KEY=VALUE``; EL expressions in the
     text are replaced first. The text is split on the first ``=`` only, so
     the value may itself contain ``=``. A param without ``=`` is stored with
     an empty value instead of raising, and param nodes without text are
     skipped.

     ``self.params_dict`` is only (re)assigned when at least one ``<param>``
     node exists.
     """
     param_nodes = xml_utils.find_nodes_by_tag(self.oozie_node, "param")
     if not param_nodes:
         return

     self.params_dict = {}
     for node in param_nodes:
         if not node.text:
             # Empty <param/> - there is nothing to split.
             continue
         param = el_utils.replace_el_with_var(node.text, params=self.params, quote=False)
         # partition() never raises, unlike unpacking split("=", 1).
         key, _, value = param.partition("=")
         self.params_dict[key] = value
 def has_prepare(self):
     """
     Tell whether the mapper's action node has a ``<prepare>`` block with
     at least one ``<delete>`` or ``<mkdir>`` entry.

     Returns:
         True when such an entry exists, False otherwise (including when
         there is no ``<prepare>`` node at all).
     """
     prepare_node = xml_utils.find_node_by_tag(self.mapper.oozie_node,
                                               "prepare")
     # Compare against None explicitly: the truth value of an ElementTree
     # Element reflects its number of children, not whether it was found.
     if prepare_node is None:
         return False

     delete_nodes = xml_utils.find_nodes_by_tag(prepare_node, "delete")
     # Use the list-returning lookup for mkdir as well: a single childless
     # <mkdir path="..."/> element is falsy, so it would be missed otherwise.
     mkdir_nodes = xml_utils.find_nodes_by_tag(prepare_node, "mkdir")
     return bool(delete_nodes or mkdir_nodes)
    def test_find_nodes_by_tag_none(self):
        # Looking up a tag that no child carries must give an empty result.
        root = ET.Element("outer")
        for child_tag in ("tag1", "tag2"):
            ET.SubElement(root, child_tag)
        tree = ET.ElementTree(root)

        matches = xml_utils.find_nodes_by_tag(tree.getroot(), "not_found")

        self.assertEqual(0, len(matches))
Example #7
0
    def on_parse_node(self):
        """
        Populate the Spark-specific fields after the parent class has parsed
        the node.

        Reads the HDFS files and archives, the jar / main class / job name,
        the first spark-opts node and the application arguments (with EL
        expressions replaced).
        """
        super().on_parse_node()
        # parse_node() yields a pair; only the second element is kept here.
        _unused, self.hdfs_files = self.file_extractor.parse_node()
        _unused, self.hdfs_archives = self.archive_extractor.parse_node()

        jar_path = self.java_jar = get_tag_el_text(self.oozie_node, props=self.props, tag=SPARK_TAG_JAR)
        main_class = self.java_class = get_tag_el_text(self.oozie_node, props=self.props, tag=SPARK_TAG_CLASS)
        if main_class and jar_path:
            # With both a main class and a jar, the jar is moved to
            # dataproc_jars and java_jar is cleared.
            self.dataproc_jars = [jar_path]
            self.java_jar = None
        self.job_name = get_tag_el_text(self.oozie_node, props=self.props, tag=SPARK_TAG_JOB_NAME)

        opts_nodes = xml_utils.find_nodes_by_tag(self.oozie_node, SPARK_TAG_OPTS)
        if opts_nodes:
            # Only the first spark-opts node is taken into account.
            first_opts = opts_nodes[0]
            self.spark_opts.update(self._parse_spark_opts(first_opts))

        arg_nodes = xml_utils.find_nodes_by_tag(self.oozie_node, SPARK_TAG_ARGS)
        self.application_args.extend(
            el_utils.replace_el_with_var(arg_node.text, self.props, quote=False) for arg_node in arg_nodes
        )
    def test_find_nodes_by_tag_multiple(self):
        # Builds <outer><tag1><tag1/></tag1></outer> and searches from the root.
        root = ET.Element("outer")
        direct_child = ET.SubElement(root, "tag1")
        ET.SubElement(direct_child, "tag1")
        tree = ET.ElementTree(root)

        matches = xml_utils.find_nodes_by_tag(tree.getroot(), "tag1")

        # The nested "tag1" is out of scope: exactly one match is expected,
        # and it is the direct child.
        self.assertEqual(1, len(matches))
        self.assertIn(direct_child, matches)
Example #9
0
def extract_param_values_from_action_node(oozie_node: Element):
    """
    Build a ``{key: value}`` dict from the param children of *oozie_node*.

    The text of every param node is passed through ``el_parser.translate``
    and then split on the first ``=``; a param without ``=`` maps to an empty
    string. Nodes without text are ignored. When a key occurs more than once
    the last occurrence wins.
    """
    param_nodes = xml_utils.find_nodes_by_tag(oozie_node, TAG_PARAM)
    return {
        key: value
        for key, _, value in (
            el_parser.translate(param_node.text).partition("=")
            for param_node in param_nodes
            if param_node.text
        )
    }
 def _parse_config(self):
     """
     Copy the ``<property>`` entries of the action's ``<configuration>``
     child into ``self.properties``.

     For each property the ``<name>`` text is used as the key and the
     ``<value>`` text, with EL expressions replaced, as the value. Nothing
     happens when the action has no ``<configuration>`` child.
     """
     config = self.oozie_node.find("configuration")
     # find() returns None when the tag is missing. Test for that explicitly:
     # the truth value of an Element reflects its child count and testing it
     # is deprecated.
     if config is None:
         return

     for node in xml_utils.find_nodes_by_tag(config, "property"):
         name = node.find("name").text
         value = el_utils.replace_el_with_var(
             node.find("value").text,
             params=self.params,
             quote=False)
         self.properties[name] = value
Example #11
0
 def _parse_config(self):
     """
     Rebuild ``self.props.action_node_properties`` from the action's
     ``<configuration>`` child.

     For each ``<property>`` the ``<name>`` text is used as the key and the
     ``<value>`` text, with EL expressions replaced, as the value. The
     attribute is always overwritten - with an empty dict when there is no
     ``<configuration>`` child or it holds no properties.
     """
     action_node_properties: Dict[str, str] = {}
     config = self.oozie_node.find("configuration")
     # find() returns None when the tag is missing. Test for that explicitly:
     # the truth value of an Element reflects its child count and testing it
     # is deprecated.
     if config is not None:
         props = self.props
         for node in xml_utils.find_nodes_by_tag(config, "property"):
             name = node.find("name").text
             value = el_utils.replace_el_with_var(
                 node.find("value").text, props=props, quote=False)
             action_node_properties[name] = value
     self.props.action_node_properties = action_node_properties
Example #12
0
    def test_to_tasks_and_relations_parse_spark_opts(self, spark_opts,
                                                     properties):
        # Overrides the text of the first spark-opts node with `spark_opts`
        # and checks that the first generated task exposes `properties` as
        # its "dataproc_spark_properties" template parameter.
        spark_node = ET.fromstring(EXAMPLE_XML_WITHOUT_PREPARE)
        opts_nodes = find_nodes_by_tag(spark_node, spark_mapper.SPARK_TAG_OPTS)
        first_opts_node = opts_nodes[0]
        first_opts_node.text = spark_opts
        mapper = self._get_spark_mapper(spark_node)
        mapper.on_parse_node()

        tasks, _relations = mapper.to_tasks_and_relations()

        first_task_params = tasks[0].template_params
        self.assertEqual(first_task_params["dataproc_spark_properties"],
                         properties)
    def _parse_config(self):
        """
        Merge the action node's properties into
        ``self.props.action_node_properties``.

        Properties from the job-xml nodes are applied first and those from
        the configuration node (when present) second, both via ``update``.
        """
        from_job_xml = extract_properties_from_job_xml_nodes(
            job_xml_nodes=find_nodes_by_tag(self.oozie_node, TAG_JOB_XML),
            input_directory_path=self.input_directory_path)
        self.props.action_node_properties.update(from_job_xml)

        config_node = find_node_by_tag(self.oozie_node, TAG_CONFIGURATION)
        if config_node is None:
            # No configuration node - only the job-xml properties apply.
            return

        from_configuration = extract_properties_from_configuration_node(
            config_node=config_node)
        self.props.action_node_properties.update(from_configuration)
Example #14
0
 def _parse_args(self):
     """
     Build one shell-safe argument string from the ``<arg>`` children of the
     action node.

     Every argument is quoted with ``shlex.quote`` and the results are joined
     with single spaces. An ``<arg>`` without text is treated as an empty
     string (quoted as ``''``) instead of raising ``TypeError``.

     Returns:
         The space-separated, quoted arguments; an empty string when the
         action has no ``<arg>`` nodes.
     """
     quoted_args = []
     for node in xml_utils.find_nodes_by_tag(self.oozie_node, "arg"):
         # node.text is None for an empty element such as <arg/>.
         value: str = node.text or ""
         if "/" in value:
             # If an argument contains a forward slash then it's a URL.
             # The full URL should be preserved when replacing the EL (and not just the path)
             #   to enable copying files between two different clusters.
             value = el_utils.replace_url_el(value, props=self.props)
         quoted_args.append(shlex.quote(value))
     return " ".join(quoted_args)
Example #15
0
    def _get_or_default(root: ET.Element, tag: str, default: str = None, params: Dict[str, str] = None):
        """
        Return the text of the first node under *root* carrying *tag*, with
        any EL expression replaced by the corresponding variable.

        When several nodes carry the tag only the first one is used. When no
        node carries it, *default* is returned unchanged.
        """
        matches = xml_utils.find_nodes_by_tag(root, tag)
        if not matches:
            return default

        # Only the first match is considered.
        first_match = matches[0]
        return el_utils.replace_el_with_var(first_match.text, params=params, quote=False)
Example #16
0
def extract_param_values_from_action_node(oozie_node: Element,
                                          props: PropertySet):
    """
    Build a ``{key: value}`` dict from the param children of *oozie_node*.

    EL expressions in the text of every param node are replaced using
    *props*, then the text is split on the first ``=``; a param without
    ``=`` maps to an empty string. Nodes without text are ignored. When a
    key occurs more than once the last occurrence wins.
    """
    # Lazy, so each node is converted right before it is split below.
    converted_params = (
        el_utils.replace_el_with_var(param_node.text, props=props, quote=False)
        for param_node in xml_utils.find_nodes_by_tag(oozie_node, TAG_PARAM)
        if param_node.text
    )

    result = {}
    for converted in converted_params:
        name, _, val = converted.partition("=")
        result[name] = val
    return result
    def test_extract_properties_from_job_xml_nodes_should_parse_multie_elements(self, parse_mock):
        # language=XML
        action = ET.ElementTree(
            ET.fromstring(
                """
    <action>
        <job-xml>aaa.xml</job-xml>
        <job-xml>bbb.xml</job-xml>
    </action>
"""
            )
        )
        # language=XML
        parse_mock.side_effect = [
            ET.ElementTree(
                ET.fromstring(
                    """
    <configuration>
        <property>
            <name>KEY1</name>
            <value>VALUE1</value>
        </property>
    </configuration>
"""
                )
            ),
            ET.ElementTree(
                ET.fromstring(
                    """
    <configuration>
        <property>
            <name>KEY2</name>
            <value>VALUE2</value>
        </property>
    </configuration>
"""
                )
            ),
        ]
        job_xml_nodes = find_nodes_by_tag(action, TAG_JOB_XML)
        result = extract_properties_from_job_xml_nodes(
            job_xml_nodes, input_directory_path="/tmp/no-error-path", props=PropertySet()
        )

        parse_mock.assert_has_calls(
            [call("/tmp/no-error-path/hdfs/aaa.xml"), call("/tmp/no-error-path/hdfs/bbb.xml")]
        )
        self.assertEqual(result, {"KEY1": "VALUE1", "KEY2": "VALUE2"})
    def on_parse_node(self):
        """
        Populate the Spark-specific fields after the parent class has parsed
        the node.

        Reads the HDFS files and archives, the jar / main class / job name,
        the first spark-opts node and the application arguments.
        """
        super().on_parse_node()
        # parse_node() yields a pair; only the second element is kept here.
        _unused, self.hdfs_files = self.file_extractor.parse_node()
        _unused, self.hdfs_archives = self.archive_extractor.parse_node()

        jar_path = self.java_jar = get_tag_el_text(self.oozie_node, tag=SPARK_TAG_JAR)
        main_class = self.java_class = get_tag_el_text(self.oozie_node, tag=SPARK_TAG_CLASS)
        if main_class and jar_path:
            # With both a main class and a jar, the jar is moved to
            # dataproc_jars and java_jar is cleared.
            self.dataproc_jars = [jar_path]
            self.java_jar = None
        self.job_name = get_tag_el_text(self.oozie_node, tag=SPARK_TAG_JOB_NAME)

        opts_nodes = xml_utils.find_nodes_by_tag(self.oozie_node, SPARK_TAG_OPTS)
        if opts_nodes:
            # Only the first spark-opts node is taken into account.
            first_opts = opts_nodes[0]
            self.spark_opts.update(self._parse_spark_opts(first_opts))

        self.application_args = xml_utils.get_tags_el_array_from_text(
            self.oozie_node, tag=SPARK_TAG_ARG
        )
Example #19
0
 def parse_prepare_node(
         oozie_node: ET.Element,
         params: Dict[str, str]) -> Tuple[List[str], List[str]]:
     """
     Collect the paths listed in the ``<prepare>`` block of *oozie_node*.

     Expected layout:

     <prepare>
         <delete path="[PATH]"/>
         ...
         <mkdir path="[PATH]"/>
         ...
     </prepare>

     Every child's ``path`` attribute is run through ``normalize_path``.
     Children tagged ``delete`` go to the first list; every other child is
     treated as ``mkdir`` and goes to the second.

     Returns:
         ``(delete_paths, mkdir_paths)`` - both empty when there is no
         ``<prepare>`` node.
     """
     delete_paths = []
     mkdir_paths = []

     prepare_nodes = xml_utils.find_nodes_by_tag(oozie_node, "prepare")
     if not prepare_nodes:
         return delete_paths, mkdir_paths

     # If there exists a prepare node, there will only be one, according
     # to oozie xml schema
     for child in prepare_nodes[0]:
         path = normalize_path(child.attrib["path"], params=params)
         target = delete_paths if child.tag == "delete" else mkdir_paths
         target.append(path)
     return delete_paths, mkdir_paths
Example #20
0
 def has_prepare(oozie_node):
     """Return True when a "prepare" node is found for *oozie_node*, else False."""
     prepare_nodes = xml_utils.find_nodes_by_tag(oozie_node, "prepare")
     return bool(prepare_nodes)