def parse_prepare_node(oozie_node: ET.Element, params: Dict[str, str]) -> Tuple[List[str], List[str]]:
    """
    Collect the paths listed under the <prepare> element of oozie_node.

    Expected layout:
        <prepare>
            <delete path="[PATH]"/>
            ...
            <mkdir path="[PATH]"/>
            ...
        </prepare>

    Every path is first passed through el_utils.replace_el_with_var using
    params. If the result contains "//", the part up to and including the
    first "//" and the following segment up to the next "/" are dropped, so
    only an absolute path remains.

    Returns a (delete_paths, mkdir_paths) tuple. Both lists are empty when
    no prepare node is found. Children whose tag is not "delete" are
    reported in mkdir_paths.
    """
    to_delete = []
    to_create = []

    found = xml_utils.find_nodes_by_tag(oozie_node, "prepare")
    if not found:
        return to_delete, to_create

    # According to the oozie xml schema there is at most one prepare node,
    # so only the first match is read.
    for child in found[0]:
        path = el_utils.replace_el_with_var(child.attrib["path"], params=params, quote=False)
        if "//" in path:
            # Drop the hdfs:// (or similar) prefix ...
            after_scheme = path.partition("//")[2]
            # ... and then the 'localhost:8082/' part, keeping a leading slash.
            path = "/" + after_scheme.split("/", maxsplit=1)[1]
        bucket = to_delete if child.tag == "delete" else to_create
        bucket.append(path)

    return to_delete, to_create
def _parse_params(self):
    """
    Build self.params_dict from the "param" nodes of self.oozie_node.

    The text of each node is passed through el_utils.replace_el_with_var
    (with self.params, unquoted) and is then expected to have the form
    KEY=VALUE. self.params_dict is only (re)assigned when at least one
    "param" node is found; otherwise it is left untouched.

    Raises ValueError if a param's text contains no "=" separator.
    """
    param_nodes = xml_utils.find_nodes_by_tag(self.oozie_node, "param")
    if not param_nodes:
        return

    self.params_dict = {}
    for node in param_nodes:
        param = el_utils.replace_el_with_var(node.text, params=self.params, quote=False)
        # Split on the first "=" only: a value may itself contain "="
        # (e.g. KEY=a=b), which an unbounded split would turn into a
        # "too many values to unpack" ValueError.
        key, value = param.split("=", 1)
        self.params_dict[key] = value
def test_find_nodes_by_tag_none(self):
    # Root with two children, neither of which carries the tag searched for.
    root = ET.Element('outer')
    for child_tag in ('tag1', 'tag2'):
        ET.SubElement(root, child_tag)
    tree = ET.ElementTree(root)

    matches = xml_utils.find_nodes_by_tag(tree.getroot(), 'not_found')

    # An unknown tag must give an empty result.
    self.assertEqual(0, len(matches))
def test_find_nodes_by_tag_none(self):
    # Build <outer><tag1/><tag2/></outer>; no element is named "not_found".
    outer = ET.Element("outer")
    ET.SubElement(outer, "tag1")
    ET.SubElement(outer, "tag2")
    search_root = ET.ElementTree(outer).getroot()

    result = xml_utils.find_nodes_by_tag(search_root, "not_found")

    # Nothing matches, so the result must have length zero.
    self.assertEqual(0, len(result))
def test_find_nodes_by_tag_multiple(self):
    # "tag1" occurs twice: directly under the root and nested inside that child.
    outer = ET.Element("outer")
    direct_child = ET.SubElement(outer, "tag1")
    ET.SubElement(direct_child, "tag1")
    search_root = ET.ElementTree(outer).getroot()

    matches = xml_utils.find_nodes_by_tag(search_root, "tag1")

    # Only the direct child is expected back; the nested element is out of scope.
    self.assertEqual(1, len(matches))
    self.assertIn(direct_child, matches)
def test_find_nodes_by_tag_multiple(self):
    # Two 'tag1' elements exist: one directly under the root and one nested
    # inside the first.
    doc = ET.Element('outer')
    node1 = ET.SubElement(doc, 'tag1')
    # The nested element only needs to be present in the tree, so it is not
    # bound to a (previously unused) local name.
    ET.SubElement(node1, 'tag1')
    element_tree = ET.ElementTree(doc)

    found = xml_utils.find_nodes_by_tag(element_tree.getroot(), 'tag1')

    # Only the direct child is expected; the nested 'tag1' is out of scope.
    self.assertEqual(1, len(found))
    self.assertIn(node1, found)
def _parse_config(self):
    """
    Copy the "property" entries of the "configuration" child of
    self.oozie_node into self.properties.

    For every property, the text of its "value" child is passed through
    el_utils.replace_el_with_var (with self.params, unquoted) and stored
    under the text of its "name" child. Nothing is changed when
    self.oozie_node has no "configuration" child.
    """
    config = self.oozie_node.find("configuration")
    # Compare with None explicitly. The truth value of an ElementTree element
    # depends on whether it has children and testing it is deprecated, so it
    # must not be used to decide whether find() located the element.
    # NOTE(review): assumes self.oozie_node is an ElementTree element - confirm.
    if config is None:
        return

    for node in xml_utils.find_nodes_by_tag(config, "property"):
        name = node.find("name").text
        value = el_utils.replace_el_with_var(node.find("value").text, params=self.params, quote=False)
        self.properties[name] = value
def _test_and_set(root, tag, default=None, params=None, quote=False):
    """
    Look up nodes with the given tag under root via
    xml_utils.find_nodes_by_tag and return the processed text of the first
    one.

    The text of the first match is passed through
    el_utils.replace_el_with_var together with params and quote, and that
    result is returned. If no node with the tag is found, default is
    returned instead. When several nodes match, only the first is used.

    params defaults to a fresh empty dict on every call.
    """
    # A literal {} default would be one dict object shared by all calls;
    # create a new one per call instead.
    if params is None:
        params = {}

    matches = xml_utils.find_nodes_by_tag(root, tag)
    if not matches:
        return default

    # Only check the first one
    return el_utils.replace_el_with_var(matches[0].text, params=params, quote=quote)
def _parse_oozie_node(self, oozie_node: ET.Element):
    """
    Reset the Spark-related attributes of this object to their defaults and
    then fill them from oozie_node.

    Configuration read from job-xml files is merged first and the inline
    configuration element afterwards, so property values specified in the
    configuration element override values specified in the job-xml file.
    """
    # String defaults.
    self.application = ""
    self.conn_id = "spark_default"
    self.spark_name = "airflow-spark"

    # Empty containers.
    self.conf = {}
    self.application_args = []
    self.delete_paths = []
    self.mkdir_paths = []

    # Attributes that start out unset.
    self.files = None
    self.py_files = None
    self.driver_classpath = None
    self.jars = None
    self.java_class = None
    self.packages = None
    self.exclude_packages = None
    self.repositories = None
    self.total_executor_cores = None
    self.executor_cores = None
    self.executor_memory = None
    self.driver_memory = None
    self.keytab = None
    self.principal = None
    self.num_executors = None
    self.env_vars = None

    # Flags.
    self.verbose = False

    # Prepare nodes: if one exists there will only be one, according to the
    # oozie xml schema.
    prepare = xml_utils.find_nodes_by_tag(oozie_node, "prepare")
    if prepare:
        self.delete_paths, self.mkdir_paths = self.parse_prepare_node(prepare[0])

    # master url, deploy mode,
    self.application = self.test_and_set(oozie_node, "jar", "''", params=self.params, quote=True)
    self.spark_name = self.test_and_set(oozie_node, "name", "'airflow-spark'", params=self.params, quote=True)
    self.java_class = self.test_and_set(oozie_node, "class", None, params=self.params, quote=True)

    inline_config = xml_utils.find_nodes_by_tag(oozie_node, "configuration")
    job_xml_nodes = xml_utils.find_nodes_by_tag(oozie_node, "job-xml")

    # Files referenced by job-xml are merged first ...
    for job_xml in job_xml_nodes:
        parsed_root = ET.parse(job_xml.text).getroot()
        self.conf = {**self.conf, **self.parse_spark_config(parsed_root)}

    # ... so that the inline configuration, merged last, takes precedence.
    if inline_config:
        self.conf = {**self.conf, **self.parse_spark_config(inline_config[0])}

    opts = xml_utils.find_nodes_by_tag(oozie_node, "spark-opts")
    if opts:
        self.update_class_spark_opts(opts[0])

    self.application_args.extend(
        el_utils.replace_el_with_var(arg_node.text, self.params, quote=False)
        for arg_node in xml_utils.find_nodes_by_tag(oozie_node, "arg")
    )