def test_extract_properties_from_job_xml_nodes_minimal_green_path(self, parse_mock):
    # language=XML
    action = ET.ElementTree(
        ET.fromstring(
            """
            <action>
                <job-xml>aaa.xml</job-xml>
            </action>
            """
        )
    )
    # language=XML
    parse_mock.return_value = ET.ElementTree(
        ET.fromstring(
            """
            <configuration>
                <property>
                    <name>KEY1</name>
                    <value>VALUE1</value>
                </property>
                <property>
                    <name>KEY2</name>
                    <value>VALUE2</value>
                </property>
            </configuration>
            """
        )
    )
    job_xml_nodes = find_nodes_by_tag(action, TAG_JOB_XML)
    result = extract_properties_from_job_xml_nodes(
        job_xml_nodes, input_directory_path="/tmp/no-error-path", props=PropertySet()
    )
    parse_mock.assert_called_once_with("/tmp/no-error-path/hdfs/aaa.xml")
    self.assertEqual(result, {"KEY1": "VALUE1", "KEY2": "VALUE2"})
def test_parse_els_multiple_line_with_back_references(self):
    # Should remain unchanged, as the conversion from a comma-separated string to a List will
    # occur before writing to file.
    prop_file = tempfile.NamedTemporaryFile("w", delete=False)
    prop_file.write(
        """
#comment
key=value,value2,${test}
key2=value
key3=refer${key2}
key4=refer${key5}
key5=test
"""
    )
    prop_file.close()

    job_properties = {"test": "answer"}
    props = PropertySet(config={}, job_properties=job_properties, action_node_properties={})

    expected = {
        "key": "value,value2,answer",
        "key2": "value",
        "key3": "refervalue",
        "key4": "refer${key5}",  # no forward-references
        "key5": "test",
    }
    self.assertEqual(expected, el_utils.extract_evaluate_properties(prop_file.name, props=props))
def _get_shell_mapper(self, job_properties, config):
    return shell_mapper.ShellMapper(
        oozie_node=self.shell_node,
        name="test_id",
        dag_name="DAG_NAME_B",
        props=PropertySet(job_properties=job_properties, config=config),
    )
def test_convert_to_text_without_prepare_node(self):
    spark_node = ET.fromstring(EXAMPLE_XML)
    prepare_node = spark_node.find("prepare")
    spark_node.remove(prepare_node)

    mapper = self._get_git_mapper(spark_node)
    mapper.on_parse_node()
    tasks, relations = mapper.to_tasks_and_relations()

    self.assertEqual(
        [
            Task(
                task_id="test_id",
                template_name="git.tpl",
                template_params={
                    "git_uri": "https://github.com/apache/oozie",
                    "git_branch": "my-awesome-branch",
                    "destination_path": "/my_git_repo_directory",
                    "key_path": "/awesome-key/",
                    "props": PropertySet(
                        config={"dataproc_cluster": "my-cluster", "gcp_region": "europe-west3"},
                        job_properties={
                            "branch": "my-awesome-branch",
                            "nameNode": "hdfs://",
                            "userName": "******",
                            "examplesRoot": "examples",
                        },
                        action_node_properties={},
                    ),
                },
            )
        ],
        tasks,
    )
    self.assertEqual([], relations)
def test_task_and_relations(self):
    # Given
    mapper = _get_distcp_mapper(
        self.distcp_node, job_properties=EXAMPLE_JOB_PROPERTIES, config=EXAMPLE_CONFIG_PROPERTIES
    )

    # When
    mapper.on_parse_node()
    tasks, relations = mapper.to_tasks_and_relations()

    # Then
    self.assertEqual(mapper.oozie_node, self.distcp_node)
    self.assertIsNotNone(tasks)
    self.assertIsNotNone(relations)
    self.assertEqual(2, len(tasks))
    self.assertEqual(1, len(relations))
    self.assertEqual(
        [
            Task(
                task_id="distcp_prepare",
                template_name="prepare.tpl",
                trigger_rule="one_success",
                template_params={"delete": "/tmp/d_path", "mkdir": None},
            ),
            Task(
                task_id="distcp",
                template_name="distcp.tpl",
                trigger_rule="one_success",
                template_params={
                    "props": PropertySet(
                        config={"dataproc_cluster": "my-cluster", "gcp_region": "europe-west3"},
                        job_properties={
                            "nameNode1": "hdfs://localhost:8081",
                            "nameNode2": "hdfs://localhost:8082",
                        },
                        action_node_properties={
                            "oozie.launcher.mapreduce.job.hdfs-servers": "{{nameNode1}} ,{{nameNode2}}"
                        },
                    ),
                    "distcp_command": "--class=org.apache.hadoop.tools.DistCp -- -update -skipcrccheck "
                    "-strategy dynamic '{{nameNode1}}/path/to/input file.txt' "
                    "'{{nameNode2}}/path/to/output-file.txt'",
                },
            ),
        ],
        tasks,
    )
    self.assertEqual(
        [Relation(from_task_id=f"{mapper.name}_prepare", to_task_id=mapper.name)], relations
    )
def test_should_add_end_success_workflow_node(self):
    # Given
    transformer = AddWorkflowNotificationTransformer()
    workflow = Workflow(input_directory_path="", output_directory_path="", dag_name="DAG_NAME_B")
    props = PropertySet(
        job_properties={PROP_WORKFLOW_NOTIFICATION_URL: "http://example.com/workflow"}
    )
    first_task_group = TaskGroup(
        name="first_task", tasks=[Task(task_id="first_task", template_name="dummy.tpl")]
    )
    workflow.task_groups[first_task_group.name] = first_task_group

    # When
    transformer.process_workflow_after_convert_nodes(workflow, props)

    # Then
    self.assertIn(END_SUCCESS_TASK_GROUP_NAME, workflow.task_groups.keys())
    self.assertIn(END_SUCCESS_TASK_GROUP_NAME, first_task_group.downstream_names)
    self.assertEqual(
        [
            Task(
                task_id=END_SUCCESS_TASK_GROUP_NAME,
                template_name="http.tpl",
                trigger_rule="one_success",
                template_params={"url": "http://example.com/workflow"},
            )
        ],
        workflow.task_groups[END_SUCCESS_TASK_GROUP_NAME].tasks,
    )
def test_extract_param_values_from_action_node_should_support_el_value(self):
    props = PropertySet(
        config={},
        job_properties={"userName": "TEST_USERNAME", "examplesRoot": "TEST_EXAMPLE_ROOT"},
    )
    # language=XML
    xml_content = """
    <fragment>
        <param>INPUT=/user/${userName}/${examplesRoot}/apps/hive/input/</param>
        <param>OUTPUT=/user/${userName}/${examplesRoot}/apps/hive/output/</param>
    </fragment>
    """
    node = ET.fromstring(xml_content)
    result = extract_param_values_from_action_node(node, props=props)
    self.assertEqual(
        {
            "INPUT": "/user/TEST_USERNAME/TEST_EXAMPLE_ROOT/apps/hive/input/",
            "OUTPUT": "/user/TEST_USERNAME/TEST_EXAMPLE_ROOT/apps/hive/output/",
        },
        result,
    )
def test_create_workflow_file_should_format_file(
    self, open_mock, render_template_mock, sort_imports_mock, autoflake_fix_file_mock, black_mock
):
    renderer = self._create_renderer()
    workflow = _create_workflow()
    props = PropertySet(config=dict(), job_properties=dict())

    renderer.create_workflow_file(workflow, props=props)

    black_mock.format_file_in_place.assert_called_once_with(
        Path("/tmp/output/DAG_NAME.py"), fast=mock.ANY, mode=mock.ANY, write_back=mock.ANY
    )
    autoflake_fix_file_mock.assert_called_once_with(
        "/tmp/output/DAG_NAME.py",
        args=AutoflakeArgs(
            remove_all_unused_imports=True,
            ignore_init_module_imports=False,
            remove_duplicate_keys=False,
            remove_unused_variables=True,
            in_place=True,
            imports=None,
            expand_star_imports=False,
            check=False,
        ),
        standard_out=sys.stdout,
    )
    sort_imports_mock.assert_called_once_with("/tmp/output/DAG_NAME.py")
def __init__(
    self,
    dag_name: str,
    input_directory_path: str,
    output_directory_path: str,
    action_mapper: Dict[str, Type[ActionMapper]],
    renderer: BaseRenderer,
    transformers: List[BaseWorkflowTransformer] = None,
    user: str = None,
    initial_props: PropertySet = None,
):
    self.workflow = Workflow(
        dag_name=dag_name,
        input_directory_path=input_directory_path,
        output_directory_path=output_directory_path,
    )
    self.renderer = renderer
    self.transformers = transformers or []
    # Propagate the configuration in case an initial property set is passed
    job_properties = {} if not initial_props else initial_props.job_properties
    job_properties["user.name"] = user or os.environ["USER"]
    self.props = PropertySet(job_properties=job_properties)
    self.property_parser = PropertyParser(props=self.props, workflow=self.workflow)
    self.parser = parser.OozieParser(
        props=self.props,
        action_mapper=action_mapper,
        renderer=self.renderer,
        workflow=self.workflow,
    )
def test_arguments_are_parsed_correctly_without_jar_files(self):
    mapper = self._get_java_mapper(
        job_properties={
            "userName": "******",
            "oozie.wf.application.path": "hdfs:///user/USER/examples/apps/java",
        },
        config={},
    )
    mapper.on_parse_node()

    self.assertEqual("test_id", mapper.name)
    self.assertEqual("org.apache.oozie.example.DemoJavaMain", mapper.main_class)
    self.assertEqual(["-Dtest1=val1", "-Dtest2=val2"], mapper.java_opts)
    self.assertEqual(
        PropertySet(
            config={},
            job_properties={
                "userName": "******",
                "oozie.wf.application.path": "hdfs:///user/USER/examples/apps/java",
            },
            action_node_properties={"mapred.job.queue.name": "${queueName}"},
        ),
        mapper.props,
    )
    self.assertEqual([], mapper.jar_files_in_hdfs)
    self.assertEqual([], mapper.jar_files)
def test_with_prepare(self):
    cluster = "my-cluster"
    region = "europe-west3"
    job_properties = {"nameNode": "hdfs://localhost:8020"}
    config = {"dataproc_cluster": cluster, "gcp_region": region}

    # language=XML
    pig_node_prepare_str = """
    <pig>
        <name-node>hdfs://</name-node>
        <prepare>
            <delete path="${nameNode}/examples/output-data/demo/pig-node" />
            <delete path="${nameNode}/examples/output-data/demo/pig-node2" />
            <mkdir path="${nameNode}/examples/input-data/demo/pig-node" />
            <mkdir path="${nameNode}/examples/input-data/demo/pig-node2" />
        </prepare>
    </pig>
    """
    pig_node_prepare = ET.fromstring(pig_node_prepare_str)

    extension = self.get_mapper_extension(
        pig_node_prepare, props=PropertySet(config=config, job_properties=job_properties)
    )
    self.assertTrue(extension.has_prepare())
    task = extension.get_prepare_task()
    self.assertEqual(
        Task(
            task_id="mapper_prepare",
            template_name="prepare.tpl",
            template_params={
                "delete": "/examples/output-data/demo/pig-node /examples/output-data/demo/pig-node2",
                "mkdir": "/examples/input-data/demo/pig-node /examples/input-data/demo/pig-node2",
            },
        ),
        task,
    )
def test_parse_els_no_file(self):
    expected_properties = {}
    props = PropertySet(job_properties={"key": "value"}, config={}, action_node_properties={})
    self.assertEqual(expected_properties, el_utils.parse_els(None, props=props))
def test_to_tasks_and_relations(self):
    mapper = self._get_ssh_mapper(job_properties={}, config={})
    tasks, relations = mapper.to_tasks_and_relations()
    self.assertEqual(
        [
            Task(
                task_id="test_id",
                template_name="ssh.tpl",
                template_params={
                    "props": PropertySet(config={}, job_properties={}, action_node_properties={}),
                    "command": "'ls -l -a'",
                    "user": "******",
                    "host": "apache.org",
                },
            )
        ],
        tasks,
    )
    self.assertEqual(relations, [])
def _get_mapreduce_mapper(self, job_properties, config):
    return mapreduce_mapper.MapReduceMapper(
        oozie_node=self.mapreduce_node,
        name="test_id",
        dag_name="DAG_NAME_B",
        props=PropertySet(job_properties=job_properties, config=config),
    )
def _get_fs_mapper(oozie_node):
    return fs_mapper.FsMapper(
        oozie_node=oozie_node,
        name="test_id",
        dag_name="DAG_NAME_B",
        props=PropertySet(job_properties={"nameNode": "hdfs://"}, config={}),
    )
def test_to_tasks_and_relations(self):
    mapper = self._get_email_mapper(job_properties={"userName": "******"}, config={})
    mapper.on_parse_node()
    tasks, relations = mapper.to_tasks_and_relations()
    self.assertEqual(
        [
            Task(
                task_id="test_id",
                template_name="email.tpl",
                trigger_rule="one_success",
                template_params={
                    "props": PropertySet(
                        config={},
                        job_properties={"userName": "******"},
                        action_node_properties={},
                    ),
                    "to_addr": "[email protected],[email protected]",
                    "cc_addr": "[email protected],[email protected]",
                    "bcc_addr": "[email protected],[email protected]",
                    "subject": "Email notifications for {{run_id}}",
                    "body": "Hi {{userName}} , the wf {{run_id}} successfully "
                    "completed. Bye {{userName}}",
                },
            )
        ],
        tasks,
    )
    self.assertEqual(relations, [])
def test_to_tasks_and_relations(self):
    job_properties = {"nameNode": "hdfs://"}
    config = {"dataproc_cluster": "my-cluster", "gcp_region": "europe-west3"}
    mapper = self._get_pig_mapper(job_properties=job_properties, config=config)
    mapper.on_parse_node()

    tasks, relations = mapper.to_tasks_and_relations()

    self.assertEqual(
        tasks,
        [
            Task(
                task_id="test_id_prepare",
                template_name="prepare.tpl",
                template_params={
                    "delete": "/examples/output-data/demo/pig-node /examples/output-data/demo/pig-node2",
                    "mkdir": "/examples/input-data/demo/pig-node /examples/input-data/demo/pig-node2",
                },
            ),
            Task(
                task_id="test_id",
                template_name="pig.tpl",
                template_params={
                    "props": PropertySet(
                        config={"dataproc_cluster": "my-cluster", "gcp_region": "europe-west3"},
                        job_properties={"nameNode": "hdfs://"},
                        action_node_properties={
                            "mapred.job.queue.name": "${queueName}",
                            "mapred.map.output.compress": "false",
                        },
                    ),
                    "params_dict": {
                        "INPUT": "/user/${wf:user()}/${examplesRoot}/input-data/text",
                        "OUTPUT": "/user/${wf:user()}/${examplesRoot}/output-data/demo/pig-node",
                    },
                    "script_file_name": "id.pig",
                    "action_node_properties": {
                        "mapred.job.queue.name": "${queueName}",
                        "mapred.map.output.compress": "false",
                    },
                },
            ),
        ],
    )
    self.assertEqual([Relation(from_task_id="test_id_prepare", to_task_id="test_id")], relations)
def _get_shell_mapper(self, job_properties, config):
    return shell_mapper.ShellMapper(
        oozie_node=self.shell_node,
        name="test_id",
        dag_name="DAG_NAME_B",
        props=PropertySet(job_properties=job_properties, config=config),
        input_directory_path="/tmp/input-directory-path/",
    )
def _get_start_mapper(self, name="test_id"):
    mapper = StartMapper(
        oozie_node=self.oozie_node,
        name=name,
        dag_name="DAG_NAME_B",
        props=PropertySet(config={}, job_properties={}),
    )
    return mapper
def test_prepare_move_command(self, xml, command):
    node = ET.fromstring(xml)
    self.assertEqual(
        command,
        fs_mapper.prepare_move_command(
            node, props=PropertySet(job_properties=TEST_JOB_PROPS, config=TEST_CONFIG)
        ),
    )
def _get_fs_mapper(oozie_node):
    return fs_mapper.FsMapper(
        oozie_node=oozie_node,
        name="test_id",
        dag_name="DAG_NAME_B",
        props=PropertySet(job_properties={"nameNode": "hdfs://"}, config={}),
        input_directory_path="/tmp/input-directory-path/",
    )
def _get_git_mapper(spark_node):
    mapper = git_mapper.GitMapper(
        oozie_node=spark_node,
        name="test_id",
        dag_name="DAG_NAME_B",
        props=PropertySet(job_properties=EXAMPLE_JOB_PROPS, config=EXAMPLE_CONFIG),
    )
    return mapper
def _get_pig_mapper(self, job_properties, config):
    mapper = pig_mapper.PigMapper(
        oozie_node=self.pig_node,
        name="test_id",
        dag_name="DAG_NAME_B",
        props=PropertySet(job_properties=job_properties, config=config),
    )
    return mapper
def get_child_props(self) -> PropertySet:
    propagate_configuration = self.oozie_node.find("propagate-configuration")
    # Below, the `is not None` is necessary due to Element's __bool__() return value:
    # `len(self._children) != 0`, and `propagate_configuration` is an empty node,
    # so __bool__() will always return False.
    return (
        self.props
        if propagate_configuration is not None
        else PropertySet(config={}, job_properties={})
    )
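# A minimal, standalone sketch (not part of the converter code above) illustrating the
# comment in get_child_props(): an ElementTree Element with no children evaluates as
# falsy, so only an `is not None` check reliably detects an empty
# <propagate-configuration/> node that was found in the action.
import xml.etree.ElementTree as ET

action = ET.fromstring("<action><propagate-configuration/></action>")
propagate_configuration = action.find("propagate-configuration")

print(propagate_configuration is not None)  # True  - the node exists
print(len(propagate_configuration))         # 0     - but it has no children
print(bool(propagate_configuration))        # False - so truth-testing would wrongly skip it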
def test_extract_properties_from_configuration_node_when_empty(self):
    # language=XML
    config_node_str = """
    <configuration>
    </configuration>
    """
    config_node = ET.fromstring(config_node_str)
    properties = extract_properties_from_configuration_node(config_node, props=PropertySet())
    self.assertEqual(properties, {})
def test_normalize_path_red_path(self, oozie_path):
    cluster = "my-cluster"
    region = "europe-west3"
    job_properties = {"nameNode": "hdfs://localhost:8020"}
    config = {"dataproc_cluster": cluster, "gcp_region": region}
    with self.assertRaisesRegex(ParseException, "Unknown path format. "):
        normalize_path(oozie_path, props=PropertySet(config=config, job_properties=job_properties))
def _get_spark_mapper(spark_node):
    mapper = spark_mapper.SparkMapper(
        oozie_node=spark_node,
        name="test_id",
        dag_name="DAG_NAME_B",
        props=PropertySet(job_properties=EXAMPLE_JOB_PROPS, config=EXAMPLE_CONFIG),
        input_directory_path="/tmp/input-directory-path/",
    )
    return mapper
def _get_email_mapper(self, job_properties, config):
    mapper = email_mapper.EmailMapper(
        oozie_node=self.email_node,
        name="test_id",
        dag_name="DAG_NAME_A",
        props=PropertySet(job_properties=job_properties, config=config),
        input_directory_path="/tmp/input-directory-path/",
    )
    return mapper
def setUp(self):
    self.job_properties = {
        "nameNode": "hdfs://",
        "oozie.wf.application.path": "hdfs:///user/pig/examples/pig_test_node",
    }
    self.props = PropertySet(
        job_properties=self.job_properties, config={}, action_node_properties={}
    )
def setUp(self):
    props = PropertySet(job_properties={}, config={})
    workflow = Workflow(
        input_directory_path=EXAMPLE_DEMO_PATH, output_directory_path="/tmp", dag_name="DAG_NAME_B"
    )
    self.parser = parser.OozieParser(
        workflow=workflow, props=props, action_mapper=ACTION_MAP, renderer=mock.MagicMock()
    )