def test_on_parse_finish_simple_should_remove_end_node(self):
    """With a single upstream node, the end node and the relation leading to it should disappear."""
    # Given
    workflow = Workflow(input_directory_path="", output_directory_path="", dag_name="DAG_NAME_B")
    end_mapper = self._get_end_mapper("second_task")
    # NOTE(review): ``mock.Mock`` stores unknown keyword arguments such as ``autospec`` as plain
    # attributes, so this mock is not restricted to the BaseMapper interface;
    # ``spec=BaseMapper`` may have been intended - confirm.
    upstream_node = ParsedActionNode(mock.Mock(autospec=BaseMapper))
    workflow.nodes["first_task"] = upstream_node
    workflow.nodes["second_task"] = ParsedActionNode(end_mapper)
    edge_to_end = Relation(from_task_id="first_task", to_task_id="second_task")
    workflow.relations = {edge_to_end}

    # When
    end_mapper.on_parse_finish(workflow)

    # Then
    remaining_node_names = set(workflow.nodes.keys())
    self.assertEqual({"first_task"}, remaining_node_names)
    self.assertEqual(set(), workflow.relations)
def process_workflow_after_parse_workflow_xml(self, workflow: Workflow):
    """
    Remove End nodes that no Decision node points to; otherwise detach only the other parents.

    For every node returned for ``EndMapper``: when none of its upstream nodes is a Decision
    node, the End node is removed from the workflow. Otherwise the End node is kept and its
    name is removed from ``downstream_names`` of each upstream node that is not a Decision node.

    :param workflow: workflow that is modified in place
    """
    decision_names = {decision.name for decision in workflow.get_nodes_by_type(DecisionMapper)}

    for end_node in workflow.get_nodes_by_type(EndMapper):
        parents = workflow.find_upstream_nodes(end_node)
        parent_names = {parent.name for parent in parents}

        if decision_names.isdisjoint(parent_names):
            # No decision node leads here, so the whole End node can go.
            workflow.remove_node(end_node)
            continue

        # Keep the End node for the decision branch, but unlink every other parent.
        for parent in parents:
            if parent.name in decision_names:
                continue
            parent.downstream_names.remove(end_node.name)
def test_should_add_end_success_workflow_node(self):
    """A configured workflow notification URL should add the end-success task group."""
    # Given
    transformer = AddWorkflowNotificationTransformer()
    workflow = Workflow(input_directory_path="", output_directory_path="", dag_name="DAG_NAME_B")
    props = PropertySet(job_properties={PROP_WORKFLOW_NOTIFICATION_URL: "http://example.com/workflow"})
    initial_group = TaskGroup(
        name="first_task",
        tasks=[Task(task_id="first_task", template_name="dummy.tpl")],
    )
    workflow.task_groups[initial_group.name] = initial_group

    # When
    transformer.process_workflow_after_convert_nodes(workflow, props)

    # Then
    self.assertIn(END_SUCCESS_TASK_GROUP_NAME, workflow.task_groups.keys())
    self.assertIn(END_SUCCESS_TASK_GROUP_NAME, initial_group.downstream_names)
    expected_tasks = [
        Task(
            task_id=END_SUCCESS_TASK_GROUP_NAME,
            template_name="http.tpl",
            trigger_rule="one_success",
            template_params={"url": "http://example.com/workflow"},
        )
    ]
    self.assertEqual(expected_tasks, workflow.task_groups[END_SUCCESS_TASK_GROUP_NAME].tasks)
def __init__(
    self,
    dag_name: str,
    input_directory_path: str,
    output_directory_path: str,
    action_mapper: Dict[str, Type[ActionMapper]],
    renderer: BaseRenderer,
    transformers: List[BaseWorkflowTransformer] = None,
    user: str = None,
    initial_props: PropertySet = None,
):
    """
    Set up the workflow, the property set, the property parser and the Oozie parser.

    :param dag_name: name passed to the ``Workflow`` as ``dag_name``
    :param input_directory_path: input directory passed to the ``Workflow``
    :param output_directory_path: output directory passed to the ``Workflow``
    :param action_mapper: mapping handed to ``OozieParser`` as ``action_mapper``
    :param renderer: renderer stored on the instance and handed to ``OozieParser``
    :param transformers: workflow transformers; a falsy value results in an empty list
    :param user: value of the ``user.name`` job property; when falsy, the ``USER``
        environment variable is used instead
    :param initial_props: optional property set whose job properties seed this instance
    :raises KeyError: when ``user`` is falsy and the ``USER`` environment variable is not set
    """
    self.workflow = Workflow(
        dag_name=dag_name,
        input_directory_path=input_directory_path,
        output_directory_path=output_directory_path,
    )
    self.renderer = renderer
    self.transformers = transformers or []

    # Seed from a *copy* of the initial job properties: writing "user.name" below must not
    # modify the property set owned by the caller.
    job_properties = dict(initial_props.job_properties) if initial_props else {}
    job_properties["user.name"] = user or os.environ["USER"]
    self.props = PropertySet(job_properties=job_properties)

    self.property_parser = PropertyParser(props=self.props, workflow=self.workflow)
    self.parser = parser.OozieParser(
        props=self.props,
        action_mapper=action_mapper,
        renderer=self.renderer,
        workflow=self.workflow,
    )
def test_convert(self, sort_imports_mock, autoflake_fix_file_mock, black_mock, parse_workflow_mock):
    """``convert`` should parse once and run black, autoflake and import sorting on the DAG file."""
    # Given
    dag_file = "/tmp/test_dag.py"
    parsed_workflow = Workflow(
        dag_name="A",
        input_directory_path="in_dir",
        output_directory_path="out_dir",
        relations={Relation(from_task_id="AAA", to_task_id="BBB")},
        nodes={"AAA": ParsedActionNode(DummyMapper(Element("dummy"), name="AAA"))},
        dependencies={"import AAAA"},
    )
    parse_workflow_mock.return_value = parsed_workflow

    # When
    self.converter.convert()

    # Then
    parse_workflow_mock.assert_called_once_with()
    black_mock.format_file_in_place.assert_called_once_with(
        Path(dag_file), fast=mock.ANY, mode=mock.ANY, write_back=mock.ANY
    )
    expected_autoflake_args = AutoflakeArgs(
        remove_all_unused_imports=True,
        ignore_init_module_imports=False,
        remove_duplicate_keys=False,
        remove_unused_variables=True,
        in_place=True,
        imports=None,
        expand_star_imports=False,
        check=False,
    )
    autoflake_fix_file_mock.assert_called_once_with(
        dag_file, args=expected_autoflake_args, standard_out=sys.stdout
    )
    sort_imports_mock.assert_called_once_with(dag_file)
def test_write_dag_file(self, render_template_mock):
    """``render_workflow`` should call the template renderer once with the workflow's data."""
    # Given
    nodes = {"TASK_1": ParsedActionNode(DummyMapper(Element("dummy"), name="TASK_1"))}
    workflow = Workflow(
        input_directory_path="/tmp/input_directory",
        output_directory_path="/tmp/input_directory",
        dag_name="test_dag",
        relations={Relation(from_task_id="TASK_1", to_task_id="TASK_2")},
        nodes=nodes,
        dependencies={"import awesome_stuff"},
    )

    # When
    rendered = self.converter.render_workflow(workflow=workflow)

    # Then
    expected_call_kwargs = {
        "dag_name": "test_dag",
        "dependencies": {"import awesome_stuff"},
        "nodes": [nodes["TASK_1"]],
        "params": {"user.name": "USER"},
        "relations": {Relation(from_task_id="TASK_1", to_task_id="TASK_2")},
        "schedule_interval": None,
        "start_days_ago": None,
        "template_name": "workflow.tpl",
    }
    render_template_mock.assert_called_once_with(**expected_call_kwargs)
    self.assertEqual(rendered, "TEXT_CONTENT")
def test_should_keep_node_in_correct_flow(self):
    """A kill node that is a regular downstream of another node is expected to stay in place."""
    # Given
    transformer = RemoveKillTransformer()
    workflow = Workflow(input_directory_path="", output_directory_path="", dag_name="DAG_NAME_B")

    first_mapper = mock.Mock(spec=BaseMapper)
    first_mapper.name = "first_task"
    first_node = OozieNode(first_mapper)

    third_mapper = mock.Mock(spec=KillMapper)
    third_mapper.name = "third_task"
    third_node = OozieNode(third_mapper)

    first_node.downstream_names = [third_mapper.name]
    for mapper, node in ((first_mapper, first_node), (third_mapper, third_node)):
        workflow.nodes[mapper.name] = node

    # When
    transformer.process_workflow_after_parse_workflow_xml(workflow)

    # Then
    expected_node_names = {first_mapper.name, third_mapper.name}
    self.assertEqual(expected_node_names, set(workflow.nodes.keys()))
    self.assertEqual([third_node.name], first_node.downstream_names)
    self.assertEqual([], third_node.downstream_names)
def test_should_keep_join_node_when_it_have_downstream_nodes(self):  # pylint: disable=too-many-locals
    """A join node that has a downstream node should be kept and all links left untouched."""

    def build(mapper_class, node_name):
        """Create a spec'ed mapper mock with the given name plus the node wrapping it."""
        mapper = mock.Mock(spec=mapper_class)
        mapper.name = node_name
        return mapper, OozieNode(mapper)

    # Given
    transformer = RemoveJoinTransformer()
    workflow = Workflow(input_directory_path="", output_directory_path="", dag_name="DAG_NAME_B")

    fork_mapper, fork_node = build(ForkMapper, "fork_task")
    first_mapper, first_node = build(DummyMapper, "first_task")
    second_mapper, second_node = build(DummyMapper, "second_task")
    third_mapper, third_node = build(DummyMapper, "third_task")
    join_mapper, join_node = build(JoinMapper, "join_task")
    fourth_mapper, fourth_node = build(DummyMapper, "fourth_task")

    fork_node.downstream_names = [first_mapper.name, second_mapper.name, third_mapper.name]
    first_node.downstream_names = [join_mapper.name]
    second_node.downstream_names = [join_mapper.name]
    third_node.downstream_names = [join_mapper.name]
    join_node.downstream_names = [fourth_node.name]

    registered = (
        (fork_mapper, fork_node),
        (first_mapper, first_node),
        (second_mapper, second_node),
        (third_mapper, third_node),
        (join_mapper, join_node),
        (fourth_mapper, fourth_node),
    )
    for mapper, node in registered:
        workflow.nodes[mapper.name] = node

    # When
    transformer.process_workflow_after_parse_workflow_xml(workflow)

    # Then
    expected_node_names = {
        fork_mapper.name,
        first_mapper.name,
        second_mapper.name,
        third_mapper.name,
        join_mapper.name,
        fourth_mapper.name,
    }
    self.assertEqual(expected_node_names, set(workflow.nodes.keys()))
    self.assertEqual(
        [first_mapper.name, second_mapper.name, third_mapper.name], fork_node.downstream_names
    )
    self.assertEqual([join_mapper.name], first_node.downstream_names)
    self.assertEqual([join_mapper.name], second_node.downstream_names)
    self.assertEqual([join_mapper.name], third_node.downstream_names)
    self.assertEqual([fourth_mapper.name], join_node.downstream_names)
def test_should_keep_connected_nodes_in_correct_flow(self):
    """
    Nodes that are all connected to the start node must be left untouched.

    Graph before:

    .. graphviz::

       digraph foo {
           S -> A
           A -> B
       }

    Graph after:

    .. graphviz::

       digraph foo {
           S -> A
           A -> B
       }

    Where:
    A - first_task
    B - second_task
    S - start_task
    """
    # Given
    transformer = RemoveInaccessibleNodeTransformer()
    workflow = Workflow(input_directory_path="", output_directory_path="", dag_name="DAG_NAME_B")

    first_mapper = mock.Mock(spec=BaseMapper)
    first_mapper.name = "first_task"
    second_mapper = mock.Mock(spec=BaseMapper)
    second_mapper.name = "second_task"
    start_mapper = mock.Mock(spec=StartMapper)
    start_mapper.name = "start_task"

    first_node = OozieNode(mapper=first_mapper, tasks=[self._get_dummy_task(first_mapper.name)])
    second_node = OozieNode(mapper=second_mapper, tasks=[self._get_dummy_task(second_mapper.name)])
    start_node = OozieNode(mapper=start_mapper, tasks=[self._get_dummy_task(start_mapper.name)])

    # S -> A -> B
    start_node.downstream_names = [first_mapper.name]
    first_node.downstream_names = [second_node.name]

    registered = ((first_mapper, first_node), (second_mapper, second_node), (start_mapper, start_node))
    for mapper, node in registered:
        workflow.nodes[mapper.name] = node

    # When
    transformer.process_workflow_after_parse_workflow_xml(workflow)

    # Then
    expected_node_names = {start_mapper.name, first_mapper.name, second_mapper.name}
    self.assertEqual(expected_node_names, set(workflow.nodes.keys()))
    self.assertEqual([first_mapper.name], start_node.downstream_names)
    self.assertEqual([second_mapper.name], first_node.downstream_names)
    self.assertEqual([], second_node.downstream_names)
def setUp(self):
    """Create the ``OozieParser`` under test with empty properties and a mocked renderer."""
    empty_props = PropertySet(job_properties={}, config={})
    demo_workflow = Workflow(
        input_directory_path=EXAMPLE_DEMO_PATH,
        output_directory_path="/tmp",
        dag_name="DAG_NAME_B",
    )
    renderer_stub = mock.MagicMock()
    self.parser = parser.OozieParser(
        workflow=demo_workflow,
        props=empty_props,
        action_mapper=ACTION_MAP,
        renderer=renderer_stub,
    )
def test_on_parse_finish(self):
    """After the start mapper's ``on_parse_finish`` only ``second_task`` and no relations remain."""
    # Given
    workflow = Workflow(input_directory_path=None, output_directory_path=None, dag_name=None)
    start_mapper = self._get_start_mapper(name="first_task")
    # NOTE(review): ``mock.Mock`` stores unknown keyword arguments such as ``autospec`` as plain
    # attributes; ``spec=BaseMapper`` may have been intended - confirm.
    other_node = ParsedNode(mock.Mock(autospec=BaseMapper))
    workflow.nodes["first_task"] = other_node
    # NOTE(review): the start mapper is named "first_task" but is stored under the
    # "second_task" key - confirm this is intentional.
    workflow.nodes["second_task"] = ParsedNode(start_mapper)
    workflow.relations = {Relation(from_task_id="first_task", to_task_id="second_task")}

    # When
    start_mapper.on_parse_finish(workflow)

    # Then
    remaining_node_names = set(workflow.nodes.keys())
    self.assertEqual(remaining_node_names, {"second_task"})
    self.assertEqual(workflow.relations, set())
def test_on_parse_finish_decision_should_not_remove_end_node(self):
    """An end node fed by a decision node stays; only the decision relation is expected to remain."""

    def relation_to_end(source_task_id):
        """Build a fresh relation from the given task to ``end_task``."""
        return Relation(from_task_id=source_task_id, to_task_id="end_task")

    # Given
    workflow = Workflow(input_directory_path=None, output_directory_path=None, dag_name=None)
    end_mapper = self._get_end_mapper("end_task")
    decision_mapper = mock.Mock(spec=DecisionMapper, last_task_id="first_task")
    workflow.nodes["first_task"] = ParsedNode(decision_mapper)
    plain_mapper = mock.Mock(spec=BaseMapper, last_task_id="second_task")
    workflow.nodes["second_task"] = ParsedNode(plain_mapper)
    workflow.nodes["end_task"] = ParsedNode(end_mapper)
    workflow.relations = {relation_to_end("first_task"), relation_to_end("second_task")}

    # When
    end_mapper.on_parse_finish(workflow)

    # Then
    remaining_node_names = set(workflow.nodes.keys())
    self.assertEqual(remaining_node_names, {"first_task", "second_task", "end_task"})
    self.assertEqual(workflow.relations, {relation_to_end("first_task")})
def test_on_parse_finish(self):
    """After the kill mapper's ``on_parse_finish`` the ``fail_task`` node and its relation are gone."""
    # Given
    workflow = Workflow(input_directory_path=None, output_directory_path=None, dag_name=None)
    kill_mapper = self._get_kill_mapper(name="fail_task")

    # NOTE(review): ``mock.Mock`` stores unknown keyword arguments such as ``autospec`` as plain
    # attributes; ``spec=BaseMapper`` may have been intended - confirm.
    workflow.nodes["task"] = ParsedNode(mock.Mock(autospec=BaseMapper))
    fail_node = ParsedNode(kill_mapper)
    workflow.nodes["fail_task"] = fail_node
    success_node = ParsedNode(mock.Mock(autospec=BaseMapper))
    workflow.nodes["success_task"] = success_node
    success_node.set_is_ok(True)
    fail_node.set_is_error(True)

    workflow.relations = {
        Relation(from_task_id="task", to_task_id="fail_task"),
        Relation(from_task_id="task", to_task_id="success_task"),
    }

    # When
    kill_mapper.on_parse_finish(workflow)

    # Then
    remaining_node_names = set(workflow.nodes.keys())
    self.assertEqual(remaining_node_names, {"task", "success_task"})
    expected_relations = {Relation(from_task_id="task", to_task_id="success_task")}
    self.assertEqual(workflow.relations, expected_relations)
def _create_workflow():
    """Build a sample workflow with one task-group relation, one task group and one dependency."""
    group_relations = {Relation(from_task_id="DAG_NAME_A", to_task_id="DAG_NAME_B")}
    dummy_task = Task(task_id="task_name", template_name="dummy.tpl")
    task_groups = {"TASK_NAME": TaskGroup(name="DAG_NAME_A", tasks=[dummy_task])}
    return Workflow(
        dag_name="DAG_NAME",
        input_directory_path="/tmp/input",
        output_directory_path="/tmp/output",
        task_group_relations=group_relations,
        task_groups=task_groups,
        dependencies={"import IMPORT"},
    )
def setUp(self) -> None:
    """Prepare the transformer, a workflow holding one action task group, and the properties."""
    self.transformer = AddNodeNotificationTransformer()

    workflow = Workflow(input_directory_path="", output_directory_path="", dag_name="DAG_NAME")
    action_group = ActionTaskGroup(name="action_task_group", tasks=[new_task("action_task")])
    workflow.task_groups[action_group.name] = action_group
    self.workflow = workflow
    self.action_task_group = action_group

    notification_properties = {PROP_KEY_NODE_NOTIFICATION_URL: NODE_NOTIFICATION_URL_TPL}
    self.props = PropertySet(job_properties=notification_properties)
def _create_workflow(nodes=None):
    """
    Build a sample workflow for tests.

    :param nodes: nodes to put into the workflow; when falsy, a single dummy node stored
        under the key ``AAA`` is used instead
    :return: the created ``Workflow``
    """
    relations = {Relation(from_task_id="DAG_NAME_A", to_task_id="DAG_NAME_B")}
    if not nodes:
        dummy_mapper = DummyMapper(Element("dummy"), name="DAG_NAME_A", dag_name="DAG_NAME_B")
        nodes = {"AAA": ParsedActionNode(dummy_mapper)}
    return Workflow(
        dag_name="A",
        input_directory_path="in_dir",
        output_directory_path="out_dir",
        relations=relations,
        nodes=nodes,
        dependencies={"import IMPORT"},
    )
def test_create_dag_file(self, open_mock, _):
    """``create_dag_file`` should open the target DAG file for writing exactly once."""
    # Given
    dummy_node = ParsedActionNode(DummyMapper(Element("dummy"), name="AAA"))
    workflow = Workflow(
        dag_name="A",
        input_directory_path="in_dir",
        output_directory_path="out_dir",
        relations={Relation(from_task_id="AAA", to_task_id="BBB")},
        nodes={"AAA": dummy_node},
        dependencies={"import AAAA"},
    )

    # When
    self.converter.create_dag_file(workflow)

    # Then
    open_mock.assert_called_once_with("/tmp/test_dag.py", "w")
def _create_workflow():
    """Build a sample workflow with one task-group relation, one dummy node and one dependency."""
    group_relations = {Relation(from_task_id="DAG_NAME_A", to_task_id="DAG_NAME_B")}
    dummy_mapper = DummyMapper(Element("dummy"), name="DAG_NAME_A", dag_name="DAG_NAME_B")
    nodes = {"AAA": OozieActionNode(dummy_mapper)}
    return Workflow(
        dag_name="DAG_NAME",
        input_directory_path="/tmp/input",
        output_directory_path="/tmp/output",
        task_group_relations=group_relations,
        nodes=nodes,
        dependencies={"import IMPORT"},
    )
def test_should_do_nothing_when_notification_url_not_configured(self):
    """With an empty property set the transformer should leave the task groups unchanged."""
    # Given
    transformer = AddWorkflowNotificationTransformer()
    workflow = Workflow(input_directory_path="", output_directory_path="", dag_name="DAG_NAME_B")
    props = PropertySet()
    only_task = Task(task_id="first_task", template_name="dummy.tpl")
    only_group = TaskGroup(name="first_task", tasks=[only_task])
    workflow.task_groups[only_group.name] = only_group

    # When
    transformer.process_workflow_after_convert_nodes(workflow, props)

    # Then
    expected_group_names = {only_group.name}
    self.assertEqual(expected_group_names, workflow.task_groups.keys())
def __init__(
    self,
    input_directory_path: str,
    output_directory_path: str,
    params: Dict[str, str],
    action_mapper: Dict[str, Type[ActionMapper]],
    control_mapper: Dict[str, Type[BaseMapper]],
    dag_name: str = None,
):
    """
    Create the workflow container and remember where ``workflow.xml`` lives.

    :param input_directory_path: input directory; ``workflow.xml`` is looked up below it,
        inside ``HDFS_FOLDER``
    :param output_directory_path: output directory passed to the ``Workflow``
    :param params: parameters stored on the instance as ``params``
    :param action_mapper: mapping stored on the instance as ``action_map``
    :param control_mapper: mapping stored on the instance as ``control_map``
    :param dag_name: name passed to the ``Workflow`` as ``dag_name``
    """
    workflow_kwargs = {
        "dag_name": dag_name,
        "input_directory_path": input_directory_path,
        "output_directory_path": output_directory_path,
    }
    self.workflow = Workflow(**workflow_kwargs)

    workflow_xml_path = os.path.join(input_directory_path, HDFS_FOLDER, "workflow.xml")
    self.workflow_file = workflow_xml_path

    self.params, self.action_map, self.control_map = params, action_mapper, control_mapper
def test_create_dag_file(self, black_mock, open_mock, _):
    """``create_dag_file`` should write the DAG file once and format it with black."""
    # Given
    dag_file = "/tmp/test_dag.py"
    dummy_node = ParsedNode(DummyMapper(ET.Element("dummy"), name="AAA"))
    workflow = Workflow(
        dag_name="A",
        input_directory_path="in_dir",
        output_directory_path="out_dir",
        relations={Relation(from_task_id="AAA", to_task_id="BBB")},
        nodes={"AAA": dummy_node},
        dependencies={"import AAAA"},
    )

    # When
    self.converter.create_dag_file(workflow)

    # Then
    open_mock.assert_called_once_with(dag_file, "w")
    black_mock.format_file_in_place.assert_called_once_with(
        Path(dag_file), fast=mock.ANY, mode=mock.ANY, write_back=mock.ANY
    )
def __init__(
    self,
    input_directory_path: str,
    output_directory_path: str,
    props: PropertySet,
    action_mapper: Dict[str, Type[ActionMapper]],
    renderer: BaseRenderer,
    dag_name: str,
):
    """
    Create the workflow container and remember where ``workflow.xml`` lives.

    :param input_directory_path: input directory; ``workflow.xml`` is looked up below it,
        inside ``HDFS_FOLDER``
    :param output_directory_path: output directory passed to the ``Workflow``
    :param props: property set stored on the instance as ``props``
    :param action_mapper: mapping stored on the instance as ``action_map``
    :param renderer: renderer stored on the instance as ``renderer``
    :param dag_name: name passed to the ``Workflow`` as ``dag_name``
    """
    workflow_kwargs = {
        "dag_name": dag_name,
        "input_directory_path": input_directory_path,
        "output_directory_path": output_directory_path,
    }
    self.workflow = Workflow(**workflow_kwargs)

    workflow_xml_path = os.path.join(input_directory_path, HDFS_FOLDER, "workflow.xml")
    self.workflow_file = workflow_xml_path

    self.props, self.action_map, self.renderer = props, action_mapper, renderer
def test_should_remove_start_node(self):
    """Only the non-start node is expected to remain after the transformer has run."""
    # Given
    transformer = RemoveStartTransformer()
    workflow = Workflow(input_directory_path="", output_directory_path="", dag_name="DAG_NAME_B")

    other_mapper = mock.Mock(spec=BaseMapper)
    other_mapper.name = "first_task"
    start_mapper = mock.Mock(spec=StartMapper)
    start_mapper.name = "start_task"

    for mapper in (other_mapper, start_mapper):
        dummy_tasks = [self._get_dummy_task(mapper.name)]
        workflow.nodes[mapper.name] = ParsedActionNode(mapper=mapper, tasks=dummy_tasks)

    # When
    transformer.process_workflow(workflow)

    # Then
    remaining_node_names = set(workflow.nodes.keys())
    self.assertEqual({other_mapper.name}, remaining_node_names)
def test_should_remove_multiple_join_nodes(self):
    """
    Every join node in the workflow should be removed by the transformer.

    Two join nodes are registered and no downstream names are assigned to them in this test;
    afterwards the workflow is expected to contain no nodes at all.
    """
    # Given
    transformer = RemoveJoinTransformer()
    workflow = Workflow(input_directory_path="", output_directory_path="", dag_name="DAG_NAME_B")

    join_a_mapper = mock.Mock(spec=JoinMapper)
    join_a_mapper.name = "join_A"
    join_a_node = OozieNode(join_a_mapper)

    join_b_mapper = mock.Mock(spec=JoinMapper)
    join_b_mapper.name = "join_B"
    join_b_node = OozieNode(join_b_mapper)

    # Each node is registered exactly once; the repeated assignments of the same
    # key/value pairs that used to follow were no-ops and have been dropped.
    workflow.nodes[join_a_mapper.name] = join_a_node
    workflow.nodes[join_b_mapper.name] = join_b_node

    # When
    transformer.process_workflow_after_parse_workflow_xml(workflow)

    # Then
    self.assertEqual(set(), set(workflow.nodes.keys()))
def test_should_remove_node_in_error_flow(self):
    """A kill node referenced only as an error downstream is expected to be removed."""
    # Given
    transformer = RemoveKillTransformer()
    workflow = Workflow(input_directory_path="", output_directory_path="", dag_name="DAG_NAME_B")

    first_mapper = mock.Mock(spec=BaseMapper)
    first_mapper.name = "first_task"
    kill_mapper = mock.Mock(spec=KillMapper)
    kill_mapper.name = "second_task"

    first_node = OozieNode(first_mapper)
    kill_node = OozieNode(kill_mapper)
    first_node.error_downstream_name = kill_mapper.name

    for mapper, node in ((first_mapper, first_node), (kill_mapper, kill_node)):
        workflow.nodes[mapper.name] = node

    # When
    transformer.process_workflow_after_parse_workflow_xml(workflow)

    # Then
    remaining_node_names = set(workflow.nodes.keys())
    self.assertEqual({first_mapper.name}, remaining_node_names)
def test_should_remove_end_node(self):
    """The end node should be removed and its upstream node left without downstream names."""
    # Given
    transformer = RemoveEndTransformer()
    workflow = Workflow(input_directory_path="", output_directory_path="", dag_name="DAG_NAME_B")

    first_mapper = mock.Mock(spec=BaseMapper)
    first_mapper.name = "first_task"
    end_mapper = mock.Mock(spec=EndMapper)
    end_mapper.name = "second_task"

    first_node = ParsedActionNode(mapper=first_mapper)
    end_node = ParsedActionNode(mapper=end_mapper)
    first_node.downstream_names = [end_node.name]

    for mapper, node in ((first_mapper, first_node), (end_mapper, end_node)):
        workflow.nodes[mapper.name] = node

    # When
    transformer.process_workflow(workflow)

    # Then
    remaining_node_names = set(workflow.nodes.keys())
    self.assertEqual({first_mapper.name}, remaining_node_names)
    self.assertEqual([], first_node.downstream_names)
def _find_accessible_nodes(workflow: Workflow):
    """
    Finds nodes that are reachable from any Start node.

    The graph is walked depth-first along ``downstream_names`` and, when set,
    ``error_downstream_name``. An explicit stack is used instead of recursion so that long
    chains of nodes cannot exceed the interpreter's recursion limit; nodes are still
    discovered in the same order as a recursive pre-order walk would discover them.

    :param workflow: workflow whose ``nodes`` mapping is used to resolve node names
    :return: view of the visited nodes, in order of first discovery
    :raises KeyError: when a downstream name is not a key of ``workflow.nodes``
    """
    start_nodes = workflow.get_nodes_by_type(StartMapper)
    visited_nodes: Dict[str, OozieNode] = {}

    for start_node in start_nodes:
        pending = [start_node]
        while pending:
            node = pending.pop()
            if node.name in visited_nodes:
                continue
            visited_nodes[node.name] = node

            successor_names = [*node.downstream_names]
            if node.error_downstream_name:
                successor_names.append(node.error_downstream_name)

            # Push in reverse so the first successor is popped (and therefore visited) first,
            # which keeps the discovery order of the former recursive implementation.
            pending.extend(workflow.nodes[name] for name in reversed(successor_names))

    return visited_nodes.values()
def test_should_not_remove_end_node_when_connected_with_decision(self):
    """An end node that is a downstream of a decision node must be kept, links included."""
    # Given
    transformer = RemoveEndTransformer()
    workflow = Workflow(input_directory_path="", output_directory_path="", dag_name="DAG_NAME_B")

    decision_mapper = mock.Mock(spec=DecisionMapper)
    decision_mapper.name = "first_task"
    second_mapper = mock.Mock(spec=BaseMapper)
    second_mapper.name = "second_task"
    end_mapper = mock.Mock(spec=EndMapper)
    end_mapper.name = "end_task"

    decision_node = ParsedActionNode(
        mapper=decision_mapper, tasks=[self._get_dummy_task(decision_mapper.name)]
    )
    second_node = ParsedActionNode(mapper=second_mapper, tasks=[self._get_dummy_task(second_mapper.name)])
    end_node = ParsedActionNode(mapper=end_mapper, tasks=[self._get_dummy_task(end_mapper.name)])

    decision_node.downstream_names = [second_mapper.name, end_mapper.name]

    registered = ((decision_mapper, decision_node), (second_mapper, second_node), (end_mapper, end_node))
    for mapper, node in registered:
        workflow.nodes[mapper.name] = node

    # When
    transformer.process_workflow(workflow)

    # Then
    expected_node_names = {decision_mapper.name, second_mapper.name, end_mapper.name}
    self.assertEqual(expected_node_names, set(workflow.nodes.keys()))
    self.assertEqual([second_mapper.name, end_mapper.name], decision_node.downstream_names)
    self.assertEqual([], second_node.downstream_names)
    self.assertEqual([], end_node.downstream_names)
def test_on_parse_finish_decision_should_not_remove_end_node(self):
    """An end node fed by a decision node stays; only the decision relation is expected to remain."""

    def relation_to_end(source_task_id):
        """Build a fresh relation from the given task to ``end_task``."""
        return Relation(from_task_id=source_task_id, to_task_id="end_task")

    # Given
    workflow = Workflow(input_directory_path="", output_directory_path="", dag_name="DAG_NAME_B")
    end_mapper = self._get_end_mapper("end_task")

    decision_node = ParsedActionNode(
        mock.Mock(spec=DecisionMapper), tasks=[self._get_dummy_task("first_task")]
    )
    workflow.nodes["first_task"] = decision_node
    plain_node = ParsedActionNode(mock.Mock(spec=BaseMapper), tasks=[self._get_dummy_task("second_task")])
    workflow.nodes["second_task"] = plain_node
    workflow.nodes["end_task"] = ParsedActionNode(end_mapper, tasks=[self._get_dummy_task("end_task")])

    workflow.relations = {relation_to_end("first_task"), relation_to_end("second_task")}

    # When
    end_mapper.on_parse_finish(workflow)

    # Then
    remaining_node_names = set(workflow.nodes.keys())
    self.assertEqual({"first_task", "second_task", "end_task"}, remaining_node_names)
    self.assertEqual({relation_to_end("first_task")}, workflow.relations)
def process_workflow(self, workflow: Workflow):
    """
    Remove every node returned by the workflow for ``StartMapper``.

    :param workflow: workflow that is modified in place
    """
    for node in workflow.get_nodes_by_type(StartMapper):
        workflow.remove_node(node)