def __init__(
     self,
     dag_name: str,
     input_directory_path: str,
     output_directory_path: str,
     action_mapper: Dict[str, Type[ActionMapper]],
     renderer: BaseRenderer,
     transformers: List[BaseWorkflowTransformer] = None,
     user: str = None,
     initial_props: PropertySet = None,
 ):
     """
     Assemble the workflow, property set, property parser and Oozie parser.

     :param dag_name: Forwarded to ``Workflow`` as ``dag_name``.
     :param input_directory_path: Forwarded to ``Workflow``.
     :param output_directory_path: Forwarded to ``Workflow``.
     :param action_mapper: Forwarded unchanged to ``parser.OozieParser``.
     :param renderer: Kept on ``self.renderer`` and handed to the parser.
     :param transformers: Kept on ``self.transformers``; a falsy value
         becomes an empty list.
     :param user: Value stored under the ``"user.name"`` job property. When
         falsy, ``os.environ["USER"]`` is read instead, which raises
         ``KeyError`` if that variable is not set.
     :param initial_props: Optional property set whose ``job_properties``
         seed the job properties of this instance.
     """
     workflow_settings = {
         "dag_name": dag_name,
         "input_directory_path": input_directory_path,
         "output_directory_path": output_directory_path,
     }
     self.workflow = Workflow(**workflow_settings)
     self.renderer = renderer
     self.transformers = transformers if transformers else []

     # Start from the caller-supplied job properties when there are any.
     # The object returned by ``initial_props.job_properties`` is used as-is
     # (no copy is made here) and receives the "user.name" entry below.
     if initial_props:
         seed_properties = initial_props.job_properties
     else:
         seed_properties = {}
     seed_properties["user.name"] = user if user else os.environ["USER"]
     self.props = PropertySet(job_properties=seed_properties)

     # Both parsers operate on the same props/workflow objects.
     self.property_parser = PropertyParser(
         props=self.props,
         workflow=self.workflow,
     )
     self.parser = parser.OozieParser(
         props=self.props,
         action_mapper=action_mapper,
         renderer=self.renderer,
         workflow=self.workflow,
     )
Example #2
0
 def setUp(self):
     """Build the parser under test from ``EXAMPLE_DEMO_PATH`` with empty properties."""
     empty_props = PropertySet(job_properties={}, config={})
     demo_workflow = Workflow(
         input_directory_path=EXAMPLE_DEMO_PATH,
         output_directory_path="/tmp",
         dag_name="DAG_NAME_B",
     )
     # The renderer is a MagicMock, so no real renderer is involved here.
     self.parser = parser.OozieParser(
         workflow=demo_workflow,
         props=empty_props,
         action_mapper=ACTION_MAP,
         renderer=mock.MagicMock(),
     )
Example #3
0
 def setUp(self):
     """Build the parser under test from ``EXAMPLE_DEMO_PATH`` with an empty params dict."""
     parser_settings = {
         "input_directory_path": EXAMPLE_DEMO_PATH,
         "output_directory_path": "/tmp",
         "params": {},
         "action_mapper": ACTION_MAP,
         "control_mapper": CONTROL_MAP,
     }
     self.parser = parser.OozieParser(**parser_settings)
 def __init__(
     self,
     dag_name: str,
     input_directory_path: str,
     output_directory_path: str,
     action_mapper: Dict[str, Type[ActionMapper]],
     control_mapper: Dict[str, Type[BaseMapper]],
     template_name: str = "workflow.tpl",
     user: str = None,
     start_days_ago: int = None,
     schedule_interval: str = None,
     output_dag_name: str = None,
 ):
     """
     Record the conversion settings, resolve the parameters and build the parser.

     :param dag_name: Desired output DAG name; also the default stem of the
         output file name.
     :param input_directory_path: Oozie workflow directory.
     :param output_directory_path: Desired output directory.
     :param action_mapper: Forwarded unchanged to ``parser.OozieParser``.
     :param control_mapper: Forwarded unchanged to ``parser.OozieParser``.
     :param template_name: Kept on ``self.template_name``.
     :param user: Username.  # TODO remove me and use real ${user} EL
         When falsy, ``os.environ["USER"]`` is read instead.
     :param start_days_ago: Desired DAG start date, expressed as number of
         days ago from the present day.
     :param schedule_interval: Desired DAG schedule interval, expressed as
         number of days.
     :param output_dag_name: Optional output file name, joined onto
         ``output_directory_path``. When falsy, ``dag_name`` plus ``".py"``
         is used.
     """
     # One instance handles one workflow: its required dependencies
     # (imports), operator relations and operator execution sequence.
     self.dag_name = dag_name
     self.template_name = template_name
     self.input_directory_path = input_directory_path
     self.output_directory_path = output_directory_path
     self.start_days_ago = start_days_ago
     self.schedule_interval = schedule_interval

     # Locations of the two property files inside the input directory.
     self.configuration_properties_file = os.path.join(input_directory_path, CONFIGURATION_PROPERTIES)
     self.job_properties_file = os.path.join(input_directory_path, JOB_PROPERTIES)

     # An explicit output name is used verbatim; otherwise the file is named
     # after the DAG with a ".py" suffix.
     if output_dag_name:
         self.output_dag_name = os.path.join(output_directory_path, output_dag_name)
     else:
         self.output_dag_name = os.path.join(output_directory_path, self.dag_name) + ".py"

     # Parameters are built up in stages; every attribute above is already
     # set when ``add_properties_to_params`` runs.
     resolved_params = {"user.name": user if user else os.environ["USER"]}
     resolved_params = self.add_properties_to_params(resolved_params)
     resolved_params = el_utils.parse_els(self.configuration_properties_file, resolved_params)
     self.params = resolved_params

     self.parser = parser.OozieParser(
         input_directory_path=input_directory_path,
         output_directory_path=output_directory_path,
         params=resolved_params,
         dag_name=dag_name,
         action_mapper=action_mapper,
         control_mapper=control_mapper,
     )
 def __init__(
     self,
     dag_name: str,
     input_directory_path: str,
     output_directory_path: str,
     action_mapper: Dict[str, Type[ActionMapper]],
     renderer: BaseRenderer,
     user: str = None,
     initial_props: PropertySet = None,
 ):
     """
     Record the conversion settings, prepare the property set and build the parser.

     :param dag_name: Desired output DAG name.
     :param input_directory_path: Oozie workflow directory.
     :param output_directory_path: Desired output directory.
     :param action_mapper: Forwarded unchanged to ``parser.OozieParser``.
     :param renderer: Kept on ``self.renderer`` and handed to the parser.
     :param user: Username.  # TODO remove me and use real ${user} EL
         When falsy, ``os.environ["USER"]`` is read instead.
     :param initial_props: Optional property set whose ``job_properties``
         seed the job properties of this instance.
     """
     # One instance handles one workflow: its required dependencies
     # (imports), operator relations and operator execution sequence.
     self.dag_name = dag_name
     self.renderer = renderer
     self.input_directory_path = input_directory_path
     self.output_directory_path = output_directory_path
     self.config_file = os.path.join(input_directory_path, CONFIG)
     self.job_properties_file = os.path.join(input_directory_path, JOB_PROPS)

     # Start from the caller-supplied job properties when there are any.
     # The object returned by ``initial_props.job_properties`` is used as-is
     # (no copy is made here) and receives the "user.name" entry below.
     if initial_props:
         self.job_properties = initial_props.job_properties
     else:
         self.job_properties = {}
     self.job_properties["user.name"] = user if user else os.environ["USER"]
     self.config: Dict[str, str] = {}

     # The property set holds the very same dict objects as the attributes
     # above, and it exists before the two read_* helpers are called.
     self.props = PropertySet(
         job_properties=self.job_properties,
         config=self.config,
         action_node_properties={},
     )
     self.read_and_update_job_properties_replace_el()
     self.read_config_replace_el()

     self.parser = parser.OozieParser(
         input_directory_path=input_directory_path,
         output_directory_path=output_directory_path,
         props=self.props,
         action_mapper=action_mapper,
         renderer=self.renderer,
         dag_name=dag_name,
     )
Example #6
0
class TestOozieExamples(unittest.TestCase):
    """Parse example workflows under ``EXAMPLES_PATH`` and check the parsed node names."""

    # Each entry is a one-element tuple wrapping a WorkflowTestCase; the case
    # name is joined onto EXAMPLES_PATH by the test method below.
    @parameterized.expand(
        [
            (WorkflowTestCase(
                name="decision",
                node_names={
                    "start_node_1234", "decision-node", "first", "end", "kill"
                },
                job_properties={"nameNode": "hdfs://"},
                config={},
            ), ),
            (WorkflowTestCase(
                name="demo",
                node_names={
                    "start_node_1234",
                    "fork-node",
                    "pig-node",
                    "subworkflow-node",
                    "shell-node",
                    "join-node",
                    "decision-node",
                    "hdfs-node",
                    "end",
                    "fail",
                },
                job_properties={"nameNode": "hdfs://"},
                config={},
            ), ),
            (WorkflowTestCase(
                name="el",
                node_names={"start_node_1234", "ssh", "end", "fail"},
                job_properties={
                    "hostname": "user@BBB",
                    "nameNode": "hdfs://"
                },
                config={},
            ), ),
            (WorkflowTestCase(
                name="fs",
                node_names={
                    "start_node_1234",
                    "end",
                    "fail",
                    "chmod",
                    "mkdir",
                    "fs-node",
                    "delete",
                    "move",
                    "touchz",
                    "chgrp",
                    "join",
                },
                # NOTE(review): "nameNode" appears twice in this literal; in a
                # Python dict display the later entry ("hdfs://") is the one
                # that is kept, so the first value is never used.
                job_properties={
                    "hostname": "user@BBB",
                    "nameNode": "hdfs://*****:*****@BBB",
                    "nameNode": "hdfs://"
                },
                config={},
            ), ),
            (WorkflowTestCase(
                name="subwf",
                node_names={
                    "start_node_1234", "end", "fail", "subworkflow-node"
                },
                job_properties={},
                config={},
            ), ),
            (WorkflowTestCase(
                name="distcp",
                node_names={"start_node_1234", "end", "fail", "distcp-node"},
                # NOTE(review): the "nameNode1" line below appears damaged -- its
                # value runs straight into what looks like a
                # `@mock.patch("uuid.uuid4", ...)` decorator, and the rest of this
                # case plus the closing of `parameterized.expand([...])` seem to
                # be missing. Restore from version control; TODO confirm.
                job_properties={
                    "hostname": "AAAA@BBB",
                    "nameNode": "hdfs://",
                    "nameNode1": "hdfs://*****:*****@mock.patch("uuid.uuid4", return_value="1234")
    def test_parse_workflow_examples(self, case: WorkflowTestCase, _):
        """
        Parse the example named ``case.name`` and compare the resulting node names.

        ``_`` is an unused extra argument, presumably the mock injected by a
        ``uuid.uuid4`` patch (the decorator line is damaged -- TODO confirm).
        """
        workflow = Workflow(
            input_directory_path=path.join(EXAMPLES_PATH, case.name),
            output_directory_path="/tmp",
            dag_name="DAG_NAME_B",
        )
        current_parser = parser.OozieParser(
            workflow=workflow,
            props=PropertySet(job_properties=case.job_properties,
                              config=case.config),
            action_mapper=ACTION_MAP,
            renderer=mock.MagicMock(),
        )
        current_parser.parse_workflow()
        # Node names must match the expectation exactly.
        self.assertEqual(case.node_names,
                         set(current_parser.workflow.nodes.keys()))
        # The workflow is expected to carry no relations after parsing.
        self.assertEqual(set(), current_parser.workflow.relations)
Example #7
0
class TestOozieExamples(unittest.TestCase):
    """Parse example workflows under ``EXAMPLES_PATH`` and check nodes and relations."""

    # Each entry is a one-element tuple wrapping a WorkflowTestCase; the case
    # name is joined onto EXAMPLES_PATH by the test method below and is also
    # appended to the generated test name by ``name_func``.
    @parameterized.expand(
        [
            (
                WorkflowTestCase(
                    name="decision",
                    node_names={"decision_node", "first", "end", "kill"},
                    relations={
                        Relation(from_task_id="decision_node", to_task_id="end"),
                        Relation(from_task_id="decision_node", to_task_id="first"),
                        Relation(from_task_id="decision_node", to_task_id="kill"),
                    },
                    params={"nameNode": "hdfs://"},
                ),
            ),
            (
                WorkflowTestCase(
                    name="demo",
                    node_names={
                        "fork_node",
                        "pig_node",
                        "subworkflow_node",
                        "shell_node",
                        "join_node",
                        "decision_node",
                        "hdfs_node",
                        "end",
                    },
                    relations={
                        Relation(from_task_id="decision_node", to_task_id="end"),
                        Relation(from_task_id="decision_node", to_task_id="hdfs_node"),
                        Relation(from_task_id="fork_node", to_task_id="pig_node_prepare"),
                        Relation(from_task_id="fork_node", to_task_id="shell_node_prepare"),
                        Relation(from_task_id="fork_node", to_task_id="subworkflow_node"),
                        Relation(from_task_id="join_node", to_task_id="decision_node"),
                        Relation(from_task_id="pig_node", to_task_id="join_node"),
                        Relation(from_task_id="shell_node", to_task_id="join_node"),
                        Relation(from_task_id="subworkflow_node", to_task_id="join_node"),
                    },
                    params={"nameNode": "hdfs://", "dataproc_cluster": "AAA"},
                ),
            ),
            (
                WorkflowTestCase(
                    name="el",
                    node_names={"ssh"},
                    relations=set(),
                    params={"hostname": "AAAA@BBB", "nameNode": "hdfs://"},
                ),
            ),
            (
                WorkflowTestCase(
                    name="fs",
                    node_names={"chmod", "mkdir", "fs_node", "delete", "move", "touchz", "chgrp", "join"},
                    relations={
                        Relation(from_task_id="fs_node", to_task_id="chgrp_fs_0_mkdir"),
                        Relation(from_task_id="fs_node", to_task_id="delete_fs_0_mkdir"),
                        Relation(from_task_id="fs_node", to_task_id="chmod_fs_0_mkdir"),
                        Relation(from_task_id="fs_node", to_task_id="touchz"),
                        Relation(from_task_id="fs_node", to_task_id="mkdir"),
                        Relation(from_task_id="fs_node", to_task_id="move_fs_0_mkdir"),
                        Relation(from_task_id="mkdir", to_task_id="join"),
                        Relation(from_task_id="delete_fs_1_delete", to_task_id="join"),
                        Relation(from_task_id="move_fs_1_move", to_task_id="join"),
                        Relation(from_task_id="touchz", to_task_id="join"),
                        Relation(from_task_id="chgrp_fs_1_chgrp", to_task_id="join"),
                        Relation(from_task_id="chmod_fs_7_chmod", to_task_id="join"),
                    },
                    # "nameNode" used to be listed twice in this literal; only the
                    # last value ("hdfs://") ever took effect, so the dead first
                    # entry was dropped. The resulting dict is unchanged.
                    params={"hostname": "AAAA@BBB", "nameNode": "hdfs://"},
                ),
            ),
            (WorkflowTestCase(name="subwf", node_names={"subworkflow_node"}, relations=set(), params={}),),
        ],
        name_func=lambda func, num, p: f"{func.__name__}_{num}_{p.args[0].name}",
    )
    # mock.patch decorators inject their mocks bottom-up: the uuid4 mock is
    # passed first (as ``_``), then the on_parse_finish mock.
    @mock.patch("o2a.mappers.base_mapper.BaseMapper.on_parse_finish", wraps=None)
    @mock.patch("uuid.uuid4", return_value="1234")
    def test_parse_workflow_examples(self, case: WorkflowTestCase, _, on_parse_finish_mock):
        """
        Parse the example named ``case.name`` and verify nodes, relations and the finish hook.

        :param case: Expected node names, relations and input params for one example.
        :param _: Mock replacing ``uuid.uuid4``; not inspected by the test.
        :param on_parse_finish_mock: Mock replacing ``BaseMapper.on_parse_finish``.
        """
        current_parser = parser.OozieParser(
            input_directory_path=path.join(EXAMPLES_PATH, case.name),
            output_directory_path="/tmp",
            params=case.params,
            action_mapper=ACTION_MAP,
            control_mapper=CONTROL_MAP,
        )
        current_parser.parse_workflow()
        self.assertEqual(case.node_names, set(current_parser.workflow.nodes.keys()))
        self.assertEqual(case.relations, current_parser.workflow.relations)
        # Parsing must have invoked the (patched) on_parse_finish hook at least once.
        on_parse_finish_mock.assert_called()