def test_missing_spark_arguments(self):
        """Check the command built when no spark options are configured.

        The task config holds only ``application_source`` and
        ``application_arguments``; the expected command therefore contains
        ``spark-submit``, the source file and the application arguments.
        The comparison ignores element order.
        """
        task_config = {
            'application_source': 'my_app.py',
            'application_arguments': {
                '--query': "select * from my_table where"
                           " date_prt >= '{{yesterday_ds}}'",
                '--output': 'myoutputtable'
            }
        }

        expected = ['spark-submit', 'my_app.py',
                    '--query',
                    "select * from my_table where "
                    "date_prt >= '{{yesterday_ds}}'",
                    '--output', 'myoutputtable']

        actual = SparkTask(
            'my_spark_task',
            DummyDag('dag-id', 'my_spark_task'),
            [],
            trigger_rule='all_success',
            liminal_config={},
            pipeline_config={'pipeline': 'pipeline'},
            task_config=task_config
        ).get_runnable_command()

        # list.sort() sorts in place and returns None, so comparing the
        # results of two .sort() calls compares None with None and can never
        # fail. sorted() returns the ordered lists, so the contents are
        # really checked while the comparison stays order-insensitive.
        self.assertEqual(sorted(actual), sorted(expected))
Exemple #2
0
    def test_partially_missing_spark_arguments(self):
        """Check the command built when only some spark options are set.

        The task config sets ``class`` and ``conf`` but no ``master``; the
        expected command carries the matching ``--class`` and ``--conf``
        entries followed by the source file and the application arguments.
        The comparison ignores element order.
        """
        task_config = {
            'application_source': 'my_app.py',
            'class': 'org.apache.liminal.MySparkApp',
            'conf': {
                'spark.driver.memory': '1g',
                'spark.driver.maxResultSize': '1g',
                'spark.yarn.executor.memoryOverhead': '500M'
            },
            'application_arguments': {
                '--query':
                "select * from dlk_visitor_funnel_dwh_staging.fact_events where unified_Date_prt >= "
                "'{{yesterday_ds}}'",
                '--output':
                'mytable'
            }
        }

        # Kept as a plain list: calling .sort() on the literal would bind
        # None here, because list.sort() returns None.
        expected = [
            'spark-submit', '--class', 'org.apache.liminal.MySparkApp',
            '--conf', 'spark.driver.memory=1g', '--conf',
            'spark.driver.maxResultSize=1g', '--conf',
            'spark.yarn.executor.memoryOverhead=500M', 'my_app.py', '--query',
            'select * from dlk_visitor_funnel_dwh_staging.fact_events where '
            "unified_Date_prt >= '{{yesterday_ds}}'", '--output', 'mytable'
        ]

        actual = SparkTask('my_spark_task',
                           DummyDag('dag-id', 'my_spark_task'), [],
                           trigger_rule='all_success',
                           liminal_config={},
                           pipeline_config={},
                           task_config=task_config).get_runnable_command()

        # sorted() returns new ordered lists, so this compares real contents
        # instead of the None that list.sort() returns.
        self.assertEqual(sorted(actual), sorted(expected))
Exemple #3
0
    def test_missing_spark_arguments(self):
        """Compare the exact command built from a config without spark options.

        Only ``application_source`` and ``application_arguments`` are given.
        The generated command must equal the expected list, order included.
        """
        app_arguments = {
            '--query': (
                "select * from dlk_visitor_funnel_dwh_staging.fact_events "
                "where unified_Date_prt >= '{{yesterday_ds}}'"
            ),
            '--output': 'mytable',
        }
        config = {
            'application_source': 'my_app.py',
            'application_arguments': app_arguments,
        }

        spark_task = SparkTask(
            'my_spark_task',
            DummyDag('dag-id', 'my_spark_task'),
            [],
            trigger_rule='all_success',
            liminal_config={},
            pipeline_config={},
            task_config=config,
        )
        command = spark_task.get_runnable_command()

        wanted = [
            'spark-submit',
            'my_app.py',
            '--query',
            "select * from dlk_visitor_funnel_dwh_staging.fact_events where "
            "unified_Date_prt >= '{{yesterday_ds}}'",
            '--output',
            'mytable',
        ]
        self.assertEqual(command, wanted)
    def test_apply_task_to_dag(self):
        """Execute two python tasks on one DAG and verify the files they leave.

        The first task runs ``write_inputs.py`` with ``NUM_FILES=10`` and
        ``NUM_SPLITS=3``; the second runs ``write_outputs.py`` with three
        executors. Afterwards the test checks the ``inputs`` split folders,
        the ``outputs`` file names and the content of every output file
        under ``self.temp_dir``.
        """
        dag = dag_test_utils.create_dag()

        input_task = self.__create_python_task(
            dag,
            'my_input_task',
            None,
            'my_python_task_img',
            'python -u write_inputs.py',
            env_vars={'NUM_FILES': 10, 'NUM_SPLITS': 3},
        )
        input_task.apply_task_to_dag()

        # dag.tasks[0] is read only after the first task was applied to the
        # DAG; presumably it serves as the parent of the second task.
        output_task = self.__create_python_task(
            dag,
            'my_output_task',
            dag.tasks[0],
            'my_parallelized_python_task_img',
            'python -u write_outputs.py',
            executors=3,
        )
        output_task.apply_task_to_dag()

        for dag_task in dag.tasks:
            print(f'Executing task {dag_task.task_id}')
            dag_task.execute(DummyDag('my_dag', dag_task.task_id).context)

        inputs_dir = os.path.join(self.temp_dir, 'inputs')
        outputs_dir = os.path.join(self.temp_dir, 'outputs')

        top_level_entries = sorted(os.listdir(self.temp_dir))
        self.assertListEqual(top_level_entries, sorted(['outputs', 'inputs']))

        self.assertListEqual(sorted(os.listdir(inputs_dir)), ['0', '1', '2'])

        # Expected input files per split folder, checked in folder order.
        files_per_split = {
            '0': ['input0.json', 'input3.json', 'input6.json', 'input9.json'],
            '1': ['input1.json', 'input4.json', 'input7.json'],
            '2': ['input2.json', 'input5.json', 'input8.json'],
        }
        for split_name, wanted_files in files_per_split.items():
            split_dir = os.path.join(inputs_dir, split_name)
            self.assertListEqual(sorted(os.listdir(split_dir)), wanted_files)

        wanted_outputs = [
            'output0.txt', 'output1.txt', 'output2.txt', 'output3.txt',
            'output4.txt', 'output5.txt', 'output6.txt', 'output7.txt',
            'output8.txt', 'output9.txt',
        ]
        self.assertListEqual(sorted(os.listdir(outputs_dir)), wanted_outputs)

        for output_name in os.listdir(outputs_dir):
            # 'outputN.txt' is expected to contain exactly 'myvalN'.
            with open(os.path.join(outputs_dir, output_name)) as output_file:
                without_prefix = output_name.replace('output', 'myval')
                wanted_content = without_prefix.replace('.txt', '')
                self.assertEqual(output_file.read(), wanted_content)
    def setUp(self) -> None:
        """Prepare the EMR job flow, DAG, mocked Hadoop task and executor.

        Stores on ``self``: the ``run_job_flow`` arguments, a boto3 EMR
        client, the id of the job flow it starts, a DAG with a dummy context,
        a ``MagicMock`` standing in for ``hadoop.HadoopTask`` and the
        ``EMRExecutor`` under test.
        """
        instances = {
            "InstanceCount": 1,
            "KeepJobFlowAliveWhenNoSteps": True,
            "MasterInstanceType": "c3.medium",
            "Placement": {"AvailabilityZone": "us-east-1"},
            "SlaveInstanceType": "c3.xlarge",
        }
        self.run_job_flow_args = {
            "Instances": instances,
            "JobFlowRole": "EMR_EC2_DefaultRole",
            "LogUri": "s3://liminal/log",
            "Name": "test-emr-cluster",
            "ServiceRole": "EMR_DefaultRole",
            "VisibleToAllUsers": True,
        }

        self.client = boto3.client("emr", region_name="us-east-1")

        # The client gets a deep copy, so the stored arguments are never
        # shared with the call.
        job_flow = self.client.run_job_flow(**deepcopy(self.run_job_flow_args))
        self.cluster_id = job_flow["JobFlowId"]

        self.dag = dag_test_utils.create_dag()
        dummy_dag = DummyDag(dag_id=self.dag.dag_id, task_id="")
        self.dag.context = dummy_dag.context

        # The executor name doubles as the cluster name and matches the
        # "Name" of the job flow started above.
        self.executor_name = 'test-emr-cluster'
        executor_config = {
            'executor': self.executor_name,
            'cluster_name': self.executor_name,
            'aws_conn_id': 'us-east-1',
            'type': 'emr',
            'properties': {'ActionOnFailure': 'CONTINUE'},
        }

        mocked_task = MagicMock(spec=hadoop.HadoopTask)
        mocked_task.get_runnable_command.return_value = [
            'spark-submit', 'test', 'params', '--param'
        ]
        mocked_task.task_id = 'spark-task'
        mocked_task.dag = self.dag
        mocked_task.trigger_rule = 'all_done'
        mocked_task.parent = None
        self.hadoop_task = mocked_task

        self.emr = EMRExecutor(
            self.executor_name,
            liminal_config={},
            executor_config=executor_config,
        )
    def test_get_runnable_command(self):
        """Check the command built from a fully populated spark task config.

        The task config sets ``master``, ``class``, ``conf`` and
        ``application_arguments``; the expected command carries the matching
        ``--master``, ``--class`` and ``--conf`` entries, the source file and
        the application arguments. The comparison ignores element order.
        """
        task_config = {
            'application_source': 'my_app.py',
            'master': 'yarn',
            'class': 'org.apache.liminal.MySparkApp',
            'conf': {
                'spark.driver.memory': '1g',
                'spark.driver.maxResultSize': '1g',
                'spark.yarn.executor.memoryOverhead': '500M'
            },
            'application_arguments': {
                '--query': "select * from "
                           "my_table where date_prt >= "
                           "'{{yesterday_ds}}'",
                '--output': 'mytable'
            }
        }

        expected = ['spark-submit',
                    '--master',
                    'yarn',
                    '--class',
                    'org.apache.liminal.MySparkApp',
                    '--conf',
                    'spark.driver.maxResultSize=1g', '--conf', 'spark.driver.memory=1g', '--conf',
                    'spark.yarn.executor.memoryOverhead=500M', 'my_app.py',
                    '--query',
                    "select * from my_table where "
                    "date_prt >= '{{yesterday_ds}}'",
                    '--output', 'mytable']

        actual = SparkTask(
            'my_spark_task',
            DummyDag('dag-id', 'my_spark_task'),
            [],
            trigger_rule='all_success',
            liminal_config={},
            pipeline_config={'pipeline': 'pipeline'},
            task_config=task_config
        ).get_runnable_command()

        # list.sort() sorts in place and returns None, so comparing the
        # results of two .sort() calls compares None with None and can never
        # fail. sorted() returns the ordered lists, so the contents are
        # really checked while the comparison stays order-insensitive.
        self.assertEqual(sorted(actual), sorted(expected))
    def test_spark_on_k8s(self):
        """Run a spark word-count task through the Kubernetes pod executor.

        Recreates the local volume on a fresh temp directory, builds the
        apps under ``apps/test_spark_app``, applies a ``SparkTask`` to a DAG
        via ``KubernetesPodExecutor``, executes every DAG task and compares
        the lines of the ``.json`` result in ``outputs`` with the expected
        word counts.
        """
        volume_util.delete_local_volume(self._VOLUME_NAME)
        os.environ['TMPDIR'] = '/tmp'
        self.temp_dir = tempfile.mkdtemp()

        # A "/var/folders" prefix is rewritten to "/private/var/folders"
        # before the path is used as the volume's local path.
        local_path = self.temp_dir.replace("/var/folders",
                                           "/private/var/folders")
        local_volume = {
            'volume': self._VOLUME_NAME,
            'local': {'path': local_path},
        }
        self.liminal_config = {'volumes': [local_volume]}
        volume_util.create_local_volumes(self.liminal_config, None)

        # build spark image
        apps_path = os.path.join(os.path.dirname(__file__),
                                 '../../apps/test_spark_app')
        liminal_apps_builder.build_liminal_apps(apps_path)

        outputs_dir = os.path.join(self.temp_dir, 'outputs')

        volume_mount = {
            'mount': 'mymount',
            'volume': self._VOLUME_NAME,
            'path': '/mnt/vol1',
        }
        task_config = {
            'task': "my_spark_task",
            'image': "my_spark_image",
            'application_source': 'wordcount.py',
            'application_arguments': ['words.txt', '/mnt/vol1/outputs/'],
            'env_vars': {},
            'mounts': [volume_mount],
        }

        dag = dag_test_utils.create_dag()

        spark_task = SparkTask(
            task_id="my_spark_task",
            dag=dag,
            liminal_config=self.liminal_config,
            pipeline_config={'pipeline': 'my_pipeline'},
            task_config=task_config,
            parent=None,
            trigger_rule='all_success',
        )

        pod_executor = KubernetesPodExecutor(
            task_id='k8s',
            liminal_config=self.liminal_config,
            executor_config={'executor': 'k8s', 'name': 'mypod'},
        )
        pod_executor.apply_task_to_dag(task=spark_task)

        for dag_task in dag.tasks:
            print(f'Executing task {dag_task.task_id}')
            dag_task.execute(DummyDag('my_dag', dag_task.task_id).context)

        # The trailing newline makes split() yield a final empty string,
        # matching a result file that ends with a newline.
        expected_lines = (
            '{"word":"my","count":1}\n'
            '{"word":"first","count":1}\n'
            '{"word":"liminal","count":1}\n'
            '{"word":"spark","count":1}\n'
            '{"word":"task","count":1}\n'
        ).split("\n")

        # If several .json files exist, only the last one read is kept.
        result_text = ''
        for entry in os.listdir(outputs_dir):
            if not entry.endswith(".json"):
                continue
            with open(os.path.join(outputs_dir, entry)) as result_file:
                result_text = result_file.read()

        self.assertEqual(result_text.split("\n"), expected_lines)