Code example #1
    def test_missing_spark_arguments(self):
        task_config = {
            'application_source': 'my_app.py',
            'application_arguments': {
                '--query': "select * from my_table where"
                           " date_prt >= '{{yesterday_ds}}'",
                '--output': 'myoutputtable'
            }
        }

        expected = ['spark-submit', 'my_app.py',
                    '--query',
                    "select * from my_table where "
                    "date_prt >= '{{yesterday_ds}}'",
                    '--output', 'myoutputtable']

        actual = SparkTask(
            'my_spark_task',
            DummyDag('dag-id', 'my_spark_task'),
            [],
            trigger_rule='all_success',
            liminal_config={},
            pipeline_config={'pipeline': 'pipeline'},
            task_config=task_config
        ).get_runnable_command()

        # list.sort() sorts in place and returns None, so compare sorted copies instead.
        self.assertEqual(sorted(actual), sorted(expected))
Code example #2
    def test_partially_missing_spark_arguments(self):
        task_config = {
            'application_source': 'my_app.py',
            'class': 'org.apache.liminal.MySparkApp',
            'conf': {
                'spark.driver.memory': '1g',
                'spark.driver.maxResultSize': '1g',
                'spark.yarn.executor.memoryOverhead': '500M'
            },
            'application_arguments': {
                '--query':
                "select * from dlk_visitor_funnel_dwh_staging.fact_events where unified_Date_prt >= "
                "'{{yesterday_ds}}'",
                '--output':
                'mytable'
            }
        }

        expected = [
            'spark-submit', '--class', 'org.apache.liminal.MySparkApp',
            '--conf', 'spark.driver.memory=1g', '--conf',
            'spark.driver.maxResultSize=1g', '--conf',
            'spark.yarn.executor.memoryOverhead=500M', 'my_app.py', '--query',
            'select * from dlk_visitor_funnel_dwh_staging.fact_events where '
            "unified_Date_prt >= '{{yesterday_ds}}'", '--output', 'mytable'
        ]

        actual = SparkTask('my_spark_task',
                           DummyDag('dag-id', 'my_spark_task'), [],
                           trigger_rule='all_success',
                           liminal_config={},
                           pipeline_config={},
                           task_config=task_config).get_runnable_command()

        # Compare order-insensitively with sorted copies; list.sort() would return None.
        self.assertEqual(sorted(actual), sorted(expected))
Code example #3
    def test_missing_spark_arguments(self):
        task_config = {
            'application_source': 'my_app.py',
            'application_arguments': {
                '--query':
                "select * from dlk_visitor_funnel_dwh_staging.fact_events where unified_Date_prt >= "
                "'{{yesterday_ds}}'",
                '--output':
                'mytable'
            }
        }

        expected = [
            'spark-submit', 'my_app.py', '--query',
            "select * from dlk_visitor_funnel_dwh_staging.fact_events where unified_Date_prt >="
            " '{{yesterday_ds}}'", '--output', 'mytable'
        ]

        actual = SparkTask('my_spark_task',
                           DummyDag('dag-id', 'my_spark_task'), [],
                           trigger_rule='all_success',
                           liminal_config={},
                           pipeline_config={},
                           task_config=task_config).get_runnable_command()

        self.assertEqual(actual, expected)
Code example #4
    def test_get_runnable_command(self):
        task_config = {
            'application_source': 'my_app.py',
            'master': 'yarn',
            'class': 'org.apache.liminal.MySparkApp',
            'conf': {
                'spark.driver.memory': '1g',
                'spark.driver.maxResultSize': '1g',
                'spark.yarn.executor.memoryOverhead': '500M'
            },
            'application_arguments': {
                '--query': "select * from "
                           "my_table where date_prt >= "
                           "'{{yesterday_ds}}'",
                '--output': 'mytable'
            }
        }

        expected = ['spark-submit',
                    '--master',
                    'yarn',
                    '--class',
                    'org.apache.liminal.MySparkApp',
                    '--conf',
                    'spark.driver.maxResultSize=1g', '--conf', 'spark.driver.memory=1g', '--conf',
                    'spark.yarn.executor.memoryOverhead=500M', 'my_app.py',
                    '--query',
                    "select * from my_table where "
                    "date_prt >= '{{yesterday_ds}}'",
                    '--output', 'mytable']

        actual = SparkTask(
            'my_spark_task',
            DummyDag('dag-id', 'my_spark_task'),
            [],
            trigger_rule='all_success',
            liminal_config={},
            pipeline_config={'pipeline': 'pipeline'},
            task_config=task_config
        ).get_runnable_command()

        # list.sort() sorts in place and returns None, so compare sorted copies instead.
        self.assertEqual(sorted(actual), sorted(expected))
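
The tests above all exercise the same idea: SparkTask.get_runnable_command() turns a task_config dict into a spark-submit argv list. The following is a minimal sketch of that mapping, inferred from the expected lists in these tests; the function name build_spark_submit_command and the exact key handling are assumptions for illustration, not Liminal's actual implementation.

def build_spark_submit_command(task_config):
    # Hypothetical sketch, for illustration only -- not Liminal's SparkTask code.
    command = ['spark-submit']
    if 'master' in task_config:
        command += ['--master', task_config['master']]
    if 'class' in task_config:
        command += ['--class', task_config['class']]
    # Each conf entry becomes its own --conf key=value pair.
    for key, value in task_config.get('conf', {}).items():
        command += ['--conf', '{}={}'.format(key, value)]
    # The application source comes next, followed by its flattened arguments.
    command.append(task_config['application_source'])
    for arg, value in task_config.get('application_arguments', {}).items():
        command += [arg, value]
    return command

Called with the task_config from Code example #4, this sketch would yield the same tokens as the expected list there (dict ordering aside), which is why those tests compare sorted copies rather than exact order.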
Code example #5
    def test_spark_on_k8s(self):
        volume_util.delete_local_volume(self._VOLUME_NAME)
        os.environ['TMPDIR'] = '/tmp'
        self.temp_dir = tempfile.mkdtemp()
        self.liminal_config = {
            'volumes': [
                {
                    'volume': self._VOLUME_NAME,
                    'local': {
                        'path': self.temp_dir.replace(
                            "/var/folders",
                            "/private/var/folders"
                        )
                    }
                }
            ]
        }
        volume_util.create_local_volumes(self.liminal_config, None)

        # build spark image
        liminal_apps_builder.build_liminal_apps(
            os.path.join(os.path.dirname(__file__), '../../apps/test_spark_app'))

        outputs_dir = os.path.join(self.temp_dir, 'outputs')

        task_config = {
            'task': "my_spark_task",
            'image': "my_spark_image",
            'application_source': 'wordcount.py',
            'application_arguments': ['words.txt', '/mnt/vol1/outputs/'],
            'env_vars': {},
            'mounts': [
                {
                    'mount': 'mymount',
                    'volume': self._VOLUME_NAME,
                    'path': '/mnt/vol1'
                }
            ]
        }

        dag = dag_test_utils.create_dag()

        task1 = SparkTask(
            task_id="my_spark_task",
            dag=dag,
            liminal_config=self.liminal_config,
            pipeline_config={
                'pipeline': 'my_pipeline'
            },
            task_config=task_config,
            parent=None,
            trigger_rule='all_success')

        executor = KubernetesPodExecutor(
            task_id='k8s',
            liminal_config=self.liminal_config,
            executor_config={
                'executor': 'k8s',
                'name': 'mypod'
            }
        )
        executor.apply_task_to_dag(task=task1)

        for task in dag.tasks:
            print(f'Executing task {task.task_id}')
            task.execute(DummyDag('my_dag', task.task_id).context)

        expected_output = '{"word":"my","count":1}\n' \
                          '{"word":"first","count":1}\n' \
                          '{"word":"liminal","count":1}\n' \
                          '{"word":"spark","count":1}\n' \
                          '{"word":"task","count":1}\n'.split("\n")

        actual = ''
        for filename in os.listdir(outputs_dir):
            if filename.endswith(".json"):
                with open(os.path.join(outputs_dir, filename)) as f:
                    actual = f.read()

        self.assertEqual(actual.split("\n"), expected_output)